/**
* SEO 优化和结构化数据生成工具
* 为 OceanPress 生成的 HTML 页面添加 JSON-LD 结构化数据和其他 SEO 优化
*/
import { Config } from './config.ts'
import { S_Node } from './siyuan_type.ts'
// 简化的停用词集合(减少内存占用)
const simplifiedStopWords = new Set([
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '现在', '可以', '但是', '还是', '因为', '什么', '如果', '所以',
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'
])
// HTML标签清理缓存(针对相同内容的重复处理)
const htmlCleanupCache = new Map()
// 限制缓存大小
const MAX_CACHE_SIZE = 500
function manageCacheSize(): void {
if (htmlCleanupCache.size > MAX_CACHE_SIZE) {
const firstKey = htmlCleanupCache.keys().next().value
if (firstKey) {
htmlCleanupCache.delete(firstKey)
}
}
}
/**
* 从思源文档ID中提取日期时间并格式化为 ISO 8601
* 思源ID格式: 20250801084546-uvfteud -> 2025-08-01T08:45:46Z
*/
function extractDateFromId(id?: string): string {
if (!id || id.length < 14) return new Date().toISOString()
try {
const datePart = id.slice(0, 14) // 取前14位: 20250801084546
if (datePart.length === 14) {
const year = datePart.slice(0, 4)
const month = datePart.slice(4, 6)
const day = datePart.slice(6, 8)
const hour = datePart.slice(8, 10)
const minute = datePart.slice(10, 12)
const second = datePart.slice(12, 14)
return `${year}-${month}-${day}T${hour}:${minute}:${second}Z`
}
} catch {
// 忽略解析错误,使用默认值
}
return new Date().toISOString()
}
/**
* 格式化日期为 ISO 8601 格式
*/
function formatDate(dateString?: string): string {
if (!dateString) return new Date().toISOString()
try {
// 思源笔记的日期格式通常是 YYYYMMDDHHMMSS 或时间戳
if (dateString.length === 14) {
// YYYYMMDDHHMMSS 格式
const year = dateString.slice(0, 4)
const month = dateString.slice(4, 6)
const day = dateString.slice(6, 8)
const hour = dateString.slice(8, 10)
const minute = dateString.slice(10, 12)
const second = dateString.slice(12, 14)
return `${year}-${month}-${day}T${hour}:${minute}:${second}Z`
}
// 尝试解析为时间戳
const timestamp = parseInt(dateString)
if (!isNaN(timestamp)) {
return new Date(timestamp).toISOString()
}
// 尝试直接解析
return new Date(dateString).toISOString()
} catch {
return new Date().toISOString()
}
}
/**
* 优化的HTML清理函数(带缓存)
*/
function cleanHtmlContent(content: string): string {
const cacheKey = content
if (htmlCleanupCache.has(cacheKey)) {
return htmlCleanupCache.get(cacheKey)!
}
const cleaned = content
.replace(/<[^>]*>/g, ' ')
.replace(/&[^;]+;/g, ' ')
.replace(/\s+/g, ' ')
.trim()
htmlCleanupCache.set(cacheKey, cleaned)
manageCacheSize()
return cleaned
}
/**
* 从文档内容中提取描述文本(带缓存)
*/
function extractDescription(content: string, maxLength = 160): string {
const plainText = cleanHtmlContent(content)
if (plainText.length <= maxLength) {
return plainText
}
// 尝试在句子边界截断
const truncated = plainText.substring(0, maxLength)
const lastSentenceEnd = Math.max(
truncated.lastIndexOf('。'),
truncated.lastIndexOf('!'),
truncated.lastIndexOf('?'),
truncated.lastIndexOf('.'),
truncated.lastIndexOf('!'),
truncated.lastIndexOf('?')
)
return lastSentenceEnd > maxLength * 0.7
? truncated.substring(0, lastSentenceEnd + 1)
: truncated + '...'
}
/**
* 优化的关键词提取(带缓存和简化算法)
*/
function extractKeywords(content: string): string[] {
const plainText = cleanHtmlContent(content)
return simplifiedKeywordExtraction(plainText, 8)
}
/**
* 简化的关键词提取算法(性能优化)
*/
function simplifiedKeywordExtraction(text: string, maxKeywords: number = 8): string[] {
const words: string[] = []
const wordCount = new Map()
// 快速提取英文单词和技术术语
const englishWords = text.match(/[a-zA-Z]{3,}/g) || []
for (const word of englishWords) {
const lowerWord = word.toLowerCase()
if (!simplifiedStopWords.has(lowerWord) && lowerWord.length >= 3) {
words.push(lowerWord)
wordCount.set(lowerWord, (wordCount.get(lowerWord) || 0) + 1)
}
}
// 快速提取中文词汇(2-4字)
const chineseWords = text.match(/[\u4e00-\u9fa5]{2,4}/g) || []
for (const word of chineseWords) {
if (!simplifiedStopWords.has(word)) {
words.push(word)
wordCount.set(word, (wordCount.get(word) || 0) + 1)
}
}
// 基于词频排序,返回前N个关键词
return Array.from(wordCount.entries())
.sort((a, b) => b[1] - a[1])
.slice(0, maxKeywords)
.map(([word]) => word)
}
/**
* 生成文章的 JSON-LD 结构化数据
*/
export function generateArticleJsonLd(
doc: S_Node,
config: Config,
pageUrl: string,
content: string
): string {
const title = doc.Properties?.title || '未命名文档'
const description = extractDescription(content)
const keywords = extractKeywords(content)
// 从ID中提取创建时间,如果没有则使用updated时间
const datePublished = extractDateFromId(doc.ID)
const dateModified = formatDate(doc.Properties?.updated)
const jsonLd = {
'@context': 'https://schema.org',
'@type': 'Article',
headline: title,
description: description,
keywords: keywords.join(', '),
datePublished: datePublished,
dateModified: dateModified,
author: {
'@type': 'Person',
name: config.sitemap?.title || '崮生',
url: config.sitemap?.siteLink || ''
},
publisher: {
'@type': 'Organization',
name: config.sitemap?.title || 'OceanPress',
logo: {
'@type': 'ImageObject',
url: `${config.sitemap?.siteLink || ''}/assets/logo.png`
}
},
mainEntityOfPage: {
'@type': 'WebPage',
'@id': pageUrl
},
image: doc.Properties?.['title-img'] ? {
'@type': 'ImageObject',
url: `${config.sitemap?.siteLink || ''}${doc.Properties['title-img'].replace('assets', '/assets')}`,
width: 1200,
height: 630
} : undefined,
wordCount: content.replace(/<[^>]*>/g, '').length,
articleSection: '技术文档',
inLanguage: 'zh-CN'
}
return ``
}
/**
* 生成面包屑导航的 JSON-LD
*/
export function generateBreadcrumbJsonLd(
breadcrumbs: Array<{ name: string; url: string }>
): string {
const itemListElement = breadcrumbs.map((crumb, index) => ({
'@type': 'ListItem',
position: index + 1,
name: crumb.name,
item: crumb.url
}))
const jsonLd = {
'@context': 'https://schema.org',
'@type': 'BreadcrumbList',
itemListElement: itemListElement
}
return ``
}
/**
* 生成网站的 JSON-LD
*/
export function generateWebsiteJsonLd(config: Config): string {
const jsonLd = {
'@context': 'https://schema.org',
'@type': 'WebSite',
name: config.sitemap?.title || 'OceanPress 站点',
description: config.sitemap?.description || '基于思源笔记的静态站点',
url: config.sitemap?.siteLink || '',
author: {
'@type': 'Person',
name: '崮生'
},
potentialAction: {
'@type': 'SearchAction',
target: {
'@type': 'EntryPoint',
urlTemplate: `${config.sitemap?.siteLink || ''}/search?q={search_term_string}`
},
'query-input': 'required name=search_term_string'
},
inLanguage: 'zh-CN'
}
return ``
}
/**
* 生成完整的 SEO meta 标签
*/
export function generateSeoMetaTags(
doc: S_Node,
config: Config,
pageUrl: string,
content: string
): string {
const title = doc.Properties?.title || '未命名文档'
const description = extractDescription(content)
const keywords = extractKeywords(content)
let metaTags = ''
// 基础 meta 标签
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
// Open Graph 标签
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
if (doc.Properties?.['title-img']) {
const imageUrl = `${config.sitemap?.siteLink || ''}${doc.Properties['title-img'].replace('assets', '/assets')}`
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
}
// 从ID中提取创建时间
const datePublished = extractDateFromId(doc.ID)
const dateModified = formatDate(doc.Properties?.updated)
metaTags += ` \n`
metaTags += ` \n`
// Twitter Card 标签
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
if (doc.Properties?.['title-img']) {
const imageUrl = `${config.sitemap?.siteLink || ''}${doc.Properties['title-img'].replace('assets', '/assets')}`
metaTags += ` \n`
}
// 其他 SEO 标签
metaTags += ` \n`
metaTags += ` \n`
return metaTags
}
/**
* SEO 数据接口
*/
export interface SeoData {
doc: S_Node
config: Config
pageUrl: string
content: string
breadcrumbs?: Array<{ name: string; url: string }>
}
/**
* 生成完整的 SEO 优化代码(性能优化版本)
*/
export function generateSeoContent(data: SeoData): {
metaTags: string
jsonLd: string
} {
// 只对内容处理进行缓存,不缓存整个SEO结果
const plainText = cleanHtmlContent(data.content)
const description = extractDescriptionFromPlainText(plainText)
const keywords = simplifiedKeywordExtraction(plainText, 8)
const metaTags = generateSeoMetaTagsFromCache(data.doc, data.config, data.pageUrl, { plainText, description, keywords })
let jsonLd = generateArticleJsonLdFromCache(data.doc, data.config, data.pageUrl, { plainText, description, keywords })
// 面包屑导航不需要缓存,每个文档都不同
if (data.breadcrumbs && data.breadcrumbs.length > 0) {
jsonLd += '\n' + generateBreadcrumbJsonLd(data.breadcrumbs)
}
return { metaTags, jsonLd }
}
/**
* 从缓存生成SEO meta标签
*/
function generateSeoMetaTagsFromCache(
doc: S_Node,
config: Config,
pageUrl: string,
cache: { plainText: string; description: string; keywords: string[] }
): string {
const title = doc.Properties?.title || '未命名文档'
const { description, keywords } = cache
let metaTags = ''
// 基础 meta 标签
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
// Open Graph 标签
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
if (doc.Properties?.['title-img']) {
const imageUrl = `${config.sitemap?.siteLink || ''}${doc.Properties['title-img'].replace('assets', '/assets')}`
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
}
// 从ID中提取创建时间
const datePublished = extractDateFromId(doc.ID)
const dateModified = formatDate(doc.Properties?.updated)
metaTags += ` \n`
metaTags += ` \n`
// Twitter Card 标签
metaTags += ` \n`
metaTags += ` \n`
metaTags += ` \n`
if (doc.Properties?.['title-img']) {
const imageUrl = `${config.sitemap?.siteLink || ''}${doc.Properties['title-img'].replace('assets', '/assets')}`
metaTags += ` \n`
}
// 其他 SEO 标签
metaTags += ` \n`
metaTags += ` \n`
return metaTags
}
/**
* 从缓存生成JSON-LD
*/
function generateArticleJsonLdFromCache(
doc: S_Node,
config: Config,
pageUrl: string,
cache: { plainText: string; description: string; keywords: string[] }
): string {
const title = doc.Properties?.title || '未命名文档'
const { description, keywords } = cache
const datePublished = extractDateFromId(doc.ID)
const dateModified = formatDate(doc.Properties?.updated)
const jsonLd = {
'@context': 'https://schema.org',
'@type': 'Article',
headline: title,
description: description,
keywords: keywords.join(', '),
datePublished: datePublished,
dateModified: dateModified,
author: {
'@type': 'Person',
name: config.sitemap?.title || '崮生',
url: config.sitemap?.siteLink || ''
},
publisher: {
'@type': 'Organization',
name: config.sitemap?.title || 'OceanPress',
logo: {
'@type': 'ImageObject',
url: `${config.sitemap?.siteLink || ''}/assets/logo.png`
}
},
mainEntityOfPage: {
'@type': 'WebPage',
'@id': pageUrl
},
image: doc.Properties?.['title-img'] ? {
'@type': 'ImageObject',
url: `${config.sitemap?.siteLink || ''}${doc.Properties['title-img'].replace('assets', '/assets')}`,
width: 1200,
height: 630
} : undefined,
wordCount: cache.plainText.length,
articleSection: '技术文档',
inLanguage: 'zh-CN'
}
return ``
}
/**
* 从纯文本提取描述(优化版本)
*/
function extractDescriptionFromPlainText(plainText: string, maxLength = 160): string {
if (plainText.length <= maxLength) return plainText
const truncated = plainText.substring(0, maxLength)
const lastSentenceEnd = Math.max(
truncated.lastIndexOf('。'),
truncated.lastIndexOf('!'),
truncated.lastIndexOf('?'),
truncated.lastIndexOf('.'),
truncated.lastIndexOf('!'),
truncated.lastIndexOf('?')
)
return lastSentenceEnd > maxLength * 0.7
? truncated.substring(0, lastSentenceEnd + 1)
: truncated + '...'
}