feat(getPageContentText): refactor content text extraction logic

- add special case handling for various block types
- maintain same functionality while improving structure

Close #2151
This commit is contained in:
anime
2025-07-08 00:21:29 +08:00
parent 5e6fa26c90
commit 722aa2dafa

View File

@@ -3,6 +3,7 @@ import { getDataFromCache } from '@/lib/cache/cache_manager'
import { siteConfig } from '@/lib/config'
import { getGlobalData } from '@/lib/db/getSiteData'
import { DynamicLayout } from '@/themes/theme'
import { checkStrIsUuid } from '@/lib/utils'
const Index = props => {
const theme = siteConfig('THEME', BLOG.THEME, props.NOTION_CONFIG)
@@ -58,42 +59,6 @@ export function getStaticPaths() {
}
}
/**
* 将对象的指定字段拼接到字符串
* @param sourceTextArray
* @param targetObj
* @param key
* @returns {*}
*/
function appendText(sourceTextArray, targetObj, key) {
if (!targetObj) {
return sourceTextArray
}
const textArray = targetObj[key]
const text = textArray ? getTextContent(textArray) : ''
if (text && text !== 'Untitled') {
return sourceTextArray.concat(text)
}
return sourceTextArray
}
/**
* 递归获取层层嵌套的数组
* @param {*} textArray
* @returns
*/
function getTextContent(textArray) {
if (typeof textArray === 'object' && isIterable(textArray)) {
let result = ''
for (const textObj of textArray) {
result = result + getTextContent(textObj)
}
return result
} else if (typeof textArray === 'string') {
return textArray
}
}
/**
* 对象是否可以遍历
* @param {*} obj
@@ -124,12 +89,12 @@ async function filterByMemCache(allPosts, keyword) {
: ''
const articleInfo = post.title + post.summary + tagContent + categoryContent
let hit = articleInfo.toLowerCase().indexOf(keyword) > -1
const indexContent = getPageContentText(post, page)
const contentTextList = getPageContentText(post, page)
// console.log('全文搜索缓存', cacheKey, page != null)
post.results = []
let hitCount = 0
for (const i of indexContent) {
const c = indexContent[i]
for (const i of contentTextList) {
const c = contentTextList[i]
if (!c) {
continue
}
@@ -152,17 +117,120 @@ async function filterByMemCache(allPosts, keyword) {
}
export function getPageContentText(post, pageBlockMap) {
let indexContent = []
/**
* 将对象的指定字段拼接到字符串
* @param sourceTextArray
* @param targetObj
* @param key
* @returns string
*/
function getText(targetObj) {
if (!targetObj) {
return ''
}
const textArray = targetObj['title'] || targetObj['caption']
return getTextArray(textArray)
}
function getTextArray(textArray) {
const text = textArray ? getTextContent(textArray) : ''
if (text && text !== 'Untitled') {
return text
}
return ''
}
const removeTypeFlag = ['a', 'p', '‣']
/**
* 递归获取层层嵌套的数组
* @param {*} textArray
* @returns string
*/
function getTextContent(textArray) {
if (typeof textArray === 'object' && isIterable(textArray)) {
let result = ''
for (const textObj of textArray) {
if (textArray.length > 1 && removeTypeFlag.includes(textArray[0])) {
return result
}
result = result + getTextContent(textObj)
}
return result
} else if (typeof textArray === 'string') {
if (checkStrIsUuid(textArray) && pageBlockMap.block[textArray]) {
return getBlockContentText(textArray)
} else if (textArray === pageBlockMap.block[postId].value.space_id) {
return ''
}
return textArray
}
}
function getTransclusionReference(block) {
const result = []
const blockPointer = block.format.transclusion_reference_pointer
const blockPointerId = blockPointer.id
if (blockPointer) {
const blockContentList = pageBlockMap.block[blockPointerId].value.content
for (const blockContent of blockContentList) {
result.push(getBlockContentText(blockContent))
}
}
return result.join('')
}
function getBlockContentText(id) {
const block = pageBlockMap?.block[id].value
const blockType = block.type
switch (blockType) {
case 'transclusion_reference':
return getTransclusionReference(block)
case 'table':
return getTableText(block.content)
case 'page':
if (id !== postId) {
return getText(block.properties)
}
return ''
case 'breadcrumb':
case 'divider':
return ''
case 'quote':
default:
const properties = block?.properties
return getText(properties)
}
}
function getTableText(tableRowIds) {
const result = []
for (const blockRowId of tableRowIds) {
if (pageBlockMap.block[blockRowId]) {
const blockRow = pageBlockMap.block[blockRowId].value
const blockRowProperties = blockRow.properties
for (const blockRowPropertyValue of Object.values(blockRowProperties)) {
result.push(getTextArray(blockRowPropertyValue))
}
}
}
return result.join('')
}
const postId = post.id
let contentTextList = []
// 防止搜到加密文章的内容
if (pageBlockMap && pageBlockMap.block && !post.password) {
const contentIds = Object.keys(pageBlockMap.block)
contentIds.forEach(id => {
const properties = pageBlockMap?.block[id]?.value?.properties
indexContent = appendText(indexContent, properties, 'title')
indexContent = appendText(indexContent, properties, 'caption')
})
for (const id of contentIds) {
const blockContentText = getBlockContentText(id)
if (blockContentText) {
contentTextList.push(blockContentText)
}
}
}
return indexContent.join('')
console.log(contentTextList.join(''))
return contentTextList.join('')
}
export default Index