From 722aa2daface07931a47d897047ad543d447c31d Mon Sep 17 00:00:00 2001 From: anime Date: Tue, 8 Jul 2025 00:21:29 +0800 Subject: [PATCH 01/14] feat(getPageContentText): refactor content text extraction logic - add special case handling for various block types - maintain same functionality while improving structure Close #2151 --- pages/search/[keyword]/index.js | 160 +++++++++++++++++++++++--------- 1 file changed, 114 insertions(+), 46 deletions(-) diff --git a/pages/search/[keyword]/index.js b/pages/search/[keyword]/index.js index 4116e689..f0e63668 100644 --- a/pages/search/[keyword]/index.js +++ b/pages/search/[keyword]/index.js @@ -3,6 +3,7 @@ import { getDataFromCache } from '@/lib/cache/cache_manager' import { siteConfig } from '@/lib/config' import { getGlobalData } from '@/lib/db/getSiteData' import { DynamicLayout } from '@/themes/theme' +import { checkStrIsUuid } from '@/lib/utils' const Index = props => { const theme = siteConfig('THEME', BLOG.THEME, props.NOTION_CONFIG) @@ -58,42 +59,6 @@ export function getStaticPaths() { } } -/** - * 将对象的指定字段拼接到字符串 - * @param sourceTextArray - * @param targetObj - * @param key - * @returns {*} - */ -function appendText(sourceTextArray, targetObj, key) { - if (!targetObj) { - return sourceTextArray - } - const textArray = targetObj[key] - const text = textArray ? getTextContent(textArray) : '' - if (text && text !== 'Untitled') { - return sourceTextArray.concat(text) - } - return sourceTextArray -} - -/** - * 递归获取层层嵌套的数组 - * @param {*} textArray - * @returns - */ -function getTextContent(textArray) { - if (typeof textArray === 'object' && isIterable(textArray)) { - let result = '' - for (const textObj of textArray) { - result = result + getTextContent(textObj) - } - return result - } else if (typeof textArray === 'string') { - return textArray - } -} - /** * 对象是否可以遍历 * @param {*} obj @@ -124,12 +89,12 @@ async function filterByMemCache(allPosts, keyword) { : '' const articleInfo = post.title + post.summary + tagContent + categoryContent let hit = articleInfo.toLowerCase().indexOf(keyword) > -1 - const indexContent = getPageContentText(post, page) + const contentTextList = getPageContentText(post, page) // console.log('全文搜索缓存', cacheKey, page != null) post.results = [] let hitCount = 0 - for (const i of indexContent) { - const c = indexContent[i] + for (const i of contentTextList) { + const c = contentTextList[i] if (!c) { continue } @@ -152,17 +117,120 @@ async function filterByMemCache(allPosts, keyword) { } export function getPageContentText(post, pageBlockMap) { - let indexContent = [] + /** + * 将对象的指定字段拼接到字符串 + * @param sourceTextArray + * @param targetObj + * @param key + * @returns string + */ + function getText(targetObj) { + if (!targetObj) { + return '' + } + const textArray = targetObj['title'] || targetObj['caption'] + return getTextArray(textArray) + } + + function getTextArray(textArray) { + const text = textArray ? getTextContent(textArray) : '' + if (text && text !== 'Untitled') { + return text + } + return '' + } + + const removeTypeFlag = ['a', 'p', '‣'] + + /** + * 递归获取层层嵌套的数组 + * @param {*} textArray + * @returns string + */ + function getTextContent(textArray) { + if (typeof textArray === 'object' && isIterable(textArray)) { + let result = '' + for (const textObj of textArray) { + if (textArray.length > 1 && removeTypeFlag.includes(textArray[0])) { + return result + } + result = result + getTextContent(textObj) + } + return result + } else if (typeof textArray === 'string') { + if (checkStrIsUuid(textArray) && pageBlockMap.block[textArray]) { + return getBlockContentText(textArray) + } else if (textArray === pageBlockMap.block[postId].value.space_id) { + return '' + } + return textArray + } + } + + function getTransclusionReference(block) { + const result = [] + const blockPointer = block.format.transclusion_reference_pointer + const blockPointerId = blockPointer.id + if (blockPointer) { + const blockContentList = pageBlockMap.block[blockPointerId].value.content + for (const blockContent of blockContentList) { + result.push(getBlockContentText(blockContent)) + } + } + return result.join('') + } + + function getBlockContentText(id) { + const block = pageBlockMap?.block[id].value + const blockType = block.type + switch (blockType) { + case 'transclusion_reference': + return getTransclusionReference(block) + case 'table': + return getTableText(block.content) + case 'page': + if (id !== postId) { + return getText(block.properties) + } + return '' + case 'breadcrumb': + case 'divider': + return '' + case 'quote': + default: + const properties = block?.properties + return getText(properties) + } + } + + function getTableText(tableRowIds) { + const result = [] + for (const blockRowId of tableRowIds) { + if (pageBlockMap.block[blockRowId]) { + const blockRow = pageBlockMap.block[blockRowId].value + const blockRowProperties = blockRow.properties + for (const blockRowPropertyValue of Object.values(blockRowProperties)) { + result.push(getTextArray(blockRowPropertyValue)) + } + } + } + return result.join('') + } + + const postId = post.id + let contentTextList = [] // 防止搜到加密文章的内容 if (pageBlockMap && pageBlockMap.block && !post.password) { const contentIds = Object.keys(pageBlockMap.block) - contentIds.forEach(id => { - const properties = pageBlockMap?.block[id]?.value?.properties - indexContent = appendText(indexContent, properties, 'title') - indexContent = appendText(indexContent, properties, 'caption') - }) + for (const id of contentIds) { + const blockContentText = getBlockContentText(id) + if (blockContentText) { + contentTextList.push(blockContentText) + } + } } - return indexContent.join('') + console.log(contentTextList.join('')) + return contentTextList.join('') } export default Index From b4ba7d8f23a41294421c307e4633755ac421c907 Mon Sep 17 00:00:00 2001 From: anime Date: Tue, 8 Jul 2025 00:28:58 +0800 Subject: [PATCH 02/14] refactor(getPageContentText): Relocate getPageContentText to a dedicated file and eliminate the redundant isIterable function. --- lib/notion/getPageContentText.js | 118 ++++++++++++++++++++++++++++ lib/plugins/algolia.js | 2 +- lib/utils/post.js | 2 +- pages/search/[keyword]/index.js | 127 +------------------------------ 4 files changed, 121 insertions(+), 128 deletions(-) create mode 100644 lib/notion/getPageContentText.js diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js new file mode 100644 index 00000000..e636fab4 --- /dev/null +++ b/lib/notion/getPageContentText.js @@ -0,0 +1,118 @@ +import { checkStrIsUuid, isIterable } from '@/lib/utils' + +export function getPageContentText(post, pageBlockMap) { + /** + * 将对象的指定字段拼接到字符串 + * @param sourceTextArray + * @param targetObj + * @param key + * @returns string + */ + function getText(targetObj) { + if (!targetObj) { + return '' + } + const textArray = targetObj['title'] || targetObj['caption'] + return getTextArray(textArray) + } + + function getTextArray(textArray) { + const text = textArray ? getTextContent(textArray) : '' + if (text && text !== 'Untitled') { + return text + } + return '' + } + + const removeTypeFlag = ['a', 'p', '‣'] + + /** + * 递归获取层层嵌套的数组 + * @param {*} textArray + * @returns string + */ + function getTextContent(textArray) { + if (typeof textArray === 'object' && isIterable(textArray)) { + let result = '' + for (const textObj of textArray) { + if (textArray.length > 1 && removeTypeFlag.includes(textArray[0])) { + return result + } + result = result + getTextContent(textObj) + } + return result + } else if (typeof textArray === 'string') { + if (checkStrIsUuid(textArray) && pageBlockMap.block[textArray]) { + return getBlockContentText(textArray) + } else if (textArray === pageBlockMap.block[postId].value.space_id) { + return '' + } + return textArray + } + } + + function getTransclusionReference(block) { + const result = [] + const blockPointer = block.format.transclusion_reference_pointer + const blockPointerId = blockPointer.id + if (blockPointer) { + const blockContentList = pageBlockMap.block[blockPointerId].value.content + for (const blockContent of blockContentList) { + result.push(getBlockContentText(blockContent)) + } + } + return result.join('') + } + + function getBlockContentText(id) { + const block = pageBlockMap?.block[id].value + const blockType = block.type + switch (blockType) { + case 'transclusion_reference': + return getTransclusionReference(block) + case 'table': + return getTableText(block.content) + case 'page': + if (id !== postId) { + return getText(block.properties) + } + return '' + case 'breadcrumb': + case 'divider': + return '' + case 'quote': + default: + const properties = block?.properties + return getText(properties) + } + } + + function getTableText(tableRowIds) { + const result = [] + for (const blockRowId of tableRowIds) { + if (pageBlockMap.block[blockRowId]) { + const blockRow = pageBlockMap.block[blockRowId].value + const blockRowProperties = blockRow.properties + for (const blockRowPropertyValue of Object.values(blockRowProperties)) { + result.push(getTextArray(blockRowPropertyValue)) + } + } + } + return result.join('') + } + + const postId = post.id + let contentTextList = [] + // 防止搜到加密文章的内容 + if (pageBlockMap && pageBlockMap.block && !post.password) { + const contentIds = Object.keys(pageBlockMap.block) + for (const id of contentIds) { + const blockContentText = getBlockContentText(id) + if (blockContentText) { + contentTextList.push(blockContentText) + } + } + } + console.log(contentTextList.join('')) + return contentTextList.join('') +} diff --git a/lib/plugins/algolia.js b/lib/plugins/algolia.js index e6c76422..fceb7acc 100644 --- a/lib/plugins/algolia.js +++ b/lib/plugins/algolia.js @@ -1,6 +1,6 @@ import BLOG from '@/blog.config' -import { getPageContentText } from '@/pages/search/[keyword]' import algoliasearch from 'algoliasearch' +import { getPageContentText } from '@/lib/notion/getPageContentText' /** * 生成全文索引 diff --git a/lib/utils/post.js b/lib/utils/post.js index a3ee0f91..e4a22d2e 100644 --- a/lib/utils/post.js +++ b/lib/utils/post.js @@ -6,11 +6,11 @@ import { getPostBlocks } from '@/lib/db/getSiteData' import { getPageTableOfContents } from '@/lib/notion/getPageTableOfContents' import { siteConfig } from '@/lib/config' import { getDataFromCache, setDataToCache } from '@/lib/cache/cache_manager' -import { getPageContentText } from '@/pages/search/[keyword]' import { getAiSummary } from '@/lib/plugins/aiSummary' import BLOG from '@/blog.config' import { uploadDataToAlgolia } from '@/lib/plugins/algolia' import { countWords } from '@/lib/plugins/wordCount' +import { getPageContentText } from '@/lib/notion/getPageContentText' /** * 获取文章的关联推荐文章列表,目前根据标签关联性筛选 diff --git a/pages/search/[keyword]/index.js b/pages/search/[keyword]/index.js index f0e63668..cb486895 100644 --- a/pages/search/[keyword]/index.js +++ b/pages/search/[keyword]/index.js @@ -3,7 +3,7 @@ import { getDataFromCache } from '@/lib/cache/cache_manager' import { siteConfig } from '@/lib/config' import { getGlobalData } from '@/lib/db/getSiteData' import { DynamicLayout } from '@/themes/theme' -import { checkStrIsUuid } from '@/lib/utils' +import { getPageContentText } from '@/lib/notion/getPageContentText' const Index = props => { const theme = siteConfig('THEME', BLOG.THEME, props.NOTION_CONFIG) @@ -59,14 +59,6 @@ export function getStaticPaths() { } } -/** - * 对象是否可以遍历 - * @param {*} obj - * @returns - */ -const isIterable = obj => - obj != null && typeof obj[Symbol.iterator] === 'function' - /** * 在内存缓存中进行全文索引 * @param {*} allPosts @@ -116,121 +108,4 @@ async function filterByMemCache(allPosts, keyword) { return filterPosts } -export function getPageContentText(post, pageBlockMap) { - /** - * 将对象的指定字段拼接到字符串 - * @param sourceTextArray - * @param targetObj - * @param key - * @returns string - */ - function getText(targetObj) { - if (!targetObj) { - return '' - } - const textArray = targetObj['title'] || targetObj['caption'] - return getTextArray(textArray) - } - - function getTextArray(textArray) { - const text = textArray ? getTextContent(textArray) : '' - if (text && text !== 'Untitled') { - return text - } - return '' - } - - const removeTypeFlag = ['a', 'p', '‣'] - - /** - * 递归获取层层嵌套的数组 - * @param {*} textArray - * @returns string - */ - function getTextContent(textArray) { - if (typeof textArray === 'object' && isIterable(textArray)) { - let result = '' - for (const textObj of textArray) { - if (textArray.length > 1 && removeTypeFlag.includes(textArray[0])) { - return result - } - result = result + getTextContent(textObj) - } - return result - } else if (typeof textArray === 'string') { - if (checkStrIsUuid(textArray) && pageBlockMap.block[textArray]) { - return getBlockContentText(textArray) - } else if (textArray === pageBlockMap.block[postId].value.space_id) { - return '' - } - return textArray - } - } - - function getTransclusionReference(block) { - const result = [] - const blockPointer = block.format.transclusion_reference_pointer - const blockPointerId = blockPointer.id - if (blockPointer) { - const blockContentList = pageBlockMap.block[blockPointerId].value.content - for (const blockContent of blockContentList) { - result.push(getBlockContentText(blockContent)) - } - } - return result.join('') - } - - function getBlockContentText(id) { - const block = pageBlockMap?.block[id].value - const blockType = block.type - switch (blockType) { - case 'transclusion_reference': - return getTransclusionReference(block) - case 'table': - return getTableText(block.content) - case 'page': - if (id !== postId) { - return getText(block.properties) - } - return '' - case 'breadcrumb': - case 'divider': - return '' - case 'quote': - default: - const properties = block?.properties - return getText(properties) - } - } - - function getTableText(tableRowIds) { - const result = [] - for (const blockRowId of tableRowIds) { - if (pageBlockMap.block[blockRowId]) { - const blockRow = pageBlockMap.block[blockRowId].value - const blockRowProperties = blockRow.properties - for (const blockRowPropertyValue of Object.values(blockRowProperties)) { - result.push(getTextArray(blockRowPropertyValue)) - } - } - } - return result.join('') - } - - const postId = post.id - let contentTextList = [] - // 防止搜到加密文章的内容 - if (pageBlockMap && pageBlockMap.block && !post.password) { - const contentIds = Object.keys(pageBlockMap.block) - for (const id of contentIds) { - const blockContentText = getBlockContentText(id) - if (blockContentText) { - contentTextList.push(blockContentText) - } - } - } - console.log(contentTextList.join('')) - return contentTextList.join('') -} - export default Index From 2a89027bb6c26d4164d1015f4ec036b1438ae046 Mon Sep 17 00:00:00 2001 From: anime Date: Tue, 8 Jul 2025 00:32:16 +0800 Subject: [PATCH 03/14] chore(getPageContentText): add todo comments for future improvements - Add todo comment for cleaning up more useless tags - Add todo comment for handling more block types --- lib/notion/getPageContentText.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index e636fab4..9c29cc44 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -24,6 +24,7 @@ export function getPageContentText(post, pageBlockMap) { return '' } + // todo: 清除更多无用标签 const removeTypeFlag = ['a', 'p', '‣'] /** @@ -67,6 +68,7 @@ export function getPageContentText(post, pageBlockMap) { function getBlockContentText(id) { const block = pageBlockMap?.block[id].value const blockType = block.type + // todo: 处理更多类型 switch (blockType) { case 'transclusion_reference': return getTransclusionReference(block) From d22e8bb177f2a48b616f4b25b97ca4cdfa3fed3d Mon Sep 17 00:00:00 2001 From: anime Date: Tue, 8 Jul 2025 15:50:54 +0800 Subject: [PATCH 04/14] feat(getPageContentText): add null checks for block references - Add validation for transclusion reference pointer existence - Return empty string when block is not found - Prevent potential errors from undefined block references --- lib/notion/getPageContentText.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index 9c29cc44..20c5a451 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -56,7 +56,7 @@ export function getPageContentText(post, pageBlockMap) { const result = [] const blockPointer = block.format.transclusion_reference_pointer const blockPointerId = blockPointer.id - if (blockPointer) { + if (blockPointer && pageBlockMap.block[blockPointerId].value) { const blockContentList = pageBlockMap.block[blockPointerId].value.content for (const blockContent of blockContentList) { result.push(getBlockContentText(blockContent)) @@ -67,6 +67,9 @@ export function getPageContentText(post, pageBlockMap) { function getBlockContentText(id) { const block = pageBlockMap?.block[id].value + if (!block) { + return '' + } const blockType = block.type // todo: 处理更多类型 switch (blockType) { From b52f81815461bd428ce8759978da79ebc12205e6 Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 13:11:52 +0800 Subject: [PATCH 05/14] feat(getPageContentText): replace custom text extraction with notion-utils The getTextContent function was removed and replaced with an import from notion-utils, simplifying the text extraction logic for Notion page content. This change removes the custom recursive implementation and uses the standardized utility function instead. --- lib/notion/getPageContentText.js | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index 20c5a451..fa93c9a8 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -1,4 +1,4 @@ -import { checkStrIsUuid, isIterable } from '@/lib/utils' +import { getTextContent } from 'notion-utils' export function getPageContentText(post, pageBlockMap) { /** @@ -24,34 +24,6 @@ export function getPageContentText(post, pageBlockMap) { return '' } - // todo: 清除更多无用标签 - const removeTypeFlag = ['a', 'p', '‣'] - - /** - * 递归获取层层嵌套的数组 - * @param {*} textArray - * @returns string - */ - function getTextContent(textArray) { - if (typeof textArray === 'object' && isIterable(textArray)) { - let result = '' - for (const textObj of textArray) { - if (textArray.length > 1 && removeTypeFlag.includes(textArray[0])) { - return result - } - result = result + getTextContent(textObj) - } - return result - } else if (typeof textArray === 'string') { - if (checkStrIsUuid(textArray) && pageBlockMap.block[textArray]) { - return getBlockContentText(textArray) - } else if (textArray === pageBlockMap.block[postId].value.space_id) { - return '' - } - return textArray - } - } - function getTransclusionReference(block) { const result = [] const blockPointer = block.format.transclusion_reference_pointer From 44ff4bcb230a6f1145df8b934727062e72f22028 Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 13:14:41 +0800 Subject: [PATCH 06/14] feat(getPageContentText): remove debug console.log from getPageContentText Commented out console.log statement in getPageContentText.js to remove debug output while maintaining the functionality. --- lib/notion/getPageContentText.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index fa93c9a8..19130991 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -90,6 +90,6 @@ export function getPageContentText(post, pageBlockMap) { } } } - console.log(contentTextList.join('')) + // console.log(contentTextList.join('')) return contentTextList.join('') } From 72d64d7184db69bb9b01f82bf585d38ac541935f Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 13:51:43 +0800 Subject: [PATCH 07/14] feat(getPageContentText): enhance getPageContentText to handle nested block content - refactor getText function to process block properties and content - add support for recursive processing of nested block content - improve null checks and error handling in block processing - update getBlockContentText to handle block value safely --- lib/notion/getPageContentText.js | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index 19130991..e4b79b37 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -3,17 +3,23 @@ import { getTextContent } from 'notion-utils' export function getPageContentText(post, pageBlockMap) { /** * 将对象的指定字段拼接到字符串 - * @param sourceTextArray - * @param targetObj - * @param key + * @param block * @returns string */ - function getText(targetObj) { - if (!targetObj) { + function getText(block) { + const result = [] + const properties = block.properties + if (!properties) { return '' } - const textArray = targetObj['title'] || targetObj['caption'] - return getTextArray(textArray) + const textArray = properties['title'] || properties['caption'] + result.push(getTextArray(textArray)) + if (block['content']?.length > 0) { + for (const blockContent of block.content) { + result.push(getBlockContentText(blockContent)) + } + } + return result.join('') } function getTextArray(textArray) { @@ -38,7 +44,7 @@ export function getPageContentText(post, pageBlockMap) { } function getBlockContentText(id) { - const block = pageBlockMap?.block[id].value + const block = pageBlockMap?.block[id]?.value if (!block) { return '' } @@ -51,7 +57,7 @@ export function getPageContentText(post, pageBlockMap) { return getTableText(block.content) case 'page': if (id !== postId) { - return getText(block.properties) + return getText(block) } return '' case 'breadcrumb': @@ -59,8 +65,7 @@ export function getPageContentText(post, pageBlockMap) { return '' case 'quote': default: - const properties = block?.properties - return getText(properties) + return getText(block) } } From 2a0f4fd49c086d92f3f98284dd45ff019621e4df Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 13:58:48 +0800 Subject: [PATCH 08/14] feat(getPageContentText): ensure proper spacing in concatenated text content Modify getPageContentText.js to use space delimiter when joining text blocks instead of empty string. This change affects three functions: getPageContentText, getTextArray, and getBlockContentText to improve text readability and proper spacing between concatenated content blocks. --- lib/notion/getPageContentText.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index e4b79b37..79fe5689 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -19,7 +19,7 @@ export function getPageContentText(post, pageBlockMap) { result.push(getBlockContentText(blockContent)) } } - return result.join('') + return result.join(' ') } function getTextArray(textArray) { @@ -40,7 +40,7 @@ export function getPageContentText(post, pageBlockMap) { result.push(getBlockContentText(blockContent)) } } - return result.join('') + return result.join(' ') } function getBlockContentText(id) { @@ -80,7 +80,7 @@ export function getPageContentText(post, pageBlockMap) { } } } - return result.join('') + return result.join(' ') } const postId = post.id From 3ad7605abe0551d3a583fb52c34517728a76639d Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 14:09:37 +0800 Subject: [PATCH 09/14] fix: prevent processing content for page type blocks - Skip content processing when block type is 'page' - Update debug log message for content text concatenation --- lib/notion/getPageContentText.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index 79fe5689..18672cc0 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -14,7 +14,7 @@ export function getPageContentText(post, pageBlockMap) { } const textArray = properties['title'] || properties['caption'] result.push(getTextArray(textArray)) - if (block['content']?.length > 0) { + if (block.type !== 'page' && block['content']?.length > 0) { for (const blockContent of block.content) { result.push(getBlockContentText(blockContent)) } @@ -95,6 +95,6 @@ export function getPageContentText(post, pageBlockMap) { } } } - // console.log(contentTextList.join('')) + console.log('开始', contentTextList.join(''), '结束') return contentTextList.join('') } From 9321b2dfaed30d631784472c91d05ae2eaa3d273 Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 14:39:47 +0800 Subject: [PATCH 10/14] feat(getPageContentText): update getPageContentText to use post.content array - modify content extraction logic to use post.content array - avoid extra but wrong result by checking pageBlockMap.block --- lib/notion/getPageContentText.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index 18672cc0..94f791a5 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -84,12 +84,12 @@ export function getPageContentText(post, pageBlockMap) { } const postId = post.id + const postContent = post.content let contentTextList = [] // 防止搜到加密文章的内容 - if (pageBlockMap && pageBlockMap.block && !post.password) { - const contentIds = Object.keys(pageBlockMap.block) - for (const id of contentIds) { - const blockContentText = getBlockContentText(id) + if (postContent.length > 0 && !post.password) { + for (const postContentId of postContent) { + const blockContentText = getBlockContentText(postContentId) if (blockContentText) { contentTextList.push(blockContentText) } From 23550a61d3580d593a98e0b6bf886311e133d6ea Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 15:19:47 +0800 Subject: [PATCH 11/14] feat(getPageContentText): add flexible property value retrieval and support for more block types - Add getPropertyValue helper function for flexible property retrieval - Enhance getText function to accept custom keys for property lookup - Add support for additional block types: image, bookmark, callout, header - Improve documentation with JSDoc comments for better code understanding --- lib/notion/getPageContentText.js | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index 94f791a5..dc94c8fc 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -1,18 +1,34 @@ import { getTextContent } from 'notion-utils' +/** + * 获取属性值,优先从 overrides 中读取,否则按顺序从 properties 中读取,最后返回默认值 + * @param {Object} properties 原始属性对象 + * @param {Array} keys 优先级字段名列表 + * @param {Object} overrides 自定义覆盖对象(可选) + * @param {string} defaultValue 默认值(可选) + */ +function getPropertyValue(properties, keys, overrides = {}, defaultValue = '') { + for (const key of keys) { + if (overrides[key]) return overrides[key] + if (properties[key]) return properties[key] + } + return defaultValue +} + export function getPageContentText(post, pageBlockMap) { /** * 将对象的指定字段拼接到字符串 * @param block + * @param customKeys 优先级字段名列表 * @returns string */ - function getText(block) { + function getText(block, customKeys = ['title', 'caption']) { const result = [] const properties = block.properties if (!properties) { return '' } - const textArray = properties['title'] || properties['caption'] + const textArray = getPropertyValue(properties, customKeys) result.push(getTextArray(textArray)) if (block.type !== 'page' && block['content']?.length > 0) { for (const blockContent of block.content) { @@ -61,9 +77,20 @@ export function getPageContentText(post, pageBlockMap) { } return '' case 'breadcrumb': + case 'external_object_instance': case 'divider': return '' + case 'image': + return getText(block, ['alt_text', 'title']) + // 除title以外,还有额外的link和description可供索引,但认为不需要 + case 'bookmark': case 'quote': + case 'callout': + case 'header': + case 'sub_header': + case 'code': + case 'equation': + case 'text': default: return getText(block) } From f9ac624498832b0d1b661b2bd158e1d92c9483f6 Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 16:00:15 +0800 Subject: [PATCH 12/14] feat(getPageContentText): implement getFullTextContent for enhanced text extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add new getFullTextContent function to handle various Notion text formats - Support equation extraction from decorated text - Handle special characters like '⁍' and '‣' with proper content resolution - Process date mentions, link mentions, and other reference types - Replace getTextContent with getFullTextContent in getTextArray function - Maintain backward compatibility with existing text processing --- lib/notion/getPageContentText.js | 60 ++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index dc94c8fc..d8095588 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -1,5 +1,3 @@ -import { getTextContent } from 'notion-utils' - /** * 获取属性值,优先从 overrides 中读取,否则按顺序从 properties 中读取,最后返回默认值 * @param {Object} properties 原始属性对象 @@ -15,6 +13,62 @@ function getPropertyValue(properties, keys, overrides = {}, defaultValue = '') { return defaultValue } +/** + * 提取 Notion 装饰文本的纯文本内容。 + * 可选传入 resolveRef 来解析引用(例如 '‣' 指向的页面标题) + * + * @param {Array} text - Notion Decoration[] 格式的文本数组 + * @returns {string} + */ +function getFullTextContent(text) { + if (!text) return '' + + if (!Array.isArray(text)) return String(text) + + return text.reduce((result, item) => { + const value = item[0] + const decorations = item[1] + + if (value === '⁍') { + // 检查是否有公式 + const equation = decorations?.find(d => d[0] === 'e') + if (equation) { + return result + equation[1] // 提取 LaTeX 内容 + } + return result // 否则什么都不加 + } + + if (value === '‣') { + const ref = Array.isArray(decorations) ? decorations[0] : null + const type = ref?.[0] + const data = ref?.[1] + + switch (type) { + case 'd': + // 日期字符串 + const date = + data?.start_date || + data?.start_time || + data?.end_date || + data?.end_time || + '[Date]' + return result + date + case 'lm': + // Link Mention + const title = data?.title || data?.href || '[Link]' + return result + title + // 用户 ID,这里不展开,默认忽略或标记 + case 'u': + default: + return result + } + } + + // 默认拼接普通文本 + return result + value + }, '') +} + export function getPageContentText(post, pageBlockMap) { /** * 将对象的指定字段拼接到字符串 @@ -39,7 +93,7 @@ export function getPageContentText(post, pageBlockMap) { } function getTextArray(textArray) { - const text = textArray ? getTextContent(textArray) : '' + const text = textArray ? getFullTextContent(textArray) : '' if (text && text !== 'Untitled') { return text } From 9886e4d146cf7270d7b833e7dc847bde797192c6 Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 16:12:12 +0800 Subject: [PATCH 13/14] feat(getPageContentText): add references to NotionX types and comment out debug log - Add reference links to NotionX type definitions for better documentation - Comment out debug console.log statement to clean up output - Maintain existing functionality while improving code clarity --- lib/notion/getPageContentText.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index d8095588..06a11f7f 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -42,7 +42,7 @@ function getFullTextContent(text) { const ref = Array.isArray(decorations) ? decorations[0] : null const type = ref?.[0] const data = ref?.[1] - + // todo: 处理更多类型 https://github.com/NotionX/react-notion-x/blob/9ee2d9334e260ee3600f4f8d7212f66b641b19cc/packages/notion-types/src/core.ts#L108 switch (type) { case 'd': // 日期字符串 @@ -119,7 +119,7 @@ export function getPageContentText(post, pageBlockMap) { return '' } const blockType = block.type - // todo: 处理更多类型 + // todo: 处理更多类型 https://github.com/NotionX/react-notion-x/blob/9ee2d9334e260ee3600f4f8d7212f66b641b19cc/packages/notion-types/src/block.ts#L3 switch (blockType) { case 'transclusion_reference': return getTransclusionReference(block) @@ -176,6 +176,6 @@ export function getPageContentText(post, pageBlockMap) { } } } - console.log('开始', contentTextList.join(''), '结束') + // console.log('开始', contentTextList.join(''), '结束') return contentTextList.join('') } From ea1b76f5b7c2f10ce80efea85e022a64d8d9ca06 Mon Sep 17 00:00:00 2001 From: anime Date: Wed, 9 Jul 2025 16:22:38 +0800 Subject: [PATCH 14/14] fix(getPageContentText): add null check for postContent before accessing length property Ensure postContent exists before checking its length property to prevent potential runtime errors when postContent is null or undefined. --- lib/notion/getPageContentText.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/notion/getPageContentText.js b/lib/notion/getPageContentText.js index 06a11f7f..0e52083b 100644 --- a/lib/notion/getPageContentText.js +++ b/lib/notion/getPageContentText.js @@ -168,7 +168,7 @@ export function getPageContentText(post, pageBlockMap) { const postContent = post.content let contentTextList = [] // 防止搜到加密文章的内容 - if (postContent.length > 0 && !post.password) { + if (postContent && postContent.length > 0 && !post.password) { for (const postContentId of postContent) { const blockContentText = getBlockContentText(postContentId) if (blockContentText) {