From 94eceaed9694aa5a2a143ea2df2d9be001642ec0 Mon Sep 17 00:00:00 2001 From: bielie Date: Tue, 17 Feb 2026 14:49:47 +0800 Subject: [PATCH] improve lexical warmup and standardize stopword pipeline --- modules/story-summary/story-summary.js | 2 + .../story-summary/vector/llm/llm-service.js | 1 - .../vector/retrieval/lexical-index.js | 563 ++--- .../story-summary/vector/retrieval/metrics.js | 18 + .../vector/retrieval/query-builder.js | 10 +- .../story-summary/vector/retrieval/recall.js | 13 +- .../vector/utils/stopwords-base.js | 2231 +++++++++++++++++ .../stopwords-data/LICENSE.stopwords-iso.txt | 21 + .../vector/utils/stopwords-data/SOURCES.md | 15 + .../utils/stopwords-data/stopwords-iso.en.txt | 1298 ++++++++++ .../utils/stopwords-data/stopwords-iso.ja.txt | 134 + .../utils/stopwords-data/stopwords-iso.zh.txt | 794 ++++++ .../vector/utils/stopwords-patch.js | 9 + .../story-summary/vector/utils/tokenizer.js | 61 +- 14 files changed, 4840 insertions(+), 330 deletions(-) create mode 100644 modules/story-summary/vector/utils/stopwords-base.js create mode 100644 modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt create mode 100644 modules/story-summary/vector/utils/stopwords-data/SOURCES.md create mode 100644 modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt create mode 100644 modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt create mode 100644 modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt create mode 100644 modules/story-summary/vector/utils/stopwords-patch.js diff --git a/modules/story-summary/story-summary.js b/modules/story-summary/story-summary.js index 9cb242f..4f95d0b 100644 --- a/modules/story-summary/story-summary.js +++ b/modules/story-summary/story-summary.js @@ -1551,6 +1551,7 @@ async function handleMessageReceived(scheduledChatId) { // Refresh entity lexicon after new message (new roles may appear) refreshEntityLexiconAndWarmup(); + scheduleLexicalWarmup(100); // Auto backfill missing L0 (delay to avoid contention with current floor) setTimeout(() => maybeAutoExtractL0(), 2000); @@ -1559,6 +1560,7 @@ async function handleMessageReceived(scheduledChatId) { function handleMessageSent(scheduledChatId) { if (isChatStale(scheduledChatId)) return; initButtonsForAll(); + scheduleLexicalWarmup(0); setTimeout(() => maybeAutoRunSummary("before_user"), 1000); } diff --git a/modules/story-summary/vector/llm/llm-service.js b/modules/story-summary/vector/llm/llm-service.js index 13ec391..7120b64 100644 --- a/modules/story-summary/vector/llm/llm-service.js +++ b/modules/story-summary/vector/llm/llm-service.js @@ -2,7 +2,6 @@ // vector/llm/llm-service.js - 修复 prefill 传递方式 // ═══════════════════════════════════════════════════════════════════════════ import { xbLog } from '../../../../core/debug-core.js'; -import { getVectorConfig } from '../../data/config.js'; import { getApiKey } from './siliconflow.js'; const MODULE_ID = 'vector-llm-service'; diff --git a/modules/story-summary/vector/retrieval/lexical-index.js b/modules/story-summary/vector/retrieval/lexical-index.js index 83124d6..f464af7 100644 --- a/modules/story-summary/vector/retrieval/lexical-index.js +++ b/modules/story-summary/vector/retrieval/lexical-index.js @@ -1,16 +1,3 @@ -// ═══════════════════════════════════════════════════════════════════════════ -// lexical-index.js - MiniSearch 词法检索索引 -// -// 职责: -// 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引 -// 2. 提供词法检索接口(专名精确匹配兜底) -// 3. 惰性构建 + 异步预热 + 缓存失效机制 -// -// 索引存储:纯内存(不持久化) -// 分词器:统一使用 tokenizer.js(结巴 + 实体保护 + 降级) -// 重建时机:CHAT_CHANGED / L0提取完成 / L2总结完成 -// ═══════════════════════════════════════════════════════════════════════════ - import MiniSearch from '../../../../libs/minisearch.mjs'; import { getContext } from '../../../../../../../extensions.js'; import { getSummaryStore } from '../../data/store.js'; @@ -20,76 +7,166 @@ import { tokenizeForIndex } from '../utils/tokenizer.js'; const MODULE_ID = 'lexical-index'; -// ───────────────────────────────────────────────────────────────────────── -// 缓存 -// ───────────────────────────────────────────────────────────────────────── - -/** @type {MiniSearch|null} */ +// In-memory index cache let cachedIndex = null; - -/** @type {string|null} */ let cachedChatId = null; - -/** @type {string|null} 数据指纹(atoms + chunks + events 数量) */ let cachedFingerprint = null; - -/** @type {boolean} 是否正在构建 */ let building = false; - -/** @type {Promise|null} 当前构建 Promise(防重入) */ let buildPromise = null; -/** @type {Map} floor → 该楼层的 doc IDs(仅 L1 chunks) */ + +// floor -> chunk doc ids (L1 only) let floorDocIds = new Map(); -// ───────────────────────────────────────────────────────────────────────── -// 工具函数 -// ───────────────────────────────────────────────────────────────────────── +// IDF stats over lexical docs (L1 chunks + L2 events) +let termDfMap = new Map(); +let docTokenSets = new Map(); // docId -> Set +let lexicalDocCount = 0; + +const IDF_MIN = 1.0; +const IDF_MAX = 4.0; +const BUILD_BATCH_SIZE = 500; -/** - * 清理事件摘要(移除楼层标记) - * @param {string} summary - * @returns {string} - */ function cleanSummary(summary) { return String(summary || '') .replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '') .trim(); } -/** - * 计算缓存指纹 - * @param {number} chunkCount - * @param {number} eventCount - * @returns {string} - */ -function computeFingerprint(chunkCount, eventCount) { - return `${chunkCount}:${eventCount}`; +function fnv1a32(input, seed = 0x811C9DC5) { + let hash = seed >>> 0; + const text = String(input || ''); + for (let i = 0; i < text.length; i++) { + hash ^= text.charCodeAt(i); + hash = Math.imul(hash, 0x01000193) >>> 0; + } + return hash >>> 0; +} + +function compareDocKeys(a, b) { + const ka = `${a?.type || ''}:${a?.id || ''}`; + const kb = `${b?.type || ''}:${b?.id || ''}`; + if (ka < kb) return -1; + if (ka > kb) return 1; + return 0; +} + +function computeFingerprintFromDocs(docs) { + const normalizedDocs = Array.isArray(docs) ? [...docs].sort(compareDocKeys) : []; + let hash = 0x811C9DC5; + + for (const doc of normalizedDocs) { + const payload = `${doc?.type || ''}\u001F${doc?.id || ''}\u001F${doc?.floor ?? ''}\u001F${doc?.text || ''}\u001E`; + hash = fnv1a32(payload, hash); + } + + return `${normalizedDocs.length}:${(hash >>> 0).toString(16)}`; } -/** - * 让出主线程(避免长时间阻塞 UI) - * @returns {Promise} - */ function yieldToMain() { return new Promise(resolve => setTimeout(resolve, 0)); } -// ───────────────────────────────────────────────────────────────────────── -// 文档收集 -// ───────────────────────────────────────────────────────────────────────── +function clamp(v, min, max) { + return Math.max(min, Math.min(max, v)); +} + +function normalizeTerm(term) { + return String(term || '').trim().toLowerCase(); +} + +function computeIdfFromDf(df, docCount) { + if (!docCount || docCount <= 0) return 1; + const raw = Math.log((docCount + 1) / ((df || 0) + 1)) + 1; + return clamp(raw, IDF_MIN, IDF_MAX); +} + +function computeIdf(term) { + const t = normalizeTerm(term); + if (!t || lexicalDocCount <= 0) return 1; + return computeIdfFromDf(termDfMap.get(t) || 0, lexicalDocCount); +} + +function extractUniqueTokens(text) { + return new Set(tokenizeForIndex(String(text || '')).map(normalizeTerm).filter(Boolean)); +} + +function clearIdfState() { + termDfMap = new Map(); + docTokenSets = new Map(); + lexicalDocCount = 0; +} + +function removeDocumentIdf(docId) { + const id = String(docId || ''); + if (!id) return; + + const tokens = docTokenSets.get(id); + if (!tokens) return; + + for (const token of tokens) { + const current = termDfMap.get(token) || 0; + if (current <= 1) { + termDfMap.delete(token); + } else { + termDfMap.set(token, current - 1); + } + } + + docTokenSets.delete(id); + lexicalDocCount = Math.max(0, lexicalDocCount - 1); +} + +function addDocumentIdf(docId, text) { + const id = String(docId || ''); + if (!id) return; + + // Replace semantics: remove old token set first if this id already exists. + removeDocumentIdf(id); + + const tokens = extractUniqueTokens(text); + docTokenSets.set(id, tokens); + lexicalDocCount += 1; + + for (const token of tokens) { + termDfMap.set(token, (termDfMap.get(token) || 0) + 1); + } +} + +function rebuildIdfFromDocs(docs) { + clearIdfState(); + for (const doc of docs || []) { + const id = String(doc?.id || ''); + const text = String(doc?.text || ''); + if (!id || !text.trim()) continue; + addDocumentIdf(id, text); + } +} + +function buildEventDoc(ev) { + if (!ev?.id) return null; + + const parts = []; + if (ev.title) parts.push(ev.title); + if (ev.participants?.length) parts.push(ev.participants.join(' ')); + + const summary = cleanSummary(ev.summary); + if (summary) parts.push(summary); + + const text = parts.join(' ').trim(); + if (!text) return null; + + return { + id: ev.id, + type: 'event', + floor: null, + text, + }; +} -/** - * 收集所有待索引文档 - * - * @param {object[]} chunks - getAllChunks(chatId) 返回值 - * @param {object[]} events - store.json.events - * @returns {object[]} 文档数组 - */ function collectDocuments(chunks, events) { const docs = []; - // L1 chunks + 填充 floorDocIds - for (const chunk of (chunks || [])) { + for (const chunk of chunks || []) { if (!chunk?.chunkId || !chunk.text) continue; const floor = chunk.floor ?? -1; @@ -101,48 +178,19 @@ function collectDocuments(chunks, events) { }); if (floor >= 0) { - if (!floorDocIds.has(floor)) { - floorDocIds.set(floor, []); - } + if (!floorDocIds.has(floor)) floorDocIds.set(floor, []); floorDocIds.get(floor).push(chunk.chunkId); } } - // L2 events - for (const ev of (events || [])) { - if (!ev?.id) continue; - const parts = []; - if (ev.title) parts.push(ev.title); - if (ev.participants?.length) parts.push(ev.participants.join(' ')); - const summary = cleanSummary(ev.summary); - if (summary) parts.push(summary); - const text = parts.join(' ').trim(); - if (!text) continue; - - docs.push({ - id: ev.id, - type: 'event', - floor: null, - text, - }); + for (const ev of events || []) { + const doc = buildEventDoc(ev); + if (doc) docs.push(doc); } return docs; } -// ───────────────────────────────────────────────────────────────────────── -// 索引构建(分片,不阻塞主线程) -// ───────────────────────────────────────────────────────────────────────── - -/** 每批添加的文档数 */ -const BUILD_BATCH_SIZE = 500; - -/** - * 构建 MiniSearch 索引(分片异步) - * - * @param {object[]} docs - 文档数组 - * @returns {Promise} - */ async function buildIndexAsync(docs) { const T0 = performance.now(); @@ -158,49 +206,43 @@ async function buildIndexAsync(docs) { tokenize: tokenizeForIndex, }); - if (!docs.length) { - return index; - } + if (!docs.length) return index; - // 分片添加,每批 BUILD_BATCH_SIZE 条后让出主线程 for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) { const batch = docs.slice(i, i + BUILD_BATCH_SIZE); index.addAll(batch); - // 非最后一批时让出主线程 if (i + BUILD_BATCH_SIZE < docs.length) { await yieldToMain(); } } const elapsed = Math.round(performance.now() - T0); - xbLog.info(MODULE_ID, - `索引构建完成: ${docs.length} 文档 (${elapsed}ms)` - ); - + xbLog.info(MODULE_ID, `Index built: ${docs.length} docs (${elapsed}ms)`); return index; } -// ───────────────────────────────────────────────────────────────────────── -// 检索 -// ───────────────────────────────────────────────────────────────────────── - /** * @typedef {object} LexicalSearchResult - * @property {string[]} atomIds - 命中的 L0 atom IDs - * @property {Set} atomFloors - 命中的 L0 楼层集合 - * @property {string[]} chunkIds - 命中的 L1 chunk IDs - * @property {Set} chunkFloors - 命中的 L1 楼层集合 - * @property {string[]} eventIds - 命中的 L2 event IDs - * @property {object[]} chunkScores - chunk 命中详情 [{ chunkId, score }] - * @property {number} searchTime - 检索耗时 ms + * @property {string[]} atomIds - Reserved for backward compatibility (currently empty). + * @property {Set} atomFloors - Reserved for backward compatibility (currently empty). + * @property {string[]} chunkIds - Matched L1 chunk ids sorted by weighted lexical score. + * @property {Set} chunkFloors - Floor ids covered by matched chunks. + * @property {string[]} eventIds - Matched L2 event ids sorted by weighted lexical score. + * @property {object[]} chunkScores - Weighted lexical scores for matched chunks. + * @property {boolean} idfEnabled - Whether IDF stats are available for weighting. + * @property {number} idfDocCount - Number of lexical docs used to compute IDF. + * @property {Array<{term:string,idf:number}>} topIdfTerms - Top query terms by IDF. + * @property {number} termSearches - Number of per-term MiniSearch queries executed. + * @property {number} searchTime - Total lexical search time in milliseconds. */ /** - * 在词法索引中检索 + * Search lexical index by terms, using per-term MiniSearch and IDF-weighted score aggregation. + * This keeps existing outputs compatible while adding observability fields. * - * @param {MiniSearch} index - 索引实例 - * @param {string[]} terms - 查询词列表 + * @param {MiniSearch} index + * @param {string[]} terms * @returns {LexicalSearchResult} */ export function searchLexicalIndex(index, terms) { @@ -213,6 +255,10 @@ export function searchLexicalIndex(index, terms) { chunkFloors: new Set(), eventIds: [], chunkScores: [], + idfEnabled: lexicalDocCount > 0, + idfDocCount: lexicalDocCount, + topIdfTerms: [], + termSearches: 0, searchTime: 0, }; @@ -221,79 +267,84 @@ export function searchLexicalIndex(index, terms) { return result; } - // 用所有 terms 联合查询 - const queryString = terms.join(' '); + const queryTerms = Array.from(new Set((terms || []).map(normalizeTerm).filter(Boolean))); + const weightedScores = new Map(); // docId -> score + const hitMeta = new Map(); // docId -> { type, floor } + const idfPairs = []; - let hits; - try { - hits = index.search(queryString, { - boost: { text: 1 }, - fuzzy: 0.2, - prefix: true, - combineWith: 'OR', - // 使用与索引相同的分词器 - tokenize: tokenizeForIndex, - }); - } catch (e) { - xbLog.warn(MODULE_ID, '检索失败', e); - result.searchTime = Math.round(performance.now() - T0); - return result; + for (const term of queryTerms) { + const idf = computeIdf(term); + idfPairs.push({ term, idf }); + + let hits = []; + try { + hits = index.search(term, { + boost: { text: 1 }, + fuzzy: 0.2, + prefix: true, + combineWith: 'OR', + tokenize: tokenizeForIndex, + }); + } catch (e) { + xbLog.warn(MODULE_ID, `Lexical term search failed: ${term}`, e); + continue; + } + + result.termSearches += 1; + + for (const hit of hits) { + const id = String(hit.id || ''); + if (!id) continue; + + const weighted = (hit.score || 0) * idf; + weightedScores.set(id, (weightedScores.get(id) || 0) + weighted); + + if (!hitMeta.has(id)) { + hitMeta.set(id, { + type: hit.type, + floor: hit.floor, + }); + } + } } - // 分类结果 - const chunkIdSet = new Set(); - const eventIdSet = new Set(); + idfPairs.sort((a, b) => b.idf - a.idf); + result.topIdfTerms = idfPairs.slice(0, 5); - for (const hit of hits) { - const type = hit.type; - const id = hit.id; - const floor = hit.floor; + const sortedHits = Array.from(weightedScores.entries()) + .sort((a, b) => b[1] - a[1]); - switch (type) { - case 'chunk': - if (!chunkIdSet.has(id)) { - chunkIdSet.add(id); - result.chunkIds.push(id); - result.chunkScores.push({ chunkId: id, score: hit.score }); - if (typeof floor === 'number' && floor >= 0) { - result.chunkFloors.add(floor); - } - } - break; + for (const [id, score] of sortedHits) { + const meta = hitMeta.get(id); + if (!meta) continue; - case 'event': - if (!eventIdSet.has(id)) { - eventIdSet.add(id); - result.eventIds.push(id); - } - break; + if (meta.type === 'chunk') { + result.chunkIds.push(id); + result.chunkScores.push({ chunkId: id, score }); + if (typeof meta.floor === 'number' && meta.floor >= 0) { + result.chunkFloors.add(meta.floor); + } + continue; + } + + if (meta.type === 'event') { + result.eventIds.push(id); } } result.searchTime = Math.round(performance.now() - T0); - xbLog.info(MODULE_ID, - `检索完成: terms=[${terms.slice(0, 5).join(',')}] → atoms=${result.atomIds.length} chunks=${result.chunkIds.length} events=${result.eventIds.length} (${result.searchTime}ms)` + xbLog.info( + MODULE_ID, + `Lexical search terms=[${queryTerms.slice(0, 5).join(',')}] chunks=${result.chunkIds.length} events=${result.eventIds.length} termSearches=${result.termSearches} (${result.searchTime}ms)`, ); return result; } -// ───────────────────────────────────────────────────────────────────────── -// 内部构建流程(收集数据 + 构建索引) -// ───────────────────────────────────────────────────────────────────────── - -/** - * 收集数据并构建索引 - * - * @param {string} chatId - * @returns {Promise<{index: MiniSearch, fingerprint: string}>} - */ async function collectAndBuild(chatId) { - // 清空侧索引(全量重建) floorDocIds = new Map(); - // 收集数据(不含 L0 atoms) const store = getSummaryStore(); const events = store?.json?.events || []; @@ -301,48 +352,44 @@ async function collectAndBuild(chatId) { try { chunks = await getAllChunks(chatId); } catch (e) { - xbLog.warn(MODULE_ID, '获取 chunks 失败', e); + xbLog.warn(MODULE_ID, 'Failed to load chunks', e); } - const fp = computeFingerprint(chunks.length, events.length); + const docs = collectDocuments(chunks, events); + const fp = computeFingerprintFromDocs(docs); - // 检查是否在收集过程中缓存已被其他调用更新 if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) { return { index: cachedIndex, fingerprint: fp }; } - // 收集文档(同时填充 floorDocIds) - const docs = collectDocuments(chunks, events); - - // 异步分片构建 + rebuildIdfFromDocs(docs); const index = await buildIndexAsync(docs); return { index, fingerprint: fp }; } -// ───────────────────────────────────────────────────────────────────────── -// 公开接口:getLexicalIndex(惰性获取) -// ───────────────────────────────────────────────────────────────────────── - /** - * 获取词法索引(惰性构建 + 缓存) - * - * 如果缓存有效则直接返回;否则自动构建。 - * 如果正在构建中,等待构建完成。 - * - * @returns {Promise} + * Expose IDF accessor for query-term selection in query-builder. + * If index stats are not ready, this gracefully falls back to idf=1. */ +export function getLexicalIdfAccessor() { + return { + enabled: lexicalDocCount > 0, + docCount: lexicalDocCount, + getIdf(term) { + return computeIdf(term); + }, + }; +} + export async function getLexicalIndex() { const { chatId } = getContext(); if (!chatId) return null; - // 快速路径:如果缓存存在且 chatId 未变,则直接命中 - // 指纹校验放到构建流程中完成,避免为指纹而额外读一次 IndexedDB if (cachedIndex && cachedChatId === chatId && cachedFingerprint) { return cachedIndex; } - // 正在构建中,等待结果 if (building && buildPromise) { try { await buildPromise; @@ -350,27 +397,23 @@ export async function getLexicalIndex() { return cachedIndex; } } catch { - // 构建失败,继续往下重建 + // Continue to rebuild below. } } - // 需要重建(指纹将在 collectAndBuild 内部计算并写入缓存) - xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)})`); + xbLog.info(MODULE_ID, `Lexical cache miss; rebuilding (chatId=${chatId.slice(0, 8)})`); building = true; buildPromise = collectAndBuild(chatId); try { const { index, fingerprint } = await buildPromise; - - // 原子替换缓存 cachedIndex = index; cachedChatId = chatId; cachedFingerprint = fingerprint; - return index; } catch (e) { - xbLog.error(MODULE_ID, '索引构建失败', e); + xbLog.error(MODULE_ID, 'Index build failed', e); return null; } finally { building = false; @@ -378,74 +421,29 @@ export async function getLexicalIndex() { } } -// ───────────────────────────────────────────────────────────────────────── -// 公开接口:warmupIndex(异步预建) -// ───────────────────────────────────────────────────────────────────────── - -/** - * 异步预建索引 - * - * 在 CHAT_CHANGED 时调用,后台构建索引。 - * 不阻塞调用方,不返回结果。 - * 构建完成后缓存自动更新,后续 getLexicalIndex() 直接命中。 - * - * 调用时机: - * - handleChatChanged(实体注入后) - * - L0 提取完成 - * - L2 总结完成 - */ export function warmupIndex() { const { chatId } = getContext(); - if (!chatId) return; + if (!chatId || building) return; - // 已在构建中,不重复触发 - if (building) return; - - // fire-and-forget getLexicalIndex().catch(e => { - xbLog.warn(MODULE_ID, '预热索引失败', e); + xbLog.warn(MODULE_ID, 'Warmup failed', e); }); } -// ───────────────────────────────────────────────────────────────────────── -// 公开接口:invalidateLexicalIndex(缓存失效) -// ───────────────────────────────────────────────────────────────────────── - -/** - * 使缓存失效(下次 getLexicalIndex / warmupIndex 时自动重建) - * - * 调用时机: - * - CHAT_CHANGED - * - L0 提取完成 - * - L2 总结完成 - */ export function invalidateLexicalIndex() { if (cachedIndex) { - xbLog.info(MODULE_ID, '索引缓存已失效'); + xbLog.info(MODULE_ID, 'Lexical index cache invalidated'); } cachedIndex = null; cachedChatId = null; cachedFingerprint = null; floorDocIds = new Map(); + clearIdfState(); } -// ───────────────────────────────────────────────────────────────────────── -// 增量更新接口 -// ───────────────────────────────────────────────────────────────────────── - -/** - * 为指定楼层添加 L1 chunks 到索引 - * - * 先移除该楼层旧文档,再添加新文档。 - * 如果索引不存在(缓存失效),静默跳过(下次 getLexicalIndex 全量重建)。 - * - * @param {number} floor - 楼层号 - * @param {object[]} chunks - chunk 对象列表(需有 chunkId、text、floor) - */ export function addDocumentsForFloor(floor, chunks) { if (!cachedIndex || !chunks?.length) return; - // 先移除旧文档 removeDocumentsByFloor(floor); const docs = []; @@ -453,30 +451,29 @@ export function addDocumentsForFloor(floor, chunks) { for (const chunk of chunks) { if (!chunk?.chunkId || !chunk.text) continue; - docs.push({ + + const doc = { id: chunk.chunkId, type: 'chunk', floor: chunk.floor ?? floor, text: chunk.text, - }); + }; + docs.push(doc); docIds.push(chunk.chunkId); } - if (docs.length > 0) { - cachedIndex.addAll(docs); - floorDocIds.set(floor, docIds); - xbLog.info(MODULE_ID, `增量添加: floor ${floor}, ${docs.length} 个 chunk`); + if (!docs.length) return; + + cachedIndex.addAll(docs); + floorDocIds.set(floor, docIds); + + for (const doc of docs) { + addDocumentIdf(doc.id, doc.text); } + + xbLog.info(MODULE_ID, `Incremental add floor=${floor} chunks=${docs.length}`); } -/** - * 从索引中移除指定楼层的所有 L1 chunk 文档 - * - * 使用 MiniSearch discard()(软删除)。 - * 如果索引不存在,静默跳过。 - * - * @param {number} floor - 楼层号 - */ export function removeDocumentsByFloor(floor) { if (!cachedIndex) return; @@ -487,55 +484,39 @@ export function removeDocumentsByFloor(floor) { try { cachedIndex.discard(id); } catch { - // 文档可能不存在(已被全量重建替换) + // Ignore if the doc was already removed/rebuilt. } + removeDocumentIdf(id); } floorDocIds.delete(floor); - xbLog.info(MODULE_ID, `增量移除: floor ${floor}, ${docIds.length} 个文档`); + xbLog.info(MODULE_ID, `Incremental remove floor=${floor} chunks=${docIds.length}`); } -/** - * 将新 L2 事件添加到索引 - * - * 如果事件 ID 已存在,先 discard 再 add(覆盖)。 - * 如果索引不存在,静默跳过。 - * - * @param {object[]} events - 事件对象列表(需有 id、title、summary 等) - */ export function addEventDocuments(events) { if (!cachedIndex || !events?.length) return; const docs = []; for (const ev of events) { - if (!ev?.id) continue; + const doc = buildEventDoc(ev); + if (!doc) continue; - const parts = []; - if (ev.title) parts.push(ev.title); - if (ev.participants?.length) parts.push(ev.participants.join(' ')); - const summary = cleanSummary(ev.summary); - if (summary) parts.push(summary); - const text = parts.join(' ').trim(); - if (!text) continue; - - // 覆盖:先尝试移除旧的 try { - cachedIndex.discard(ev.id); + cachedIndex.discard(doc.id); } catch { - // 不存在则忽略 + // Ignore if previous document does not exist. } - - docs.push({ - id: ev.id, - type: 'event', - floor: null, - text, - }); + removeDocumentIdf(doc.id); + docs.push(doc); } - if (docs.length > 0) { - cachedIndex.addAll(docs); - xbLog.info(MODULE_ID, `增量添加: ${docs.length} 个事件`); + if (!docs.length) return; + + cachedIndex.addAll(docs); + for (const doc of docs) { + addDocumentIdf(doc.id, doc.text); } + + xbLog.info(MODULE_ID, `Incremental add events=${docs.length}`); } diff --git a/modules/story-summary/vector/retrieval/metrics.js b/modules/story-summary/vector/retrieval/metrics.js index 4530788..ecd06b4 100644 --- a/modules/story-summary/vector/retrieval/metrics.js +++ b/modules/story-summary/vector/retrieval/metrics.js @@ -52,6 +52,10 @@ export function createMetrics() { eventHits: 0, searchTime: 0, indexReadyTime: 0, + idfEnabled: false, + idfDocCount: 0, + topIdfTerms: [], + termSearches: 0, eventFilteredByDense: 0, floorFilteredByDense: 0, }, @@ -274,6 +278,20 @@ export function formatMetricsLog(metrics) { if (m.lexical.indexReadyTime > 0) { lines.push(`├─ index_ready_time: ${m.lexical.indexReadyTime}ms`); } + lines.push(`├─ idf_enabled: ${!!m.lexical.idfEnabled}`); + if (m.lexical.idfDocCount > 0) { + lines.push(`├─ idf_doc_count: ${m.lexical.idfDocCount}`); + } + if ((m.lexical.topIdfTerms || []).length > 0) { + const topIdfText = m.lexical.topIdfTerms + .slice(0, 5) + .map(x => `${x.term}:${x.idf}`) + .join(', '); + lines.push(`├─ top_idf_terms: [${topIdfText}]`); + } + if (m.lexical.termSearches > 0) { + lines.push(`├─ term_searches: ${m.lexical.termSearches}`); + } if (m.lexical.eventFilteredByDense > 0) { lines.push(`├─ event_filtered_by_dense: ${m.lexical.eventFilteredByDense}`); } diff --git a/modules/story-summary/vector/retrieval/query-builder.js b/modules/story-summary/vector/retrieval/query-builder.js index c5593a0..714a0a9 100644 --- a/modules/story-summary/vector/retrieval/query-builder.js +++ b/modules/story-summary/vector/retrieval/query-builder.js @@ -20,6 +20,7 @@ import { getContext } from '../../../../../../../extensions.js'; import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText, buildCharacterPools } from './entity-lexicon.js'; +import { getLexicalIdfAccessor } from './lexical-index.js'; import { getSummaryStore } from '../../data/store.js'; import { filterText } from '../utils/text-filter.js'; import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js'; @@ -106,6 +107,7 @@ export function computeLengthFactor(charCount) { function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) { if (!text) return []; + const idfAccessor = getLexicalIdfAccessor(); const tokens = tokenizerTokenizeForIndex(text); const freq = new Map(); for (const token of tokens) { @@ -115,9 +117,13 @@ function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) { } return Array.from(freq.entries()) - .sort((a, b) => b[1] - a[1]) + .map(([term, tf]) => { + const idf = idfAccessor.enabled ? idfAccessor.getIdf(term) : 1; + return { term, tf, score: tf * idf }; + }) + .sort((a, b) => (b.score - a.score) || (b.tf - a.tf)) .slice(0, maxTerms) - .map(([term]) => term); + .map(x => x.term); } // ───────────────────────────────────────────────────────────────────────── diff --git a/modules/story-summary/vector/retrieval/recall.js b/modules/story-summary/vector/retrieval/recall.js index b049e32..774f643 100644 --- a/modules/story-summary/vector/retrieval/recall.js +++ b/modules/story-summary/vector/retrieval/recall.js @@ -984,6 +984,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) { : CONFIG.LAST_MESSAGES_K; const lastMessages = getLastMessages(chat, lastMessagesCount, excludeLastAi); + // Non-blocking preload: keep recall latency stable. + // If not ready yet, query-builder will gracefully fall back to TF terms. + getLexicalIndex().catch((e) => { + xbLog.warn(MODULE_ID, 'Preload lexical index failed; continue with TF fallback', e); + }); + const bundle = buildQueryBundle(lastMessages, pendingUserMessage); const focusTerms = bundle.focusTerms || bundle.focusEntities || []; const focusCharacters = bundle.focusCharacters || []; @@ -1161,6 +1167,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) { atomIds: [], atomFloors: new Set(), chunkIds: [], chunkFloors: new Set(), eventIds: [], chunkScores: [], searchTime: 0, + idfEnabled: false, idfDocCount: 0, topIdfTerms: [], termSearches: 0, }; let indexReadyTime = 0; @@ -1184,6 +1191,10 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) { metrics.lexical.searchTime = lexicalResult.searchTime || 0; metrics.lexical.indexReadyTime = indexReadyTime; metrics.lexical.terms = bundle.lexicalTerms.slice(0, 10); + metrics.lexical.idfEnabled = !!lexicalResult.idfEnabled; + metrics.lexical.idfDocCount = lexicalResult.idfDocCount || 0; + metrics.lexical.topIdfTerms = lexicalResult.topIdfTerms || []; + metrics.lexical.termSearches = lexicalResult.termSearches || 0; } // 合并 L2 events(lexical 命中但 dense 未命中的 events) @@ -1238,7 +1249,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) { } xbLog.info(MODULE_ID, - `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)` + `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} idfEnabled=${lexicalResult.idfEnabled ? 'yes' : 'no'} idfDocs=${lexicalResult.idfDocCount || 0} termSearches=${lexicalResult.termSearches || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)` ); // ═══════════════════════════════════════════════════════════════════ diff --git a/modules/story-summary/vector/utils/stopwords-base.js b/modules/story-summary/vector/utils/stopwords-base.js new file mode 100644 index 0000000..2ce6fa0 --- /dev/null +++ b/modules/story-summary/vector/utils/stopwords-base.js @@ -0,0 +1,2231 @@ +// Auto-generated stopword baseline for story-summary. +// Source: stopwords-iso (MIT), snapshot files under ./stopwords-data +// Languages merged: zh + ja + en +// Do not edit manually. Update snapshot files then regenerate. + +export const BASE_STOP_WORDS = [ + "、", + "。", + "〈", + "〉", + "《", + "》", + "一", + "一个", + "一些", + "一何", + "一切", + "一则", + "一方面", + "一旦", + "一来", + "一样", + "一种", + "一般", + "一转眼", + "七", + "万一", + "三", + "上", + "上下", + "下", + "不", + "不仅", + "不但", + "不光", + "不单", + "不只", + "不外乎", + "不如", + "不妨", + "不尽", + "不尽然", + "不得", + "不怕", + "不惟", + "不成", + "不拘", + "不料", + "不是", + "不比", + "不然", + "不特", + "不独", + "不管", + "不至于", + "不若", + "不论", + "不过", + "不问", + "与", + "与其", + "与其说", + "与否", + "与此同时", + "且", + "且不说", + "且说", + "两者", + "个", + "个别", + "中", + "临", + "为", + "为了", + "为什么", + "为何", + "为止", + "为此", + "为着", + "乃", + "乃至", + "乃至于", + "么", + "之", + "之一", + "之所以", + "之类", + "乌乎", + "乎", + "乘", + "九", + "也", + "也好", + "也罢", + "了", + "二", + "二来", + "于", + "于是", + "于是乎", + "云云", + "云尔", + "五", + "些", + "亦", + "人", + "人们", + "人家", + "什", + "什么", + "什么样", + "今", + "介于", + "仍", + "仍旧", + "从", + "从此", + "从而", + "他", + "他人", + "他们", + "他们们", + "以", + "以上", + "以为", + "以便", + "以免", + "以及", + "以故", + "以期", + "以来", + "以至", + "以至于", + "以致", + "们", + "任", + "任何", + "任凭", + "会", + "似的", + "但", + "但凡", + "但是", + "何", + "何以", + "何况", + "何处", + "何时", + "余外", + "作为", + "你", + "你们", + "使", + "使得", + "例如", + "依", + "依据", + "依照", + "便于", + "俺", + "俺们", + "倘", + "倘使", + "倘或", + "倘然", + "倘若", + "借", + "借傥然", + "假使", + "假如", + "假若", + "做", + "像", + "儿", + "先不先", + "光", + "光是", + "全体", + "全部", + "八", + "六", + "兮", + "共", + "关于", + "关于具体地说", + "其", + "其一", + "其中", + "其二", + "其他", + "其余", + "其它", + "其次", + "具体地说", + "具体说来", + "兼之", + "内", + "再", + "再其次", + "再则", + "再有", + "再者", + "再者说", + "再说", + "冒", + "冲", + "况且", + "几", + "几时", + "凡", + "凡是", + "凭", + "凭借", + "出于", + "出来", + "分", + "分别", + "则", + "则甚", + "别", + "别人", + "别处", + "别是", + "别的", + "别管", + "别说", + "到", + "前后", + "前此", + "前者", + "加之", + "加以", + "区", + "即", + "即令", + "即使", + "即便", + "即如", + "即或", + "即若", + "却", + "去", + "又", + "又及", + "及", + "及其", + "及至", + "反之", + "反而", + "反过来", + "反过来说", + "受到", + "另", + "另一方面", + "另外", + "另悉", + "只", + "只当", + "只怕", + "只是", + "只有", + "只消", + "只要", + "只限", + "叫", + "叮咚", + "可", + "可以", + "可是", + "可见", + "各", + "各个", + "各位", + "各种", + "各自", + "同", + "同时", + "后", + "后者", + "向", + "向使", + "向着", + "吓", + "吗", + "否则", + "吧", + "吧哒", + "含", + "吱", + "呀", + "呃", + "呕", + "呗", + "呜", + "呜呼", + "呢", + "呵", + "呵呵", + "呸", + "呼哧", + "咋", + "和", + "咚", + "咦", + "咧", + "咱", + "咱们", + "咳", + "哇", + "哈", + "哈哈", + "哉", + "哎", + "哎呀", + "哎哟", + "哗", + "哟", + "哦", + "哩", + "哪", + "哪个", + "哪些", + "哪儿", + "哪天", + "哪年", + "哪怕", + "哪样", + "哪边", + "哪里", + "哼", + "哼唷", + "唉", + "唯有", + "啊", + "啐", + "啥", + "啦", + "啪达", + "啷当", + "喂", + "喏", + "喔唷", + "喽", + "嗡", + "嗡嗡", + "嗬", + "嗯", + "嗳", + "嘎", + "嘎登", + "嘘", + "嘛", + "嘻", + "嘿", + "嘿嘿", + "四", + "因", + "因为", + "因了", + "因此", + "因着", + "因而", + "固然", + "在", + "在下", + "在于", + "地", + "基于", + "处在", + "多", + "多么", + "多少", + "大", + "大家", + "她", + "她们", + "好", + "如", + "如上", + "如上所述", + "如下", + "如何", + "如其", + "如同", + "如是", + "如果", + "如此", + "如若", + "始而", + "孰料", + "孰知", + "宁", + "宁可", + "宁愿", + "宁肯", + "它", + "它们", + "对", + "对于", + "对待", + "对方", + "对比", + "将", + "小", + "尔", + "尔后", + "尔尔", + "尚且", + "就", + "就是", + "就是了", + "就是说", + "就算", + "就要", + "尽", + "尽管", + "尽管如此", + "岂但", + "己", + "已", + "已矣", + "巴", + "巴巴", + "年", + "并", + "并且", + "庶乎", + "庶几", + "开外", + "开始", + "归", + "归齐", + "当", + "当地", + "当然", + "当着", + "彼", + "彼时", + "彼此", + "往", + "待", + "很", + "得", + "得了", + "怎", + "怎么", + "怎么办", + "怎么样", + "怎奈", + "怎样", + "总之", + "总的来看", + "总的来说", + "总的说来", + "总而言之", + "恰恰相反", + "您", + "惟其", + "慢说", + "我", + "我们", + "或", + "或则", + "或是", + "或曰", + "或者", + "截至", + "所", + "所以", + "所在", + "所幸", + "所有", + "才", + "才能", + "打", + "打从", + "把", + "抑或", + "拿", + "按", + "按照", + "换句话说", + "换言之", + "据", + "据此", + "接着", + "故", + "故此", + "故而", + "旁人", + "无", + "无宁", + "无论", + "既", + "既往", + "既是", + "既然", + "日", + "时", + "时候", + "是", + "是以", + "是的", + "更", + "曾", + "替", + "替代", + "最", + "月", + "有", + "有些", + "有关", + "有及", + "有时", + "有的", + "望", + "朝", + "朝着", + "本", + "本人", + "本地", + "本着", + "本身", + "来", + "来着", + "来自", + "来说", + "极了", + "果然", + "果真", + "某", + "某个", + "某些", + "某某", + "根据", + "欤", + "正值", + "正如", + "正巧", + "正是", + "此", + "此地", + "此处", + "此外", + "此时", + "此次", + "此间", + "毋宁", + "每", + "每当", + "比", + "比及", + "比如", + "比方", + "没奈何", + "沿", + "沿着", + "漫说", + "点", + "焉", + "然则", + "然后", + "然而", + "照", + "照着", + "犹且", + "犹自", + "甚且", + "甚么", + "甚或", + "甚而", + "甚至", + "甚至于", + "用", + "用来", + "由", + "由于", + "由是", + "由此", + "由此可见", + "的", + "的确", + "的话", + "直到", + "相对而言", + "省得", + "看", + "眨眼", + "着", + "着呢", + "矣", + "矣乎", + "矣哉", + "离", + "秒", + "称", + "竟而", + "第", + "等", + "等到", + "等等", + "简言之", + "管", + "类如", + "紧接着", + "纵", + "纵令", + "纵使", + "纵然", + "经", + "经过", + "结果", + "给", + "继之", + "继后", + "继而", + "综上所述", + "罢了", + "者", + "而", + "而且", + "而况", + "而后", + "而外", + "而已", + "而是", + "而言", + "能", + "能否", + "腾", + "自", + "自个儿", + "自从", + "自各儿", + "自后", + "自家", + "自己", + "自打", + "自身", + "至", + "至于", + "至今", + "至若", + "致", + "般的", + "若", + "若夫", + "若是", + "若果", + "若非", + "莫不然", + "莫如", + "莫若", + "虽", + "虽则", + "虽然", + "虽说", + "被", + "要", + "要不", + "要不是", + "要不然", + "要么", + "要是", + "譬喻", + "譬如", + "让", + "许多", + "论", + "设使", + "设或", + "设若", + "诚如", + "诚然", + "该", + "说", + "说来", + "请", + "诸", + "诸位", + "诸如", + "谁", + "谁人", + "谁料", + "谁知", + "贼死", + "赖以", + "赶", + "起", + "起见", + "趁", + "趁着", + "越是", + "距", + "跟", + "较", + "较之", + "边", + "过", + "还", + "还是", + "还有", + "还要", + "这", + "这一来", + "这个", + "这么", + "这么些", + "这么样", + "这么点儿", + "这些", + "这会儿", + "这儿", + "这就是说", + "这时", + "这样", + "这次", + "这般", + "这边", + "这里", + "进而", + "连", + "连同", + "逐步", + "通过", + "遵循", + "遵照", + "那", + "那个", + "那么", + "那么些", + "那么样", + "那些", + "那会儿", + "那儿", + "那时", + "那样", + "那般", + "那边", + "那里", + "都", + "鄙人", + "鉴于", + "针对", + "阿", + "除", + "除了", + "除外", + "除开", + "除此之外", + "除非", + "随", + "随后", + "随时", + "随着", + "难道说", + "零", + "非", + "非但", + "非徒", + "非特", + "非独", + "靠", + "顺", + "顺着", + "首先", + "︿", + "!", + "#", + "$", + "%", + "&", + "(", + ")", + "*", + "+", + ",", + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + ":", + ";", + "<", + ">", + "?", + "@", + "[", + "]", + "{", + "|", + "}", + "~", + "¥", + "あそこ", + "あっ", + "あの", + "あのかた", + "あの人", + "あり", + "あります", + "ある", + "あれ", + "い", + "いう", + "います", + "いる", + "う", + "うち", + "え", + "お", + "および", + "おり", + "おります", + "か", + "かつて", + "から", + "が", + "き", + "ここ", + "こちら", + "こと", + "この", + "これ", + "これら", + "さ", + "さらに", + "し", + "しかし", + "する", + "ず", + "せ", + "せる", + "そこ", + "そして", + "その", + "その他", + "その後", + "それ", + "それぞれ", + "それで", + "た", + "ただし", + "たち", + "ため", + "たり", + "だ", + "だっ", + "だれ", + "つ", + "て", + "で", + "でき", + "できる", + "です", + "では", + "でも", + "と", + "という", + "といった", + "とき", + "ところ", + "として", + "とともに", + "とも", + "と共に", + "どこ", + "どの", + "な", + "ない", + "なお", + "なかっ", + "ながら", + "なく", + "なっ", + "など", + "なに", + "なら", + "なり", + "なる", + "なん", + "に", + "において", + "における", + "について", + "にて", + "によって", + "により", + "による", + "に対して", + "に対する", + "に関する", + "の", + "ので", + "のみ", + "は", + "ば", + "へ", + "ほか", + "ほとんど", + "ほど", + "ます", + "また", + "または", + "まで", + "も", + "もの", + "ものの", + "や", + "よう", + "より", + "ら", + "られ", + "られる", + "れ", + "れる", + "を", + "ん", + "及び", + "彼女", + "我々", + "特に", + "私", + "私達", + "貴方", + "貴方方", + "'ll", + "'tis", + "'twas", + "'ve", + "10", + "39", + "a", + "a's", + "able", + "ableabout", + "about", + "above", + "abroad", + "abst", + "accordance", + "according", + "accordingly", + "across", + "act", + "actually", + "ad", + "added", + "adj", + "adopted", + "ae", + "af", + "affected", + "affecting", + "affects", + "after", + "afterwards", + "ag", + "again", + "against", + "ago", + "ah", + "ahead", + "ai", + "ain't", + "aint", + "al", + "all", + "allow", + "allows", + "almost", + "alone", + "along", + "alongside", + "already", + "also", + "although", + "always", + "am", + "amid", + "amidst", + "among", + "amongst", + "amoungst", + "amount", + "an", + "and", + "announce", + "another", + "any", + "anybody", + "anyhow", + "anymore", + "anyone", + "anything", + "anyway", + "anyways", + "anywhere", + "ao", + "apart", + "apparently", + "appear", + "appreciate", + "appropriate", + "approximately", + "aq", + "ar", + "are", + "area", + "areas", + "aren", + "aren't", + "arent", + "arise", + "around", + "arpa", + "as", + "aside", + "ask", + "asked", + "asking", + "asks", + "associated", + "at", + "au", + "auth", + "available", + "aw", + "away", + "awfully", + "az", + "b", + "ba", + "back", + "backed", + "backing", + "backs", + "backward", + "backwards", + "bb", + "bd", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "began", + "begin", + "beginning", + "beginnings", + "begins", + "behind", + "being", + "beings", + "believe", + "below", + "beside", + "besides", + "best", + "better", + "between", + "beyond", + "bf", + "bg", + "bh", + "bi", + "big", + "bill", + "billion", + "biol", + "bj", + "bm", + "bn", + "bo", + "both", + "bottom", + "br", + "brief", + "briefly", + "bs", + "bt", + "but", + "buy", + "bv", + "bw", + "by", + "bz", + "c", + "c'mon", + "c's", + "ca", + "call", + "came", + "can", + "can't", + "cannot", + "cant", + "caption", + "case", + "cases", + "cause", + "causes", + "cc", + "cd", + "certain", + "certainly", + "cf", + "cg", + "ch", + "changes", + "ci", + "ck", + "cl", + "clear", + "clearly", + "click", + "cm", + "cmon", + "cn", + "co", + "co.", + "com", + "come", + "comes", + "computer", + "con", + "concerning", + "consequently", + "consider", + "considering", + "contain", + "containing", + "contains", + "copy", + "corresponding", + "could", + "could've", + "couldn", + "couldn't", + "couldnt", + "course", + "cr", + "cry", + "cs", + "cu", + "currently", + "cv", + "cx", + "cy", + "cz", + "d", + "dare", + "daren't", + "darent", + "date", + "de", + "dear", + "definitely", + "describe", + "described", + "despite", + "detail", + "did", + "didn", + "didn't", + "didnt", + "differ", + "different", + "differently", + "directly", + "dj", + "dk", + "dm", + "do", + "does", + "doesn", + "doesn't", + "doesnt", + "doing", + "don", + "don't", + "done", + "dont", + "doubtful", + "down", + "downed", + "downing", + "downs", + "downwards", + "due", + "during", + "dz", + "e", + "each", + "early", + "ec", + "ed", + "edu", + "ee", + "effect", + "eg", + "eh", + "eight", + "eighty", + "either", + "eleven", + "else", + "elsewhere", + "empty", + "end", + "ended", + "ending", + "ends", + "enough", + "entirely", + "er", + "es", + "especially", + "et", + "et-al", + "etc", + "even", + "evenly", + "ever", + "evermore", + "every", + "everybody", + "everyone", + "everything", + "everywhere", + "ex", + "exactly", + "example", + "except", + "f", + "face", + "faces", + "fact", + "facts", + "fairly", + "far", + "farther", + "felt", + "few", + "fewer", + "ff", + "fi", + "fifteen", + "fifth", + "fifty", + "fify", + "fill", + "find", + "finds", + "fire", + "first", + "five", + "fix", + "fj", + "fk", + "fm", + "fo", + "followed", + "following", + "follows", + "for", + "forever", + "former", + "formerly", + "forth", + "forty", + "forward", + "found", + "four", + "fr", + "free", + "from", + "front", + "full", + "fully", + "further", + "furthered", + "furthering", + "furthermore", + "furthers", + "fx", + "g", + "ga", + "gave", + "gb", + "gd", + "ge", + "general", + "generally", + "get", + "gets", + "getting", + "gf", + "gg", + "gh", + "gi", + "give", + "given", + "gives", + "giving", + "gl", + "gm", + "gmt", + "gn", + "go", + "goes", + "going", + "gone", + "good", + "goods", + "got", + "gotten", + "gov", + "gp", + "gq", + "gr", + "great", + "greater", + "greatest", + "greetings", + "group", + "grouped", + "grouping", + "groups", + "gs", + "gt", + "gu", + "gw", + "gy", + "h", + "had", + "hadn't", + "hadnt", + "half", + "happens", + "hardly", + "has", + "hasn", + "hasn't", + "hasnt", + "have", + "haven", + "haven't", + "havent", + "having", + "he", + "he'd", + "he'll", + "he's", + "hed", + "hell", + "hello", + "help", + "hence", + "her", + "here", + "here's", + "hereafter", + "hereby", + "herein", + "heres", + "hereupon", + "hers", + "herself", + "herse”", + "hes", + "hi", + "hid", + "high", + "higher", + "highest", + "him", + "himself", + "himse”", + "his", + "hither", + "hk", + "hm", + "hn", + "home", + "homepage", + "hopefully", + "how", + "how'd", + "how'll", + "how's", + "howbeit", + "however", + "hr", + "ht", + "htm", + "html", + "http", + "hu", + "hundred", + "i", + "i'd", + "i'll", + "i'm", + "i've", + "i.e.", + "id", + "ie", + "if", + "ignored", + "ii", + "il", + "ill", + "im", + "immediate", + "immediately", + "importance", + "important", + "in", + "inasmuch", + "inc", + "inc.", + "indeed", + "index", + "indicate", + "indicated", + "indicates", + "information", + "inner", + "inside", + "insofar", + "instead", + "int", + "interest", + "interested", + "interesting", + "interests", + "into", + "invention", + "inward", + "io", + "iq", + "ir", + "is", + "isn", + "isn't", + "isnt", + "it", + "it'd", + "it'll", + "it's", + "itd", + "itll", + "its", + "itself", + "itse”", + "ive", + "j", + "je", + "jm", + "jo", + "join", + "jp", + "just", + "k", + "ke", + "keep", + "keeps", + "kept", + "keys", + "kg", + "kh", + "ki", + "kind", + "km", + "kn", + "knew", + "know", + "known", + "knows", + "kp", + "kr", + "kw", + "ky", + "kz", + "l", + "la", + "large", + "largely", + "last", + "lately", + "later", + "latest", + "latter", + "latterly", + "lb", + "lc", + "least", + "length", + "less", + "lest", + "let", + "let's", + "lets", + "li", + "like", + "liked", + "likely", + "likewise", + "line", + "little", + "lk", + "ll", + "long", + "longer", + "longest", + "look", + "looking", + "looks", + "low", + "lower", + "lr", + "ls", + "lt", + "ltd", + "lu", + "lv", + "ly", + "m", + "ma", + "made", + "mainly", + "make", + "makes", + "making", + "man", + "many", + "may", + "maybe", + "mayn't", + "maynt", + "mc", + "md", + "me", + "mean", + "means", + "meantime", + "meanwhile", + "member", + "members", + "men", + "merely", + "mg", + "mh", + "microsoft", + "might", + "might've", + "mightn't", + "mightnt", + "mil", + "mill", + "million", + "mine", + "minus", + "miss", + "mk", + "ml", + "mm", + "mn", + "mo", + "more", + "moreover", + "most", + "mostly", + "move", + "mp", + "mq", + "mr", + "mrs", + "ms", + "msie", + "mt", + "mu", + "much", + "mug", + "must", + "must've", + "mustn't", + "mustnt", + "mv", + "mw", + "mx", + "my", + "myself", + "myse”", + "mz", + "n", + "na", + "name", + "namely", + "nay", + "nc", + "nd", + "ne", + "near", + "nearly", + "necessarily", + "necessary", + "need", + "needed", + "needing", + "needn't", + "neednt", + "needs", + "neither", + "net", + "netscape", + "never", + "neverf", + "neverless", + "nevertheless", + "new", + "newer", + "newest", + "next", + "nf", + "ng", + "ni", + "nine", + "ninety", + "nl", + "no", + "no-one", + "nobody", + "non", + "none", + "nonetheless", + "noone", + "nor", + "normally", + "nos", + "not", + "noted", + "nothing", + "notwithstanding", + "novel", + "now", + "nowhere", + "np", + "nr", + "nu", + "null", + "number", + "numbers", + "nz", + "o", + "obtain", + "obtained", + "obviously", + "of", + "off", + "often", + "oh", + "ok", + "okay", + "old", + "older", + "oldest", + "om", + "omitted", + "on", + "once", + "one", + "one's", + "ones", + "only", + "onto", + "open", + "opened", + "opening", + "opens", + "opposite", + "or", + "ord", + "order", + "ordered", + "ordering", + "orders", + "org", + "other", + "others", + "otherwise", + "ought", + "oughtn't", + "oughtnt", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overall", + "owing", + "own", + "p", + "pa", + "page", + "pages", + "part", + "parted", + "particular", + "particularly", + "parting", + "parts", + "past", + "pe", + "per", + "perhaps", + "pf", + "pg", + "ph", + "pk", + "pl", + "place", + "placed", + "places", + "please", + "plus", + "pm", + "pmid", + "pn", + "point", + "pointed", + "pointing", + "points", + "poorly", + "possible", + "possibly", + "potentially", + "pp", + "pr", + "predominantly", + "present", + "presented", + "presenting", + "presents", + "presumably", + "previously", + "primarily", + "probably", + "problem", + "problems", + "promptly", + "proud", + "provided", + "provides", + "pt", + "put", + "puts", + "pw", + "py", + "q", + "qa", + "que", + "quickly", + "quite", + "qv", + "r", + "ran", + "rather", + "rd", + "re", + "readily", + "really", + "reasonably", + "recent", + "recently", + "ref", + "refs", + "regarding", + "regardless", + "regards", + "related", + "relatively", + "research", + "reserved", + "respectively", + "resulted", + "resulting", + "results", + "right", + "ring", + "ro", + "room", + "rooms", + "round", + "ru", + "run", + "rw", + "s", + "sa", + "said", + "same", + "saw", + "say", + "saying", + "says", + "sb", + "sc", + "sd", + "se", + "sec", + "second", + "secondly", + "seconds", + "section", + "see", + "seeing", + "seem", + "seemed", + "seeming", + "seems", + "seen", + "sees", + "self", + "selves", + "sensible", + "sent", + "serious", + "seriously", + "seven", + "seventy", + "several", + "sg", + "sh", + "shall", + "shan't", + "shant", + "she", + "she'd", + "she'll", + "she's", + "shed", + "shell", + "shes", + "should", + "should've", + "shouldn", + "shouldn't", + "shouldnt", + "show", + "showed", + "showing", + "shown", + "showns", + "shows", + "si", + "side", + "sides", + "significant", + "significantly", + "similar", + "similarly", + "since", + "sincere", + "site", + "six", + "sixty", + "sj", + "sk", + "sl", + "slightly", + "sm", + "small", + "smaller", + "smallest", + "sn", + "so", + "some", + "somebody", + "someday", + "somehow", + "someone", + "somethan", + "something", + "sometime", + "sometimes", + "somewhat", + "somewhere", + "soon", + "sorry", + "specifically", + "specified", + "specify", + "specifying", + "sr", + "st", + "state", + "states", + "still", + "stop", + "strongly", + "su", + "sub", + "substantially", + "successfully", + "such", + "sufficiently", + "suggest", + "sup", + "sure", + "sv", + "sy", + "system", + "sz", + "t", + "t's", + "take", + "taken", + "taking", + "tc", + "td", + "tell", + "ten", + "tends", + "test", + "text", + "tf", + "tg", + "th", + "than", + "thank", + "thanks", + "thanx", + "that", + "that'll", + "that's", + "that've", + "thatll", + "thats", + "thatve", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "thence", + "there", + "there'd", + "there'll", + "there're", + "there's", + "there've", + "thereafter", + "thereby", + "thered", + "therefore", + "therein", + "therell", + "thereof", + "therere", + "theres", + "thereto", + "thereupon", + "thereve", + "these", + "they", + "they'd", + "they'll", + "they're", + "they've", + "theyd", + "theyll", + "theyre", + "theyve", + "thick", + "thin", + "thing", + "things", + "think", + "thinks", + "third", + "thirty", + "this", + "thorough", + "thoroughly", + "those", + "thou", + "though", + "thoughh", + "thought", + "thoughts", + "thousand", + "three", + "throug", + "through", + "throughout", + "thru", + "thus", + "til", + "till", + "tip", + "tis", + "tj", + "tk", + "tm", + "tn", + "to", + "today", + "together", + "too", + "took", + "top", + "toward", + "towards", + "tp", + "tr", + "tried", + "tries", + "trillion", + "truly", + "try", + "trying", + "ts", + "tt", + "turn", + "turned", + "turning", + "turns", + "tv", + "tw", + "twas", + "twelve", + "twenty", + "twice", + "two", + "tz", + "u", + "ua", + "ug", + "uk", + "um", + "un", + "under", + "underneath", + "undoing", + "unfortunately", + "unless", + "unlike", + "unlikely", + "until", + "unto", + "up", + "upon", + "ups", + "upwards", + "us", + "use", + "used", + "useful", + "usefully", + "usefulness", + "uses", + "using", + "usually", + "uucp", + "uy", + "uz", + "v", + "va", + "value", + "various", + "vc", + "ve", + "versus", + "very", + "vg", + "vi", + "via", + "viz", + "vn", + "vol", + "vols", + "vs", + "vu", + "w", + "want", + "wanted", + "wanting", + "wants", + "was", + "wasn", + "wasn't", + "wasnt", + "way", + "ways", + "we", + "we'd", + "we'll", + "we're", + "we've", + "web", + "webpage", + "website", + "wed", + "welcome", + "well", + "wells", + "went", + "were", + "weren", + "weren't", + "werent", + "weve", + "wf", + "what", + "what'd", + "what'll", + "what's", + "what've", + "whatever", + "whatll", + "whats", + "whatve", + "when", + "when'd", + "when'll", + "when's", + "whence", + "whenever", + "where", + "where'd", + "where'll", + "where's", + "whereafter", + "whereas", + "whereby", + "wherein", + "wheres", + "whereupon", + "wherever", + "whether", + "which", + "whichever", + "while", + "whilst", + "whim", + "whither", + "who", + "who'd", + "who'll", + "who's", + "whod", + "whoever", + "whole", + "wholl", + "whom", + "whomever", + "whos", + "whose", + "why", + "why'd", + "why'll", + "why's", + "widely", + "width", + "will", + "willing", + "wish", + "with", + "within", + "without", + "won", + "won't", + "wonder", + "wont", + "words", + "work", + "worked", + "working", + "works", + "world", + "would", + "would've", + "wouldn", + "wouldn't", + "wouldnt", + "ws", + "www", + "x", + "y", + "ye", + "year", + "years", + "yes", + "yet", + "you", + "you'd", + "you'll", + "you're", + "you've", + "youd", + "youll", + "young", + "younger", + "youngest", + "your", + "youre", + "yours", + "yourself", + "yourselves", + "youve", + "yt", + "yu", + "z", + "za", + "zero", + "zm", + "zr" +]; diff --git a/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt b/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt new file mode 100644 index 0000000..0076d3c --- /dev/null +++ b/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2020 Gene Diaz + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/modules/story-summary/vector/utils/stopwords-data/SOURCES.md b/modules/story-summary/vector/utils/stopwords-data/SOURCES.md new file mode 100644 index 0000000..1402c7e --- /dev/null +++ b/modules/story-summary/vector/utils/stopwords-data/SOURCES.md @@ -0,0 +1,15 @@ +# stopwords sources for story-summary + +- Dataset: `stopwords-iso` (npm package, version 1.1.0) +- Repository: https://github.com/stopwords-iso/stopwords-iso +- License: MIT +- Snapshot date: 2026-02-16 +- Languages used: `zh`, `ja`, `en` +- Local snapshot files: + - `stopwords-iso.zh.txt` + - `stopwords-iso.ja.txt` + - `stopwords-iso.en.txt` + +Generation note: +- `modules/story-summary/vector/utils/stopwords-base.js` is generated from these snapshot files. +- Keep `stopwords-patch.js` for tiny domain overrides only. diff --git a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt new file mode 100644 index 0000000..0efb051 --- /dev/null +++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt @@ -0,0 +1,1298 @@ +'ll +'tis +'twas +'ve +10 +39 +a +a's +able +ableabout +about +above +abroad +abst +accordance +according +accordingly +across +act +actually +ad +added +adj +adopted +ae +af +affected +affecting +affects +after +afterwards +ag +again +against +ago +ah +ahead +ai +ain't +aint +al +all +allow +allows +almost +alone +along +alongside +already +also +although +always +am +amid +amidst +among +amongst +amoungst +amount +an +and +announce +another +any +anybody +anyhow +anymore +anyone +anything +anyway +anyways +anywhere +ao +apart +apparently +appear +appreciate +appropriate +approximately +aq +ar +are +area +areas +aren +aren't +arent +arise +around +arpa +as +aside +ask +asked +asking +asks +associated +at +au +auth +available +aw +away +awfully +az +b +ba +back +backed +backing +backs +backward +backwards +bb +bd +be +became +because +become +becomes +becoming +been +before +beforehand +began +begin +beginning +beginnings +begins +behind +being +beings +believe +below +beside +besides +best +better +between +beyond +bf +bg +bh +bi +big +bill +billion +biol +bj +bm +bn +bo +both +bottom +br +brief +briefly +bs +bt +but +buy +bv +bw +by +bz +c +c'mon +c's +ca +call +came +can +can't +cannot +cant +caption +case +cases +cause +causes +cc +cd +certain +certainly +cf +cg +ch +changes +ci +ck +cl +clear +clearly +click +cm +cmon +cn +co +co. +com +come +comes +computer +con +concerning +consequently +consider +considering +contain +containing +contains +copy +corresponding +could +could've +couldn +couldn't +couldnt +course +cr +cry +cs +cu +currently +cv +cx +cy +cz +d +dare +daren't +darent +date +de +dear +definitely +describe +described +despite +detail +did +didn +didn't +didnt +differ +different +differently +directly +dj +dk +dm +do +does +doesn +doesn't +doesnt +doing +don +don't +done +dont +doubtful +down +downed +downing +downs +downwards +due +during +dz +e +each +early +ec +ed +edu +ee +effect +eg +eh +eight +eighty +either +eleven +else +elsewhere +empty +end +ended +ending +ends +enough +entirely +er +es +especially +et +et-al +etc +even +evenly +ever +evermore +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +f +face +faces +fact +facts +fairly +far +farther +felt +few +fewer +ff +fi +fifteen +fifth +fifty +fify +fill +find +finds +fire +first +five +fix +fj +fk +fm +fo +followed +following +follows +for +forever +former +formerly +forth +forty +forward +found +four +fr +free +from +front +full +fully +further +furthered +furthering +furthermore +furthers +fx +g +ga +gave +gb +gd +ge +general +generally +get +gets +getting +gf +gg +gh +gi +give +given +gives +giving +gl +gm +gmt +gn +go +goes +going +gone +good +goods +got +gotten +gov +gp +gq +gr +great +greater +greatest +greetings +group +grouped +grouping +groups +gs +gt +gu +gw +gy +h +had +hadn't +hadnt +half +happens +hardly +has +hasn +hasn't +hasnt +have +haven +haven't +havent +having +he +he'd +he'll +he's +hed +hell +hello +help +hence +her +here +here's +hereafter +hereby +herein +heres +hereupon +hers +herself +herse” +hes +hi +hid +high +higher +highest +him +himself +himse” +his +hither +hk +hm +hn +home +homepage +hopefully +how +how'd +how'll +how's +howbeit +however +hr +ht +htm +html +http +hu +hundred +i +i'd +i'll +i'm +i've +i.e. +id +ie +if +ignored +ii +il +ill +im +immediate +immediately +importance +important +in +inasmuch +inc +inc. +indeed +index +indicate +indicated +indicates +information +inner +inside +insofar +instead +int +interest +interested +interesting +interests +into +invention +inward +io +iq +ir +is +isn +isn't +isnt +it +it'd +it'll +it's +itd +itll +its +itself +itse” +ive +j +je +jm +jo +join +jp +just +k +ke +keep +keeps +kept +keys +kg +kh +ki +kind +km +kn +knew +know +known +knows +kp +kr +kw +ky +kz +l +la +large +largely +last +lately +later +latest +latter +latterly +lb +lc +least +length +less +lest +let +let's +lets +li +like +liked +likely +likewise +line +little +lk +ll +long +longer +longest +look +looking +looks +low +lower +lr +ls +lt +ltd +lu +lv +ly +m +ma +made +mainly +make +makes +making +man +many +may +maybe +mayn't +maynt +mc +md +me +mean +means +meantime +meanwhile +member +members +men +merely +mg +mh +microsoft +might +might've +mightn't +mightnt +mil +mill +million +mine +minus +miss +mk +ml +mm +mn +mo +more +moreover +most +mostly +move +mp +mq +mr +mrs +ms +msie +mt +mu +much +mug +must +must've +mustn't +mustnt +mv +mw +mx +my +myself +myse” +mz +n +na +name +namely +nay +nc +nd +ne +near +nearly +necessarily +necessary +need +needed +needing +needn't +neednt +needs +neither +net +netscape +never +neverf +neverless +nevertheless +new +newer +newest +next +nf +ng +ni +nine +ninety +nl +no +no-one +nobody +non +none +nonetheless +noone +nor +normally +nos +not +noted +nothing +notwithstanding +novel +now +nowhere +np +nr +nu +null +number +numbers +nz +o +obtain +obtained +obviously +of +off +often +oh +ok +okay +old +older +oldest +om +omitted +on +once +one +one's +ones +only +onto +open +opened +opening +opens +opposite +or +ord +order +ordered +ordering +orders +org +other +others +otherwise +ought +oughtn't +oughtnt +our +ours +ourselves +out +outside +over +overall +owing +own +p +pa +page +pages +part +parted +particular +particularly +parting +parts +past +pe +per +perhaps +pf +pg +ph +pk +pl +place +placed +places +please +plus +pm +pmid +pn +point +pointed +pointing +points +poorly +possible +possibly +potentially +pp +pr +predominantly +present +presented +presenting +presents +presumably +previously +primarily +probably +problem +problems +promptly +proud +provided +provides +pt +put +puts +pw +py +q +qa +que +quickly +quite +qv +r +ran +rather +rd +re +readily +really +reasonably +recent +recently +ref +refs +regarding +regardless +regards +related +relatively +research +reserved +respectively +resulted +resulting +results +right +ring +ro +room +rooms +round +ru +run +rw +s +sa +said +same +saw +say +saying +says +sb +sc +sd +se +sec +second +secondly +seconds +section +see +seeing +seem +seemed +seeming +seems +seen +sees +self +selves +sensible +sent +serious +seriously +seven +seventy +several +sg +sh +shall +shan't +shant +she +she'd +she'll +she's +shed +shell +shes +should +should've +shouldn +shouldn't +shouldnt +show +showed +showing +shown +showns +shows +si +side +sides +significant +significantly +similar +similarly +since +sincere +site +six +sixty +sj +sk +sl +slightly +sm +small +smaller +smallest +sn +so +some +somebody +someday +somehow +someone +somethan +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specifically +specified +specify +specifying +sr +st +state +states +still +stop +strongly +su +sub +substantially +successfully +such +sufficiently +suggest +sup +sure +sv +sy +system +sz +t +t's +take +taken +taking +tc +td +tell +ten +tends +test +text +tf +tg +th +than +thank +thanks +thanx +that +that'll +that's +that've +thatll +thats +thatve +the +their +theirs +them +themselves +then +thence +there +there'd +there'll +there're +there's +there've +thereafter +thereby +thered +therefore +therein +therell +thereof +therere +theres +thereto +thereupon +thereve +these +they +they'd +they'll +they're +they've +theyd +theyll +theyre +theyve +thick +thin +thing +things +think +thinks +third +thirty +this +thorough +thoroughly +those +thou +though +thoughh +thought +thoughts +thousand +three +throug +through +throughout +thru +thus +til +till +tip +tis +tj +tk +tm +tn +to +today +together +too +took +top +toward +towards +tp +tr +tried +tries +trillion +truly +try +trying +ts +tt +turn +turned +turning +turns +tv +tw +twas +twelve +twenty +twice +two +tz +u +ua +ug +uk +um +un +under +underneath +undoing +unfortunately +unless +unlike +unlikely +until +unto +up +upon +ups +upwards +us +use +used +useful +usefully +usefulness +uses +using +usually +uucp +uy +uz +v +va +value +various +vc +ve +versus +very +vg +vi +via +viz +vn +vol +vols +vs +vu +w +want +wanted +wanting +wants +was +wasn +wasn't +wasnt +way +ways +we +we'd +we'll +we're +we've +web +webpage +website +wed +welcome +well +wells +went +were +weren +weren't +werent +weve +wf +what +what'd +what'll +what's +what've +whatever +whatll +whats +whatve +when +when'd +when'll +when's +whence +whenever +where +where'd +where'll +where's +whereafter +whereas +whereby +wherein +wheres +whereupon +wherever +whether +which +whichever +while +whilst +whim +whither +who +who'd +who'll +who's +whod +whoever +whole +wholl +whom +whomever +whos +whose +why +why'd +why'll +why's +widely +width +will +willing +wish +with +within +without +won +won't +wonder +wont +words +work +worked +working +works +world +would +would've +wouldn +wouldn't +wouldnt +ws +www +x +y +ye +year +years +yes +yet +you +you'd +you'll +you're +you've +youd +youll +young +younger +youngest +your +youre +yours +yourself +yourselves +youve +yt +yu +z +za +zero +zm +zr diff --git a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt new file mode 100644 index 0000000..0e74864 --- /dev/null +++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt @@ -0,0 +1,134 @@ +あそこ +あっ +あの +あのかた +あの人 +あり +あります +ある +あれ +い +いう +います +いる +う +うち +え +お +および +おり +おります +か +かつて +から +が +き +ここ +こちら +こと +この +これ +これら +さ +さらに +し +しかし +する +ず +せ +せる +そこ +そして +その +その他 +その後 +それ +それぞれ +それで +た +ただし +たち +ため +たり +だ +だっ +だれ +つ +て +で +でき +できる +です +では +でも +と +という +といった +とき +ところ +として +とともに +とも +と共に +どこ +どの +な +ない +なお +なかっ +ながら +なく +なっ +など +なに +なら +なり +なる +なん +に +において +における +について +にて +によって +により +による +に対して +に対する +に関する +の +ので +のみ +は +ば +へ +ほか +ほとんど +ほど +ます +また +または +まで +も +もの +ものの +や +よう +より +ら +られ +られる +れ +れる +を +ん +何 +及び +彼 +彼女 +我々 +特に +私 +私達 +貴方 +貴方方 diff --git a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt new file mode 100644 index 0000000..15dea1c --- /dev/null +++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt @@ -0,0 +1,794 @@ +、 +。 +〈 +〉 +《 +》 +一 +一个 +一些 +一何 +一切 +一则 +一方面 +一旦 +一来 +一样 +一种 +一般 +一转眼 +七 +万一 +三 +上 +上下 +下 +不 +不仅 +不但 +不光 +不单 +不只 +不外乎 +不如 +不妨 +不尽 +不尽然 +不得 +不怕 +不惟 +不成 +不拘 +不料 +不是 +不比 +不然 +不特 +不独 +不管 +不至于 +不若 +不论 +不过 +不问 +与 +与其 +与其说 +与否 +与此同时 +且 +且不说 +且说 +两者 +个 +个别 +中 +临 +为 +为了 +为什么 +为何 +为止 +为此 +为着 +乃 +乃至 +乃至于 +么 +之 +之一 +之所以 +之类 +乌乎 +乎 +乘 +九 +也 +也好 +也罢 +了 +二 +二来 +于 +于是 +于是乎 +云云 +云尔 +五 +些 +亦 +人 +人们 +人家 +什 +什么 +什么样 +今 +介于 +仍 +仍旧 +从 +从此 +从而 +他 +他人 +他们 +他们们 +以 +以上 +以为 +以便 +以免 +以及 +以故 +以期 +以来 +以至 +以至于 +以致 +们 +任 +任何 +任凭 +会 +似的 +但 +但凡 +但是 +何 +何以 +何况 +何处 +何时 +余外 +作为 +你 +你们 +使 +使得 +例如 +依 +依据 +依照 +便于 +俺 +俺们 +倘 +倘使 +倘或 +倘然 +倘若 +借 +借傥然 +假使 +假如 +假若 +做 +像 +儿 +先不先 +光 +光是 +全体 +全部 +八 +六 +兮 +共 +关于 +关于具体地说 +其 +其一 +其中 +其二 +其他 +其余 +其它 +其次 +具体地说 +具体说来 +兼之 +内 +再 +再其次 +再则 +再有 +再者 +再者说 +再说 +冒 +冲 +况且 +几 +几时 +凡 +凡是 +凭 +凭借 +出于 +出来 +分 +分别 +则 +则甚 +别 +别人 +别处 +别是 +别的 +别管 +别说 +到 +前后 +前此 +前者 +加之 +加以 +区 +即 +即令 +即使 +即便 +即如 +即或 +即若 +却 +去 +又 +又及 +及 +及其 +及至 +反之 +反而 +反过来 +反过来说 +受到 +另 +另一方面 +另外 +另悉 +只 +只当 +只怕 +只是 +只有 +只消 +只要 +只限 +叫 +叮咚 +可 +可以 +可是 +可见 +各 +各个 +各位 +各种 +各自 +同 +同时 +后 +后者 +向 +向使 +向着 +吓 +吗 +否则 +吧 +吧哒 +含 +吱 +呀 +呃 +呕 +呗 +呜 +呜呼 +呢 +呵 +呵呵 +呸 +呼哧 +咋 +和 +咚 +咦 +咧 +咱 +咱们 +咳 +哇 +哈 +哈哈 +哉 +哎 +哎呀 +哎哟 +哗 +哟 +哦 +哩 +哪 +哪个 +哪些 +哪儿 +哪天 +哪年 +哪怕 +哪样 +哪边 +哪里 +哼 +哼唷 +唉 +唯有 +啊 +啐 +啥 +啦 +啪达 +啷当 +喂 +喏 +喔唷 +喽 +嗡 +嗡嗡 +嗬 +嗯 +嗳 +嘎 +嘎登 +嘘 +嘛 +嘻 +嘿 +嘿嘿 +四 +因 +因为 +因了 +因此 +因着 +因而 +固然 +在 +在下 +在于 +地 +基于 +处在 +多 +多么 +多少 +大 +大家 +她 +她们 +好 +如 +如上 +如上所述 +如下 +如何 +如其 +如同 +如是 +如果 +如此 +如若 +始而 +孰料 +孰知 +宁 +宁可 +宁愿 +宁肯 +它 +它们 +对 +对于 +对待 +对方 +对比 +将 +小 +尔 +尔后 +尔尔 +尚且 +就 +就是 +就是了 +就是说 +就算 +就要 +尽 +尽管 +尽管如此 +岂但 +己 +已 +已矣 +巴 +巴巴 +年 +并 +并且 +庶乎 +庶几 +开外 +开始 +归 +归齐 +当 +当地 +当然 +当着 +彼 +彼时 +彼此 +往 +待 +很 +得 +得了 +怎 +怎么 +怎么办 +怎么样 +怎奈 +怎样 +总之 +总的来看 +总的来说 +总的说来 +总而言之 +恰恰相反 +您 +惟其 +慢说 +我 +我们 +或 +或则 +或是 +或曰 +或者 +截至 +所 +所以 +所在 +所幸 +所有 +才 +才能 +打 +打从 +把 +抑或 +拿 +按 +按照 +换句话说 +换言之 +据 +据此 +接着 +故 +故此 +故而 +旁人 +无 +无宁 +无论 +既 +既往 +既是 +既然 +日 +时 +时候 +是 +是以 +是的 +更 +曾 +替 +替代 +最 +月 +有 +有些 +有关 +有及 +有时 +有的 +望 +朝 +朝着 +本 +本人 +本地 +本着 +本身 +来 +来着 +来自 +来说 +极了 +果然 +果真 +某 +某个 +某些 +某某 +根据 +欤 +正值 +正如 +正巧 +正是 +此 +此地 +此处 +此外 +此时 +此次 +此间 +毋宁 +每 +每当 +比 +比及 +比如 +比方 +没奈何 +沿 +沿着 +漫说 +点 +焉 +然则 +然后 +然而 +照 +照着 +犹且 +犹自 +甚且 +甚么 +甚或 +甚而 +甚至 +甚至于 +用 +用来 +由 +由于 +由是 +由此 +由此可见 +的 +的确 +的话 +直到 +相对而言 +省得 +看 +眨眼 +着 +着呢 +矣 +矣乎 +矣哉 +离 +秒 +称 +竟而 +第 +等 +等到 +等等 +简言之 +管 +类如 +紧接着 +纵 +纵令 +纵使 +纵然 +经 +经过 +结果 +给 +继之 +继后 +继而 +综上所述 +罢了 +者 +而 +而且 +而况 +而后 +而外 +而已 +而是 +而言 +能 +能否 +腾 +自 +自个儿 +自从 +自各儿 +自后 +自家 +自己 +自打 +自身 +至 +至于 +至今 +至若 +致 +般的 +若 +若夫 +若是 +若果 +若非 +莫不然 +莫如 +莫若 +虽 +虽则 +虽然 +虽说 +被 +要 +要不 +要不是 +要不然 +要么 +要是 +譬喻 +譬如 +让 +许多 +论 +设使 +设或 +设若 +诚如 +诚然 +该 +说 +说来 +请 +诸 +诸位 +诸如 +谁 +谁人 +谁料 +谁知 +贼死 +赖以 +赶 +起 +起见 +趁 +趁着 +越是 +距 +跟 +较 +较之 +边 +过 +还 +还是 +还有 +还要 +这 +这一来 +这个 +这么 +这么些 +这么样 +这么点儿 +这些 +这会儿 +这儿 +这就是说 +这时 +这样 +这次 +这般 +这边 +这里 +进而 +连 +连同 +逐步 +通过 +遵循 +遵照 +那 +那个 +那么 +那么些 +那么样 +那些 +那会儿 +那儿 +那时 +那样 +那般 +那边 +那里 +都 +鄙人 +鉴于 +针对 +阿 +除 +除了 +除外 +除开 +除此之外 +除非 +随 +随后 +随时 +随着 +难道说 +零 +非 +非但 +非徒 +非特 +非独 +靠 +顺 +顺着 +首先 +︿ +! +# +$ +% +& +( +) +* ++ +, +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< +> +? +@ +[ +] +{ +| +} +~ +¥ diff --git a/modules/story-summary/vector/utils/stopwords-patch.js b/modules/story-summary/vector/utils/stopwords-patch.js new file mode 100644 index 0000000..51f7614 --- /dev/null +++ b/modules/story-summary/vector/utils/stopwords-patch.js @@ -0,0 +1,9 @@ +// Small domain-level tuning surface. +// Keep this file tiny: add/remove only words that are repeatedly noisy in real logs. + +// Extra stopwords on top of BASE_STOP_WORDS. +export const DOMAIN_STOP_WORDS = []; + +// High-value words that must never be filtered as stopwords. +// Default to empty for plugin-wide deployment; entity names are already protected dynamically. +export const KEEP_WORDS = []; diff --git a/modules/story-summary/vector/utils/tokenizer.js b/modules/story-summary/vector/utils/tokenizer.js index a39e4e9..37ab59c 100644 --- a/modules/story-summary/vector/utils/tokenizer.js +++ b/modules/story-summary/vector/utils/tokenizer.js @@ -18,6 +18,8 @@ import { extensionFolderPath } from '../../../../core/constants.js'; import { xbLog } from '../../../../core/debug-core.js'; +import { BASE_STOP_WORDS } from './stopwords-base.js'; +import { DOMAIN_STOP_WORDS, KEEP_WORDS } from './stopwords-patch.js'; const MODULE_ID = 'tokenizer'; @@ -61,44 +63,30 @@ let entityList = []; /** @type {Set} 已注入结巴的实体(避免重复 add_word) */ let injectedEntities = new Set(); +let entityKeepSet = new Set(); // ═══════════════════════════════════════════════════════════════════════════ // 停用词 // ═══════════════════════════════════════════════════════════════════════════ -const STOP_WORDS = new Set([ - // 中文高频虚词 - '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', - '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', - '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她', - '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦', - '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛', - '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但', - '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后', - '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里', - '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道', - '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们', - '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去', - // 日语常见虚词(≥2字,匹配 TinySegmenter 产出粒度) - 'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある', - 'なる', 'れる', 'られ', 'られる', - 'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ', - 'これ', 'それ', 'あれ', 'どれ', - 'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ', - 'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり', - // 英文常见停用词 - 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', - 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', - 'would', 'could', 'should', 'may', 'might', 'can', 'shall', - 'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet', - 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', - 'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them', - 'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your', - 'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which', - 'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every', - 'both', 'few', 'more', 'most', 'other', 'some', 'such', - 'only', 'own', 'same', 'just', 'very', 'also', 'about', -]); +const STATIC_KEEP_WORDS = new Set((KEEP_WORDS || []) + .map(w => String(w || '').trim().toLowerCase()) + .filter(Boolean)); + +// Standard source only: stopwords-iso snapshot + small domain patch. +const EFFECTIVE_STOP_WORDS = new Set( + [...BASE_STOP_WORDS, ...DOMAIN_STOP_WORDS] + .map(w => String(w || '').trim().toLowerCase()) + .filter(Boolean), +); + +function shouldKeepTokenByWhitelist(token) { + const t = String(token || '').trim().toLowerCase(); + if (!t) return false; + if (STATIC_KEEP_WORDS.has(t)) return true; + if (entityKeepSet.has(t)) return true; + return false; +} // ═══════════════════════════════════════════════════════════════════════════ // Unicode 分类 @@ -571,6 +559,7 @@ export function getState() { export function injectEntities(lexicon, displayMap) { if (!lexicon?.size) { entityList = []; + entityKeepSet = new Set(); return; } @@ -586,6 +575,7 @@ export function injectEntities(lexicon, displayMap) { // 按长度降序(最长匹配优先) entities.sort((a, b) => b.length - a.length); entityList = entities; + entityKeepSet = new Set(entities.map(e => String(e || '').trim().toLowerCase()).filter(Boolean)); // 如果结巴已就绪,注入自定义词 if (wasmState === WasmState.READY && jiebaAddWord) { @@ -656,7 +646,7 @@ export function tokenize(text) { if (!cleaned) continue; if (cleaned.length < 2) continue; - if (STOP_WORDS.has(cleaned)) continue; + if (EFFECTIVE_STOP_WORDS.has(cleaned) && !shouldKeepTokenByWhitelist(cleaned)) continue; if (seen.has(cleaned)) continue; // 过滤纯标点/特殊字符 @@ -728,7 +718,7 @@ export function tokenizeForIndex(text) { .map(t => t.trim().toLowerCase()) .filter(t => { if (!t || t.length < 2) return false; - if (STOP_WORDS.has(t)) return false; + if (EFFECTIVE_STOP_WORDS.has(t) && !shouldKeepTokenByWhitelist(t)) return false; if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) return false; return true; }); @@ -744,6 +734,7 @@ export function tokenizeForIndex(text) { */ export function reset() { entityList = []; + entityKeepSet = new Set(); injectedEntities.clear(); // 不重置 WASM 状态(避免重复加载) }