-
-
-
diff --git a/modules/story-summary/story-summary.js b/modules/story-summary/story-summary.js
index 9cb242f..4f95d0b 100644
--- a/modules/story-summary/story-summary.js
+++ b/modules/story-summary/story-summary.js
@@ -1551,6 +1551,7 @@ async function handleMessageReceived(scheduledChatId) {
// Refresh entity lexicon after new message (new roles may appear)
refreshEntityLexiconAndWarmup();
+ scheduleLexicalWarmup(100);
// Auto backfill missing L0 (delay to avoid contention with current floor)
setTimeout(() => maybeAutoExtractL0(), 2000);
@@ -1559,6 +1560,7 @@ async function handleMessageReceived(scheduledChatId) {
function handleMessageSent(scheduledChatId) {
if (isChatStale(scheduledChatId)) return;
initButtonsForAll();
+ scheduleLexicalWarmup(0);
setTimeout(() => maybeAutoRunSummary("before_user"), 1000);
}
diff --git a/modules/story-summary/vector/llm/llm-service.js b/modules/story-summary/vector/llm/llm-service.js
index 13ec391..7120b64 100644
--- a/modules/story-summary/vector/llm/llm-service.js
+++ b/modules/story-summary/vector/llm/llm-service.js
@@ -2,7 +2,6 @@
// vector/llm/llm-service.js - 修复 prefill 传递方式
// ═══════════════════════════════════════════════════════════════════════════
import { xbLog } from '../../../../core/debug-core.js';
-import { getVectorConfig } from '../../data/config.js';
import { getApiKey } from './siliconflow.js';
const MODULE_ID = 'vector-llm-service';
diff --git a/modules/story-summary/vector/retrieval/lexical-index.js b/modules/story-summary/vector/retrieval/lexical-index.js
index 83124d6..bc39dd5 100644
--- a/modules/story-summary/vector/retrieval/lexical-index.js
+++ b/modules/story-summary/vector/retrieval/lexical-index.js
@@ -1,16 +1,3 @@
-// ═══════════════════════════════════════════════════════════════════════════
-// lexical-index.js - MiniSearch 词法检索索引
-//
-// 职责:
-// 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
-// 2. 提供词法检索接口(专名精确匹配兜底)
-// 3. 惰性构建 + 异步预热 + 缓存失效机制
-//
-// 索引存储:纯内存(不持久化)
-// 分词器:统一使用 tokenizer.js(结巴 + 实体保护 + 降级)
-// 重建时机:CHAT_CHANGED / L0提取完成 / L2总结完成
-// ═══════════════════════════════════════════════════════════════════════════
-
import MiniSearch from '../../../../libs/minisearch.mjs';
import { getContext } from '../../../../../../../extensions.js';
import { getSummaryStore } from '../../data/store.js';
@@ -20,76 +7,166 @@ import { tokenizeForIndex } from '../utils/tokenizer.js';
const MODULE_ID = 'lexical-index';
-// ─────────────────────────────────────────────────────────────────────────
-// 缓存
-// ─────────────────────────────────────────────────────────────────────────
-
-/** @type {MiniSearch|null} */
+// In-memory index cache
let cachedIndex = null;
-
-/** @type {string|null} */
let cachedChatId = null;
-
-/** @type {string|null} 数据指纹(atoms + chunks + events 数量) */
let cachedFingerprint = null;
-
-/** @type {boolean} 是否正在构建 */
let building = false;
-
-/** @type {Promise|null} 当前构建 Promise(防重入) */
let buildPromise = null;
-/** @type {Map} floor → 该楼层的 doc IDs(仅 L1 chunks) */
+
+// floor -> chunk doc ids (L1 only)
let floorDocIds = new Map();
-// ─────────────────────────────────────────────────────────────────────────
-// 工具函数
-// ─────────────────────────────────────────────────────────────────────────
+// IDF stats over lexical docs (L1 chunks + L2 events)
+let termDfMap = new Map();
+let docTokenSets = new Map(); // docId -> Set
+let lexicalDocCount = 0;
+
+const IDF_MIN = 1.0;
+const IDF_MAX = 4.0;
+const BUILD_BATCH_SIZE = 500;
-/**
- * 清理事件摘要(移除楼层标记)
- * @param {string} summary
- * @returns {string}
- */
function cleanSummary(summary) {
return String(summary || '')
.replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '')
.trim();
}
-/**
- * 计算缓存指纹
- * @param {number} chunkCount
- * @param {number} eventCount
- * @returns {string}
- */
-function computeFingerprint(chunkCount, eventCount) {
- return `${chunkCount}:${eventCount}`;
+function fnv1a32(input, seed = 0x811C9DC5) {
+ let hash = seed >>> 0;
+ const text = String(input || '');
+ for (let i = 0; i < text.length; i++) {
+ hash ^= text.charCodeAt(i);
+ hash = Math.imul(hash, 0x01000193) >>> 0;
+ }
+ return hash >>> 0;
+}
+
+function compareDocKeys(a, b) {
+ const ka = `${a?.type || ''}:${a?.id || ''}`;
+ const kb = `${b?.type || ''}:${b?.id || ''}`;
+ if (ka < kb) return -1;
+ if (ka > kb) return 1;
+ return 0;
+}
+
+function computeFingerprintFromDocs(docs) {
+ const normalizedDocs = Array.isArray(docs) ? [...docs].sort(compareDocKeys) : [];
+ let hash = 0x811C9DC5;
+
+ for (const doc of normalizedDocs) {
+ const payload = `${doc?.type || ''}\u001F${doc?.id || ''}\u001F${doc?.floor ?? ''}\u001F${doc?.text || ''}\u001E`;
+ hash = fnv1a32(payload, hash);
+ }
+
+ return `${normalizedDocs.length}:${(hash >>> 0).toString(16)}`;
}
-/**
- * 让出主线程(避免长时间阻塞 UI)
- * @returns {Promise}
- */
function yieldToMain() {
return new Promise(resolve => setTimeout(resolve, 0));
}
-// ─────────────────────────────────────────────────────────────────────────
-// 文档收集
-// ─────────────────────────────────────────────────────────────────────────
+function clamp(v, min, max) {
+ return Math.max(min, Math.min(max, v));
+}
+
+function normalizeTerm(term) {
+ return String(term || '').trim().toLowerCase();
+}
+
+function computeIdfFromDf(df, docCount) {
+ if (!docCount || docCount <= 0) return 1;
+ const raw = Math.log((docCount + 1) / ((df || 0) + 1)) + 1;
+ return clamp(raw, IDF_MIN, IDF_MAX);
+}
+
+function computeIdf(term) {
+ const t = normalizeTerm(term);
+ if (!t || lexicalDocCount <= 0) return 1;
+ return computeIdfFromDf(termDfMap.get(t) || 0, lexicalDocCount);
+}
+
+function extractUniqueTokens(text) {
+ return new Set(tokenizeForIndex(String(text || '')).map(normalizeTerm).filter(Boolean));
+}
+
+function clearIdfState() {
+ termDfMap = new Map();
+ docTokenSets = new Map();
+ lexicalDocCount = 0;
+}
+
+function removeDocumentIdf(docId) {
+ const id = String(docId || '');
+ if (!id) return;
+
+ const tokens = docTokenSets.get(id);
+ if (!tokens) return;
+
+ for (const token of tokens) {
+ const current = termDfMap.get(token) || 0;
+ if (current <= 1) {
+ termDfMap.delete(token);
+ } else {
+ termDfMap.set(token, current - 1);
+ }
+ }
+
+ docTokenSets.delete(id);
+ lexicalDocCount = Math.max(0, lexicalDocCount - 1);
+}
+
+function addDocumentIdf(docId, text) {
+ const id = String(docId || '');
+ if (!id) return;
+
+ // Replace semantics: remove old token set first if this id already exists.
+ removeDocumentIdf(id);
+
+ const tokens = extractUniqueTokens(text);
+ docTokenSets.set(id, tokens);
+ lexicalDocCount += 1;
+
+ for (const token of tokens) {
+ termDfMap.set(token, (termDfMap.get(token) || 0) + 1);
+ }
+}
+
+function rebuildIdfFromDocs(docs) {
+ clearIdfState();
+ for (const doc of docs || []) {
+ const id = String(doc?.id || '');
+ const text = String(doc?.text || '');
+ if (!id || !text.trim()) continue;
+ addDocumentIdf(id, text);
+ }
+}
+
+function buildEventDoc(ev) {
+ if (!ev?.id) return null;
+
+ const parts = [];
+ if (ev.title) parts.push(ev.title);
+ if (ev.participants?.length) parts.push(ev.participants.join(' '));
+
+ const summary = cleanSummary(ev.summary);
+ if (summary) parts.push(summary);
+
+ const text = parts.join(' ').trim();
+ if (!text) return null;
+
+ return {
+ id: ev.id,
+ type: 'event',
+ floor: null,
+ text,
+ };
+}
-/**
- * 收集所有待索引文档
- *
- * @param {object[]} chunks - getAllChunks(chatId) 返回值
- * @param {object[]} events - store.json.events
- * @returns {object[]} 文档数组
- */
function collectDocuments(chunks, events) {
const docs = [];
- // L1 chunks + 填充 floorDocIds
- for (const chunk of (chunks || [])) {
+ for (const chunk of chunks || []) {
if (!chunk?.chunkId || !chunk.text) continue;
const floor = chunk.floor ?? -1;
@@ -101,48 +178,19 @@ function collectDocuments(chunks, events) {
});
if (floor >= 0) {
- if (!floorDocIds.has(floor)) {
- floorDocIds.set(floor, []);
- }
+ if (!floorDocIds.has(floor)) floorDocIds.set(floor, []);
floorDocIds.get(floor).push(chunk.chunkId);
}
}
- // L2 events
- for (const ev of (events || [])) {
- if (!ev?.id) continue;
- const parts = [];
- if (ev.title) parts.push(ev.title);
- if (ev.participants?.length) parts.push(ev.participants.join(' '));
- const summary = cleanSummary(ev.summary);
- if (summary) parts.push(summary);
- const text = parts.join(' ').trim();
- if (!text) continue;
-
- docs.push({
- id: ev.id,
- type: 'event',
- floor: null,
- text,
- });
+ for (const ev of events || []) {
+ const doc = buildEventDoc(ev);
+ if (doc) docs.push(doc);
}
return docs;
}
-// ─────────────────────────────────────────────────────────────────────────
-// 索引构建(分片,不阻塞主线程)
-// ─────────────────────────────────────────────────────────────────────────
-
-/** 每批添加的文档数 */
-const BUILD_BATCH_SIZE = 500;
-
-/**
- * 构建 MiniSearch 索引(分片异步)
- *
- * @param {object[]} docs - 文档数组
- * @returns {Promise}
- */
async function buildIndexAsync(docs) {
const T0 = performance.now();
@@ -158,49 +206,46 @@ async function buildIndexAsync(docs) {
tokenize: tokenizeForIndex,
});
- if (!docs.length) {
- return index;
- }
+ if (!docs.length) return index;
- // 分片添加,每批 BUILD_BATCH_SIZE 条后让出主线程
for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
index.addAll(batch);
- // 非最后一批时让出主线程
if (i + BUILD_BATCH_SIZE < docs.length) {
await yieldToMain();
}
}
const elapsed = Math.round(performance.now() - T0);
- xbLog.info(MODULE_ID,
- `索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
- );
-
+ xbLog.info(MODULE_ID, `Index built: ${docs.length} docs (${elapsed}ms)`);
return index;
}
-// ─────────────────────────────────────────────────────────────────────────
-// 检索
-// ─────────────────────────────────────────────────────────────────────────
-
/**
* @typedef {object} LexicalSearchResult
- * @property {string[]} atomIds - 命中的 L0 atom IDs
- * @property {Set} atomFloors - 命中的 L0 楼层集合
- * @property {string[]} chunkIds - 命中的 L1 chunk IDs
- * @property {Set} chunkFloors - 命中的 L1 楼层集合
- * @property {string[]} eventIds - 命中的 L2 event IDs
- * @property {object[]} chunkScores - chunk 命中详情 [{ chunkId, score }]
- * @property {number} searchTime - 检索耗时 ms
+ * @property {string[]} atomIds - Reserved for backward compatibility (currently empty).
+ * @property {Set} atomFloors - Reserved for backward compatibility (currently empty).
+ * @property {string[]} chunkIds - Matched L1 chunk ids sorted by weighted lexical score.
+ * @property {Set} chunkFloors - Floor ids covered by matched chunks.
+ * @property {string[]} eventIds - Matched L2 event ids sorted by weighted lexical score.
+ * @property {object[]} chunkScores - Weighted lexical scores for matched chunks.
+ * @property {boolean} idfEnabled - Whether IDF stats are available for weighting.
+ * @property {number} idfDocCount - Number of lexical docs used to compute IDF.
+ * @property {Array<{term:string,idf:number}>} topIdfTerms - Top 5 query terms by IDF (descending).
+ * @property {string[]} queryTerms - Normalized query terms actually searched.
+ * @property {Record<string, Array<{floor:number, weightedScore:number, chunkId:string}>>} termFloorHits - Chunk-floor hits by term.
+ * @property {Array<{floor:number, score:number, hitTermsCount:number}>} floorLexScores - Aggregated lexical floor scores (debug).
+ * @property {number} termSearches - Number of per-term MiniSearch queries executed.
+ * @property {number} searchTime - Total lexical search time in milliseconds.
*/
/**
- * 在词法索引中检索
+ * Search lexical index by terms, using per-term MiniSearch and IDF-weighted score aggregation.
+ * This keeps existing outputs compatible while adding observability fields.
*
- * @param {MiniSearch} index - 索引实例
- * @param {string[]} terms - 查询词列表
+ * @param {MiniSearch} index
+ * @param {string[]} terms
* @returns {LexicalSearchResult}
*/
export function searchLexicalIndex(index, terms) {
@@ -213,6 +258,13 @@ export function searchLexicalIndex(index, terms) {
chunkFloors: new Set(),
eventIds: [],
chunkScores: [],
+ idfEnabled: lexicalDocCount > 0,
+ idfDocCount: lexicalDocCount,
+ topIdfTerms: [],
+ queryTerms: [],
+ termFloorHits: {},
+ floorLexScores: [],
+ termSearches: 0,
searchTime: 0,
};
@@ -221,79 +273,111 @@ export function searchLexicalIndex(index, terms) {
return result;
}
- // 用所有 terms 联合查询
- const queryString = terms.join(' ');
+ const queryTerms = Array.from(new Set((terms || []).map(normalizeTerm).filter(Boolean)));
+ result.queryTerms = [...queryTerms];
+ const weightedScores = new Map(); // docId -> score
+ const hitMeta = new Map(); // docId -> { type, floor }
+ const idfPairs = [];
+ const termFloorHits = new Map(); // term -> [{ floor, weightedScore, chunkId }]
+ const floorLexAgg = new Map(); // floor -> { score, terms:Set }
- let hits;
- try {
- hits = index.search(queryString, {
- boost: { text: 1 },
- fuzzy: 0.2,
- prefix: true,
- combineWith: 'OR',
- // 使用与索引相同的分词器
- tokenize: tokenizeForIndex,
- });
- } catch (e) {
- xbLog.warn(MODULE_ID, '检索失败', e);
- result.searchTime = Math.round(performance.now() - T0);
- return result;
+ for (const term of queryTerms) {
+ const idf = computeIdf(term);
+ idfPairs.push({ term, idf });
+
+ let hits = [];
+ try {
+ hits = index.search(term, {
+ boost: { text: 1 },
+ fuzzy: 0.2,
+ prefix: true,
+ combineWith: 'OR',
+ tokenize: tokenizeForIndex,
+ });
+ } catch (e) {
+ xbLog.warn(MODULE_ID, `Lexical term search failed: ${term}`, e);
+ continue;
+ }
+
+ result.termSearches += 1;
+
+ for (const hit of hits) {
+ const id = String(hit.id || '');
+ if (!id) continue;
+
+ const weighted = (hit.score || 0) * idf;
+ weightedScores.set(id, (weightedScores.get(id) || 0) + weighted);
+
+ if (!hitMeta.has(id)) {
+ hitMeta.set(id, {
+ type: hit.type,
+ floor: hit.floor,
+ });
+ }
+
+ if (hit.type === 'chunk' && typeof hit.floor === 'number' && hit.floor >= 0) {
+ if (!termFloorHits.has(term)) termFloorHits.set(term, []);
+ termFloorHits.get(term).push({
+ floor: hit.floor,
+ weightedScore: weighted,
+ chunkId: id,
+ });
+
+ const floorAgg = floorLexAgg.get(hit.floor) || { score: 0, terms: new Set() };
+ floorAgg.score += weighted;
+ floorAgg.terms.add(term);
+ floorLexAgg.set(hit.floor, floorAgg);
+ }
+ }
}
- // 分类结果
- const chunkIdSet = new Set();
- const eventIdSet = new Set();
+ idfPairs.sort((a, b) => b.idf - a.idf);
+ result.topIdfTerms = idfPairs.slice(0, 5);
+ result.termFloorHits = Object.fromEntries(
+ [...termFloorHits.entries()].map(([term, hits]) => [term, hits]),
+ );
+ result.floorLexScores = [...floorLexAgg.entries()]
+ .map(([floor, info]) => ({
+ floor,
+ score: Number(info.score.toFixed(6)),
+ hitTermsCount: info.terms.size,
+ }))
+ .sort((a, b) => b.score - a.score);
- for (const hit of hits) {
- const type = hit.type;
- const id = hit.id;
- const floor = hit.floor;
+ const sortedHits = Array.from(weightedScores.entries())
+ .sort((a, b) => b[1] - a[1]);
- switch (type) {
- case 'chunk':
- if (!chunkIdSet.has(id)) {
- chunkIdSet.add(id);
- result.chunkIds.push(id);
- result.chunkScores.push({ chunkId: id, score: hit.score });
- if (typeof floor === 'number' && floor >= 0) {
- result.chunkFloors.add(floor);
- }
- }
- break;
+ for (const [id, score] of sortedHits) {
+ const meta = hitMeta.get(id);
+ if (!meta) continue;
- case 'event':
- if (!eventIdSet.has(id)) {
- eventIdSet.add(id);
- result.eventIds.push(id);
- }
- break;
+ if (meta.type === 'chunk') {
+ result.chunkIds.push(id);
+ result.chunkScores.push({ chunkId: id, score });
+ if (typeof meta.floor === 'number' && meta.floor >= 0) {
+ result.chunkFloors.add(meta.floor);
+ }
+ continue;
+ }
+
+ if (meta.type === 'event') {
+ result.eventIds.push(id);
}
}
result.searchTime = Math.round(performance.now() - T0);
- xbLog.info(MODULE_ID,
- `检索完成: terms=[${terms.slice(0, 5).join(',')}] → atoms=${result.atomIds.length} chunks=${result.chunkIds.length} events=${result.eventIds.length} (${result.searchTime}ms)`
+ xbLog.info(
+ MODULE_ID,
+ `Lexical search terms=[${queryTerms.slice(0, 5).join(',')}] chunks=${result.chunkIds.length} events=${result.eventIds.length} termSearches=${result.termSearches} (${result.searchTime}ms)`,
);
return result;
}
-// ─────────────────────────────────────────────────────────────────────────
-// 内部构建流程(收集数据 + 构建索引)
-// ─────────────────────────────────────────────────────────────────────────
-
-/**
- * 收集数据并构建索引
- *
- * @param {string} chatId
- * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
- */
async function collectAndBuild(chatId) {
- // 清空侧索引(全量重建)
floorDocIds = new Map();
- // 收集数据(不含 L0 atoms)
const store = getSummaryStore();
const events = store?.json?.events || [];
@@ -301,48 +385,44 @@ async function collectAndBuild(chatId) {
try {
chunks = await getAllChunks(chatId);
} catch (e) {
- xbLog.warn(MODULE_ID, '获取 chunks 失败', e);
+ xbLog.warn(MODULE_ID, 'Failed to load chunks', e);
}
- const fp = computeFingerprint(chunks.length, events.length);
+ const docs = collectDocuments(chunks, events);
+ const fp = computeFingerprintFromDocs(docs);
- // 检查是否在收集过程中缓存已被其他调用更新
if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
return { index: cachedIndex, fingerprint: fp };
}
- // 收集文档(同时填充 floorDocIds)
- const docs = collectDocuments(chunks, events);
-
- // 异步分片构建
+ rebuildIdfFromDocs(docs);
const index = await buildIndexAsync(docs);
return { index, fingerprint: fp };
}
-// ─────────────────────────────────────────────────────────────────────────
-// 公开接口:getLexicalIndex(惰性获取)
-// ─────────────────────────────────────────────────────────────────────────
-
/**
- * 获取词法索引(惰性构建 + 缓存)
- *
- * 如果缓存有效则直接返回;否则自动构建。
- * 如果正在构建中,等待构建完成。
- *
- * @returns {Promise}
+ * Expose IDF accessor for query-term selection in query-builder.
+ * If index stats are not ready, this gracefully falls back to idf=1.
*/
+export function getLexicalIdfAccessor() {
+ return {
+ enabled: lexicalDocCount > 0,
+ docCount: lexicalDocCount,
+ getIdf(term) {
+ return computeIdf(term);
+ },
+ };
+}
+
export async function getLexicalIndex() {
const { chatId } = getContext();
if (!chatId) return null;
- // 快速路径:如果缓存存在且 chatId 未变,则直接命中
- // 指纹校验放到构建流程中完成,避免为指纹而额外读一次 IndexedDB
if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
return cachedIndex;
}
- // 正在构建中,等待结果
if (building && buildPromise) {
try {
await buildPromise;
@@ -350,27 +430,23 @@ export async function getLexicalIndex() {
return cachedIndex;
}
} catch {
- // 构建失败,继续往下重建
+ // Continue to rebuild below.
}
}
- // 需要重建(指纹将在 collectAndBuild 内部计算并写入缓存)
- xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)})`);
+ xbLog.info(MODULE_ID, `Lexical cache miss; rebuilding (chatId=${chatId.slice(0, 8)})`);
building = true;
buildPromise = collectAndBuild(chatId);
try {
const { index, fingerprint } = await buildPromise;
-
- // 原子替换缓存
cachedIndex = index;
cachedChatId = chatId;
cachedFingerprint = fingerprint;
-
return index;
} catch (e) {
- xbLog.error(MODULE_ID, '索引构建失败', e);
+ xbLog.error(MODULE_ID, 'Index build failed', e);
return null;
} finally {
building = false;
@@ -378,74 +454,29 @@ export async function getLexicalIndex() {
}
}
-// ─────────────────────────────────────────────────────────────────────────
-// 公开接口:warmupIndex(异步预建)
-// ─────────────────────────────────────────────────────────────────────────
-
-/**
- * 异步预建索引
- *
- * 在 CHAT_CHANGED 时调用,后台构建索引。
- * 不阻塞调用方,不返回结果。
- * 构建完成后缓存自动更新,后续 getLexicalIndex() 直接命中。
- *
- * 调用时机:
- * - handleChatChanged(实体注入后)
- * - L0 提取完成
- * - L2 总结完成
- */
export function warmupIndex() {
const { chatId } = getContext();
- if (!chatId) return;
+ if (!chatId || building) return;
- // 已在构建中,不重复触发
- if (building) return;
-
- // fire-and-forget
getLexicalIndex().catch(e => {
- xbLog.warn(MODULE_ID, '预热索引失败', e);
+ xbLog.warn(MODULE_ID, 'Warmup failed', e);
});
}
-// ─────────────────────────────────────────────────────────────────────────
-// 公开接口:invalidateLexicalIndex(缓存失效)
-// ─────────────────────────────────────────────────────────────────────────
-
-/**
- * 使缓存失效(下次 getLexicalIndex / warmupIndex 时自动重建)
- *
- * 调用时机:
- * - CHAT_CHANGED
- * - L0 提取完成
- * - L2 总结完成
- */
export function invalidateLexicalIndex() {
if (cachedIndex) {
- xbLog.info(MODULE_ID, '索引缓存已失效');
+ xbLog.info(MODULE_ID, 'Lexical index cache invalidated');
}
cachedIndex = null;
cachedChatId = null;
cachedFingerprint = null;
floorDocIds = new Map();
+ clearIdfState();
}
-// ─────────────────────────────────────────────────────────────────────────
-// 增量更新接口
-// ─────────────────────────────────────────────────────────────────────────
-
-/**
- * 为指定楼层添加 L1 chunks 到索引
- *
- * 先移除该楼层旧文档,再添加新文档。
- * 如果索引不存在(缓存失效),静默跳过(下次 getLexicalIndex 全量重建)。
- *
- * @param {number} floor - 楼层号
- * @param {object[]} chunks - chunk 对象列表(需有 chunkId、text、floor)
- */
export function addDocumentsForFloor(floor, chunks) {
if (!cachedIndex || !chunks?.length) return;
- // 先移除旧文档
removeDocumentsByFloor(floor);
const docs = [];
@@ -453,30 +484,29 @@ export function addDocumentsForFloor(floor, chunks) {
for (const chunk of chunks) {
if (!chunk?.chunkId || !chunk.text) continue;
- docs.push({
+
+ const doc = {
id: chunk.chunkId,
type: 'chunk',
floor: chunk.floor ?? floor,
text: chunk.text,
- });
+ };
+ docs.push(doc);
docIds.push(chunk.chunkId);
}
- if (docs.length > 0) {
- cachedIndex.addAll(docs);
- floorDocIds.set(floor, docIds);
- xbLog.info(MODULE_ID, `增量添加: floor ${floor}, ${docs.length} 个 chunk`);
+ if (!docs.length) return;
+
+ cachedIndex.addAll(docs);
+ floorDocIds.set(floor, docIds);
+
+ for (const doc of docs) {
+ addDocumentIdf(doc.id, doc.text);
}
+
+ xbLog.info(MODULE_ID, `Incremental add floor=${floor} chunks=${docs.length}`);
}
-/**
- * 从索引中移除指定楼层的所有 L1 chunk 文档
- *
- * 使用 MiniSearch discard()(软删除)。
- * 如果索引不存在,静默跳过。
- *
- * @param {number} floor - 楼层号
- */
export function removeDocumentsByFloor(floor) {
if (!cachedIndex) return;
@@ -487,55 +517,39 @@ export function removeDocumentsByFloor(floor) {
try {
cachedIndex.discard(id);
} catch {
- // 文档可能不存在(已被全量重建替换)
+ // Ignore if the doc was already removed/rebuilt.
}
+ removeDocumentIdf(id);
}
floorDocIds.delete(floor);
- xbLog.info(MODULE_ID, `增量移除: floor ${floor}, ${docIds.length} 个文档`);
+ xbLog.info(MODULE_ID, `Incremental remove floor=${floor} chunks=${docIds.length}`);
}
-/**
- * 将新 L2 事件添加到索引
- *
- * 如果事件 ID 已存在,先 discard 再 add(覆盖)。
- * 如果索引不存在,静默跳过。
- *
- * @param {object[]} events - 事件对象列表(需有 id、title、summary 等)
- */
export function addEventDocuments(events) {
if (!cachedIndex || !events?.length) return;
const docs = [];
for (const ev of events) {
- if (!ev?.id) continue;
+ const doc = buildEventDoc(ev);
+ if (!doc) continue;
- const parts = [];
- if (ev.title) parts.push(ev.title);
- if (ev.participants?.length) parts.push(ev.participants.join(' '));
- const summary = cleanSummary(ev.summary);
- if (summary) parts.push(summary);
- const text = parts.join(' ').trim();
- if (!text) continue;
-
- // 覆盖:先尝试移除旧的
try {
- cachedIndex.discard(ev.id);
+ cachedIndex.discard(doc.id);
} catch {
- // 不存在则忽略
+ // Ignore if previous document does not exist.
}
-
- docs.push({
- id: ev.id,
- type: 'event',
- floor: null,
- text,
- });
+ removeDocumentIdf(doc.id);
+ docs.push(doc);
}
- if (docs.length > 0) {
- cachedIndex.addAll(docs);
- xbLog.info(MODULE_ID, `增量添加: ${docs.length} 个事件`);
+ if (!docs.length) return;
+
+ cachedIndex.addAll(docs);
+ for (const doc of docs) {
+ addDocumentIdf(doc.id, doc.text);
}
+
+ xbLog.info(MODULE_ID, `Incremental add events=${docs.length}`);
}
diff --git a/modules/story-summary/vector/retrieval/metrics.js b/modules/story-summary/vector/retrieval/metrics.js
index 4530788..375822b 100644
--- a/modules/story-summary/vector/retrieval/metrics.js
+++ b/modules/story-summary/vector/retrieval/metrics.js
@@ -52,6 +52,10 @@ export function createMetrics() {
eventHits: 0,
searchTime: 0,
indexReadyTime: 0,
+ idfEnabled: false,
+ idfDocCount: 0,
+ topIdfTerms: [],
+ termSearches: 0,
eventFilteredByDense: 0,
floorFilteredByDense: 0,
},
@@ -97,6 +101,11 @@ export function createMetrics() {
floorCandidates: 0,
floorsSelected: 0,
l0Collected: 0,
+ mustKeepTermsCount: 0,
+ mustKeepFloorsCount: 0,
+ mustKeepFloors: [],
+ droppedByRerankCount: 0,
+ lexHitButNotSelected: 0,
rerankApplied: false,
rerankFailed: false,
beforeRerank: 0,
@@ -274,6 +283,20 @@ export function formatMetricsLog(metrics) {
if (m.lexical.indexReadyTime > 0) {
lines.push(`├─ index_ready_time: ${m.lexical.indexReadyTime}ms`);
}
+ lines.push(`├─ idf_enabled: ${!!m.lexical.idfEnabled}`);
+ if (m.lexical.idfDocCount > 0) {
+ lines.push(`├─ idf_doc_count: ${m.lexical.idfDocCount}`);
+ }
+ if ((m.lexical.topIdfTerms || []).length > 0) {
+ const topIdfText = m.lexical.topIdfTerms
+ .slice(0, 5)
+ .map(x => `${x.term}:${x.idf}`)
+ .join(', ');
+ lines.push(`├─ top_idf_terms: [${topIdfText}]`);
+ }
+ if (m.lexical.termSearches > 0) {
+ lines.push(`├─ term_searches: ${m.lexical.termSearches}`);
+ }
if (m.lexical.eventFilteredByDense > 0) {
lines.push(`├─ event_filtered_by_dense: ${m.lexical.eventFilteredByDense}`);
}
@@ -295,6 +318,20 @@ export function formatMetricsLog(metrics) {
lines.push(`└─ time: ${m.fusion.time}ms`);
lines.push('');
+ // Fusion Guard (must-keep lexical floors)
+ lines.push('[Fusion Guard] Lexical Must-Keep');
+ lines.push(`├─ must_keep_terms: ${m.evidence.mustKeepTermsCount || 0}`);
+ lines.push(`├─ must_keep_floors: ${m.evidence.mustKeepFloorsCount || 0}`);
+ if ((m.evidence.mustKeepFloors || []).length > 0) {
+ lines.push(`│ └─ floors: [${m.evidence.mustKeepFloors.slice(0, 10).join(', ')}]`);
+ }
+ if ((m.evidence.lexHitButNotSelected || 0) > 0) {
+ lines.push(`└─ lex_hit_but_not_selected: ${m.evidence.lexHitButNotSelected}`);
+ } else {
+ lines.push(`└─ lex_hit_but_not_selected: 0`);
+ }
+ lines.push('');
+
// Constraint (L3 Facts)
lines.push('[Constraint] L3 Facts - 世界约束');
lines.push(`├─ total: ${m.constraint.total}`);
@@ -358,6 +395,9 @@ export function formatMetricsLog(metrics) {
lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank} floors`);
lines.push(`│ │ ├─ after: ${m.evidence.afterRerank} floors`);
lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`);
+ if ((m.evidence.droppedByRerankCount || 0) > 0) {
+ lines.push(`│ ├─ dropped_normal: ${m.evidence.droppedByRerankCount}`);
+ }
if (m.evidence.rerankScores) {
const rs = m.evidence.rerankScores;
lines.push(`│ ├─ rerank_scores: min=${rs.min}, max=${rs.max}, mean=${rs.mean}`);
diff --git a/modules/story-summary/vector/retrieval/query-builder.js b/modules/story-summary/vector/retrieval/query-builder.js
index c5593a0..714a0a9 100644
--- a/modules/story-summary/vector/retrieval/query-builder.js
+++ b/modules/story-summary/vector/retrieval/query-builder.js
@@ -20,6 +20,7 @@
import { getContext } from '../../../../../../../extensions.js';
import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText, buildCharacterPools } from './entity-lexicon.js';
+import { getLexicalIdfAccessor } from './lexical-index.js';
import { getSummaryStore } from '../../data/store.js';
import { filterText } from '../utils/text-filter.js';
import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
@@ -106,6 +107,7 @@ export function computeLengthFactor(charCount) {
function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
if (!text) return [];
+ const idfAccessor = getLexicalIdfAccessor();
const tokens = tokenizerTokenizeForIndex(text);
const freq = new Map();
for (const token of tokens) {
@@ -115,9 +117,13 @@ function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
}
return Array.from(freq.entries())
- .sort((a, b) => b[1] - a[1])
+ .map(([term, tf]) => {
+ const idf = idfAccessor.enabled ? idfAccessor.getIdf(term) : 1;
+ return { term, tf, score: tf * idf };
+ })
+ .sort((a, b) => (b.score - a.score) || (b.tf - a.tf))
.slice(0, maxTerms)
- .map(([term]) => term);
+ .map(x => x.term);
}
// ─────────────────────────────────────────────────────────────────────────
diff --git a/modules/story-summary/vector/retrieval/recall.js b/modules/story-summary/vector/retrieval/recall.js
index b049e32..1d8486b 100644
--- a/modules/story-summary/vector/retrieval/recall.js
+++ b/modules/story-summary/vector/retrieval/recall.js
@@ -42,6 +42,7 @@ import { getLexicalIndex, searchLexicalIndex } from './lexical-index.js';
import { rerankChunks } from '../llm/reranker.js';
import { createMetrics, calcSimilarityStats } from './metrics.js';
import { diffuseFromSeeds } from './diffusion.js';
+import { tokenizeForIndex } from '../utils/tokenizer.js';
const MODULE_ID = 'recall';
@@ -81,6 +82,11 @@ const CONFIG = {
RERANK_TOP_N: 20,
RERANK_MIN_SCORE: 0.10,
+ // Fusion guard: lexical must-keep floors
+ MUST_KEEP_MAX_FLOORS: 3,
+ MUST_KEEP_MIN_IDF: 2.2,
+ MUST_KEEP_CLUSTER_WINDOW: 2,
+
// 因果链
CAUSAL_CHAIN_MAX_DEPTH: 10,
CAUSAL_INJECT_MAX: 30,
@@ -517,13 +523,107 @@ function fuseByFloor(denseRank, lexRank, cap = CONFIG.FUSION_CAP) {
return { top: scored.slice(0, cap), totalUnique };
}
+function mapChunkFloorToAiFloor(floor, chat) {
+ let mapped = Number(floor);
+ if (!Number.isInteger(mapped) || mapped < 0) return null;
+
+ if (chat?.[mapped]?.is_user) {
+ const aiFloor = mapped + 1;
+ if (aiFloor < (chat?.length || 0) && !chat?.[aiFloor]?.is_user) {
+ mapped = aiFloor;
+ } else {
+ return null;
+ }
+ }
+ return mapped;
+}
+
+function isNonStopwordTerm(term) {
+ const norm = normalize(term);
+ if (!norm) return false;
+ const tokens = tokenizeForIndex(norm).map(normalize);
+ return tokens.includes(norm);
+}
+
+function buildMustKeepFloors(lexicalResult, lexicalTerms, atomFloorSet, chat) {
+ const out = {
+ terms: [],
+ floors: [],
+ floorSet: new Set(),
+ lexHitButNotSelected: 0,
+ };
+
+ if (!lexicalResult || !lexicalTerms?.length || !atomFloorSet?.size) return out;
+
+ const queryTermSet = new Set((lexicalTerms || []).map(normalize).filter(Boolean));
+ const topIdfTerms = (lexicalResult.topIdfTerms || [])
+ .filter(x => {
+ const term = normalize(x?.term);
+ if (!term) return false;
+ if (!queryTermSet.has(term)) return false;
+ if (term.length < 2) return false;
+ if (!isNonStopwordTerm(term)) return false;
+ if ((x?.idf || 0) < CONFIG.MUST_KEEP_MIN_IDF) return false;
+ const hits = lexicalResult.termFloorHits?.[term];
+ return Array.isArray(hits) && hits.length > 0;
+ })
+ .sort((a, b) => (b.idf || 0) - (a.idf || 0));
+
+ if (!topIdfTerms.length) return out;
+
+ out.terms = topIdfTerms.map(x => ({ term: normalize(x.term), idf: x.idf || 0 }));
+
+ const floorAgg = new Map(); // floor -> { lexHitScore, terms:Set }
+ for (const { term } of out.terms) {
+ const hits = lexicalResult.termFloorHits?.[term] || [];
+ for (const hit of hits) {
+ const aiFloor = mapChunkFloorToAiFloor(hit.floor, chat);
+ if (aiFloor == null) continue;
+ if (!atomFloorSet.has(aiFloor)) continue;
+
+ const cur = floorAgg.get(aiFloor) || { lexHitScore: 0, terms: new Set() };
+ cur.lexHitScore += Number(hit?.weightedScore || 0);
+ cur.terms.add(term);
+ floorAgg.set(aiFloor, cur);
+ }
+ }
+
+ const candidates = [...floorAgg.entries()]
+ .map(([floor, info]) => {
+ const termCoverage = info.terms.size;
+ const finalFloorScore = info.lexHitScore * (1 + 0.2 * Math.max(0, termCoverage - 1));
+ return {
+ floor,
+ score: finalFloorScore,
+ termCoverage,
+ terms: [...info.terms],
+ };
+ })
+ .sort((a, b) => b.score - a.score);
+
+ out.lexHitButNotSelected = candidates.length;
+
+ // Cluster by floor distance and keep the highest score per cluster.
+ const selected = [];
+ for (const c of candidates) {
+ const conflict = selected.some(s => Math.abs(s.floor - c.floor) <= CONFIG.MUST_KEEP_CLUSTER_WINDOW);
+ if (conflict) continue;
+ selected.push(c);
+ if (selected.length >= CONFIG.MUST_KEEP_MAX_FLOORS) break;
+ }
+
+ out.floors = selected;
+ out.floorSet = new Set(selected.map(x => x.floor));
+ return out;
+}
+
// ═══════════════════════════════════════════════════════════════════════════
// [Stage 6] Floor 融合 + Rerank
// ═══════════════════════════════════════════════════════════════════════════
-async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexicalResult, metrics) {
+async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexicalResult, lexicalTerms, metrics) {
const { chatId, chat, name1, name2 } = getContext();
- if (!chatId) return { l0Selected: [], l1ScoredByFloor: new Map() };
+ if (!chatId) return { l0Selected: [], l1ScoredByFloor: new Map(), mustKeepFloors: [] };
const T_Start = performance.now();
@@ -558,17 +658,8 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
for (const { chunkId, score } of (lexicalResult?.chunkScores || [])) {
const match = chunkId?.match(/^c-(\d+)-/);
if (!match) continue;
- let floor = parseInt(match[1], 10);
-
- // USER floor → AI floor 映射
- if (chat?.[floor]?.is_user) {
- const aiFloor = floor + 1;
- if (aiFloor < chat.length && !chat[aiFloor]?.is_user) {
- floor = aiFloor;
- } else {
- continue;
- }
- }
+ const floor = mapChunkFloorToAiFloor(parseInt(match[1], 10), chat);
+ if (floor == null) continue;
// 预过滤:必须有 L0 atoms
if (!atomFloorSet.has(floor)) continue;
@@ -600,6 +691,12 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
metrics.lexical.floorFilteredByDense = lexFloorFilteredByDense;
}
+ // ─────────────────────────────────────────────────────────────────
+ // 6b.5 Fusion Guard: lexical must-keep floors
+ // ─────────────────────────────────────────────────────────────────
+
+ const mustKeep = buildMustKeepFloors(lexicalResult, lexicalTerms, atomFloorSet, chat);
+
// ─────────────────────────────────────────────────────────────────
// 6c. Floor W-RRF 融合
// ─────────────────────────────────────────────────────────────────
@@ -617,6 +714,10 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
metrics.fusion.denseAggMethod = 'maxSim';
metrics.fusion.lexDensityBonus = CONFIG.LEX_DENSITY_BONUS;
metrics.evidence.floorCandidates = fusedFloors.length;
+ metrics.evidence.mustKeepTermsCount = mustKeep.terms.length;
+ metrics.evidence.mustKeepFloorsCount = mustKeep.floors.length;
+ metrics.evidence.mustKeepFloors = mustKeep.floors.map(x => x.floor).slice(0, 10);
+ metrics.evidence.lexHitButNotSelected = Math.max(0, mustKeep.lexHitButNotSelected - mustKeep.floors.length);
}
if (fusedFloors.length === 0) {
@@ -628,7 +729,7 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
metrics.evidence.l1CosineTime = 0;
metrics.evidence.rerankApplied = false;
}
- return { l0Selected: [], l1ScoredByFloor: new Map() };
+ return { l0Selected: [], l1ScoredByFloor: new Map(), mustKeepFloors: [] };
}
// ─────────────────────────────────────────────────────────────────
@@ -650,8 +751,10 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
// 6e. 构建 rerank documents(每个 floor: USER chunks + AI chunks)
// ─────────────────────────────────────────────────────────────────
+ const normalFloors = fusedFloors.filter(f => !mustKeep.floorSet.has(f.id));
+
const rerankCandidates = [];
- for (const f of fusedFloors) {
+ for (const f of normalFloors) {
const aiFloor = f.id;
const userFloor = aiFloor - 1;
@@ -698,6 +801,7 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
metrics.evidence.rerankApplied = true;
metrics.evidence.beforeRerank = rerankCandidates.length;
metrics.evidence.afterRerank = reranked.length;
+ metrics.evidence.droppedByRerankCount = Math.max(0, rerankCandidates.length - reranked.length);
metrics.evidence.rerankFailed = reranked.some(c => c._rerankFailed);
metrics.evidence.rerankTime = rerankTime;
metrics.timing.evidenceRerank = rerankTime;
@@ -722,9 +826,12 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
// 6g. 收集 L0 atoms
// ─────────────────────────────────────────────────────────────────
- // 仅保留“真实 dense 命中”的 L0 原子:
- // 旧逻辑按 floor 全塞,容易把同层无关原子带进来。
- const atomById = new Map(getStateAtoms().map(a => [a.atomId, a]));
+ // Floor-based L0 collection:
+ // once a floor is selected by fusion/rerank, L0 atoms come from that floor.
+ // Dense anchor hits are used as similarity signals (for ranking), not as a hard admission filter.
+ const allAtoms = getStateAtoms();
+ const atomById = new Map(allAtoms.map(a => [a.atomId, a]));
+ const anchorSimilarityByAtomId = new Map((anchorHits || []).map(h => [h.atomId, h.similarity || 0]));
const matchedAtomsByFloor = new Map();
for (const hit of (anchorHits || [])) {
const atom = hit.atom || atomById.get(hit.atomId);
@@ -739,15 +846,42 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
arr.sort((a, b) => b.similarity - a.similarity);
}
+ const mustKeepMissing = mustKeep.floors
+ .filter(mf => !reranked.some(r => r.floor === mf.floor))
+ .map(mf => ({
+ floor: mf.floor,
+ _rerankScore: 0.12 + Math.min(0.05, 0.01 * (mf.termCoverage || 1)),
+ _isMustKeep: true,
+ }));
+
+ const finalFloorItems = [
+ ...reranked.map(r => ({ ...r, _isMustKeep: false })),
+ ...mustKeepMissing,
+ ];
+
+ const allAtomsByFloor = new Map();
+ for (const atom of allAtoms) {
+ const f = Number(atom?.floor);
+ if (!Number.isInteger(f) || f < 0) continue;
+ if (!allAtomsByFloor.has(f)) allAtomsByFloor.set(f, []);
+ allAtomsByFloor.get(f).push(atom);
+ }
+
const l0Selected = [];
- for (const item of reranked) {
+ for (const item of finalFloorItems) {
const floor = item.floor;
- const rerankScore = item._rerankScore || 0;
+ const rerankScore = Number.isFinite(item?._rerankScore) ? item._rerankScore : 0;
- // 仅收集该 floor 中真实命中的 L0 atoms
- const floorMatchedAtoms = matchedAtomsByFloor.get(floor) || [];
- for (const { atom, similarity } of floorMatchedAtoms) {
+ const floorAtoms = allAtomsByFloor.get(floor) || [];
+ floorAtoms.sort((a, b) => {
+ const sa = anchorSimilarityByAtomId.get(a.atomId) || 0;
+ const sb = anchorSimilarityByAtomId.get(b.atomId) || 0;
+ return sb - sa;
+ });
+
+ for (const atom of floorAtoms) {
+ const similarity = anchorSimilarityByAtomId.get(atom.atomId) || 0;
l0Selected.push({
id: `anchor-${atom.atomId}`,
atomId: atom.atomId,
@@ -762,7 +896,7 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
}
if (metrics) {
- metrics.evidence.floorsSelected = reranked.length;
+ metrics.evidence.floorsSelected = finalFloorItems.length;
metrics.evidence.l0Collected = l0Selected.length;
metrics.evidence.l1Pulled = 0;
@@ -777,10 +911,14 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
}
xbLog.info(MODULE_ID,
- `Evidence: ${denseFloorRank.length} dense floors + ${lexFloorRank.length} lex floors (${lexFloorFilteredByDense} lex filtered by dense) → fusion=${fusedFloors.length} → rerank=${reranked.length} floors → L0=${l0Selected.length} (${totalTime}ms)`
+ `Evidence: ${denseFloorRank.length} dense floors + ${lexFloorRank.length} lex floors (${lexFloorFilteredByDense} lex filtered by dense) → fusion=${fusedFloors.length} → rerank(normal)=${reranked.length} + mustKeep=${mustKeepMissing.length} floors → L0=${l0Selected.length} (${totalTime}ms)`
);
- return { l0Selected, l1ScoredByFloor };
+ return {
+ l0Selected,
+ l1ScoredByFloor,
+ mustKeepFloors: mustKeep.floors.map(x => x.floor),
+ };
}
// ═══════════════════════════════════════════════════════════════════════════
@@ -965,6 +1103,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: [],
focusTerms: [],
focusCharacters: [],
+ mustKeepFloors: [],
elapsed: metrics.timing.total,
logText: 'No events.',
metrics,
@@ -984,6 +1123,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
: CONFIG.LAST_MESSAGES_K;
const lastMessages = getLastMessages(chat, lastMessagesCount, excludeLastAi);
+ // Non-blocking preload: keep recall latency stable.
+ // If not ready yet, query-builder will gracefully fall back to TF terms.
+ getLexicalIndex().catch((e) => {
+ xbLog.warn(MODULE_ID, 'Preload lexical index failed; continue with TF fallback', e);
+ });
+
const bundle = buildQueryBundle(lastMessages, pendingUserMessage);
const focusTerms = bundle.focusTerms || bundle.focusEntities || [];
const focusCharacters = bundle.focusCharacters || [];
@@ -1015,6 +1160,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms,
focusTerms,
focusCharacters,
+ mustKeepFloors: [],
elapsed: metrics.timing.total,
logText: 'No query segments.',
metrics,
@@ -1037,6 +1183,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms,
focusTerms,
focusCharacters,
+ mustKeepFloors: [],
elapsed: metrics.timing.total,
logText: 'Embedding failed (round 1, after retry).',
metrics,
@@ -1051,6 +1198,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms,
focusTerms,
focusCharacters,
+ mustKeepFloors: [],
elapsed: metrics.timing.total,
logText: 'Empty query vectors (round 1).',
metrics,
@@ -1071,6 +1219,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms,
focusTerms,
focusCharacters,
+ mustKeepFloors: [],
elapsed: metrics.timing.total,
logText: 'Weighted average produced empty vector.',
metrics,
@@ -1161,6 +1310,10 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
atomIds: [], atomFloors: new Set(),
chunkIds: [], chunkFloors: new Set(),
eventIds: [], chunkScores: [], searchTime: 0,
+ idfEnabled: false, idfDocCount: 0, topIdfTerms: [], termSearches: 0,
+ queryTerms: [],
+ termFloorHits: {},
+ floorLexScores: [],
};
let indexReadyTime = 0;
@@ -1184,6 +1337,10 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
metrics.lexical.searchTime = lexicalResult.searchTime || 0;
metrics.lexical.indexReadyTime = indexReadyTime;
metrics.lexical.terms = bundle.lexicalTerms.slice(0, 10);
+ metrics.lexical.idfEnabled = !!lexicalResult.idfEnabled;
+ metrics.lexical.idfDocCount = lexicalResult.idfDocCount || 0;
+ metrics.lexical.topIdfTerms = lexicalResult.topIdfTerms || [];
+ metrics.lexical.termSearches = lexicalResult.termSearches || 0;
}
// 合并 L2 events(lexical 命中但 dense 未命中的 events)
@@ -1238,18 +1395,19 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
}
xbLog.info(MODULE_ID,
- `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`
+ `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} idfEnabled=${lexicalResult.idfEnabled ? 'yes' : 'no'} idfDocs=${lexicalResult.idfDocCount || 0} termSearches=${lexicalResult.termSearches || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`
);
// ═══════════════════════════════════════════════════════════════════
// 阶段 6: Floor 粒度融合 + Rerank + L1 配对
// ═══════════════════════════════════════════════════════════════════
- const { l0Selected, l1ScoredByFloor } = await locateAndPullEvidence(
+ const { l0Selected, l1ScoredByFloor, mustKeepFloors } = await locateAndPullEvidence(
anchorHits,
queryVector_v1,
bundle.rerankQuery,
lexicalResult,
+ bundle.lexicalTerms,
metrics
);
@@ -1379,6 +1537,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
console.log(`Round 2 Anchors: ${anchorHits.length} hits → ${anchorFloors_dense.size} floors`);
console.log(`Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} evtMerged=+${lexicalEventCount} evtFiltered=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} (idx=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`);
console.log(`Fusion (floor, weighted): dense=${metrics.fusion.denseFloors} lex=${metrics.fusion.lexFloors} → cap=${metrics.fusion.afterCap} (${metrics.fusion.time}ms)`);
+ console.log(`Fusion Guard: mustKeepTerms=${metrics.evidence.mustKeepTermsCount || 0} mustKeepFloors=[${(metrics.evidence.mustKeepFloors || []).join(', ')}]`);
console.log(`Floor Rerank: ${metrics.evidence.beforeRerank || 0} → ${metrics.evidence.floorsSelected || 0} floors → L0=${metrics.evidence.l0Collected || 0} (${metrics.evidence.rerankTime || 0}ms)`);
console.log(`L1: ${metrics.evidence.l1Pulled || 0} pulled → ${metrics.evidence.l1Attached || 0} attached (${metrics.evidence.l1CosineTime || 0}ms)`);
console.log(`Events: ${eventHits.length} hits (l0Linked=+${l0LinkedCount}), ${causalChain.length} causal`);
@@ -1393,6 +1552,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms,
focusTerms,
focusCharacters,
+ mustKeepFloors: mustKeepFloors || [],
elapsed: metrics.timing.total,
metrics,
};
diff --git a/modules/story-summary/vector/utils/stopwords-base.js b/modules/story-summary/vector/utils/stopwords-base.js
new file mode 100644
index 0000000..2ce6fa0
--- /dev/null
+++ b/modules/story-summary/vector/utils/stopwords-base.js
@@ -0,0 +1,2231 @@
+// Auto-generated stopword baseline for story-summary.
+// Source: stopwords-iso (MIT), snapshot files under ./stopwords-data
+// Languages merged: zh + ja + en
+// Do not edit manually; update the snapshot files under ./stopwords-data and regenerate.
+
+export const BASE_STOP_WORDS = [
+ "、",
+ "。",
+ "〈",
+ "〉",
+ "《",
+ "》",
+ "一",
+ "一个",
+ "一些",
+ "一何",
+ "一切",
+ "一则",
+ "一方面",
+ "一旦",
+ "一来",
+ "一样",
+ "一种",
+ "一般",
+ "一转眼",
+ "七",
+ "万一",
+ "三",
+ "上",
+ "上下",
+ "下",
+ "不",
+ "不仅",
+ "不但",
+ "不光",
+ "不单",
+ "不只",
+ "不外乎",
+ "不如",
+ "不妨",
+ "不尽",
+ "不尽然",
+ "不得",
+ "不怕",
+ "不惟",
+ "不成",
+ "不拘",
+ "不料",
+ "不是",
+ "不比",
+ "不然",
+ "不特",
+ "不独",
+ "不管",
+ "不至于",
+ "不若",
+ "不论",
+ "不过",
+ "不问",
+ "与",
+ "与其",
+ "与其说",
+ "与否",
+ "与此同时",
+ "且",
+ "且不说",
+ "且说",
+ "两者",
+ "个",
+ "个别",
+ "中",
+ "临",
+ "为",
+ "为了",
+ "为什么",
+ "为何",
+ "为止",
+ "为此",
+ "为着",
+ "乃",
+ "乃至",
+ "乃至于",
+ "么",
+ "之",
+ "之一",
+ "之所以",
+ "之类",
+ "乌乎",
+ "乎",
+ "乘",
+ "九",
+ "也",
+ "也好",
+ "也罢",
+ "了",
+ "二",
+ "二来",
+ "于",
+ "于是",
+ "于是乎",
+ "云云",
+ "云尔",
+ "五",
+ "些",
+ "亦",
+ "人",
+ "人们",
+ "人家",
+ "什",
+ "什么",
+ "什么样",
+ "今",
+ "介于",
+ "仍",
+ "仍旧",
+ "从",
+ "从此",
+ "从而",
+ "他",
+ "他人",
+ "他们",
+ "他们们",
+ "以",
+ "以上",
+ "以为",
+ "以便",
+ "以免",
+ "以及",
+ "以故",
+ "以期",
+ "以来",
+ "以至",
+ "以至于",
+ "以致",
+ "们",
+ "任",
+ "任何",
+ "任凭",
+ "会",
+ "似的",
+ "但",
+ "但凡",
+ "但是",
+ "何",
+ "何以",
+ "何况",
+ "何处",
+ "何时",
+ "余外",
+ "作为",
+ "你",
+ "你们",
+ "使",
+ "使得",
+ "例如",
+ "依",
+ "依据",
+ "依照",
+ "便于",
+ "俺",
+ "俺们",
+ "倘",
+ "倘使",
+ "倘或",
+ "倘然",
+ "倘若",
+ "借",
+ "借傥然",
+ "假使",
+ "假如",
+ "假若",
+ "做",
+ "像",
+ "儿",
+ "先不先",
+ "光",
+ "光是",
+ "全体",
+ "全部",
+ "八",
+ "六",
+ "兮",
+ "共",
+ "关于",
+ "关于具体地说",
+ "其",
+ "其一",
+ "其中",
+ "其二",
+ "其他",
+ "其余",
+ "其它",
+ "其次",
+ "具体地说",
+ "具体说来",
+ "兼之",
+ "内",
+ "再",
+ "再其次",
+ "再则",
+ "再有",
+ "再者",
+ "再者说",
+ "再说",
+ "冒",
+ "冲",
+ "况且",
+ "几",
+ "几时",
+ "凡",
+ "凡是",
+ "凭",
+ "凭借",
+ "出于",
+ "出来",
+ "分",
+ "分别",
+ "则",
+ "则甚",
+ "别",
+ "别人",
+ "别处",
+ "别是",
+ "别的",
+ "别管",
+ "别说",
+ "到",
+ "前后",
+ "前此",
+ "前者",
+ "加之",
+ "加以",
+ "区",
+ "即",
+ "即令",
+ "即使",
+ "即便",
+ "即如",
+ "即或",
+ "即若",
+ "却",
+ "去",
+ "又",
+ "又及",
+ "及",
+ "及其",
+ "及至",
+ "反之",
+ "反而",
+ "反过来",
+ "反过来说",
+ "受到",
+ "另",
+ "另一方面",
+ "另外",
+ "另悉",
+ "只",
+ "只当",
+ "只怕",
+ "只是",
+ "只有",
+ "只消",
+ "只要",
+ "只限",
+ "叫",
+ "叮咚",
+ "可",
+ "可以",
+ "可是",
+ "可见",
+ "各",
+ "各个",
+ "各位",
+ "各种",
+ "各自",
+ "同",
+ "同时",
+ "后",
+ "后者",
+ "向",
+ "向使",
+ "向着",
+ "吓",
+ "吗",
+ "否则",
+ "吧",
+ "吧哒",
+ "含",
+ "吱",
+ "呀",
+ "呃",
+ "呕",
+ "呗",
+ "呜",
+ "呜呼",
+ "呢",
+ "呵",
+ "呵呵",
+ "呸",
+ "呼哧",
+ "咋",
+ "和",
+ "咚",
+ "咦",
+ "咧",
+ "咱",
+ "咱们",
+ "咳",
+ "哇",
+ "哈",
+ "哈哈",
+ "哉",
+ "哎",
+ "哎呀",
+ "哎哟",
+ "哗",
+ "哟",
+ "哦",
+ "哩",
+ "哪",
+ "哪个",
+ "哪些",
+ "哪儿",
+ "哪天",
+ "哪年",
+ "哪怕",
+ "哪样",
+ "哪边",
+ "哪里",
+ "哼",
+ "哼唷",
+ "唉",
+ "唯有",
+ "啊",
+ "啐",
+ "啥",
+ "啦",
+ "啪达",
+ "啷当",
+ "喂",
+ "喏",
+ "喔唷",
+ "喽",
+ "嗡",
+ "嗡嗡",
+ "嗬",
+ "嗯",
+ "嗳",
+ "嘎",
+ "嘎登",
+ "嘘",
+ "嘛",
+ "嘻",
+ "嘿",
+ "嘿嘿",
+ "四",
+ "因",
+ "因为",
+ "因了",
+ "因此",
+ "因着",
+ "因而",
+ "固然",
+ "在",
+ "在下",
+ "在于",
+ "地",
+ "基于",
+ "处在",
+ "多",
+ "多么",
+ "多少",
+ "大",
+ "大家",
+ "她",
+ "她们",
+ "好",
+ "如",
+ "如上",
+ "如上所述",
+ "如下",
+ "如何",
+ "如其",
+ "如同",
+ "如是",
+ "如果",
+ "如此",
+ "如若",
+ "始而",
+ "孰料",
+ "孰知",
+ "宁",
+ "宁可",
+ "宁愿",
+ "宁肯",
+ "它",
+ "它们",
+ "对",
+ "对于",
+ "对待",
+ "对方",
+ "对比",
+ "将",
+ "小",
+ "尔",
+ "尔后",
+ "尔尔",
+ "尚且",
+ "就",
+ "就是",
+ "就是了",
+ "就是说",
+ "就算",
+ "就要",
+ "尽",
+ "尽管",
+ "尽管如此",
+ "岂但",
+ "己",
+ "已",
+ "已矣",
+ "巴",
+ "巴巴",
+ "年",
+ "并",
+ "并且",
+ "庶乎",
+ "庶几",
+ "开外",
+ "开始",
+ "归",
+ "归齐",
+ "当",
+ "当地",
+ "当然",
+ "当着",
+ "彼",
+ "彼时",
+ "彼此",
+ "往",
+ "待",
+ "很",
+ "得",
+ "得了",
+ "怎",
+ "怎么",
+ "怎么办",
+ "怎么样",
+ "怎奈",
+ "怎样",
+ "总之",
+ "总的来看",
+ "总的来说",
+ "总的说来",
+ "总而言之",
+ "恰恰相反",
+ "您",
+ "惟其",
+ "慢说",
+ "我",
+ "我们",
+ "或",
+ "或则",
+ "或是",
+ "或曰",
+ "或者",
+ "截至",
+ "所",
+ "所以",
+ "所在",
+ "所幸",
+ "所有",
+ "才",
+ "才能",
+ "打",
+ "打从",
+ "把",
+ "抑或",
+ "拿",
+ "按",
+ "按照",
+ "换句话说",
+ "换言之",
+ "据",
+ "据此",
+ "接着",
+ "故",
+ "故此",
+ "故而",
+ "旁人",
+ "无",
+ "无宁",
+ "无论",
+ "既",
+ "既往",
+ "既是",
+ "既然",
+ "日",
+ "时",
+ "时候",
+ "是",
+ "是以",
+ "是的",
+ "更",
+ "曾",
+ "替",
+ "替代",
+ "最",
+ "月",
+ "有",
+ "有些",
+ "有关",
+ "有及",
+ "有时",
+ "有的",
+ "望",
+ "朝",
+ "朝着",
+ "本",
+ "本人",
+ "本地",
+ "本着",
+ "本身",
+ "来",
+ "来着",
+ "来自",
+ "来说",
+ "极了",
+ "果然",
+ "果真",
+ "某",
+ "某个",
+ "某些",
+ "某某",
+ "根据",
+ "欤",
+ "正值",
+ "正如",
+ "正巧",
+ "正是",
+ "此",
+ "此地",
+ "此处",
+ "此外",
+ "此时",
+ "此次",
+ "此间",
+ "毋宁",
+ "每",
+ "每当",
+ "比",
+ "比及",
+ "比如",
+ "比方",
+ "没奈何",
+ "沿",
+ "沿着",
+ "漫说",
+ "点",
+ "焉",
+ "然则",
+ "然后",
+ "然而",
+ "照",
+ "照着",
+ "犹且",
+ "犹自",
+ "甚且",
+ "甚么",
+ "甚或",
+ "甚而",
+ "甚至",
+ "甚至于",
+ "用",
+ "用来",
+ "由",
+ "由于",
+ "由是",
+ "由此",
+ "由此可见",
+ "的",
+ "的确",
+ "的话",
+ "直到",
+ "相对而言",
+ "省得",
+ "看",
+ "眨眼",
+ "着",
+ "着呢",
+ "矣",
+ "矣乎",
+ "矣哉",
+ "离",
+ "秒",
+ "称",
+ "竟而",
+ "第",
+ "等",
+ "等到",
+ "等等",
+ "简言之",
+ "管",
+ "类如",
+ "紧接着",
+ "纵",
+ "纵令",
+ "纵使",
+ "纵然",
+ "经",
+ "经过",
+ "结果",
+ "给",
+ "继之",
+ "继后",
+ "继而",
+ "综上所述",
+ "罢了",
+ "者",
+ "而",
+ "而且",
+ "而况",
+ "而后",
+ "而外",
+ "而已",
+ "而是",
+ "而言",
+ "能",
+ "能否",
+ "腾",
+ "自",
+ "自个儿",
+ "自从",
+ "自各儿",
+ "自后",
+ "自家",
+ "自己",
+ "自打",
+ "自身",
+ "至",
+ "至于",
+ "至今",
+ "至若",
+ "致",
+ "般的",
+ "若",
+ "若夫",
+ "若是",
+ "若果",
+ "若非",
+ "莫不然",
+ "莫如",
+ "莫若",
+ "虽",
+ "虽则",
+ "虽然",
+ "虽说",
+ "被",
+ "要",
+ "要不",
+ "要不是",
+ "要不然",
+ "要么",
+ "要是",
+ "譬喻",
+ "譬如",
+ "让",
+ "许多",
+ "论",
+ "设使",
+ "设或",
+ "设若",
+ "诚如",
+ "诚然",
+ "该",
+ "说",
+ "说来",
+ "请",
+ "诸",
+ "诸位",
+ "诸如",
+ "谁",
+ "谁人",
+ "谁料",
+ "谁知",
+ "贼死",
+ "赖以",
+ "赶",
+ "起",
+ "起见",
+ "趁",
+ "趁着",
+ "越是",
+ "距",
+ "跟",
+ "较",
+ "较之",
+ "边",
+ "过",
+ "还",
+ "还是",
+ "还有",
+ "还要",
+ "这",
+ "这一来",
+ "这个",
+ "这么",
+ "这么些",
+ "这么样",
+ "这么点儿",
+ "这些",
+ "这会儿",
+ "这儿",
+ "这就是说",
+ "这时",
+ "这样",
+ "这次",
+ "这般",
+ "这边",
+ "这里",
+ "进而",
+ "连",
+ "连同",
+ "逐步",
+ "通过",
+ "遵循",
+ "遵照",
+ "那",
+ "那个",
+ "那么",
+ "那么些",
+ "那么样",
+ "那些",
+ "那会儿",
+ "那儿",
+ "那时",
+ "那样",
+ "那般",
+ "那边",
+ "那里",
+ "都",
+ "鄙人",
+ "鉴于",
+ "针对",
+ "阿",
+ "除",
+ "除了",
+ "除外",
+ "除开",
+ "除此之外",
+ "除非",
+ "随",
+ "随后",
+ "随时",
+ "随着",
+ "难道说",
+ "零",
+ "非",
+ "非但",
+ "非徒",
+ "非特",
+ "非独",
+ "靠",
+ "顺",
+ "顺着",
+ "首先",
+ "︿",
+ "!",
+ "#",
+ "$",
+ "%",
+ "&",
+ "(",
+ ")",
+ "*",
+ "+",
+ ",",
+ "0",
+ "1",
+ "2",
+ "3",
+ "4",
+ "5",
+ "6",
+ "7",
+ "8",
+ "9",
+ ":",
+ ";",
+ "<",
+ ">",
+ "?",
+ "@",
+ "[",
+ "]",
+ "{",
+ "|",
+ "}",
+ "~",
+ "¥",
+ "あそこ",
+ "あっ",
+ "あの",
+ "あのかた",
+ "あの人",
+ "あり",
+ "あります",
+ "ある",
+ "あれ",
+ "い",
+ "いう",
+ "います",
+ "いる",
+ "う",
+ "うち",
+ "え",
+ "お",
+ "および",
+ "おり",
+ "おります",
+ "か",
+ "かつて",
+ "から",
+ "が",
+ "き",
+ "ここ",
+ "こちら",
+ "こと",
+ "この",
+ "これ",
+ "これら",
+ "さ",
+ "さらに",
+ "し",
+ "しかし",
+ "する",
+ "ず",
+ "せ",
+ "せる",
+ "そこ",
+ "そして",
+ "その",
+ "その他",
+ "その後",
+ "それ",
+ "それぞれ",
+ "それで",
+ "た",
+ "ただし",
+ "たち",
+ "ため",
+ "たり",
+ "だ",
+ "だっ",
+ "だれ",
+ "つ",
+ "て",
+ "で",
+ "でき",
+ "できる",
+ "です",
+ "では",
+ "でも",
+ "と",
+ "という",
+ "といった",
+ "とき",
+ "ところ",
+ "として",
+ "とともに",
+ "とも",
+ "と共に",
+ "どこ",
+ "どの",
+ "な",
+ "ない",
+ "なお",
+ "なかっ",
+ "ながら",
+ "なく",
+ "なっ",
+ "など",
+ "なに",
+ "なら",
+ "なり",
+ "なる",
+ "なん",
+ "に",
+ "において",
+ "における",
+ "について",
+ "にて",
+ "によって",
+ "により",
+ "による",
+ "に対して",
+ "に対する",
+ "に関する",
+ "の",
+ "ので",
+ "のみ",
+ "は",
+ "ば",
+ "へ",
+ "ほか",
+ "ほとんど",
+ "ほど",
+ "ます",
+ "また",
+ "または",
+ "まで",
+ "も",
+ "もの",
+ "ものの",
+ "や",
+ "よう",
+ "より",
+ "ら",
+ "られ",
+ "られる",
+ "れ",
+ "れる",
+ "を",
+ "ん",
+ "及び",
+ "彼女",
+ "我々",
+ "特に",
+ "私",
+ "私達",
+ "貴方",
+ "貴方方",
+ "'ll",
+ "'tis",
+ "'twas",
+ "'ve",
+ "10",
+ "39",
+ "a",
+ "a's",
+ "able",
+ "ableabout",
+ "about",
+ "above",
+ "abroad",
+ "abst",
+ "accordance",
+ "according",
+ "accordingly",
+ "across",
+ "act",
+ "actually",
+ "ad",
+ "added",
+ "adj",
+ "adopted",
+ "ae",
+ "af",
+ "affected",
+ "affecting",
+ "affects",
+ "after",
+ "afterwards",
+ "ag",
+ "again",
+ "against",
+ "ago",
+ "ah",
+ "ahead",
+ "ai",
+ "ain't",
+ "aint",
+ "al",
+ "all",
+ "allow",
+ "allows",
+ "almost",
+ "alone",
+ "along",
+ "alongside",
+ "already",
+ "also",
+ "although",
+ "always",
+ "am",
+ "amid",
+ "amidst",
+ "among",
+ "amongst",
+ "amoungst",
+ "amount",
+ "an",
+ "and",
+ "announce",
+ "another",
+ "any",
+ "anybody",
+ "anyhow",
+ "anymore",
+ "anyone",
+ "anything",
+ "anyway",
+ "anyways",
+ "anywhere",
+ "ao",
+ "apart",
+ "apparently",
+ "appear",
+ "appreciate",
+ "appropriate",
+ "approximately",
+ "aq",
+ "ar",
+ "are",
+ "area",
+ "areas",
+ "aren",
+ "aren't",
+ "arent",
+ "arise",
+ "around",
+ "arpa",
+ "as",
+ "aside",
+ "ask",
+ "asked",
+ "asking",
+ "asks",
+ "associated",
+ "at",
+ "au",
+ "auth",
+ "available",
+ "aw",
+ "away",
+ "awfully",
+ "az",
+ "b",
+ "ba",
+ "back",
+ "backed",
+ "backing",
+ "backs",
+ "backward",
+ "backwards",
+ "bb",
+ "bd",
+ "be",
+ "became",
+ "because",
+ "become",
+ "becomes",
+ "becoming",
+ "been",
+ "before",
+ "beforehand",
+ "began",
+ "begin",
+ "beginning",
+ "beginnings",
+ "begins",
+ "behind",
+ "being",
+ "beings",
+ "believe",
+ "below",
+ "beside",
+ "besides",
+ "best",
+ "better",
+ "between",
+ "beyond",
+ "bf",
+ "bg",
+ "bh",
+ "bi",
+ "big",
+ "bill",
+ "billion",
+ "biol",
+ "bj",
+ "bm",
+ "bn",
+ "bo",
+ "both",
+ "bottom",
+ "br",
+ "brief",
+ "briefly",
+ "bs",
+ "bt",
+ "but",
+ "buy",
+ "bv",
+ "bw",
+ "by",
+ "bz",
+ "c",
+ "c'mon",
+ "c's",
+ "ca",
+ "call",
+ "came",
+ "can",
+ "can't",
+ "cannot",
+ "cant",
+ "caption",
+ "case",
+ "cases",
+ "cause",
+ "causes",
+ "cc",
+ "cd",
+ "certain",
+ "certainly",
+ "cf",
+ "cg",
+ "ch",
+ "changes",
+ "ci",
+ "ck",
+ "cl",
+ "clear",
+ "clearly",
+ "click",
+ "cm",
+ "cmon",
+ "cn",
+ "co",
+ "co.",
+ "com",
+ "come",
+ "comes",
+ "computer",
+ "con",
+ "concerning",
+ "consequently",
+ "consider",
+ "considering",
+ "contain",
+ "containing",
+ "contains",
+ "copy",
+ "corresponding",
+ "could",
+ "could've",
+ "couldn",
+ "couldn't",
+ "couldnt",
+ "course",
+ "cr",
+ "cry",
+ "cs",
+ "cu",
+ "currently",
+ "cv",
+ "cx",
+ "cy",
+ "cz",
+ "d",
+ "dare",
+ "daren't",
+ "darent",
+ "date",
+ "de",
+ "dear",
+ "definitely",
+ "describe",
+ "described",
+ "despite",
+ "detail",
+ "did",
+ "didn",
+ "didn't",
+ "didnt",
+ "differ",
+ "different",
+ "differently",
+ "directly",
+ "dj",
+ "dk",
+ "dm",
+ "do",
+ "does",
+ "doesn",
+ "doesn't",
+ "doesnt",
+ "doing",
+ "don",
+ "don't",
+ "done",
+ "dont",
+ "doubtful",
+ "down",
+ "downed",
+ "downing",
+ "downs",
+ "downwards",
+ "due",
+ "during",
+ "dz",
+ "e",
+ "each",
+ "early",
+ "ec",
+ "ed",
+ "edu",
+ "ee",
+ "effect",
+ "eg",
+ "eh",
+ "eight",
+ "eighty",
+ "either",
+ "eleven",
+ "else",
+ "elsewhere",
+ "empty",
+ "end",
+ "ended",
+ "ending",
+ "ends",
+ "enough",
+ "entirely",
+ "er",
+ "es",
+ "especially",
+ "et",
+ "et-al",
+ "etc",
+ "even",
+ "evenly",
+ "ever",
+ "evermore",
+ "every",
+ "everybody",
+ "everyone",
+ "everything",
+ "everywhere",
+ "ex",
+ "exactly",
+ "example",
+ "except",
+ "f",
+ "face",
+ "faces",
+ "fact",
+ "facts",
+ "fairly",
+ "far",
+ "farther",
+ "felt",
+ "few",
+ "fewer",
+ "ff",
+ "fi",
+ "fifteen",
+ "fifth",
+ "fifty",
+ "fify",
+ "fill",
+ "find",
+ "finds",
+ "fire",
+ "first",
+ "five",
+ "fix",
+ "fj",
+ "fk",
+ "fm",
+ "fo",
+ "followed",
+ "following",
+ "follows",
+ "for",
+ "forever",
+ "former",
+ "formerly",
+ "forth",
+ "forty",
+ "forward",
+ "found",
+ "four",
+ "fr",
+ "free",
+ "from",
+ "front",
+ "full",
+ "fully",
+ "further",
+ "furthered",
+ "furthering",
+ "furthermore",
+ "furthers",
+ "fx",
+ "g",
+ "ga",
+ "gave",
+ "gb",
+ "gd",
+ "ge",
+ "general",
+ "generally",
+ "get",
+ "gets",
+ "getting",
+ "gf",
+ "gg",
+ "gh",
+ "gi",
+ "give",
+ "given",
+ "gives",
+ "giving",
+ "gl",
+ "gm",
+ "gmt",
+ "gn",
+ "go",
+ "goes",
+ "going",
+ "gone",
+ "good",
+ "goods",
+ "got",
+ "gotten",
+ "gov",
+ "gp",
+ "gq",
+ "gr",
+ "great",
+ "greater",
+ "greatest",
+ "greetings",
+ "group",
+ "grouped",
+ "grouping",
+ "groups",
+ "gs",
+ "gt",
+ "gu",
+ "gw",
+ "gy",
+ "h",
+ "had",
+ "hadn't",
+ "hadnt",
+ "half",
+ "happens",
+ "hardly",
+ "has",
+ "hasn",
+ "hasn't",
+ "hasnt",
+ "have",
+ "haven",
+ "haven't",
+ "havent",
+ "having",
+ "he",
+ "he'd",
+ "he'll",
+ "he's",
+ "hed",
+ "hell",
+ "hello",
+ "help",
+ "hence",
+ "her",
+ "here",
+ "here's",
+ "hereafter",
+ "hereby",
+ "herein",
+ "heres",
+ "hereupon",
+ "hers",
+ "herself",
+ "herse”",
+ "hes",
+ "hi",
+ "hid",
+ "high",
+ "higher",
+ "highest",
+ "him",
+ "himself",
+ "himse”",
+ "his",
+ "hither",
+ "hk",
+ "hm",
+ "hn",
+ "home",
+ "homepage",
+ "hopefully",
+ "how",
+ "how'd",
+ "how'll",
+ "how's",
+ "howbeit",
+ "however",
+ "hr",
+ "ht",
+ "htm",
+ "html",
+ "http",
+ "hu",
+ "hundred",
+ "i",
+ "i'd",
+ "i'll",
+ "i'm",
+ "i've",
+ "i.e.",
+ "id",
+ "ie",
+ "if",
+ "ignored",
+ "ii",
+ "il",
+ "ill",
+ "im",
+ "immediate",
+ "immediately",
+ "importance",
+ "important",
+ "in",
+ "inasmuch",
+ "inc",
+ "inc.",
+ "indeed",
+ "index",
+ "indicate",
+ "indicated",
+ "indicates",
+ "information",
+ "inner",
+ "inside",
+ "insofar",
+ "instead",
+ "int",
+ "interest",
+ "interested",
+ "interesting",
+ "interests",
+ "into",
+ "invention",
+ "inward",
+ "io",
+ "iq",
+ "ir",
+ "is",
+ "isn",
+ "isn't",
+ "isnt",
+ "it",
+ "it'd",
+ "it'll",
+ "it's",
+ "itd",
+ "itll",
+ "its",
+ "itself",
+ "itse”",
+ "ive",
+ "j",
+ "je",
+ "jm",
+ "jo",
+ "join",
+ "jp",
+ "just",
+ "k",
+ "ke",
+ "keep",
+ "keeps",
+ "kept",
+ "keys",
+ "kg",
+ "kh",
+ "ki",
+ "kind",
+ "km",
+ "kn",
+ "knew",
+ "know",
+ "known",
+ "knows",
+ "kp",
+ "kr",
+ "kw",
+ "ky",
+ "kz",
+ "l",
+ "la",
+ "large",
+ "largely",
+ "last",
+ "lately",
+ "later",
+ "latest",
+ "latter",
+ "latterly",
+ "lb",
+ "lc",
+ "least",
+ "length",
+ "less",
+ "lest",
+ "let",
+ "let's",
+ "lets",
+ "li",
+ "like",
+ "liked",
+ "likely",
+ "likewise",
+ "line",
+ "little",
+ "lk",
+ "ll",
+ "long",
+ "longer",
+ "longest",
+ "look",
+ "looking",
+ "looks",
+ "low",
+ "lower",
+ "lr",
+ "ls",
+ "lt",
+ "ltd",
+ "lu",
+ "lv",
+ "ly",
+ "m",
+ "ma",
+ "made",
+ "mainly",
+ "make",
+ "makes",
+ "making",
+ "man",
+ "many",
+ "may",
+ "maybe",
+ "mayn't",
+ "maynt",
+ "mc",
+ "md",
+ "me",
+ "mean",
+ "means",
+ "meantime",
+ "meanwhile",
+ "member",
+ "members",
+ "men",
+ "merely",
+ "mg",
+ "mh",
+ "microsoft",
+ "might",
+ "might've",
+ "mightn't",
+ "mightnt",
+ "mil",
+ "mill",
+ "million",
+ "mine",
+ "minus",
+ "miss",
+ "mk",
+ "ml",
+ "mm",
+ "mn",
+ "mo",
+ "more",
+ "moreover",
+ "most",
+ "mostly",
+ "move",
+ "mp",
+ "mq",
+ "mr",
+ "mrs",
+ "ms",
+ "msie",
+ "mt",
+ "mu",
+ "much",
+ "mug",
+ "must",
+ "must've",
+ "mustn't",
+ "mustnt",
+ "mv",
+ "mw",
+ "mx",
+ "my",
+ "myself",
+ "myse”",
+ "mz",
+ "n",
+ "na",
+ "name",
+ "namely",
+ "nay",
+ "nc",
+ "nd",
+ "ne",
+ "near",
+ "nearly",
+ "necessarily",
+ "necessary",
+ "need",
+ "needed",
+ "needing",
+ "needn't",
+ "neednt",
+ "needs",
+ "neither",
+ "net",
+ "netscape",
+ "never",
+ "neverf",
+ "neverless",
+ "nevertheless",
+ "new",
+ "newer",
+ "newest",
+ "next",
+ "nf",
+ "ng",
+ "ni",
+ "nine",
+ "ninety",
+ "nl",
+ "no",
+ "no-one",
+ "nobody",
+ "non",
+ "none",
+ "nonetheless",
+ "noone",
+ "nor",
+ "normally",
+ "nos",
+ "not",
+ "noted",
+ "nothing",
+ "notwithstanding",
+ "novel",
+ "now",
+ "nowhere",
+ "np",
+ "nr",
+ "nu",
+ "null",
+ "number",
+ "numbers",
+ "nz",
+ "o",
+ "obtain",
+ "obtained",
+ "obviously",
+ "of",
+ "off",
+ "often",
+ "oh",
+ "ok",
+ "okay",
+ "old",
+ "older",
+ "oldest",
+ "om",
+ "omitted",
+ "on",
+ "once",
+ "one",
+ "one's",
+ "ones",
+ "only",
+ "onto",
+ "open",
+ "opened",
+ "opening",
+ "opens",
+ "opposite",
+ "or",
+ "ord",
+ "order",
+ "ordered",
+ "ordering",
+ "orders",
+ "org",
+ "other",
+ "others",
+ "otherwise",
+ "ought",
+ "oughtn't",
+ "oughtnt",
+ "our",
+ "ours",
+ "ourselves",
+ "out",
+ "outside",
+ "over",
+ "overall",
+ "owing",
+ "own",
+ "p",
+ "pa",
+ "page",
+ "pages",
+ "part",
+ "parted",
+ "particular",
+ "particularly",
+ "parting",
+ "parts",
+ "past",
+ "pe",
+ "per",
+ "perhaps",
+ "pf",
+ "pg",
+ "ph",
+ "pk",
+ "pl",
+ "place",
+ "placed",
+ "places",
+ "please",
+ "plus",
+ "pm",
+ "pmid",
+ "pn",
+ "point",
+ "pointed",
+ "pointing",
+ "points",
+ "poorly",
+ "possible",
+ "possibly",
+ "potentially",
+ "pp",
+ "pr",
+ "predominantly",
+ "present",
+ "presented",
+ "presenting",
+ "presents",
+ "presumably",
+ "previously",
+ "primarily",
+ "probably",
+ "problem",
+ "problems",
+ "promptly",
+ "proud",
+ "provided",
+ "provides",
+ "pt",
+ "put",
+ "puts",
+ "pw",
+ "py",
+ "q",
+ "qa",
+ "que",
+ "quickly",
+ "quite",
+ "qv",
+ "r",
+ "ran",
+ "rather",
+ "rd",
+ "re",
+ "readily",
+ "really",
+ "reasonably",
+ "recent",
+ "recently",
+ "ref",
+ "refs",
+ "regarding",
+ "regardless",
+ "regards",
+ "related",
+ "relatively",
+ "research",
+ "reserved",
+ "respectively",
+ "resulted",
+ "resulting",
+ "results",
+ "right",
+ "ring",
+ "ro",
+ "room",
+ "rooms",
+ "round",
+ "ru",
+ "run",
+ "rw",
+ "s",
+ "sa",
+ "said",
+ "same",
+ "saw",
+ "say",
+ "saying",
+ "says",
+ "sb",
+ "sc",
+ "sd",
+ "se",
+ "sec",
+ "second",
+ "secondly",
+ "seconds",
+ "section",
+ "see",
+ "seeing",
+ "seem",
+ "seemed",
+ "seeming",
+ "seems",
+ "seen",
+ "sees",
+ "self",
+ "selves",
+ "sensible",
+ "sent",
+ "serious",
+ "seriously",
+ "seven",
+ "seventy",
+ "several",
+ "sg",
+ "sh",
+ "shall",
+ "shan't",
+ "shant",
+ "she",
+ "she'd",
+ "she'll",
+ "she's",
+ "shed",
+ "shell",
+ "shes",
+ "should",
+ "should've",
+ "shouldn",
+ "shouldn't",
+ "shouldnt",
+ "show",
+ "showed",
+ "showing",
+ "shown",
+ "showns",
+ "shows",
+ "si",
+ "side",
+ "sides",
+ "significant",
+ "significantly",
+ "similar",
+ "similarly",
+ "since",
+ "sincere",
+ "site",
+ "six",
+ "sixty",
+ "sj",
+ "sk",
+ "sl",
+ "slightly",
+ "sm",
+ "small",
+ "smaller",
+ "smallest",
+ "sn",
+ "so",
+ "some",
+ "somebody",
+ "someday",
+ "somehow",
+ "someone",
+ "somethan",
+ "something",
+ "sometime",
+ "sometimes",
+ "somewhat",
+ "somewhere",
+ "soon",
+ "sorry",
+ "specifically",
+ "specified",
+ "specify",
+ "specifying",
+ "sr",
+ "st",
+ "state",
+ "states",
+ "still",
+ "stop",
+ "strongly",
+ "su",
+ "sub",
+ "substantially",
+ "successfully",
+ "such",
+ "sufficiently",
+ "suggest",
+ "sup",
+ "sure",
+ "sv",
+ "sy",
+ "system",
+ "sz",
+ "t",
+ "t's",
+ "take",
+ "taken",
+ "taking",
+ "tc",
+ "td",
+ "tell",
+ "ten",
+ "tends",
+ "test",
+ "text",
+ "tf",
+ "tg",
+ "th",
+ "than",
+ "thank",
+ "thanks",
+ "thanx",
+ "that",
+ "that'll",
+ "that's",
+ "that've",
+ "thatll",
+ "thats",
+ "thatve",
+ "the",
+ "their",
+ "theirs",
+ "them",
+ "themselves",
+ "then",
+ "thence",
+ "there",
+ "there'd",
+ "there'll",
+ "there're",
+ "there's",
+ "there've",
+ "thereafter",
+ "thereby",
+ "thered",
+ "therefore",
+ "therein",
+ "therell",
+ "thereof",
+ "therere",
+ "theres",
+ "thereto",
+ "thereupon",
+ "thereve",
+ "these",
+ "they",
+ "they'd",
+ "they'll",
+ "they're",
+ "they've",
+ "theyd",
+ "theyll",
+ "theyre",
+ "theyve",
+ "thick",
+ "thin",
+ "thing",
+ "things",
+ "think",
+ "thinks",
+ "third",
+ "thirty",
+ "this",
+ "thorough",
+ "thoroughly",
+ "those",
+ "thou",
+ "though",
+ "thoughh",
+ "thought",
+ "thoughts",
+ "thousand",
+ "three",
+ "throug",
+ "through",
+ "throughout",
+ "thru",
+ "thus",
+ "til",
+ "till",
+ "tip",
+ "tis",
+ "tj",
+ "tk",
+ "tm",
+ "tn",
+ "to",
+ "today",
+ "together",
+ "too",
+ "took",
+ "top",
+ "toward",
+ "towards",
+ "tp",
+ "tr",
+ "tried",
+ "tries",
+ "trillion",
+ "truly",
+ "try",
+ "trying",
+ "ts",
+ "tt",
+ "turn",
+ "turned",
+ "turning",
+ "turns",
+ "tv",
+ "tw",
+ "twas",
+ "twelve",
+ "twenty",
+ "twice",
+ "two",
+ "tz",
+ "u",
+ "ua",
+ "ug",
+ "uk",
+ "um",
+ "un",
+ "under",
+ "underneath",
+ "undoing",
+ "unfortunately",
+ "unless",
+ "unlike",
+ "unlikely",
+ "until",
+ "unto",
+ "up",
+ "upon",
+ "ups",
+ "upwards",
+ "us",
+ "use",
+ "used",
+ "useful",
+ "usefully",
+ "usefulness",
+ "uses",
+ "using",
+ "usually",
+ "uucp",
+ "uy",
+ "uz",
+ "v",
+ "va",
+ "value",
+ "various",
+ "vc",
+ "ve",
+ "versus",
+ "very",
+ "vg",
+ "vi",
+ "via",
+ "viz",
+ "vn",
+ "vol",
+ "vols",
+ "vs",
+ "vu",
+ "w",
+ "want",
+ "wanted",
+ "wanting",
+ "wants",
+ "was",
+ "wasn",
+ "wasn't",
+ "wasnt",
+ "way",
+ "ways",
+ "we",
+ "we'd",
+ "we'll",
+ "we're",
+ "we've",
+ "web",
+ "webpage",
+ "website",
+ "wed",
+ "welcome",
+ "well",
+ "wells",
+ "went",
+ "were",
+ "weren",
+ "weren't",
+ "werent",
+ "weve",
+ "wf",
+ "what",
+ "what'd",
+ "what'll",
+ "what's",
+ "what've",
+ "whatever",
+ "whatll",
+ "whats",
+ "whatve",
+ "when",
+ "when'd",
+ "when'll",
+ "when's",
+ "whence",
+ "whenever",
+ "where",
+ "where'd",
+ "where'll",
+ "where's",
+ "whereafter",
+ "whereas",
+ "whereby",
+ "wherein",
+ "wheres",
+ "whereupon",
+ "wherever",
+ "whether",
+ "which",
+ "whichever",
+ "while",
+ "whilst",
+ "whim",
+ "whither",
+ "who",
+ "who'd",
+ "who'll",
+ "who's",
+ "whod",
+ "whoever",
+ "whole",
+ "wholl",
+ "whom",
+ "whomever",
+ "whos",
+ "whose",
+ "why",
+ "why'd",
+ "why'll",
+ "why's",
+ "widely",
+ "width",
+ "will",
+ "willing",
+ "wish",
+ "with",
+ "within",
+ "without",
+ "won",
+ "won't",
+ "wonder",
+ "wont",
+ "words",
+ "work",
+ "worked",
+ "working",
+ "works",
+ "world",
+ "would",
+ "would've",
+ "wouldn",
+ "wouldn't",
+ "wouldnt",
+ "ws",
+ "www",
+ "x",
+ "y",
+ "ye",
+ "year",
+ "years",
+ "yes",
+ "yet",
+ "you",
+ "you'd",
+ "you'll",
+ "you're",
+ "you've",
+ "youd",
+ "youll",
+ "young",
+ "younger",
+ "youngest",
+ "your",
+ "youre",
+ "yours",
+ "yourself",
+ "yourselves",
+ "youve",
+ "yt",
+ "yu",
+ "z",
+ "za",
+ "zero",
+ "zm",
+ "zr"
+];
diff --git a/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt b/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt
new file mode 100644
index 0000000..0076d3c
--- /dev/null
+++ b/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2020 Gene Diaz
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/modules/story-summary/vector/utils/stopwords-data/SOURCES.md b/modules/story-summary/vector/utils/stopwords-data/SOURCES.md
new file mode 100644
index 0000000..1402c7e
--- /dev/null
+++ b/modules/story-summary/vector/utils/stopwords-data/SOURCES.md
@@ -0,0 +1,15 @@
+# Stopword sources for story-summary
+
+- Dataset: `stopwords-iso` (npm package, version 1.1.0)
+- Repository: https://github.com/stopwords-iso/stopwords-iso
+- License: MIT
+- Snapshot date: 2026-02-16
+- Languages used: `zh`, `ja`, `en`
+- Local snapshot files:
+ - `stopwords-iso.zh.txt`
+ - `stopwords-iso.ja.txt`
+ - `stopwords-iso.en.txt`
+
+Generation notes:
+- `modules/story-summary/vector/utils/stopwords-base.js` is generated from the snapshot files above; regenerate it whenever a snapshot changes.
+- `stopwords-patch.js` is reserved for small, hand-curated domain overrides only — do not edit the snapshots directly.
diff --git a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt
new file mode 100644
index 0000000..0efb051
--- /dev/null
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt
@@ -0,0 +1,1298 @@
+'ll
+'tis
+'twas
+'ve
+10
+39
+a
+a's
+able
+ableabout
+about
+above
+abroad
+abst
+accordance
+according
+accordingly
+across
+act
+actually
+ad
+added
+adj
+adopted
+ae
+af
+affected
+affecting
+affects
+after
+afterwards
+ag
+again
+against
+ago
+ah
+ahead
+ai
+ain't
+aint
+al
+all
+allow
+allows
+almost
+alone
+along
+alongside
+already
+also
+although
+always
+am
+amid
+amidst
+among
+amongst
+amoungst
+amount
+an
+and
+announce
+another
+any
+anybody
+anyhow
+anymore
+anyone
+anything
+anyway
+anyways
+anywhere
+ao
+apart
+apparently
+appear
+appreciate
+appropriate
+approximately
+aq
+ar
+are
+area
+areas
+aren
+aren't
+arent
+arise
+around
+arpa
+as
+aside
+ask
+asked
+asking
+asks
+associated
+at
+au
+auth
+available
+aw
+away
+awfully
+az
+b
+ba
+back
+backed
+backing
+backs
+backward
+backwards
+bb
+bd
+be
+became
+because
+become
+becomes
+becoming
+been
+before
+beforehand
+began
+begin
+beginning
+beginnings
+begins
+behind
+being
+beings
+believe
+below
+beside
+besides
+best
+better
+between
+beyond
+bf
+bg
+bh
+bi
+big
+bill
+billion
+biol
+bj
+bm
+bn
+bo
+both
+bottom
+br
+brief
+briefly
+bs
+bt
+but
+buy
+bv
+bw
+by
+bz
+c
+c'mon
+c's
+ca
+call
+came
+can
+can't
+cannot
+cant
+caption
+case
+cases
+cause
+causes
+cc
+cd
+certain
+certainly
+cf
+cg
+ch
+changes
+ci
+ck
+cl
+clear
+clearly
+click
+cm
+cmon
+cn
+co
+co.
+com
+come
+comes
+computer
+con
+concerning
+consequently
+consider
+considering
+contain
+containing
+contains
+copy
+corresponding
+could
+could've
+couldn
+couldn't
+couldnt
+course
+cr
+cry
+cs
+cu
+currently
+cv
+cx
+cy
+cz
+d
+dare
+daren't
+darent
+date
+de
+dear
+definitely
+describe
+described
+despite
+detail
+did
+didn
+didn't
+didnt
+differ
+different
+differently
+directly
+dj
+dk
+dm
+do
+does
+doesn
+doesn't
+doesnt
+doing
+don
+don't
+done
+dont
+doubtful
+down
+downed
+downing
+downs
+downwards
+due
+during
+dz
+e
+each
+early
+ec
+ed
+edu
+ee
+effect
+eg
+eh
+eight
+eighty
+either
+eleven
+else
+elsewhere
+empty
+end
+ended
+ending
+ends
+enough
+entirely
+er
+es
+especially
+et
+et-al
+etc
+even
+evenly
+ever
+evermore
+every
+everybody
+everyone
+everything
+everywhere
+ex
+exactly
+example
+except
+f
+face
+faces
+fact
+facts
+fairly
+far
+farther
+felt
+few
+fewer
+ff
+fi
+fifteen
+fifth
+fifty
+fify
+fill
+find
+finds
+fire
+first
+five
+fix
+fj
+fk
+fm
+fo
+followed
+following
+follows
+for
+forever
+former
+formerly
+forth
+forty
+forward
+found
+four
+fr
+free
+from
+front
+full
+fully
+further
+furthered
+furthering
+furthermore
+furthers
+fx
+g
+ga
+gave
+gb
+gd
+ge
+general
+generally
+get
+gets
+getting
+gf
+gg
+gh
+gi
+give
+given
+gives
+giving
+gl
+gm
+gmt
+gn
+go
+goes
+going
+gone
+good
+goods
+got
+gotten
+gov
+gp
+gq
+gr
+great
+greater
+greatest
+greetings
+group
+grouped
+grouping
+groups
+gs
+gt
+gu
+gw
+gy
+h
+had
+hadn't
+hadnt
+half
+happens
+hardly
+has
+hasn
+hasn't
+hasnt
+have
+haven
+haven't
+havent
+having
+he
+he'd
+he'll
+he's
+hed
+hell
+hello
+help
+hence
+her
+here
+here's
+hereafter
+hereby
+herein
+heres
+hereupon
+hers
+herself
+herse”
+hes
+hi
+hid
+high
+higher
+highest
+him
+himself
+himse”
+his
+hither
+hk
+hm
+hn
+home
+homepage
+hopefully
+how
+how'd
+how'll
+how's
+howbeit
+however
+hr
+ht
+htm
+html
+http
+hu
+hundred
+i
+i'd
+i'll
+i'm
+i've
+i.e.
+id
+ie
+if
+ignored
+ii
+il
+ill
+im
+immediate
+immediately
+importance
+important
+in
+inasmuch
+inc
+inc.
+indeed
+index
+indicate
+indicated
+indicates
+information
+inner
+inside
+insofar
+instead
+int
+interest
+interested
+interesting
+interests
+into
+invention
+inward
+io
+iq
+ir
+is
+isn
+isn't
+isnt
+it
+it'd
+it'll
+it's
+itd
+itll
+its
+itself
+itse”
+ive
+j
+je
+jm
+jo
+join
+jp
+just
+k
+ke
+keep
+keeps
+kept
+keys
+kg
+kh
+ki
+kind
+km
+kn
+knew
+know
+known
+knows
+kp
+kr
+kw
+ky
+kz
+l
+la
+large
+largely
+last
+lately
+later
+latest
+latter
+latterly
+lb
+lc
+least
+length
+less
+lest
+let
+let's
+lets
+li
+like
+liked
+likely
+likewise
+line
+little
+lk
+ll
+long
+longer
+longest
+look
+looking
+looks
+low
+lower
+lr
+ls
+lt
+ltd
+lu
+lv
+ly
+m
+ma
+made
+mainly
+make
+makes
+making
+man
+many
+may
+maybe
+mayn't
+maynt
+mc
+md
+me
+mean
+means
+meantime
+meanwhile
+member
+members
+men
+merely
+mg
+mh
+microsoft
+might
+might've
+mightn't
+mightnt
+mil
+mill
+million
+mine
+minus
+miss
+mk
+ml
+mm
+mn
+mo
+more
+moreover
+most
+mostly
+move
+mp
+mq
+mr
+mrs
+ms
+msie
+mt
+mu
+much
+mug
+must
+must've
+mustn't
+mustnt
+mv
+mw
+mx
+my
+myself
+myse”
+mz
+n
+na
+name
+namely
+nay
+nc
+nd
+ne
+near
+nearly
+necessarily
+necessary
+need
+needed
+needing
+needn't
+neednt
+needs
+neither
+net
+netscape
+never
+neverf
+neverless
+nevertheless
+new
+newer
+newest
+next
+nf
+ng
+ni
+nine
+ninety
+nl
+no
+no-one
+nobody
+non
+none
+nonetheless
+noone
+nor
+normally
+nos
+not
+noted
+nothing
+notwithstanding
+novel
+now
+nowhere
+np
+nr
+nu
+null
+number
+numbers
+nz
+o
+obtain
+obtained
+obviously
+of
+off
+often
+oh
+ok
+okay
+old
+older
+oldest
+om
+omitted
+on
+once
+one
+one's
+ones
+only
+onto
+open
+opened
+opening
+opens
+opposite
+or
+ord
+order
+ordered
+ordering
+orders
+org
+other
+others
+otherwise
+ought
+oughtn't
+oughtnt
+our
+ours
+ourselves
+out
+outside
+over
+overall
+owing
+own
+p
+pa
+page
+pages
+part
+parted
+particular
+particularly
+parting
+parts
+past
+pe
+per
+perhaps
+pf
+pg
+ph
+pk
+pl
+place
+placed
+places
+please
+plus
+pm
+pmid
+pn
+point
+pointed
+pointing
+points
+poorly
+possible
+possibly
+potentially
+pp
+pr
+predominantly
+present
+presented
+presenting
+presents
+presumably
+previously
+primarily
+probably
+problem
+problems
+promptly
+proud
+provided
+provides
+pt
+put
+puts
+pw
+py
+q
+qa
+que
+quickly
+quite
+qv
+r
+ran
+rather
+rd
+re
+readily
+really
+reasonably
+recent
+recently
+ref
+refs
+regarding
+regardless
+regards
+related
+relatively
+research
+reserved
+respectively
+resulted
+resulting
+results
+right
+ring
+ro
+room
+rooms
+round
+ru
+run
+rw
+s
+sa
+said
+same
+saw
+say
+saying
+says
+sb
+sc
+sd
+se
+sec
+second
+secondly
+seconds
+section
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+sees
+self
+selves
+sensible
+sent
+serious
+seriously
+seven
+seventy
+several
+sg
+sh
+shall
+shan't
+shant
+she
+she'd
+she'll
+she's
+shed
+shell
+shes
+should
+should've
+shouldn
+shouldn't
+shouldnt
+show
+showed
+showing
+shown
+showns
+shows
+si
+side
+sides
+significant
+significantly
+similar
+similarly
+since
+sincere
+site
+six
+sixty
+sj
+sk
+sl
+slightly
+sm
+small
+smaller
+smallest
+sn
+so
+some
+somebody
+someday
+somehow
+someone
+somethan
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+sorry
+specifically
+specified
+specify
+specifying
+sr
+st
+state
+states
+still
+stop
+strongly
+su
+sub
+substantially
+successfully
+such
+sufficiently
+suggest
+sup
+sure
+sv
+sy
+system
+sz
+t
+t's
+take
+taken
+taking
+tc
+td
+tell
+ten
+tends
+test
+text
+tf
+tg
+th
+than
+thank
+thanks
+thanx
+that
+that'll
+that's
+that've
+thatll
+thats
+thatve
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+there'd
+there'll
+there're
+there's
+there've
+thereafter
+thereby
+thered
+therefore
+therein
+therell
+thereof
+therere
+theres
+thereto
+thereupon
+thereve
+these
+they
+they'd
+they'll
+they're
+they've
+theyd
+theyll
+theyre
+theyve
+thick
+thin
+thing
+things
+think
+thinks
+third
+thirty
+this
+thorough
+thoroughly
+those
+thou
+though
+thoughh
+thought
+thoughts
+thousand
+three
+throug
+through
+throughout
+thru
+thus
+til
+till
+tip
+tis
+tj
+tk
+tm
+tn
+to
+today
+together
+too
+took
+top
+toward
+towards
+tp
+tr
+tried
+tries
+trillion
+truly
+try
+trying
+ts
+tt
+turn
+turned
+turning
+turns
+tv
+tw
+twas
+twelve
+twenty
+twice
+two
+tz
+u
+ua
+ug
+uk
+um
+un
+under
+underneath
+undoing
+unfortunately
+unless
+unlike
+unlikely
+until
+unto
+up
+upon
+ups
+upwards
+us
+use
+used
+useful
+usefully
+usefulness
+uses
+using
+usually
+uucp
+uy
+uz
+v
+va
+value
+various
+vc
+ve
+versus
+very
+vg
+vi
+via
+viz
+vn
+vol
+vols
+vs
+vu
+w
+want
+wanted
+wanting
+wants
+was
+wasn
+wasn't
+wasnt
+way
+ways
+we
+we'd
+we'll
+we're
+we've
+web
+webpage
+website
+wed
+welcome
+well
+wells
+went
+were
+weren
+weren't
+werent
+weve
+wf
+what
+what'd
+what'll
+what's
+what've
+whatever
+whatll
+whats
+whatve
+when
+when'd
+when'll
+when's
+whence
+whenever
+where
+where'd
+where'll
+where's
+whereafter
+whereas
+whereby
+wherein
+wheres
+whereupon
+wherever
+whether
+which
+whichever
+while
+whilst
+whim
+whither
+who
+who'd
+who'll
+who's
+whod
+whoever
+whole
+wholl
+whom
+whomever
+whos
+whose
+why
+why'd
+why'll
+why's
+widely
+width
+will
+willing
+wish
+with
+within
+without
+won
+won't
+wonder
+wont
+words
+work
+worked
+working
+works
+world
+would
+would've
+wouldn
+wouldn't
+wouldnt
+ws
+www
+x
+y
+ye
+year
+years
+yes
+yet
+you
+you'd
+you'll
+you're
+you've
+youd
+youll
+young
+younger
+youngest
+your
+youre
+yours
+yourself
+yourselves
+youve
+yt
+yu
+z
+za
+zero
+zm
+zr
diff --git a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt
new file mode 100644
index 0000000..0e74864
--- /dev/null
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt
@@ -0,0 +1,134 @@
+あそこ
+あっ
+あの
+あのかた
+あの人
+あり
+あります
+ある
+あれ
+い
+いう
+います
+いる
+う
+うち
+え
+お
+および
+おり
+おります
+か
+かつて
+から
+が
+き
+ここ
+こちら
+こと
+この
+これ
+これら
+さ
+さらに
+し
+しかし
+する
+ず
+せ
+せる
+そこ
+そして
+その
+その他
+その後
+それ
+それぞれ
+それで
+た
+ただし
+たち
+ため
+たり
+だ
+だっ
+だれ
+つ
+て
+で
+でき
+できる
+です
+では
+でも
+と
+という
+といった
+とき
+ところ
+として
+とともに
+とも
+と共に
+どこ
+どの
+な
+ない
+なお
+なかっ
+ながら
+なく
+なっ
+など
+なに
+なら
+なり
+なる
+なん
+に
+において
+における
+について
+にて
+によって
+により
+による
+に対して
+に対する
+に関する
+の
+ので
+のみ
+は
+ば
+へ
+ほか
+ほとんど
+ほど
+ます
+また
+または
+まで
+も
+もの
+ものの
+や
+よう
+より
+ら
+られ
+られる
+れ
+れる
+を
+ん
+何
+及び
+彼
+彼女
+我々
+特に
+私
+私達
+貴方
+貴方方
diff --git a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt
new file mode 100644
index 0000000..15dea1c
--- /dev/null
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt
@@ -0,0 +1,794 @@
+、
+。
+〈
+〉
+《
+》
+一
+一个
+一些
+一何
+一切
+一则
+一方面
+一旦
+一来
+一样
+一种
+一般
+一转眼
+七
+万一
+三
+上
+上下
+下
+不
+不仅
+不但
+不光
+不单
+不只
+不外乎
+不如
+不妨
+不尽
+不尽然
+不得
+不怕
+不惟
+不成
+不拘
+不料
+不是
+不比
+不然
+不特
+不独
+不管
+不至于
+不若
+不论
+不过
+不问
+与
+与其
+与其说
+与否
+与此同时
+且
+且不说
+且说
+两者
+个
+个别
+中
+临
+为
+为了
+为什么
+为何
+为止
+为此
+为着
+乃
+乃至
+乃至于
+么
+之
+之一
+之所以
+之类
+乌乎
+乎
+乘
+九
+也
+也好
+也罢
+了
+二
+二来
+于
+于是
+于是乎
+云云
+云尔
+五
+些
+亦
+人
+人们
+人家
+什
+什么
+什么样
+今
+介于
+仍
+仍旧
+从
+从此
+从而
+他
+他人
+他们
+他们们
+以
+以上
+以为
+以便
+以免
+以及
+以故
+以期
+以来
+以至
+以至于
+以致
+们
+任
+任何
+任凭
+会
+似的
+但
+但凡
+但是
+何
+何以
+何况
+何处
+何时
+余外
+作为
+你
+你们
+使
+使得
+例如
+依
+依据
+依照
+便于
+俺
+俺们
+倘
+倘使
+倘或
+倘然
+倘若
+借
+借傥然
+假使
+假如
+假若
+做
+像
+儿
+先不先
+光
+光是
+全体
+全部
+八
+六
+兮
+共
+关于
+关于具体地说
+其
+其一
+其中
+其二
+其他
+其余
+其它
+其次
+具体地说
+具体说来
+兼之
+内
+再
+再其次
+再则
+再有
+再者
+再者说
+再说
+冒
+冲
+况且
+几
+几时
+凡
+凡是
+凭
+凭借
+出于
+出来
+分
+分别
+则
+则甚
+别
+别人
+别处
+别是
+别的
+别管
+别说
+到
+前后
+前此
+前者
+加之
+加以
+区
+即
+即令
+即使
+即便
+即如
+即或
+即若
+却
+去
+又
+又及
+及
+及其
+及至
+反之
+反而
+反过来
+反过来说
+受到
+另
+另一方面
+另外
+另悉
+只
+只当
+只怕
+只是
+只有
+只消
+只要
+只限
+叫
+叮咚
+可
+可以
+可是
+可见
+各
+各个
+各位
+各种
+各自
+同
+同时
+后
+后者
+向
+向使
+向着
+吓
+吗
+否则
+吧
+吧哒
+含
+吱
+呀
+呃
+呕
+呗
+呜
+呜呼
+呢
+呵
+呵呵
+呸
+呼哧
+咋
+和
+咚
+咦
+咧
+咱
+咱们
+咳
+哇
+哈
+哈哈
+哉
+哎
+哎呀
+哎哟
+哗
+哟
+哦
+哩
+哪
+哪个
+哪些
+哪儿
+哪天
+哪年
+哪怕
+哪样
+哪边
+哪里
+哼
+哼唷
+唉
+唯有
+啊
+啐
+啥
+啦
+啪达
+啷当
+喂
+喏
+喔唷
+喽
+嗡
+嗡嗡
+嗬
+嗯
+嗳
+嘎
+嘎登
+嘘
+嘛
+嘻
+嘿
+嘿嘿
+四
+因
+因为
+因了
+因此
+因着
+因而
+固然
+在
+在下
+在于
+地
+基于
+处在
+多
+多么
+多少
+大
+大家
+她
+她们
+好
+如
+如上
+如上所述
+如下
+如何
+如其
+如同
+如是
+如果
+如此
+如若
+始而
+孰料
+孰知
+宁
+宁可
+宁愿
+宁肯
+它
+它们
+对
+对于
+对待
+对方
+对比
+将
+小
+尔
+尔后
+尔尔
+尚且
+就
+就是
+就是了
+就是说
+就算
+就要
+尽
+尽管
+尽管如此
+岂但
+己
+已
+已矣
+巴
+巴巴
+年
+并
+并且
+庶乎
+庶几
+开外
+开始
+归
+归齐
+当
+当地
+当然
+当着
+彼
+彼时
+彼此
+往
+待
+很
+得
+得了
+怎
+怎么
+怎么办
+怎么样
+怎奈
+怎样
+总之
+总的来看
+总的来说
+总的说来
+总而言之
+恰恰相反
+您
+惟其
+慢说
+我
+我们
+或
+或则
+或是
+或曰
+或者
+截至
+所
+所以
+所在
+所幸
+所有
+才
+才能
+打
+打从
+把
+抑或
+拿
+按
+按照
+换句话说
+换言之
+据
+据此
+接着
+故
+故此
+故而
+旁人
+无
+无宁
+无论
+既
+既往
+既是
+既然
+日
+时
+时候
+是
+是以
+是的
+更
+曾
+替
+替代
+最
+月
+有
+有些
+有关
+有及
+有时
+有的
+望
+朝
+朝着
+本
+本人
+本地
+本着
+本身
+来
+来着
+来自
+来说
+极了
+果然
+果真
+某
+某个
+某些
+某某
+根据
+欤
+正值
+正如
+正巧
+正是
+此
+此地
+此处
+此外
+此时
+此次
+此间
+毋宁
+每
+每当
+比
+比及
+比如
+比方
+没奈何
+沿
+沿着
+漫说
+点
+焉
+然则
+然后
+然而
+照
+照着
+犹且
+犹自
+甚且
+甚么
+甚或
+甚而
+甚至
+甚至于
+用
+用来
+由
+由于
+由是
+由此
+由此可见
+的
+的确
+的话
+直到
+相对而言
+省得
+看
+眨眼
+着
+着呢
+矣
+矣乎
+矣哉
+离
+秒
+称
+竟而
+第
+等
+等到
+等等
+简言之
+管
+类如
+紧接着
+纵
+纵令
+纵使
+纵然
+经
+经过
+结果
+给
+继之
+继后
+继而
+综上所述
+罢了
+者
+而
+而且
+而况
+而后
+而外
+而已
+而是
+而言
+能
+能否
+腾
+自
+自个儿
+自从
+自各儿
+自后
+自家
+自己
+自打
+自身
+至
+至于
+至今
+至若
+致
+般的
+若
+若夫
+若是
+若果
+若非
+莫不然
+莫如
+莫若
+虽
+虽则
+虽然
+虽说
+被
+要
+要不
+要不是
+要不然
+要么
+要是
+譬喻
+譬如
+让
+许多
+论
+设使
+设或
+设若
+诚如
+诚然
+该
+说
+说来
+请
+诸
+诸位
+诸如
+谁
+谁人
+谁料
+谁知
+贼死
+赖以
+赶
+起
+起见
+趁
+趁着
+越是
+距
+跟
+较
+较之
+边
+过
+还
+还是
+还有
+还要
+这
+这一来
+这个
+这么
+这么些
+这么样
+这么点儿
+这些
+这会儿
+这儿
+这就是说
+这时
+这样
+这次
+这般
+这边
+这里
+进而
+连
+连同
+逐步
+通过
+遵循
+遵照
+那
+那个
+那么
+那么些
+那么样
+那些
+那会儿
+那儿
+那时
+那样
+那般
+那边
+那里
+都
+鄙人
+鉴于
+针对
+阿
+除
+除了
+除外
+除开
+除此之外
+除非
+随
+随后
+随时
+随着
+难道说
+零
+非
+非但
+非徒
+非特
+非独
+靠
+顺
+顺着
+首先
+︿
+!
+#
+$
+%
+&
+(
+)
+*
++
+,
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+<
+>
+?
+@
+[
+]
+{
+|
+}
+~
+¥
diff --git a/modules/story-summary/vector/utils/stopwords-patch.js b/modules/story-summary/vector/utils/stopwords-patch.js
new file mode 100644
index 0000000..51f7614
--- /dev/null
+++ b/modules/story-summary/vector/utils/stopwords-patch.js
@@ -0,0 +1,9 @@
+// Small domain-level tuning surface.
+// Keep this file tiny: add/remove only words that are repeatedly noisy in real logs.
+
+// Extra stopwords on top of BASE_STOP_WORDS.
+export const DOMAIN_STOP_WORDS = [];
+
+// High-value words that must never be filtered as stopwords.
+// Default to empty for plugin-wide deployment; entity names are already protected dynamically.
+export const KEEP_WORDS = [];
diff --git a/modules/story-summary/vector/utils/tokenizer.js b/modules/story-summary/vector/utils/tokenizer.js
index a39e4e9..37ab59c 100644
--- a/modules/story-summary/vector/utils/tokenizer.js
+++ b/modules/story-summary/vector/utils/tokenizer.js
@@ -18,6 +18,8 @@
import { extensionFolderPath } from '../../../../core/constants.js';
import { xbLog } from '../../../../core/debug-core.js';
+import { BASE_STOP_WORDS } from './stopwords-base.js';
+import { DOMAIN_STOP_WORDS, KEEP_WORDS } from './stopwords-patch.js';
const MODULE_ID = 'tokenizer';
@@ -61,44 +63,30 @@ let entityList = [];
/** @type {Set} 已注入结巴的实体(避免重复 add_word) */
let injectedEntities = new Set();
+let entityKeepSet = new Set();
// ═══════════════════════════════════════════════════════════════════════════
// 停用词
// ═══════════════════════════════════════════════════════════════════════════
-const STOP_WORDS = new Set([
- // 中文高频虚词
- '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
- '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
- '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
- '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
- '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
- '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
- '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
- '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
- '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
- '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
- '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
- // 日语常见虚词(≥2字,匹配 TinySegmenter 产出粒度)
- 'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある',
- 'なる', 'れる', 'られ', 'られる',
- 'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
- 'これ', 'それ', 'あれ', 'どれ',
- 'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
- 'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
- // 英文常见停用词
- 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
- 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
- 'would', 'could', 'should', 'may', 'might', 'can', 'shall',
- 'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
- 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
- 'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
- 'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
- 'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
- 'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
- 'both', 'few', 'more', 'most', 'other', 'some', 'such',
- 'only', 'own', 'same', 'just', 'very', 'also', 'about',
-]);
+const STATIC_KEEP_WORDS = new Set((KEEP_WORDS || [])
+ .map(w => String(w || '').trim().toLowerCase())
+ .filter(Boolean));
+
+// Standard source only: stopwords-iso snapshot + small domain patch.
+const EFFECTIVE_STOP_WORDS = new Set(
+ [...BASE_STOP_WORDS, ...DOMAIN_STOP_WORDS]
+ .map(w => String(w || '').trim().toLowerCase())
+ .filter(Boolean),
+);
+
+function shouldKeepTokenByWhitelist(token) {
+ const t = String(token || '').trim().toLowerCase();
+ if (!t) return false;
+ if (STATIC_KEEP_WORDS.has(t)) return true;
+ if (entityKeepSet.has(t)) return true;
+ return false;
+}
// ═══════════════════════════════════════════════════════════════════════════
// Unicode 分类
@@ -571,6 +559,7 @@ export function getState() {
export function injectEntities(lexicon, displayMap) {
if (!lexicon?.size) {
entityList = [];
+ entityKeepSet = new Set();
return;
}
@@ -586,6 +575,7 @@ export function injectEntities(lexicon, displayMap) {
// 按长度降序(最长匹配优先)
entities.sort((a, b) => b.length - a.length);
entityList = entities;
+ entityKeepSet = new Set(entities.map(e => String(e || '').trim().toLowerCase()).filter(Boolean));
// 如果结巴已就绪,注入自定义词
if (wasmState === WasmState.READY && jiebaAddWord) {
@@ -656,7 +646,7 @@ export function tokenize(text) {
if (!cleaned) continue;
if (cleaned.length < 2) continue;
- if (STOP_WORDS.has(cleaned)) continue;
+ if (EFFECTIVE_STOP_WORDS.has(cleaned) && !shouldKeepTokenByWhitelist(cleaned)) continue;
if (seen.has(cleaned)) continue;
// 过滤纯标点/特殊字符
@@ -728,7 +718,7 @@ export function tokenizeForIndex(text) {
.map(t => t.trim().toLowerCase())
.filter(t => {
if (!t || t.length < 2) return false;
- if (STOP_WORDS.has(t)) return false;
+ if (EFFECTIVE_STOP_WORDS.has(t) && !shouldKeepTokenByWhitelist(t)) return false;
if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) return false;
return true;
});
@@ -744,6 +734,7 @@ export function tokenizeForIndex(text) {
*/
export function reset() {
entityList = [];
+ entityKeepSet = new Set();
injectedEntities.clear();
// 不重置 WASM 状态(避免重复加载)
}