improve lexical warmup and standardize stopword pipeline

2026-02-17 14:49:47 +08:00
parent 246eb7a7e2
commit 94eceaed96
14 changed files with 4840 additions and 330 deletions
--- a/modules/story-summary/story-summary.js
+++ b/modules/story-summary/story-summary.js
@@ -1551,6 +1551,7 @@ async function handleMessageReceived(scheduledChatId) {
    // Refresh entity lexicon after new message (new roles may appear)
    refreshEntityLexiconAndWarmup();
    scheduleLexicalWarmup(100);
    // Auto backfill missing L0 (delay to avoid contention with current floor)
    setTimeout(() => maybeAutoExtractL0(), 2000);
@@ -1559,6 +1560,7 @@ async function handleMessageReceived(scheduledChatId) {
 /**
  * Handler for a sent message.
  *
  * Does nothing when `isChatStale(scheduledChatId)` reports the chat as stale.
  * Otherwise it initializes the buttons, schedules a lexical warmup, and
  * queues a "before_user" auto-summary check one second later.
  *
  * @param {*} scheduledChatId - Chat id captured when the handler was scheduled.
  */
 function handleMessageSent(scheduledChatId) {
    const stale = isChatStale(scheduledChatId);
    if (!stale) {
        initButtonsForAll();
        // NOTE(review): the argument is presumably a delay in ms (0 = as soon as possible) — confirm against scheduleLexicalWarmup.
        scheduleLexicalWarmup(0);
        setTimeout(() => {
            maybeAutoRunSummary("before_user");
        }, 1000);
    }
 }
--- a/modules/story-summary/vector/llm/llm-service.js
+++ b/modules/story-summary/vector/llm/llm-service.js
@@ -2,7 +2,6 @@
 // vector/llm/llm-service.js - 修复 prefill 传递方式
 // ═══════════════════════════════════════════════════════════════════════════
 import { xbLog } from '../../../../core/debug-core.js';
 import { getVectorConfig } from '../../data/config.js';
 import { getApiKey } from './siliconflow.js';
 const MODULE_ID = 'vector-llm-service';
--- a/modules/story-summary/vector/retrieval/lexical-index.js
+++ b/modules/story-summary/vector/retrieval/lexical-index.js
@@ -1,16 +1,3 @@
 // ═══════════════════════════════════════════════════════════════════════════
 // lexical-index.js - MiniSearch 词法检索索引
 //
 // 职责：
 // 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
 // 2. 提供词法检索接口（专名精确匹配兜底）
 // 3. 惰性构建 + 异步预热 + 缓存失效机制
 //
 // 索引存储：纯内存（不持久化）
 // 分词器：统一使用 tokenizer.js（结巴 + 实体保护 + 降级）
 // 重建时机：CHAT_CHANGED / L0提取完成 / L2总结完成
 // ═══════════════════════════════════════════════════════════════════════════
 import MiniSearch from '../../../../libs/minisearch.mjs';
 import { getContext } from '../../../../../../../extensions.js';
 import { getSummaryStore } from '../../data/store.js';
@@ -20,76 +7,166 @@ import { tokenizeForIndex } from '../utils/tokenizer.js';
 const MODULE_ID = 'lexical-index';
-// ─────────────────────────────────────────────────────────────────────────
+// In-memory index cache
 // 缓存
 // ─────────────────────────────────────────────────────────────────────────
 /** @type {MiniSearch|null} */
 let cachedIndex = null;
 /** @type {string|null} */
 let cachedChatId = null;
 /** @type {string|null} Data fingerprint: "<doc count>:<hex FNV-1a hash over the sorted docs>" (see computeFingerprintFromDocs) */
 let cachedFingerprint = null;
 /** @type {boolean} 是否正在构建 */
 let building = false;
 /** @type {Promise<MiniSearch|null>|null} 当前构建 Promise（防重入） */
 let buildPromise = null;
-/** @type {Map<number, string[]>} floor → 该楼层的 doc IDs（仅 L1 chunks） */
+
 // floor -> chunk doc ids (L1 only)
 let floorDocIds = new Map();
-// ─────────────────────────────────────────────────────────────────────────
+// IDF stats over lexical docs (L1 chunks + L2 events)
-// 工具函数
+let termDfMap = new Map();
-// ─────────────────────────────────────────────────────────────────────────
+let docTokenSets = new Map(); // docId -> Set<token>
 let lexicalDocCount = 0;
 const IDF_MIN = 1.0;
 const IDF_MAX = 4.0;
 const BUILD_BATCH_SIZE = 500;
 /**
 * Strip the trailing floor marker from an event summary.
 *
 * A marker looks like "(#12)" or "(#12-14)" and is only removed when it sits
 * at the very end of the text (surrounding whitespace is removed with it).
 * Falsy input is treated as an empty string.
 *
 * @param {string} summary
 * @returns {string} The summary without the trailing marker, trimmed.
 */
 function cleanSummary(summary) {
    const raw = String(summary || '');
    const withoutFloorTag = raw.replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '');
    return withoutFloorTag.trim();
 }
-/**
+function fnv1a32(input, seed = 0x811C9DC5) {
- * 计算缓存指纹
+    let hash = seed >>> 0;
- * @param {number} chunkCount
+    const text = String(input || '');
- * @param {number} eventCount
+    for (let i = 0; i < text.length; i++) {
- * @returns {string}
+        hash ^= text.charCodeAt(i);
- */
+        hash = Math.imul(hash, 0x01000193) >>> 0;
-function computeFingerprint(chunkCount, eventCount) {
+    }
-    return `${chunkCount}:${eventCount}`;
+    return hash >>> 0;
 }
 /**
  * Comparator that orders documents by the composite key "<type>:<id>".
  * Missing documents or missing fields contribute an empty string to the key.
  *
  * @param {{type?: string, id?: string}} a
  * @param {{type?: string, id?: string}} b
  * @returns {number} -1, 0 or 1 (string comparison of the composite keys).
  */
 function compareDocKeys(a, b) {
    const keyOf = (doc) => `${doc?.type || ''}:${doc?.id || ''}`;
    const left = keyOf(a);
    const right = keyOf(b);
    if (left === right) return 0;
    return left < right ? -1 : 1;
 }
 /**
  * Compute a content fingerprint for a list of lexical documents.
  *
  * A copy of the list is sorted with compareDocKeys so the result does not
  * depend on input order. Each doc's type, id, floor and text (joined by
  * U+001F, terminated by U+001E) is then folded into a chained fnv1a32 hash.
  * Non-array input is treated as an empty list.
  *
  * @param {object[]} docs
  * @returns {string} "<doc count>:<hash as lowercase hex>"
  */
 function computeFingerprintFromDocs(docs) {
    const ordered = Array.isArray(docs) ? Array.from(docs).sort(compareDocKeys) : [];
    const digest = ordered.reduce((acc, doc) => {
        const record = `${doc?.type || ''}\u001F${doc?.id || ''}\u001F${doc?.floor ?? ''}\u001F${doc?.text || ''}\u001E`;
        return fnv1a32(record, acc);
    }, 0x811C9DC5);
    return `${ordered.length}:${(digest >>> 0).toString(16)}`;
 }
 /**
 * Yield control so pending work can run before a long task continues
 * (used to avoid blocking the UI for long stretches).
 *
 * @returns {Promise<void>} Resolves from a zero-delay timer callback.
 */
 function yieldToMain() {
    return new Promise((resolve) => {
        setTimeout(resolve, 0);
    });
 }
-// ─────────────────────────────────────────────────────────────────────────
+function clamp(v, min, max) {
-// 文档收集
+    return Math.max(min, Math.min(max, v));
-// ─────────────────────────────────────────────────────────────────────────
+}
 /**
  * Normalize a term for lookups: stringify, trim, lowercase.
  * Any falsy input (null, undefined, 0, '') normalizes to the empty string.
  *
  * @param {*} term
  * @returns {string}
  */
 function normalizeTerm(term) {
    const raw = term ? String(term) : '';
    const trimmed = raw.trim();
    return trimmed.toLowerCase();
 }
 /**
  * Smoothed inverse document frequency: ln((docCount + 1) / (df + 1)) + 1,
  * clamped to [IDF_MIN, IDF_MAX].
  *
  * @param {number} df - Number of documents containing the term (falsy counts as 0).
  * @param {number} docCount - Total number of documents.
  * @returns {number} 1 when docCount is falsy or not positive; otherwise the clamped IDF.
  */
 function computeIdfFromDf(df, docCount) {
    const hasDocs = Boolean(docCount) && !(docCount <= 0);
    if (!hasDocs) return 1;
    const numerator = docCount + 1;
    const denominator = (df || 0) + 1;
    const smoothed = Math.log(numerator / denominator) + 1;
    return clamp(smoothed, IDF_MIN, IDF_MAX);
 }
 /**
  * Look up the IDF weight of a term against the current module-level stats
  * (termDfMap / lexicalDocCount).
  *
  * @param {string} term - Raw term; it is normalized before lookup.
  * @returns {number} 1 when the term is empty or no documents are tracked yet.
  */
 function computeIdf(term) {
    const normalized = normalizeTerm(term);
    const statsMissing = lexicalDocCount <= 0;
    if (statsMissing || !normalized) return 1;
    const documentFrequency = termDfMap.get(normalized) || 0;
    return computeIdfFromDf(documentFrequency, lexicalDocCount);
 }
 /**
  * Tokenize text with tokenizeForIndex and return the distinct, normalized,
  * non-empty tokens.
  *
  * @param {string} text - Falsy input is tokenized as an empty string.
  * @returns {Set<string>}
  */
 function extractUniqueTokens(text) {
    const unique = new Set();
    for (const rawToken of tokenizeForIndex(String(text || ''))) {
        const token = normalizeTerm(rawToken);
        if (token) unique.add(token);
    }
    return unique;
 }
 /**
  * Reset all IDF bookkeeping: the term -> document-frequency map, the
  * docId -> token-set map, and the tracked document count.
  */
 function clearIdfState() {
    [termDfMap, docTokenSets, lexicalDocCount] = [new Map(), new Map(), 0];
 }
 /**
  * Remove one document's contribution from the IDF stats.
  *
  * Each token recorded for the document has its document frequency
  * decremented (the entry is deleted once it would reach zero), then the
  * document's token set is forgotten and the document count is lowered,
  * never below zero. Unknown or empty ids are ignored.
  *
  * @param {string} docId
  */
 function removeDocumentIdf(docId) {
    const key = String(docId || '');
    const tokenSet = key ? docTokenSets.get(key) : undefined;
    if (!tokenSet) return;

    for (const token of tokenSet) {
        const remaining = (termDfMap.get(token) || 0) - 1;
        if (remaining > 0) {
            termDfMap.set(token, remaining);
        } else {
            termDfMap.delete(token);
        }
    }

    docTokenSets.delete(key);
    lexicalDocCount = Math.max(0, lexicalDocCount - 1);
 }
 /**
  * Register (or replace) a document in the IDF stats.
  *
  * Any previous entry for the same id is removed first, so calling this
  * again for an existing id does not double-count its tokens.
  *
  * @param {string} docId - Ignored when empty.
  * @param {string} text - Document text; tokenized via extractUniqueTokens.
  */
 function addDocumentIdf(docId, text) {
    const key = String(docId || '');
    if (key === '') return;

    // Replace semantics: drop the stale token set before counting the new one.
    removeDocumentIdf(key);

    const tokenSet = extractUniqueTokens(text);
    for (const token of tokenSet) {
        const seen = termDfMap.get(token) || 0;
        termDfMap.set(token, seen + 1);
    }
    docTokenSets.set(key, tokenSet);
    lexicalDocCount += 1;
 }
 /**
  * Rebuild the IDF stats from scratch for the given documents.
  * Documents without an id or with blank text are skipped.
  *
  * @param {object[]} docs - Objects with `id` and `text`; nullish is treated as empty.
  */
 function rebuildIdfFromDocs(docs) {
    clearIdfState();
    for (const entry of docs || []) {
        const docId = String(entry?.id || '');
        const body = String(entry?.text || '');
        const indexable = docId !== '' && body.trim() !== '';
        if (indexable) addDocumentIdf(docId, body);
    }
 }
 /**
  * Convert an event into an indexable document.
  *
  * The text is the space-joined title, participants (themselves joined by
  * spaces) and the summary with its trailing floor marker removed.
  *
  * @param {object} ev - Event with `id` and optional `title`, `participants`, `summary`.
  * @returns {{id: *, type: 'event', floor: null, text: string}|null}
  *   null when the event has no id or produces no text.
  */
 function buildEventDoc(ev) {
    if (!ev?.id) return null;

    const summaryText = cleanSummary(ev.summary);
    const segments = [
        ...(ev.title ? [ev.title] : []),
        ...(ev.participants?.length ? [ev.participants.join(' ')] : []),
        ...(summaryText ? [summaryText] : []),
    ];

    const text = segments.join(' ').trim();
    return text ? { id: ev.id, type: 'event', floor: null, text } : null;
 }
 /**
 * 收集所有待索引文档
 *
 * @param {object[]} chunks - getAllChunks(chatId) 返回值
 * @param {object[]} events - store.json.events
 * @returns {object[]} 文档数组
 */
 function collectDocuments(chunks, events) {
    const docs = [];
-    // L1 chunks + 填充 floorDocIds
+    for (const chunk of chunks || []) {
    for (const chunk of (chunks || [])) {
        if (!chunk?.chunkId || !chunk.text) continue;
        const floor = chunk.floor ?? -1;
@@ -101,48 +178,19 @@ function collectDocuments(chunks, events) {
        });
        if (floor >= 0) {
-            if (!floorDocIds.has(floor)) {
+            if (!floorDocIds.has(floor)) floorDocIds.set(floor, []);
                floorDocIds.set(floor, []);
            }
            floorDocIds.get(floor).push(chunk.chunkId);
        }
    }
-    // L2 events
+    for (const ev of events || []) {
-    for (const ev of (events || [])) {
+        const doc = buildEventDoc(ev);
-        if (!ev?.id) continue;
+        if (doc) docs.push(doc);
        const parts = [];
        if (ev.title) parts.push(ev.title);
        if (ev.participants?.length) parts.push(ev.participants.join(' '));
        const summary = cleanSummary(ev.summary);
        if (summary) parts.push(summary);
        const text = parts.join(' ').trim();
        if (!text) continue;
        docs.push({
            id: ev.id,
            type: 'event',
            floor: null,
            text,
        });
    }
    return docs;
 }
 // ─────────────────────────────────────────────────────────────────────────
 // 索引构建（分片，不阻塞主线程）
 // ─────────────────────────────────────────────────────────────────────────
 /** 每批添加的文档数 */
 const BUILD_BATCH_SIZE = 500;
 /**
 * 构建 MiniSearch 索引（分片异步）
 *
 * @param {object[]} docs - 文档数组
 * @returns {Promise<MiniSearch>}
 */
 async function buildIndexAsync(docs) {
    const T0 = performance.now();
@@ -158,49 +206,43 @@ async function buildIndexAsync(docs) {
        tokenize: tokenizeForIndex,
    });
-    if (!docs.length) {
+    if (!docs.length) return index;
        return index;
    }
    // 分片添加，每批 BUILD_BATCH_SIZE 条后让出主线程
    for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
        const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
        index.addAll(batch);
        // 非最后一批时让出主线程
        if (i + BUILD_BATCH_SIZE < docs.length) {
            await yieldToMain();
        }
    }
    const elapsed = Math.round(performance.now() - T0);
-    xbLog.info(MODULE_ID,
+    xbLog.info(MODULE_ID, `Index built: ${docs.length} docs (${elapsed}ms)`);
        `索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
    );
    return index;
 }
 // ─────────────────────────────────────────────────────────────────────────
 // 检索
 // ─────────────────────────────────────────────────────────────────────────
 /**
 * @typedef {object} LexicalSearchResult
- * @property {string[]} atomIds    - 命中的 L0 atom IDs
+ * @property {string[]} atomIds - Reserved for backward compatibility (currently empty).
- * @property {Set<number>} atomFloors - 命中的 L0 楼层集合
+ * @property {Set<number>} atomFloors - Reserved for backward compatibility (currently empty).
- * @property {string[]} chunkIds   - 命中的 L1 chunk IDs
+ * @property {string[]} chunkIds - Matched L1 chunk ids sorted by weighted lexical score.
- * @property {Set<number>} chunkFloors - 命中的 L1 楼层集合
+ * @property {Set<number>} chunkFloors - Floor ids covered by matched chunks.
- * @property {string[]} eventIds   - 命中的 L2 event IDs
+ * @property {string[]} eventIds - Matched L2 event ids sorted by weighted lexical score.
- * @property {object[]} chunkScores - chunk 命中详情 [{ chunkId, score }]
+ * @property {object[]} chunkScores - Weighted lexical scores for matched chunks.
- * @property {number}   searchTime - 检索耗时 ms
+ * @property {boolean} idfEnabled - Whether IDF stats are available for weighting.
 * @property {number} idfDocCount - Number of lexical docs used to compute IDF.
 * @property {Array<{term:string,idf:number}>} topIdfTerms - Top query terms by IDF.
 * @property {number} termSearches - Number of per-term MiniSearch queries executed.
 * @property {number} searchTime - Total lexical search time in milliseconds.
 */
 /**
- * 在词法索引中检索
+ * Search lexical index by terms, using per-term MiniSearch and IDF-weighted score aggregation.
 * This keeps existing outputs compatible while adding observability fields.
 *
- * @param {MiniSearch} index - 索引实例
+ * @param {MiniSearch} index
- * @param {string[]} terms - 查询词列表
+ * @param {string[]} terms
 * @returns {LexicalSearchResult}
 */
 export function searchLexicalIndex(index, terms) {
@@ -213,6 +255,10 @@ export function searchLexicalIndex(index, terms) {
        chunkFloors: new Set(),
        eventIds: [],
        chunkScores: [],
        idfEnabled: lexicalDocCount > 0,
        idfDocCount: lexicalDocCount,
        topIdfTerms: [],
        termSearches: 0,
        searchTime: 0,
    };
@@ -221,79 +267,84 @@ export function searchLexicalIndex(index, terms) {
        return result;
    }
-    // 用所有 terms 联合查询
+    const queryTerms = Array.from(new Set((terms || []).map(normalizeTerm).filter(Boolean)));
-    const queryString = terms.join(' ');
+    const weightedScores = new Map(); // docId -> score
    const hitMeta = new Map(); // docId -> { type, floor }
    const idfPairs = [];
-    let hits;
+    for (const term of queryTerms) {
        const idf = computeIdf(term);
        idfPairs.push({ term, idf });
        let hits = [];
        try {
-        hits = index.search(queryString, {
+            hits = index.search(term, {
                boost: { text: 1 },
                fuzzy: 0.2,
                prefix: true,
                combineWith: 'OR',
            // 使用与索引相同的分词器
                tokenize: tokenizeForIndex,
            });
        } catch (e) {
-        xbLog.warn(MODULE_ID, '检索失败', e);
+            xbLog.warn(MODULE_ID, `Lexical term search failed: ${term}`, e);
-        result.searchTime = Math.round(performance.now() - T0);
+            continue;
        return result;
        }
-    // 分类结果
+        result.termSearches += 1;
    const chunkIdSet = new Set();
    const eventIdSet = new Set();
        for (const hit of hits) {
-        const type = hit.type;
+            const id = String(hit.id || '');
-        const id = hit.id;
+            if (!id) continue;
        const floor = hit.floor;
-        switch (type) {
+            const weighted = (hit.score || 0) * idf;
-            case 'chunk':
+            weightedScores.set(id, (weightedScores.get(id) || 0) + weighted);
-                if (!chunkIdSet.has(id)) {
+
-                    chunkIdSet.add(id);
+            if (!hitMeta.has(id)) {
                hitMeta.set(id, {
                    type: hit.type,
                    floor: hit.floor,
                });
            }
        }
    }
    idfPairs.sort((a, b) => b.idf - a.idf);
    result.topIdfTerms = idfPairs.slice(0, 5);
    const sortedHits = Array.from(weightedScores.entries())
        .sort((a, b) => b[1] - a[1]);
    for (const [id, score] of sortedHits) {
        const meta = hitMeta.get(id);
        if (!meta) continue;
        if (meta.type === 'chunk') {
            result.chunkIds.push(id);
-                    result.chunkScores.push({ chunkId: id, score: hit.score });
+            result.chunkScores.push({ chunkId: id, score });
-                    if (typeof floor === 'number' && floor >= 0) {
+            if (typeof meta.floor === 'number' && meta.floor >= 0) {
-                        result.chunkFloors.add(floor);
+                result.chunkFloors.add(meta.floor);
            }
            continue;
        }
                break;
-            case 'event':
+        if (meta.type === 'event') {
                if (!eventIdSet.has(id)) {
                    eventIdSet.add(id);
            result.eventIds.push(id);
        }
                break;
        }
    }
    result.searchTime = Math.round(performance.now() - T0);
-    xbLog.info(MODULE_ID,
+    xbLog.info(
-        `检索完成: terms=[${terms.slice(0, 5).join(',')}] → atoms=${result.atomIds.length} chunks=${result.chunkIds.length} events=${result.eventIds.length} (${result.searchTime}ms)`
+        MODULE_ID,
        `Lexical search terms=[${queryTerms.slice(0, 5).join(',')}] chunks=${result.chunkIds.length} events=${result.eventIds.length} termSearches=${result.termSearches} (${result.searchTime}ms)`,
    );
    return result;
 }
 // ─────────────────────────────────────────────────────────────────────────
 // 内部构建流程（收集数据 + 构建索引）
 // ─────────────────────────────────────────────────────────────────────────
 /**
 * 收集数据并构建索引
 *
 * @param {string} chatId
 * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
 */
 async function collectAndBuild(chatId) {
    // 清空侧索引（全量重建）
    floorDocIds = new Map();
    // 收集数据（不含 L0 atoms）
    const store = getSummaryStore();
    const events = store?.json?.events || [];
@@ -301,48 +352,44 @@ async function collectAndBuild(chatId) {
    try {
        chunks = await getAllChunks(chatId);
    } catch (e) {
-        xbLog.warn(MODULE_ID, '获取 chunks 失败', e);
+        xbLog.warn(MODULE_ID, 'Failed to load chunks', e);
    }
-    const fp = computeFingerprint(chunks.length, events.length);
+    const docs = collectDocuments(chunks, events);
    const fp = computeFingerprintFromDocs(docs);
    // 检查是否在收集过程中缓存已被其他调用更新
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
        return { index: cachedIndex, fingerprint: fp };
    }
-    // 收集文档（同时填充 floorDocIds）
+    rebuildIdfFromDocs(docs);
    const docs = collectDocuments(chunks, events);
    // 异步分片构建
    const index = await buildIndexAsync(docs);
    return { index, fingerprint: fp };
 }
 // ─────────────────────────────────────────────────────────────────────────
 // 公开接口：getLexicalIndex（惰性获取）
 // ─────────────────────────────────────────────────────────────────────────
 /**
- * 获取词法索引（惰性构建 + 缓存）
+ * Expose IDF accessor for query-term selection in query-builder.
- *
+ * If index stats are not ready, this gracefully falls back to idf=1.
 * 如果缓存有效则直接返回；否则自动构建。
 * 如果正在构建中，等待构建完成。
 *
 * @returns {Promise<MiniSearch|null>}
 */
 /**
  * Return an accessor over the current IDF stats.
  *
  * `enabled` and `docCount` are captured when this function is called;
  * `getIdf(term)` delegates to computeIdf at the time it is invoked.
  *
  * @returns {{enabled: boolean, docCount: number, getIdf: (term: string) => number}}
  */
 export function getLexicalIdfAccessor() {
    const docCount = lexicalDocCount;
    const getIdf = (term) => computeIdf(term);
    return { enabled: docCount > 0, docCount, getIdf };
 }
 export async function getLexicalIndex() {
    const { chatId } = getContext();
    if (!chatId) return null;
    // 快速路径：如果缓存存在且 chatId 未变，则直接命中
    // 指纹校验放到构建流程中完成，避免为指纹而额外读一次 IndexedDB
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
        return cachedIndex;
    }
    // 正在构建中，等待结果
    if (building && buildPromise) {
        try {
            await buildPromise;
@@ -350,27 +397,23 @@ export async function getLexicalIndex() {
                return cachedIndex;
            }
        } catch {
-            // 构建失败，继续往下重建
+            // Continue to rebuild below.
        }
    }
-    // 需要重建（指纹将在 collectAndBuild 内部计算并写入缓存）
+    xbLog.info(MODULE_ID, `Lexical cache miss; rebuilding (chatId=${chatId.slice(0, 8)})`);
    xbLog.info(MODULE_ID, `缓存失效，重建索引 (chatId=${chatId.slice(0, 8)})`);
    building = true;
    buildPromise = collectAndBuild(chatId);
    try {
        const { index, fingerprint } = await buildPromise;
        // 原子替换缓存
        cachedIndex = index;
        cachedChatId = chatId;
        cachedFingerprint = fingerprint;
        return index;
    } catch (e) {
-        xbLog.error(MODULE_ID, '索引构建失败', e);
+        xbLog.error(MODULE_ID, 'Index build failed', e);
        return null;
    } finally {
        building = false;
@@ -378,74 +421,29 @@ export async function getLexicalIndex() {
    }
 }
 // ─────────────────────────────────────────────────────────────────────────
 // 公开接口：warmupIndex（异步预建）
 // ─────────────────────────────────────────────────────────────────────────
 /**
 * 异步预建索引
 *
 * 在 CHAT_CHANGED 时调用，后台构建索引。
 * 不阻塞调用方，不返回结果。
 * 构建完成后缓存自动更新，后续 getLexicalIndex() 直接命中。
 *
 * 调用时机：
 * - handleChatChanged（实体注入后）
 * - L0 提取完成
 * - L2 总结完成
 */
 export function warmupIndex() {
    const { chatId } = getContext();
-    if (!chatId) return;
+    if (!chatId || building) return;
    // 已在构建中，不重复触发
    if (building) return;
    // fire-and-forget
    getLexicalIndex().catch(e => {
-        xbLog.warn(MODULE_ID, '预热索引失败', e);
+        xbLog.warn(MODULE_ID, 'Warmup failed', e);
    });
 }
 // ─────────────────────────────────────────────────────────────────────────
 // 公开接口：invalidateLexicalIndex（缓存失效）
 // ─────────────────────────────────────────────────────────────────────────
 /**
 * 使缓存失效（下次 getLexicalIndex / warmupIndex 时自动重建）
 *
 * 调用时机：
 * - CHAT_CHANGED
 * - L0 提取完成
 * - L2 总结完成
 */
 export function invalidateLexicalIndex() {
    if (cachedIndex) {
-        xbLog.info(MODULE_ID, '索引缓存已失效');
+        xbLog.info(MODULE_ID, 'Lexical index cache invalidated');
    }
    cachedIndex = null;
    cachedChatId = null;
    cachedFingerprint = null;
    floorDocIds = new Map();
    clearIdfState();
 }
 // ─────────────────────────────────────────────────────────────────────────
 // 增量更新接口
 // ─────────────────────────────────────────────────────────────────────────
 /**
 * 为指定楼层添加 L1 chunks 到索引
 *
 * 先移除该楼层旧文档，再添加新文档。
 * 如果索引不存在（缓存失效），静默跳过（下次 getLexicalIndex 全量重建）。
 *
 * @param {number} floor - 楼层号
 * @param {object[]} chunks - chunk 对象列表（需有 chunkId、text、floor）
 */
 export function addDocumentsForFloor(floor, chunks) {
    if (!cachedIndex || !chunks?.length) return;
    // 先移除旧文档
    removeDocumentsByFloor(floor);
    const docs = [];
@@ -453,30 +451,29 @@ export function addDocumentsForFloor(floor, chunks) {
    for (const chunk of chunks) {
        if (!chunk?.chunkId || !chunk.text) continue;
-        docs.push({
+
        const doc = {
            id: chunk.chunkId,
            type: 'chunk',
            floor: chunk.floor ?? floor,
            text: chunk.text,
-        });
+        };
        docs.push(doc);
        docIds.push(chunk.chunkId);
    }
-    if (docs.length > 0) {
+    if (!docs.length) return;
    cachedIndex.addAll(docs);
    floorDocIds.set(floor, docIds);
-        xbLog.info(MODULE_ID, `增量添加: floor ${floor}, ${docs.length} 个 chunk`);
+
    for (const doc of docs) {
        addDocumentIdf(doc.id, doc.text);
    }
    xbLog.info(MODULE_ID, `Incremental add floor=${floor} chunks=${docs.length}`);
 }
 /**
 * 从索引中移除指定楼层的所有 L1 chunk 文档
 *
 * 使用 MiniSearch discard()（软删除）。
 * 如果索引不存在，静默跳过。
 *
 * @param {number} floor - 楼层号
 */
 export function removeDocumentsByFloor(floor) {
    if (!cachedIndex) return;
@@ -487,55 +484,39 @@ export function removeDocumentsByFloor(floor) {
        try {
            cachedIndex.discard(id);
        } catch {
-            // 文档可能不存在（已被全量重建替换）
+            // Ignore if the doc was already removed/rebuilt.
        }
        removeDocumentIdf(id);
    }
    floorDocIds.delete(floor);
-    xbLog.info(MODULE_ID, `增量移除: floor ${floor}, ${docIds.length} 个文档`);
+    xbLog.info(MODULE_ID, `Incremental remove floor=${floor} chunks=${docIds.length}`);
 }
 /**
 * 将新 L2 事件添加到索引
 *
 * 如果事件 ID 已存在，先 discard 再 add（覆盖）。
 * 如果索引不存在，静默跳过。
 *
 * @param {object[]} events - 事件对象列表（需有 id、title、summary 等）
 */
 export function addEventDocuments(events) {
    if (!cachedIndex || !events?.length) return;
    const docs = [];
    for (const ev of events) {
-        if (!ev?.id) continue;
+        const doc = buildEventDoc(ev);
        if (!doc) continue;
        const parts = [];
        if (ev.title) parts.push(ev.title);
        if (ev.participants?.length) parts.push(ev.participants.join(' '));
        const summary = cleanSummary(ev.summary);
        if (summary) parts.push(summary);
        const text = parts.join(' ').trim();
        if (!text) continue;
        // 覆盖：先尝试移除旧的
        try {
-            cachedIndex.discard(ev.id);
+            cachedIndex.discard(doc.id);
        } catch {
-            // 不存在则忽略
+            // Ignore if previous document does not exist.
        }
        removeDocumentIdf(doc.id);
        docs.push(doc);
    }
-        docs.push({
+    if (!docs.length) return;
            id: ev.id,
            type: 'event',
            floor: null,
            text,
        });
    }
    if (docs.length > 0) {
    cachedIndex.addAll(docs);
-        xbLog.info(MODULE_ID, `增量添加: ${docs.length} 个事件`);
+    for (const doc of docs) {
        addDocumentIdf(doc.id, doc.text);
    }
    xbLog.info(MODULE_ID, `Incremental add events=${docs.length}`);
 }
--- a/modules/story-summary/vector/retrieval/metrics.js
+++ b/modules/story-summary/vector/retrieval/metrics.js
@@ -52,6 +52,10 @@ export function createMetrics() {
            eventHits: 0,
            searchTime: 0,
            indexReadyTime: 0,
            idfEnabled: false,
            idfDocCount: 0,
            topIdfTerms: [],
            termSearches: 0,
            eventFilteredByDense: 0,
            floorFilteredByDense: 0,
        },
@@ -274,6 +278,20 @@ export function formatMetricsLog(metrics) {
    if (m.lexical.indexReadyTime > 0) {
        lines.push(`├─ index_ready_time: ${m.lexical.indexReadyTime}ms`);
    }
    lines.push(`├─ idf_enabled: ${!!m.lexical.idfEnabled}`);
    if (m.lexical.idfDocCount > 0) {
        lines.push(`├─ idf_doc_count: ${m.lexical.idfDocCount}`);
    }
    if ((m.lexical.topIdfTerms || []).length > 0) {
        const topIdfText = m.lexical.topIdfTerms
            .slice(0, 5)
            .map(x => `${x.term}:${x.idf}`)
            .join(', ');
        lines.push(`├─ top_idf_terms: [${topIdfText}]`);
    }
    if (m.lexical.termSearches > 0) {
        lines.push(`├─ term_searches: ${m.lexical.termSearches}`);
    }
    if (m.lexical.eventFilteredByDense > 0) {
        lines.push(`├─ event_filtered_by_dense: ${m.lexical.eventFilteredByDense}`);
    }
--- a/modules/story-summary/vector/retrieval/query-builder.js
+++ b/modules/story-summary/vector/retrieval/query-builder.js
@@ -20,6 +20,7 @@
 import { getContext } from '../../../../../../../extensions.js';
 import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText, buildCharacterPools } from './entity-lexicon.js';
 import { getLexicalIdfAccessor } from './lexical-index.js';
 import { getSummaryStore } from '../../data/store.js';
 import { filterText } from '../utils/text-filter.js';
 import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
@@ -106,6 +107,7 @@ export function computeLengthFactor(charCount) {
 function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
    if (!text) return [];
    const idfAccessor = getLexicalIdfAccessor();
    const tokens = tokenizerTokenizeForIndex(text);
    const freq = new Map();
    for (const token of tokens) {
@@ -115,9 +117,13 @@ function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
    }
    return Array.from(freq.entries())
-        .sort((a, b) => b[1] - a[1])
+        .map(([term, tf]) => {
            const idf = idfAccessor.enabled ? idfAccessor.getIdf(term) : 1;
            return { term, tf, score: tf * idf };
        })
        .sort((a, b) => (b.score - a.score) || (b.tf - a.tf))
        .slice(0, maxTerms)
-        .map(([term]) => term);
+        .map(x => x.term);
 }
 // ─────────────────────────────────────────────────────────────────────────
--- a/modules/story-summary/vector/retrieval/recall.js
+++ b/modules/story-summary/vector/retrieval/recall.js
@@ -984,6 +984,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
        : CONFIG.LAST_MESSAGES_K;
    const lastMessages = getLastMessages(chat, lastMessagesCount, excludeLastAi);
    // Non-blocking preload: keep recall latency stable.
    // If not ready yet, query-builder will gracefully fall back to TF terms.
    getLexicalIndex().catch((e) => {
        xbLog.warn(MODULE_ID, 'Preload lexical index failed; continue with TF fallback', e);
    });
    const bundle = buildQueryBundle(lastMessages, pendingUserMessage);
    const focusTerms = bundle.focusTerms || bundle.focusEntities || [];
    const focusCharacters = bundle.focusCharacters || [];
@@ -1161,6 +1167,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
        atomIds: [], atomFloors: new Set(),
        chunkIds: [], chunkFloors: new Set(),
        eventIds: [], chunkScores: [], searchTime: 0,
        idfEnabled: false, idfDocCount: 0, topIdfTerms: [], termSearches: 0,
    };
    let indexReadyTime = 0;
@@ -1184,6 +1191,10 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
        metrics.lexical.searchTime = lexicalResult.searchTime || 0;
        metrics.lexical.indexReadyTime = indexReadyTime;
        metrics.lexical.terms = bundle.lexicalTerms.slice(0, 10);
        metrics.lexical.idfEnabled = !!lexicalResult.idfEnabled;
        metrics.lexical.idfDocCount = lexicalResult.idfDocCount || 0;
        metrics.lexical.topIdfTerms = lexicalResult.topIdfTerms || [];
        metrics.lexical.termSearches = lexicalResult.termSearches || 0;
    }
    // 合并 L2 events（lexical 命中但 dense 未命中的 events）
@@ -1238,7 +1249,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
    }
    xbLog.info(MODULE_ID,
-        `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`
+        `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} idfEnabled=${lexicalResult.idfEnabled ? 'yes' : 'no'} idfDocs=${lexicalResult.idfDocCount || 0} termSearches=${lexicalResult.termSearches || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`
    );
    // ═══════════════════════════════════════════════════════════════════
--- a/modules/story-summary/vector/utils/stopwords-base.js
+++ b/modules/story-summary/vector/utils/stopwords-base.js
--- a/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt
+++ b/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt
@@ -0,0 +1,21 @@
 The MIT License (MIT)
 Copyright (c) 2020 Gene Diaz
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/modules/story-summary/vector/utils/stopwords-data/SOURCES.md
+++ b/modules/story-summary/vector/utils/stopwords-data/SOURCES.md
@@ -0,0 +1,15 @@
 # stopwords sources for story-summary
 - Dataset: `stopwords-iso` (npm package, version 1.1.0)
 - Repository: https://github.com/stopwords-iso/stopwords-iso
 - License: MIT
 - Snapshot date: 2026-02-16
 - Languages used: `zh`, `ja`, `en`
 - Local snapshot files:
  - `stopwords-iso.zh.txt`
  - `stopwords-iso.ja.txt`
  - `stopwords-iso.en.txt`
 Generation note:
 - `modules/story-summary/vector/utils/stopwords-base.js` is generated from these snapshot files.
 - Keep `stopwords-patch.js` for tiny domain overrides only.
--- a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt
--- a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt
@@ -0,0 +1,134 @@
 あそこ
 あっ
 あの
 あのかた
 あの人
 あり
 あります
 ある
 あれ
 い
 いう
 います
 いる
 う
 うち
 え
 お
 および
 おり
 おります
 か
 かつて
 から
 が
 き
 ここ
 こちら
 こと
 この
 これ
 これら
 さ
 さらに
 し
 しかし
 する
 ず
 せ
 せる
 そこ
 そして
 その
 その他
 その後
 それ
 それぞれ
 それで
 た
 ただし
 たち
 ため
 たり
 だ
 だっ
 だれ
 つ
 て
 で
 でき
 できる
 です
 では
 でも
 と
 という
 といった
 とき
 ところ
 として
 とともに
 とも
 と共に
 どこ
 どの
 な
 ない
 なお
 なかっ
 ながら
 なく
 なっ
 など
 なに
 なら
 なり
 なる
 なん
 に
 において
 における
 について
 にて
 によって
 により
 による
 に対して
 に対する
 に関する
 の
 ので
 のみ
 は
 ば
 へ
 ほか
 ほとんど
 ほど
 ます
 また
 または
 まで
 も
 もの
 ものの
 や
 よう
 より
 ら
 られ
 られる
 れ
 れる
 を
 ん
 何
 及び
 彼
 彼女
 我々
 特に
 私
 私達
 貴方
 貴方方
--- a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt
@@ -0,0 +1,794 @@
 、
 。
 〈
 〉
 《
 》
 一
 一个
 一些
 一何
 一切
 一则
 一方面
 一旦
 一来
 一样
 一种
 一般
 一转眼
 七
 万一
 三
 上
 上下
 下
 不
 不仅
 不但
 不光
 不单
 不只
 不外乎
 不如
 不妨
 不尽
 不尽然
 不得
 不怕
 不惟
 不成
 不拘
 不料
 不是
 不比
 不然
 不特
 不独
 不管
 不至于
 不若
 不论
 不过
 不问
 与
 与其
 与其说
 与否
 与此同时
 且
 且不说
 且说
 两者
 个
 个别
 中
 临
 为
 为了
 为什么
 为何
 为止
 为此
 为着
 乃
 乃至
 乃至于
 么
 之
 之一
 之所以
 之类
 乌乎
 乎
 乘
 九
 也
 也好
 也罢
 了
 二
 二来
 于
 于是
 于是乎
 云云
 云尔
 五
 些
 亦
 人
 人们
 人家
 什
 什么
 什么样
 今
 介于
 仍
 仍旧
 从
 从此
 从而
 他
 他人
 他们
 他们们
 以
 以上
 以为
 以便
 以免
 以及
 以故
 以期
 以来
 以至
 以至于
 以致
 们
 任
 任何
 任凭
 会
 似的
 但
 但凡
 但是
 何
 何以
 何况
 何处
 何时
 余外
 作为
 你
 你们
 使
 使得
 例如
 依
 依据
 依照
 便于
 俺
 俺们
 倘
 倘使
 倘或
 倘然
 倘若
 借
 借傥然
 假使
 假如
 假若
 做
 像
 儿
 先不先
 光
 光是
 全体
 全部
 八
 六
 兮
 共
 关于
 关于具体地说
 其
 其一
 其中
 其二
 其他
 其余
 其它
 其次
 具体地说
 具体说来
 兼之
 内
 再
 再其次
 再则
 再有
 再者
 再者说
 再说
 冒
 冲
 况且
 几
 几时
 凡
 凡是
 凭
 凭借
 出于
 出来
 分
 分别
 则
 则甚
 别
 别人
 别处
 别是
 别的
 别管
 别说
 到
 前后
 前此
 前者
 加之
 加以
 区
 即
 即令
 即使
 即便
 即如
 即或
 即若
 却
 去
 又
 又及
 及
 及其
 及至
 反之
 反而
 反过来
 反过来说
 受到
 另
 另一方面
 另外
 另悉
 只
 只当
 只怕
 只是
 只有
 只消
 只要
 只限
 叫
 叮咚
 可
 可以
 可是
 可见
 各
 各个
 各位
 各种
 各自
 同
 同时
 后
 后者
 向
 向使
 向着
 吓
 吗
 否则
 吧
 吧哒
 含
 吱
 呀
 呃
 呕
 呗
 呜
 呜呼
 呢
 呵
 呵呵
 呸
 呼哧
 咋
 和
 咚
 咦
 咧
 咱
 咱们
 咳
 哇
 哈
 哈哈
 哉
 哎
 哎呀
 哎哟
 哗
 哟
 哦
 哩
 哪
 哪个
 哪些
 哪儿
 哪天
 哪年
 哪怕
 哪样
 哪边
 哪里
 哼
 哼唷
 唉
 唯有
 啊
 啐
 啥
 啦
 啪达
 啷当
 喂
 喏
 喔唷
 喽
 嗡
 嗡嗡
 嗬
 嗯
 嗳
 嘎
 嘎登
 嘘
 嘛
 嘻
 嘿
 嘿嘿
 四
 因
 因为
 因了
 因此
 因着
 因而
 固然
 在
 在下
 在于
 地
 基于
 处在
 多
 多么
 多少
 大
 大家
 她
 她们
 好
 如
 如上
 如上所述
 如下
 如何
 如其
 如同
 如是
 如果
 如此
 如若
 始而
 孰料
 孰知
 宁
 宁可
 宁愿
 宁肯
 它
 它们
 对
 对于
 对待
 对方
 对比
 将
 小
 尔
 尔后
 尔尔
 尚且
 就
 就是
 就是了
 就是说
 就算
 就要
 尽
 尽管
 尽管如此
 岂但
 己
 已
 已矣
 巴
 巴巴
 年
 并
 并且
 庶乎
 庶几
 开外
 开始
 归
 归齐
 当
 当地
 当然
 当着
 彼
 彼时
 彼此
 往
 待
 很
 得
 得了
 怎
 怎么
 怎么办
 怎么样
 怎奈
 怎样
 总之
 总的来看
 总的来说
 总的说来
 总而言之
 恰恰相反
 您
 惟其
 慢说
 我
 我们
 或
 或则
 或是
 或曰
 或者
 截至
 所
 所以
 所在
 所幸
 所有
 才
 才能
 打
 打从
 把
 抑或
 拿
 按
 按照
 换句话说
 换言之
 据
 据此
 接着
 故
 故此
 故而
 旁人
 无
 无宁
 无论
 既
 既往
 既是
 既然
 日
 时
 时候
 是
 是以
 是的
 更
 曾
 替
 替代
 最
 月
 有
 有些
 有关
 有及
 有时
 有的
 望
 朝
 朝着
 本
 本人
 本地
 本着
 本身
 来
 来着
 来自
 来说
 极了
 果然
 果真
 某
 某个
 某些
 某某
 根据
 欤
 正值
 正如
 正巧
 正是
 此
 此地
 此处
 此外
 此时
 此次
 此间
 毋宁
 每
 每当
 比
 比及
 比如
 比方
 没奈何
 沿
 沿着
 漫说
 点
 焉
 然则
 然后
 然而
 照
 照着
 犹且
 犹自
 甚且
 甚么
 甚或
 甚而
 甚至
 甚至于
 用
 用来
 由
 由于
 由是
 由此
 由此可见
 的
 的确
 的话
 直到
 相对而言
 省得
 看
 眨眼
 着
 着呢
 矣
 矣乎
 矣哉
 离
 秒
 称
 竟而
 第
 等
 等到
 等等
 简言之
 管
 类如
 紧接着
 纵
 纵令
 纵使
 纵然
 经
 经过
 结果
 给
 继之
 继后
 继而
 综上所述
 罢了
 者
 而
 而且
 而况
 而后
 而外
 而已
 而是
 而言
 能
 能否
 腾
 自
 自个儿
 自从
 自各儿
 自后
 自家
 自己
 自打
 自身
 至
 至于
 至今
 至若
 致
 般的
 若
 若夫
 若是
 若果
 若非
 莫不然
 莫如
 莫若
 虽
 虽则
 虽然
 虽说
 被
 要
 要不
 要不是
 要不然
 要么
 要是
 譬喻
 譬如
 让
 许多
 论
 设使
 设或
 设若
 诚如
 诚然
 该
 说
 说来
 请
 诸
 诸位
 诸如
 谁
 谁人
 谁料
 谁知
 贼死
 赖以
 赶
 起
 起见
 趁
 趁着
 越是
 距
 跟
 较
 较之
 边
 过
 还
 还是
 还有
 还要
 这
 这一来
 这个
 这么
 这么些
 这么样
 这么点儿
 这些
 这会儿
 这儿
 这就是说
 这时
 这样
 这次
 这般
 这边
 这里
 进而
 连
 连同
 逐步
 通过
 遵循
 遵照
 那
 那个
 那么
 那么些
 那么样
 那些
 那会儿
 那儿
 那时
 那样
 那般
 那边
 那里
 都
 鄙人
 鉴于
 针对
 阿
 除
 除了
 除外
 除开
 除此之外
 除非
 随
 随后
 随时
 随着
 难道说
 零
 非
 非但
 非徒
 非特
 非独
 靠
 顺
 顺着
 首先
 ︿
 ！
 ＃
 ＄
 ％
 ＆
 （
 ）
 ＊
 ＋
 ，
 ０
 １
 ２
 ３
 ４
 ５
 ６
 ７
 ８
 ９
 ：
 ；
 ＜
 ＞
 ？
 ＠
 ［
 ］
 ｛
 ｜
 ｝
 ～
 ￥
--- a/modules/story-summary/vector/utils/stopwords-patch.js
+++ b/modules/story-summary/vector/utils/stopwords-patch.js
@@ -0,0 +1,9 @@
 // Small domain-level tuning surface.
 // Keep this file tiny: add/remove only words that are repeatedly noisy in real logs.
 // tokenizer.js trims and lowercases every entry before use and drops empty ones,
 // so casing and surrounding whitespace in these lists do not matter.
 // Tokens shorter than 2 characters are discarded before the stopword check,
 // so single-character entries in either list have no effect.
 // Extra stopwords on top of BASE_STOP_WORDS.
 // Both lists are merged into the tokenizer's effective stopword set.
 export const DOMAIN_STOP_WORDS = [];
 // High-value words that must never be filtered as stopwords.
 // A token listed here is kept even when it also appears in the effective stopword set.
 // Default to empty for plugin-wide deployment; entity names are already protected dynamically.
 export const KEEP_WORDS = [];
--- a/modules/story-summary/vector/utils/tokenizer.js
+++ b/modules/story-summary/vector/utils/tokenizer.js
@@ -18,6 +18,8 @@
 import { extensionFolderPath } from '../../../../core/constants.js';
 import { xbLog } from '../../../../core/debug-core.js';
 import { BASE_STOP_WORDS } from './stopwords-base.js';
 import { DOMAIN_STOP_WORDS, KEEP_WORDS } from './stopwords-patch.js';
 const MODULE_ID = 'tokenizer';
@@ -61,44 +63,30 @@ let entityList = [];
 /** @type {Set<string>} Entities already injected into jieba (avoids repeated add_word calls) */
 let injectedEntities = new Set();
 // Trimmed, lowercased entity names. A token found in this set is exempt from
 // stopword filtering (checked by shouldKeepTokenByWhitelist). It is rebuilt by
 // injectEntities and emptied by reset.
 let entityKeepSet = new Set();
 // ═══════════════════════════════════════════════════════════════════════════
 // 停用词
 // ═══════════════════════════════════════════════════════════════════════════
-const STOP_WORDS = new Set([
+const STATIC_KEEP_WORDS = new Set((KEEP_WORDS || [])
-    // 中文高频虚词
+    .map(w => String(w || '').trim().toLowerCase())
-    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
+    .filter(Boolean));
-    '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
+
-    '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
+// Standard source only: stopwords-iso snapshot + small domain patch.
-    '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
+const EFFECTIVE_STOP_WORDS = new Set(
-    '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
+    [...BASE_STOP_WORDS, ...DOMAIN_STOP_WORDS]
-    '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
+        .map(w => String(w || '').trim().toLowerCase())
-    '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
+        .filter(Boolean),
-    '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
+);
-    '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
+
-    '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
+function shouldKeepTokenByWhitelist(token) {
-    '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
+    const t = String(token || '').trim().toLowerCase();
-    // 日语常见虚词（≥2字，匹配 TinySegmenter 产出粒度）
+    if (!t) return false;
-    'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある',
+    if (STATIC_KEEP_WORDS.has(t)) return true;
-    'なる', 'れる', 'られ', 'られる',
+    if (entityKeepSet.has(t)) return true;
-    'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
+    return false;
-    'これ', 'それ', 'あれ', 'どれ',
+}
    'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
    'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
    // 英文常见停用词
    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
    'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
    'would', 'could', 'should', 'may', 'might', 'can', 'shall',
    'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
    'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
    'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
    'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
    'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
    'both', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'own', 'same', 'just', 'very', 'also', 'about',
 ]);
 // ═══════════════════════════════════════════════════════════════════════════
 // Unicode 分类
@@ -571,6 +559,7 @@ export function getState() {
 export function injectEntities(lexicon, displayMap) {
    if (!lexicon?.size) {
        entityList = [];
        entityKeepSet = new Set();
        return;
    }
@@ -586,6 +575,7 @@ export function injectEntities(lexicon, displayMap) {
    // 按长度降序（最长匹配优先）
    entities.sort((a, b) => b.length - a.length);
    entityList = entities;
    entityKeepSet = new Set(entities.map(e => String(e || '').trim().toLowerCase()).filter(Boolean));
    // 如果结巴已就绪，注入自定义词
    if (wasmState === WasmState.READY && jiebaAddWord) {
@@ -656,7 +646,7 @@ export function tokenize(text) {
        if (!cleaned) continue;
        if (cleaned.length < 2) continue;
-        if (STOP_WORDS.has(cleaned)) continue;
+        if (EFFECTIVE_STOP_WORDS.has(cleaned) && !shouldKeepTokenByWhitelist(cleaned)) continue;
        if (seen.has(cleaned)) continue;
        // 过滤纯标点/特殊字符
@@ -728,7 +718,7 @@ export function tokenizeForIndex(text) {
        .map(t => t.trim().toLowerCase())
        .filter(t => {
            if (!t || t.length < 2) return false;
-            if (STOP_WORDS.has(t)) return false;
+            if (EFFECTIVE_STOP_WORDS.has(t) && !shouldKeepTokenByWhitelist(t)) return false;
            if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) return false;
            return true;
        });
@@ -744,6 +734,7 @@ export function tokenizeForIndex(text) {
 */
 export function reset() {
    // Drop the current entity list.
    entityList = [];
    // Empty the entity whitelist so no entity name is exempted from stopword filtering.
    entityKeepSet = new Set();
    // Forget which entities were already injected into jieba.
    injectedEntities.clear();
    // Do not reset the WASM state (avoids loading it again).
 }