improve lexical warmup and standardize stopword pipeline

2026-02-17 14:49:47 +08:00
parent 246eb7a7e2
commit 94eceaed96
14 changed files with 4840 additions and 330 deletions
--- a/modules/story-summary/story-summary.js
+++ b/modules/story-summary/story-summary.js
@@ -1551,6 +1551,7 @@ async function handleMessageReceived(scheduledChatId) {

    // Refresh entity lexicon after new message (new roles may appear)
    refreshEntityLexiconAndWarmup();
+    scheduleLexicalWarmup(100);

    // Auto backfill missing L0 (delay to avoid contention with current floor)
    setTimeout(() => maybeAutoExtractL0(), 2000);
@@ -1559,6 +1560,7 @@ async function handleMessageReceived(scheduledChatId) {
 function handleMessageSent(scheduledChatId) { // Handler run after the user sends a message; `scheduledChatId` is the chat the handler was scheduled for.
    if (isChatStale(scheduledChatId)) return; // Do nothing when isChatStale() reports the scheduled chat as stale.
    initButtonsForAll();
+    scheduleLexicalWarmup(0); // NOTE(review): argument is presumably a delay in ms (the received-message path passes 100) — confirm against scheduleLexicalWarmup.
    setTimeout(() => maybeAutoRunSummary("before_user"), 1000); // Deferred by 1000 ms via setTimeout.
 }

--- a/modules/story-summary/vector/llm/llm-service.js
+++ b/modules/story-summary/vector/llm/llm-service.js
@@ -2,7 +2,6 @@
 // vector/llm/llm-service.js - 修复 prefill 传递方式
 // ═══════════════════════════════════════════════════════════════════════════
 import { xbLog } from '../../../../core/debug-core.js';
-import { getVectorConfig } from '../../data/config.js';
 import { getApiKey } from './siliconflow.js';

 const MODULE_ID = 'vector-llm-service';
--- a/modules/story-summary/vector/retrieval/lexical-index.js
+++ b/modules/story-summary/vector/retrieval/lexical-index.js
@@ -1,16 +1,3 @@
-// ═══════════════════════════════════════════════════════════════════════════
-// lexical-index.js - MiniSearch 词法检索索引
-//
-// 职责：
-// 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
-// 2. 提供词法检索接口（专名精确匹配兜底）
-// 3. 惰性构建 + 异步预热 + 缓存失效机制
-//
-// 索引存储：纯内存（不持久化）
-// 分词器：统一使用 tokenizer.js（结巴 + 实体保护 + 降级）
-// 重建时机：CHAT_CHANGED / L0提取完成 / L2总结完成
-// ═══════════════════════════════════════════════════════════════════════════
-
 import MiniSearch from '../../../../libs/minisearch.mjs';
 import { getContext } from '../../../../../../../extensions.js';
 import { getSummaryStore } from '../../data/store.js';
@@ -20,76 +7,166 @@ import { tokenizeForIndex } from '../utils/tokenizer.js';

 const MODULE_ID = 'lexical-index';

-// ─────────────────────────────────────────────────────────────────────────
-// 缓存
-// ─────────────────────────────────────────────────────────────────────────
-
-/** @type {MiniSearch|null} */
+// In-memory index cache
 let cachedIndex = null;
-
-/** @type {string|null} */
 let cachedChatId = null;
-
-/** @type {string|null} 数据指纹（atoms + chunks + events 数量） */
 let cachedFingerprint = null;
-
-/** @type {boolean} 是否正在构建 */
 let building = false;
-
-/** @type {Promise<MiniSearch|null>|null} 当前构建 Promise（防重入） */
 let buildPromise = null;
-/** @type {Map<number, string[]>} floor → 该楼层的 doc IDs（仅 L1 chunks） */
+
+// floor -> chunk doc ids (L1 only)
 let floorDocIds = new Map();

-// ─────────────────────────────────────────────────────────────────────────
-// 工具函数
-// ─────────────────────────────────────────────────────────────────────────
+// IDF stats over lexical docs (L1 chunks + L2 events)
+let termDfMap = new Map();
+let docTokenSets = new Map(); // docId -> Set<token>
+let lexicalDocCount = 0;
+
+const IDF_MIN = 1.0;
+const IDF_MAX = 4.0;
+const BUILD_BATCH_SIZE = 500;

-/**
- * 清理事件摘要（移除楼层标记）
- * @param {string} summary
- * @returns {string}
- */
 function cleanSummary(summary) { // Cleans an event summary by removing its trailing floor marker; null/undefined/empty input yields ''.
    return String(summary || '')
        .replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '') // Strips a trailing "(#N)" or "(#N-M)" marker together with surrounding whitespace.
        .trim();
 }

-/**
- * 计算缓存指纹
- * @param {number} chunkCount
- * @param {number} eventCount
- * @returns {string}
- */
-function computeFingerprint(chunkCount, eventCount) {
-    return `${chunkCount}:${eventCount}`;
+function fnv1a32(input, seed = 0x811C9DC5) { // 32-bit FNV-1a style string hash; `seed` defaults to the FNV offset basis and lets callers chain hashes across several inputs.
+    let hash = seed >>> 0; // Coerce the seed to an unsigned 32-bit integer.
+    const text = String(input || ''); // Falsy inputs (null, undefined, 0, '') are hashed as the empty string.
+    for (let i = 0; i < text.length; i++) {
+        hash ^= text.charCodeAt(i); // Mixes in one UTF-16 code unit per step (not one byte).
+        hash = Math.imul(hash, 0x01000193) >>> 0; // Multiply by the 32-bit FNV prime; Math.imul keeps the product within 32 bits.
+    }
+    return hash >>> 0; // Always a non-negative 32-bit integer.
+}
+
+function compareDocKeys(a, b) { // Sort comparator ordering docs by their "type:id" key; missing docs or fields compare as empty strings.
+    const ka = `${a?.type || ''}:${a?.id || ''}`;
+    const kb = `${b?.type || ''}:${b?.id || ''}`;
+    if (ka < kb) return -1; // Plain string comparison (UTF-16 code-unit order), not locale-aware.
+    if (ka > kb) return 1;
+    return 0; // Keys are equal.
+}
+
+function computeFingerprintFromDocs(docs) { // Builds a cache fingerprint "<docCount>:<hex hash>" from each doc's type, id, floor and text; non-array input counts as no docs.
+    const normalizedDocs = Array.isArray(docs) ? [...docs].sort(compareDocKeys) : []; // Sort a copy: docs with distinct type:id keys hash in a fixed order and the caller's array is left untouched.
+    let hash = 0x811C9DC5; // Same starting value as fnv1a32's default seed.
+
+    for (const doc of normalizedDocs) {
+        const payload = `${doc?.type || ''}\u001F${doc?.id || ''}\u001F${doc?.floor ?? ''}\u001F${doc?.text || ''}\u001E`; // Fields are delimited by U+001F and each record ends with U+001E.
+        hash = fnv1a32(payload, hash); // Chain: the running hash seeds the next document's hash.
+    }
+
+    return `${normalizedDocs.length}:${(hash >>> 0).toString(16)}`; // Count prefix plus the hash rendered in hexadecimal.
 }

-/**
- * 让出主线程（避免长时间阻塞 UI）
- * @returns {Promise<void>}
- */
 function yieldToMain() { // Returns a promise that resolves on a later timer tick, letting other queued work run between batches.
    return new Promise(resolve => setTimeout(resolve, 0)); // setTimeout with a 0 ms delay.
 }

-// ─────────────────────────────────────────────────────────────────────────
-// 文档收集
-// ─────────────────────────────────────────────────────────────────────────
+function clamp(v, min, max) { // Restricts v to the closed range [min, max].
+    return Math.max(min, Math.min(max, v)); // If min > max the result is min, because Math.max is applied last.
+}
+
+function normalizeTerm(term) { // Canonical term form used for IDF bookkeeping: stringified, trimmed and lower-cased; falsy input yields ''.
+    return String(term || '').trim().toLowerCase();
+}
+
+function computeIdfFromDf(df, docCount) { // Smoothed IDF for a term found in `df` of `docCount` docs, clamped to [IDF_MIN, IDF_MAX].
+    if (!docCount || docCount <= 0) return 1; // No corpus statistics available: neutral weight.
+    const raw = Math.log((docCount + 1) / ((df || 0) + 1)) + 1; // ln((N + 1) / (df + 1)) + 1; a missing df counts as 0.
+    return clamp(raw, IDF_MIN, IDF_MAX);
+}
+
+function computeIdf(term) { // IDF of a term against the module-level stats (termDfMap / lexicalDocCount).
+    const t = normalizeTerm(term);
+    if (!t || lexicalDocCount <= 0) return 1; // Empty term or no tracked docs: neutral weight of 1.
+    return computeIdfFromDf(termDfMap.get(t) || 0, lexicalDocCount); // Terms absent from the map are treated as df = 0.
+}
+
+function extractUniqueTokens(text) { // Tokenizes text with tokenizeForIndex and returns the Set of distinct normalized, non-empty tokens.
+    return new Set(tokenizeForIndex(String(text || '')).map(normalizeTerm).filter(Boolean));
+}
+
+function clearIdfState() { // Resets all module-level IDF statistics to their empty state.
+    termDfMap = new Map(); // token -> number of tracked docs containing it
+    docTokenSets = new Map(); // docId -> Set of that doc's tokens
+    lexicalDocCount = 0; // number of tracked docs
+}
+
+function removeDocumentIdf(docId) { // Removes one document's contribution from the IDF stats; no-op for an empty or untracked id.
+    const id = String(docId || '');
+    if (!id) return;
+
+    const tokens = docTokenSets.get(id);
+    if (!tokens) return; // Untracked doc: leave every counter untouched.
+
+    for (const token of tokens) {
+        const current = termDfMap.get(token) || 0;
+        if (current <= 1) {
+            termDfMap.delete(token); // Last doc containing this token: drop the entry instead of storing 0.
+        } else {
+            termDfMap.set(token, current - 1);
+        }
+    }
+
+    docTokenSets.delete(id);
+    lexicalDocCount = Math.max(0, lexicalDocCount - 1); // Never let the doc count go negative.
+}
+
+function addDocumentIdf(docId, text) { // Registers (or replaces) a document in the IDF stats; ignored when the id is empty.
+    const id = String(docId || '');
+    if (!id) return;
+
+    // Replace semantics: remove old token set first if this id already exists.
+    removeDocumentIdf(id);
+
+    const tokens = extractUniqueTokens(text);
+    docTokenSets.set(id, tokens); // Remembered so removeDocumentIdf can later undo exactly these increments.
+    lexicalDocCount += 1; // Counted even when the token set turns out empty.
+
+    for (const token of tokens) {
+        termDfMap.set(token, (termDfMap.get(token) || 0) + 1); // Each distinct token raises its document frequency by one.
+    }
+}
+
+function rebuildIdfFromDocs(docs) { // Resets the IDF stats and recomputes them from the given docs (objects with `id` and `text`); null/undefined means no docs.
+    clearIdfState();
+    for (const doc of docs || []) {
+        const id = String(doc?.id || '');
+        const text = String(doc?.text || '');
+        if (!id || !text.trim()) continue; // Skip docs with no id or whitespace-only text.
+        addDocumentIdf(id, text);
+    }
+}
+
+function buildEventDoc(ev) { // Converts an event into an index document { id, type: 'event', floor: null, text }, or returns null when it has no id or no text.
+    if (!ev?.id) return null;
+
+    const parts = []; // Text fragments in order: title, participants, cleaned summary.
+    if (ev.title) parts.push(ev.title);
+    if (ev.participants?.length) parts.push(ev.participants.join(' ')); // Participants are joined with single spaces.
+
+    const summary = cleanSummary(ev.summary);
+    if (summary) parts.push(summary);
+
+    const text = parts.join(' ').trim();
+    if (!text) return null; // Nothing to index for this event.
+
+    return {
+        id: ev.id,
+        type: 'event',
+        floor: null, // Event docs carry no floor.
+        text,
+    };
+}

-/**
- * 收集所有待索引文档
- *
- * @param {object[]} chunks - getAllChunks(chatId) 返回值
- * @param {object[]} events - store.json.events
- * @returns {object[]} 文档数组
- */
 function collectDocuments(chunks, events) {
    const docs = [];

-    // L1 chunks + 填充 floorDocIds
-    for (const chunk of (chunks || [])) {
+    for (const chunk of chunks || []) {
        if (!chunk?.chunkId || !chunk.text) continue;

        const floor = chunk.floor ?? -1;
@@ -101,48 +178,19 @@ function collectDocuments(chunks, events) {
        });

        if (floor >= 0) {
-            if (!floorDocIds.has(floor)) {
-                floorDocIds.set(floor, []);
-            }
+            if (!floorDocIds.has(floor)) floorDocIds.set(floor, []);
            floorDocIds.get(floor).push(chunk.chunkId);
        }
    }

-    // L2 events
-    for (const ev of (events || [])) {
-        if (!ev?.id) continue;
-        const parts = [];
-        if (ev.title) parts.push(ev.title);
-        if (ev.participants?.length) parts.push(ev.participants.join(' '));
-        const summary = cleanSummary(ev.summary);
-        if (summary) parts.push(summary);
-        const text = parts.join(' ').trim();
-        if (!text) continue;
-
-        docs.push({
-            id: ev.id,
-            type: 'event',
-            floor: null,
-            text,
-        });
+    for (const ev of events || []) {
+        const doc = buildEventDoc(ev);
+        if (doc) docs.push(doc);
    }

    return docs;
 }

-// ─────────────────────────────────────────────────────────────────────────
-// 索引构建（分片，不阻塞主线程）
-// ─────────────────────────────────────────────────────────────────────────
-
-/** 每批添加的文档数 */
-const BUILD_BATCH_SIZE = 500;
-
-/**
- * 构建 MiniSearch 索引（分片异步）
- *
- * @param {object[]} docs - 文档数组
- * @returns {Promise<MiniSearch>}
- */
 async function buildIndexAsync(docs) {
    const T0 = performance.now();

@@ -158,49 +206,43 @@ async function buildIndexAsync(docs) {
        tokenize: tokenizeForIndex,
    });

-    if (!docs.length) {
-        return index;
-    }
+    if (!docs.length) return index;

-    // 分片添加，每批 BUILD_BATCH_SIZE 条后让出主线程
    for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
        const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
        index.addAll(batch);

-        // 非最后一批时让出主线程
        if (i + BUILD_BATCH_SIZE < docs.length) {
            await yieldToMain();
        }
    }

    const elapsed = Math.round(performance.now() - T0);
-    xbLog.info(MODULE_ID,
-        `索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
-    );
-
+    xbLog.info(MODULE_ID, `Index built: ${docs.length} docs (${elapsed}ms)`);
    return index;
 }

-// ─────────────────────────────────────────────────────────────────────────
-// 检索
-// ─────────────────────────────────────────────────────────────────────────
-
 /**
 * @typedef {object} LexicalSearchResult
- * @property {string[]} atomIds    - 命中的 L0 atom IDs
- * @property {Set<number>} atomFloors - 命中的 L0 楼层集合
- * @property {string[]} chunkIds   - 命中的 L1 chunk IDs
- * @property {Set<number>} chunkFloors - 命中的 L1 楼层集合
- * @property {string[]} eventIds   - 命中的 L2 event IDs
- * @property {object[]} chunkScores - chunk 命中详情 [{ chunkId, score }]
- * @property {number}   searchTime - 检索耗时 ms
+ * @property {string[]} atomIds - Reserved for backward compatibility (currently empty).
+ * @property {Set<number>} atomFloors - Reserved for backward compatibility (currently empty).
+ * @property {string[]} chunkIds - Matched L1 chunk ids sorted by weighted lexical score.
+ * @property {Set<number>} chunkFloors - Floor ids covered by matched chunks.
+ * @property {string[]} eventIds - Matched L2 event ids sorted by weighted lexical score.
+ * @property {object[]} chunkScores - Weighted lexical scores for matched chunks.
+ * @property {boolean} idfEnabled - Whether IDF stats are available for weighting.
+ * @property {number} idfDocCount - Number of lexical docs used to compute IDF.
+ * @property {Array<{term:string,idf:number}>} topIdfTerms - Top query terms by IDF.
+ * @property {number} termSearches - Number of per-term MiniSearch queries executed.
+ * @property {number} searchTime - Total lexical search time in milliseconds.
 */

 /**
- * 在词法索引中检索
+ * Search lexical index by terms, using per-term MiniSearch and IDF-weighted score aggregation.
+ * This keeps existing outputs compatible while adding observability fields.
 *
- * @param {MiniSearch} index - 索引实例
- * @param {string[]} terms - 查询词列表
+ * @param {MiniSearch} index
+ * @param {string[]} terms
 * @returns {LexicalSearchResult}
 */
 export function searchLexicalIndex(index, terms) {
@@ -213,6 +255,10 @@ export function searchLexicalIndex(index, terms) {
        chunkFloors: new Set(),
        eventIds: [],
        chunkScores: [],
+        idfEnabled: lexicalDocCount > 0,
+        idfDocCount: lexicalDocCount,
+        topIdfTerms: [],
+        termSearches: 0,
        searchTime: 0,
    };

@@ -221,79 +267,84 @@ export function searchLexicalIndex(index, terms) {
        return result;
    }

-    // 用所有 terms 联合查询
-    const queryString = terms.join(' ');
+    const queryTerms = Array.from(new Set((terms || []).map(normalizeTerm).filter(Boolean)));
+    const weightedScores = new Map(); // docId -> score
+    const hitMeta = new Map(); // docId -> { type, floor }
+    const idfPairs = [];

-    let hits;
-    try {
-        hits = index.search(queryString, {
-            boost: { text: 1 },
-            fuzzy: 0.2,
-            prefix: true,
-            combineWith: 'OR',
-            // 使用与索引相同的分词器
-            tokenize: tokenizeForIndex,
-        });
-    } catch (e) {
-        xbLog.warn(MODULE_ID, '检索失败', e);
-        result.searchTime = Math.round(performance.now() - T0);
-        return result;
+    for (const term of queryTerms) {
+        const idf = computeIdf(term);
+        idfPairs.push({ term, idf });
+
+        let hits = [];
+        try {
+            hits = index.search(term, {
+                boost: { text: 1 },
+                fuzzy: 0.2,
+                prefix: true,
+                combineWith: 'OR',
+                tokenize: tokenizeForIndex,
+            });
+        } catch (e) {
+            xbLog.warn(MODULE_ID, `Lexical term search failed: ${term}`, e);
+            continue;
+        }
+
+        result.termSearches += 1;
+
+        for (const hit of hits) {
+            const id = String(hit.id || '');
+            if (!id) continue;
+
+            const weighted = (hit.score || 0) * idf;
+            weightedScores.set(id, (weightedScores.get(id) || 0) + weighted);
+
+            if (!hitMeta.has(id)) {
+                hitMeta.set(id, {
+                    type: hit.type,
+                    floor: hit.floor,
+                });
+            }
+        }
    }

-    // 分类结果
-    const chunkIdSet = new Set();
-    const eventIdSet = new Set();
+    idfPairs.sort((a, b) => b.idf - a.idf);
+    result.topIdfTerms = idfPairs.slice(0, 5);

-    for (const hit of hits) {
-        const type = hit.type;
-        const id = hit.id;
-        const floor = hit.floor;
+    const sortedHits = Array.from(weightedScores.entries())
+        .sort((a, b) => b[1] - a[1]);

-        switch (type) {
-            case 'chunk':
-                if (!chunkIdSet.has(id)) {
-                    chunkIdSet.add(id);
-                    result.chunkIds.push(id);
-                    result.chunkScores.push({ chunkId: id, score: hit.score });
-                    if (typeof floor === 'number' && floor >= 0) {
-                        result.chunkFloors.add(floor);
-                    }
-                }
-                break;
+    for (const [id, score] of sortedHits) {
+        const meta = hitMeta.get(id);
+        if (!meta) continue;

-            case 'event':
-                if (!eventIdSet.has(id)) {
-                    eventIdSet.add(id);
-                    result.eventIds.push(id);
-                }
-                break;
+        if (meta.type === 'chunk') {
+            result.chunkIds.push(id);
+            result.chunkScores.push({ chunkId: id, score });
+            if (typeof meta.floor === 'number' && meta.floor >= 0) {
+                result.chunkFloors.add(meta.floor);
+            }
+            continue;
+        }
+
+        if (meta.type === 'event') {
+            result.eventIds.push(id);
        }
    }

    result.searchTime = Math.round(performance.now() - T0);

-    xbLog.info(MODULE_ID,
-        `检索完成: terms=[${terms.slice(0, 5).join(',')}] → atoms=${result.atomIds.length} chunks=${result.chunkIds.length} events=${result.eventIds.length} (${result.searchTime}ms)`
+    xbLog.info(
+        MODULE_ID,
+        `Lexical search terms=[${queryTerms.slice(0, 5).join(',')}] chunks=${result.chunkIds.length} events=${result.eventIds.length} termSearches=${result.termSearches} (${result.searchTime}ms)`,
    );

    return result;
 }

-// ─────────────────────────────────────────────────────────────────────────
-// 内部构建流程（收集数据 + 构建索引）
-// ─────────────────────────────────────────────────────────────────────────
-
-/**
- * 收集数据并构建索引
- *
- * @param {string} chatId
- * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
- */
 async function collectAndBuild(chatId) {
-    // 清空侧索引（全量重建）
    floorDocIds = new Map();

-    // 收集数据（不含 L0 atoms）
    const store = getSummaryStore();
    const events = store?.json?.events || [];

@@ -301,48 +352,44 @@ async function collectAndBuild(chatId) {
    try {
        chunks = await getAllChunks(chatId);
    } catch (e) {
-        xbLog.warn(MODULE_ID, '获取 chunks 失败', e);
+        xbLog.warn(MODULE_ID, 'Failed to load chunks', e);
    }

-    const fp = computeFingerprint(chunks.length, events.length);
+    const docs = collectDocuments(chunks, events);
+    const fp = computeFingerprintFromDocs(docs);

-    // 检查是否在收集过程中缓存已被其他调用更新
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
        return { index: cachedIndex, fingerprint: fp };
    }

-    // 收集文档（同时填充 floorDocIds）
-    const docs = collectDocuments(chunks, events);
-
-    // 异步分片构建
+    rebuildIdfFromDocs(docs);
    const index = await buildIndexAsync(docs);

    return { index, fingerprint: fp };
 }

-// ─────────────────────────────────────────────────────────────────────────
-// 公开接口：getLexicalIndex（惰性获取）
-// ─────────────────────────────────────────────────────────────────────────
-
 /**
- * 获取词法索引（惰性构建 + 缓存）
- *
- * 如果缓存有效则直接返回；否则自动构建。
- * 如果正在构建中，等待构建完成。
- *
- * @returns {Promise<MiniSearch|null>}
+ * Expose IDF accessor for query-term selection in query-builder.
+ * If index stats are not ready, this gracefully falls back to idf=1.
 */
+export function getLexicalIdfAccessor() { // Returns { enabled, docCount, getIdf } built over the module-level IDF stats.
+    return {
+        enabled: lexicalDocCount > 0, // Evaluated once, when the accessor object is created.
+        docCount: lexicalDocCount, // Also a snapshot taken at creation time.
+        getIdf(term) {
+            return computeIdf(term); // NOTE(review): reads the stats current at call time, so it can disagree with the snapshot fields above if the index changes in between.
+        },
+    };
+}
+
 export async function getLexicalIndex() {
    const { chatId } = getContext();
    if (!chatId) return null;

-    // 快速路径：如果缓存存在且 chatId 未变，则直接命中
-    // 指纹校验放到构建流程中完成，避免为指纹而额外读一次 IndexedDB
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
        return cachedIndex;
    }

-    // 正在构建中，等待结果
    if (building && buildPromise) {
        try {
            await buildPromise;
@@ -350,27 +397,23 @@ export async function getLexicalIndex() {
                return cachedIndex;
            }
        } catch {
-            // 构建失败，继续往下重建
+            // Continue to rebuild below.
        }
    }

-    // 需要重建（指纹将在 collectAndBuild 内部计算并写入缓存）
-    xbLog.info(MODULE_ID, `缓存失效，重建索引 (chatId=${chatId.slice(0, 8)})`);
+    xbLog.info(MODULE_ID, `Lexical cache miss; rebuilding (chatId=${chatId.slice(0, 8)})`);

    building = true;
    buildPromise = collectAndBuild(chatId);

    try {
        const { index, fingerprint } = await buildPromise;
-
-        // 原子替换缓存
        cachedIndex = index;
        cachedChatId = chatId;
        cachedFingerprint = fingerprint;
-
        return index;
    } catch (e) {
-        xbLog.error(MODULE_ID, '索引构建失败', e);
+        xbLog.error(MODULE_ID, 'Index build failed', e);
        return null;
    } finally {
        building = false;
@@ -378,74 +421,29 @@ export async function getLexicalIndex() {
    }
 }

-// ─────────────────────────────────────────────────────────────────────────
-// 公开接口：warmupIndex（异步预建）
-// ─────────────────────────────────────────────────────────────────────────
-
-/**
- * 异步预建索引
- *
- * 在 CHAT_CHANGED 时调用，后台构建索引。
- * 不阻塞调用方，不返回结果。
- * 构建完成后缓存自动更新，后续 getLexicalIndex() 直接命中。
- *
- * 调用时机：
- * - handleChatChanged（实体注入后）
- * - L0 提取完成
- * - L2 总结完成
- */
 export function warmupIndex() {
    const { chatId } = getContext();
-    if (!chatId) return;
+    if (!chatId || building) return;

-    // 已在构建中，不重复触发
-    if (building) return;
-
-    // fire-and-forget
    getLexicalIndex().catch(e => {
-        xbLog.warn(MODULE_ID, '预热索引失败', e);
+        xbLog.warn(MODULE_ID, 'Warmup failed', e);
    });
 }

-// ─────────────────────────────────────────────────────────────────────────
-// 公开接口：invalidateLexicalIndex（缓存失效）
-// ─────────────────────────────────────────────────────────────────────────
-
-/**
- * 使缓存失效（下次 getLexicalIndex / warmupIndex 时自动重建）
- *
- * 调用时机：
- * - CHAT_CHANGED
- * - L0 提取完成
- * - L2 总结完成
- */
 export function invalidateLexicalIndex() {
    if (cachedIndex) {
-        xbLog.info(MODULE_ID, '索引缓存已失效');
+        xbLog.info(MODULE_ID, 'Lexical index cache invalidated');
    }
    cachedIndex = null;
    cachedChatId = null;
    cachedFingerprint = null;
    floorDocIds = new Map();
+    clearIdfState();
 }

-// ─────────────────────────────────────────────────────────────────────────
-// 增量更新接口
-// ─────────────────────────────────────────────────────────────────────────
-
-/**
- * 为指定楼层添加 L1 chunks 到索引
- *
- * 先移除该楼层旧文档，再添加新文档。
- * 如果索引不存在（缓存失效），静默跳过（下次 getLexicalIndex 全量重建）。
- *
- * @param {number} floor - 楼层号
- * @param {object[]} chunks - chunk 对象列表（需有 chunkId、text、floor）
- */
 export function addDocumentsForFloor(floor, chunks) {
    if (!cachedIndex || !chunks?.length) return;

-    // 先移除旧文档
    removeDocumentsByFloor(floor);

    const docs = [];
@@ -453,30 +451,29 @@ export function addDocumentsForFloor(floor, chunks) {

    for (const chunk of chunks) {
        if (!chunk?.chunkId || !chunk.text) continue;
-        docs.push({
+
+        const doc = {
            id: chunk.chunkId,
            type: 'chunk',
            floor: chunk.floor ?? floor,
            text: chunk.text,
-        });
+        };
+        docs.push(doc);
        docIds.push(chunk.chunkId);
    }

-    if (docs.length > 0) {
-        cachedIndex.addAll(docs);
-        floorDocIds.set(floor, docIds);
-        xbLog.info(MODULE_ID, `增量添加: floor ${floor}, ${docs.length} 个 chunk`);
+    if (!docs.length) return;
+
+    cachedIndex.addAll(docs);
+    floorDocIds.set(floor, docIds);
+
+    for (const doc of docs) {
+        addDocumentIdf(doc.id, doc.text);
    }
+
+    xbLog.info(MODULE_ID, `Incremental add floor=${floor} chunks=${docs.length}`);
 }

-/**
- * 从索引中移除指定楼层的所有 L1 chunk 文档
- *
- * 使用 MiniSearch discard()（软删除）。
- * 如果索引不存在，静默跳过。
- *
- * @param {number} floor - 楼层号
- */
 export function removeDocumentsByFloor(floor) {
    if (!cachedIndex) return;

@@ -487,55 +484,39 @@ export function removeDocumentsByFloor(floor) {
        try {
            cachedIndex.discard(id);
        } catch {
-            // 文档可能不存在（已被全量重建替换）
+            // Ignore if the doc was already removed/rebuilt.
        }
+        removeDocumentIdf(id);
    }

    floorDocIds.delete(floor);
-    xbLog.info(MODULE_ID, `增量移除: floor ${floor}, ${docIds.length} 个文档`);
+    xbLog.info(MODULE_ID, `Incremental remove floor=${floor} chunks=${docIds.length}`);
 }

-/**
- * 将新 L2 事件添加到索引
- *
- * 如果事件 ID 已存在，先 discard 再 add（覆盖）。
- * 如果索引不存在，静默跳过。
- *
- * @param {object[]} events - 事件对象列表（需有 id、title、summary 等）
- */
 export function addEventDocuments(events) {
    if (!cachedIndex || !events?.length) return;

    const docs = [];

    for (const ev of events) {
-        if (!ev?.id) continue;
+        const doc = buildEventDoc(ev);
+        if (!doc) continue;

-        const parts = [];
-        if (ev.title) parts.push(ev.title);
-        if (ev.participants?.length) parts.push(ev.participants.join(' '));
-        const summary = cleanSummary(ev.summary);
-        if (summary) parts.push(summary);
-        const text = parts.join(' ').trim();
-        if (!text) continue;
-
-        // 覆盖：先尝试移除旧的
        try {
-            cachedIndex.discard(ev.id);
+            cachedIndex.discard(doc.id);
        } catch {
-            // 不存在则忽略
+            // Ignore if previous document does not exist.
        }
-
-        docs.push({
-            id: ev.id,
-            type: 'event',
-            floor: null,
-            text,
-        });
+        removeDocumentIdf(doc.id);
+        docs.push(doc);
    }

-    if (docs.length > 0) {
-        cachedIndex.addAll(docs);
-        xbLog.info(MODULE_ID, `增量添加: ${docs.length} 个事件`);
+    if (!docs.length) return;
+
+    cachedIndex.addAll(docs);
+    for (const doc of docs) {
+        addDocumentIdf(doc.id, doc.text);
    }
+
+    xbLog.info(MODULE_ID, `Incremental add events=${docs.length}`);
 }
--- a/modules/story-summary/vector/retrieval/metrics.js
+++ b/modules/story-summary/vector/retrieval/metrics.js
@@ -52,6 +52,10 @@ export function createMetrics() {
            eventHits: 0,
            searchTime: 0,
            indexReadyTime: 0,
+            idfEnabled: false,
+            idfDocCount: 0,
+            topIdfTerms: [],
+            termSearches: 0,
            eventFilteredByDense: 0,
            floorFilteredByDense: 0,
        },
@@ -274,6 +278,20 @@ export function formatMetricsLog(metrics) {
    if (m.lexical.indexReadyTime > 0) {
        lines.push(`├─ index_ready_time: ${m.lexical.indexReadyTime}ms`);
    }
+    lines.push(`├─ idf_enabled: ${!!m.lexical.idfEnabled}`);
+    if (m.lexical.idfDocCount > 0) {
+        lines.push(`├─ idf_doc_count: ${m.lexical.idfDocCount}`);
+    }
+    if ((m.lexical.topIdfTerms || []).length > 0) {
+        const topIdfText = m.lexical.topIdfTerms
+            .slice(0, 5)
+            .map(x => `${x.term}:${x.idf}`)
+            .join(', ');
+        lines.push(`├─ top_idf_terms: [${topIdfText}]`);
+    }
+    if (m.lexical.termSearches > 0) {
+        lines.push(`├─ term_searches: ${m.lexical.termSearches}`);
+    }
    if (m.lexical.eventFilteredByDense > 0) {
        lines.push(`├─ event_filtered_by_dense: ${m.lexical.eventFilteredByDense}`);
    }
--- a/modules/story-summary/vector/retrieval/query-builder.js
+++ b/modules/story-summary/vector/retrieval/query-builder.js
@@ -20,6 +20,7 @@

 import { getContext } from '../../../../../../../extensions.js';
 import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText, buildCharacterPools } from './entity-lexicon.js';
+import { getLexicalIdfAccessor } from './lexical-index.js';
 import { getSummaryStore } from '../../data/store.js';
 import { filterText } from '../utils/text-filter.js';
 import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
@@ -106,6 +107,7 @@ export function computeLengthFactor(charCount) {
 function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
    if (!text) return [];

+    const idfAccessor = getLexicalIdfAccessor();
    const tokens = tokenizerTokenizeForIndex(text);
    const freq = new Map();
    for (const token of tokens) {
@@ -115,9 +117,13 @@ function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
    }

    return Array.from(freq.entries())
-        .sort((a, b) => b[1] - a[1])
+        .map(([term, tf]) => {
+            const idf = idfAccessor.enabled ? idfAccessor.getIdf(term) : 1;
+            return { term, tf, score: tf * idf };
+        })
+        .sort((a, b) => (b.score - a.score) || (b.tf - a.tf))
        .slice(0, maxTerms)
-        .map(([term]) => term);
+        .map(x => x.term);
 }

 // ─────────────────────────────────────────────────────────────────────────
--- a/modules/story-summary/vector/retrieval/recall.js
+++ b/modules/story-summary/vector/retrieval/recall.js
@@ -984,6 +984,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
        : CONFIG.LAST_MESSAGES_K;
    const lastMessages = getLastMessages(chat, lastMessagesCount, excludeLastAi);

+    // Non-blocking preload: keep recall latency stable.
+    // If not ready yet, query-builder will gracefully fall back to TF terms.
+    getLexicalIndex().catch((e) => {
+        xbLog.warn(MODULE_ID, 'Preload lexical index failed; continue with TF fallback', e);
+    });
+
    const bundle = buildQueryBundle(lastMessages, pendingUserMessage);
    const focusTerms = bundle.focusTerms || bundle.focusEntities || [];
    const focusCharacters = bundle.focusCharacters || [];
@@ -1161,6 +1167,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
        atomIds: [], atomFloors: new Set(),
        chunkIds: [], chunkFloors: new Set(),
        eventIds: [], chunkScores: [], searchTime: 0,
+        idfEnabled: false, idfDocCount: 0, topIdfTerms: [], termSearches: 0,
    };

    let indexReadyTime = 0;
@@ -1184,6 +1191,10 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
        metrics.lexical.searchTime = lexicalResult.searchTime || 0;
        metrics.lexical.indexReadyTime = indexReadyTime;
        metrics.lexical.terms = bundle.lexicalTerms.slice(0, 10);
+        metrics.lexical.idfEnabled = !!lexicalResult.idfEnabled;
+        metrics.lexical.idfDocCount = lexicalResult.idfDocCount || 0;
+        metrics.lexical.topIdfTerms = lexicalResult.topIdfTerms || [];
+        metrics.lexical.termSearches = lexicalResult.termSearches || 0;
    }

    // 合并 L2 events（lexical 命中但 dense 未命中的 events）
@@ -1238,7 +1249,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
    }

    xbLog.info(MODULE_ID,
-        `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`
+        `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} idfEnabled=${lexicalResult.idfEnabled ? 'yes' : 'no'} idfDocs=${lexicalResult.idfDocCount || 0} termSearches=${lexicalResult.termSearches || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`
    );

    // ═══════════════════════════════════════════════════════════════════
--- a/modules/story-summary/vector/utils/stopwords-base.js
+++ b/modules/story-summary/vector/utils/stopwords-base.js
--- a/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt
+++ b/modules/story-summary/vector/utils/stopwords-data/LICENSE.stopwords-iso.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2020 Gene Diaz
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/modules/story-summary/vector/utils/stopwords-data/SOURCES.md
+++ b/modules/story-summary/vector/utils/stopwords-data/SOURCES.md
@@ -0,0 +1,15 @@
+# stopwords sources for story-summary
+
+- Dataset: `stopwords-iso` (npm package, version 1.1.0)
+- Repository: https://github.com/stopwords-iso/stopwords-iso
+- License: MIT
+- Snapshot date: 2026-02-16
+- Languages used: `zh`, `ja`, `en`
+- Local snapshot files:
+  - `stopwords-iso.zh.txt`
+  - `stopwords-iso.ja.txt`
+  - `stopwords-iso.en.txt`
+
+Generation note:
+- `modules/story-summary/vector/utils/stopwords-base.js` is generated from these snapshot files.
+- Keep `stopwords-patch.js` for tiny domain overrides only.
--- a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.en.txt
--- a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.ja.txt
@@ -0,0 +1,134 @@
+あそこ
+あっ
+あの
+あのかた
+あの人
+あり
+あります
+ある
+あれ
+い
+いう
+います
+いる
+う
+うち
+え
+お
+および
+おり
+おります
+か
+かつて
+から
+が
+き
+ここ
+こちら
+こと
+この
+これ
+これら
+さ
+さらに
+し
+しかし
+する
+ず
+せ
+せる
+そこ
+そして
+その
+その他
+その後
+それ
+それぞれ
+それで
+た
+ただし
+たち
+ため
+たり
+だ
+だっ
+だれ
+つ
+て
+で
+でき
+できる
+です
+では
+でも
+と
+という
+といった
+とき
+ところ
+として
+とともに
+とも
+と共に
+どこ
+どの
+な
+ない
+なお
+なかっ
+ながら
+なく
+なっ
+など
+なに
+なら
+なり
+なる
+なん
+に
+において
+における
+について
+にて
+によって
+により
+による
+に対して
+に対する
+に関する
+の
+ので
+のみ
+は
+ば
+へ
+ほか
+ほとんど
+ほど
+ます
+また
+または
+まで
+も
+もの
+ものの
+や
+よう
+より
+ら
+られ
+られる
+れ
+れる
+を
+ん
+何
+及び
+彼
+彼女
+我々
+特に
+私
+私達
+貴方
+貴方方
--- a/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt
+++ b/modules/story-summary/vector/utils/stopwords-data/stopwords-iso.zh.txt
@@ -0,0 +1,794 @@
+、
+。
+〈
+〉
+《
+》
+一
+一个
+一些
+一何
+一切
+一则
+一方面
+一旦
+一来
+一样
+一种
+一般
+一转眼
+七
+万一
+三
+上
+上下
+下
+不
+不仅
+不但
+不光
+不单
+不只
+不外乎
+不如
+不妨
+不尽
+不尽然
+不得
+不怕
+不惟
+不成
+不拘
+不料
+不是
+不比
+不然
+不特
+不独
+不管
+不至于
+不若
+不论
+不过
+不问
+与
+与其
+与其说
+与否
+与此同时
+且
+且不说
+且说
+两者
+个
+个别
+中
+临
+为
+为了
+为什么
+为何
+为止
+为此
+为着
+乃
+乃至
+乃至于
+么
+之
+之一
+之所以
+之类
+乌乎
+乎
+乘
+九
+也
+也好
+也罢
+了
+二
+二来
+于
+于是
+于是乎
+云云
+云尔
+五
+些
+亦
+人
+人们
+人家
+什
+什么
+什么样
+今
+介于
+仍
+仍旧
+从
+从此
+从而
+他
+他人
+他们
+他们们
+以
+以上
+以为
+以便
+以免
+以及
+以故
+以期
+以来
+以至
+以至于
+以致
+们
+任
+任何
+任凭
+会
+似的
+但
+但凡
+但是
+何
+何以
+何况
+何处
+何时
+余外
+作为
+你
+你们
+使
+使得
+例如
+依
+依据
+依照
+便于
+俺
+俺们
+倘
+倘使
+倘或
+倘然
+倘若
+借
+借傥然
+假使
+假如
+假若
+做
+像
+儿
+先不先
+光
+光是
+全体
+全部
+八
+六
+兮
+共
+关于
+关于具体地说
+其
+其一
+其中
+其二
+其他
+其余
+其它
+其次
+具体地说
+具体说来
+兼之
+内
+再
+再其次
+再则
+再有
+再者
+再者说
+再说
+冒
+冲
+况且
+几
+几时
+凡
+凡是
+凭
+凭借
+出于
+出来
+分
+分别
+则
+则甚
+别
+别人
+别处
+别是
+别的
+别管
+别说
+到
+前后
+前此
+前者
+加之
+加以
+区
+即
+即令
+即使
+即便
+即如
+即或
+即若
+却
+去
+又
+又及
+及
+及其
+及至
+反之
+反而
+反过来
+反过来说
+受到
+另
+另一方面
+另外
+另悉
+只
+只当
+只怕
+只是
+只有
+只消
+只要
+只限
+叫
+叮咚
+可
+可以
+可是
+可见
+各
+各个
+各位
+各种
+各自
+同
+同时
+后
+后者
+向
+向使
+向着
+吓
+吗
+否则
+吧
+吧哒
+含
+吱
+呀
+呃
+呕
+呗
+呜
+呜呼
+呢
+呵
+呵呵
+呸
+呼哧
+咋
+和
+咚
+咦
+咧
+咱
+咱们
+咳
+哇
+哈
+哈哈
+哉
+哎
+哎呀
+哎哟
+哗
+哟
+哦
+哩
+哪
+哪个
+哪些
+哪儿
+哪天
+哪年
+哪怕
+哪样
+哪边
+哪里
+哼
+哼唷
+唉
+唯有
+啊
+啐
+啥
+啦
+啪达
+啷当
+喂
+喏
+喔唷
+喽
+嗡
+嗡嗡
+嗬
+嗯
+嗳
+嘎
+嘎登
+嘘
+嘛
+嘻
+嘿
+嘿嘿
+四
+因
+因为
+因了
+因此
+因着
+因而
+固然
+在
+在下
+在于
+地
+基于
+处在
+多
+多么
+多少
+大
+大家
+她
+她们
+好
+如
+如上
+如上所述
+如下
+如何
+如其
+如同
+如是
+如果
+如此
+如若
+始而
+孰料
+孰知
+宁
+宁可
+宁愿
+宁肯
+它
+它们
+对
+对于
+对待
+对方
+对比
+将
+小
+尔
+尔后
+尔尔
+尚且
+就
+就是
+就是了
+就是说
+就算
+就要
+尽
+尽管
+尽管如此
+岂但
+己
+已
+已矣
+巴
+巴巴
+年
+并
+并且
+庶乎
+庶几
+开外
+开始
+归
+归齐
+当
+当地
+当然
+当着
+彼
+彼时
+彼此
+往
+待
+很
+得
+得了
+怎
+怎么
+怎么办
+怎么样
+怎奈
+怎样
+总之
+总的来看
+总的来说
+总的说来
+总而言之
+恰恰相反
+您
+惟其
+慢说
+我
+我们
+或
+或则
+或是
+或曰
+或者
+截至
+所
+所以
+所在
+所幸
+所有
+才
+才能
+打
+打从
+把
+抑或
+拿
+按
+按照
+换句话说
+换言之
+据
+据此
+接着
+故
+故此
+故而
+旁人
+无
+无宁
+无论
+既
+既往
+既是
+既然
+日
+时
+时候
+是
+是以
+是的
+更
+曾
+替
+替代
+最
+月
+有
+有些
+有关
+有及
+有时
+有的
+望
+朝
+朝着
+本
+本人
+本地
+本着
+本身
+来
+来着
+来自
+来说
+极了
+果然
+果真
+某
+某个
+某些
+某某
+根据
+欤
+正值
+正如
+正巧
+正是
+此
+此地
+此处
+此外
+此时
+此次
+此间
+毋宁
+每
+每当
+比
+比及
+比如
+比方
+没奈何
+沿
+沿着
+漫说
+点
+焉
+然则
+然后
+然而
+照
+照着
+犹且
+犹自
+甚且
+甚么
+甚或
+甚而
+甚至
+甚至于
+用
+用来
+由
+由于
+由是
+由此
+由此可见
+的
+的确
+的话
+直到
+相对而言
+省得
+看
+眨眼
+着
+着呢
+矣
+矣乎
+矣哉
+离
+秒
+称
+竟而
+第
+等
+等到
+等等
+简言之
+管
+类如
+紧接着
+纵
+纵令
+纵使
+纵然
+经
+经过
+结果
+给
+继之
+继后
+继而
+综上所述
+罢了
+者
+而
+而且
+而况
+而后
+而外
+而已
+而是
+而言
+能
+能否
+腾
+自
+自个儿
+自从
+自各儿
+自后
+自家
+自己
+自打
+自身
+至
+至于
+至今
+至若
+致
+般的
+若
+若夫
+若是
+若果
+若非
+莫不然
+莫如
+莫若
+虽
+虽则
+虽然
+虽说
+被
+要
+要不
+要不是
+要不然
+要么
+要是
+譬喻
+譬如
+让
+许多
+论
+设使
+设或
+设若
+诚如
+诚然
+该
+说
+说来
+请
+诸
+诸位
+诸如
+谁
+谁人
+谁料
+谁知
+贼死
+赖以
+赶
+起
+起见
+趁
+趁着
+越是
+距
+跟
+较
+较之
+边
+过
+还
+还是
+还有
+还要
+这
+这一来
+这个
+这么
+这么些
+这么样
+这么点儿
+这些
+这会儿
+这儿
+这就是说
+这时
+这样
+这次
+这般
+这边
+这里
+进而
+连
+连同
+逐步
+通过
+遵循
+遵照
+那
+那个
+那么
+那么些
+那么样
+那些
+那会儿
+那儿
+那时
+那样
+那般
+那边
+那里
+都
+鄙人
+鉴于
+针对
+阿
+除
+除了
+除外
+除开
+除此之外
+除非
+随
+随后
+随时
+随着
+难道说
+零
+非
+非但
+非徒
+非特
+非独
+靠
+顺
+顺着
+首先
+︿
+！
+＃
+＄
+％
+＆
+（
+）
+＊
+＋
+，
+０
+１
+２
+３
+４
+５
+６
+７
+８
+９
+：
+；
+＜
+＞
+？
+＠
+［
+］
+｛
+｜
+｝
+～
+￥
--- a/modules/story-summary/vector/utils/stopwords-patch.js
+++ b/modules/story-summary/vector/utils/stopwords-patch.js
@@ -0,0 +1,9 @@
+// Small domain-level tuning surface for the tokenizer's stopword filter.
+// Keep this file tiny: add/remove only words that are repeatedly noisy in real logs.
+
+// Extra stopwords merged with BASE_STOP_WORDS by tokenizer.js (entries are trimmed and lowercased there).
+export const DOMAIN_STOP_WORDS = [];
+
+// High-value words that must never be filtered as stopwords, even when a stopword list contains them.
+// Defaults to empty for plugin-wide deployment; entity names are already whitelisted at runtime by tokenizer.js.
+export const KEEP_WORDS = [];
--- a/modules/story-summary/vector/utils/tokenizer.js
+++ b/modules/story-summary/vector/utils/tokenizer.js
@@ -18,6 +18,8 @@

 import { extensionFolderPath } from '../../../../core/constants.js';
 import { xbLog } from '../../../../core/debug-core.js';
+import { BASE_STOP_WORDS } from './stopwords-base.js';
+import { DOMAIN_STOP_WORDS, KEEP_WORDS } from './stopwords-patch.js';

 const MODULE_ID = 'tokenizer';

@@ -61,44 +63,30 @@ let entityList = [];

 /** @type {Set<string>} 已注入结巴的实体（避免重复 add_word） */
 let injectedEntities = new Set();
+let entityKeepSet = new Set();

 // ═══════════════════════════════════════════════════════════════════════════
 // 停用词
 // ═══════════════════════════════════════════════════════════════════════════

-const STOP_WORDS = new Set([
-    // 中文高频虚词
-    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
-    '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
-    '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
-    '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
-    '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
-    '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
-    '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
-    '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
-    '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
-    '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
-    '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
-    // 日语常见虚词（≥2字，匹配 TinySegmenter 产出粒度）
-    'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある',
-    'なる', 'れる', 'られ', 'られる',
-    'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
-    'これ', 'それ', 'あれ', 'どれ',
-    'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
-    'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
-    // 英文常见停用词
-    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
-    'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
-    'would', 'could', 'should', 'may', 'might', 'can', 'shall',
-    'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
-    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
-    'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
-    'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
-    'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
-    'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
-    'both', 'few', 'more', 'most', 'other', 'some', 'such',
-    'only', 'own', 'same', 'just', 'very', 'also', 'about',
-]);
+// Canonical form for word-list entries: trimmed, lowercased, blank entries dropped.
+const toNormalizedWordList = words => words
+    .map(w => String(w || '').trim().toLowerCase())
+    .filter(Boolean);
+
+// Static whitelist taken from the domain patch; consulted by shouldKeepTokenByWhitelist.
+const STATIC_KEEP_WORDS = new Set(toNormalizedWordList(KEEP_WORDS || []));
+
+// Standard source only: stopwords-iso snapshot + small domain patch.
+const EFFECTIVE_STOP_WORDS = new Set(toNormalizedWordList([...BASE_STOP_WORDS, ...DOMAIN_STOP_WORDS]));
+
+// True when the token (trimmed, lowercased) is a static keep-word or a currently injected entity name.
+function shouldKeepTokenByWhitelist(token) {
+    const normalized = String(token || '').trim().toLowerCase();
+    if (normalized === '') return false;
+    // A hit in either whitelist exempts the token from stopword filtering.
+    return STATIC_KEEP_WORDS.has(normalized) || entityKeepSet.has(normalized);
+}

 // ═══════════════════════════════════════════════════════════════════════════
 // Unicode 分类
@@ -571,6 +559,7 @@ export function getState() {
 export function injectEntities(lexicon, displayMap) {
    if (!lexicon?.size) {
        entityList = [];
+        entityKeepSet = new Set();
        return;
    }

@@ -586,6 +575,7 @@ export function injectEntities(lexicon, displayMap) {
    // 按长度降序（最长匹配优先）
    entities.sort((a, b) => b.length - a.length);
    entityList = entities;
+    entityKeepSet = new Set(entities.map(e => String(e || '').trim().toLowerCase()).filter(Boolean));

    // 如果结巴已就绪，注入自定义词
    if (wasmState === WasmState.READY && jiebaAddWord) {
@@ -656,7 +646,7 @@ export function tokenize(text) {

        if (!cleaned) continue;
        if (cleaned.length < 2) continue;
-        if (STOP_WORDS.has(cleaned)) continue;
+        if (EFFECTIVE_STOP_WORDS.has(cleaned) && !shouldKeepTokenByWhitelist(cleaned)) continue;
        if (seen.has(cleaned)) continue;

        // 过滤纯标点/特殊字符
@@ -728,7 +718,7 @@ export function tokenizeForIndex(text) {
        .map(t => t.trim().toLowerCase())
        .filter(t => {
            if (!t || t.length < 2) return false;
-            if (STOP_WORDS.has(t)) return false;
+            if (EFFECTIVE_STOP_WORDS.has(t) && !shouldKeepTokenByWhitelist(t)) return false;
            if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) return false;
            return true;
        });
@@ -744,6 +734,7 @@ export function tokenizeForIndex(text) {
 */
 export function reset() {
    entityList = [];
+    entityKeepSet = new Set();
    injectedEntities.clear();
    // 不重置 WASM 状态（避免重复加载）
 }