Zero-darkbox query updates and tokenizer improvements

2026-02-09 20:25:26 +08:00
parent 8131d6a15f
commit 0a28539b29
14 changed files with 1771 additions and 175 deletions
--- a/modules/story-summary/vector/llm/reranker.js
+++ b/modules/story-summary/vector/llm/reranker.js
@@ -78,7 +78,8 @@ export async function rerank(query, documents, options = {}) {
            },
            body: JSON.stringify({
                model: RERANK_MODEL,
-                query: query.slice(0, 1000),  // 限制 query 长度
+                // Zero-darkbox: do not silently truncate query.
+                query,
                documents: validDocs,
                top_n: Math.min(topN, validDocs.length),
                return_documents: false,
--- a/modules/story-summary/vector/retrieval/lexical-index.js
+++ b/modules/story-summary/vector/retrieval/lexical-index.js
@@ -4,9 +4,10 @@
 // 职责：
 // 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
 // 2. 提供词法检索接口（专名精确匹配兜底）
-// 3. 惰性构建 + 缓存失效机制
+// 3. 惰性构建 + 异步预热 + 缓存失效机制
 //
 // 索引存储：纯内存（不持久化）
+// 分词器：统一使用 tokenizer.js（结巴 + 实体保护 + 降级）
 // 重建时机：CHAT_CHANGED / L0提取完成 / L2总结完成
 // ═══════════════════════════════════════════════════════════════════════════

@@ -16,6 +17,7 @@ import { getSummaryStore } from '../../data/store.js';
 import { getStateAtoms } from '../storage/state-store.js';
 import { getAllChunks } from '../storage/chunk-store.js';
 import { xbLog } from '../../../../core/debug-core.js';
+import { tokenizeForIndex } from '../utils/tokenizer.js';

 const MODULE_ID = 'lexical-index';

@@ -23,9 +25,20 @@ const MODULE_ID = 'lexical-index';
 // 缓存
 // ─────────────────────────────────────────────────────────────────────────

+/** @type {MiniSearch|null} Most recently built index, or null when nothing is cached. */
 let cachedIndex = null;
+
+/** @type {string|null} Chat id the cached index was built for. */
 let cachedChatId = null;
-let cachedFingerprint = null; // atoms.length + chunks.length + events.length 的简单指纹
+
+/** @type {string|null} Data fingerprint (atom, chunk and event counts) of the cached index. */
+let cachedFingerprint = null;
+
+/** @type {boolean} True while an index build is in progress. */
+let building = false;
+
+/** @type {Promise<{index: MiniSearch, fingerprint: string}>|null} In-flight build promise (re-entrancy guard). */
+let buildPromise = null;

 // ─────────────────────────────────────────────────────────────────────────
 // 工具函数
@@ -43,7 +56,7 @@ function cleanSummary(summary) {
 }

 /**
- * 计算缓存指纹（用于判断是否需要重建）
+ * 计算缓存指纹
 * @param {number} atomCount
 * @param {number} chunkCount
 * @param {number} eventCount
@@ -53,39 +66,27 @@ function computeFingerprint(atomCount, chunkCount, eventCount) {
    return `${atomCount}:${chunkCount}:${eventCount}`;
 }

+/**
+ * Hand control back to the event loop so that a long-running build does not
+ * keep the main thread (and therefore the UI) blocked.
+ * @returns {Promise<void>} Resolves once a zero-delay `setTimeout` has fired.
+ */
+function yieldToMain() {
+    return new Promise((done) => {
+        setTimeout(done, 0);
+    });
+}
+
 // ─────────────────────────────────────────────────────────────────────────
-// 索引构建
+// 文档收集
 // ─────────────────────────────────────────────────────────────────────────

 /**
- * 构建 MiniSearch 索引
- *
- * 索引三类文档：
- * - L0 atoms: { id: atomId, type: 'atom', floor, text: semantic }
- * - L1 chunks: { id: chunkId, type: 'chunk', floor, text: chunk.text }
- * - L2 events: { id: eventId, type: 'event', floor: null, text: title + participants + summary }
+ * 收集所有待索引文档
 *
 * @param {object[]} atoms  - getStateAtoms() 返回值
 * @param {object[]} chunks - getAllChunks(chatId) 返回值
 * @param {object[]} events - store.json.events
- * @returns {MiniSearch}
+ * @returns {object[]} 文档数组
 */
-export function buildLexicalIndex(atoms, chunks, events) {
-    const T0 = performance.now();
-
-    const index = new MiniSearch({
-        fields: ['text'],
-        storeFields: ['type', 'floor'],
-        idField: 'id',
-        searchOptions: {
-            boost: { text: 1 },
-            fuzzy: 0.2,
-            prefix: true,
-        },
-        // 中文友好的 tokenizer：按字符 bigram + 空格/标点分词
-        tokenize: chineseTokenize,
-    });
-
+function collectDocuments(atoms, chunks, events) {
    const docs = [];

    // L0 atoms
@@ -129,72 +130,58 @@ export function buildLexicalIndex(atoms, chunks, events) {
        });
    }

-    if (docs.length > 0) {
-        index.addAll(docs);
-    }
-
-    const elapsed = Math.round(performance.now() - T0);
-    xbLog.info(MODULE_ID, `索引构建完成: ${docs.length} 文档 (atoms=${atoms?.length || 0}, chunks=${chunks?.length || 0}, events=${events?.length || 0}) ${elapsed}ms`);
-
-    return index;
+    return docs;
 }

 // ─────────────────────────────────────────────────────────────────────────
-// 中文 Tokenizer
+// 索引构建（分片，不阻塞主线程）
 // ─────────────────────────────────────────────────────────────────────────

+/** Number of documents added to the index per batch. */
+const BUILD_BATCH_SIZE = 500;
+
 /**
- * 中文友好的分词器
+ * 构建 MiniSearch 索引（分片异步）
 *
- * 策略：
- * 1. 连续中文字符 → 滑动 bigram（"黄英梅" → "黄英", "英梅"）
- * 2. 连续非中文字符 → 按空格/标点分割
- * 3. 保留完整中文词（2-4字）作为额外 token
- *
- * @param {string} text
- * @returns {string[]}
+ * @param {object[]} docs - 文档数组
+ * @returns {Promise<MiniSearch>}
 */
-function chineseTokenize(text) {
-    if (!text) return [];
+async function buildIndexAsync(docs) {
+    const T0 = performance.now();

-    const tokens = [];
-    const s = String(text).toLowerCase();
+    const index = new MiniSearch({
+        fields: ['text'],
+        storeFields: ['type', 'floor'],
+        idField: 'id',
+        searchOptions: {
+            boost: { text: 1 },
+            fuzzy: 0.2,
+            prefix: true,
+        },
+        tokenize: tokenizeForIndex,
+    });

-    // 分离中文段和非中文段
-    const segments = s.split(/([\u4e00-\u9fff]+)/g);
+    if (!docs.length) {
+        return index;
+    }

-    for (const seg of segments) {
-        if (!seg) continue;
+    // Add documents in slices of BUILD_BATCH_SIZE, yielding to the main thread after each slice
+    for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
+        const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
+        index.addAll(batch);

-        // 中文段：bigram + 完整段（如果 2-6 字）
-        if (/^[\u4e00-\u9fff]+$/.test(seg)) {
-            // 完整段作为一个 token（如果长度合适）
-            if (seg.length >= 2 && seg.length <= 6) {
-                tokens.push(seg);
-            }
-
-            // bigram
-            for (let i = 0; i < seg.length - 1; i++) {
-                tokens.push(seg.slice(i, i + 2));
-            }
-
-            // trigram（对 3+ 字的段）
-            for (let i = 0; i < seg.length - 2; i++) {
-                tokens.push(seg.slice(i, i + 3));
-            }
-        } else {
-            // 非中文段：按空格/标点分割
-const words = seg.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/);
-            for (const w of words) {
-                const trimmed = w.trim();
-                if (trimmed.length >= 2) {
-                    tokens.push(trimmed);
-                }
-            }
+        // Yield only between batches; the last batch needs no extra pause
+        if (i + BUILD_BATCH_SIZE < docs.length) {
+            await yieldToMain();
         }
     }

-    return tokens;
+    const elapsed = Math.round(performance.now() - T0);
+    xbLog.info(MODULE_ID,
+        `索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
+    );
+
+    return index;
 }

 // ─────────────────────────────────────────────────────────────────────────
@@ -247,6 +234,8 @@ export function searchLexicalIndex(index, terms) {
            fuzzy: 0.2,
            prefix: true,
            combineWith: 'OR',
+            // 使用与索引相同的分词器
+            tokenize: tokenizeForIndex,
        });
    } catch (e) {
        xbLog.warn(MODULE_ID, '检索失败', e);
@@ -305,22 +294,17 @@ export function searchLexicalIndex(index, terms) {
 }

 // ─────────────────────────────────────────────────────────────────────────
-// 惰性缓存管理
+// 内部构建流程（收集数据 + 构建索引）
 // ─────────────────────────────────────────────────────────────────────────

 /**
- * 获取词法索引（惰性构建 + 缓存）
+ * 收集数据并构建索引
 *
- * 如果缓存有效则直接返回；否则自动构建。
- * 缓存失效条件：chatId 变化 / 数据指纹变化 / 手动 invalidate
- *
- * @returns {Promise<MiniSearch>}
+ * @param {string} chatId
+ * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
 */
-export async function getLexicalIndex() {
-    const { chatId } = getContext();
-    if (!chatId) return null;
-
-    // 收集当前数据
+async function collectAndBuild(chatId) {
+    // 收集数据
    const atoms = getStateAtoms() || [];
    const store = getSummaryStore();
    const events = store?.json?.events || [];
@@ -334,30 +318,118 @@ export async function getLexicalIndex() {

    const fp = computeFingerprint(atoms.length, chunks.length, events.length);

-    // 缓存命中
+    // 检查是否在收集过程中缓存已被其他调用更新
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
+        return { index: cachedIndex, fingerprint: fp };
+    }
+
+    // 收集文档
+    const docs = collectDocuments(atoms, chunks, events);
+
+    // 异步分片构建
+    const index = await buildIndexAsync(docs);
+
+    return { index, fingerprint: fp };
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：getLexicalIndex（惰性获取）
+// ─────────────────────────────────────────────────────────────────────────
+
+/**
+ * Get the lexical index (lazy build + cache).
+ *
+ * Returns the cached index when one exists for the current chat; otherwise
+ * builds one. While a build is in flight, callers wait for it instead of
+ * starting a second build, so at most one build runs at any time.
+ *
+ * @returns {Promise<MiniSearch|null>} The index, or null when there is no
+ *   active chat or the build failed (the failure is logged).
+ */
+export async function getLexicalIndex() {
+    const { chatId } = getContext();
+    if (!chatId) return null;
+
+    // Fast path: a cached index for the same chat is returned directly.
+    // The fingerprint is computed inside the build flow, which avoids an extra
+    // IndexedDB read here just to validate it.
+    if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
         return cachedIndex;
     }

-    // 重建
-    xbLog.info(MODULE_ID, `缓存失效，重建索引 (chatId=${chatId.slice(0, 8)}, fp=${fp})`);
+    // A build is in flight: wait for it, then re-check the cache. This is a loop
+    // rather than a single `if` so that several callers waiting on a failed (or
+    // other-chat) build do not all start their own rebuild: the first one to
+    // resume becomes the builder and the others wait on its promise.
+    while (building && buildPromise) {
+        const inFlight = buildPromise;
+        try {
+            await inFlight;
+        } catch {
+            // That build failed; fall through and decide whether to rebuild.
+        }
+        if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
+            return cachedIndex;
+        }
+        // Safety net: if the owner has not released the slot, do not spin on an
+        // already-settled promise.
+        if (buildPromise === inFlight) break;
+    }

-    const index = buildLexicalIndex(atoms, chunks, events);
+    // Rebuild needed (the fingerprint is computed in collectAndBuild and cached below)
+    xbLog.info(MODULE_ID, `缓存失效，重建索引 (chatId=${chatId.slice(0, 8)})`);

-    cachedIndex = index;
-    cachedChatId = chatId;
-    cachedFingerprint = fp;
+    const pending = collectAndBuild(chatId);
+    building = true;
+    buildPromise = pending;

-    return index;
+    try {
+        const { index, fingerprint } = await pending;
+
+        // Replace all three cache fields together
+        cachedIndex = index;
+        cachedChatId = chatId;
+        cachedFingerprint = fingerprint;
+
+        return index;
+    } catch (e) {
+        xbLog.error(MODULE_ID, '索引构建失败', e);
+        return null;
+    } finally {
+        // Release the build slot only if it still belongs to this call, so a
+        // newer build's state is never cleared by an older one finishing.
+        if (buildPromise === pending) {
+            building = false;
+            buildPromise = null;
+        }
+    }
 }

+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：warmupIndex（异步预建）
+// ─────────────────────────────────────────────────────────────────────────
+
 /**
- * 使缓存失效（下次 getLexicalIndex 时自动重建）
+ * 异步预建索引
+ *
+ * 在 CHAT_CHANGED 时调用，后台构建索引。
+ * 不阻塞调用方，不返回结果。
+ * 构建完成后缓存自动更新，后续 getLexicalIndex() 直接命中。
+ *
+ * 调用时机：
+ * - handleChatChanged（实体注入后）
+ * - L0 提取完成
+ * - L2 总结完成
+ */
+export function warmupIndex() {
+    const activeChatId = getContext().chatId;
+
+    // Nothing to warm up without an active chat, and no reason to trigger a
+    // second build while one is already running.
+    if (!activeChatId || building) {
+        return;
+    }
+
+    // Fire-and-forget: the result lands in the cache, failures are only logged.
+    const reportFailure = (err) => {
+        xbLog.warn(MODULE_ID, '预热索引失败', err);
+    };
+    getLexicalIndex().catch(reportFailure);
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：invalidateLexicalIndex（缓存失效）
+// ─────────────────────────────────────────────────────────────────────────
+
+/**
+ * 使缓存失效（下次 getLexicalIndex / warmupIndex 时自动重建）
 *
 * 调用时机：
 * - CHAT_CHANGED
- * - L0 提取完成（handleAnchorGenerate 完成后）
- * - L2 总结完成（onComplete 回调中）
+ * - L0 提取完成
+ * - L2 总结完成
 */
 export function invalidateLexicalIndex() {
    if (cachedIndex) {
--- a/modules/story-summary/vector/retrieval/metrics.js
+++ b/modules/story-summary/vector/retrieval/metrics.js
@@ -16,6 +16,11 @@ export function createMetrics() {
        query: {
            buildTime: 0,
            refineTime: 0,
+            lengths: {
+                v0Chars: 0,
+                v1Chars: null,     // null = NA
+                rerankChars: 0,
+            },
        },

        // Anchor (L0 StateAtoms) - 语义锚点
@@ -177,6 +182,13 @@ export function formatMetricsLog(metrics) {
    lines.push('════════════════════════════════════════');
    lines.push('');

+    // Query Length
+    lines.push('[Query Length] 查询长度');
+    lines.push(`├─ query_v0_chars: ${m.query?.lengths?.v0Chars ?? 0}`);
+    lines.push(`├─ query_v1_chars: ${m.query?.lengths?.v1Chars == null ? 'NA' : m.query.lengths.v1Chars}`);
+    lines.push(`└─ rerank_query_chars: ${m.query?.lengths?.rerankChars ?? 0}`);
+    lines.push('');
+
    // Query Build
    lines.push('[Query] 查询构建');
    lines.push(`├─ build_time: ${m.query.buildTime}ms`);
--- a/modules/story-summary/vector/retrieval/query-builder.js
+++ b/modules/story-summary/vector/retrieval/query-builder.js
@@ -12,36 +12,18 @@ import { getContext } from '../../../../../../../extensions.js';
 import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText } from './entity-lexicon.js';
 import { getSummaryStore } from '../../data/store.js';
 import { filterText } from '../utils/text-filter.js';
+import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';

 // ─────────────────────────────────────────────────────────────────────────
 // 常量
 // ─────────────────────────────────────────────────────────────────────────

-const DIALOGUE_MAX_CHARS = 400;
-const PENDING_MAX_CHARS = 400;
-const MEMORY_HINT_MAX_CHARS = 100;
+// Zero-darkbox policy:
+// - No internal truncation. We rely on model-side truncation / provider limits.
+// - If provider rejects due to length, we fail loudly and degrade explicitly.
 const MEMORY_HINT_ATOMS_MAX = 5;
 const MEMORY_HINT_EVENTS_MAX = 3;
-const RERANK_QUERY_MAX_CHARS = 500;
-const RERANK_SNIPPET_CHARS = 150;
 const LEXICAL_TERMS_MAX = 10;
-const LEXICAL_TERM_MIN_LEN = 2;
-const LEXICAL_TERM_MAX_LEN = 6;
-
-// 中文停用词（高频无意义词）
-const STOP_WORDS = new Set([
-    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
-    '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
-    '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
-    '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
-    '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
-    '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
-    '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
-    '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
-    '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
-    '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
-    '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
-]);

 // ─────────────────────────────────────────────────────────────────────────
 // 工具函数
@@ -65,10 +47,7 @@ function cleanMessageText(text) {
 * @param {number} maxLen
 * @returns {string}
 */
-function truncate(text, maxLen) {
-    if (!text || text.length <= maxLen) return text || '';
-    return text.slice(0, maxLen) + '…';
-}
+// truncate removed by design (zero-darkbox)

 /**
 * 清理事件摘要（移除楼层标记）
@@ -84,8 +63,7 @@ function cleanSummary(summary) {
 /**
 * 从文本中提取高频实词（用于词法检索）
 *
- * 策略：按中文字符边界 + 空格/标点分词，取长度 2-6 的片段
- * 过滤停用词，按频率排序
+ * 使用统一分词器（结巴 + 实体保护 + 停用词过滤），按频率排序
 *
 * @param {string} text - 清洗后的文本
 * @param {number} maxTerms - 最大词数
@@ -94,15 +72,15 @@ function cleanSummary(summary) {
 function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
    if (!text) return [];

-    // 提取连续中文片段 + 英文单词
-    const segments = text.match(/[\u4e00-\u9fff]{2,6}|[a-zA-Z]{3,}/g) || [];
+    // 使用统一分词器（索引用，不去重，保留词频）
+    const tokens = tokenizerTokenizeForIndex(text);

+    // 统计词频
    const freq = new Map();
-    for (const seg of segments) {
-        const s = seg.toLowerCase();
-        if (s.length < LEXICAL_TERM_MIN_LEN || s.length > LEXICAL_TERM_MAX_LEN) continue;
-        if (STOP_WORDS.has(s)) continue;
-        freq.set(s, (freq.get(s) || 0) + 1);
+    for (const token of tokens) {
+        const key = String(token || '').toLowerCase();
+        if (!key) continue;
+        freq.set(key, (freq.get(key) || 0) + 1);
    }

    return Array.from(freq.entries())
@@ -160,8 +138,9 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null,
        const clean = cleanMessageText(m.mes || '');

        if (clean) {
-            // ★ 修复 A：不使用楼层号，embedding 模型不需要
-            dialogueLines.push(`${speaker}: ${truncate(clean, DIALOGUE_MAX_CHARS)}`);
+            // 不使用楼层号，embedding 模型不需要
+            // 不截断，零暗箱
+            dialogueLines.push(`${speaker}: ${clean}`);
            allCleanText.push(clean);
        }
    }
@@ -191,30 +170,15 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null,
    }

    if (pendingClean) {
-        queryParts.push(`[PENDING_USER]\n${truncate(pendingClean, PENDING_MAX_CHARS)}`);
+        // 不截断，零暗箱
+        queryParts.push(`[PENDING_USER]\n${pendingClean}`);
    }

    const queryText_v0 = queryParts.join('\n\n');

-    // 6. 构建 rerankQuery（短版）
-    const rerankParts = [];
-
-    if (focusEntities.length > 0) {
-        rerankParts.push(focusEntities.join(' '));
-    }
-
-    for (const m of (lastMessages || [])) {
-        const clean = cleanMessageText(m.mes || '');
-        if (clean) {
-            rerankParts.push(truncate(clean, RERANK_SNIPPET_CHARS));
-        }
-    }
-
-    if (pendingClean) {
-        rerankParts.push(truncate(pendingClean, RERANK_SNIPPET_CHARS));
-    }
-
-    const rerankQuery = truncate(rerankParts.join('\n'), RERANK_QUERY_MAX_CHARS);
+    // 6. rerankQuery 与 embedding query 同源（零暗箱）
+    // 后续 refine 会把它升级为与 queryText_v1 同源。
+    const rerankQuery = queryText_v0;

    // 7. 构建 lexicalTerms
    const entityTerms = focusEntities.map(e => e.toLowerCase());
@@ -265,7 +229,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
    for (const hit of topAnchors) {
        const semantic = hit.atom?.semantic || '';
        if (semantic) {
-            hints.push(truncate(semantic, MEMORY_HINT_MAX_CHARS));
+            // 不截断，零暗箱
+            hints.push(semantic);
        }
    }

@@ -279,13 +244,15 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
            ? `${title}: ${summary}`
            : title || summary;
        if (line) {
-            hints.push(truncate(line, MEMORY_HINT_MAX_CHARS));
+            // 不截断，零暗箱
+            hints.push(line);
        }
    }

-    // 3. 构建 queryText_v1
+    // 3. 构建 queryText_v1（Hints 前置，最优先）
    if (hints.length > 0) {
-        bundle.queryText_v1 = bundle.queryText_v0 + `\n\n[MEMORY_HINTS]\n${hints.join('\n')}`;
+        const hintText = `[MEMORY_HINTS]\n${hints.join('\n')}`;
+        bundle.queryText_v1 = hintText + `\n\n` + bundle.queryText_v0;
    } else {
        bundle.queryText_v1 = bundle.queryText_v0;
    }
@@ -314,17 +281,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
        }
    }

-    // 5. 增强 rerankQuery
-    if (hints.length > 0) {
-        const hintKeywords = extractKeyTerms(hints.join(' '), 5);
-        if (hintKeywords.length > 0) {
-            const addition = hintKeywords.join(' ');
-            bundle.rerankQuery = truncate(
-                bundle.rerankQuery + '\n' + addition,
-                RERANK_QUERY_MAX_CHARS
-            );
-        }
-    }
+    // 5. rerankQuery 与最终 query 同源（零暗箱）
+    bundle.rerankQuery = bundle.queryText_v1 || bundle.queryText_v0;

    // 6. 增强 lexicalTerms
    if (hints.length > 0) {
--- a/modules/story-summary/vector/retrieval/recall.js
+++ b/modules/story-summary/vector/retrieval/recall.js
@@ -782,6 +782,14 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
    metrics.query.buildTime = Math.round(performance.now() - T_Build_Start);
    metrics.anchor.focusEntities = bundle.focusEntities;

+    // Query lengths (v0 available here)
+    if (metrics.query?.lengths) {
+        metrics.query.lengths.v0Chars = String(bundle.queryText_v0 || '').length;
+        // v1 not built yet
+        metrics.query.lengths.v1Chars = null;
+        metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v0 || '').length;
+    }
+
    xbLog.info(MODULE_ID,
        `Query Build: focus=[${bundle.focusEntities.join(',')}] lexTerms=[${bundle.lexicalTerms.slice(0, 5).join(',')}]`
    );
@@ -841,6 +849,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
    // 更新 focusEntities（refinement 可能扩展了）
    metrics.anchor.focusEntities = bundle.focusEntities;

+    // Query lengths (v1/rerank updated here)
+    if (metrics.query?.lengths) {
+        metrics.query.lengths.v1Chars = bundle.queryText_v1 == null ? null : String(bundle.queryText_v1).length;
+        metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v1 || bundle.queryText_v0 || '').length;
+    }
+
    xbLog.info(MODULE_ID,
        `Refinement: focus=[${bundle.focusEntities.join(',')}] hasV1=${!!bundle.queryText_v1} (${metrics.query.refineTime}ms)`
    );
--- a/modules/story-summary/vector/utils/tokenizer.js
+++ b/modules/story-summary/vector/utils/tokenizer.js
@@ -0,0 +1,650 @@
+// ═══════════════════════════════════════════════════════════════════════════
+// tokenizer.js - 统一分词器
+//
+// 职责：
+// 1. 管理结巴 WASM 生命周期（预加载 / 就绪检测 / 降级）
+// 2. 实体词典注入（分词前最长匹配保护）
+// 3. 亚洲文字（CJK + 假名）走结巴，拉丁文字走空格分割
+// 4. 提供 tokenize(text): string[] 统一接口
+//
+// 加载时机：
+// - 插件初始化时 storySummary.enabled && vectorConfig.enabled → preload()
+// - 向量开关从 off→on 时 → preload()
+// - CHAT_CHANGED 时 → injectEntities() + warmup 索引（不负责加载 WASM）
+//
+// 降级策略：
+// - WASM 未就绪时 → 实体保护 + 标点分割（不用 bigram）
+// ═══════════════════════════════════════════════════════════════════════════
+
+import { extensionFolderPath } from '../../../../core/constants.js';
+import { xbLog } from '../../../../core/debug-core.js';
+
+const MODULE_ID = 'tokenizer';
+
+// ═══════════════════════════════════════════════════════════════════════════
+// WASM 状态机
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Load states of the jieba WASM module.
+ * @enum {string}
+ */
+const WasmState = {
+    IDLE: 'IDLE',
+    LOADING: 'LOADING',
+    READY: 'READY',
+    FAILED: 'FAILED',
+};
+
+let wasmState = WasmState.IDLE;
+
+/** @type {Promise<boolean>|null} In-flight load promise (re-entrancy guard) */
+let loadingPromise = null;
+
+/** @type {typeof import('../../../../libs/jieba-wasm/jieba_rs_wasm.js')|null} */
+let jiebaModule = null;
+
+/** @type {Function|null} Reference to jieba's `cut` function */
+let jiebaCut = null;
+
+/** @type {Function|null} Reference to jieba's `add_word` function */
+let jiebaAddWord = null;
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 实体词典
+// ═══════════════════════════════════════════════════════════════════════════
+
+/** @type {string[]} Entity names sorted by length, longest first (for longest-match masking) */
+let entityList = [];
+
+/** @type {Set<string>} Entities already injected into jieba (avoids repeated add_word calls) */
+let injectedEntities = new Set();
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 停用词
+// ═══════════════════════════════════════════════════════════════════════════
+
+const STOP_WORDS = new Set([
+    // High-frequency Chinese function words
+    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
+    '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
+    '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
+    '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
+    '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
+    '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
+    '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
+    '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
+    '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
+    '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
+    '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
+    // Japanese particles and common function words
+    'は', 'が', 'を', 'に', 'で', 'と', 'の', 'も', 'へ', 'や',
+    'か', 'な', 'よ', 'ね', 'わ', 'だ', 'です', 'ます', 'た', 'て',
+    'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない',
+    'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
+    'これ', 'それ', 'あれ', 'どれ',
+    // Common English stop words
+    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
+    'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+    'would', 'could', 'should', 'may', 'might', 'can', 'shall',
+    'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
+    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
+    'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
+    'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
+    'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
+    'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
+    'both', 'few', 'more', 'most', 'other', 'some', 'such',
+    'only', 'own', 'same', 'just', 'very', 'also', 'about',
+]);
+
+// ═══════════════════════════════════════════════════════════════════════════
+// Unicode 分类
+// ═══════════════════════════════════════════════════════════════════════════
+
+/** Inclusive [low, high] code ranges treated as Asian script (CJK + kana). */
+const ASIAN_CODE_RANGES = [
+    [0x4E00, 0x9FFF],     // CJK Unified Ideographs
+    [0x3400, 0x4DBF],     // CJK Extension A
+    [0x3040, 0x309F],     // Hiragana
+    [0x30A0, 0x30FF],     // Katakana
+    [0x31F0, 0x31FF],     // Katakana Phonetic Extensions
+    [0xFF65, 0xFF9F],     // Halfwidth Katakana
+    [0xF900, 0xFAFF],     // CJK Compatibility Ideographs
+    [0x20000, 0x2A6DF],   // CJK Extension B
+];
+
+/**
+ * Tell whether a character code belongs to an Asian script (CJK + kana).
+ *
+ * The last range lies above 0xFFFF, so it can only match when `code` is a
+ * full code point rather than a 16-bit UTF-16 code unit.
+ *
+ * @param {number} code - character code to classify
+ * @returns {boolean} true when `code` falls inside one of the ranges
+ */
+function isAsian(code) {
+    return ASIAN_CODE_RANGES.some(([low, high]) => code >= low && code <= high);
+}
+
+/**
+ * Tell whether a character code is a Latin letter or an ASCII digit.
+ *
+ * Covers A-Z, a-z, 0-9 and the accented letters in U+00C0..U+024F. The two
+ * math operators that sit inside that span, U+00D7 (multiplication sign) and
+ * U+00F7 (division sign), are excluded so they act as separators rather than
+ * being glued into words.
+ *
+ * @param {number} code - character code to classify
+ * @returns {boolean} true for Latin letters and digits
+ */
+function isLatin(code) {
+    // × and ÷ live in the Latin-1 Supplement block but are not letters
+    if (code === 0xD7 || code === 0xF7) return false;
+
+    return (
+        (code >= 0x41 && code <= 0x5A) ||       // A-Z
+        (code >= 0x61 && code <= 0x7A) ||       // a-z
+        (code >= 0x30 && code <= 0x39) ||       // 0-9
+        (code >= 0xC0 && code <= 0x024F)        // Latin-1 letters + Latin Extended-A/B (e.g. àáâ)
+    );
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 文本分段（亚洲 vs 拉丁 vs 其他）
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * @typedef {'asian'|'latin'|'other'} SegmentType
+ */
+
+/**
+ * @typedef {object} TextSegment
+ * @property {SegmentType} type - 段类型
+ * @property {string} text - 段文本
+ */
+
+/**
+ * Split text into runs of the same script class.
+ *
+ * Consecutive characters of the same class ('asian', 'latin' or 'other') are
+ * merged into one segment. Runs of class 'other' that contain only whitespace
+ * are dropped. The text is walked by code point, not by UTF-16 code unit, so
+ * supplementary-plane ideographs (CJK Extension B) are classified as 'asian'
+ * instead of being split into surrogate halves.
+ *
+ * @param {string} text
+ * @returns {TextSegment[]} segments in source order; empty array for empty input
+ */
+function segmentByScript(text) {
+    if (!text) return [];
+
+    const segments = [];
+    let currentType = null;
+    let currentStart = 0;
+
+    // Emit the run [currentStart, end) unless it is empty or whitespace-only 'other'
+    const flush = (end) => {
+        if (currentType === null || currentStart >= end) return;
+        const seg = text.slice(currentStart, end);
+        if (currentType !== 'other' || seg.trim()) {
+            segments.push({ type: currentType, text: seg });
+        }
+    };
+
+    let i = 0;
+    while (i < text.length) {
+        const code = text.codePointAt(i);
+        let type;
+
+        if (isAsian(code)) {
+            type = 'asian';
+        } else if (isLatin(code)) {
+            type = 'latin';
+        } else {
+            type = 'other';
+        }
+
+        if (type !== currentType) {
+            flush(i);
+            currentType = type;
+            currentStart = i;
+        }
+
+        // Code points above 0xFFFF occupy two UTF-16 code units
+        i += code > 0xFFFF ? 2 : 1;
+    }
+
+    // Trailing run
+    flush(text.length);
+
+    return segments;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 实体保护（最长匹配占位符替换）
+// ═══════════════════════════════════════════════════════════════════════════
+
+// Placeholders are delimited by Unicode Private Use Area (PUA) characters, which avoids the unpredictable behavior control characters can cause inside the tokenizer
+const PLACEHOLDER_PREFIX = '\uE000ENT_';
+const PLACEHOLDER_SUFFIX = '\uE001';
+
+/**
+ * Replace every known entity in the text with a unique placeholder,
+ * preferring the longest entity when candidates overlap.
+ *
+ * Matching is case-insensitive and is done against the ORIGINAL text only;
+ * the masked string is assembled afterwards in a single pass. Match positions
+ * therefore never go stale, repeated occurrences are all masked, and an
+ * entity sitting right next to another one is not skipped.
+ *
+ * @param {string} text - original text
+ * @returns {{masked: string, entities: Map<string, string>}} masked text plus
+ *   a placeholder -> original-substring map (original casing preserved)
+ */
+function maskEntities(text) {
+    const entities = new Map();
+
+    if (!entityList.length || !text) {
+        return { masked: text, entities };
+    }
+
+    // claimed[i] === 1 once position i of `text` belongs to a masked entity
+    const claimed = new Uint8Array(text.length);
+    const ranges = [];
+
+    // entityList is sorted longest-first, so longer entities claim their span first
+    for (const entity of entityList) {
+        // An empty or non-string entry can never match meaningfully
+        if (typeof entity !== 'string' || !entity) continue;
+
+        // Escape regex metacharacters so the entity is matched literally
+        const literal = entity.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+        const pattern = new RegExp(literal, 'gi');
+
+        let match;
+        while ((match = pattern.exec(text)) !== null) {
+            const start = match.index;
+            const end = start + match[0].length;
+
+            let overlaps = false;
+            for (let i = start; i < end; i++) {
+                if (claimed[i]) {
+                    overlaps = true;
+                    break;
+                }
+            }
+
+            if (overlaps) {
+                // Retry one position further: a later start may still be free
+                pattern.lastIndex = start + 1;
+                continue;
+            }
+
+            claimed.fill(1, start, end);
+
+            const placeholder = `${PLACEHOLDER_PREFIX}${ranges.length}${PLACEHOLDER_SUFFIX}`;
+            entities.set(placeholder, text.slice(start, end));
+            ranges.push({ start, end, placeholder });
+        }
+    }
+
+    if (!ranges.length) {
+        return { masked: text, entities };
+    }
+
+    // Stitch the masked text together from left to right
+    ranges.sort((a, b) => a.start - b.start);
+
+    let masked = '';
+    let cursor = 0;
+    for (const { start, end, placeholder } of ranges) {
+        masked += text.slice(cursor, start) + placeholder;
+        cursor = end;
+    }
+    masked += text.slice(cursor);
+
+    return { masked, entities };
+}
+
+/**
+ * Restore placeholders inside a token list to their original entity text.
+ *
+ * @param {string[]} tokens
+ * @param {Map<string, string>} entities - placeholder -> original text
+ * @returns {string[]} the same array when there is nothing to restore,
+ *   otherwise a new array with placeholders substituted
+ */
+function unmaskTokens(tokens, entities) {
+    if (!entities.size) return tokens;
+
+    return tokens.map(token => {
+        // The token is exactly one placeholder
+        if (entities.has(token)) {
+            return entities.get(token);
+        }
+
+        // The token embeds a placeholder (the segmenter may have glued it to
+        // neighbouring characters)
+        let result = token;
+        for (const [placeholder, original] of entities) {
+            if (result.includes(placeholder)) {
+                // Function replacer: entity text is inserted verbatim, so `$$`,
+                // `$&` and similar sequences are not treated as replacement patterns
+                result = result.replace(placeholder, () => original);
+            }
+        }
+
+        return result;
+    });
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 分词：亚洲文字（结巴 / 降级）
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Segment an Asian-script run with jieba.
+ *
+ * Each word is trimmed and kept only when at least two characters long.
+ * If jieba throws, the failure is logged and the fallback tokenizer is used.
+ *
+ * @param {string} text
+ * @returns {string[]} tokens; empty when the text is empty or jieba is not loaded
+ */
+function tokenizeAsianJieba(text) {
+    if (!text || !jiebaCut) {
+        return [];
+    }
+
+    try {
+        // Second argument `true` is the HMM switch, per the original author's note
+        const rawWords = Array.from(jiebaCut(text, true));
+        const kept = [];
+
+        for (const raw of rawWords) {
+            const word = String(raw || '').trim();
+            if (word.length >= 2) {
+                kept.push(word);
+            }
+        }
+
+        return kept;
+    } catch (err) {
+        xbLog.warn(MODULE_ID, '结巴分词异常，降级处理', err);
+        return tokenizeAsianFallback(text);
+    }
+}
+
+/**
+ * Fallback tokenizer for Asian-script text when jieba is unavailable.
+ *
+ * The text is split on whitespace and punctuation. Pieces of 2-6 characters
+ * are kept whole; longer pieces yield 4-character windows taken every 2
+ * characters plus their first 6 characters. Single characters are dropped.
+ * No bigrams are produced, which keeps the index from bloating.
+ *
+ * NOTE(review): windows start at even offsets only, so for an odd-length
+ * piece the final character is not covered by any window.
+ *
+ * @param {string} text
+ * @returns {string[]}
+ */
+function tokenizeAsianFallback(text) {
+    if (!text) return [];
+
+    const pieces = text.split(/[\s，。！？、；：""''（）【】《》…—\-,.!?;:'"()[\]{}<>/\\|@#$%^&*+=~`]+/);
+    const out = [];
+
+    pieces.forEach((piece) => {
+        const chunk = piece.trim();
+        const len = chunk.length;
+
+        // Empty pieces and single characters produce nothing
+        if (len < 2) return;
+
+        if (len <= 6) {
+            out.push(chunk);
+            return;
+        }
+
+        // Long piece: 4-character windows with a stride of 2
+        for (let start = 0; start + 4 <= len; start += 2) {
+            out.push(chunk.slice(start, start + 4));
+        }
+        // ...followed by the leading 6 characters of the piece
+        out.push(chunk.slice(0, 6));
+    });
+
+    return out;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 分词：拉丁文字
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Tokenize a Latin-script run: split on whitespace and punctuation,
+ * lower-case each piece and keep those with at least three characters.
+ * @param {string} text
+ * @returns {string[]}
+ */
+function tokenizeLatin(text) {
+    if (!text) return [];
+
+    const words = [];
+    for (const piece of text.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/)) {
+        const word = piece.trim().toLowerCase();
+        if (word.length >= 3) {
+            words.push(word);
+        }
+    }
+
+    return words;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：preload
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Preload the jieba WASM module.
+ *
+ * Safe to call repeatedly: a call made while a load is in flight awaits the
+ * shared loading promise instead of starting a second load.
+ * A call made in the FAILED state starts a fresh load attempt.
+ * Load failures are logged and reported as `false`.
+ *
+ * @returns {Promise<boolean>} whether jieba was loaded successfully
+ */
+export async function preload() {
+    // Already loaded
+    if (wasmState === WasmState.READY) return true;
+
+    // A load is in flight: wait for it and report its outcome
+    if (wasmState === WasmState.LOADING && loadingPromise) {
+        try {
+            await loadingPromise;
+            return wasmState === WasmState.READY;
+        } catch {
+            return false;
+        }
+    }
+
+    // IDLE or FAILED → start loading
+    wasmState = WasmState.LOADING;
+
+    const T0 = performance.now();
+
+    loadingPromise = (async () => {
+        try {
+            // Dynamically import the jieba module
+            const wasmPath = `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm_bg.wasm`;
+
+            // eslint-disable-next-line no-unsanitized/method
+            jiebaModule = await import(
+                `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm.js`
+            );
+
+            // Initialise the WASM (only when the default export is callable)
+            if (typeof jiebaModule.default === 'function') {
+                await jiebaModule.default(wasmPath);
+            }
+
+            // Cache function references
+            jiebaCut = jiebaModule.cut;
+            jiebaAddWord = jiebaModule.add_word;
+
+            if (typeof jiebaCut !== 'function') {
+                throw new Error('jieba cut 函数不存在');
+            }
+
+            wasmState = WasmState.READY;
+
+            const elapsed = Math.round(performance.now() - T0);
+            xbLog.info(MODULE_ID, `结巴 WASM 加载完成 (${elapsed}ms)`);
+
+            // Entities registered before jieba was ready are injected now
+            if (entityList.length > 0 && jiebaAddWord) {
+                reInjectAllEntities();
+            }
+
+            return true;
+        } catch (e) {
+            // Record the failure, then rethrow so awaiting callers see it
+            wasmState = WasmState.FAILED;
+            xbLog.error(MODULE_ID, '结巴 WASM 加载失败', e);
+            throw e;
+        }
+    })();
+
+    try {
+        await loadingPromise;
+        return true;
+    } catch {
+        return false;
+    } finally {
+        // Drop the shared promise so a later call can start a new attempt
+        loadingPromise = null;
+    }
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：isReady
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Report whether jieba has finished loading.
+ * @returns {boolean} true only when the WASM state is READY
+ */
+export function isReady() {
+    const ready = wasmState === WasmState.READY;
+    return ready;
+}
+
+/**
+ * Get the current WASM loading state.
+ * @returns {string} the current value of the module-level `wasmState`
+ */
+export function getState() {
+    return wasmState;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：injectEntities
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Install the entity lexicon.
+ *
+ * Replaces the module-level entity list (used for longest-match entity
+ * protection). When jieba is already loaded, the entities are also passed
+ * to jieba as custom words.
+ *
+ * An empty or missing lexicon clears the entity list and returns early.
+ *
+ * @param {Set<string>} lexicon - normalized entity names
+ * @param {Map<string, string>} [displayMap] - normalized name → original surface form
+ */
+export function injectEntities(lexicon, displayMap) {
+    const hasEntries = Boolean(lexicon?.size);
+    if (!hasEntries) {
+        entityList = [];
+        return;
+    }
+
+    // Prefer the original surface form, drop names shorter than 2 characters,
+    // and order longest-first so the longest match wins.
+    const byLengthDesc = [...lexicon]
+        .map((key) => displayMap?.get(key) || key)
+        .filter((surface) => surface.length >= 2)
+        .sort((left, right) => right.length - left.length);
+
+    entityList = byLengthDesc;
+
+    // jieba already loaded: register the entities as custom words right away.
+    const jiebaUsable = wasmState === WasmState.READY && jiebaAddWord;
+    if (jiebaUsable) {
+        injectNewEntitiesToJieba(byLengthDesc);
+    }
+
+    xbLog.info(MODULE_ID, `实体词典更新: ${byLengthDesc.length} 个实体`);
+}
+
+/**
+ * Register entities with jieba incrementally.
+ *
+ * Entities already recorded in `injectedEntities` are skipped. A failing
+ * `add_word` call is logged as a warning and does not stop the loop.
+ *
+ * @param {string[]} entities - entity surface forms to register
+ */
+function injectNewEntitiesToJieba(entities) {
+    let added = 0;
+
+    for (const word of entities) {
+        if (injectedEntities.has(word)) {
+            continue;
+        }
+
+        try {
+            // High frequency so jieba does not split the entity apart.
+            jiebaAddWord(word, 99999);
+            injectedEntities.add(word);
+            added += 1;
+        } catch (err) {
+            xbLog.warn(MODULE_ID, `add_word 失败: ${word}`, err);
+        }
+    }
+
+    if (added > 0) {
+        xbLog.info(MODULE_ID, `注入 ${added} 个新实体到结巴`);
+    }
+}
+
+/**
+ * Re-register every known entity with jieba (called right after the WASM
+ * finishes loading). The injected-entity record is cleared first so that
+ * nothing in `entityList` is skipped as "already injected".
+ */
+function reInjectAllEntities() {
+    injectedEntities.clear();
+    injectNewEntitiesToJieba(entityList);
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：tokenize
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Unified tokenizer entry point.
+ *
+ * Pipeline (steps 1-5 are performed by tokenizeCore()):
+ * 1. Longest-match entity protection via placeholders
+ * 2. Segmentation by Unicode script (Asian vs Latin)
+ * 3. Asian segments → jieba cut() (or the fallback tokenizer)
+ * 4. Latin segments → whitespace/punctuation split
+ * 5. Placeholder restoration
+ * 6. Stop-word filtering, cleanup and de-duplication (done here)
+ *
+ * De-duplication and stop-word checks are case-insensitive, but the
+ * returned tokens keep their original casing (first occurrence wins).
+ *
+ * @param {string} text - input text
+ * @returns {string[]} unique tokens in order of first appearance
+ */
+export function tokenize(text) {
+    // Matches tokens made only of whitespace, control chars, punctuation or symbols.
+    const noiseOnly = /^[\s\x00-\x1F\p{P}\p{S}]+$/u;
+
+    const seenKeys = new Set();
+    const unique = [];
+
+    // 6. Filter stop words, de-duplicate and clean up
+    for (const raw of tokenizeCore(text)) {
+        const surface = raw.trim();
+        const key = surface.toLowerCase();
+
+        const rejected =
+            key.length < 2 ||
+            STOP_WORDS.has(key) ||
+            seenKeys.has(key) ||
+            noiseOnly.test(key);
+
+        if (rejected) {
+            continue;
+        }
+
+        seenKeys.add(key);
+        unique.push(surface); // original casing is preserved
+    }
+
+    return unique;
+}
+
+/**
+ * Core tokenization pipeline: entity masking → script segmentation →
+ * per-segment tokenization → placeholder restoration.
+ *
+ * No lower-casing, de-duplication or stop-word filtering happens here.
+ * Segments whose type is neither 'asian' nor 'latin' contribute no tokens.
+ *
+ * @param {string} text - input text
+ * @returns {string[]} raw tokens with entity placeholders restored
+ */
+function tokenizeCore(text) {
+    if (!text) {
+        return [];
+    }
+
+    const source = String(text).trim();
+    if (source === '') {
+        return [];
+    }
+
+    // 1. Protect entities behind placeholders
+    const { masked, entities } = maskEntities(source);
+
+    // 2 + 3. Segment by script and tokenize each segment
+    const collected = [];
+    for (const segment of segmentByScript(masked)) {
+        switch (segment.type) {
+            case 'asian': {
+                // Use jieba only when it is fully loaded; otherwise fall back.
+                const useJieba = wasmState === WasmState.READY && jiebaCut;
+                const pieces = useJieba
+                    ? tokenizeAsianJieba(segment.text)
+                    : tokenizeAsianFallback(segment.text);
+                collected.push(...pieces);
+                break;
+            }
+            case 'latin':
+                collected.push(...tokenizeLatin(segment.text));
+                break;
+            default:
+                break;
+        }
+    }
+
+    // 4. Restore placeholders
+    return unmaskTokens(collected, entities);
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：tokenizeForIndex
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Tokenizer dedicated to building the MiniSearch index.
+ *
+ * Differences from tokenize():
+ * - every token is lower-cased (MiniSearch needs consistent casing);
+ * - duplicates are kept (MiniSearch handles term frequency itself).
+ *
+ * Tokens shorter than 2 characters, stop words, and tokens consisting only
+ * of whitespace, control characters, punctuation or symbols are removed.
+ *
+ * @param {string} text - input text
+ * @returns {string[]} lower-cased tokens, duplicates preserved
+ */
+export function tokenizeForIndex(text) {
+    const terms = [];
+
+    for (const raw of tokenizeCore(text)) {
+        const term = raw.trim().toLowerCase();
+
+        if (!term || term.length < 2) continue;
+        if (STOP_WORDS.has(term)) continue;
+        if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(term)) continue;
+
+        terms.push(term);
+    }
+
+    return terms;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：reset
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Reset the tokenizer's entity state.
+ * Intended for tests or module teardown.
+ *
+ * Clears the entity list and the record of entities injected into jieba.
+ * The WASM state is deliberately left alone to avoid loading it again.
+ */
+export function reset() {
+    injectedEntities.clear();
+    entityList = [];
+}