Zero-darkbox query updates and tokenizer improvements

2026-02-09 20:25:26 +08:00
parent 8131d6a15f
commit 0a28539b29
14 changed files with 1771 additions and 175 deletions
--- a/modules/story-summary/vector/retrieval/lexical-index.js
+++ b/modules/story-summary/vector/retrieval/lexical-index.js
@@ -4,9 +4,10 @@
 // 职责：
 // 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
 // 2. 提供词法检索接口（专名精确匹配兜底）
-// 3. 惰性构建 + 缓存失效机制
+// 3. 惰性构建 + 异步预热 + 缓存失效机制
 //
 // 索引存储：纯内存（不持久化）
+// 分词器：统一使用 tokenizer.js（结巴 + 实体保护 + 降级）
 // 重建时机：CHAT_CHANGED / L0提取完成 / L2总结完成
 // ═══════════════════════════════════════════════════════════════════════════

@@ -16,6 +17,7 @@ import { getSummaryStore } from '../../data/store.js';
 import { getStateAtoms } from '../storage/state-store.js';
 import { getAllChunks } from '../storage/chunk-store.js';
 import { xbLog } from '../../../../core/debug-core.js';
+import { tokenizeForIndex } from '../utils/tokenizer.js';

 const MODULE_ID = 'lexical-index';

@@ -23,9 +25,20 @@ const MODULE_ID = 'lexical-index';
 // 缓存
 // ─────────────────────────────────────────────────────────────────────────

+/** @type {MiniSearch|null} Cached index instance, or null when nothing is cached. */
 let cachedIndex = null;
+
+/** @type {string|null} chatId the cached index was built for. */
 let cachedChatId = null;
-let cachedFingerprint = null; // atoms.length + chunks.length + events.length 的简单指纹
+
+/** @type {string|null} Data fingerprint derived from the atom, chunk and event counts. */
+let cachedFingerprint = null;
+
+/** @type {boolean} True while an index build is in progress. */
+let building = false;
+
+/** @type {Promise<{index: MiniSearch, fingerprint: string}>|null} In-flight build promise (re-entrancy guard). */
+let buildPromise = null;

 // ─────────────────────────────────────────────────────────────────────────
 // 工具函数
@@ -43,7 +56,7 @@ function cleanSummary(summary) {
 }

 /**
- * 计算缓存指纹（用于判断是否需要重建）
+ * 计算缓存指纹
 * @param {number} atomCount
 * @param {number} chunkCount
 * @param {number} eventCount
@@ -53,39 +66,27 @@ function computeFingerprint(atomCount, chunkCount, eventCount) {
    return `${atomCount}:${chunkCount}:${eventCount}`;
 }

+/**
+ * Yield to the event loop so a long-running task does not block the UI.
+ *
+ * Implemented as a zero-delay `setTimeout`, so the caller resumes in a later
+ * timer task instead of continuing in the current one.
+ *
+ * @returns {Promise<void>} Resolves once the zero-delay timer fires.
+ */
+function yieldToMain() {
+    return new Promise(resolve => setTimeout(resolve, 0));
+}
+
 // ─────────────────────────────────────────────────────────────────────────
-// 索引构建
+// 文档收集
 // ─────────────────────────────────────────────────────────────────────────

 /**
- * 构建 MiniSearch 索引
- *
- * 索引三类文档：
- * - L0 atoms: { id: atomId, type: 'atom', floor, text: semantic }
- * - L1 chunks: { id: chunkId, type: 'chunk', floor, text: chunk.text }
- * - L2 events: { id: eventId, type: 'event', floor: null, text: title + participants + summary }
+ * 收集所有待索引文档
 *
 * @param {object[]} atoms  - getStateAtoms() 返回值
 * @param {object[]} chunks - getAllChunks(chatId) 返回值
 * @param {object[]} events - store.json.events
- * @returns {MiniSearch}
+ * @returns {object[]} 文档数组
 */
-export function buildLexicalIndex(atoms, chunks, events) {
-    const T0 = performance.now();
-
-    const index = new MiniSearch({
-        fields: ['text'],
-        storeFields: ['type', 'floor'],
-        idField: 'id',
-        searchOptions: {
-            boost: { text: 1 },
-            fuzzy: 0.2,
-            prefix: true,
-        },
-        // 中文友好的 tokenizer：按字符 bigram + 空格/标点分词
-        tokenize: chineseTokenize,
-    });
-
+function collectDocuments(atoms, chunks, events) {
    const docs = [];

    // L0 atoms
@@ -129,72 +130,58 @@ export function buildLexicalIndex(atoms, chunks, events) {
        });
    }

-    if (docs.length > 0) {
-        index.addAll(docs);
-    }
-
-    const elapsed = Math.round(performance.now() - T0);
-    xbLog.info(MODULE_ID, `索引构建完成: ${docs.length} 文档 (atoms=${atoms?.length || 0}, chunks=${chunks?.length || 0}, events=${events?.length || 0}) ${elapsed}ms`);
-
-    return index;
+    return docs;
 }

 // ─────────────────────────────────────────────────────────────────────────
-// 中文 Tokenizer
+// 索引构建（分片，不阻塞主线程）
 // ─────────────────────────────────────────────────────────────────────────

+/** Number of documents added to the index per batch. */
+const BUILD_BATCH_SIZE = 500;
+
 /**
- * 中文友好的分词器
+ * 构建 MiniSearch 索引（分片异步）
 *
- * 策略：
- * 1. 连续中文字符 → 滑动 bigram（"黄英梅" → "黄英", "英梅"）
- * 2. 连续非中文字符 → 按空格/标点分割
- * 3. 保留完整中文词（2-4字）作为额外 token
- *
- * @param {string} text
- * @returns {string[]}
+ * @param {object[]} docs - 文档数组
+ * @returns {Promise<MiniSearch>}
 */
-function chineseTokenize(text) {
-    if (!text) return [];
+async function buildIndexAsync(docs) {
+    const T0 = performance.now();

-    const tokens = [];
-    const s = String(text).toLowerCase();
+    const index = new MiniSearch({
+        fields: ['text'],
+        storeFields: ['type', 'floor'],
+        idField: 'id',
+        searchOptions: {
+            boost: { text: 1 },
+            fuzzy: 0.2,
+            prefix: true,
+        },
+        tokenize: tokenizeForIndex,
+    });

-    // 分离中文段和非中文段
-    const segments = s.split(/([\u4e00-\u9fff]+)/g);
+    if (!docs.length) {
+        return index;
+    }

-    for (const seg of segments) {
-        if (!seg) continue;
+    // 分片添加，每批 BUILD_BATCH_SIZE 条后让出主线程
+    for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
+        const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
+        index.addAll(batch);

-        // 中文段：bigram + 完整段（如果 2-6 字）
-        if (/^[\u4e00-\u9fff]+$/.test(seg)) {
-            // 完整段作为一个 token（如果长度合适）
-            if (seg.length >= 2 && seg.length <= 6) {
-                tokens.push(seg);
-            }
-
-            // bigram
-            for (let i = 0; i < seg.length - 1; i++) {
-                tokens.push(seg.slice(i, i + 2));
-            }
-
-            // trigram（对 3+ 字的段）
-            for (let i = 0; i < seg.length - 2; i++) {
-                tokens.push(seg.slice(i, i + 3));
-            }
-        } else {
-            // 非中文段：按空格/标点分割
-            const words = seg.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/);
-            for (const w of words) {
-                const trimmed = w.trim();
-                if (trimmed.length >= 2) {
-                    tokens.push(trimmed);
-                }
-            }
+        // 非最后一批时让出主线程
+        if (i + BUILD_BATCH_SIZE < docs.length) {
+            await yieldToMain();
        }
    }

-    return tokens;
+    const elapsed = Math.round(performance.now() - T0);
+    xbLog.info(MODULE_ID,
+        `索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
+    );
+
+    return index;
 }

 // ─────────────────────────────────────────────────────────────────────────
@@ -247,6 +234,8 @@ export function searchLexicalIndex(index, terms) {
            fuzzy: 0.2,
            prefix: true,
            combineWith: 'OR',
+            // 使用与索引相同的分词器
+            tokenize: tokenizeForIndex,
        });
    } catch (e) {
        xbLog.warn(MODULE_ID, '检索失败', e);
@@ -305,22 +294,17 @@ export function searchLexicalIndex(index, terms) {
 }

 // ─────────────────────────────────────────────────────────────────────────
-// 惰性缓存管理
+// 内部构建流程（收集数据 + 构建索引）
 // ─────────────────────────────────────────────────────────────────────────

 /**
- * 获取词法索引（惰性构建 + 缓存）
+ * 收集数据并构建索引
 *
- * 如果缓存有效则直接返回；否则自动构建。
- * 缓存失效条件：chatId 变化 / 数据指纹变化 / 手动 invalidate
- *
- * @returns {Promise<MiniSearch>}
+ * @param {string} chatId
+ * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
 */
-export async function getLexicalIndex() {
-    const { chatId } = getContext();
-    if (!chatId) return null;
-
-    // 收集当前数据
+async function collectAndBuild(chatId) {
+    // 收集数据
    const atoms = getStateAtoms() || [];
    const store = getSummaryStore();
    const events = store?.json?.events || [];
@@ -334,30 +318,118 @@ export async function getLexicalIndex() {

    const fp = computeFingerprint(atoms.length, chunks.length, events.length);

-    // 缓存命中
+    // 检查是否在收集过程中缓存已被其他调用更新
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
+        return { index: cachedIndex, fingerprint: fp };
+    }
+
+    // 收集文档
+    const docs = collectDocuments(atoms, chunks, events);
+
+    // 异步分片构建
+    const index = await buildIndexAsync(docs);
+
+    return { index, fingerprint: fp };
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：getLexicalIndex（惰性获取）
+// ─────────────────────────────────────────────────────────────────────────
+
+/**
+ * Get the lexical index for the current chat (lazy build + cache).
+ *
+ * Resolution order:
+ * 1. Return the cached index when one exists for the current chatId.
+ * 2. If a build is already in flight, wait for it and re-check the cache.
+ * 3. Otherwise run collectAndBuild(), store its result in the module-level
+ *    cache variables and return the new index.
+ *
+ * NOTE(review): the fast path only checks that a fingerprint is present; it
+ * never recomputes or compares it, so freshness depends on the cache being
+ * invalidated elsewhere when the underlying data changes.
+ * NOTE(review): the cache is written after an await. If the cache is
+ * invalidated while a build is in flight, that write may re-populate it with
+ * an index built from older data — confirm the invalidation path covers this.
+ *
+ * @returns {Promise<MiniSearch|null>} The index, or null when there is no
+ *   chatId or the build fails (the failure is logged, not rethrown).
+ */
+export async function getLexicalIndex() {
    const { chatId } = getContext();
    if (!chatId) return null;

+    // Fast path: a cached index for the same chatId is returned as-is.
+    // Fingerprint validation is left to the build flow to avoid an extra IndexedDB read.
+    if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
        return cachedIndex;
    }

-    // 重建
-    xbLog.info(MODULE_ID, `缓存失效，重建索引 (chatId=${chatId.slice(0, 8)}, fp=${fp})`);
+    // A build is already in flight: wait for it, then re-check the cache.
+    if (building && buildPromise) {
+        try {
+            await buildPromise;
+            if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
+                return cachedIndex;
+            }
+        } catch {
+            // The awaited build failed; fall through and rebuild below.
+        }
+    }

-    const index = buildLexicalIndex(atoms, chunks, events);
+    // Rebuild needed (collectAndBuild computes the fingerprint; it is stored in the cache below).
+    xbLog.info(MODULE_ID, `缓存失效，重建索引 (chatId=${chatId.slice(0, 8)})`);

-    cachedIndex = index;
-    cachedChatId = chatId;
-    cachedFingerprint = fp;
+    building = true;
+    buildPromise = collectAndBuild(chatId);

-    return index;
+    try {
+        const { index, fingerprint } = await buildPromise;
+
+        // Replace the cached index, chatId and fingerprint together (no await in between).
+        cachedIndex = index;
+        cachedChatId = chatId;
+        cachedFingerprint = fingerprint;
+
+        return index;
+    } catch (e) {
+        xbLog.error(MODULE_ID, '索引构建失败', e);
+        return null;
+    } finally {
+        building = false;
+        buildPromise = null;
+    }
 }

+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：warmupIndex（异步预建）
+// ─────────────────────────────────────────────────────────────────────────
+
 /**
- * 使缓存失效（下次 getLexicalIndex 时自动重建）
+ * Pre-build the lexical index in the background.
+ *
+ * Fire-and-forget: starts getLexicalIndex() without awaiting it and returns
+ * nothing. Does nothing when there is no chatId or when a build is already
+ * in progress. Once the build completes the module cache is updated, so
+ * later getLexicalIndex() calls can hit the cache directly.
+ *
+ * Intended call sites (per the original author's notes):
+ * - handleChatChanged (after entity injection)
+ * - L0 extraction completed
+ * - L2 summarization completed
+ *
+ * NOTE(review): because this returns early while `building` is true, a
+ * warmup requested during an in-flight build does not schedule a rebuild
+ * afterwards — confirm callers do not rely on that.
+ */
+export function warmupIndex() {
+    const { chatId } = getContext();
+    if (!chatId) return;
+
+    // A build is already in progress; do not trigger another one.
+    if (building) return;
+
+    // fire-and-forget
+    getLexicalIndex().catch(e => {
+        xbLog.warn(MODULE_ID, '预热索引失败', e);
+    });
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：invalidateLexicalIndex（缓存失效）
+// ─────────────────────────────────────────────────────────────────────────
+
+/**
+ * 使缓存失效（下次 getLexicalIndex / warmupIndex 时自动重建）
 *
 * 调用时机：
 * - CHAT_CHANGED
- * - L0 提取完成（handleAnchorGenerate 完成后）
- * - L2 总结完成（onComplete 回调中）
+ * - L0 提取完成
+ * - L2 总结完成
 */
 export function invalidateLexicalIndex() {
    if (cachedIndex) {