Update retrieval, rerank, and indexing changes

2026-02-11 13:55:19 +08:00
parent 8d062d39b5
commit 297cc03770
7 changed files with 501 additions and 287 deletions
--- a/modules/story-summary/vector/retrieval/lexical-index.js
+++ b/modules/story-summary/vector/retrieval/lexical-index.js
@@ -14,7 +14,6 @@
 import MiniSearch from '../../../../libs/minisearch.mjs';
 import { getContext } from '../../../../../../../extensions.js';
 import { getSummaryStore } from '../../data/store.js';
-import { getStateAtoms } from '../storage/state-store.js';
 import { getAllChunks } from '../storage/chunk-store.js';
 import { xbLog } from '../../../../core/debug-core.js';
 import { tokenizeForIndex } from '../utils/tokenizer.js';
@@ -39,6 +38,8 @@ let building = false;

 /** @type {Promise<MiniSearch|null>|null} 当前构建 Promise（防重入） */
 let buildPromise = null;
+/** @type {Map<number, string[]>} floor → doc IDs recorded for that floor (L1 chunks only) */
+let floorDocIds = new Map();

 // ─────────────────────────────────────────────────────────────────────────
 // 工具函数
@@ -57,13 +58,12 @@ function cleanSummary(summary) {

 /**
 * 计算缓存指纹
- * @param {number} atomCount
 * @param {number} chunkCount
 * @param {number} eventCount
 * @returns {string}
 */
-function computeFingerprint(atomCount, chunkCount, eventCount) {
-    return `${atomCount}:${chunkCount}:${eventCount}`;
+function computeFingerprint(chunkCount, eventCount) {
+    return `${chunkCount}:${eventCount}`;
 }

 /**
@@ -81,34 +81,31 @@ function yieldToMain() {
 /**
 * 收集所有待索引文档
 *
- * @param {object[]} atoms  - getStateAtoms() 返回值
 * @param {object[]} chunks - getAllChunks(chatId) 返回值
 * @param {object[]} events - store.json.events
 * @returns {object[]} 文档数组
 */
-function collectDocuments(atoms, chunks, events) {
+function collectDocuments(chunks, events) {
    const docs = [];

-    // L0 atoms
-    for (const atom of (atoms || [])) {
-        if (!atom?.atomId || !atom.semantic) continue;
-        docs.push({
-            id: atom.atomId,
-            type: 'atom',
-            floor: atom.floor ?? -1,
-            text: atom.semantic,
-        });
-    }
-
-    // L1 chunks
+    // L1 chunks + 填充 floorDocIds
    for (const chunk of (chunks || [])) {
        if (!chunk?.chunkId || !chunk.text) continue;
+
+        const floor = chunk.floor ?? -1;
        docs.push({
            id: chunk.chunkId,
            type: 'chunk',
-            floor: chunk.floor ?? -1,
+            floor,
            text: chunk.text,
        });
+
+        if (floor >= 0) {
+            if (!floorDocIds.has(floor)) {
+                floorDocIds.set(floor, []);
+            }
+            floorDocIds.get(floor).push(chunk.chunkId);
+        }
    }

    // L2 events
@@ -244,7 +241,6 @@ export function searchLexicalIndex(index, terms) {
    }

    // 分类结果
-    const atomIdSet = new Set();
    const chunkIdSet = new Set();
    const eventIdSet = new Set();

@@ -254,16 +250,6 @@ export function searchLexicalIndex(index, terms) {
        const floor = hit.floor;

        switch (type) {
-            case 'atom':
-                if (!atomIdSet.has(id)) {
-                    atomIdSet.add(id);
-                    result.atomIds.push(id);
-                    if (typeof floor === 'number' && floor >= 0) {
-                        result.atomFloors.add(floor);
-                    }
-                }
-                break;
-
            case 'chunk':
                if (!chunkIdSet.has(id)) {
                    chunkIdSet.add(id);
@@ -304,8 +290,10 @@ export function searchLexicalIndex(index, terms) {
 * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
 */
 async function collectAndBuild(chatId) {
-    // 收集数据
-    const atoms = getStateAtoms() || [];
+    // 清空侧索引（全量重建）
+    floorDocIds = new Map();
+
+    // 收集数据（不含 L0 atoms）
    const store = getSummaryStore();
    const events = store?.json?.events || [];

@@ -316,15 +304,15 @@ async function collectAndBuild(chatId) {
        xbLog.warn(MODULE_ID, '获取 chunks 失败', e);
    }

-    const fp = computeFingerprint(atoms.length, chunks.length, events.length);
+    const fp = computeFingerprint(chunks.length, events.length);

    // 检查是否在收集过程中缓存已被其他调用更新
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
        return { index: cachedIndex, fingerprint: fp };
    }

-    // 收集文档
-    const docs = collectDocuments(atoms, chunks, events);
+    // 收集文档（同时填充 floorDocIds）
+    const docs = collectDocuments(chunks, events);

    // 异步分片构建
    const index = await buildIndexAsync(docs);
@@ -438,4 +426,116 @@ export function invalidateLexicalIndex() {
    cachedIndex = null;
    cachedChatId = null;
    cachedFingerprint = null;
+    floorDocIds = new Map();
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// 增量更新接口
+// ─────────────────────────────────────────────────────────────────────────
+
+/**
+ * Replace the L1 chunk documents indexed for one floor.
+ *
+ * Documents recorded for the floor are discarded first; IDs that repeat in `chunks` or are still indexed are replaced, not re-added.
+ * No-op without a cached index. NOTE(review): an empty `chunks` returns before the removal, so old documents stay — confirm intended.
+ *
+ * @param {number} floor - floor number
+ * @param {object[]} chunks - chunk objects (need chunkId and text; a missing chunk.floor falls back to `floor`)
+ */
+export function addDocumentsForFloor(floor, chunks) {
+    if (!cachedIndex || !chunks?.length) return;
+
+    // Drop the documents currently recorded for this floor.
+    removeDocumentsByFloor(floor);
+
+    // Keyed by chunkId: an ID repeated within the batch keeps only its last entry.
+    const docsById = new Map();
+
+    for (const chunk of chunks) {
+        if (!chunk?.chunkId || !chunk.text) continue;
+        const { chunkId: id, text } = chunk;
+        try {
+            cachedIndex.discard(id); // the ID may be indexed without being recorded for this floor
+        } catch {
+            // Not in the index — nothing to replace.
+        }
+        docsById.set(id, { id, type: 'chunk', floor: chunk.floor ?? floor, text });
+    }
+
+    if (docsById.size > 0) {
+        cachedIndex.addAll([...docsById.values()]);
+        floorDocIds.set(floor, [...docsById.keys()]);
+        xbLog.info(MODULE_ID, `增量添加: floor ${floor}, ${docsById.size} 个 chunk`);
+    }
+}
+
+/**
+ * Remove every L1 chunk document recorded for a floor from the cached index.
+ *
+ * Each recorded ID is passed to MiniSearch discard() (soft delete).
+ * Does nothing when there is no cached index or the floor has no recorded IDs.
+ *
+ * @param {number} floor - floor number
+ */
+export function removeDocumentsByFloor(floor) {
+    // Without a cached index the side map is not consulted at all.
+    const trackedIds = cachedIndex ? floorDocIds.get(floor) : undefined;
+    if (!trackedIds?.length) return;
+
+    trackedIds.forEach((docId) => {
+        try {
+            cachedIndex.discard(docId);
+        } catch {
+            // The document may no longer exist (e.g. replaced by a full rebuild); skip it.
+        }
+    });
+
+    // Forget the floor's ID list, then report how many IDs were processed.
+    floorDocIds.delete(floor);
+    xbLog.info(MODULE_ID, `增量移除: floor ${floor}, ${trackedIds.length} 个文档`);
+}
+
+/**
+ * Add L2 event documents to the cached index.
+ *
+ * An event whose ID is already indexed is discarded and re-added (overwrite); an ID repeated within `events` keeps its last entry.
+ * No-op when there is no cached index; events without an id or without any indexable text are skipped.
+ *
+ * @param {object[]} events - event objects (id required; title, participants and summary are indexed)
+ */
+export function addEventDocuments(events) {
+    if (!cachedIndex || !events?.length) return;
+
+    // Keyed by event ID so an ID repeated within the batch keeps only its
+    // last entry; addAll() would otherwise throw on the duplicate after a partial insert.
+    const docsById = new Map();
+
+    for (const ev of events) {
+        if (!ev?.id) continue;
+
+        // Indexed text: title, participants and the cleaned summary, joined by spaces.
+        const parts = [];
+        if (ev.title) parts.push(ev.title);
+        if (ev.participants?.length) parts.push(ev.participants.join(' '));
+        const summary = cleanSummary(ev.summary);
+        if (summary) parts.push(summary);
+        const text = parts.join(' ').trim();
+        if (!text) continue;
+
+        // Overwrite: try to drop the previously indexed version first.
+        try {
+            cachedIndex.discard(ev.id);
+        } catch {
+            // Not in the index — nothing to remove.
+        }
+
+        const doc = { id: ev.id, type: 'event', floor: null, text };
+        docsById.set(ev.id, doc);
+    }
+
+    // IDs are unique here and any previously indexed copy was discarded above.
+    if (docsById.size > 0) {
+        cachedIndex.addAll([...docsById.values()]);
+        xbLog.info(MODULE_ID, `增量添加: ${docsById.size} 个事件`);
+    }
+}