// ═══════════════════════════════════════════════════════════════════════════
// lexical-index.js - MiniSearch lexical retrieval index
//
// Responsibilities:
//   1. Build a lexical index over L0 atoms + L1 chunks + L2 events
//   2. Expose a lexical search API (exact proper-noun matching as a fallback)
//   3. Lazy construction + cache invalidation
//
// Storage: in memory only (never persisted)
// Intended rebuild triggers: CHAT_CHANGED / L0 extraction done / L2 summary done
// ═══════════════════════════════════════════════════════════════════════════

import MiniSearch from '../../../../libs/minisearch.mjs';
import { getContext } from '../../../../../../../extensions.js';
import { getSummaryStore } from '../../data/store.js';
import { getStateAtoms } from '../storage/state-store.js';
import { getAllChunks } from '../storage/chunk-store.js';
import { xbLog } from '../../../../core/debug-core.js';

const MODULE_ID = 'lexical-index';

// Tokenizer patterns, hoisted so they are compiled once rather than per call.
// None of them carries the `g`/`y` flag, so `.test()` / `.split()` are stateless.
const CJK_RUN_SPLITTER = /([\u4e00-\u9fff]+)/;
const CJK_ONLY = /^[\u4e00-\u9fff]+$/;
// Whitespace + any Unicode punctuation/symbol. This is a superset of the ASCII
// punctuation list, so full-width marks (,。!?“”…) also act as separators.
const WORD_SEPARATORS = /[\s\p{P}\p{S}]+/u;

// ─────────────────────────────────────────────────────────────────────────
// Cache
// ─────────────────────────────────────────────────────────────────────────

let cachedIndex = null;
let cachedChatId = null;
// Cheap fingerprint: "<atoms.length>:<chunks.length>:<events.length>"
let cachedFingerprint = null;

// ─────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────

/**
 * Clean an event summary by removing a trailing floor marker such as
 * "(#12)" or "(#12-15)" and trimming surrounding whitespace.
 *
 * @param {string} summary
 * @returns {string}
 */
function cleanSummary(summary) {
    return String(summary || '')
        .replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '')
        .trim();
}

/**
 * Compute the cache fingerprint used to decide whether a rebuild is needed.
 *
 * @param {number} atomCount
 * @param {number} chunkCount
 * @param {number} eventCount
 * @returns {string} "<atomCount>:<chunkCount>:<eventCount>"
 */
function computeFingerprint(atomCount, chunkCount, eventCount) {
    return `${atomCount}:${chunkCount}:${eventCount}`;
}

// ─────────────────────────────────────────────────────────────────────────
// Index construction
// ─────────────────────────────────────────────────────────────────────────

/**
 * Build a MiniSearch index.
 *
 * Three kinds of documents are indexed:
 *   - L0 atoms:  { id: atomId,  type: 'atom',  floor, text: semantic }
 *   - L1 chunks: { id: chunkId, type: 'chunk', floor, text: chunk.text }
 *   - L2 events: { id: eventId, type: 'event', floor: null,
 *                  text: title + participants + summary }
 *
 * Entries without an id or without text are skipped. A document whose id was
 * already queued is skipped as well (first one wins), because MiniSearch
 * throws on duplicate ids and that would abort the whole build.
 *
 * @param {object[]} atoms - value returned by getStateAtoms()
 * @param {object[]} chunks - value returned by getAllChunks(chatId)
 * @param {object[]} events - store.json.events
 * @returns {MiniSearch}
 */
export function buildLexicalIndex(atoms, chunks, events) {
    const T0 = performance.now();

    const index = new MiniSearch({
        fields: ['text'],
        storeFields: ['type', 'floor'],
        idField: 'id',
        searchOptions: {
            boost: { text: 1 },
            fuzzy: 0.2,
            prefix: true,
        },
        // CJK-friendly tokenizer: character n-grams + whitespace/punctuation split
        tokenize: chineseTokenize,
    });

    const docs = [];
    const seenIds = new Set();
    let duplicateCount = 0;

    // Queue a document unless its id has already been queued.
    const pushDoc = (doc) => {
        if (seenIds.has(doc.id)) {
            duplicateCount++;
            return;
        }
        seenIds.add(doc.id);
        docs.push(doc);
    };

    // L0 atoms
    for (const atom of (atoms || [])) {
        if (!atom?.atomId || !atom.semantic) continue;
        pushDoc({
            id: atom.atomId,
            type: 'atom',
            floor: atom.floor ?? -1,
            text: atom.semantic,
        });
    }

    // L1 chunks
    for (const chunk of (chunks || [])) {
        if (!chunk?.chunkId || !chunk.text) continue;
        pushDoc({
            id: chunk.chunkId,
            type: 'chunk',
            floor: chunk.floor ?? -1,
            text: chunk.text,
        });
    }

    // L2 events
    for (const ev of (events || [])) {
        if (!ev?.id) continue;

        const parts = [];
        if (ev.title) parts.push(ev.title);

        // Accept either an array of names or a single string; calling
        // `.join` on a string would throw.
        const { participants } = ev;
        if (Array.isArray(participants)) {
            if (participants.length) parts.push(participants.join(' '));
        } else if (typeof participants === 'string' && participants) {
            parts.push(participants);
        }

        const summary = cleanSummary(ev.summary);
        if (summary) parts.push(summary);

        const text = parts.join(' ').trim();
        if (!text) continue;

        pushDoc({
            id: ev.id,
            type: 'event',
            floor: null,
            text,
        });
    }

    if (duplicateCount > 0) {
        xbLog.warn(MODULE_ID, `跳过重复 ID 文档: ${duplicateCount}`);
    }

    if (docs.length > 0) {
        index.addAll(docs);
    }

    const elapsed = Math.round(performance.now() - T0);
    xbLog.info(MODULE_ID, `索引构建完成: ${docs.length} 文档 (atoms=${atoms?.length || 0}, chunks=${chunks?.length || 0}, events=${events?.length || 0}) ${elapsed}ms`);

    return index;
}

// ─────────────────────────────────────────────────────────────────────────
// CJK tokenizer
// ─────────────────────────────────────────────────────────────────────────

/**
 * CJK-friendly tokenizer.
 *
 * Strategy (input is lower-cased first):
 *   1. Each run of CJK ideographs (U+4E00–U+9FFF) yields sliding bigrams and
 *      trigrams ("黄英梅" → "黄英", "英梅", "黄英梅").
 *   2. A run of 2–6 ideographs is additionally emitted as a whole token.
 *      For runs of length 2 or 3 this repeats the bigram/trigram, so the
 *      token appears twice.
 *   3. Everything else is split on whitespace and Unicode punctuation/symbols
 *      (ASCII and full-width alike); pieces shorter than 2 characters are
 *      dropped.
 *
 * @param {string} text
 * @returns {string[]}
 */
function chineseTokenize(text) {
    if (!text) return [];

    const tokens = [];
    const s = String(text).toLowerCase();

    // The capture group keeps the CJK runs in the split output, so the array
    // alternates between non-CJK and CJK segments.
    const segments = s.split(CJK_RUN_SPLITTER);

    for (const seg of segments) {
        if (!seg) continue;

        if (CJK_ONLY.test(seg)) {
            // Whole run as a single token when it is short enough to be a word/name
            if (seg.length >= 2 && seg.length <= 6) {
                tokens.push(seg);
            }

            // Bigrams
            for (let i = 0; i < seg.length - 1; i++) {
                tokens.push(seg.slice(i, i + 2));
            }

            // Trigrams (only produced for runs of 3+ characters)
            for (let i = 0; i < seg.length - 2; i++) {
                tokens.push(seg.slice(i, i + 3));
            }
        } else {
            // Non-CJK segment: split on whitespace / punctuation / symbols
            for (const w of seg.split(WORD_SEPARATORS)) {
                const trimmed = w.trim();
                if (trimmed.length >= 2) {
                    tokens.push(trimmed);
                }
            }
        }
    }

    return tokens;
}

// ─────────────────────────────────────────────────────────────────────────
// Search
// ─────────────────────────────────────────────────────────────────────────

/**
 * @typedef {object} LexicalSearchResult
 * @property {string[]} atomIds - ids of matching L0 atoms
 * @property {Set<number>} atomFloors - floors of matching L0 atoms
 * @property {string[]} chunkIds - ids of matching L1 chunks
 * @property {Set<number>} chunkFloors - floors of matching L1 chunks
 * @property {string[]} eventIds - ids of matching L2 events
 * @property {{chunkId: string, score: number}[]} chunkScores - per-chunk hit details
 * @property {number} searchTime - elapsed search time in ms (rounded)
 */

/**
 * Search the lexical index.
 *
 * All terms are joined with spaces into one query and combined with OR.
 * A missing index, an empty term list, or a search that throws all produce
 * an empty result rather than an exception.
 *
 * @param {MiniSearch} index - index instance
 * @param {string[]} terms - query terms
 * @returns {LexicalSearchResult}
 */
export function searchLexicalIndex(index, terms) {
    const T0 = performance.now();

    const result = {
        atomIds: [],
        atomFloors: new Set(),
        chunkIds: [],
        chunkFloors: new Set(),
        eventIds: [],
        chunkScores: [],
        searchTime: 0,
    };

    if (!index || !terms?.length) {
        result.searchTime = Math.round(performance.now() - T0);
        return result;
    }

    // Query with all terms at once
    const queryString = terms.join(' ');

    let hits;
    try {
        hits = index.search(queryString, {
            boost: { text: 1 },
            fuzzy: 0.2,
            prefix: true,
            combineWith: 'OR',
        });
    } catch (e) {
        xbLog.warn(MODULE_ID, '检索失败', e);
        result.searchTime = Math.round(performance.now() - T0);
        return result;
    }

    // Bucket hits by document type, keeping the order returned by the search
    const atomIdSet = new Set();
    const chunkIdSet = new Set();
    const eventIdSet = new Set();

    for (const hit of hits) {
        const type = hit.type;
        const id = hit.id;
        const floor = hit.floor;

        switch (type) {
            case 'atom':
                if (!atomIdSet.has(id)) {
                    atomIdSet.add(id);
                    result.atomIds.push(id);
                    // Floors of -1 (unknown) and null are not recorded
                    if (typeof floor === 'number' && floor >= 0) {
                        result.atomFloors.add(floor);
                    }
                }
                break;

            case 'chunk':
                if (!chunkIdSet.has(id)) {
                    chunkIdSet.add(id);
                    result.chunkIds.push(id);
                    result.chunkScores.push({ chunkId: id, score: hit.score });
                    if (typeof floor === 'number' && floor >= 0) {
                        result.chunkFloors.add(floor);
                    }
                }
                break;

            case 'event':
                if (!eventIdSet.has(id)) {
                    eventIdSet.add(id);
                    result.eventIds.push(id);
                }
                break;
        }
    }

    result.searchTime = Math.round(performance.now() - T0);

    xbLog.info(MODULE_ID,
        `检索完成: terms=[${terms.slice(0, 5).join(',')}] → atoms=${result.atomIds.length} chunks=${result.chunkIds.length} events=${result.eventIds.length} (${result.searchTime}ms)`
    );

    return result;
}

// ─────────────────────────────────────────────────────────────────────────
// Lazy cache management
// ─────────────────────────────────────────────────────────────────────────

/**
 * Get the lexical index (built lazily, then cached).
 *
 * The cached index is returned when the chat id and the data fingerprint
 * both match; otherwise a new index is built and cached. The fingerprint
 * only compares the atom/chunk/event counts, so a change that leaves all
 * three counts untouched is not detected here and requires
 * invalidateLexicalIndex().
 *
 * A failure to load chunks is logged and treated as "no chunks".
 *
 * @returns {Promise<MiniSearch|null>} null when there is no active chat id
 */
export async function getLexicalIndex() {
    const { chatId } = getContext();
    if (!chatId) return null;

    // Collect the current data
    const atoms = getStateAtoms() || [];
    const store = getSummaryStore();
    const events = store?.json?.events || [];

    let chunks = [];
    try {
        const loaded = await getAllChunks(chatId);
        // Guard against a non-array result so `.length` below cannot throw
        if (Array.isArray(loaded)) chunks = loaded;
    } catch (e) {
        xbLog.warn(MODULE_ID, '获取 chunks 失败', e);
    }

    const fp = computeFingerprint(atoms.length, chunks.length, events.length);

    // Cache hit
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
        return cachedIndex;
    }

    // Rebuild
    xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${String(chatId).slice(0, 8)}, fp=${fp})`);

    const index = buildLexicalIndex(atoms, chunks, events);

    cachedIndex = index;
    cachedChatId = chatId;
    cachedFingerprint = fp;

    return index;
}

/**
 * Invalidate the cache; the next getLexicalIndex() call rebuilds the index.
 *
 * Intended call sites:
 *   - CHAT_CHANGED
 *   - after L0 extraction completes (after handleAnchorGenerate)
 *   - after L2 summarization completes (in the onComplete callback)
 */
export function invalidateLexicalIndex() {
    if (cachedIndex) {
        xbLog.info(MODULE_ID, '索引缓存已失效');
    }
    cachedIndex = null;
    cachedChatId = null;
    cachedFingerprint = null;
}