import MiniSearch from '../../../../libs/minisearch.mjs';
import { getContext } from '../../../../../../../extensions.js';
import { getSummaryStore } from '../../data/store.js';
import { getAllChunks } from '../storage/chunk-store.js';
import { xbLog } from '../../../../core/debug-core.js';
import { tokenizeForIndex } from '../utils/tokenizer.js';

const MODULE_ID = 'lexical-index';

// In-memory index cache
let cachedIndex = null;
let cachedChatId = null;
let cachedFingerprint = null;
let building = false;
let buildPromise = null;

// floor -> chunk doc ids (L1 only)
let floorDocIds = new Map();

// IDF stats over lexical docs (L1 chunks + L2 events)
let termDfMap = new Map();
let docTokenSets = new Map(); // docId -> Set
let lexicalDocCount = 0;

const IDF_MIN = 1.0;
const IDF_MAX = 4.0;
const BUILD_BATCH_SIZE = 500;

function cleanSummary(summary) {
    return String(summary || '')
        .replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '')
        .trim();
}

// 32-bit FNV-1a hash; the optional seed allows chaining across payloads.
function fnv1a32(input, seed = 0x811C9DC5) {
    let hash = seed >>> 0;
    const text = String(input || '');
    for (let i = 0; i < text.length; i++) {
        hash ^= text.charCodeAt(i);
        hash = Math.imul(hash, 0x01000193) >>> 0;
    }
    return hash >>> 0;
}

function compareDocKeys(a, b) {
    const ka = `${a?.type || ''}:${a?.id || ''}`;
    const kb = `${b?.type || ''}:${b?.id || ''}`;
    if (ka < kb) return -1;
    if (ka > kb) return 1;
    return 0;
}

// Order-independent fingerprint of the doc set, used to detect when a rebuild is needed.
function computeFingerprintFromDocs(docs) {
    const normalizedDocs = Array.isArray(docs) ? [...docs].sort(compareDocKeys) : [];
    let hash = 0x811C9DC5;
    for (const doc of normalizedDocs) {
        const payload = `${doc?.type || ''}\u001F${doc?.id || ''}\u001F${doc?.floor ?? ''}\u001F${doc?.text || ''}\u001E`;
        hash = fnv1a32(payload, hash);
    }
    return `${normalizedDocs.length}:${(hash >>> 0).toString(16)}`;
}

function yieldToMain() {
    return new Promise(resolve => setTimeout(resolve, 0));
}

function clamp(v, min, max) {
    return Math.max(min, Math.min(max, v));
}

function normalizeTerm(term) {
    return String(term || '').trim().toLowerCase();
}

// Smoothed IDF, clamped to [IDF_MIN, IDF_MAX] so no single term dominates or vanishes.
function computeIdfFromDf(df, docCount) {
    if (!docCount || docCount <= 0) return 1;
    const raw = Math.log((docCount + 1) / ((df || 0) + 1)) + 1;
    return clamp(raw, IDF_MIN, IDF_MAX);
}

function computeIdf(term) {
    const t = normalizeTerm(term);
    if (!t || lexicalDocCount <= 0) return 1;
    return computeIdfFromDf(termDfMap.get(t) || 0, lexicalDocCount);
}

function extractUniqueTokens(text) {
    return new Set(tokenizeForIndex(String(text || '')).map(normalizeTerm).filter(Boolean));
}

function clearIdfState() {
    termDfMap = new Map();
    docTokenSets = new Map();
    lexicalDocCount = 0;
}

function removeDocumentIdf(docId) {
    const id = String(docId || '');
    if (!id) return;
    const tokens = docTokenSets.get(id);
    if (!tokens) return;
    for (const token of tokens) {
        const current = termDfMap.get(token) || 0;
        if (current <= 1) {
            termDfMap.delete(token);
        } else {
            termDfMap.set(token, current - 1);
        }
    }
    docTokenSets.delete(id);
    lexicalDocCount = Math.max(0, lexicalDocCount - 1);
}

function addDocumentIdf(docId, text) {
    const id = String(docId || '');
    if (!id) return;
    // Replace semantics: remove old token set first if this id already exists.
    removeDocumentIdf(id);
    const tokens = extractUniqueTokens(text);
    docTokenSets.set(id, tokens);
    lexicalDocCount += 1;
    for (const token of tokens) {
        termDfMap.set(token, (termDfMap.get(token) || 0) + 1);
    }
}
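/*
 * Worked example of the IDF bookkeeping above (illustrative values, assuming
 * tokenizeForIndex yields plain lowercase word tokens for these strings):
 *
 *   addDocumentIdf('chunk-a', 'rainy night at the harbor');
 *   addDocumentIdf('chunk-b', 'quiet night in the library');
 *   computeIdf('night');    // df=2, docCount=2 -> ln(3/3) + 1 = 1.0 (the clamp floor)
 *   computeIdf('harbor');   // df=1, docCount=2 -> ln(3/2) + 1 ≈ 1.41
 *   removeDocumentIdf('chunk-a');
 *   computeIdf('harbor');   // df=0, docCount=1 -> ln(2/1) + 1 ≈ 1.69
 */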
function rebuildIdfFromDocs(docs) {
    clearIdfState();
    for (const doc of docs || []) {
        const id = String(doc?.id || '');
        const text = String(doc?.text || '');
        if (!id || !text.trim()) continue;
        addDocumentIdf(id, text);
    }
}

function buildEventDoc(ev) {
    if (!ev?.id) return null;
    const parts = [];
    if (ev.title) parts.push(ev.title);
    if (ev.participants?.length) parts.push(ev.participants.join(' '));
    const summary = cleanSummary(ev.summary);
    if (summary) parts.push(summary);
    const text = parts.join(' ').trim();
    if (!text) return null;
    return {
        id: ev.id,
        type: 'event',
        floor: null,
        text,
    };
}

function collectDocuments(chunks, events) {
    const docs = [];
    for (const chunk of chunks || []) {
        if (!chunk?.chunkId || !chunk.text) continue;
        const floor = chunk.floor ?? -1;
        docs.push({
            id: chunk.chunkId,
            type: 'chunk',
            floor,
            text: chunk.text,
        });
        if (floor >= 0) {
            if (!floorDocIds.has(floor)) floorDocIds.set(floor, []);
            floorDocIds.get(floor).push(chunk.chunkId);
        }
    }
    for (const ev of events || []) {
        const doc = buildEventDoc(ev);
        if (doc) docs.push(doc);
    }
    return docs;
}

async function buildIndexAsync(docs) {
    const T0 = performance.now();
    const index = new MiniSearch({
        fields: ['text'],
        storeFields: ['type', 'floor'],
        idField: 'id',
        searchOptions: {
            boost: { text: 1 },
            fuzzy: 0.2,
            prefix: true,
        },
        tokenize: tokenizeForIndex,
    });
    if (!docs.length) return index;
    for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
        const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
        index.addAll(batch);
        if (i + BUILD_BATCH_SIZE < docs.length) {
            await yieldToMain();
        }
    }
    const elapsed = Math.round(performance.now() - T0);
    xbLog.info(MODULE_ID, `Index built: ${docs.length} docs (${elapsed}ms)`);
    return index;
}
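/*
 * Shape of the documents fed to buildIndexAsync (assumed sample values; real
 * chunk ids come from the chunk store and event ids from the summary store):
 *
 *   const docs = collectDocuments(
 *       [{ chunkId: 'c-12', floor: 3, text: 'They met at the old pier.' }],
 *       [{ id: 'ev-7', title: 'Reunion', participants: ['Ann', 'Bo'], summary: 'Ann finds Bo. (#12-14)' }],
 *   );
 *   // -> [{ id: 'c-12', type: 'chunk', floor: 3, text: 'They met at the old pier.' },
 *   //     { id: 'ev-7', type: 'event', floor: null, text: 'Reunion Ann Bo Ann finds Bo.' }]
 *   const index = await buildIndexAsync(docs);
 */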
/**
 * @typedef {object} LexicalSearchResult
 * @property {string[]} atomIds - Reserved for backward compatibility (currently empty).
 * @property {Set<number>} atomFloors - Reserved for backward compatibility (currently empty).
 * @property {string[]} chunkIds - Matched L1 chunk ids sorted by weighted lexical score.
 * @property {Set<number>} chunkFloors - Floor ids covered by matched chunks.
 * @property {string[]} eventIds - Matched L2 event ids sorted by weighted lexical score.
 * @property {Array<{chunkId: string, score: number}>} chunkScores - Weighted lexical scores for matched chunks.
 * @property {boolean} idfEnabled - Whether IDF stats are available for weighting.
 * @property {number} idfDocCount - Number of lexical docs used to compute IDF.
 * @property {Array<{term: string, idf: number}>} topIdfTerms - Top query terms by IDF.
 * @property {number} termSearches - Number of per-term MiniSearch queries executed.
 * @property {number} searchTime - Total lexical search time in milliseconds.
 */

/**
 * Search the lexical index by terms, using per-term MiniSearch queries and IDF-weighted score aggregation.
 * This keeps existing outputs compatible while adding observability fields.
 *
 * @param {MiniSearch} index
 * @param {string[]} terms
 * @returns {LexicalSearchResult}
 */
export function searchLexicalIndex(index, terms) {
    const T0 = performance.now();
    const result = {
        atomIds: [],
        atomFloors: new Set(),
        chunkIds: [],
        chunkFloors: new Set(),
        eventIds: [],
        chunkScores: [],
        idfEnabled: lexicalDocCount > 0,
        idfDocCount: lexicalDocCount,
        topIdfTerms: [],
        termSearches: 0,
        searchTime: 0,
    };
    if (!index || !terms?.length) {
        result.searchTime = Math.round(performance.now() - T0);
        return result;
    }
    const queryTerms = Array.from(new Set((terms || []).map(normalizeTerm).filter(Boolean)));
    const weightedScores = new Map(); // docId -> accumulated weighted score
    const hitMeta = new Map(); // docId -> { type, floor }
    const idfPairs = [];
    for (const term of queryTerms) {
        const idf = computeIdf(term);
        idfPairs.push({ term, idf });
        let hits = [];
        try {
            hits = index.search(term, {
                boost: { text: 1 },
                fuzzy: 0.2,
                prefix: true,
                combineWith: 'OR',
                tokenize: tokenizeForIndex,
            });
        } catch (e) {
            xbLog.warn(MODULE_ID, `Lexical term search failed: ${term}`, e);
            continue;
        }
        result.termSearches += 1;
        for (const hit of hits) {
            const id = String(hit.id || '');
            if (!id) continue;
            const weighted = (hit.score || 0) * idf;
            weightedScores.set(id, (weightedScores.get(id) || 0) + weighted);
            if (!hitMeta.has(id)) {
                hitMeta.set(id, {
                    type: hit.type,
                    floor: hit.floor,
                });
            }
        }
    }
    idfPairs.sort((a, b) => b.idf - a.idf);
    result.topIdfTerms = idfPairs.slice(0, 5);
    const sortedHits = Array.from(weightedScores.entries())
        .sort((a, b) => b[1] - a[1]);
    for (const [id, score] of sortedHits) {
        const meta = hitMeta.get(id);
        if (!meta) continue;
        if (meta.type === 'chunk') {
            result.chunkIds.push(id);
            result.chunkScores.push({ chunkId: id, score });
            if (typeof meta.floor === 'number' && meta.floor >= 0) {
                result.chunkFloors.add(meta.floor);
            }
            continue;
        }
        if (meta.type === 'event') {
            result.eventIds.push(id);
        }
    }
    result.searchTime = Math.round(performance.now() - T0);
    xbLog.info(
        MODULE_ID,
        `Lexical search terms=[${queryTerms.slice(0, 5).join(',')}] chunks=${result.chunkIds.length} events=${result.eventIds.length} termSearches=${result.termSearches} (${result.searchTime}ms)`,
    );
    return result;
}
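/*
 * Minimal caller-side sketch (assumed wiring, not part of this module): the
 * retrieval pipeline is expected to await the cached index and pass its
 * pre-extracted query terms.
 *
 *   const index = await getLexicalIndex(); // defined below
 *   if (index) {
 *       const { chunkIds, eventIds, chunkScores } = searchLexicalIndex(index, ['harbor', 'reunion']);
 *       // chunkIds/eventIds are ordered by IDF-weighted MiniSearch score.
 *   }
 */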
async function collectAndBuild(chatId) {
    floorDocIds = new Map();
    const store = getSummaryStore();
    const events = store?.json?.events || [];
    let chunks = [];
    try {
        chunks = await getAllChunks(chatId);
    } catch (e) {
        xbLog.warn(MODULE_ID, 'Failed to load chunks', e);
    }
    const docs = collectDocuments(chunks, events);
    const fp = computeFingerprintFromDocs(docs);
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
        return { index: cachedIndex, fingerprint: fp };
    }
    rebuildIdfFromDocs(docs);
    const index = await buildIndexAsync(docs);
    return { index, fingerprint: fp };
}

/**
 * Expose IDF accessor for query-term selection in query-builder.
 * If index stats are not ready, this gracefully falls back to idf=1.
 */
export function getLexicalIdfAccessor() {
    return {
        enabled: lexicalDocCount > 0,
        docCount: lexicalDocCount,
        getIdf(term) {
            return computeIdf(term);
        },
    };
}

export async function getLexicalIndex() {
    const { chatId } = getContext();
    if (!chatId) return null;
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
        return cachedIndex;
    }
    if (building && buildPromise) {
        try {
            await buildPromise;
            if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
                return cachedIndex;
            }
        } catch {
            // Continue to rebuild below.
        }
    }
    xbLog.info(MODULE_ID, `Lexical cache miss; rebuilding (chatId=${chatId.slice(0, 8)})`);
    building = true;
    buildPromise = collectAndBuild(chatId);
    try {
        const { index, fingerprint } = await buildPromise;
        cachedIndex = index;
        cachedChatId = chatId;
        cachedFingerprint = fingerprint;
        return index;
    } catch (e) {
        xbLog.error(MODULE_ID, 'Index build failed', e);
        return null;
    } finally {
        building = false;
        buildPromise = null;
    }
}

export function warmupIndex() {
    const { chatId } = getContext();
    if (!chatId || building) return;
    getLexicalIndex().catch(e => {
        xbLog.warn(MODULE_ID, 'Warmup failed', e);
    });
}

export function invalidateLexicalIndex() {
    if (cachedIndex) {
        xbLog.info(MODULE_ID, 'Lexical index cache invalidated');
    }
    cachedIndex = null;
    cachedChatId = null;
    cachedFingerprint = null;
    floorDocIds = new Map();
    clearIdfState();
}

export function addDocumentsForFloor(floor, chunks) {
    if (!cachedIndex || !chunks?.length) return;
    removeDocumentsByFloor(floor);
    const docs = [];
    const docIds = [];
    for (const chunk of chunks) {
        if (!chunk?.chunkId || !chunk.text) continue;
        const doc = {
            id: chunk.chunkId,
            type: 'chunk',
            floor: chunk.floor ?? floor,
            text: chunk.text,
        };
        docs.push(doc);
        docIds.push(chunk.chunkId);
    }
    if (!docs.length) return;
    cachedIndex.addAll(docs);
    floorDocIds.set(floor, docIds);
    for (const doc of docs) {
        addDocumentIdf(doc.id, doc.text);
    }
    xbLog.info(MODULE_ID, `Incremental add floor=${floor} chunks=${docs.length}`);
}

export function removeDocumentsByFloor(floor) {
    if (!cachedIndex) return;
    const docIds = floorDocIds.get(floor);
    if (!docIds?.length) return;
    for (const id of docIds) {
        try {
            cachedIndex.discard(id);
        } catch {
            // Ignore if the doc was already removed/rebuilt.
        }
        removeDocumentIdf(id);
    }
    floorDocIds.delete(floor);
    xbLog.info(MODULE_ID, `Incremental remove floor=${floor} chunks=${docIds.length}`);
}

export function addEventDocuments(events) {
    if (!cachedIndex || !events?.length) return;
    const docs = [];
    for (const ev of events) {
        const doc = buildEventDoc(ev);
        if (!doc) continue;
        try {
            cachedIndex.discard(doc.id);
        } catch {
            // Ignore if previous document does not exist.
        }
        removeDocumentIdf(doc.id);
        docs.push(doc);
    }
    if (!docs.length) return;
    cachedIndex.addAll(docs);
    for (const doc of docs) {
        addDocumentIdf(doc.id, doc.text);
    }
    xbLog.info(MODULE_ID, `Incremental add events=${docs.length}`);
}
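/*
 * Assumed maintenance flow for callers (illustrative only; variable names are
 * hypothetical): after new floors are chunked or events are re-summarized, the
 * incremental helpers above keep the cached index and IDF stats in sync without
 * a full rebuild; switching chats should invalidate instead.
 *
 *   addDocumentsForFloor(42, newChunksForFloor42); // replaces docs tracked for floor 42
 *   addEventDocuments(updatedEvents);              // discard + re-add by event id
 *   invalidateLexicalIndex();                      // e.g. when the active chat changes; next getLexicalIndex() rebuilds
 */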