2026-02-16 00:30:59 +08:00
|
|
|
import MiniSearch from '../../../../libs/minisearch.mjs';
|
|
|
|
|
import { getContext } from '../../../../../../../extensions.js';
|
|
|
|
|
import { getSummaryStore } from '../../data/store.js';
|
|
|
|
|
import { getAllChunks } from '../storage/chunk-store.js';
|
|
|
|
|
import { xbLog } from '../../../../core/debug-core.js';
|
|
|
|
|
import { tokenizeForIndex } from '../utils/tokenizer.js';
|
|
|
|
|
|
|
|
|
|
// Tag passed as the first argument to every xbLog call in this module.
const MODULE_ID = 'lexical-index';

// In-memory index cache: the built index plus the chat id and document
// fingerprint it was built for.
let cachedIndex = null;
let cachedChatId = null;
let cachedFingerprint = null;

// In-flight build tracking: flag plus the promise of the running build.
let building = false;
let buildPromise = null;

// floor -> chunk doc ids (L1 only)
let floorDocIds = new Map();

// IDF stats over lexical docs (L1 chunks + L2 events)
let termDfMap = new Map(); // token -> number of docs containing it
let docTokenSets = new Map(); // docId -> Set<token>
let lexicalDocCount = 0;

// Bounds applied to every computed IDF value.
const IDF_MIN = 1.0;
const IDF_MAX = 4.0;

// Number of documents added to the index per batch during a full build.
const BUILD_BATCH_SIZE = 500;
|
2026-02-16 00:30:59 +08:00
|
|
|
|
|
|
|
|
/**
 * Normalize an event summary for indexing.
 *
 * Strips one trailing reference marker of the form `(#12)` or `(#12-15)`
 * (with any surrounding whitespace) and trims the result. Markers that are
 * not at the end of the string are left untouched.
 *
 * @param {*} summary - Raw summary; falsy values are treated as empty.
 * @returns {string} The cleaned summary, possibly empty.
 */
function cleanSummary(summary) {
    return String(summary || '')
        .replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '')
        .trim();
}
|
|
|
|
|
|
2026-02-17 22:45:01 +08:00
|
|
|
/**
 * 32-bit FNV-1a style hash over the UTF-16 code units of a string.
 *
 * The seed parameter allows chaining: hashing `b` with the hash of `a` as the
 * seed yields the same value as hashing `a + b` in one call.
 *
 * @param {*} input - Value to hash; falsy values hash as the empty string.
 * @param {number} [seed=0x811C9DC5] - Starting hash (FNV offset basis by default).
 * @returns {number} Unsigned 32-bit hash.
 */
function fnv1a32(input, seed = 0x811C9DC5) {
    let hash = seed >>> 0;
    const text = String(input || '');
    for (let i = 0; i < text.length; i++) {
        hash ^= text.charCodeAt(i);
        // Math.imul keeps the multiplication in 32-bit integer space;
        // >>> 0 reinterprets the signed result as unsigned.
        hash = Math.imul(hash, 0x01000193) >>> 0;
    }
    return hash >>> 0;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Sort comparator that orders docs by their `type:id` key using plain
 * string comparison. Missing docs or fields contribute empty strings.
 *
 * @param {{type?: string, id?: *}} a
 * @param {{type?: string, id?: *}} b
 * @returns {number} -1, 0 or 1.
 */
function compareDocKeys(a, b) {
    const ka = `${a?.type || ''}:${a?.id || ''}`;
    const kb = `${b?.type || ''}:${b?.id || ''}`;
    if (ka < kb) return -1;
    if (ka > kb) return 1;
    return 0;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Compute an order-independent fingerprint of a document set.
 *
 * Docs are copied and sorted by `type:id`, then each doc's type, id, floor and
 * text are folded into a running FNV-1a hash. Fields are separated by U+001F
 * and records terminated by U+001E so adjacent fields cannot run together.
 *
 * @param {Array<{id: *, type: string, floor: *, text: string}>} docs
 *   Non-array input is treated as an empty set.
 * @returns {string} `<docCount>:<hash in hex>`.
 */
function computeFingerprintFromDocs(docs) {
    const normalizedDocs = Array.isArray(docs) ? [...docs].sort(compareDocKeys) : [];
    let hash = 0x811C9DC5;

    for (const doc of normalizedDocs) {
        const payload = `${doc?.type || ''}\u001F${doc?.id || ''}\u001F${doc?.floor ?? ''}\u001F${doc?.text || ''}\u001E`;
        hash = fnv1a32(payload, hash);
    }

    return `${normalizedDocs.length}:${(hash >>> 0).toString(16)}`;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Yield control back to the event loop.
 *
 * @returns {Promise<void>} Resolves on a zero-delay timer.
 */
function yieldToMain() {
    return new Promise(resolve => setTimeout(resolve, 0));
}
|
|
|
|
|
|
2026-02-17 22:45:01 +08:00
|
|
|
/**
 * Clamp a number into the inclusive range [min, max].
 *
 * @param {number} v
 * @param {number} min
 * @param {number} max
 * @returns {number}
 */
function clamp(v, min, max) {
    return Math.max(min, Math.min(max, v));
}
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a term for lookup: stringify, trim and lower-case.
 *
 * @param {*} term - Falsy values normalize to the empty string.
 * @returns {string}
 */
function normalizeTerm(term) {
    return String(term || '').trim().toLowerCase();
}
|
|
|
|
|
|
|
|
|
|
/**
 * Smoothed inverse document frequency, clamped to [IDF_MIN, IDF_MAX].
 *
 * Formula: ln((docCount + 1) / (df + 1)) + 1.
 *
 * @param {number} df - Number of docs containing the term (falsy counts as 0).
 * @param {number} docCount - Total number of docs.
 * @returns {number} 1 when docCount is missing or not positive, otherwise the clamped IDF.
 */
function computeIdfFromDf(df, docCount) {
    if (!docCount || docCount <= 0) return 1;
    const raw = Math.log((docCount + 1) / ((df || 0) + 1)) + 1;
    return clamp(raw, IDF_MIN, IDF_MAX);
}
|
|
|
|
|
|
|
|
|
|
/**
 * IDF of a term against the module-level document-frequency stats.
 *
 * @param {*} term - Normalized with normalizeTerm before lookup.
 * @returns {number} 1 when the term is empty or no docs are tracked;
 *   otherwise the clamped IDF (unknown terms use df = 0).
 */
function computeIdf(term) {
    const t = normalizeTerm(term);
    if (!t || lexicalDocCount <= 0) return 1;
    return computeIdfFromDf(termDfMap.get(t) || 0, lexicalDocCount);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tokenize text with the index tokenizer and return the distinct,
 * normalized, non-empty tokens.
 *
 * @param {*} text - Falsy values are treated as empty text.
 * @returns {Set<string>}
 */
function extractUniqueTokens(text) {
    return new Set(tokenizeForIndex(String(text || '')).map(normalizeTerm).filter(Boolean));
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reset all module-level IDF statistics (document frequencies, per-doc token
 * sets and the doc counter) to their empty state.
 */
function clearIdfState() {
    termDfMap = new Map();
    docTokenSets = new Map();
    lexicalDocCount = 0;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Remove one document's contribution from the IDF statistics.
 *
 * No-op when the id is empty or the document is not tracked.
 *
 * @param {*} docId - Document id; compared by its string form.
 */
function removeDocumentIdf(docId) {
    const id = String(docId || '');
    if (!id) return;

    const tokens = docTokenSets.get(id);
    if (!tokens) return;

    for (const token of tokens) {
        const current = termDfMap.get(token) || 0;
        if (current <= 1) {
            // Last doc containing this token: drop the entry instead of storing 0.
            termDfMap.delete(token);
        } else {
            termDfMap.set(token, current - 1);
        }
    }

    docTokenSets.delete(id);
    lexicalDocCount = Math.max(0, lexicalDocCount - 1);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Add (or replace) one document's contribution to the IDF statistics.
 *
 * Each distinct token of the text increments its document frequency once.
 *
 * @param {*} docId - Document id; tracked by its string form. Empty ids are ignored.
 * @param {*} text - Document text to tokenize.
 */
function addDocumentIdf(docId, text) {
    const id = String(docId || '');
    if (!id) return;

    // Replace semantics: remove old token set first if this id already exists.
    removeDocumentIdf(id);

    const tokens = extractUniqueTokens(text);
    docTokenSets.set(id, tokens);
    lexicalDocCount += 1;

    for (const token of tokens) {
        termDfMap.set(token, (termDfMap.get(token) || 0) + 1);
    }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Rebuild the IDF statistics from scratch for the given documents.
 *
 * Docs without an id or with blank text are skipped.
 *
 * @param {Array<{id: *, text: string}>} docs - Nullish input clears the stats.
 */
function rebuildIdfFromDocs(docs) {
    clearIdfState();
    for (const doc of docs || []) {
        const id = String(doc?.id || '');
        const text = String(doc?.text || '');
        if (!id || !text.trim()) continue;
        addDocumentIdf(id, text);
    }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Convert an event record into an indexable document.
 *
 * The text is the space-joined title, participants and cleaned summary
 * (whichever are present).
 *
 * @param {{id?: *, title?: string, participants?: string[], summary?: string}} ev
 * @returns {{id: *, type: 'event', floor: null, text: string} | null}
 *   null when the event has no id or produces no text.
 */
function buildEventDoc(ev) {
    if (!ev?.id) return null;

    const parts = [];
    if (ev.title) parts.push(ev.title);
    // Only join real arrays: a non-empty string also has a truthy `.length`
    // but no `.join`, which would throw.
    if (Array.isArray(ev.participants) && ev.participants.length) {
        parts.push(ev.participants.join(' '));
    }

    const summary = cleanSummary(ev.summary);
    if (summary) parts.push(summary);

    const text = parts.join(' ').trim();
    if (!text) return null;

    return {
        id: ev.id,
        type: 'event',
        floor: null,
        text,
    };
}
|
2026-02-16 00:30:59 +08:00
|
|
|
|
|
|
|
|
/**
 * Build the flat list of indexable documents from chunks and events.
 *
 * Side effect: every chunk with a floor >= 0 has its id appended to the
 * module-level floorDocIds entry for that floor. The map is not cleared here.
 *
 * @param {Array<{chunkId: *, text: string, floor?: number}>} chunks
 *   Chunks without an id or text are skipped; a missing floor becomes -1.
 * @param {Array<object>} events - Converted with buildEventDoc; events
 *   yielding no doc are skipped.
 * @returns {Array<{id: *, type: string, floor: *, text: string}>}
 */
function collectDocuments(chunks, events) {
    const docs = [];

    for (const chunk of chunks || []) {
        if (!chunk?.chunkId || !chunk.text) continue;

        const floor = chunk.floor ?? -1;
        docs.push({
            id: chunk.chunkId,
            type: 'chunk',
            floor,
            text: chunk.text,
        });

        if (floor >= 0) {
            if (!floorDocIds.has(floor)) floorDocIds.set(floor, []);
            floorDocIds.get(floor).push(chunk.chunkId);
        }
    }

    for (const ev of events || []) {
        const doc = buildEventDoc(ev);
        if (doc) docs.push(doc);
    }

    return docs;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build a MiniSearch index over the given documents.
 *
 * Documents are added in batches of BUILD_BATCH_SIZE, yielding to the event
 * loop between batches.
 *
 * @param {Array<{id: *, type: string, floor: *, text: string}>} docs
 * @returns {Promise<MiniSearch>} The populated index (empty when docs is empty).
 */
async function buildIndexAsync(docs) {
    const T0 = performance.now();

    const index = new MiniSearch({
        fields: ['text'],
        storeFields: ['type', 'floor'],
        idField: 'id',
        searchOptions: {
            boost: { text: 1 },
            fuzzy: 0.2,
            prefix: true,
        },
        tokenize: tokenizeForIndex,
    });

    if (!docs.length) return index;

    for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
        const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
        index.addAll(batch);

        // Skip the yield after the final batch.
        if (i + BUILD_BATCH_SIZE < docs.length) {
            await yieldToMain();
        }
    }

    const elapsed = Math.round(performance.now() - T0);
    xbLog.info(MODULE_ID, `Index built: ${docs.length} docs (${elapsed}ms)`);
    return index;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @typedef {object} LexicalSearchResult
 * @property {string[]} atomIds - Reserved for backward compatibility (currently empty).
 * @property {Set<number>} atomFloors - Reserved for backward compatibility (currently empty).
 * @property {string[]} chunkIds - Matched L1 chunk ids sorted by weighted lexical score.
 * @property {Set<number>} chunkFloors - Floor ids covered by matched chunks.
 * @property {string[]} eventIds - Matched L2 event ids sorted by weighted lexical score.
 * @property {object[]} chunkScores - Weighted lexical scores for matched chunks.
 * @property {boolean} idfEnabled - Whether IDF stats are available for weighting.
 * @property {number} idfDocCount - Number of lexical docs used to compute IDF.
 * @property {Array<{term:string,idf:number}>} topIdfTerms - Top query terms by IDF.
 * @property {string[]} queryTerms - Normalized query terms actually searched.
 * @property {Record<string, Array<{floor:number, weightedScore:number, chunkId:string}>>} termFloorHits - Chunk-floor hits by term.
 * @property {Array<{floor:number, score:number, hitTermsCount:number}>} floorLexScores - Aggregated lexical floor scores (debug).
 * @property {number} termSearches - Number of per-term MiniSearch queries executed.
 * @property {number} searchTime - Total lexical search time in milliseconds.
 */

/**
 * Search lexical index by terms, using per-term MiniSearch and IDF-weighted score aggregation.
 * This keeps existing outputs compatible while adding observability fields.
 *
 * Each distinct normalized term is searched on its own; every hit's score is
 * multiplied by the term's IDF and summed per document. A term whose search
 * throws is logged and skipped.
 *
 * @param {MiniSearch} index
 * @param {string[]} terms
 * @returns {LexicalSearchResult} An empty result when the index is missing or
 *   `terms` is not a non-empty array.
 */
export function searchLexicalIndex(index, terms) {
    const T0 = performance.now();

    const result = {
        atomIds: [],
        atomFloors: new Set(),
        chunkIds: [],
        chunkFloors: new Set(),
        eventIds: [],
        chunkScores: [],
        idfEnabled: lexicalDocCount > 0,
        idfDocCount: lexicalDocCount,
        topIdfTerms: [],
        queryTerms: [],
        termFloorHits: {},
        floorLexScores: [],
        termSearches: 0,
        searchTime: 0,
    };

    // Array.isArray guards against values such as a plain string, which has a
    // truthy `.length` but no `.map`.
    if (!index || !Array.isArray(terms) || !terms.length) {
        result.searchTime = Math.round(performance.now() - T0);
        return result;
    }

    const queryTerms = Array.from(new Set(terms.map(normalizeTerm).filter(Boolean)));
    result.queryTerms = [...queryTerms];

    const weightedScores = new Map(); // docId -> score
    const hitMeta = new Map(); // docId -> { type, floor }
    const idfPairs = [];
    const termFloorHits = new Map(); // term -> [{ floor, weightedScore, chunkId }]
    const floorLexAgg = new Map(); // floor -> { score, terms:Set<string> }

    for (const term of queryTerms) {
        const idf = computeIdf(term);
        idfPairs.push({ term, idf });

        let hits = [];
        try {
            hits = index.search(term, {
                boost: { text: 1 },
                fuzzy: 0.2,
                prefix: true,
                combineWith: 'OR',
                tokenize: tokenizeForIndex,
            });
        } catch (e) {
            xbLog.warn(MODULE_ID, `Lexical term search failed: ${term}`, e);
            continue;
        }

        // Only searches that completed are counted.
        result.termSearches += 1;

        for (const hit of hits) {
            const id = String(hit.id || '');
            if (!id) continue;

            const weighted = (hit.score || 0) * idf;
            weightedScores.set(id, (weightedScores.get(id) || 0) + weighted);

            if (!hitMeta.has(id)) {
                hitMeta.set(id, {
                    type: hit.type,
                    floor: hit.floor,
                });
            }

            if (hit.type === 'chunk' && typeof hit.floor === 'number' && hit.floor >= 0) {
                if (!termFloorHits.has(term)) termFloorHits.set(term, []);
                termFloorHits.get(term).push({
                    floor: hit.floor,
                    weightedScore: weighted,
                    chunkId: id,
                });

                const floorAgg = floorLexAgg.get(hit.floor) || { score: 0, terms: new Set() };
                floorAgg.score += weighted;
                floorAgg.terms.add(term);
                floorLexAgg.set(hit.floor, floorAgg);
            }
        }
    }

    idfPairs.sort((a, b) => b.idf - a.idf);
    result.topIdfTerms = idfPairs.slice(0, 5);
    result.termFloorHits = Object.fromEntries(termFloorHits);
    result.floorLexScores = [...floorLexAgg.entries()]
        .map(([floor, info]) => ({
            floor,
            score: Number(info.score.toFixed(6)),
            hitTermsCount: info.terms.size,
        }))
        .sort((a, b) => b.score - a.score);

    const sortedHits = Array.from(weightedScores.entries())
        .sort((a, b) => b[1] - a[1]);

    for (const [id, score] of sortedHits) {
        const meta = hitMeta.get(id);
        if (!meta) continue;

        if (meta.type === 'chunk') {
            result.chunkIds.push(id);
            result.chunkScores.push({ chunkId: id, score });
            if (typeof meta.floor === 'number' && meta.floor >= 0) {
                result.chunkFloors.add(meta.floor);
            }
            continue;
        }

        if (meta.type === 'event') {
            result.eventIds.push(id);
        }
    }

    result.searchTime = Math.round(performance.now() - T0);

    xbLog.info(
        MODULE_ID,
        `Lexical search terms=[${queryTerms.slice(0, 5).join(',')}] chunks=${result.chunkIds.length} events=${result.eventIds.length} termSearches=${result.termSearches} (${result.searchTime}ms)`,
    );

    return result;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Load chunks and events for a chat, then build (or reuse) the lexical index.
 *
 * If loading chunks fails, a warning is logged and the index is built from
 * events only. When the cached index already matches this chat and document
 * fingerprint it is returned as-is; otherwise the IDF stats and the index are
 * rebuilt from the collected documents.
 *
 * @param {*} chatId - Chat whose chunks are loaded.
 * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
 */
async function collectAndBuild(chatId) {
    const store = getSummaryStore();
    const events = store?.json?.events || [];

    let chunks = [];
    try {
        chunks = await getAllChunks(chatId);
    } catch (e) {
        xbLog.warn(MODULE_ID, 'Failed to load chunks', e);
    }

    // Reset the floor map only after the await above, immediately before the
    // synchronous collection that repopulates it. Resetting earlier would let
    // incremental updates that run during the await leak into the rebuilt lists.
    floorDocIds = new Map();
    const docs = collectDocuments(chunks, events);
    const fp = computeFingerprintFromDocs(docs);

    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
        return { index: cachedIndex, fingerprint: fp };
    }

    rebuildIdfFromDocs(docs);
    const index = await buildIndexAsync(docs);

    return { index, fingerprint: fp };
}
|
|
|
|
|
|
|
|
|
|
/**
 * Expose IDF accessor for query-term selection in query-builder.
 * If index stats are not ready, this gracefully falls back to idf=1.
 *
 * `enabled` and `docCount` are snapshots taken when this function is called;
 * `getIdf` reads the current module-level stats on every call.
 *
 * @returns {{enabled: boolean, docCount: number, getIdf: (term: string) => number}}
 */
export function getLexicalIdfAccessor() {
    return {
        enabled: lexicalDocCount > 0,
        docCount: lexicalDocCount,
        getIdf(term) {
            return computeIdf(term);
        },
    };
}
|
|
|
|
|
|
2026-02-16 00:30:59 +08:00
|
|
|
/**
 * Return the lexical index for the current chat, building it if necessary.
 *
 * A cached index for the same chat is returned immediately. If a build is in
 * flight, this call waits for it and starts its own build only if the cache
 * still does not match afterwards.
 *
 * @returns {Promise<MiniSearch|null>} The index, or null when there is no
 *   current chat or the build failed (the failure is logged).
 */
export async function getLexicalIndex() {
    const { chatId } = getContext();
    if (!chatId) return null;

    const hasUsableCache = () => Boolean(cachedIndex && cachedChatId === chatId && cachedFingerprint);
    if (hasUsableCache()) return cachedIndex;

    // Loop rather than wait once: when several callers wait on the same build,
    // the first to resume may start a new build, and the rest must wait for
    // that one instead of launching overlapping builds over shared state.
    while (building && buildPromise) {
        const pending = buildPromise;
        try {
            await pending;
        } catch {
            // Continue to rebuild below.
        }
        if (hasUsableCache()) return cachedIndex;
        // Defensive: never spin on a promise whose builder did not clean up.
        if (buildPromise === pending) break;
    }

    // String() keeps the log line from throwing on a non-string chat id.
    xbLog.info(MODULE_ID, `Lexical cache miss; rebuilding (chatId=${String(chatId).slice(0, 8)})`);

    const ownBuild = collectAndBuild(chatId);
    building = true;
    buildPromise = ownBuild;

    try {
        const { index, fingerprint } = await ownBuild;
        cachedIndex = index;
        cachedChatId = chatId;
        cachedFingerprint = fingerprint;
        return index;
    } catch (e) {
        xbLog.error(MODULE_ID, 'Index build failed', e);
        return null;
    } finally {
        // Only clear the in-flight markers if they still describe this build.
        if (buildPromise === ownBuild) {
            building = false;
            buildPromise = null;
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Start building the lexical index in the background without awaiting it.
 *
 * Does nothing when there is no current chat or a build is already running.
 * A rejection from the build is logged as a warning.
 */
export function warmupIndex() {
    const { chatId } = getContext();
    if (!chatId || building) return;

    getLexicalIndex().catch(e => {
        xbLog.warn(MODULE_ID, 'Warmup failed', e);
    });
}
|
|
|
|
|
|
|
|
|
|
/**
 * Drop the cached index together with its floor map and IDF statistics.
 *
 * Logs only when there actually was a cached index.
 */
export function invalidateLexicalIndex() {
    if (cachedIndex) {
        xbLog.info(MODULE_ID, 'Lexical index cache invalidated');
    }
    cachedIndex = null;
    cachedChatId = null;
    cachedFingerprint = null;
    floorDocIds = new Map();
    clearIdfState();
}
|
|
|
|
|
|
|
|
|
|
/**
 * Incrementally replace the indexed chunks of one floor.
 *
 * Existing documents tracked for the floor are removed first, then the given
 * chunks are added to the cached index, the floor map and the IDF stats.
 * No-op when there is no cached index or no chunks are given.
 *
 * @param {number} floor - Floor the chunks are tracked under; also the
 *   fallback for a chunk without its own `floor`.
 * @param {Array<{chunkId: *, text: string, floor?: number}>} chunks
 *   Chunks without an id or text are skipped; for duplicate ids the last
 *   occurrence wins.
 */
export function addDocumentsForFloor(floor, chunks) {
    if (!cachedIndex || !chunks?.length) return;

    removeDocumentsByFloor(floor);

    // Deduplicate by id: addAll throws on a duplicate id, which would leave
    // the index, floor map and IDF stats out of sync.
    const docsById = new Map();
    for (const chunk of chunks) {
        if (!chunk?.chunkId || !chunk.text) continue;
        docsById.set(chunk.chunkId, {
            id: chunk.chunkId,
            type: 'chunk',
            floor: chunk.floor ?? floor,
            text: chunk.text,
        });
    }

    if (!docsById.size) return;

    const docs = [...docsById.values()];
    const docIds = docs.map(doc => doc.id);

    // Same replace semantics as addEventDocuments: drop any copy of the id
    // that is still indexed (e.g. tracked under another floor) before adding.
    const displaced = new Set();
    for (const id of docIds) {
        try {
            cachedIndex.discard(id);
            displaced.add(id);
        } catch {
            // Ignore if the doc is not currently indexed.
        }
    }

    if (displaced.size) {
        // Other floors must not keep ids that now belong to this floor,
        // otherwise removing those floors later would discard these docs.
        for (const [otherFloor, ids] of floorDocIds) {
            const kept = ids.filter(id => !displaced.has(id));
            if (kept.length !== ids.length) floorDocIds.set(otherFloor, kept);
        }
    }

    cachedIndex.addAll(docs);
    floorDocIds.set(floor, docIds);

    for (const doc of docs) {
        addDocumentIdf(doc.id, doc.text);
    }

    xbLog.info(MODULE_ID, `Incremental add floor=${floor} chunks=${docs.length}`);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Incrementally remove every indexed chunk tracked for a floor.
 *
 * Each tracked id is discarded from the cached index and from the IDF stats,
 * then the floor's entry is dropped. No-op when there is no cached index or
 * the floor has no tracked documents.
 *
 * @param {number} floor
 */
export function removeDocumentsByFloor(floor) {
    if (!cachedIndex) return;

    const docIds = floorDocIds.get(floor);
    if (!docIds?.length) return;

    for (const id of docIds) {
        try {
            cachedIndex.discard(id);
        } catch {
            // Ignore if the doc was already removed/rebuilt.
        }
        removeDocumentIdf(id);
    }

    floorDocIds.delete(floor);
    xbLog.info(MODULE_ID, `Incremental remove floor=${floor} chunks=${docIds.length}`);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Incrementally add or replace event documents in the cached index.
 *
 * Each event is converted with buildEventDoc; any previous document with the
 * same id is discarded from the index and the IDF stats before the new one is
 * added. No-op when there is no cached index or no events are given.
 *
 * @param {Array<object>} events - Events yielding no document are skipped;
 *   for duplicate ids the last occurrence wins.
 */
export function addEventDocuments(events) {
    if (!cachedIndex || !events?.length) return;

    // Deduplicate by id: addAll throws on a duplicate id, which would leave
    // the index updated for part of the batch and the IDF stats not at all.
    const docsById = new Map();
    for (const ev of events) {
        const doc = buildEventDoc(ev);
        if (doc) docsById.set(doc.id, doc);
    }

    if (!docsById.size) return;

    const docs = [...docsById.values()];

    for (const doc of docs) {
        try {
            cachedIndex.discard(doc.id);
        } catch {
            // Ignore if previous document does not exist.
        }
        removeDocumentIdf(doc.id);
    }

    cachedIndex.addAll(docs);
    for (const doc of docs) {
        addDocumentIdf(doc.id, doc.text);
    }

    xbLog.info(MODULE_ID, `Incremental add events=${docs.length}`);
}
|