Update retrieval, rerank, and indexing changes
This commit is contained in:
@@ -14,7 +14,6 @@
|
||||
import MiniSearch from '../../../../libs/minisearch.mjs';
|
||||
import { getContext } from '../../../../../../../extensions.js';
|
||||
import { getSummaryStore } from '../../data/store.js';
|
||||
import { getStateAtoms } from '../storage/state-store.js';
|
||||
import { getAllChunks } from '../storage/chunk-store.js';
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import { tokenizeForIndex } from '../utils/tokenizer.js';
|
||||
@@ -39,6 +38,8 @@ let building = false;
|
||||
|
||||
/** @type {Promise<MiniSearch|null>|null} 当前构建 Promise(防重入) */
|
||||
let buildPromise = null;
|
||||
/** @type {Map<number, string[]>} floor → 该楼层的 doc IDs(仅 L1 chunks) */
|
||||
let floorDocIds = new Map();
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 工具函数
|
||||
@@ -57,13 +58,12 @@ function cleanSummary(summary) {
|
||||
|
||||
/**
 * Compute the cache fingerprint used to decide whether the cached index
 * can be reused.
 *
 * The fingerprint is built from document counts only, so it changes when
 * chunks or events are added or removed, but not when an existing
 * document's content changes while the counts stay the same.
 *
 * @param {number} chunkCount - number of L1 chunks
 * @param {number} eventCount - number of L2 events
 * @returns {string} fingerprint in the form "<chunkCount>:<eventCount>"
 */
function computeFingerprint(chunkCount, eventCount) {
    return `${chunkCount}:${eventCount}`;
}
|
||||
|
||||
/**
|
||||
@@ -81,34 +81,31 @@ function yieldToMain() {
|
||||
/**
|
||||
* 收集所有待索引文档
|
||||
*
|
||||
* @param {object[]} atoms - getStateAtoms() 返回值
|
||||
* @param {object[]} chunks - getAllChunks(chatId) 返回值
|
||||
* @param {object[]} events - store.json.events
|
||||
* @returns {object[]} 文档数组
|
||||
*/
|
||||
function collectDocuments(atoms, chunks, events) {
|
||||
function collectDocuments(chunks, events) {
|
||||
const docs = [];
|
||||
|
||||
// L0 atoms
|
||||
for (const atom of (atoms || [])) {
|
||||
if (!atom?.atomId || !atom.semantic) continue;
|
||||
docs.push({
|
||||
id: atom.atomId,
|
||||
type: 'atom',
|
||||
floor: atom.floor ?? -1,
|
||||
text: atom.semantic,
|
||||
});
|
||||
}
|
||||
|
||||
// L1 chunks
|
||||
// L1 chunks + 填充 floorDocIds
|
||||
for (const chunk of (chunks || [])) {
|
||||
if (!chunk?.chunkId || !chunk.text) continue;
|
||||
|
||||
const floor = chunk.floor ?? -1;
|
||||
docs.push({
|
||||
id: chunk.chunkId,
|
||||
type: 'chunk',
|
||||
floor: chunk.floor ?? -1,
|
||||
floor,
|
||||
text: chunk.text,
|
||||
});
|
||||
|
||||
if (floor >= 0) {
|
||||
if (!floorDocIds.has(floor)) {
|
||||
floorDocIds.set(floor, []);
|
||||
}
|
||||
floorDocIds.get(floor).push(chunk.chunkId);
|
||||
}
|
||||
}
|
||||
|
||||
// L2 events
|
||||
@@ -244,7 +241,6 @@ export function searchLexicalIndex(index, terms) {
|
||||
}
|
||||
|
||||
// 分类结果
|
||||
const atomIdSet = new Set();
|
||||
const chunkIdSet = new Set();
|
||||
const eventIdSet = new Set();
|
||||
|
||||
@@ -254,16 +250,6 @@ export function searchLexicalIndex(index, terms) {
|
||||
const floor = hit.floor;
|
||||
|
||||
switch (type) {
|
||||
case 'atom':
|
||||
if (!atomIdSet.has(id)) {
|
||||
atomIdSet.add(id);
|
||||
result.atomIds.push(id);
|
||||
if (typeof floor === 'number' && floor >= 0) {
|
||||
result.atomFloors.add(floor);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 'chunk':
|
||||
if (!chunkIdSet.has(id)) {
|
||||
chunkIdSet.add(id);
|
||||
@@ -304,8 +290,10 @@ export function searchLexicalIndex(index, terms) {
|
||||
* @returns {Promise<{index: MiniSearch, fingerprint: string}>}
|
||||
*/
|
||||
async function collectAndBuild(chatId) {
|
||||
// 收集数据
|
||||
const atoms = getStateAtoms() || [];
|
||||
// 清空侧索引(全量重建)
|
||||
floorDocIds = new Map();
|
||||
|
||||
// 收集数据(不含 L0 atoms)
|
||||
const store = getSummaryStore();
|
||||
const events = store?.json?.events || [];
|
||||
|
||||
@@ -316,15 +304,15 @@ async function collectAndBuild(chatId) {
|
||||
xbLog.warn(MODULE_ID, '获取 chunks 失败', e);
|
||||
}
|
||||
|
||||
const fp = computeFingerprint(atoms.length, chunks.length, events.length);
|
||||
const fp = computeFingerprint(chunks.length, events.length);
|
||||
|
||||
// 检查是否在收集过程中缓存已被其他调用更新
|
||||
if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
|
||||
return { index: cachedIndex, fingerprint: fp };
|
||||
}
|
||||
|
||||
// 收集文档
|
||||
const docs = collectDocuments(atoms, chunks, events);
|
||||
// 收集文档(同时填充 floorDocIds)
|
||||
const docs = collectDocuments(chunks, events);
|
||||
|
||||
// 异步分片构建
|
||||
const index = await buildIndexAsync(docs);
|
||||
@@ -438,4 +426,116 @@ export function invalidateLexicalIndex() {
|
||||
cachedIndex = null;
|
||||
cachedChatId = null;
|
||||
cachedFingerprint = null;
|
||||
floorDocIds = new Map();
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 增量更新接口
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Incrementally (re)index the L1 chunks of a single floor.
 *
 * Documents previously tracked for the floor are removed first, then the
 * new ones are added. If there is no cached index, or `chunks` is empty,
 * the call is a no-op (nothing is removed either); the next
 * getLexicalIndex call is expected to rebuild from scratch.
 *
 * Chunks without a `chunkId` or `text` are skipped. If every chunk is
 * skipped, the floor's old documents have already been removed and nothing
 * is added.
 *
 * @param {number} floor - floor number
 * @param {object[]} chunks - chunk objects (need chunkId, text, floor)
 */
export function addDocumentsForFloor(floor, chunks) {
    if (!cachedIndex || !chunks?.length) return;

    // Drop whatever is currently tracked for this floor.
    removeDocumentsByFloor(floor);

    // Key by chunkId so a repeated ID keeps only its last occurrence:
    // addAll() rejects duplicate IDs, and failing half-way would leave the
    // index and floorDocIds out of sync.
    const docsById = new Map();
    for (const chunk of chunks) {
        if (!chunk?.chunkId || !chunk.text) continue;
        docsById.set(chunk.chunkId, {
            id: chunk.chunkId,
            type: 'chunk',
            floor: chunk.floor ?? floor,
            text: chunk.text,
        });
    }

    if (docsById.size === 0) return;

    const docIds = [...docsById.keys()];

    // Best-effort overwrite: an ID may still be in the index without being
    // tracked under this floor. discard() throws when the ID is unknown,
    // which is the normal case here and safe to ignore.
    for (const id of docIds) {
        try {
            cachedIndex.discard(id);
        } catch {
            // Not indexed - nothing to replace.
        }
    }

    cachedIndex.addAll([...docsById.values()]);
    floorDocIds.set(floor, docIds);
    xbLog.info(MODULE_ID, `增量添加: floor ${floor}, ${docIds.length} 个 chunk`);
}
|
||||
|
||||
/**
 * Remove every L1 chunk document tracked for the given floor from the index.
 *
 * Uses MiniSearch discard() (soft delete). If there is no cached index, or
 * nothing is tracked for the floor, the call is a silent no-op.
 *
 * @param {number} floor - floor number
 */
export function removeDocumentsByFloor(floor) {
    if (!cachedIndex) return;

    const docIds = floorDocIds.get(floor);
    if (!docIds?.length) return;

    // Count only successful discards so the log reflects what really happened.
    let removedCount = 0;
    for (const id of docIds) {
        try {
            cachedIndex.discard(id);
            removedCount += 1;
        } catch {
            // The document may no longer exist (e.g. replaced by a full
            // rebuild); removal is best-effort.
        }
    }

    floorDocIds.delete(floor);
    xbLog.info(MODULE_ID, `增量移除: floor ${floor}, ${removedCount} 个文档`);
}
|
||||
|
||||
/**
 * Add L2 events to the index.
 *
 * Each event's indexed text is its title, its participants joined by
 * spaces, and its cleaned summary. Events without an `id`, or whose text
 * ends up empty, are skipped. An event whose ID is already indexed is
 * overwritten (discard, then add). If the same ID appears more than once
 * in `events`, the last usable occurrence wins.
 *
 * If there is no cached index, or `events` is empty, the call is a silent
 * no-op.
 *
 * @param {object[]} events - event objects (need id, title, summary, etc.)
 */
export function addEventDocuments(events) {
    if (!cachedIndex || !events?.length) return;

    // Key by event id: addAll() rejects duplicate IDs, so a repeated id in
    // the same batch must collapse to a single document.
    const docsById = new Map();

    for (const ev of events) {
        if (!ev?.id) continue;

        const parts = [];
        if (ev.title) parts.push(ev.title);
        if (ev.participants?.length) parts.push(ev.participants.join(' '));
        const summary = cleanSummary(ev.summary);
        if (summary) parts.push(summary);

        const text = parts.join(' ').trim();
        if (!text) continue;

        docsById.set(ev.id, {
            id: ev.id,
            type: 'event',
            floor: null,
            text,
        });
    }

    if (docsById.size === 0) return;

    // Overwrite semantics: drop any previously indexed version first.
    for (const id of docsById.keys()) {
        try {
            cachedIndex.discard(id);
        } catch {
            // Not indexed yet - nothing to replace.
        }
    }

    cachedIndex.addAll([...docsById.values()]);
    xbLog.info(MODULE_ID, `增量添加: ${docsById.size} 个事件`);
}
|
||||
|
||||
Reference in New Issue
Block a user