Upload LittleWhiteBox extension
This commit is contained in:
376
modules/story-summary/vector/llm/atom-extraction.js
Normal file
376
modules/story-summary/vector/llm/atom-extraction.js
Normal file
@@ -0,0 +1,376 @@
|
||||
// ============================================================================
|
||||
// atom-extraction.js - L0 场景锚点提取(v2 - 场景摘要 + 图结构)
|
||||
//
|
||||
// 设计依据:
|
||||
// - BGE-M3 (BAAI, 2024): 自然语言段落检索精度最高 → semantic = 纯自然语言
|
||||
// - TransE (Bordes, 2013): s/t/r 三元组方向性 → edges 格式
|
||||
//
|
||||
// 每楼层 1-2 个场景锚点(非碎片原子),60-100 字场景摘要
|
||||
// ============================================================================
|
||||
|
||||
import { callLLM, parseJson } from './llm-service.js';
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import { filterText } from '../utils/text-filter.js';
|
||||
|
||||
// Module identifier used in xbLog output.
const MODULE_ID = 'atom-extraction';

// Number of rounds extracted in parallel per batch.
const CONCURRENCY = 10;
// Retries per round beyond the first attempt.
const RETRY_COUNT = 2;
// Base delay (ms) between retries; scaled by attempt number on thrown errors.
const RETRY_DELAY = 500;
// Per-call LLM timeout (ms), forwarded in options.
const DEFAULT_TIMEOUT = 20000;
// Stagger (ms) between request launches in the first batch to avoid a burst.
const STAGGER_DELAY = 80;

// Cooperative cancellation flag shared by the batch loop and the per-round
// workers; reset to false at the start of each batchExtractAtoms run.
let batchCancelled = false;

/** Request cancellation of an in-flight batch extraction. */
export function cancelBatchExtraction() {
    batchCancelled = true;
}

/** @returns {boolean} whether the current batch extraction was cancelled. */
export function isBatchCancelled() {
    return batchCancelled;
}
|
||||
|
||||
// ============================================================================
|
||||
// L0 提取 Prompt
|
||||
// ============================================================================
|
||||
|
||||
const SYSTEM_PROMPT = `你是场景摘要器。从一轮对话中提取1-2个场景锚点,用于语义检索和关系追踪。
|
||||
|
||||
输入格式:
|
||||
<round>
|
||||
<user name="用户名">...</user>
|
||||
<assistant>...</assistant>
|
||||
</round>
|
||||
|
||||
只输出严格JSON:
|
||||
{"anchors":[
|
||||
{
|
||||
"scene": "60-100字完整场景描述",
|
||||
"edges": [{"s":"施事方","t":"受事方","r":"互动行为"}],
|
||||
"where": "地点"
|
||||
}
|
||||
]}
|
||||
|
||||
## scene 写法
|
||||
- 纯自然语言,像旁白或日记,不要任何标签/标记/枚举值
|
||||
- 必须包含:角色名、动作、情感氛围、关键细节
|
||||
- 读者只看 scene 就能复原这一幕
|
||||
- 60-100字,信息密集但流畅
|
||||
|
||||
## edges(关系三元组)
|
||||
- s=施事方 t=受事方 r=互动行为(建议 6-12 字,最多 20 字)
|
||||
- s/t 必须是参与互动的角色正式名称,不用代词或别称
|
||||
- 只从正文内容中识别角色名,不要把标签名(如 user、assistant)当作角色
|
||||
- r 使用动作模板短语:“动作+对象/结果”(例:“提出交易条件”、“拒绝对方请求”、“当众揭露秘密”、“安抚对方情绪”)
|
||||
- r 不要写人名,不要复述整句,不要写心理描写或评价词
|
||||
- r 正例(合格):提出交易条件、拒绝对方请求、当众揭露秘密、安抚对方情绪、强行打断发言、转移谈话焦点
|
||||
- r 反例(不合格):我觉得她现在很害怕、他突然非常生气地大喊起来、user开始说话、assistant解释了很多细节
|
||||
- 每个锚点 1-3 条
|
||||
|
||||
## where
|
||||
- 场景地点,无明确地点时空字符串
|
||||
|
||||
## 数量规则
|
||||
- 最多2个。1个够时不凑2个
|
||||
- 明显场景切换(地点/时间/对象变化)时才2个
|
||||
- 同一场景不拆分
|
||||
- 无角色互动时返回 {"anchors":[]}
|
||||
|
||||
## 示例
|
||||
输入:艾拉在火山口举起圣剑刺穿古龙心脏,龙血溅满她的铠甲,她跪倒在地痛哭
|
||||
输出:
|
||||
{"anchors":[{"scene":"火山口上艾拉举起圣剑刺穿古龙的心脏,龙血溅满铠甲,古龙轰然倒地,艾拉跪倒在滚烫的岩石上痛哭,完成了她不得不做的弑杀","edges":[{"s":"艾拉","t":"古龙","r":"以圣剑刺穿心脏"}],"where":"火山口"}]}`;
|
||||
|
||||
// Assistant prefill that forces strict JSON output; it is re-prepended to the
// model's reply before parsing (see extractAtomsForRoundWithRetry).
const JSON_PREFILL = '{"anchors":[';
|
||||
|
||||
// ============================================================================
// Sleep helper
// ============================================================================

// Resolve after `ms` milliseconds (used for retry backoff and request stagger).
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
||||
|
||||
// Filler adverbs/connectives removed from relation phrases before storage.
const ACTION_STRIP_WORDS = [
    '突然', '非常', '有些', '有点', '轻轻', '悄悄', '缓缓', '立刻',
    '马上', '然后', '并且', '而且', '开始', '继续', '再次', '正在',
];

/**
 * Constrain `v` to the inclusive range [min, max].
 * @param {number} v
 * @param {number} min
 * @param {number} max
 * @returns {number}
 */
function clamp(v, min, max) {
    const upperBounded = Math.min(max, v);
    return Math.max(min, upperBounded);
}

/**
 * Normalize a relation phrase (`r` in an edge triple) into a compact
 * action template: punctuation, whitespace, filler words and trailing
 * aspect particles removed, capped at 12 chars.
 * @param {*} raw - Raw phrase from the LLM output.
 * @returns {string} Cleaned phrase, or '' when nothing usable remains.
 */
function sanitizeActionPhrase(raw) {
    // Unicode-normalize and drop zero-width characters.
    const normalized = String(raw || '')
        .normalize('NFKC')
        .replace(/[\u200B-\u200D\uFEFF]/g, '')
        .trim();
    if (!normalized) return '';

    // Strip CJK/ASCII punctuation and all whitespace.
    const stripped = normalized
        .replace(/[,。!?、;:,.!?;:"'“”‘’()()[\]{}<>《》]/g, '')
        .replace(/\s+/g, '');

    // Remove filler words anywhere in the phrase.
    const withoutFillers = ACTION_STRIP_WORDS.reduce(
        (acc, word) => acc.replaceAll(word, ''),
        stripped
    );

    // Drop trailing aspect/structural particles (地/得/了/着/过).
    const result = withoutFillers.replace(/(地|得|了|着|过)+$/g, '');

    if (result.length < 2) return '';
    return result.length > 12 ? result.slice(0, 12) : result;
}

/**
 * Heuristic quality score for an atom in [0, 1]:
 * 55% scene length (saturating at 80 chars), 35% edge count (saturating
 * at 3), 10% presence of a location. Rounded to 3 decimals.
 * @param {string} scene
 * @param {object[]} edges
 * @param {string} where
 * @returns {number}
 */
function calcAtomQuality(scene, edges, where) {
    const SCENE_WEIGHT = 0.55;
    const EDGE_WEIGHT = 0.35;
    const WHERE_WEIGHT = 0.10;

    const sceneScore = clamp(String(scene || '').length / 80, 0, 1);
    const edgeScore = clamp((edges?.length || 0) / 3, 0, 1);
    const whereScore = where ? 1 : 0;

    const weighted =
        SCENE_WEIGHT * sceneScore + EDGE_WEIGHT * edgeScore + WHERE_WEIGHT * whereScore;
    return Number(weighted.toFixed(3));
}
|
||||
|
||||
// ============================================================================
|
||||
// 清洗与构建
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Clean the raw edge triples returned by the LLM.
 * Keeps at most 3 edges whose s/t/r are all non-empty after trimming the
 * endpoints and normalizing the relation via sanitizeActionPhrase.
 * @param {object[]} raw - Candidate edge objects ({s, t, r}).
 * @returns {object[]} Cleaned triples (possibly empty).
 */
function sanitizeEdges(raw) {
    if (!Array.isArray(raw)) return [];

    const cleaned = [];
    for (const entry of raw) {
        if (!entry || typeof entry !== 'object') continue;

        const edge = {
            s: String(entry.s || '').trim(),
            t: String(entry.t || '').trim(),
            r: sanitizeActionPhrase(entry.r),
        };

        // Every field must survive cleaning for the edge to be kept.
        if (edge.s && edge.t && edge.r) {
            cleaned.push(edge);
            if (cleaned.length === 3) break;
        }
    }
    return cleaned;
}
|
||||
|
||||
/**
 * Convert a parsed anchor into an atom storage object.
 *
 * semantic = scene (plain natural language, used directly for embedding).
 *
 * @param {object} anchor - Anchor object produced by the LLM.
 * @param {number} aiFloor - Floor index of the AI message.
 * @param {number} idx - Index within the floor (0 or 1).
 * @returns {object|null} atom object, or null when the scene is unusable.
 */
function anchorToAtom(anchor, aiFloor, idx) {
    const scene = String(anchor.scene || '').trim();
    if (!scene) return null;

    // Scenes shorter than 15 chars are likely noise — drop them.
    if (scene.length < 15) return null;
    const edges = sanitizeEdges(anchor.edges);
    const where = String(anchor.where || '').trim();
    const quality = calcAtomQuality(scene, edges, where);

    return {
        atomId: `atom-${aiFloor}-${idx}`,
        floor: aiFloor,
        source: 'ai',

        // ═══ Retrieval layer (the only input to embedding) ═══
        semantic: scene,

        // ═══ Graph layer (keys for diffusion/expansion) ═══
        edges,
        where,
        quality,
    };
}
|
||||
|
||||
// ============================================================================
|
||||
// 单轮提取(带重试)
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Extract scene anchors for one user/AI round, with retries.
 *
 * Builds a <round> XML-ish envelope from the (optional) user message and the
 * AI message, sends it to the LLM with a JSON prefill, parses the reply, and
 * converts anchors to atoms.
 *
 * Return contract: `[]` when there is nothing to extract or the batch was
 * cancelled before starting; an atoms array on success; `null` on
 * unrecoverable failure (empty reply / bad JSON / no anchors array after all
 * retries, or a thrown error on the final attempt).
 *
 * @param {object|null} userMessage - Preceding user message, if any.
 * @param {object} aiMessage - AI message whose content is summarized.
 * @param {number} aiFloor - Floor index of the AI message.
 * @param {object} [options]
 * @param {number} [options.timeout] - Per-call timeout (ms).
 * @returns {Promise<object[]|null>}
 */
async function extractAtomsForRoundWithRetry(userMessage, aiMessage, aiFloor, options = {}) {
    const { timeout = DEFAULT_TIMEOUT } = options;

    if (!aiMessage?.mes?.trim()) return [];

    const parts = [];
    const userName = userMessage?.name || '用户';

    if (userMessage?.mes?.trim()) {
        const userText = filterText(userMessage.mes);
        parts.push(`<user name="${userName}">\n${userText}\n</user>`);
    }

    const aiText = filterText(aiMessage.mes);
    parts.push(`<assistant>\n${aiText}\n</assistant>`);

    const input = `<round>\n${parts.join('\n')}\n</round>`;

    for (let attempt = 0; attempt <= RETRY_COUNT; attempt++) {
        if (batchCancelled) return [];

        try {
            const response = await callLLM([
                { role: 'system', content: SYSTEM_PROMPT },
                { role: 'user', content: input },
                // Trailing assistant message acts as a prefill (see callLLM).
                { role: 'assistant', content: JSON_PREFILL },
            ], {
                temperature: 0.3,
                max_tokens: 600,
                timeout,
            });

            const rawText = String(response || '');
            if (!rawText.trim()) {
                // Empty reply: retry with a flat delay, then give up.
                if (attempt < RETRY_COUNT) {
                    await sleep(RETRY_DELAY);
                    continue;
                }
                return null;
            }

            // The model's reply continues the prefill, so re-prepend it
            // to reconstruct the full JSON document.
            const fullJson = JSON_PREFILL + rawText;

            let parsed;
            try {
                parsed = parseJson(fullJson);
            } catch (e) {
                xbLog.warn(MODULE_ID, `floor ${aiFloor} JSON解析失败 (attempt ${attempt})`);
                if (attempt < RETRY_COUNT) {
                    await sleep(RETRY_DELAY);
                    continue;
                }
                return null;
            }

            // Only the `anchors` array is accepted; there is no fallback key.
            const rawAnchors = parsed?.anchors;
            if (!rawAnchors || !Array.isArray(rawAnchors)) {
                if (attempt < RETRY_COUNT) {
                    await sleep(RETRY_DELAY);
                    continue;
                }
                return null;
            }

            // Convert to atom storage format (at most 2 per round).
            const atoms = rawAnchors
                .slice(0, 2)
                .map((a, idx) => anchorToAtom(a, aiFloor, idx))
                .filter(Boolean);

            return atoms;

        } catch (e) {
            if (batchCancelled) return null;

            // Thrown errors back off linearly with the attempt number.
            if (attempt < RETRY_COUNT) {
                await sleep(RETRY_DELAY * (attempt + 1));
                continue;
            }
            xbLog.error(MODULE_ID, `floor ${aiFloor} 失败`, e);
            return null;
        }
    }

    return null;
}
|
||||
|
||||
/**
 * Public wrapper for single-round anchor extraction.
 * @param {object|null} userMessage - Preceding user message, if any.
 * @param {object} aiMessage - AI message whose content is summarized.
 * @param {number} aiFloor - Floor index of the AI message.
 * @param {object} [options] - Forwarded to the retry worker (e.g. timeout).
 * @returns {Promise<object[]|null>} atoms, or null on unrecoverable failure.
 */
export async function extractAtomsForRound(userMessage, aiMessage, aiFloor, options = {}) {
    return extractAtomsForRoundWithRetry(userMessage, aiMessage, aiFloor, options);
}
|
||||
|
||||
// ============================================================================
|
||||
// 批量提取
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Extract atoms for every AI message in a chat, in batches of CONCURRENCY.
 *
 * Pairs each AI message with the directly preceding user message (if any).
 * The first batch staggers its request launches by STAGGER_DELAY to avoid a
 * thundering herd; later batches fire all at once. Respects the module-level
 * `batchCancelled` flag between batches and inside each worker.
 *
 * NOTE(review): atoms are appended to `allAtoms` in completion order, not
 * floor order — confirm whether downstream consumers sort by floor.
 *
 * @param {object[]} chat - Full chat message array.
 * @param {function(number, number, number):void} [onProgress]
 *        Called as (completed, total, failed) after each round settles.
 * @returns {Promise<object[]>} All extracted atoms (may be partial if cancelled).
 */
export async function batchExtractAtoms(chat, onProgress) {
    if (!chat?.length) return [];

    batchCancelled = false;

    // Build (userMsg, aiMsg, floor) pairs for every AI message.
    const pairs = [];
    for (let i = 0; i < chat.length; i++) {
        if (!chat[i].is_user) {
            const userMsg = (i > 0 && chat[i - 1]?.is_user) ? chat[i - 1] : null;
            pairs.push({ userMsg, aiMsg: chat[i], aiFloor: i });
        }
    }

    if (!pairs.length) return [];

    const allAtoms = [];
    let completed = 0;
    let failed = 0;

    for (let i = 0; i < pairs.length; i += CONCURRENCY) {
        if (batchCancelled) break;

        const batch = pairs.slice(i, i + CONCURRENCY);

        if (i === 0) {
            // First batch: stagger launches so the API isn't hit all at once.
            const promises = batch.map((pair, idx) => (async () => {
                await sleep(idx * STAGGER_DELAY);

                if (batchCancelled) return;

                try {
                    const atoms = await extractAtomsForRoundWithRetry(
                        pair.userMsg,
                        pair.aiMsg,
                        pair.aiFloor,
                        { timeout: DEFAULT_TIMEOUT }
                    );
                    if (atoms?.length) {
                        allAtoms.push(...atoms);
                    } else if (atoms === null) {
                        // null = unrecoverable failure; [] = nothing to extract.
                        failed++;
                    }
                } catch {
                    failed++;
                }
                completed++;
                onProgress?.(completed, pairs.length, failed);
            })());
            await Promise.all(promises);
        } else {
            // Subsequent batches: launch everything immediately.
            const promises = batch.map(pair =>
                extractAtomsForRoundWithRetry(
                    pair.userMsg,
                    pair.aiMsg,
                    pair.aiFloor,
                    { timeout: DEFAULT_TIMEOUT }
                )
                    .then(atoms => {
                        // Skip bookkeeping entirely once cancelled.
                        if (batchCancelled) return;
                        if (atoms?.length) {
                            allAtoms.push(...atoms);
                        } else if (atoms === null) {
                            failed++;
                        }
                        completed++;
                        onProgress?.(completed, pairs.length, failed);
                    })
                    .catch(() => {
                        if (batchCancelled) return;
                        failed++;
                        completed++;
                        onProgress?.(completed, pairs.length, failed);
                    })
            );

            await Promise.all(promises);
        }

        // Small breather between batches.
        if (i + CONCURRENCY < pairs.length && !batchCancelled) {
            await sleep(30);
        }
    }

    xbLog.info(MODULE_ID, `批量提取完成: ${allAtoms.length} atoms, ${failed} 失败`);

    return allAtoms;
}
|
||||
|
||||
99
modules/story-summary/vector/llm/llm-service.js
Normal file
99
modules/story-summary/vector/llm/llm-service.js
Normal file
@@ -0,0 +1,99 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// vector/llm/llm-service.js - 修复 prefill 传递方式
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import { getVectorConfig } from '../../data/config.js';
|
||||
import { getApiKey } from './siliconflow.js';
|
||||
|
||||
const MODULE_ID = 'vector-llm-service';
|
||||
const SILICONFLOW_API_URL = 'https://api.siliconflow.cn/v1';
|
||||
const DEFAULT_L0_MODEL = 'Qwen/Qwen3-8B';
|
||||
|
||||
// Rolling counter used to make request IDs unique within a session.
let callCounter = 0;

/**
 * Resolve the host page's streaming-generation module.
 * @returns {object|null} The module when it exposes xbgenrawCommand, else null.
 */
function getStreamingModule() {
    const candidate = window.xiaobaixStreamingGeneration;
    if (candidate?.xbgenrawCommand) {
        return candidate;
    }
    return null;
}

/**
 * Build a unique request id: prefix + rolling counter + base-36 timestamp.
 * @param {string} [prefix='llm']
 * @returns {string}
 */
function generateUniqueId(prefix = 'llm') {
    callCounter = (callCounter + 1) % 100000;
    const timestamp = Date.now().toString(36);
    return [prefix, callCounter, timestamp].join('-');
}
|
||||
|
||||
/**
 * Base64url-encode a string: UTF-8 bytes → base64 with `+`→`-`, `/`→`_`
 * and padding stripped.
 * @param {*} str - Value to encode (coerced to string).
 * @returns {string}
 */
function b64UrlEncode(str) {
    const bytes = new TextEncoder().encode(String(str));
    const binary = Array.from(bytes, (b) => String.fromCharCode(b)).join('');
    return btoa(binary)
        .replace(/\+/g, '-')
        .replace(/\//g, '_')
        .replace(/=+$/, '');
}
|
||||
|
||||
/**
 * Unified LLM call routed through the host's backend (non-streaming).
 * The trailing assistant message in `messages` (if any) is split off and
 * passed via the `bottomassistant` argument as a prefill.
 *
 * NOTE(review): callers (e.g. atom-extraction) pass `options.timeout`, but it
 * is not destructured or forwarded here — confirm whether xbgenrawCommand
 * applies its own timeout.
 *
 * @param {Array<{role: string, content: string}>} messages - Chat messages.
 * @param {object} [options]
 * @param {number} [options.temperature=0.2]
 * @param {number} [options.max_tokens=500]
 * @returns {Promise<string>} Raw model reply ('' when the result is nullish).
 * @throws {Error} When the streaming module or API key is missing, or the
 *         underlying call fails.
 */
export async function callLLM(messages, options = {}) {
    const {
        temperature = 0.2,
        max_tokens = 500,
    } = options;

    const mod = getStreamingModule();
    if (!mod) throw new Error('Streaming module not ready');

    const apiKey = getApiKey() || '';
    if (!apiKey) {
        throw new Error('L0 requires siliconflow API key');
    }

    // Split off the trailing assistant message to use as a prefill.
    let topMessages = [...messages];
    let assistantPrefill = '';

    if (topMessages.length > 0 && topMessages[topMessages.length - 1]?.role === 'assistant') {
        const lastMsg = topMessages.pop();
        assistantPrefill = lastMsg.content || '';
    }

    // Messages are transported base64url-encoded in a single argument.
    const top64 = b64UrlEncode(JSON.stringify(topMessages));
    const uniqueId = generateUniqueId('l0');

    const args = {
        as: 'user',
        nonstream: 'true',
        top64,
        id: uniqueId,
        temperature: String(temperature),
        max_tokens: String(max_tokens),
        api: 'openai',
        apiurl: SILICONFLOW_API_URL,
        apipassword: apiKey,
        model: DEFAULT_L0_MODEL,
    };
    // Qwen3 models support an explicit flag to disable "thinking" output.
    const isQwen3 = String(DEFAULT_L0_MODEL || '').includes('Qwen3');
    if (isQwen3) {
        args.enable_thinking = 'false';
    }

    // ★ Pass the prefill through the bottomassistant argument.
    if (assistantPrefill) {
        args.bottomassistant = assistantPrefill;
    }

    try {
        const result = await mod.xbgenrawCommand(args, '');
        return String(result ?? '');
    } catch (e) {
        xbLog.error(MODULE_ID, 'LLM调用失败', e);
        throw e;
    }
}
|
||||
|
||||
/**
 * Lenient JSON parser for LLM replies.
 * Strips an optional markdown code fence, tries a direct parse, then falls
 * back to the outermost {...} span inside surrounding noise.
 * @param {string} text - Raw reply text.
 * @returns {object|Array|null} Parsed value, or null when unparsable/empty.
 */
export function parseJson(text) {
    if (!text) return null;

    // Remove an optional ```json fence around the payload.
    const stripped = text
        .trim()
        .replace(/^```(?:json)?\s*/i, '')
        .replace(/\s*```$/i, '')
        .trim();

    try {
        return JSON.parse(stripped);
    } catch { /* fall through to substring extraction */ }

    // Fallback: parse the widest {...} span.
    const start = stripped.indexOf('{');
    const end = stripped.lastIndexOf('}');
    if (start !== -1 && end > start) {
        try {
            return JSON.parse(stripped.slice(start, end + 1));
        } catch { /* unparsable even as a substring */ }
    }
    return null;
}
|
||||
266
modules/story-summary/vector/llm/reranker.js
Normal file
266
modules/story-summary/vector/llm/reranker.js
Normal file
@@ -0,0 +1,266 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Reranker - 硅基 bge-reranker-v2-m3
|
||||
// 对候选文档进行精排,过滤与 query 不相关的内容
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import { getApiKey } from './siliconflow.js';
|
||||
|
||||
const MODULE_ID = 'reranker';
const RERANK_URL = 'https://api.siliconflow.cn/v1/rerank';
const RERANK_MODEL = 'BAAI/bge-reranker-v2-m3';
// Per-request timeout (ms).
const DEFAULT_TIMEOUT = 15000;
const MAX_DOCUMENTS = 100; // API limit on documents per request
// Documents per sub-request when rerankChunks splits large inputs.
const RERANK_BATCH_SIZE = 20;
// Maximum parallel sub-requests in rerankChunks.
const RERANK_MAX_CONCURRENCY = 5;
|
||||
|
||||
/**
 * Rerank a document list against a query via the SiliconFlow rerank API.
 *
 * @param {string} query - Query text.
 * @param {Array<string>} documents - Document texts (empty entries skipped;
 *        list truncated to MAX_DOCUMENTS).
 * @param {object} options
 * @param {number} options.topN - Max results to request, default 40.
 * @param {number} options.timeout - Timeout (ms), default 15000.
 * @param {AbortSignal} options.signal - External cancellation signal.
 * @returns {Promise<{results: Array<{index: number, relevance_score: number}>, failed: boolean}>}
 *          `results` indices refer to the ORIGINAL `documents` positions;
 *          `failed: true` marks degraded output (missing query/key, API or
 *          timeout failure) where scores are all 0 and order is the input order.
 */
export async function rerank(query, documents, options = {}) {
    const { topN = 40, timeout = DEFAULT_TIMEOUT, signal } = options;

    if (!query?.trim()) {
        xbLog.warn(MODULE_ID, 'query 为空,跳过 rerank');
        return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
    }

    if (!documents?.length) {
        return { results: [], failed: false };
    }

    const key = getApiKey();
    if (!key) {
        xbLog.warn(MODULE_ID, '未配置 API Key,跳过 rerank');
        return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
    }

    // Truncate over-long document lists to the API limit.
    const truncatedDocs = documents.slice(0, MAX_DOCUMENTS);
    if (documents.length > MAX_DOCUMENTS) {
        xbLog.warn(MODULE_ID, `文档数 ${documents.length} 超过限制 ${MAX_DOCUMENTS},已截断`);
    }

    // Drop empty documents but remember their original indices.
    const validDocs = [];
    const indexMap = []; // validDocs index → original index

    for (let i = 0; i < truncatedDocs.length; i++) {
        const text = String(truncatedDocs[i] || '').trim();
        if (text) {
            validDocs.push(text);
            indexMap.push(i);
        }
    }

    if (!validDocs.length) {
        xbLog.warn(MODULE_ID, '无有效文档,跳过 rerank');
        return { results: [], failed: false };
    }

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);

    try {
        const T0 = performance.now();

        const response = await fetch(RERANK_URL, {
            method: 'POST',
            headers: {
                'Authorization': `Bearer ${key}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                model: RERANK_MODEL,
                // The query is sent in full — never silently truncated.
                query,
                documents: validDocs,
                top_n: Math.min(topN, validDocs.length),
                return_documents: false,
            }),
            // An external signal takes precedence over the timeout controller.
            signal: signal || controller.signal,
        });

        clearTimeout(timeoutId);

        if (!response.ok) {
            const errorText = await response.text().catch(() => '');
            throw new Error(`Rerank API ${response.status}: ${errorText.slice(0, 200)}`);
        }

        const data = await response.json();
        const results = data.results || [];

        // Map valid-doc indices back to original document indices.
        const mapped = results.map(r => ({
            index: indexMap[r.index],
            relevance_score: r.relevance_score ?? 0,
        }));

        const elapsed = Math.round(performance.now() - T0);
        xbLog.info(MODULE_ID, `Rerank 完成: ${validDocs.length} docs → ${results.length} selected (${elapsed}ms)`);

        return { results: mapped, failed: false };

    } catch (e) {
        clearTimeout(timeoutId);

        if (e?.name === 'AbortError') {
            xbLog.warn(MODULE_ID, 'Rerank 超时或取消');
        } else {
            xbLog.error(MODULE_ID, 'Rerank 失败', e);
        }

        // Degraded mode: original order, uniform zero scores.
        return {
            results: documents.slice(0, topN).map((_, i) => ({
                index: i,
                relevance_score: 0,
            })),
            failed: true,
        };
    }
}
|
||||
|
||||
/**
 * Rerank a list of chunk objects against a query.
 *
 * Small inputs (≤ RERANK_BATCH_SIZE) go through a single rerank call; larger
 * inputs are split into batches, reranked concurrently (bounded worker pool),
 * then merged, filtered by `minScore`, sorted by score and capped at `topN`.
 * Failed batches keep their chunks with score 0 and `_rerankFailed: true`.
 *
 * @param {string} query - Query text.
 * @param {Array<object>} chunks - Chunk objects; text taken from `.text`,
 *        falling back to `.semantic`.
 * @param {object} options - { topN=40, minScore=0.1, timeout, signal }.
 * @returns {Promise<Array<object>>} Selected chunks, each annotated with
 *          `_rerankScore` (and `_rerankFailed` when degraded).
 */
export async function rerankChunks(query, chunks, options = {}) {
    const { topN = 40, minScore = 0.1 } = options;

    if (!chunks?.length) return [];

    const texts = chunks.map(c => c.text || c.semantic || '');

    // ─── Single batch: call directly ───
    if (texts.length <= RERANK_BATCH_SIZE) {
        const { results, failed } = await rerank(query, texts, {
            topN: Math.min(topN, texts.length),
            timeout: options.timeout,
            signal: options.signal,
        });

        if (failed) {
            // Whole-call degradation: return every chunk, unscored.
            return chunks.map(c => ({ ...c, _rerankScore: 0, _rerankFailed: true }));
        }

        return results
            .filter(r => r.relevance_score >= minScore)
            .sort((a, b) => b.relevance_score - a.relevance_score)
            .slice(0, topN)
            .map(r => ({
                ...chunks[r.index],
                _rerankScore: r.relevance_score,
            }));
    }

    // ─── Multiple batches: split → run concurrently → merge ───
    const batches = [];
    for (let i = 0; i < texts.length; i += RERANK_BATCH_SIZE) {
        batches.push({
            texts: texts.slice(i, i + RERANK_BATCH_SIZE),
            offset: i,
        });
    }

    const concurrency = Math.min(batches.length, RERANK_MAX_CONCURRENCY);
    xbLog.info(MODULE_ID, `并发 Rerank: ${batches.length} 批 × ≤${RERANK_BATCH_SIZE} docs, concurrency=${concurrency}`);

    const batchResults = new Array(batches.length);
    let failedBatches = 0;

    // Rerank one batch; offsets convert batch-local indices to global ones.
    const runBatch = async (batchIdx) => {
        const batch = batches[batchIdx];
        const { results, failed } = await rerank(query, batch.texts, {
            topN: batch.texts.length,
            timeout: options.timeout,
            signal: options.signal,
        });

        if (failed) {
            failedBatches++;
            // Per-batch degradation: keep original order with score 0.
            batchResults[batchIdx] = batch.texts.map((_, i) => ({
                globalIndex: batch.offset + i,
                relevance_score: 0,
                _batchFailed: true,
            }));
        } else {
            batchResults[batchIdx] = results.map(r => ({
                globalIndex: batch.offset + r.index,
                relevance_score: r.relevance_score,
            }));
        }
    };

    // Bounded worker pool over the batch queue.
    let nextIdx = 0;
    const worker = async () => {
        while (nextIdx < batches.length) {
            const idx = nextIdx++;
            await runBatch(idx);
        }
    };
    await Promise.all(Array.from({ length: concurrency }, () => worker()));

    // All batches failed → degrade the whole call.
    if (failedBatches === batches.length) {
        xbLog.warn(MODULE_ID, `全部 ${batches.length} 批 rerank 失败,整体降级`);
        return chunks.slice(0, topN).map(c => ({
            ...c,
            _rerankScore: 0,
            _rerankFailed: true,
        }));
    }

    // Merge all batch results.
    const merged = batchResults.flat();

    // Failed-batch entries bypass the minScore filter so they aren't lost.
    const selected = merged
        .filter(r => r._batchFailed || r.relevance_score >= minScore)
        .sort((a, b) => b.relevance_score - a.relevance_score)
        .slice(0, topN)
        .map(r => ({
            ...chunks[r.globalIndex],
            _rerankScore: r.relevance_score,
            ...(r._batchFailed ? { _rerankFailed: true } : {}),
        }));

    xbLog.info(MODULE_ID,
        `Rerank 合并: ${merged.length} candidates, ${failedBatches}/${batches.length} 批失败, 选中 ${selected.length}`
    );

    return selected;
}
|
||||
/**
 * Connectivity test for the rerank service.
 * Sends a tiny two-document request and reports the result count.
 * @returns {Promise<{success: boolean, message: string}>}
 * @throws {Error} When no API key is configured or the request fails.
 */
export async function testRerankService() {
    const key = getApiKey();
    if (!key) {
        throw new Error('请配置硅基 API Key');
    }

    try {
        const { results } = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 });
        return {
            success: true,
            message: `连接成功,返回 ${results.length} 个结果`,
        };
    } catch (e) {
        throw new Error(`连接失败: ${e.message}`);
    }
}
|
||||
101
modules/story-summary/vector/llm/siliconflow.js
Normal file
101
modules/story-summary/vector/llm/siliconflow.js
Normal file
@@ -0,0 +1,101 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// siliconflow.js - Embedding + 多 Key 轮询
|
||||
//
|
||||
// 在 API Key 输入框中用逗号、分号、竖线或换行分隔多个 Key,例如:
|
||||
// sk-aaa,sk-bbb,sk-ccc
|
||||
// 每次调用自动轮询到下一个 Key,并发请求会均匀分布到所有 Key 上。
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
const BASE_URL = 'https://api.siliconflow.cn';
|
||||
const EMBEDDING_MODEL = 'BAAI/bge-m3';
|
||||
|
||||
// ★ Round-robin cursor over the configured API keys.
let _keyIndex = 0;

/**
 * Parse all API keys from localStorage (comma / semicolon / pipe / newline
 * separated) under `summary_panel_config` → vector.online.key.
 * @returns {string[]} Trimmed non-empty keys; [] when unset or unparsable.
 */
function parseKeys() {
    try {
        const raw = localStorage.getItem('summary_panel_config');
        if (raw) {
            const parsed = JSON.parse(raw);
            const keyStr = parsed.vector?.online?.key || '';
            return keyStr
                .split(/[,;|\n]+/)
                .map(k => k.trim())
                .filter(k => k.length > 0);
        }
    } catch { } // best-effort: malformed config yields no keys
    return [];
}
|
||||
|
||||
/**
 * Get the next API key (round-robin).
 * Each call advances the cursor so concurrent requests spread across keys.
 * NOTE(review): logs a masked key to the console on every multi-key call —
 * confirm this is acceptable in production.
 * @returns {string|null} A key, or null when none are configured.
 */
export function getApiKey() {
    const keys = parseKeys();
    if (!keys.length) return null;
    if (keys.length === 1) return keys[0];

    const idx = _keyIndex % keys.length;
    const key = keys[idx];
    _keyIndex = (_keyIndex + 1) % keys.length;
    // Mask the key before logging (first 6 + last 4 chars only).
    const masked = key.length > 10 ? key.slice(0, 6) + '***' + key.slice(-4) : '***';
    console.log(`[SiliconFlow] 使用 Key ${idx + 1}/${keys.length}: ${masked}`);
    return key;
}
|
||||
|
||||
/**
 * Number of configured API keys, floored at 1 so callers can safely use it
 * to size concurrency.
 * @returns {number}
 */
export function getKeyCount() {
    const configured = parseKeys().length;
    return configured > 0 ? configured : 1;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Embedding
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Embed texts with BAAI/bge-m3 via the SiliconFlow embeddings endpoint.
 * @param {string[]} texts - Texts to embed; [] yields [] without a request.
 * @param {object} [options] - { timeout=30000 (ms), signal }.
 * @returns {Promise<number[][]>} Vectors sorted back into input order.
 * @throws {Error} When no API key is configured or the API responds non-OK.
 */
export async function embed(texts, options = {}) {
    if (!texts?.length) return [];

    const key = getApiKey();
    if (!key) throw new Error('未配置硅基 API Key');

    const { timeout = 30000, signal } = options;
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);

    try {
        const response = await fetch(`${BASE_URL}/v1/embeddings`, {
            method: 'POST',
            headers: {
                'Authorization': `Bearer ${key}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                model: EMBEDDING_MODEL,
                input: texts,
            }),
            // An external signal takes precedence over the timeout controller.
            signal: signal || controller.signal,
        });

        clearTimeout(timeoutId);

        if (!response.ok) {
            const errorText = await response.text().catch(() => '');
            throw new Error(`Embedding ${response.status}: ${errorText.slice(0, 200)}`);
        }

        const data = await response.json();
        // The API may return items out of order; sort by index and coerce
        // each embedding to a plain array.
        return (data.data || [])
            .sort((a, b) => a.index - b.index)
            .map(item => Array.isArray(item.embedding) ? item.embedding : Array.from(item.embedding));
    } finally {
        // Safe to clear twice; guarantees the timer is released on any path.
        clearTimeout(timeoutId);
    }
}
|
||||
|
||||
export { EMBEDDING_MODEL as MODELS };
|
||||
391
modules/story-summary/vector/pipeline/chunk-builder.js
Normal file
391
modules/story-summary/vector/pipeline/chunk-builder.js
Normal file
@@ -0,0 +1,391 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Story Summary - Chunk Builder
|
||||
// 标准 RAG chunking: ~200 tokens per chunk
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { getContext } from '../../../../../../../extensions.js';
|
||||
import {
|
||||
getMeta,
|
||||
updateMeta,
|
||||
saveChunks,
|
||||
saveChunkVectors,
|
||||
clearAllChunks,
|
||||
deleteChunksFromFloor,
|
||||
deleteChunksAtFloor,
|
||||
makeChunkId,
|
||||
hashText,
|
||||
CHUNK_MAX_TOKENS,
|
||||
} from '../storage/chunk-store.js';
|
||||
import { embed, getEngineFingerprint } from '../utils/embedder.js';
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import { filterText } from '../utils/text-filter.js';
|
||||
import { extractAndStoreAtomsForRound } from './state-integration.js';
|
||||
import {
|
||||
deleteStateAtomsFromFloor,
|
||||
deleteStateVectorsFromFloor,
|
||||
deleteL0IndexFromFloor,
|
||||
} from '../storage/state-store.js';
|
||||
|
||||
const MODULE_ID = 'chunk-builder';
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Token 估算
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Rough token estimate: each CJK Unified Ideograph counts as 1 token,
 * everything else as 1/4 token per UTF-16 unit, rounded up.
 * @param {string} text
 * @returns {number}
 */
function estimateTokens(text) {
    if (!text) return 0;
    let chineseChars = 0;
    for (const ch of text) {
        if (ch >= '\u4e00' && ch <= '\u9fff') chineseChars += 1;
    }
    const otherUnits = text.length - chineseChars;
    return Math.ceil(chineseChars + otherUnits / 4);
}
|
||||
|
||||
/**
 * Split text into sentences: breaks after CJK terminators (。!?) or a
 * newline, and after ASCII terminators (.!?) followed by whitespace.
 * @param {string} text
 * @returns {string[]} Trimmed non-empty sentence fragments.
 */
function splitSentences(text) {
    if (!text) return [];
    const segments = text.split(/(?<=[。!?\n])|(?<=[.!?]\s)/);
    const sentences = [];
    for (const segment of segments) {
        const trimmed = segment.trim();
        if (trimmed) sentences.push(trimmed);
    }
    return sentences;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Chunk 切分
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Split one chat message into RAG chunks of at most `maxTokens` tokens.
 *
 * Sentence-greedy packing: sentences accumulate until the budget would be
 * exceeded, then a chunk is flushed. A single sentence longer than the budget
 * flushes the pending chunk and is hard-sliced into fixed-size pieces.
 *
 * @param {number} floor - Message floor index (used in chunk IDs).
 * @param {object} message - Chat message ({ mes, name, is_user }).
 * @param {number} [maxTokens=CHUNK_MAX_TOKENS] - Token budget per chunk.
 * @returns {object[]} Chunk records; [] when nothing remains after filtering.
 */
export function chunkMessage(floor, message, maxTokens = CHUNK_MAX_TOKENS) {
    const text = message.mes || '';
    const speaker = message.name || (message.is_user ? '用户' : '角色');
    const isUser = !!message.is_user;

    // 1. Apply user-defined filter rules.
    // 2. Strip TTS markers (hard-coded).
    // 3. Strip <state> tags (hard-coded; L0 stores them separately).
    const cleanText = filterText(text)
        .replace(/\[tts:[^\]]*\]/gi, '')
        .replace(/<state>[\s\S]*?<\/state>/gi, '')
        .trim();

    if (!cleanText) return [];

    const totalTokens = estimateTokens(cleanText);

    // Fast path: the whole message fits in one chunk.
    if (totalTokens <= maxTokens) {
        return [{
            chunkId: makeChunkId(floor, 0),
            floor,
            chunkIdx: 0,
            speaker,
            isUser,
            text: cleanText,
            textHash: hashText(cleanText),
        }];
    }

    const sentences = splitSentences(cleanText);
    const chunks = [];
    let currentSentences = [];
    let currentTokens = 0;

    for (const sent of sentences) {
        const sentTokens = estimateTokens(sent);

        // Oversized sentence: flush pending chunk, then hard-slice it.
        if (sentTokens > maxTokens) {
            if (currentSentences.length > 0) {
                const chunkText = currentSentences.join('');
                chunks.push({
                    chunkId: makeChunkId(floor, chunks.length),
                    floor,
                    chunkIdx: chunks.length,
                    speaker,
                    isUser,
                    text: chunkText,
                    textHash: hashText(chunkText),
                });
                currentSentences = [];
                currentTokens = 0;
            }

            // Slice by characters; maxTokens * 2 chars approximates the budget.
            const sliceSize = maxTokens * 2;
            for (let i = 0; i < sent.length; i += sliceSize) {
                const slice = sent.slice(i, i + sliceSize);
                chunks.push({
                    chunkId: makeChunkId(floor, chunks.length),
                    floor,
                    chunkIdx: chunks.length,
                    speaker,
                    isUser,
                    text: slice,
                    textHash: hashText(slice),
                });
            }
            continue;
        }

        // Budget exceeded: flush the pending chunk before adding this sentence.
        if (currentTokens + sentTokens > maxTokens && currentSentences.length > 0) {
            const chunkText = currentSentences.join('');
            chunks.push({
                chunkId: makeChunkId(floor, chunks.length),
                floor,
                chunkIdx: chunks.length,
                speaker,
                isUser,
                text: chunkText,
                textHash: hashText(chunkText),
            });
            currentSentences = [];
            currentTokens = 0;
        }

        currentSentences.push(sent);
        currentTokens += sentTokens;
    }

    // Flush the trailing partial chunk.
    if (currentSentences.length > 0) {
        const chunkText = currentSentences.join('');
        chunks.push({
            chunkId: makeChunkId(floor, chunks.length),
            floor,
            chunkIdx: chunks.length,
            speaker,
            isUser,
            text: chunkText,
            textHash: hashText(chunkText),
        });
    }

    return chunks;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 构建状态
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Report chunk-build progress for the active chat.
 * @returns {Promise<{totalFloors:number, builtFloors:number, lastChunkFloor?:number, pending:number}>}
 */
export async function getChunkBuildStatus() {
    const ctx = getContext();
    if (!ctx.chatId) return { totalFloors: 0, builtFloors: 0, pending: 0 };

    const meta = await getMeta(ctx.chatId);
    const total = ctx.chat?.length || 0;
    // lastChunkFloor is -1 when nothing has been built yet.
    const built = meta.lastChunkFloor + 1;

    return {
        totalFloors: total,
        builtFloors: built,
        lastChunkFloor: meta.lastChunkFloor,
        pending: Math.max(0, total - built),
    };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 全量构建
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Full rebuild: clear all stored chunks, re-chunk every floor of the chat,
 * embed the chunk texts in batches, and persist the resulting vectors.
 *
 * @param {object} [options]
 * @param {(done:number, total:number)=>void} [options.onProgress] - called after every batch
 * @param {()=>boolean} [options.shouldCancel] - polled between batches; truthy aborts the run
 * @param {object} [options.vectorConfig] - embedding engine config (also used for the fingerprint)
 * @returns {Promise<{built:number, errors:number}>} built = vectors persisted, errors = failed batches
 */
export async function buildAllChunks(options = {}) {
    const { onProgress, shouldCancel, vectorConfig } = options;

    const { chat, chatId } = getContext();
    if (!chatId || !chat?.length) {
        return { built: 0, errors: 0 };
    }

    const fingerprint = getEngineFingerprint(vectorConfig);

    // Start from a clean slate; resetting meta lets a cancelled run resume sanely.
    await clearAllChunks(chatId);
    await updateMeta(chatId, { lastChunkFloor: -1, fingerprint });

    const allChunks = [];
    for (let floor = 0; floor < chat.length; floor++) {
        allChunks.push(...chunkMessage(floor, chat[floor]));
    }

    if (allChunks.length === 0) {
        return { built: 0, errors: 0 };
    }

    xbLog.info(MODULE_ID, `开始构建 ${allChunks.length} 个 chunks(${chat.length} 层楼)`);

    await saveChunks(chatId, allChunks);

    const texts = allChunks.map(c => c.text);
    const batchSize = 20;

    let completed = 0; // successfully embedded texts (kept for the cancel-path return value)
    let attempted = 0; // texts processed, success or failure — drives onProgress
    let errors = 0;
    const allVectors = [];

    for (let i = 0; i < texts.length; i += batchSize) {
        if (shouldCancel?.()) break;

        const batch = texts.slice(i, i + batchSize);

        try {
            const vectors = await embed(batch, vectorConfig);
            allVectors.push(...vectors);
            completed += batch.length;
        } catch (e) {
            xbLog.error(MODULE_ID, `批次 ${i}/${texts.length} 向量化失败`, e);
            // Keep allVectors index-aligned with allChunks; nulls are filtered below.
            allVectors.push(...batch.map(() => null));
            errors++;
        }
        // FIX: advance progress on both success and failure so onProgress
        // reaches texts.length even when some batches fail (previously the
        // callback was only invoked on success and the UI could stall).
        attempted += batch.length;
        onProgress?.(attempted, texts.length);
    }

    if (shouldCancel?.()) {
        return { built: completed, errors };
    }

    const vectorItems = allChunks
        .map((chunk, idx) => allVectors[idx] ? { chunkId: chunk.chunkId, vector: allVectors[idx] } : null)
        .filter(Boolean);

    if (vectorItems.length > 0) {
        await saveChunkVectors(chatId, vectorItems, fingerprint);
    }

    await updateMeta(chatId, { lastChunkFloor: chat.length - 1 });

    xbLog.info(MODULE_ID, `构建完成:${vectorItems.length} 个向量,${errors} 个错误`);

    return { built: vectorItems.length, errors };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 增量构建
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Incremental build: chunk and embed only the floors added since the last run.
 * Skipped entirely when the stored engine fingerprint no longer matches.
 * @param {object} [options]
 * @param {object} [options.vectorConfig] - embedding engine config
 * @returns {Promise<{built:number}>}
 */
export async function buildIncrementalChunks(options = {}) {
    const { vectorConfig } = options;

    const { chat, chatId } = getContext();
    if (!chatId || !chat?.length) return { built: 0 };

    const meta = await getMeta(chatId);
    const fingerprint = getEngineFingerprint(vectorConfig);

    // A fingerprint change means old and new vectors are incompatible.
    if (meta.fingerprint && meta.fingerprint !== fingerprint) {
        xbLog.warn(MODULE_ID, '引擎指纹不匹配,跳过增量构建');
        return { built: 0 };
    }

    const startFloor = meta.lastChunkFloor + 1;
    if (startFloor >= chat.length) return { built: 0 };

    xbLog.info(MODULE_ID, `增量构建 ${startFloor} - ${chat.length - 1} 层`);

    const newChunks = [];
    for (let floor = startFloor; floor < chat.length; floor++) {
        newChunks.push(...chunkMessage(floor, chat[floor]));
    }

    if (newChunks.length === 0) {
        // Nothing chunkable, but still advance the cursor.
        await updateMeta(chatId, { lastChunkFloor: chat.length - 1 });
        return { built: 0 };
    }

    await saveChunks(chatId, newChunks);

    try {
        const vectors = await embed(newChunks.map(c => c.text), vectorConfig);
        const items = newChunks.map((chunk, idx) => ({
            chunkId: chunk.chunkId,
            vector: vectors[idx],
        }));
        await saveChunkVectors(chatId, items, fingerprint);
        await updateMeta(chatId, { lastChunkFloor: chat.length - 1 });

        return { built: items.length };
    } catch (e) {
        xbLog.error(MODULE_ID, '增量向量化失败', e);
        return { built: 0 };
    }
}
|
||||
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// L1 同步(消息变化时调用)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Sync after message deletion: drop every chunk with floor >= newLength
 * and rewind the build cursor to the last surviving floor.
 * @param {string} chatId - active chat id
 * @param {number} newLength - chat length after deletion (first invalid floor)
 */
export async function syncOnMessageDeleted(chatId, newLength) {
    if (!chatId || newLength < 0) return;

    await deleteChunksFromFloor(chatId, newLength);
    // lastChunkFloor becomes -1 when the whole chat was deleted (newLength 0).
    await updateMeta(chatId, { lastChunkFloor: newLength - 1 });

    xbLog.info(MODULE_ID, `消息删除同步:删除 floor >= ${newLength}`);
}
|
||||
|
||||
/**
 * Sync after a swipe: drop the last floor's chunks (the floor is rebuilt later
 * by syncOnMessageReceived) and step the cursor back one floor.
 * @param {string} chatId - active chat id
 * @param {number} lastFloor - floor index that was swiped
 */
export async function syncOnMessageSwiped(chatId, lastFloor) {
    if (!chatId || lastFloor < 0) return;

    await deleteChunksAtFloor(chatId, lastFloor);
    await updateMeta(chatId, { lastChunkFloor: lastFloor - 1 });

    xbLog.info(MODULE_ID, `swipe 同步:删除 floor ${lastFloor}`);
}
|
||||
|
||||
/**
 * Sync after a new message: delete + rebuild the last floor's chunks, embed
 * them, and — for AI messages — run paired L0 anchor extraction over the
 * (user, assistant) round.
 * @param {string} chatId - active chat id
 * @param {number} lastFloor - floor index of the new message
 * @param {object} message - the chat message object for that floor
 * @param {object} vectorConfig - embedding config; must have .enabled
 * @param {Function} [onL0Complete] - forwarded to the L0 extraction queue
 * @returns {Promise<{built:number, chunks:Array}>} built is 0 when embedding failed
 */
export async function syncOnMessageReceived(chatId, lastFloor, message, vectorConfig, onL0Complete) {
    if (!chatId || lastFloor < 0 || !message) return { built: 0, chunks: [] };
    if (!vectorConfig?.enabled) return { built: 0, chunks: [] };

    // Drop any stale chunks previously stored for this floor.
    await deleteChunksAtFloor(chatId, lastFloor);

    // Rebuild chunks for the new message content.
    const chunks = chunkMessage(lastFloor, message);
    if (chunks.length === 0) return { built: 0, chunks: [] };

    await saveChunks(chatId, chunks);

    // Vectorize; a failure here is logged but does not block L0 extraction.
    const fingerprint = getEngineFingerprint(vectorConfig);
    const texts = chunks.map(c => c.text);

    let vectorized = false;
    try {
        const vectors = await embed(texts, vectorConfig);
        const items = chunks.map((c, i) => ({ chunkId: c.chunkId, vector: vectors[i] }));
        await saveChunkVectors(chatId, items, fingerprint);
        await updateMeta(chatId, { lastChunkFloor: lastFloor });

        vectorized = true;
        xbLog.info(MODULE_ID, `消息同步:重建 floor ${lastFloor},${chunks.length} 个 chunk`);
    } catch (e) {
        xbLog.error(MODULE_ID, `消息同步失败:floor ${lastFloor}`, e);
    }
    // L0 paired extraction (triggered only by AI messages).
    if (!message.is_user) {
        const { chat } = getContext();
        const userFloor = lastFloor - 1;
        // Pair with the immediately preceding user message, if there is one.
        const userMessage = (userFloor >= 0 && chat[userFloor]?.is_user) ? chat[userFloor] : null;

        // L0 delete-then-rebuild (mirrors the L1 deleteChunksAtFloor above):
        // when regenerate/swipe overwrites an old floor, stale atoms are purged.
        // NOTE(review): the two calls below are not awaited — presumably
        // synchronous in-memory operations; confirm against state-store.js.
        deleteStateAtomsFromFloor(lastFloor);
        deleteL0IndexFromFloor(lastFloor);
        await deleteStateVectorsFromFloor(chatId, lastFloor);

        try {
            await extractAndStoreAtomsForRound(lastFloor, message, userMessage, onL0Complete);
        } catch (e) {
            xbLog.warn(MODULE_ID, `Atom 提取失败: floor ${lastFloor}`, e);
        }
    }

    return { built: vectorized ? chunks.length : 0, chunks };
}
|
||||
562
modules/story-summary/vector/pipeline/state-integration.js
Normal file
562
modules/story-summary/vector/pipeline/state-integration.js
Normal file
@@ -0,0 +1,562 @@
|
||||
// ============================================================================
|
||||
// state-integration.js - L0 状态层集成
|
||||
// Phase 1: 批量 LLM 提取(只存文本)
|
||||
// Phase 2: 统一向量化(提取完成后)
|
||||
// ============================================================================
|
||||
|
||||
import { getContext } from '../../../../../../../extensions.js';
|
||||
import { saveMetadataDebounced } from '../../../../../../../extensions.js';
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import {
|
||||
saveStateAtoms,
|
||||
saveStateVectors,
|
||||
deleteStateAtomsFromFloor,
|
||||
deleteStateVectorsFromFloor,
|
||||
getStateAtoms,
|
||||
clearStateAtoms,
|
||||
clearStateVectors,
|
||||
getL0FloorStatus,
|
||||
setL0FloorStatus,
|
||||
clearL0Index,
|
||||
deleteL0IndexFromFloor,
|
||||
} from '../storage/state-store.js';
|
||||
import { embed } from '../llm/siliconflow.js';
|
||||
import { extractAtomsForRound, cancelBatchExtraction } from '../llm/atom-extraction.js';
|
||||
import { getVectorConfig } from '../../data/config.js';
|
||||
import { getEngineFingerprint } from '../utils/embedder.js';
|
||||
import { filterText } from '../utils/text-filter.js';
|
||||
|
||||
const MODULE_ID = 'state-integration';
|
||||
|
||||
// ★ 并发配置
|
||||
const CONCURRENCY = 50;
|
||||
const STAGGER_DELAY = 15;
|
||||
const DEBUG_CONCURRENCY = true;
|
||||
const R_AGG_MAX_CHARS = 256;
|
||||
|
||||
let initialized = false;
|
||||
let extractionCancelled = false;
|
||||
|
||||
/**
 * Request cancellation of any in-flight L0 extraction: sets the module-level
 * flag polled by the worker pool and forwards the cancellation to the batch
 * atom extractor.
 */
export function cancelL0Extraction() {
    extractionCancelled = true;
    cancelBatchExtraction();
}
|
||||
|
||||
// ============================================================================
|
||||
// 初始化
|
||||
// ============================================================================
|
||||
|
||||
/**
 * One-time module initialization; idempotent — repeat calls are no-ops.
 * Publishes the rollback handler on globalThis, presumably so other modules
 * can trigger L0 rollback without importing this file — confirm at call sites.
 */
export function initStateIntegration() {
    if (initialized) return;
    initialized = true;
    globalThis.LWB_StateRollbackHook = handleStateRollback;
    xbLog.info(MODULE_ID, 'L0 状态层集成已初始化');
}
|
||||
|
||||
// ============================================================================
|
||||
// 统计
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Tally L0 extraction status over every AI floor of the current chat.
 * @returns {Promise<{extracted:number, total:number, pending:number, empty:number, fail:number}>}
 *   extracted = floors resolved (ok or legitimately empty); pending = never attempted.
 */
export async function getAnchorStats() {
    const { chat } = getContext();
    if (!chat?.length) {
        return { extracted: 0, total: 0, pending: 0, empty: 0, fail: 0 };
    }

    // Single pass: count AI floors and bucket their recorded statuses.
    const counts = { ok: 0, empty: 0, fail: 0 };
    let total = 0;
    for (let floor = 0; floor < chat.length; floor++) {
        if (chat[floor]?.is_user) continue;
        total++;
        const status = getL0FloorStatus(floor)?.status;
        if (status === 'ok') counts.ok++;
        else if (status === 'empty') counts.empty++;
        else if (status === 'fail') counts.fail++;
    }

    const processed = counts.ok + counts.empty + counts.fail;

    return {
        extracted: counts.ok + counts.empty,
        total,
        pending: Math.max(0, total - processed),
        empty: counts.empty,
        fail: counts.fail,
    };
}
|
||||
|
||||
// ============================================================================
|
||||
// 增量提取 - Phase 1 提取文本,Phase 2 统一向量化
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Compose the LLM input for one (user, assistant) round: each non-empty,
 * filtered message becomes a labelled section; sections are joined by a
 * horizontal-rule separator.
 * @param {object|null} userMessage
 * @param {object|null} aiMessage
 * @returns {string} empty string when neither side has usable text
 */
function buildL0InputText(userMessage, aiMessage) {
    const section = (label, fallbackName, msg) =>
        msg?.mes?.trim()
            ? `【${label}:${msg?.name || fallbackName}】\n${filterText(msg.mes).trim()}`
            : null;

    const segments = [
        section('用户', '用户', userMessage),
        section('角色', '角色', aiMessage),
    ].filter(Boolean);

    return segments.join('\n\n---\n\n').trim();
}
|
||||
|
||||
/**
 * Aggregate an atom's relation labels (edges[].r) into one dedup'd,
 * length-capped string for R-vector embedding; falls back to the atom's
 * semantic text when no relations exist.
 * @param {object} atom
 * @returns {string}
 */
function buildRAggregateText(atom) {
    const seen = new Set();
    const relations = [];
    for (const edge of atom?.edges || []) {
        const r = String(edge?.r || '').trim();
        if (r && !seen.has(r)) {
            seen.add(r);
            relations.push(r);
        }
    }

    if (relations.length === 0) return String(atom?.semantic || '').trim();

    // slice() is a no-op when the joined string is already within the cap.
    return relations.join(' ; ').slice(0, R_AGG_MAX_CHARS);
}
|
||||
|
||||
/**
 * Incremental L0 extraction with a fixed-size concurrency pool.
 * Phase 1: LLM-extract anchors for every unprocessed AI floor (text only).
 * Phase 2: vectorize all newly extracted atoms in one batched pass.
 * Includes rate-limit protection: after RATE_LIMIT_THRESHOLD consecutive
 * failures the pool is drained, the run waits, then re-scans from the start
 * at reduced concurrency.
 *
 * @param {string} chatId
 * @param {Array} chat - full message array; AI floors are paired with the preceding user floor
 * @param {(label:string, done:number, total:number)=>void} [onProgress]
 * @param {{maxFloors?:number}} [options] - cap on floors per run (auto-trigger path)
 * @returns {Promise<{built:number}>} number of atoms stored (vectorization failures do not reduce it)
 */
export async function incrementalExtractAtoms(chatId, chat, onProgress, options = {}) {
    const { maxFloors = Infinity } = options;
    if (!chatId || !chat?.length) return { built: 0 };

    const vectorCfg = getVectorConfig();
    if (!vectorCfg?.enabled) return { built: 0 };

    // ★ Reset the cancellation flag for this run
    extractionCancelled = false;

    const pendingPairs = [];

    for (let i = 0; i < chat.length; i++) {
        const msg = chat[i];
        if (!msg || msg.is_user) continue;

        const st = getL0FloorStatus(i);
        // ★ Skip only 'ok' and 'empty'; floors marked 'fail' are retried
        if (st?.status === 'ok' || st?.status === 'empty') {
            continue;
        }

        const userMsg = (i > 0 && chat[i - 1]?.is_user) ? chat[i - 1] : null;
        const inputText = buildL0InputText(userMsg, msg);

        if (!inputText) {
            // Nothing survived filtering — record as empty so it is not retried.
            setL0FloorStatus(i, { status: 'empty', reason: 'filtered_empty', atoms: 0 });
            continue;
        }

        pendingPairs.push({ userMsg, aiMsg: msg, aiFloor: i });
    }

    // Cap floors per run (used when triggered automatically)
    if (pendingPairs.length > maxFloors) {
        pendingPairs.length = maxFloors;
    }

    if (!pendingPairs.length) {
        onProgress?.('已全部提取', 0, 0);
        return { built: 0 };
    }

    xbLog.info(MODULE_ID, `增量 L0 提取:pending=${pendingPairs.length}, concurrency=${CONCURRENCY}`);

    let completed = 0;
    let failed = 0;
    const total = pendingPairs.length;
    let builtAtoms = 0;
    let active = 0;       // currently running extractions (debug instrumentation)
    let peakActive = 0;
    const tStart = performance.now();

    // ★ Phase 1 accumulator: newly extracted atoms (vectorized later in Phase 2)
    const allNewAtoms = [];

    // ★ Rate-limit detection: N consecutive failures pauses the pool and slows down
    let consecutiveFailures = 0;
    let rateLimited = false;
    const RATE_LIMIT_THRESHOLD = 3; // consecutive failures that trigger protection
    const RATE_LIMIT_WAIT_MS = 60000; // cool-down before the slow retry pass (60 s)
    const RETRY_INTERVAL_MS = 1000; // per-request gap in slow mode (1 s)
    const RETRY_CONCURRENCY = 1; // ★ slow-mode concurrency (default 1; keep <= 5)

    // ★ Shared per-pair handler (reused by normal and slow modes)
    const processPair = async (pair, idx, workerId) => {
        const floor = pair.aiFloor;
        const prev = getL0FloorStatus(floor);

        active++;
        if (active > peakActive) peakActive = active;
        if (DEBUG_CONCURRENCY && (idx % 10 === 0)) {
            xbLog.info(MODULE_ID, `L0 pool start idx=${idx} active=${active} peak=${peakActive} worker=${workerId}`);
        }

        try {
            const atoms = await extractAtomsForRound(pair.userMsg, pair.aiMsg, floor, { timeout: 20000 });

            if (extractionCancelled) return;

            // null signals an LLM-side failure (distinct from a valid empty result).
            if (atoms == null) {
                throw new Error('llm_failed');
            }

            // ★ Success: reset the consecutive-failure counter
            consecutiveFailures = 0;

            if (!atoms.length) {
                setL0FloorStatus(floor, { status: 'empty', reason: 'llm_empty', atoms: 0 });
            } else {
                atoms.forEach(a => a.chatId = chatId);
                saveStateAtoms(atoms);
                allNewAtoms.push(...atoms);

                setL0FloorStatus(floor, { status: 'ok', atoms: atoms.length });
                builtAtoms += atoms.length;
            }
        } catch (e) {
            if (extractionCancelled) return;

            setL0FloorStatus(floor, {
                status: 'fail',
                attempts: (prev?.attempts || 0) + 1,
                reason: String(e?.message || e).replace(/\s+/g, ' ').slice(0, 120),
            });
            failed++;

            // ★ Rate-limit detection: accumulate consecutive failures
            consecutiveFailures++;
            if (consecutiveFailures >= RATE_LIMIT_THRESHOLD && !rateLimited) {
                rateLimited = true;
                xbLog.warn(MODULE_ID, `连续失败 ${consecutiveFailures} 次,疑似触发 API 限流,将暂停所有并发`);
            }
        } finally {
            active--;
            if (!extractionCancelled) {
                completed++;
                onProgress?.(`提取: ${completed}/${total}`, completed, total);
            }
            if (DEBUG_CONCURRENCY && (completed % 25 === 0 || completed === total)) {
                const elapsed = Math.max(1, Math.round(performance.now() - tStart));
                xbLog.info(MODULE_ID, `L0 pool progress=${completed}/${total} active=${active} peak=${peakActive} elapsedMs=${elapsed}`);
            }
        }
    };

    // ★ Concurrency pool (maintains a fixed worker count pulling from a shared index)
    const poolSize = Math.min(CONCURRENCY, pendingPairs.length);
    let nextIndex = 0;
    let started = 0;
    const runWorker = async (workerId) => {
        while (true) {
            if (extractionCancelled || rateLimited) return;
            const idx = nextIndex++;
            if (idx >= pendingPairs.length) return;

            const pair = pendingPairs[idx];
            // Stagger request start times to avoid a thundering herd.
            const stagger = started++;
            if (STAGGER_DELAY > 0) {
                await new Promise(r => setTimeout(r, stagger * STAGGER_DELAY));
            }

            if (extractionCancelled || rateLimited) return;

            await processPair(pair, idx, workerId);
        }
    };

    await Promise.all(Array.from({ length: poolSize }, (_, i) => runWorker(i)));
    if (DEBUG_CONCURRENCY) {
        const elapsed = Math.max(1, Math.round(performance.now() - tStart));
        xbLog.info(MODULE_ID, `L0 pool done completed=${completed}/${total} failed=${failed} peakActive=${peakActive} elapsedMs=${elapsed}`);
    }

    // ═════════════════════════════════════════════════════════════════════
    // ★ Rate-limit recovery: wait, reset progress, re-scan from the start in slow mode
    // ═════════════════════════════════════════════════════════════════════
    if (rateLimited && !extractionCancelled) {
        const waitSec = RATE_LIMIT_WAIT_MS / 1000;
        xbLog.info(MODULE_ID, `限流保护:将重置进度并从头开始降速重来(并发=${RETRY_CONCURRENCY}, 间隔=${RETRY_INTERVAL_MS}ms)`);
        onProgress?.(`疑似限流,${waitSec}s 后降速重头开始...`, completed, total);

        await new Promise(r => setTimeout(r, RATE_LIMIT_WAIT_MS));

        if (!extractionCancelled) {
            // ★ Reset counters so the UI restarts from 0 ("starting over" feedback)
            rateLimited = false;
            consecutiveFailures = 0;
            completed = 0;
            failed = 0;

            let retryNextIdx = 0;

            xbLog.info(MODULE_ID, `限流恢复:开始降速模式扫描 ${pendingPairs.length} 个楼层`);

            const retryWorkers = Math.min(RETRY_CONCURRENCY, pendingPairs.length);
            const runRetryWorker = async (wid) => {
                while (true) {
                    if (extractionCancelled) return;
                    const idx = retryNextIdx++;
                    if (idx >= pendingPairs.length) return;

                    const pair = pendingPairs[idx];
                    const floor = pair.aiFloor;

                    // ★ Re-check floor status before retrying
                    const st = getL0FloorStatus(floor);
                    if (st?.status === 'ok' || st?.status === 'empty') {
                        // Already succeeded in the first pass — only advance progress.
                        completed++;
                        onProgress?.(`提取: ${completed}/${total} (跳过已完成)`, completed, total);
                        continue;
                    }

                    // ★ Still unresolved — process it in slow mode
                    await processPair(pair, idx, `retry-${wid}`);

                    // Pause between requests to avoid tripping the limiter again.
                    if (idx < pendingPairs.length - 1 && RETRY_INTERVAL_MS > 0) {
                        await new Promise(r => setTimeout(r, RETRY_INTERVAL_MS));
                    }
                }
            };

            await Promise.all(Array.from({ length: retryWorkers }, (_, i) => runRetryWorker(i)));
            xbLog.info(MODULE_ID, `降速重头开始阶段结束`);
        }
    }

    // Flush floor-status metadata; failure to schedule a save is non-fatal.
    try {
        saveMetadataDebounced?.();
    } catch { }

    // ★ Phase 2: vectorize all newly extracted atoms in one batched pass
    if (allNewAtoms.length > 0 && !extractionCancelled) {
        onProgress?.(`向量化 L0: 0/${allNewAtoms.length}`, 0, allNewAtoms.length);
        await vectorizeAtoms(chatId, allNewAtoms, (current, total) => {
            onProgress?.(`向量化 L0: ${current}/${total}`, current, total);
        });
    }

    xbLog.info(MODULE_ID, `L0 ${extractionCancelled ? '已取消' : '完成'}:atoms=${builtAtoms}, completed=${completed}/${total}, failed=${failed}`);
    return { built: builtAtoms };
}
|
||||
|
||||
// ============================================================================
|
||||
// 向量化(支持进度回调)
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Batch-vectorize atoms with a progress callback. Each atom yields two
 * embeddings — its semantic text and its R-aggregate text — requested
 * together in a single embed() call per batch, then split back apart.
 * Respects the module-level extractionCancelled flag between batches.
 * @param {string} chatId
 * @param {Array<object>} atoms
 * @param {(done:number, total:number)=>void} [onProgress]
 */
async function vectorizeAtoms(chatId, atoms, onProgress) {
    if (!atoms?.length) return;

    const vectorCfg = getVectorConfig();
    if (!vectorCfg?.enabled) return;

    const semanticTexts = atoms.map(a => a.semantic);
    const rTexts = atoms.map(a => buildRAggregateText(a));
    const fingerprint = getEngineFingerprint(vectorCfg);
    const batchSize = 20;

    try {
        const allVectors = [];

        for (let i = 0; i < semanticTexts.length; i += batchSize) {
            if (extractionCancelled) break;

            // Concatenate semantic + R texts so one request covers both channels.
            const semBatch = semanticTexts.slice(i, i + batchSize);
            const rBatch = rTexts.slice(i, i + batchSize);
            const payload = semBatch.concat(rBatch);
            const vectors = await embed(payload, { timeout: 30000 });
            const split = semBatch.length;
            if (!Array.isArray(vectors) || vectors.length < split * 2) {
                throw new Error(`embed length mismatch: expect>=${split * 2}, got=${vectors?.length || 0}`);
            }
            const semVectors = vectors.slice(0, split);
            const rVectors = vectors.slice(split, split + split);

            for (let j = 0; j < split; j++) {
                allVectors.push({
                    vector: semVectors[j],
                    // Fall back to the semantic vector if the R slot is missing.
                    rVector: rVectors[j] || semVectors[j],
                });
            }

            onProgress?.(allVectors.length, semanticTexts.length);
        }

        if (extractionCancelled) return;

        // Only persist the atoms that were actually vectorized before a break.
        const items = atoms.slice(0, allVectors.length).map((a, i) => ({
            atomId: a.atomId,
            floor: a.floor,
            vector: allVectors[i].vector,
            rVector: allVectors[i].rVector,
        }));

        await saveStateVectors(chatId, items, fingerprint);
        xbLog.info(MODULE_ID, `L0 向量化完成: ${items.length} 条`);
    } catch (e) {
        xbLog.error(MODULE_ID, 'L0 向量化失败', e);
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// 清空
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Wipe every L0 anchor: in-memory atoms, the floor index, and (when a chat
 * is active) the persisted vectors; then flush metadata immediately.
 * @param {string} [chatId] - chat whose vectors should also be cleared
 */
export async function clearAllAtomsAndVectors(chatId) {
    clearStateAtoms();
    clearL0Index();
    if (chatId) await clearStateVectors(chatId);

    // Persist right away rather than waiting for the next debounce tick.
    try {
        saveMetadataDebounced?.();
    } catch { }

    xbLog.info(MODULE_ID, '已清空所有记忆锚点');
}
|
||||
|
||||
// ============================================================================
|
||||
// 实时增量(AI 消息后触发)- 保持不变
|
||||
// ============================================================================
|
||||
|
||||
let extractionQueue = [];
|
||||
let isProcessing = false;
|
||||
|
||||
/**
 * Queue one (user, assistant) round for real-time L0 extraction.
 * No-op when no chat is active or vectors are disabled.
 * @param {number} aiFloor - floor index of the AI message
 * @param {object} aiMessage
 * @param {object|null} userMessage - paired user message, if any
 * @param {Function} [onComplete] - receives { floor, atomCount, error? }
 */
export async function extractAndStoreAtomsForRound(aiFloor, aiMessage, userMessage, onComplete) {
    const { chatId } = getContext();
    if (!chatId || !getVectorConfig()?.enabled) return;

    extractionQueue.push({ aiFloor, aiMessage, userMessage, chatId, onComplete });
    processQueue(); // fire-and-forget; the queue drains itself
}
|
||||
|
||||
/**
 * Drain the extraction queue one round at a time. Guarded so only a single
 * drain loop runs; new jobs pushed mid-drain are picked up by the same loop.
 */
async function processQueue() {
    if (isProcessing) return;
    if (extractionQueue.length === 0) return;
    isProcessing = true;

    let job;
    while ((job = extractionQueue.shift()) !== undefined) {
        const { aiFloor, aiMessage, userMessage, chatId, onComplete } = job;

        try {
            const atoms = await extractAtomsForRound(userMessage, aiMessage, aiFloor, { timeout: 12000 });

            if (!atoms?.length) {
                xbLog.info(MODULE_ID, `floor ${aiFloor}: 无有效 atoms`);
                onComplete?.({ floor: aiFloor, atomCount: 0 });
                continue;
            }

            for (const atom of atoms) atom.chatId = chatId;
            saveStateAtoms(atoms);

            // Real-time single-floor path: vectorize immediately.
            await vectorizeAtomsSimple(chatId, atoms);

            xbLog.info(MODULE_ID, `floor ${aiFloor}: ${atoms.length} atoms 已存储`);
            onComplete?.({ floor: aiFloor, atomCount: atoms.length });
        } catch (e) {
            xbLog.error(MODULE_ID, `floor ${aiFloor} 处理失败`, e);
            onComplete?.({ floor: aiFloor, atomCount: 0, error: e });
        }
    }

    isProcessing = false;
}
|
||||
|
||||
/**
 * Simple vectorization without a progress callback — used by the real-time
 * single-floor path. Embeds semantic + R-aggregate texts in one request and
 * persists the paired vectors.
 * @param {string} chatId
 * @param {Array<object>} atoms
 */
async function vectorizeAtomsSimple(chatId, atoms) {
    if (!atoms?.length) return;

    const vectorCfg = getVectorConfig();
    if (!vectorCfg?.enabled) return;

    const semanticTexts = atoms.map(a => a.semantic);
    const rTexts = atoms.map(a => buildRAggregateText(a));
    const fingerprint = getEngineFingerprint(vectorCfg);

    try {
        // One request: first half semantic vectors, second half R vectors.
        const vectors = await embed(semanticTexts.concat(rTexts), { timeout: 30000 });
        const split = semanticTexts.length;
        if (!Array.isArray(vectors) || vectors.length < split * 2) {
            throw new Error(`embed length mismatch: expect>=${split * 2}, got=${vectors?.length || 0}`);
        }
        const semVectors = vectors.slice(0, split);
        const rVectors = vectors.slice(split, split + split);

        const items = atoms.map((a, i) => ({
            atomId: a.atomId,
            floor: a.floor,
            vector: semVectors[i],
            // Fall back to the semantic vector if the R slot is missing.
            rVector: rVectors[i] || semVectors[i],
        }));

        await saveStateVectors(chatId, items, fingerprint);
    } catch (e) {
        xbLog.error(MODULE_ID, 'L0 向量化失败', e);
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// 回滚钩子
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Rollback hook (installed on globalThis by initStateIntegration):
 * purge all L0 state at or above the given floor.
 * @param {number} floor - first floor to delete (inclusive)
 */
async function handleStateRollback(floor) {
    xbLog.info(MODULE_ID, `收到回滚请求: floor >= ${floor}`);

    const { chatId } = getContext();

    // In-memory atoms and the floor index first, then persisted vectors.
    deleteStateAtomsFromFloor(floor);
    deleteL0IndexFromFloor(floor);
    if (chatId) await deleteStateVectorsFromFloor(chatId, floor);
}
|
||||
|
||||
// ============================================================================
|
||||
// 兼容旧接口
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Legacy entry: full rebuild of L0 anchors. Clears all stored atoms,
 * the floor index, and vectors, then delegates to the incremental pass
 * (which now sees every AI floor as pending).
 * @returns {Promise<{built:number}>}
 */
export async function batchExtractAndStoreAtoms(chatId, chat, onProgress) {
    if (!chatId || !chat?.length) return { built: 0 };
    if (!getVectorConfig()?.enabled) return { built: 0 };

    xbLog.info(MODULE_ID, `开始批量 L0 提取: ${chat.length} 条消息`);

    // Wipe previous state so the incremental pass rebuilds everything.
    clearStateAtoms();
    clearL0Index();
    await clearStateVectors(chatId);

    return incrementalExtractAtoms(chatId, chat, onProgress);
}
|
||||
|
||||
/**
 * Legacy entry: drop existing L0 vectors and re-embed every stored atom.
 * @param {string} chatId
 * @param {object} vectorCfg - must have .enabled
 * @returns {Promise<{built:number}>} atoms re-vectorized (0 when skipped)
 */
export async function rebuildStateVectors(chatId, vectorCfg) {
    if (!chatId || !vectorCfg?.enabled) return { built: 0 };

    const atoms = getStateAtoms();
    if (!atoms.length) return { built: 0 };

    xbLog.info(MODULE_ID, `重建 L0 向量: ${atoms.length} 条 atom`);

    await clearStateVectors(chatId);
    await vectorizeAtomsSimple(chatId, atoms);

    return { built: atoms.length };
}
|
||||
928
modules/story-summary/vector/retrieval/diffusion.js
Normal file
928
modules/story-summary/vector/retrieval/diffusion.js
Normal file
@@ -0,0 +1,928 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// diffusion.js - PPR Graph Diffusion (Personalized PageRank)
|
||||
//
|
||||
// Spreads activation from seed L0 atoms through entity co-occurrence graph
|
||||
// to discover narratively-connected but semantically-distant memories.
|
||||
//
|
||||
// Pipeline position: recall.js Stage 7.5
|
||||
// Input: seeds (reranked L0 from Stage 6)
|
||||
// Output: additional L0 atoms → merged into l0Selected
|
||||
//
|
||||
// Algorithm:
|
||||
// 1. Build undirected weighted graph over all L0 atoms
|
||||
// Candidate edges: WHAT + R semantic; WHO/WHERE are reweight-only
|
||||
// 2. Personalized PageRank (Power Iteration)
|
||||
// Seeds weighted by rerankScore — Haveliwala (2002) topic-sensitive variant
|
||||
// α = 0.15 restart probability — Page et al. (1998)
|
||||
// 3. Post-verification (Dense Cosine Gate)
|
||||
// Exclude seeds, cosine ≥ 0.45, final = PPR_norm × cosine ≥ 0.10
|
||||
//
|
||||
// References:
|
||||
// Page et al. "The PageRank Citation Ranking" (1998)
|
||||
// Haveliwala "Topic-Sensitive PageRank" (IEEE TKDE 2003)
|
||||
// Langville & Meyer "Eigenvector Methods for Web IR" (SIAM Review 2005)
|
||||
// Sun et al. "GraftNet" (EMNLP 2018)
|
||||
// Jaccard "Étude comparative de la distribution florale" (1912)
|
||||
// Szymkiewicz "Une contribution statistique" (1934) — Overlap coefficient
|
||||
// Rimmon-Kenan "Narrative Fiction" (2002) — Channel weight rationale
|
||||
//
|
||||
// Core PPR iteration aligned with NetworkX pagerank():
|
||||
// github.com/networkx/networkx — algorithms/link_analysis/pagerank_alg.py
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import { getContext } from '../../../../../../../extensions.js';
|
||||
|
||||
const MODULE_ID = 'diffusion';
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Configuration
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Diffusion tuning parameters. PPR constants follow the papers cited in the
 * file header; the remaining thresholds are engine-specific knobs.
 */
const CONFIG = {
    // PPR parameters (Page et al. 1998; GraftNet 2018 uses same values)
    ALPHA: 0.15, // restart probability
    EPSILON: 1e-5, // L1 convergence threshold
    MAX_ITER: 50, // hard iteration cap (typically converges in 15-25)

    // Edge weight channel coefficients
    // Candidate generation uses WHAT + R semantic only.
    // WHO/WHERE are reweight-only signals.
    GAMMA: {
        what: 0.40, // interaction pair overlap
        rSem: 0.40, // semantic similarity over edges.r aggregate
        who: 0.10, // endpoint entity overlap (reweight-only)
        where: 0.05, // location exact match (reweight-only)
        time: 0.05, // temporal decay score
    },
    // R semantic candidate generation
    R_SEM_MIN_SIM: 0.62, // presumably min r-vector cosine for a candidate edge — confirm in edge builder
    R_SEM_TOPK: 8, // presumably neighbors kept per node in the R-semantic channel — confirm
    TIME_WINDOW_MAX: 80, // presumably max floor distance considered for time edges — confirm
    TIME_DECAY_DIVISOR: 12, // e-folding distance used by getTimeScore()
    WHERE_MAX_GROUP_SIZE: 16, // skip location-only pair expansion for over-common places
    WHERE_FREQ_DAMP_PIVOT: 6, // location freq <= pivot keeps full WHERE score
    WHERE_FREQ_DAMP_MIN: 0.20, // lower bound for damped WHERE contribution

    // Post-verification (Cosine Gate)
    COSINE_GATE: 0.46, // min cosine(queryVector, stateVector)
    SCORE_FLOOR: 0.10, // min finalScore = PPR_normalized × cosine
    DIFFUSION_CAP: 100, // max diffused nodes (excluding seeds)
};
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Utility functions
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Unicode-safe text normalization (matches recall.js / entity-lexicon.js):
 * NFKC-fold, strip zero-width characters (ZWSP/ZWNJ/ZWJ/BOM), trim, lowercase.
 * @param {*} s - any value; nullish becomes ''
 * @returns {string}
 */
function normalize(s) {
    const folded = String(s || '').normalize('NFKC');
    const visible = folded.replace(/[\u200B-\u200D\uFEFF]/g, '');
    return visible.trim().toLowerCase();
}
|
||||
|
||||
/**
 * Cosine similarity between two equal-length numeric vectors.
 * Returns 0 for mismatched lengths, empty/nullish inputs, or zero-norm vectors.
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} similarity in [-1, 1], or 0 on degenerate input
 */
function cosineSimilarity(a, b) {
    if (!a?.length || !b?.length || a.length !== b.length) return 0;

    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
        const x = a[i];
        const y = b[i];
        dot += x * y;
        normA += x * x;
        normB += y * y;
    }

    // Falsy norm (0 or NaN) means the similarity is undefined — report 0.
    return normA && normB ? dot / (Math.sqrt(normA) * Math.sqrt(normB)) : 0;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Feature extraction from L0 atoms
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Endpoint entity set from edges.s/edges.t (used for candidate pair generation).
 * Each edge endpoint is normalized; empty and excluded names are dropped.
 * @param {object} atom - L0 atom with optional edges [{s, t, r}, ...]
 * @param {Set<string>} excludeEntities - normalized entities to exclude (e.g. name1)
 * @returns {Set<string>} normalized endpoint names
 */
function extractEntities(atom, excludeEntities = new Set()) {
  const result = new Set();
  for (const edge of (atom.edges || [])) {
    for (const endpoint of [normalize(edge?.s), normalize(edge?.t)]) {
      if (endpoint && !excludeEntities.has(endpoint)) {
        result.add(endpoint);
      }
    }
  }
  return result;
}
|
||||
|
||||
/**
 * WHAT channel: interaction pairs "A↔B" (direction-insensitive).
 * A pair is emitted only when both endpoints normalize to non-empty
 * strings and neither is excluded; endpoints are sorted so that
 * (A,B) and (B,A) produce the same key.
 * @param {object} atom - L0 atom with optional edges
 * @param {Set<string>} excludeEntities - normalized entities to exclude
 * @returns {Set<string>} canonical "a↔b" pair keys
 */
function extractInteractionPairs(atom, excludeEntities = new Set()) {
  const pairs = new Set();
  for (const edge of (atom.edges || [])) {
    const src = normalize(edge?.s);
    const dst = normalize(edge?.t);
    if (!src || !dst) continue;
    if (excludeEntities.has(src) || excludeEntities.has(dst)) continue;
    pairs.add([src, dst].sort().join('\u2194'));
  }
  return pairs;
}
|
||||
|
||||
/**
 * WHERE channel: normalized location string.
 * @param {object} atom - L0 atom with optional `where` field
 * @returns {string} normalized location, or '' if absent
 */
function extractLocation(atom) {
  const { where } = atom;
  return normalize(where);
}
|
||||
|
||||
/**
 * Absolute floor distance between two atoms; missing floors count as 0.
 * @param {object|null} a
 * @param {object|null} b
 * @returns {number} |floor(a) - floor(b)|
 */
function getFloorDistance(a, b) {
  const floorA = Number(a?.floor || 0);
  const floorB = Number(b?.floor || 0);
  return floorA >= floorB ? floorA - floorB : floorB - floorA;
}
|
||||
|
||||
/**
 * Exponential temporal decay score for a floor distance.
 * @param {number} distance - non-negative floor distance
 * @returns {number} decay in (0, 1], 1 at distance 0
 */
function getTimeScore(distance) {
  const decay = distance / CONFIG.TIME_DECAY_DIVISOR;
  return Math.exp(-decay);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Set similarity functions
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Jaccard index: |A∩B| / |A∪B| (Jaccard 1912).
 * Iterates the smaller set against the larger for efficiency.
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} 0..1 (0 when either set is empty)
 */
function jaccard(a, b) {
  if (a.size === 0 || b.size === 0) return 0;

  const [probe, base] = a.size <= b.size ? [a, b] : [b, a];
  let shared = 0;
  probe.forEach((item) => {
    if (base.has(item)) shared++;
  });

  const union = a.size + b.size - shared;
  return union > 0 ? shared / union : 0;
}
|
||||
|
||||
/**
 * Overlap coefficient: |A∩B| / min(|A|,|B|) (Szymkiewicz-Simpson 1934).
 * Used for directed pairs where set sizes are small (1-3); Jaccard
 * over-penalizes small-set asymmetry.
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} 0..1 (0 when either set is empty)
 */
function overlapCoefficient(a, b) {
  if (!a.size || !b.size) return 0;

  const smaller = a.size <= b.size ? a : b;
  const larger = smaller === a ? b : a;

  let intersection = 0;
  for (const item of smaller) {
    if (larger.has(item)) intersection++;
  }
  return intersection / smaller.size;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Graph construction
|
||||
//
|
||||
// Candidate pairs discovered via WHAT inverted index and R semantic top-k.
|
||||
// WHO/WHERE are reweight-only signals and never create candidate pairs.
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Pre-extract retrieval features for every atom in one pass.
 * @param {object[]} allAtoms - L0 atoms
 * @param {Set<string>} excludeEntities - normalized entities to exclude
 * @returns {object[]} one { entities, interactionPairs, location } per atom,
 *   index-aligned with allAtoms
 */
function extractAllFeatures(allAtoms, excludeEntities = new Set()) {
  const features = [];
  for (const atom of allAtoms) {
    features.push({
      entities: extractEntities(atom, excludeEntities),
      interactionPairs: extractInteractionPairs(atom, excludeEntities),
      location: extractLocation(atom),
    });
  }
  return features;
}
|
||||
|
||||
/**
 * Build inverted indices over extracted features.
 * whatIndex maps each interaction-pair key to the list of atom indices
 * containing it; locationFreq counts how many atoms share each location.
 * @param {object[]} features - extractAllFeatures() output
 * @returns {{ whatIndex: Map<string, number[]>, locationFreq: Map<string, number> }}
 */
function buildInvertedIndices(features) {
  const whatIndex = new Map();
  const locationFreq = new Map();

  features.forEach((feature, i) => {
    for (const pair of feature.interactionPairs) {
      const bucket = whatIndex.get(pair);
      if (bucket) {
        bucket.push(i);
      } else {
        whatIndex.set(pair, [i]);
      }
    }
    if (feature.location) {
      locationFreq.set(feature.location, (locationFreq.get(feature.location) || 0) + 1);
    }
  });

  return { whatIndex, locationFreq };
}
|
||||
|
||||
/**
 * Collect candidate pairs from an inverted index: every unordered pair of
 * atom indices sharing an index value is packed as lo * N + hi and added
 * to pairSet (mutated in place).
 * @param {Map<string, number[]>} index - value → [atomIndex, ...]
 * @param {Set<number>} pairSet - packed pair collector (mutated)
 * @param {number} N - total atom count (pair-packing base)
 */
function collectPairsFromIndex(index, pairSet, N) {
  for (const bucket of index.values()) {
    const len = bucket.length;
    for (let a = 0; a < len - 1; a++) {
      for (let b = a + 1; b < len; b++) {
        const x = bucket[a];
        const y = bucket[b];
        // Canonical packing: smaller index first.
        pairSet.add(x < y ? x * N + y : y * N + x);
      }
    }
  }
}
|
||||
|
||||
/**
 * Build weighted undirected graph over L0 atoms.
 *
 * Candidate pairs come from two generators only:
 *   1. WHAT — atoms sharing an interaction pair (inverted index), and
 *   2. R semantic — atoms whose edges.r vectors are cosine-similar
 *      (threshold R_SEM_MIN_SIM, per-node top-K R_SEM_TOPK, floor window
 *      TIME_WINDOW_MAX).
 * WHO/WHERE/time never generate candidates; they only reweight edges.
 *
 * @param {object[]} allAtoms - L0 atoms ({ atomId, floor, edges, where, ... })
 * @param {object[]} stateVectors - entries with { atomId, rVector? }
 * @param {Set<string>} excludeEntities - normalized names dropped from features
 * @returns {{ neighbors: object[][], edgeCount: number, channelStats: object, buildTime: number }}
 *   plus diagnostic counters (candidatePairs, pairsFromWhat, pairsFromRSem,
 *   rSemAvgSim, timeWindowFilteredPairs, topKPrunedPairs, reweightWhoUsed,
 *   reweightWhereUsed, edgeDensity)
 */
function buildGraph(allAtoms, stateVectors = [], excludeEntities = new Set()) {
  const N = allAtoms.length;
  const T0 = performance.now();

  const features = extractAllFeatures(allAtoms, excludeEntities);
  const { whatIndex, locationFreq } = buildInvertedIndices(features);

  // Candidate pairs: WHAT + R semantic. Pairs are packed as lo * N + hi.
  const pairSetByWhat = new Set();
  const pairSetByRSem = new Set();
  const rSemByPair = new Map();
  const pairSet = new Set();
  collectPairsFromIndex(whatIndex, pairSetByWhat, N);

  // Map atomId → rVector; atoms without an rVector get null and are
  // skipped by the semantic generator below.
  const rVectorByAtomId = new Map(
    (stateVectors || [])
      .filter(v => v?.atomId && v?.rVector?.length)
      .map(v => [v.atomId, v.rVector])
  );
  const rVectors = allAtoms.map(a => rVectorByAtomId.get(a.atomId) || null);

  const directedNeighbors = Array.from({ length: N }, () => []);
  let rSemSimSum = 0;
  let rSemSimCount = 0;
  let topKPrunedPairs = 0;
  let timeWindowFilteredPairs = 0;

  // Enumerate only pairs within floor window to avoid O(N^2) full scan.
  const sortedByFloor = allAtoms
    .map((atom, idx) => ({ idx, floor: Number(atom?.floor || 0) }))
    .sort((a, b) => a.floor - b.floor);

  for (let left = 0; left < sortedByFloor.length; left++) {
    const i = sortedByFloor[left].idx;
    const baseFloor = sortedByFloor[left].floor;

    for (let right = left + 1; right < sortedByFloor.length; right++) {
      const floorDelta = sortedByFloor[right].floor - baseFloor;
      // Sorted order means all further candidates are also out of window.
      if (floorDelta > CONFIG.TIME_WINDOW_MAX) break;

      const j = sortedByFloor[right].idx;
      const vi = rVectors[i];
      const vj = rVectors[j];
      if (!vi?.length || !vj?.length) continue;

      const sim = cosineSimilarity(vi, vj);
      if (sim < CONFIG.R_SEM_MIN_SIM) continue;

      // Record both directions; per-node top-K pruning happens next.
      directedNeighbors[i].push({ target: j, sim });
      directedNeighbors[j].push({ target: i, sim });
      rSemSimSum += sim;
      rSemSimCount++;
    }
  }

  // Keep only each node's top-K most similar semantic neighbors; a pair
  // survives if it is in the top-K of at least one endpoint.
  for (let i = 0; i < N; i++) {
    const arr = directedNeighbors[i];
    if (!arr.length) continue;
    arr.sort((a, b) => b.sim - a.sim);
    if (arr.length > CONFIG.R_SEM_TOPK) {
      topKPrunedPairs += arr.length - CONFIG.R_SEM_TOPK;
    }
    for (const n of arr.slice(0, CONFIG.R_SEM_TOPK)) {
      const lo = Math.min(i, n.target);
      const hi = Math.max(i, n.target);
      const packed = lo * N + hi;
      pairSetByRSem.add(packed);
      // Keep the max similarity seen for the pair (both directions feed in).
      const prev = rSemByPair.get(packed) || 0;
      if (n.sim > prev) rSemByPair.set(packed, n.sim);
    }
  }
  for (const p of pairSetByWhat) pairSet.add(p);
  for (const p of pairSetByRSem) pairSet.add(p);

  // Compute edge weights for all candidates
  const neighbors = Array.from({ length: N }, () => []);
  let edgeCount = 0;
  const channelStats = { what: 0, where: 0, rSem: 0, who: 0 };
  let reweightWhoUsed = 0;
  let reweightWhereUsed = 0;

  for (const packed of pairSet) {
    // Unpack lo * N + hi.
    const i = Math.floor(packed / N);
    const j = packed % N;

    // WHAT-generated pairs may still exceed the floor window; drop them here.
    const distance = getFloorDistance(allAtoms[i], allAtoms[j]);
    if (distance > CONFIG.TIME_WINDOW_MAX) {
      timeWindowFilteredPairs++;
      continue;
    }
    const wTime = getTimeScore(distance);

    const fi = features[i];
    const fj = features[j];

    const wWhat = overlapCoefficient(fi.interactionPairs, fj.interactionPairs);
    const wRSem = rSemByPair.get(packed) || 0;
    const wWho = jaccard(fi.entities, fj.entities);
    let wWhere = 0.0;
    if (fi.location && fi.location === fj.location) {
      // Damp over-common locations: full score up to the pivot frequency,
      // then hyperbolic falloff with a floor at WHERE_FREQ_DAMP_MIN.
      const freq = locationFreq.get(fi.location) || 1;
      const damp = Math.max(
        CONFIG.WHERE_FREQ_DAMP_MIN,
        Math.min(1, CONFIG.WHERE_FREQ_DAMP_PIVOT / Math.max(1, freq))
      );
      wWhere = damp;
    }

    // Linear channel mixture with GAMMA weights (see CONFIG above).
    const weight =
      CONFIG.GAMMA.what * wWhat +
      CONFIG.GAMMA.rSem * wRSem +
      CONFIG.GAMMA.who * wWho +
      CONFIG.GAMMA.where * wWhere +
      CONFIG.GAMMA.time * wTime;

    if (weight > 0) {
      neighbors[i].push({ target: j, weight });
      neighbors[j].push({ target: i, weight });
      edgeCount++;

      if (wWhat > 0) channelStats.what++;
      if (wRSem > 0) channelStats.rSem++;
      if (wWho > 0) channelStats.who++;
      if (wWhere > 0) channelStats.where++;
      if (wWho > 0) reweightWhoUsed++;
      if (wWhere > 0) reweightWhereUsed++;
    }
  }

  const buildTime = Math.round(performance.now() - T0);

  xbLog.info(MODULE_ID,
    `Graph: ${N} nodes, ${edgeCount} edges ` +
    `(candidate_by_what=${pairSetByWhat.size} candidate_by_r_sem=${pairSetByRSem.size}) ` +
    `(what=${channelStats.what} r_sem=${channelStats.rSem} who=${channelStats.who} where=${channelStats.where}) ` +
    `(reweight_who_used=${reweightWhoUsed} reweight_where_used=${reweightWhereUsed}) ` +
    `(time_window_filtered=${timeWindowFilteredPairs} topk_pruned=${topKPrunedPairs}) ` +
    `(${buildTime}ms)`
  );

  // Edge density as a percentage of all possible unordered pairs.
  const totalPairs = N > 1 ? (N * (N - 1)) / 2 : 0;
  const edgeDensity = totalPairs > 0 ? Number((edgeCount / totalPairs * 100).toFixed(2)) : 0;

  return {
    neighbors,
    edgeCount,
    channelStats,
    buildTime,
    candidatePairs: pairSet.size,
    pairsFromWhat: pairSetByWhat.size,
    pairsFromRSem: pairSetByRSem.size,
    rSemAvgSim: rSemSimCount ? Number((rSemSimSum / rSemSimCount).toFixed(3)) : 0,
    timeWindowFilteredPairs,
    topKPrunedPairs,
    reweightWhoUsed,
    reweightWhereUsed,
    edgeDensity,
  };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// PPR: Seed vector construction
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Build personalization vector s from seeds, weighted by rerankScore.
 * Haveliwala (2002): non-uniform personalization improves topic sensitivity.
 * Seeds absent from idToIdx are skipped; weights are clamped to >= 0 and
 * the result is L1-normalized (sums to 1 when any mass exists).
 *
 * @param {object[]} seeds - seed L0 entries with atomId and rerankScore
 * @param {Map<string, number>} idToIdx - atomId → array index
 * @param {number} N - total node count
 * @returns {Float64Array} personalization vector
 */
function buildSeedVector(seeds, idToIdx, N) {
  const vec = new Float64Array(N);
  let mass = 0;

  for (const seed of seeds) {
    const idx = idToIdx.get(seed.atomId);
    if (idx == null) continue;

    // Prefer rerankScore; fall back to raw similarity; never negative.
    const w = Math.max(0, seed.rerankScore || seed.similarity || 0);
    vec[idx] += w;
    mass += w;
  }

  // L1 normalize to a probability distribution.
  if (mass > 0) {
    for (let i = 0; i < N; i++) vec[i] /= mass;
  }

  return vec;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// PPR: Column normalization + dangling node detection
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Column-normalize adjacency into transition matrix W.
 *
 * Column j of W: W_{ij} = weight(i,j) / Σ_k weight(k,j)
 * Dangling nodes (no outgoing edges, or non-positive weight sum) are
 * collected separately and handled in powerIteration via redistribution
 * to the personalization vector s. (Langville & Meyer 2005, §4.1)
 *
 * @param {object[][]} neighbors - neighbors[j] = [{target, weight}, ...]
 * @param {number} N
 * @returns {{ columns: object[][], dangling: number[] }}
 */
function columnNormalize(neighbors, N) {
  const columns = Array.from({ length: N }, () => []);
  const dangling = [];

  for (let j = 0; j < N; j++) {
    const edges = neighbors[j];
    const total = edges.reduce((acc, e) => acc + e.weight, 0);

    if (total <= 0) {
      dangling.push(j);
      continue;
    }

    columns[j] = edges.map((e) => ({ target: e.target, prob: e.weight / total }));
  }

  return { columns, dangling };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// PPR: Power Iteration
|
||||
//
|
||||
// Aligned with NetworkX pagerank() (pagerank_alg.py):
|
||||
//
|
||||
// NetworkX "alpha" = damping = our (1 − α)
|
||||
// NetworkX "1-alpha" = teleportation = our α
|
||||
//
|
||||
// Per iteration:
|
||||
// π_new[i] = α·s[i] + (1−α)·( Σ_j W_{ij}·π[j] + dangling_sum·s[i] )
|
||||
//
|
||||
// Convergence: Perron-Frobenius theorem guarantees unique stationary
|
||||
// distribution for irreducible aperiodic column-stochastic matrix.
|
||||
// Rate: ‖π^(t+1) − π^t‖₁ ≤ (1−α)^t (geometric).
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Run PPR Power Iteration.
 *
 * Per iteration: π_new[i] = α·s[i] + (1−α)·( Σ_j W_{ij}·π[j] + danglingSum·s[i] ),
 * where α is the teleport probability (NetworkX's "1-alpha"). Iterates until
 * the L1 change drops below EPSILON or MAX_ITER is reached.
 *
 * NOTE(review): CONFIG.ALPHA / EPSILON / MAX_ITER are declared above this
 * chunk — presumably teleport prob, L1 tolerance, and iteration cap; confirm.
 *
 * @param {object[][]} columns - column-normalized transition matrix
 * @param {Float64Array} s - personalization vector (sums to 1)
 * @param {number[]} dangling - dangling node indices
 * @param {number} N - node count
 * @returns {{ pi: Float64Array, iterations: number, finalError: number }}
 *   pi: stationary distribution; finalError: last L1 delta (may exceed
 *   EPSILON if MAX_ITER was hit)
 */
function powerIteration(columns, s, dangling, N) {
  const alpha = CONFIG.ALPHA;
  const d = 1 - alpha; // damping factor = prob of following edges
  const epsilon = CONFIG.EPSILON;
  const maxIter = CONFIG.MAX_ITER;

  // Initialize π to personalization vector
  let pi = new Float64Array(N);
  for (let i = 0; i < N; i++) pi[i] = s[i];

  let iterations = 0;
  let finalError = 0;

  for (let iter = 0; iter < maxIter; iter++) {
    const piNew = new Float64Array(N);

    // Dangling mass: probability at nodes with no outgoing edges
    // redistributed to personalization vector (Langville & Meyer 2005)
    let danglingSum = 0;
    for (let k = 0; k < dangling.length; k++) {
      danglingSum += pi[dangling[k]];
    }

    // Sparse matrix-vector product: (1−α) · W · π
    // Skipping zero entries keeps the cost proportional to active mass.
    for (let j = 0; j < N; j++) {
      const pj = pi[j];
      if (pj === 0) continue;

      const col = columns[j];
      const dpj = d * pj;
      for (let e = 0; e < col.length; e++) {
        piNew[col[e].target] += dpj * col[e].prob;
      }
    }

    // Restart + dangling contribution:
    // α · s[i] + (1−α) · danglingSum · s[i]
    const restartCoeff = alpha + d * danglingSum;
    for (let i = 0; i < N; i++) {
      piNew[i] += restartCoeff * s[i];
    }

    // L1 convergence check
    let l1 = 0;
    for (let i = 0; i < N; i++) {
      l1 += Math.abs(piNew[i] - pi[i]);
    }

    pi = piNew;
    iterations = iter + 1;
    finalError = l1;

    if (l1 < epsilon) break;
  }

  return { pi, iterations, finalError };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Post-verification: Dense Cosine Gate
|
||||
//
|
||||
// PPR measures graph-structural relevance ("same characters").
|
||||
// Cosine gate measures semantic relevance ("related to current topic").
|
||||
// Product combination ensures both dimensions are satisfied
|
||||
// (CombMNZ — Fox & Shaw, TREC-2 1994).
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Filter PPR-activated nodes by semantic relevance.
 *
 * For each non-seed node with PPR > 0:
 *   1. cosine(queryVector, stateVector) ≥ COSINE_GATE
 *   2. finalScore = PPR_normalized × cosine ≥ SCORE_FLOOR
 *   3. Top DIFFUSION_CAP by finalScore
 * Nodes lacking a state vector are counted in gateStats.noVector and dropped.
 *
 * @param {Float64Array} pi - PPR stationary distribution
 * @param {string[]} atomIds - index → atomId
 * @param {Map<string, object>} atomById - atomId → atom object
 * @param {Set<string>} seedAtomIds - seed atomIds (excluded from output)
 * @param {Map<string, Float32Array>} vectorMap - atomId → embedding vector
 * @param {Float32Array|number[]} queryVector - R2 weighted query vector
 * @returns {{ diffused: object[], gateStats: object }}
 *   diffused entries: { atomId, floor, atom, finalScore, pprScore,
 *   pprNormalized, cosine }, sorted by finalScore descending
 */
function postVerify(pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector) {
  const N = atomIds.length;
  const gateStats = { passed: 0, filtered: 0, noVector: 0 };

  // Find max PPR score among non-seed nodes (for normalization)
  let maxPPR = 0;
  for (let i = 0; i < N; i++) {
    if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) {
      if (pi[i] > maxPPR) maxPPR = pi[i];
    }
  }

  // No activated non-seed mass means nothing can pass the gate.
  if (maxPPR <= 0) {
    return { diffused: [], gateStats };
  }

  const candidates = [];

  for (let i = 0; i < N; i++) {
    const atomId = atomIds[i];

    // Skip seeds and zero-probability nodes
    if (seedAtomIds.has(atomId)) continue;
    if (pi[i] <= 0) continue;

    // Require state vector for cosine verification
    const vec = vectorMap.get(atomId);
    if (!vec?.length) {
      gateStats.noVector++;
      continue;
    }

    // Cosine gate
    const cos = cosineSimilarity(queryVector, vec);
    if (cos < CONFIG.COSINE_GATE) {
      gateStats.filtered++;
      continue;
    }

    // Final score = PPR_normalized × cosine
    const pprNorm = pi[i] / maxPPR;
    const finalScore = pprNorm * cos;

    if (finalScore < CONFIG.SCORE_FLOOR) {
      gateStats.filtered++;
      continue;
    }

    gateStats.passed++;

    // Atom lookup can miss if stores are out of sync; the node is still
    // counted as passed above but yields no output entry.
    const atom = atomById.get(atomId);
    if (!atom) continue;

    candidates.push({
      atomId,
      floor: atom.floor,
      atom,
      finalScore,
      pprScore: pi[i],
      pprNormalized: pprNorm,
      cosine: cos,
    });
  }

  // Sort by finalScore descending, cap at DIFFUSION_CAP
  candidates.sort((a, b) => b.finalScore - a.finalScore);
  const diffused = candidates.slice(0, CONFIG.DIFFUSION_CAP);

  return { diffused, gateStats };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Main entry point
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Spread activation from seed L0 atoms through entity co-occurrence graph.
 *
 * Pipeline: index atoms → buildGraph (WHAT + R-semantic candidates) →
 * seed vector → column-normalize → PPR power iteration → cosine-gated
 * post-verification → metrics. Early-exits (empty inputs, no valid seeds,
 * no edges) fill metrics and return [].
 *
 * Called from recall.js Stage 7.5, after locateAndPullEvidence and before
 * Causation Trace. Results are merged into l0Selected and consumed by
 * prompt.js through existing budget/formatting pipeline (zero downstream changes).
 *
 * @param {object[]} seeds - l0Selected from recall Stage 6
 *                           Each: { atomId, rerankScore, similarity, atom, ... }
 * @param {object[]} allAtoms - getStateAtoms() result
 *                              Each: { atomId, floor, semantic, edges, where }
 * @param {object[]} stateVectors - getAllStateVectors() result
 *                                  Each: { atomId, floor, vector: Float32Array, rVector?: Float32Array }
 * @param {Float32Array|number[]} queryVector - R2 weighted query vector
 * @param {object|null} metrics - metrics object (optional, mutated in-place)
 * @returns {object[]} Additional L0 atoms for l0Selected
 *                     Each: { atomId, floor, atom, finalScore, pprScore, pprNormalized, cosine }
 */
export function diffuseFromSeeds(seeds, allAtoms, stateVectors, queryVector, metrics) {
  const T0 = performance.now();

  // ─── Early exits ─────────────────────────────────────────────────

  if (!seeds?.length || !allAtoms?.length || !queryVector?.length) {
    fillMetricsEmpty(metrics);
    return [];
  }

  // Align with entity-lexicon hard rule: exclude name1 from graph features.
  // NOTE(review): getContext is not imported in this visible chunk —
  // presumably imported at the top of this file; verify.
  const { name1 } = getContext();
  const excludeEntities = new Set();
  if (name1) excludeEntities.add(normalize(name1));

  // ─── 1. Build atom index ─────────────────────────────────────────

  const atomById = new Map();
  const atomIds = [];
  const idToIdx = new Map();

  for (let i = 0; i < allAtoms.length; i++) {
    const a = allAtoms[i];
    atomById.set(a.atomId, a);
    atomIds.push(a.atomId);
    idToIdx.set(a.atomId, i);
  }

  const N = allAtoms.length;

  // Validate seeds against atom index
  const validSeeds = seeds.filter(s => idToIdx.has(s.atomId));
  const seedAtomIds = new Set(validSeeds.map(s => s.atomId));

  if (!validSeeds.length) {
    fillMetricsEmpty(metrics);
    return [];
  }

  // ─── 2. Build graph ──────────────────────────────────────────────

  const graph = buildGraph(allAtoms, stateVectors, excludeEntities);

  if (graph.edgeCount === 0) {
    // Still record graph-construction diagnostics before bailing out.
    fillMetrics(metrics, {
      seedCount: validSeeds.length,
      graphNodes: N,
      graphEdges: 0,
      channelStats: graph.channelStats,
      candidatePairs: graph.candidatePairs,
      pairsFromWhat: graph.pairsFromWhat,
      pairsFromRSem: graph.pairsFromRSem,
      rSemAvgSim: graph.rSemAvgSim,
      timeWindowFilteredPairs: graph.timeWindowFilteredPairs,
      topKPrunedPairs: graph.topKPrunedPairs,
      edgeDensity: graph.edgeDensity,
      reweightWhoUsed: graph.reweightWhoUsed,
      reweightWhereUsed: graph.reweightWhereUsed,
      time: graph.buildTime,
    });
    xbLog.info(MODULE_ID, 'No graph edges — skipping diffusion');
    return [];
  }

  // ─── 3. Build seed vector ────────────────────────────────────────

  const s = buildSeedVector(validSeeds, idToIdx, N);

  // ─── 4. Column normalize ─────────────────────────────────────────

  const { columns, dangling } = columnNormalize(graph.neighbors, N);

  // ─── 5. PPR Power Iteration ──────────────────────────────────────

  const T_PPR = performance.now();
  const { pi, iterations, finalError } = powerIteration(columns, s, dangling, N);
  const pprTime = Math.round(performance.now() - T_PPR);

  // Count activated non-seed nodes
  let pprActivated = 0;
  for (let i = 0; i < N; i++) {
    if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) pprActivated++;
  }

  // ─── 6. Post-verification ────────────────────────────────────────

  const vectorMap = new Map();
  for (const sv of (stateVectors || [])) {
    vectorMap.set(sv.atomId, sv.vector);
  }

  const { diffused, gateStats } = postVerify(
    pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector
  );

  // ─── 7. Metrics ──────────────────────────────────────────────────

  const totalTime = Math.round(performance.now() - T0);

  fillMetrics(metrics, {
    seedCount: validSeeds.length,
    graphNodes: N,
    graphEdges: graph.edgeCount,
    channelStats: graph.channelStats,
    candidatePairs: graph.candidatePairs,
    pairsFromWhat: graph.pairsFromWhat,
    pairsFromRSem: graph.pairsFromRSem,
    rSemAvgSim: graph.rSemAvgSim,
    timeWindowFilteredPairs: graph.timeWindowFilteredPairs,
    topKPrunedPairs: graph.topKPrunedPairs,
    edgeDensity: graph.edgeDensity,
    reweightWhoUsed: graph.reweightWhoUsed,
    reweightWhereUsed: graph.reweightWhereUsed,
    buildTime: graph.buildTime,
    iterations,
    convergenceError: finalError,
    pprActivated,
    cosineGatePassed: gateStats.passed,
    cosineGateFiltered: gateStats.filtered,
    cosineGateNoVector: gateStats.noVector,
    postGatePassRate: pprActivated > 0
      ? Math.round((gateStats.passed / pprActivated) * 100)
      : 0,
    finalCount: diffused.length,
    scoreDistribution: diffused.length > 0
      ? calcScoreStats(diffused.map(d => d.finalScore))
      : { min: 0, max: 0, mean: 0 },
    time: totalTime,
  });

  xbLog.info(MODULE_ID,
    `Diffusion: ${validSeeds.length} seeds → ` +
    `graph(${N}n/${graph.edgeCount}e) → ` +
    `PPR(${iterations}it, ε=${finalError.toExponential(1)}, ${pprTime}ms) → ` +
    `${pprActivated} activated → ` +
    `gate(${gateStats.passed}\u2713/${gateStats.filtered}\u2717` +
    `${gateStats.noVector ? `/${gateStats.noVector}?` : ''}) → ` +
    `${diffused.length} final (${totalTime}ms)`
  );

  return diffused;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Metrics helpers
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Compute min/max/mean distribution of a score array.
 * Single O(n) pass — the previous implementation copied and sorted the
 * array (O(n log n)) just to read the first and last elements.
 * Values are rounded to 3 decimal places.
 * @param {number[]} scores
 * @returns {{ min: number, max: number, mean: number }}
 */
function calcScoreStats(scores) {
  if (!scores.length) return { min: 0, max: 0, mean: 0 };

  let min = scores[0];
  let max = scores[0];
  let sum = 0;
  for (const v of scores) {
    if (v < min) min = v;
    if (v > max) max = v;
    sum += v;
  }

  return {
    min: Number(min.toFixed(3)),
    max: Number(max.toFixed(3)),
    mean: Number((sum / scores.length).toFixed(3)),
  };
}
|
||||
|
||||
/**
 * Fill metrics with an all-zero diffusion block (used on early-exit paths).
 * Delegates to fillMetrics with empty data so the zeroed shape can never
 * drift out of sync with the populated one — previously this function
 * duplicated fillMetrics' full field list by hand.
 * No-op when metrics is null/undefined.
 * @param {object|null} metrics - mutated in place
 */
function fillMetricsEmpty(metrics) {
  fillMetrics(metrics, {});
}
|
||||
|
||||
/**
 * Write a populated diffusion metrics block onto the metrics object.
 * Missing or falsy numeric fields default to 0; missing composite fields
 * get zeroed shapes. No-op when metrics is null/undefined.
 * @param {object|null} metrics - mutated in place
 * @param {object} data - partial diffusion stats
 */
function fillMetrics(metrics, data) {
  if (!metrics) return;

  // Falsy → 0, matching the original `|| 0` defaulting.
  const num = (v) => v || 0;

  metrics.diffusion = {
    seedCount: num(data.seedCount),
    graphNodes: num(data.graphNodes),
    graphEdges: num(data.graphEdges),
    iterations: num(data.iterations),
    convergenceError: num(data.convergenceError),
    pprActivated: num(data.pprActivated),
    cosineGatePassed: num(data.cosineGatePassed),
    cosineGateFiltered: num(data.cosineGateFiltered),
    cosineGateNoVector: num(data.cosineGateNoVector),
    postGatePassRate: num(data.postGatePassRate),
    finalCount: num(data.finalCount),
    scoreDistribution: data.scoreDistribution || { min: 0, max: 0, mean: 0 },
    byChannel: data.channelStats || { what: 0, where: 0, rSem: 0, who: 0 },
    candidatePairs: num(data.candidatePairs),
    pairsFromWhat: num(data.pairsFromWhat),
    pairsFromRSem: num(data.pairsFromRSem),
    rSemAvgSim: num(data.rSemAvgSim),
    timeWindowFilteredPairs: num(data.timeWindowFilteredPairs),
    topKPrunedPairs: num(data.topKPrunedPairs),
    edgeDensity: num(data.edgeDensity),
    reweightWhoUsed: num(data.reweightWhoUsed),
    reweightWhereUsed: num(data.reweightWhereUsed),
    time: num(data.time),
  };
}
|
||||
221
modules/story-summary/vector/retrieval/entity-lexicon.js
Normal file
221
modules/story-summary/vector/retrieval/entity-lexicon.js
Normal file
@@ -0,0 +1,221 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// entity-lexicon.js - 实体词典(确定性,无 LLM)
|
||||
//
|
||||
// 职责:
|
||||
// 1. 从已有结构化存储构建可信实体词典
|
||||
// 2. 从文本中提取命中的实体
|
||||
//
|
||||
// 硬约束:name1 永不进入词典
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { getStateAtoms } from '../storage/state-store.js';
|
||||
|
||||
// Person-name lexicon blacklist: pronouns, generic role labels, and obvious
// non-person terms (body parts, objects, places) that must never be treated
// as character names when building the entity lexicon.
const PERSON_LEXICON_BLACKLIST = new Set([
  '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '它们',
  '自己', '对方', '用户', '助手', 'user', 'assistant',
  '男人', '女性', '成熟女性', '主人', '主角',
  '龟头', '子宫', '阴道', '阴茎',
  '电脑', '电脑屏幕', '手机', '监控画面', '摄像头', '阳光', '折叠床', '书房', '卫生间隔间',
]);
|
||||
|
||||
/**
 * Normalize a string for entity matching: NFKC-fold, strip zero-width
 * characters, trim, lowercase. Mirrors normalize() in the diffusion module.
 * @param {string} s
 * @returns {string}
 */
function normalize(s) {
  const text = String(s || '');
  const folded = text.normalize('NFKC');
  return folded
    .replace(/[\u200B-\u200D\uFEFF]/g, '')
    .trim()
    .toLowerCase();
}
|
||||
|
||||
/**
 * True when the normalized form of `raw` appears in the person-name blacklist.
 * @param {string} raw
 * @returns {boolean}
 */
function isBlacklistedPersonTerm(raw) {
  const normalized = normalize(raw);
  return PERSON_LEXICON_BLACKLIST.has(normalized);
}
|
||||
|
||||
/**
 * Add a person name to `set` after normalization, rejecting terms that are
 * shorter than 2 characters or blacklisted.
 * @param {Set<string>} set - mutated in place
 * @param {string} raw - candidate name
 */
function addPersonTerm(set, raw) {
  const term = normalize(raw);
  if (term.length < 2) return;
  if (isBlacklistedPersonTerm(term)) return;
  set.add(term);
}
|
||||
|
||||
/**
 * Collect trusted character names from structured storage:
 * main characters, arc names, context.name2, and L2 event participants.
 * Hard rule: name1 (the player persona) is always removed.
 * @param {object} store - summary store with json.characters/arcs/events
 * @param {object} context - chat context with name1/name2
 * @returns {Set<string>} normalized trusted names
 */
function collectTrustedCharacters(store, context) {
  const trusted = new Set();
  const json = store?.json;

  // 1. Confirmed main characters (string or { name } entries).
  for (const entry of (json?.characters?.main || [])) {
    addPersonTerm(trusted, typeof entry === 'string' ? entry : entry.name);
  }

  // 2. Arc subjects.
  for (const arc of (json?.arcs || [])) {
    addPersonTerm(trusted, arc.name);
  }

  // 3. Current character.
  if (context?.name2) {
    addPersonTerm(trusted, context.name2);
  }

  // 4. L2 event participants.
  for (const event of (json?.events || [])) {
    for (const participant of (event?.participants || [])) {
      addPersonTerm(trusted, participant);
    }
  }

  // Hard rule: the player persona never enters the lexicon.
  if (context?.name1) {
    trusted.delete(normalize(context.name1));
  }

  return trusted;
}
|
||||
|
||||
/**
 * Build trusted character pool only (without scanning L0 candidate atoms).
 * trustedCharacters: main/arcs/name2/L2 participants, excludes name1.
 *
 * @param {object} store - return value of getSummaryStore()
 * @param {object} context - { name1: string, name2: string }
 * @returns {Set<string>} normalized trusted character names
 */
export function buildTrustedCharacters(store, context) {
  return collectTrustedCharacters(store, context);
}
|
||||
|
||||
/**
 * Collect candidate character names from L0 atom edges (s/t
 * endpoints). Lower-trust than the structured sources, cleaned
 * through the same blacklist; name1 is always excluded.
 * @param {object} context - { name1 }
 * @returns {Set<string>} normalized candidate names
 */
function collectCandidateCharactersFromL0(context) {
  const candidate = new Set();

  for (const atom of getStateAtoms()) {
    for (const edge of (atom.edges || [])) {
      addPersonTerm(candidate, edge?.s);
      addPersonTerm(candidate, edge?.t);
    }
  }

  if (context?.name1) {
    candidate.delete(normalize(context.name1));
  }

  return candidate;
}
|
||||
|
||||
/**
 * Build character pools with trust tiers.
 * trustedCharacters: main/arcs/name2/L2 participants (clean source)
 * candidateCharacters: L0 edges.s/t (blacklist-cleaned)
 * allCharacters: union of both.
 * @param {object} store
 * @param {object} context - { name1, name2 }
 * @returns {{trustedCharacters: Set<string>, candidateCharacters: Set<string>, allCharacters: Set<string>}}
 */
export function buildCharacterPools(store, context) {
  const trustedCharacters = collectTrustedCharacters(store, context);
  const candidateCharacters = collectCandidateCharactersFromL0(context);

  const allCharacters = new Set(trustedCharacters);
  for (const name of candidateCharacters) {
    allCharacters.add(name);
  }

  return { trustedCharacters, candidateCharacters, allCharacters };
}
|
||||
|
||||
/**
 * Build the entity lexicon.
 *
 * Sources (by trust level):
 * 1. store.json.characters.main — confirmed main characters
 * 2. store.json.arcs[].name — arc subjects
 * 3. context.name2 — current character
 * 4. store.json.events[].participants — L2 event participants
 * 5. L0 atoms edges.s / edges.t
 *
 * Hard constraint: normalize(context.name1) is always excluded.
 *
 * @param {object} store - return value of getSummaryStore()
 * @param {object} context - { name1: string, name2: string }
 * @returns {Set<string>} normalized entity names
 */
export function buildEntityLexicon(store, context) {
  return buildCharacterPools(store, context).allCharacters;
}
|
||||
|
||||
/**
 * Build a "normalized key → original surface form" map, used to
 * recover display names from lexicon keys. Sources are walked in
 * trust order and the first registration of a key wins, so the
 * most trusted surface form is kept. name1 is always removed.
 * @param {object} store
 * @param {object} context - { name1, name2 }
 * @returns {Map<string, string>} normalize(name) → original form
 */
export function buildDisplayNameMap(store, context) {
  const map = new Map();

  // First registration wins: earlier (more trusted) sources set the form.
  const register = (raw) => {
    const key = normalize(raw);
    if (key.length < 2) return;
    if (isBlacklistedPersonTerm(key)) return;
    if (!map.has(key)) {
      map.set(key, String(raw).trim());
    }
  };

  const json = store?.json;

  // 1. Confirmed main characters
  for (const entry of (json?.characters?.main || [])) {
    register(typeof entry === 'string' ? entry : entry.name);
  }

  // 2. Arc subjects
  for (const arc of (json?.arcs || [])) {
    register(arc.name);
  }

  // 3. Current character
  if (context?.name2) register(context.name2);

  // 4. L2 event participants
  for (const ev of (json?.events || [])) {
    for (const participant of (ev?.participants || [])) {
      register(participant);
    }
  }

  // 5. L0 atom edge endpoints
  for (const atom of getStateAtoms()) {
    for (const edge of (atom.edges || [])) {
      register(edge?.s);
      register(edge?.t);
    }
  }

  // ★ Hard constraint: never expose the player persona (name1).
  if (context?.name1) {
    map.delete(normalize(context.name1));
  }

  return map;
}
|
||||
|
||||
/**
 * Extract lexicon entities that occur in a text.
 *
 * Scans the lexicon and keeps entries whose normalized form is a
 * substring of the normalized text (case-insensitive). Returns the
 * original display forms, deduplicated, preferring displayMap.
 *
 * @param {string} text - cleaned text to scan
 * @param {Set<string>} lexicon - normalized entity set
 * @param {Map<string, string>} displayMap - normalize → original form
 * @returns {string[]} matched entities (display forms)
 */
export function extractEntitiesFromText(text, lexicon, displayMap) {
  if (!text || !lexicon?.size) return [];

  const haystack = normalize(text);
  const seen = new Set();
  const hits = [];

  for (const entity of lexicon) {
    if (seen.has(entity)) continue;
    if (!haystack.includes(entity)) continue;
    seen.add(entity);
    // Prefer the original surface form when known.
    const display = displayMap?.get(entity) || entity;
    hits.push(display);
  }

  return hits;
}
|
||||
541
modules/story-summary/vector/retrieval/lexical-index.js
Normal file
541
modules/story-summary/vector/retrieval/lexical-index.js
Normal file
@@ -0,0 +1,541 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// lexical-index.js - MiniSearch 词法检索索引
|
||||
//
|
||||
// 职责:
|
||||
// 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
|
||||
// 2. 提供词法检索接口(专名精确匹配兜底)
|
||||
// 3. 惰性构建 + 异步预热 + 缓存失效机制
|
||||
//
|
||||
// 索引存储:纯内存(不持久化)
|
||||
// 分词器:统一使用 tokenizer.js(结巴 + 实体保护 + 降级)
|
||||
// 重建时机:CHAT_CHANGED / L0提取完成 / L2总结完成
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import MiniSearch from '../../../../libs/minisearch.mjs';
|
||||
import { getContext } from '../../../../../../../extensions.js';
|
||||
import { getSummaryStore } from '../../data/store.js';
|
||||
import { getAllChunks } from '../storage/chunk-store.js';
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import { tokenizeForIndex } from '../utils/tokenizer.js';
|
||||
|
||||
const MODULE_ID = 'lexical-index';
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 缓存
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** @type {MiniSearch|null} */
|
||||
let cachedIndex = null;
|
||||
|
||||
/** @type {string|null} */
|
||||
let cachedChatId = null;
|
||||
|
||||
/** @type {string|null} 数据指纹(atoms + chunks + events 数量) */
|
||||
let cachedFingerprint = null;
|
||||
|
||||
/** @type {boolean} 是否正在构建 */
|
||||
let building = false;
|
||||
|
||||
/** @type {Promise<MiniSearch|null>|null} 当前构建 Promise(防重入) */
|
||||
let buildPromise = null;
|
||||
/** @type {Map<number, string[]>} floor → 该楼层的 doc IDs(仅 L1 chunks) */
|
||||
let floorDocIds = new Map();
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 工具函数
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Clean an event summary: strip a trailing floor marker such as
 * " (#12)" or " (#3-7)" and trim whitespace.
 * @param {string} summary
 * @returns {string}
 */
function cleanSummary(summary) {
  const raw = String(summary || '');
  return raw.replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '').trim();
}
|
||||
|
||||
/**
 * Compute the cache fingerprint from document counts.
 * NOTE(review): counts only — edits that keep counts equal will not
 * invalidate the cache; presumably acceptable for this use. Verify.
 * @param {number} chunkCount
 * @param {number} eventCount
 * @returns {string}
 */
function computeFingerprint(chunkCount, eventCount) {
  return [chunkCount, eventCount].join(':');
}
|
||||
|
||||
/**
 * Yield to the main thread via a zero-delay macrotask, so long
 * synchronous work cannot freeze the UI.
 * @returns {Promise<void>} resolves on the next timer tick
 */
function yieldToMain() {
  return new Promise((resolve) => {
    setTimeout(resolve, 0);
  });
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 文档收集
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Collect all documents to index.
 *
 * L1 chunks become { id, type: 'chunk', floor, text } docs and their
 * ids are recorded per floor in the module-level floorDocIds side
 * index. L2 events become { id, type: 'event', floor: null, text }
 * docs whose text is title + participants + cleaned summary.
 *
 * @param {object[]} chunks - result of getAllChunks(chatId)
 * @param {object[]} events - store.json.events
 * @returns {object[]} documents ready for MiniSearch
 */
function collectDocuments(chunks, events) {
  const docs = [];

  // L1 chunks (also populates floorDocIds)
  for (const chunk of (chunks || [])) {
    if (!chunk?.chunkId || !chunk.text) continue;

    const floor = chunk.floor ?? -1;
    docs.push({
      id: chunk.chunkId,
      type: 'chunk',
      floor,
      text: chunk.text,
    });

    if (floor >= 0) {
      const ids = floorDocIds.get(floor) || [];
      ids.push(chunk.chunkId);
      floorDocIds.set(floor, ids);
    }
  }

  // L2 events
  for (const ev of (events || [])) {
    if (!ev?.id) continue;

    const pieces = [];
    if (ev.title) pieces.push(ev.title);
    if (ev.participants?.length) pieces.push(ev.participants.join(' '));
    const summary = cleanSummary(ev.summary);
    if (summary) pieces.push(summary);

    const text = pieces.join(' ').trim();
    if (!text) continue;

    docs.push({
      id: ev.id,
      type: 'event',
      floor: null,
      text,
    });
  }

  return docs;
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 索引构建(分片,不阻塞主线程)
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** 每批添加的文档数 */
|
||||
const BUILD_BATCH_SIZE = 500;
|
||||
|
||||
/**
 * Build a MiniSearch index from documents, adding them in batches
 * of BUILD_BATCH_SIZE and yielding to the main thread between
 * batches so a large rebuild stays responsive.
 *
 * @param {object[]} docs - documents from collectDocuments()
 * @returns {Promise<MiniSearch>}
 */
async function buildIndexAsync(docs) {
  const startedAt = performance.now();

  const index = new MiniSearch({
    fields: ['text'],
    storeFields: ['type', 'floor'],
    idField: 'id',
    searchOptions: {
      boost: { text: 1 },
      fuzzy: 0.2,
      prefix: true,
    },
    tokenize: tokenizeForIndex,
  });

  if (docs.length === 0) {
    return index;
  }

  for (let offset = 0; offset < docs.length; offset += BUILD_BATCH_SIZE) {
    index.addAll(docs.slice(offset, offset + BUILD_BATCH_SIZE));

    // Yield between batches (but not after the last one).
    const hasMore = offset + BUILD_BATCH_SIZE < docs.length;
    if (hasMore) {
      await yieldToMain();
    }
  }

  const elapsed = Math.round(performance.now() - startedAt);
  xbLog.info(MODULE_ID,
    `索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
  );

  return index;
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 检索
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * @typedef {object} LexicalSearchResult
 * @property {string[]} atomIds - matched L0 atom IDs (atoms are not indexed
 *   here, so this stays empty; kept for interface compatibility)
 * @property {Set<number>} atomFloors - matched L0 floors (likewise empty)
 * @property {string[]} chunkIds - matched L1 chunk IDs
 * @property {Set<number>} chunkFloors - matched L1 floors
 * @property {string[]} eventIds - matched L2 event IDs
 * @property {object[]} chunkScores - chunk hit details [{ chunkId, score }]
 * @property {number} searchTime - search duration in ms
 */

/**
 * Run a lexical search against the index.
 *
 * All terms are joined into one OR query; hits are deduplicated and
 * bucketed by stored doc type. A search failure is logged and yields
 * an empty result rather than throwing.
 *
 * @param {MiniSearch} index - index instance
 * @param {string[]} terms - query terms
 * @returns {LexicalSearchResult}
 */
export function searchLexicalIndex(index, terms) {
  const startedAt = performance.now();

  const result = {
    atomIds: [],
    atomFloors: new Set(),
    chunkIds: [],
    chunkFloors: new Set(),
    eventIds: [],
    chunkScores: [],
    searchTime: 0,
  };

  const finish = () => {
    result.searchTime = Math.round(performance.now() - startedAt);
    return result;
  };

  if (!index || !terms?.length) {
    return finish();
  }

  let hits;
  try {
    hits = index.search(terms.join(' '), {
      boost: { text: 1 },
      fuzzy: 0.2,
      prefix: true,
      combineWith: 'OR',
      // Same tokenizer as indexing, so query and docs split identically.
      tokenize: tokenizeForIndex,
    });
  } catch (e) {
    xbLog.warn(MODULE_ID, '检索失败', e);
    return finish();
  }

  const seenChunks = new Set();
  const seenEvents = new Set();

  for (const hit of hits) {
    if (hit.type === 'chunk') {
      if (seenChunks.has(hit.id)) continue;
      seenChunks.add(hit.id);
      result.chunkIds.push(hit.id);
      result.chunkScores.push({ chunkId: hit.id, score: hit.score });
      if (typeof hit.floor === 'number' && hit.floor >= 0) {
        result.chunkFloors.add(hit.floor);
      }
    } else if (hit.type === 'event') {
      if (seenEvents.has(hit.id)) continue;
      seenEvents.add(hit.id);
      result.eventIds.push(hit.id);
    }
  }

  finish();

  xbLog.info(MODULE_ID,
    `检索完成: terms=[${terms.slice(0, 5).join(',')}] → atoms=${result.atomIds.length} chunks=${result.chunkIds.length} events=${result.eventIds.length} (${result.searchTime}ms)`
  );

  return result;
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 内部构建流程(收集数据 + 构建索引)
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Collect data and build the index.
 *
 * Resets the floor side-index, reads events from the store and chunks
 * from chunk storage, then builds a fresh MiniSearch index in batches.
 * If another caller refreshed the cache while we awaited the chunk
 * read, the up-to-date cached index is reused instead of rebuilding.
 *
 * @param {string} chatId
 * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
 */
async function collectAndBuild(chatId) {
  // Reset the side index (full rebuild).
  floorDocIds = new Map();

  // Gather data (L0 atoms are not indexed).
  const store = getSummaryStore();
  const events = store?.json?.events || [];

  let chunks = [];
  try {
    chunks = await getAllChunks(chatId);
  } catch (e) {
    xbLog.warn(MODULE_ID, '获取 chunks 失败', e);
  }

  const fp = computeFingerprint(chunks.length, events.length);

  // The cache may have been updated by another caller while we awaited.
  if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
    return { index: cachedIndex, fingerprint: fp };
  }

  // Collect documents (also populates floorDocIds).
  const docs = collectDocuments(chunks, events);

  // Asynchronous batched build.
  const index = await buildIndexAsync(docs);

  return { index, fingerprint: fp };
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 公开接口:getLexicalIndex(惰性获取)
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Get the lexical index (lazy build + cache).
 *
 * Returns the cached index directly when valid; otherwise builds it.
 * If a build is already in flight, awaits that build instead of
 * starting another.
 *
 * @returns {Promise<MiniSearch|null>} null when there is no chat or the build fails
 */
export async function getLexicalIndex() {
  const { chatId } = getContext();
  if (!chatId) return null;

  // Fast path: cache hit while the chatId is unchanged.
  // Fingerprint validation is deferred to the build flow, avoiding an
  // extra IndexedDB read just to compute a fingerprint here.
  if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
    return cachedIndex;
  }

  // A build is in flight — wait for its result.
  if (building && buildPromise) {
    try {
      await buildPromise;
      if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
        return cachedIndex;
      }
    } catch {
      // The in-flight build failed; fall through and rebuild.
    }
  }

  // Rebuild needed (fingerprint is computed and cached inside collectAndBuild).
  xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)})`);

  building = true;
  buildPromise = collectAndBuild(chatId);

  try {
    const { index, fingerprint } = await buildPromise;

    // Replace the cache atomically (all three fields together).
    cachedIndex = index;
    cachedChatId = chatId;
    cachedFingerprint = fingerprint;

    return index;
  } catch (e) {
    xbLog.error(MODULE_ID, '索引构建失败', e);
    return null;
  } finally {
    building = false;
    buildPromise = null;
  }
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 公开接口:warmupIndex(异步预建)
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Pre-build the index in the background.
 *
 * Called on CHAT_CHANGED (after entity injection), after L0 extraction,
 * and after L2 summarization. Does not block the caller and returns
 * nothing; once the build finishes the cache is updated so subsequent
 * getLexicalIndex() calls hit it directly.
 */
export function warmupIndex() {
  const { chatId } = getContext();
  if (!chatId) return;

  // A build is already in flight — don't start another.
  if (building) return;

  // Fire-and-forget; failures are only logged.
  getLexicalIndex().catch((e) => {
    xbLog.warn(MODULE_ID, '预热索引失败', e);
  });
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 公开接口:invalidateLexicalIndex(缓存失效)
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Invalidate the cache (the next getLexicalIndex / warmupIndex
 * call rebuilds automatically).
 *
 * Call on: CHAT_CHANGED, L0 extraction complete, L2 summary complete.
 */
export function invalidateLexicalIndex() {
  const hadIndex = Boolean(cachedIndex);
  if (hadIndex) {
    xbLog.info(MODULE_ID, '索引缓存已失效');
  }
  cachedIndex = null;
  cachedChatId = null;
  cachedFingerprint = null;
  floorDocIds = new Map();
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 增量更新接口
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Add L1 chunks for a floor to the index incrementally.
 *
 * Removes that floor's old documents first, then adds the new ones.
 * When no index is cached (cache invalidated), this is a silent no-op:
 * the next getLexicalIndex() does a full rebuild anyway.
 *
 * @param {number} floor - floor number
 * @param {object[]} chunks - chunk objects (need chunkId, text, floor)
 */
export function addDocumentsForFloor(floor, chunks) {
  if (!cachedIndex || !chunks?.length) return;

  // Replace semantics: drop whatever this floor had before.
  removeDocumentsByFloor(floor);

  const docs = [];
  const docIds = [];

  for (const chunk of chunks) {
    if (!chunk?.chunkId || !chunk.text) continue;
    const doc = {
      id: chunk.chunkId,
      type: 'chunk',
      floor: chunk.floor ?? floor,
      text: chunk.text,
    };
    docs.push(doc);
    docIds.push(doc.id);
  }

  if (docs.length === 0) return;

  cachedIndex.addAll(docs);
  floorDocIds.set(floor, docIds);
  xbLog.info(MODULE_ID, `增量添加: floor ${floor}, ${docs.length} 个 chunk`);
}
|
||||
|
||||
/**
 * Remove all L1 chunk documents of a floor from the index, using
 * MiniSearch discard() (soft delete). Silent no-op when no index
 * is cached or the floor has no recorded documents.
 *
 * @param {number} floor - floor number
 */
export function removeDocumentsByFloor(floor) {
  if (!cachedIndex) return;

  const ids = floorDocIds.get(floor) ?? [];
  if (ids.length === 0) return;

  for (const docId of ids) {
    try {
      cachedIndex.discard(docId);
    } catch {
      // Already absent (e.g. superseded by a full rebuild) — ignore.
    }
  }

  floorDocIds.delete(floor);
  xbLog.info(MODULE_ID, `增量移除: floor ${floor}, ${ids.length} 个文档`);
}
|
||||
|
||||
/**
 * Add new L2 events to the index.
 *
 * An event whose id is already indexed is discarded first, then
 * re-added (overwrite). Silent no-op when no index is cached.
 *
 * @param {object[]} events - event objects (need id, title, summary, ...)
 */
export function addEventDocuments(events) {
  if (!cachedIndex || !events?.length) return;

  const docs = [];

  for (const ev of events) {
    if (!ev?.id) continue;

    // Same text composition as the full build: title + participants + summary.
    const pieces = [];
    if (ev.title) pieces.push(ev.title);
    if (ev.participants?.length) pieces.push(ev.participants.join(' '));
    const summary = cleanSummary(ev.summary);
    if (summary) pieces.push(summary);
    const text = pieces.join(' ').trim();
    if (!text) continue;

    // Overwrite: discard a stale copy of this id if one exists.
    try {
      cachedIndex.discard(ev.id);
    } catch {
      // Not indexed yet — nothing to remove.
    }

    docs.push({
      id: ev.id,
      type: 'event',
      floor: null,
      text,
    });
  }

  if (docs.length > 0) {
    cachedIndex.addAll(docs);
    xbLog.info(MODULE_ID, `增量添加: ${docs.length} 个事件`);
  }
}
|
||||
685
modules/story-summary/vector/retrieval/metrics.js
Normal file
685
modules/story-summary/vector/retrieval/metrics.js
Normal file
@@ -0,0 +1,685 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Story Summary - Metrics Collector (v6 - Dense-Gated Lexical)
|
||||
//
|
||||
// v5 → v6 变更:
|
||||
// - lexical: 新增 eventFilteredByDense / floorFilteredByDense
|
||||
// - event: entityFilter bypass 阈值改为 CONFIG 驱动(0.80)
|
||||
// - 其余结构不变
|
||||
//
|
||||
// v4 → v5 变更:
|
||||
// - query: 新增 segmentWeights / r2Weights(加权向量诊断)
|
||||
// - fusion: 新增 denseAggMethod / lexDensityBonus(聚合策略可观测)
|
||||
// - quality: 新增 rerankRetentionRate(粗排-精排一致性)
|
||||
// - 移除 timing 中从未写入的死字段(queryBuild/queryRefine/lexicalSearch/fusion)
|
||||
// - 移除从未写入的 arc 区块
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Create an empty metrics object.
 *
 * Each section is built as a named constant and the sections are
 * assembled at the end; field names and initial values are the
 * public contract consumed by formatMetricsLog and the collectors.
 *
 * @returns {object} zeroed metrics skeleton
 */
export function createMetrics() {
  // Query build diagnostics
  const query = {
    buildTime: 0,
    refineTime: 0,
    lengths: {
      v0Chars: 0,
      v1Chars: null, // null = no hints
      rerankChars: 0,
    },
    segmentWeights: [], // normalized R1 weights [context..., focus]
    r2Weights: null,    // normalized R2 weights [context..., focus, hints] (null = no hints)
  };

  // Anchor (L0 StateAtoms) — semantic anchors
  const anchor = {
    needRecall: false,
    focusTerms: [],
    focusCharacters: [],
    focusEntities: [],
    matched: 0,
    floorsHit: 0,
    topHits: [],
  };

  // Lexical (MiniSearch) retrieval
  const lexical = {
    terms: [],
    atomHits: 0,
    chunkHits: 0,
    eventHits: 0,
    searchTime: 0,
    indexReadyTime: 0,
    eventFilteredByDense: 0,
    floorFilteredByDense: 0,
  };

  // Fusion (W-RRF, floor-level)
  const fusion = {
    denseFloors: 0,
    lexFloors: 0,
    totalUnique: 0,
    afterCap: 0,
    time: 0,
    denseAggMethod: '', // aggregation description (e.g. "max×0.6+mean×0.4")
    lexDensityBonus: 0, // density bonus coefficient
  };

  // Constraint (L3 Facts) — world constraints
  const constraint = {
    total: 0,
    filtered: 0,
    injected: 0,
    tokens: 0,
    samples: [],
  };

  // Event (L2 Events) — event summaries
  const event = {
    inStore: 0,
    considered: 0,
    selected: 0,
    byRecallType: { direct: 0, related: 0, causal: 0, lexical: 0, l0Linked: 0 },
    similarityDistribution: { min: 0, max: 0, mean: 0, median: 0 },
    entityFilter: null,
    causalChainDepth: 0,
    causalCount: 0,
    entitiesUsed: 0,
    focusTermsCount: 0,
    entityNames: [],
  };

  // Evidence (two-stage: floor rerank → L1 pull)
  const evidence = {
    // Stage 1: floor rerank
    floorCandidates: 0,
    floorsSelected: 0,
    l0Collected: 0,
    rerankApplied: false,
    rerankFailed: false,
    beforeRerank: 0,
    afterRerank: 0,
    rerankTime: 0,
    rerankScores: null,
    rerankDocAvgLength: 0,

    // Stage 2: L1 pull
    l1Pulled: 0,
    l1Attached: 0,
    l1CosineTime: 0,

    // Assembly
    contextPairsAdded: 0,
    tokens: 0,
    assemblyTime: 0,
  };

  // Diffusion (PPR spreading activation)
  const diffusion = {
    seedCount: 0,
    graphNodes: 0,
    graphEdges: 0,
    candidatePairs: 0,
    pairsFromWhat: 0,
    pairsFromRSem: 0,
    rSemAvgSim: 0,
    timeWindowFilteredPairs: 0,
    topKPrunedPairs: 0,
    edgeDensity: 0,
    reweightWhoUsed: 0,
    reweightWhereUsed: 0,
    iterations: 0,
    convergenceError: 0,
    pprActivated: 0,
    cosineGatePassed: 0,
    cosineGateFiltered: 0,
    cosineGateNoVector: 0,
    postGatePassRate: 0,
    finalCount: 0,
    scoreDistribution: { min: 0, max: 0, mean: 0 },
    byChannel: { what: 0, where: 0, rSem: 0, who: 0 },
    time: 0,
  };

  // Formatting
  const formatting = {
    sectionsIncluded: [],
    time: 0,
  };

  // Budget summary
  const budget = {
    total: 0,
    limit: 0,
    utilization: 0,
    breakdown: {
      constraints: 0,
      events: 0,
      distantEvidence: 0,
      recentEvidence: 0,
      arcs: 0,
    },
  };

  // Timing (only fields that are actually written)
  const timing = {
    anchorSearch: 0,
    constraintFilter: 0,
    eventRetrieval: 0,
    evidenceRetrieval: 0,
    evidenceRerank: 0,
    evidenceAssembly: 0,
    diffusion: 0,
    formatting: 0,
    total: 0,
  };

  // Quality indicators
  const quality = {
    constraintCoverage: 100,
    eventPrecisionProxy: 0,
    l1AttachRate: 0,
    rerankRetentionRate: 0,
    diffusionEffectiveRate: 0,
    potentialIssues: [],
  };

  return {
    query,
    anchor,
    lexical,
    fusion,
    constraint,
    event,
    evidence,
    diffusion,
    formatting,
    budget,
    timing,
    quality,
  };
}
|
||||
|
||||
/**
 * Compute distribution statistics over similarity scores.
 *
 * Fix: for even-length input the previous implementation returned the
 * upper-middle element (`sorted[floor(n/2)]`) as the median; the true
 * median averages the two middle values.
 *
 * @param {number[]} similarities - similarity scores (may be empty/undefined)
 * @returns {{min: number, max: number, mean: number, median: number}}
 *   each value rounded to 3 decimal places; all zeros for empty input
 */
export function calcSimilarityStats(similarities) {
  if (!similarities?.length) {
    return { min: 0, max: 0, mean: 0, median: 0 };
  }

  const sorted = [...similarities].sort((a, b) => a - b);
  const sum = sorted.reduce((a, b) => a + b, 0);

  // True median: middle element for odd n, mean of the two middles for even n.
  const mid = Math.floor(sorted.length / 2);
  const median = sorted.length % 2 === 1
    ? sorted[mid]
    : (sorted[mid - 1] + sorted[mid]) / 2;

  return {
    min: Number(sorted[0].toFixed(3)),
    max: Number(sorted[sorted.length - 1].toFixed(3)),
    mean: Number((sum / sorted.length).toFixed(3)),
    median: Number(median.toFixed(3)),
  };
}
|
||||
|
||||
/**
 * Format a weight array as a compact bracketed string, with numbers
 * fixed to 3 decimals. Returns 'N/A' for null/empty input.
 * @param {number[]|null} weights
 * @returns {string}
 */
function fmtWeights(weights) {
  if (!weights?.length) return 'N/A';
  const parts = weights.map((w) =>
    typeof w === 'number' ? w.toFixed(3) : String(w)
  );
  return `[${parts.join(', ')}]`;
}
|
||||
|
||||
/**
|
||||
* 格式化指标为可读日志
|
||||
* @param {object} metrics
|
||||
* @returns {string}
|
||||
*/
|
||||
export function formatMetricsLog(metrics) {
|
||||
const m = metrics;
|
||||
const lines = [];
|
||||
|
||||
lines.push('');
|
||||
lines.push('════════════════════════════════════════');
|
||||
lines.push(' Recall Metrics Report (v5) ');
|
||||
lines.push('════════════════════════════════════════');
|
||||
lines.push('');
|
||||
|
||||
// Query Length
|
||||
lines.push('[Query Length] 查询长度');
|
||||
lines.push(`├─ query_v0_chars: ${m.query?.lengths?.v0Chars ?? 0}`);
|
||||
lines.push(`├─ query_v1_chars: ${m.query?.lengths?.v1Chars == null ? 'N/A' : m.query.lengths.v1Chars}`);
|
||||
lines.push(`└─ rerank_query_chars: ${m.query?.lengths?.rerankChars ?? 0}`);
|
||||
lines.push('');
|
||||
|
||||
// Query Build
|
||||
lines.push('[Query] 查询构建');
|
||||
lines.push(`├─ build_time: ${m.query.buildTime}ms`);
|
||||
lines.push(`├─ refine_time: ${m.query.refineTime}ms`);
|
||||
lines.push(`├─ r1_weights: ${fmtWeights(m.query.segmentWeights)}`);
|
||||
if (m.query.r2Weights) {
|
||||
lines.push(`└─ r2_weights: ${fmtWeights(m.query.r2Weights)}`);
|
||||
} else {
|
||||
lines.push(`└─ r2_weights: N/A (no hints)`);
|
||||
}
|
||||
lines.push('');
|
||||
|
||||
// Anchor (L0 StateAtoms)
|
||||
lines.push('[Anchor] L0 StateAtoms - 语义锚点');
|
||||
lines.push(`├─ need_recall: ${m.anchor.needRecall}`);
|
||||
if (m.anchor.needRecall) {
|
||||
lines.push(`├─ focus_terms: [${(m.anchor.focusTerms || m.anchor.focusEntities || []).join(', ')}]`);
|
||||
lines.push(`├─ focus_characters: [${(m.anchor.focusCharacters || []).join(', ')}]`);
|
||||
lines.push(`├─ matched: ${m.anchor.matched || 0}`);
|
||||
lines.push(`└─ floors_hit: ${m.anchor.floorsHit || 0}`);
|
||||
}
|
||||
lines.push('');
|
||||
|
||||
// Lexical (MiniSearch)
|
||||
lines.push('[Lexical] MiniSearch - 词法检索');
|
||||
lines.push(`├─ terms: [${(m.lexical.terms || []).slice(0, 8).join(', ')}]`);
|
||||
lines.push(`├─ atom_hits: ${m.lexical.atomHits}`);
|
||||
lines.push(`├─ chunk_hits: ${m.lexical.chunkHits}`);
|
||||
lines.push(`├─ event_hits: ${m.lexical.eventHits}`);
|
||||
lines.push(`├─ search_time: ${m.lexical.searchTime}ms`);
|
||||
if (m.lexical.indexReadyTime > 0) {
|
||||
lines.push(`├─ index_ready_time: ${m.lexical.indexReadyTime}ms`);
|
||||
}
|
||||
if (m.lexical.eventFilteredByDense > 0) {
|
||||
lines.push(`├─ event_filtered_by_dense: ${m.lexical.eventFilteredByDense}`);
|
||||
}
|
||||
if (m.lexical.floorFilteredByDense > 0) {
|
||||
lines.push(`├─ floor_filtered_by_dense: ${m.lexical.floorFilteredByDense}`);
|
||||
}
|
||||
lines.push(`└─ dense_gate_threshold: 0.50`);
|
||||
lines.push('');
|
||||
|
||||
// Fusion (W-RRF, floor-level)
|
||||
lines.push('[Fusion] W-RRF (floor-level) - 多路融合');
|
||||
lines.push(`├─ dense_floors: ${m.fusion.denseFloors}`);
|
||||
lines.push(`├─ lex_floors: ${m.fusion.lexFloors}`);
|
||||
if (m.fusion.lexDensityBonus > 0) {
|
||||
lines.push(`│ └─ density_bonus: ${m.fusion.lexDensityBonus}`);
|
||||
}
|
||||
lines.push(`├─ total_unique: ${m.fusion.totalUnique}`);
|
||||
lines.push(`├─ after_cap: ${m.fusion.afterCap}`);
|
||||
lines.push(`└─ time: ${m.fusion.time}ms`);
|
||||
lines.push('');
|
||||
|
||||
// Constraint (L3 Facts)
|
||||
lines.push('[Constraint] L3 Facts - 世界约束');
|
||||
lines.push(`├─ total: ${m.constraint.total}`);
|
||||
lines.push(`├─ filtered: ${m.constraint.filtered || 0}`);
|
||||
lines.push(`├─ injected: ${m.constraint.injected}`);
|
||||
lines.push(`├─ tokens: ${m.constraint.tokens}`);
|
||||
if (m.constraint.samples && m.constraint.samples.length > 0) {
|
||||
lines.push(`└─ samples: "${m.constraint.samples.slice(0, 2).join('", "')}"`);
|
||||
}
|
||||
lines.push('');
|
||||
|
||||
// Event (L2 Events)
|
||||
lines.push('[Event] L2 Events - 事件摘要');
|
||||
lines.push(`├─ in_store: ${m.event.inStore}`);
|
||||
lines.push(`├─ considered: ${m.event.considered}`);
|
||||
|
||||
if (m.event.entityFilter) {
|
||||
const ef = m.event.entityFilter;
|
||||
lines.push(`├─ entity_filter:`);
|
||||
lines.push(`│ ├─ focus_characters: [${(ef.focusCharacters || ef.focusEntities || []).join(', ')}]`);
|
||||
lines.push(`│ ├─ before: ${ef.before}`);
|
||||
lines.push(`│ ├─ after: ${ef.after}`);
|
||||
lines.push(`│ └─ filtered: ${ef.filtered}`);
|
||||
}
|
||||
|
||||
lines.push(`├─ selected: ${m.event.selected}`);
|
||||
lines.push(`├─ by_recall_type:`);
|
||||
lines.push(`│ ├─ direct: ${m.event.byRecallType.direct}`);
|
||||
lines.push(`│ ├─ related: ${m.event.byRecallType.related}`);
|
||||
lines.push(`│ ├─ causal: ${m.event.byRecallType.causal}`);
|
||||
if (m.event.byRecallType.l0Linked) {
|
||||
lines.push(`│ ├─ lexical: ${m.event.byRecallType.lexical}`);
|
||||
lines.push(`│ └─ l0_linked: ${m.event.byRecallType.l0Linked}`);
|
||||
} else {
|
||||
lines.push(`│ └─ lexical: ${m.event.byRecallType.lexical}`);
|
||||
}
|
||||
|
||||
const sim = m.event.similarityDistribution;
|
||||
if (sim && sim.max > 0) {
|
||||
lines.push(`├─ similarity_distribution:`);
|
||||
lines.push(`│ ├─ min: ${sim.min}`);
|
||||
lines.push(`│ ├─ max: ${sim.max}`);
|
||||
lines.push(`│ ├─ mean: ${sim.mean}`);
|
||||
lines.push(`│ └─ median: ${sim.median}`);
|
||||
}
|
||||
|
||||
lines.push(`├─ causal_chain: depth=${m.event.causalChainDepth}, count=${m.event.causalCount}`);
|
||||
lines.push(`└─ focus_characters_used: ${m.event.entitiesUsed} [${(m.event.entityNames || []).join(', ')}], focus_terms_count=${m.event.focusTermsCount || 0}`);
|
||||
lines.push('');
|
||||
|
||||
// Evidence (Two-Stage: Floor Rerank → L1 Pull)
|
||||
lines.push('[Evidence] Two-Stage: Floor Rerank → L1 Pull');
|
||||
lines.push(`├─ Stage 1 (Floor Rerank):`);
|
||||
lines.push(`│ ├─ floor_candidates (post-fusion): ${m.evidence.floorCandidates}`);
|
||||
|
||||
if (m.evidence.rerankApplied) {
|
||||
lines.push(`│ ├─ rerank_applied: true`);
|
||||
if (m.evidence.rerankFailed) {
|
||||
lines.push(`│ │ ⚠ rerank_failed: using fusion order`);
|
||||
}
|
||||
lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank} floors`);
|
||||
lines.push(`│ │ ├─ after: ${m.evidence.afterRerank} floors`);
|
||||
lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`);
|
||||
if (m.evidence.rerankScores) {
|
||||
const rs = m.evidence.rerankScores;
|
||||
lines.push(`│ ├─ rerank_scores: min=${rs.min}, max=${rs.max}, mean=${rs.mean}`);
|
||||
}
|
||||
if (m.evidence.rerankDocAvgLength > 0) {
|
||||
lines.push(`│ ├─ rerank_doc_avg_length: ${m.evidence.rerankDocAvgLength} chars`);
|
||||
}
|
||||
} else {
|
||||
lines.push(`│ ├─ rerank_applied: false`);
|
||||
}
|
||||
|
||||
lines.push(`│ ├─ floors_selected: ${m.evidence.floorsSelected}`);
|
||||
lines.push(`│ └─ l0_atoms_collected: ${m.evidence.l0Collected}`);
|
||||
lines.push(`├─ Stage 2 (L1):`);
|
||||
lines.push(`│ ├─ pulled: ${m.evidence.l1Pulled}`);
|
||||
lines.push(`│ ├─ attached: ${m.evidence.l1Attached}`);
|
||||
lines.push(`│ └─ cosine_time: ${m.evidence.l1CosineTime}ms`);
|
||||
lines.push(`├─ tokens: ${m.evidence.tokens}`);
|
||||
lines.push(`└─ assembly_time: ${m.evidence.assemblyTime}ms`);
|
||||
lines.push('');
|
||||
|
||||
// Diffusion (PPR)
|
||||
lines.push('[Diffusion] PPR Spreading Activation');
|
||||
lines.push(`├─ seeds: ${m.diffusion.seedCount}`);
|
||||
lines.push(`├─ graph: ${m.diffusion.graphNodes} nodes, ${m.diffusion.graphEdges} edges`);
|
||||
lines.push(`├─ candidate_pairs: ${m.diffusion.candidatePairs || 0} (what=${m.diffusion.pairsFromWhat || 0}, r_sem=${m.diffusion.pairsFromRSem || 0})`);
|
||||
lines.push(`├─ r_sem_avg_sim: ${m.diffusion.rSemAvgSim || 0}`);
|
||||
lines.push(`├─ pair_filters: time_window=${m.diffusion.timeWindowFilteredPairs || 0}, topk_pruned=${m.diffusion.topKPrunedPairs || 0}`);
|
||||
lines.push(`├─ edge_density: ${m.diffusion.edgeDensity || 0}%`);
|
||||
if (m.diffusion.graphEdges > 0) {
|
||||
const ch = m.diffusion.byChannel || {};
|
||||
lines.push(`│ ├─ by_channel: what=${ch.what || 0}, r_sem=${ch.rSem || 0}, who=${ch.who || 0}, where=${ch.where || 0}`);
|
||||
lines.push(`│ └─ reweight_used: who=${m.diffusion.reweightWhoUsed || 0}, where=${m.diffusion.reweightWhereUsed || 0}`);
|
||||
}
|
||||
if (m.diffusion.iterations > 0) {
|
||||
lines.push(`├─ ppr: ${m.diffusion.iterations} iterations, ε=${Number(m.diffusion.convergenceError).toExponential(1)}`);
|
||||
}
|
||||
lines.push(`├─ activated (excl seeds): ${m.diffusion.pprActivated}`);
|
||||
if (m.diffusion.pprActivated > 0) {
|
||||
lines.push(`├─ cosine_gate: ${m.diffusion.cosineGatePassed} passed, ${m.diffusion.cosineGateFiltered} filtered`);
|
||||
const passPrefix = m.diffusion.cosineGateNoVector > 0 ? '│ ├─' : '│ └─';
|
||||
lines.push(`${passPrefix} pass_rate: ${m.diffusion.postGatePassRate || 0}%`);
|
||||
if (m.diffusion.cosineGateNoVector > 0) {
|
||||
lines.push(`│ ├─ no_vector: ${m.diffusion.cosineGateNoVector}`);
|
||||
}
|
||||
}
|
||||
lines.push(`├─ final_injected: ${m.diffusion.finalCount}`);
|
||||
if (m.diffusion.finalCount > 0) {
|
||||
const ds = m.diffusion.scoreDistribution;
|
||||
lines.push(`├─ scores: min=${ds.min}, max=${ds.max}, mean=${ds.mean}`);
|
||||
}
|
||||
lines.push(`└─ time: ${m.diffusion.time}ms`);
|
||||
lines.push('');
|
||||
|
||||
// Formatting
|
||||
lines.push('[Formatting] 格式化');
|
||||
lines.push(`├─ sections: [${(m.formatting.sectionsIncluded || []).join(', ')}]`);
|
||||
lines.push(`└─ time: ${m.formatting.time}ms`);
|
||||
lines.push('');
|
||||
|
||||
// Budget Summary
|
||||
lines.push('[Budget] 预算');
|
||||
lines.push(`├─ total_tokens: ${m.budget.total}`);
|
||||
lines.push(`├─ limit: ${m.budget.limit}`);
|
||||
lines.push(`├─ utilization: ${m.budget.utilization}%`);
|
||||
lines.push(`└─ breakdown:`);
|
||||
const bd = m.budget.breakdown || {};
|
||||
lines.push(` ├─ constraints: ${bd.constraints || 0}`);
|
||||
lines.push(` ├─ events: ${bd.events || 0}`);
|
||||
lines.push(` ├─ distant_evidence: ${bd.distantEvidence || 0}`);
|
||||
lines.push(` ├─ recent_evidence: ${bd.recentEvidence || 0}`);
|
||||
lines.push(` └─ arcs: ${bd.arcs || 0}`);
|
||||
lines.push('');
|
||||
|
||||
// Timing
|
||||
lines.push('[Timing] 计时');
|
||||
lines.push(`├─ query_build: ${m.query.buildTime}ms`);
|
||||
lines.push(`├─ query_refine: ${m.query.refineTime}ms`);
|
||||
lines.push(`├─ anchor_search: ${m.timing.anchorSearch}ms`);
|
||||
const lexicalTotal = (m.lexical.searchTime || 0) + (m.lexical.indexReadyTime || 0);
|
||||
lines.push(`├─ lexical_search: ${lexicalTotal}ms (query=${m.lexical.searchTime || 0}ms, index_ready=${m.lexical.indexReadyTime || 0}ms)`);
|
||||
lines.push(`├─ fusion: ${m.fusion.time}ms`);
|
||||
lines.push(`├─ constraint_filter: ${m.timing.constraintFilter}ms`);
|
||||
lines.push(`├─ event_retrieval: ${m.timing.eventRetrieval}ms`);
|
||||
lines.push(`├─ evidence_retrieval: ${m.timing.evidenceRetrieval}ms`);
|
||||
lines.push(`├─ floor_rerank: ${m.timing.evidenceRerank || 0}ms`);
|
||||
lines.push(`├─ l1_cosine: ${m.evidence.l1CosineTime}ms`);
|
||||
lines.push(`├─ diffusion: ${m.timing.diffusion}ms`);
|
||||
lines.push(`├─ evidence_assembly: ${m.timing.evidenceAssembly}ms`);
|
||||
lines.push(`├─ formatting: ${m.timing.formatting}ms`);
|
||||
lines.push(`└─ total: ${m.timing.total}ms`);
|
||||
lines.push('');
|
||||
|
||||
// Quality Indicators
|
||||
lines.push('[Quality] 质量指标');
|
||||
lines.push(`├─ constraint_coverage: ${m.quality.constraintCoverage}%`);
|
||||
lines.push(`├─ event_precision_proxy: ${m.quality.eventPrecisionProxy}`);
|
||||
lines.push(`├─ l1_attach_rate: ${m.quality.l1AttachRate}%`);
|
||||
lines.push(`├─ rerank_retention_rate: ${m.quality.rerankRetentionRate}%`);
|
||||
lines.push(`├─ diffusion_effective_rate: ${m.quality.diffusionEffectiveRate}%`);
|
||||
|
||||
if (m.quality.potentialIssues && m.quality.potentialIssues.length > 0) {
|
||||
lines.push(`└─ potential_issues:`);
|
||||
m.quality.potentialIssues.forEach((issue, i) => {
|
||||
const prefix = i === m.quality.potentialIssues.length - 1 ? ' └─' : ' ├─';
|
||||
lines.push(`${prefix} ⚠ ${issue}`);
|
||||
});
|
||||
} else {
|
||||
lines.push(`└─ potential_issues: none`);
|
||||
}
|
||||
|
||||
lines.push('');
|
||||
lines.push('════════════════════════════════════════');
|
||||
lines.push('');
|
||||
|
||||
return lines.join('\n');
|
||||
}
|
||||
|
||||
/**
 * Scan a recall-metrics object and return a list of human-readable
 * potential-issue strings for the debug report.
 *
 * NOTE: this function also MUTATES `metrics.quality` in place — it fills in
 * `quality.rerankRetentionRate` and `quality.diffusionEffectiveRate` as a
 * side effect of the checks below.
 *
 * @param {object} metrics - aggregated recall metrics (anchor/query/lexical/
 *                           fusion/event/evidence/budget/timing/diffusion/quality)
 * @returns {string[]} issue descriptions (empty when nothing is flagged)
 */
export function detectIssues(metrics) {
    const issues = [];
    const m = metrics;

    // ─────────────────────────────────────────────────────────────────
    // Query-build issues
    // ─────────────────────────────────────────────────────────────────

    // focusEntities is the deprecated alias of focusTerms — accept either.
    if ((m.anchor.focusTerms || m.anchor.focusEntities || []).length === 0) {
        issues.push('No focus entities extracted - entity lexicon may be empty or messages too short');
    }

    // Extreme weight degradation: the focus segment is the LAST entry.
    const segWeights = m.query.segmentWeights || [];
    if (segWeights.length > 0) {
        const focusWeight = segWeights[segWeights.length - 1] || 0;
        if (focusWeight < 0.15) {
            issues.push(`Focus segment weight very low (${(focusWeight * 100).toFixed(0)}%) - focus message may be too short`);
        }
        const allLow = segWeights.every(w => w < 0.1);
        if (allLow) {
            issues.push('All segment weights below 10% - all messages may be extremely short');
        }
    }

    // ─────────────────────────────────────────────────────────────────
    // Anchor-match issues
    // ─────────────────────────────────────────────────────────────────

    if ((m.anchor.matched || 0) === 0 && m.anchor.needRecall) {
        issues.push('No anchors matched - may need to generate anchors');
    }

    // ─────────────────────────────────────────────────────────────────
    // Lexical-search issues
    // ─────────────────────────────────────────────────────────────────

    if ((m.lexical.terms || []).length > 0 && m.lexical.chunkHits === 0 && m.lexical.eventHits === 0) {
        issues.push('Lexical search returned zero hits - terms may not match any indexed content');
    }

    // ─────────────────────────────────────────────────────────────────
    // Fusion issues (floor-level)
    // ─────────────────────────────────────────────────────────────────

    if (m.fusion.lexFloors === 0 && m.fusion.denseFloors > 0) {
        issues.push('No lexical floors in fusion - hybrid retrieval not contributing');
    }

    if (m.fusion.afterCap === 0) {
        issues.push('Fusion produced zero floor candidates - all retrieval paths may have failed');
    }

    // ─────────────────────────────────────────────────────────────────
    // Event-recall issues
    // ─────────────────────────────────────────────────────────────────

    if (m.event.considered > 0) {
        // Dense-selection ratio counts only direct+related (vector-recalled) events.
        const denseSelected =
            (m.event.byRecallType?.direct || 0) +
            (m.event.byRecallType?.related || 0);

        const denseSelectRatio = denseSelected / m.event.considered;

        if (denseSelectRatio < 0.1) {
            issues.push(`Dense event selection ratio too low (${(denseSelectRatio * 100).toFixed(1)}%) - threshold may be too high`);
        }
        // High-ratio check only matters with a non-trivial candidate pool.
        if (denseSelectRatio > 0.6 && m.event.considered > 10) {
            issues.push(`Dense event selection ratio high (${(denseSelectRatio * 100).toFixed(1)}%) - may include noise`);
        }
    }

    // Entity-filter issues
    if (m.event.entityFilter) {
        const ef = m.event.entityFilter;
        if (ef.filtered === 0 && ef.before > 10) {
            issues.push('No events filtered by entity - focus entities may be too broad or missing');
        }
        if (ef.before > 0 && ef.filtered > ef.before * 0.8) {
            issues.push(`Too many events filtered (${ef.filtered}/${ef.before}) - focus may be too narrow`);
        }
    }

    // Similarity issues (min > 0 guards against the "no data" sentinel)
    if (m.event.similarityDistribution && m.event.similarityDistribution.min > 0 && m.event.similarityDistribution.min < 0.5) {
        issues.push(`Low similarity events included (min=${m.event.similarityDistribution.min})`);
    }

    // Causal-chain issues
    if (m.event.selected > 0 && m.event.causalCount === 0 && m.event.byRecallType.direct === 0) {
        issues.push('No direct or causal events - query may not align with stored events');
    }

    // ─────────────────────────────────────────────────────────────────
    // Floor-rerank issues
    // ─────────────────────────────────────────────────────────────────

    if (m.evidence.rerankFailed) {
        issues.push('Rerank API failed — using fusion rank order as fallback, relevance scores are zero');
    }

    if (m.evidence.rerankApplied && !m.evidence.rerankFailed) {
        if (m.evidence.rerankScores) {
            const rs = m.evidence.rerankScores;
            if (rs.max < 0.3) {
                issues.push(`Low floor rerank scores (max=${rs.max}) - query-document domain mismatch`);
            }
            if (rs.mean < 0.2) {
                issues.push(`Very low average floor rerank score (mean=${rs.mean}) - context may be weak`);
            }
        }

        if (m.evidence.rerankTime > 3000) {
            issues.push(`Slow floor rerank (${m.evidence.rerankTime}ms) - may affect response time`);
        }

        if (m.evidence.rerankDocAvgLength > 3000) {
            issues.push(`Large rerank documents (avg ${m.evidence.rerankDocAvgLength} chars) - may reduce rerank precision`);
        }
    }

    // Rerank retention rate — computed here and written back into m.quality.
    const retentionRate = m.evidence.floorCandidates > 0
        ? Math.round(m.evidence.floorsSelected / m.evidence.floorCandidates * 100)
        : 0;
    m.quality.rerankRetentionRate = retentionRate;

    if (m.evidence.floorCandidates > 0 && retentionRate < 25) {
        issues.push(`Low rerank retention rate (${retentionRate}%) - fusion ranking poorly aligned with reranker`);
    }

    // ─────────────────────────────────────────────────────────────────
    // L1 attachment issues
    // ─────────────────────────────────────────────────────────────────

    if (m.evidence.floorsSelected > 0 && m.evidence.l1Pulled === 0) {
        issues.push('Zero L1 chunks pulled - L1 vectors may not exist or DB read failed');
    }

    if (m.evidence.floorsSelected > 0 && m.evidence.l1Attached === 0 && m.evidence.l1Pulled > 0) {
        issues.push('L1 chunks pulled but none attached - cosine scores may be too low');
    }

    const l1AttachRate = m.quality.l1AttachRate || 0;
    if (m.evidence.floorsSelected > 3 && l1AttachRate < 50) {
        issues.push(`Low L1 attach rate (${l1AttachRate}%) - selected floors lack L1 chunks`);
    }

    // ─────────────────────────────────────────────────────────────────
    // Budget issues
    // ─────────────────────────────────────────────────────────────────

    if (m.budget.utilization > 90) {
        issues.push(`High budget utilization (${m.budget.utilization}%) - may be truncating content`);
    }

    // ─────────────────────────────────────────────────────────────────
    // Performance issues
    // ─────────────────────────────────────────────────────────────────

    if (m.timing.total > 8000) {
        issues.push(`Slow recall (${m.timing.total}ms) - consider optimization`);
    }

    if (m.query.buildTime > 100) {
        issues.push(`Slow query build (${m.query.buildTime}ms) - entity lexicon may be too large`);
    }

    if (m.evidence.l1CosineTime > 1000) {
        issues.push(`Slow L1 cosine scoring (${m.evidence.l1CosineTime}ms) - too many chunks pulled`);
    }

    // ─────────────────────────────────────────────────────────────────
    // Diffusion (PPR) issues
    // ─────────────────────────────────────────────────────────────────

    if (m.diffusion.graphEdges === 0 && m.diffusion.seedCount > 0) {
        issues.push('No diffusion graph edges - atoms may lack edges fields');
    }

    if (m.diffusion.pprActivated > 0 && m.diffusion.cosineGatePassed === 0) {
        issues.push('All PPR-activated nodes failed cosine gate - graph structure diverged from query semantics');
    }

    // Side effect: record diffusion effectiveness into m.quality.
    m.quality.diffusionEffectiveRate = m.diffusion.pprActivated > 0
        ? Math.round((m.diffusion.finalCount / m.diffusion.pprActivated) * 100)
        : 0;

    if (m.diffusion.cosineGateNoVector > 5) {
        issues.push(`${m.diffusion.cosineGateNoVector} PPR nodes missing vectors - L0 vectorization may be incomplete`);
    }

    if (m.diffusion.time > 50) {
        issues.push(`Slow diffusion (${m.diffusion.time}ms) - graph may be too dense`);
    }

    // Target band for the post-gate pass rate is 20–60%.
    if (m.diffusion.pprActivated > 0 && (m.diffusion.postGatePassRate < 20 || m.diffusion.postGatePassRate > 60)) {
        issues.push(`Diffusion post-gate pass rate out of target (${m.diffusion.postGatePassRate}%)`);
    }

    return issues;
}
|
||||
387
modules/story-summary/vector/retrieval/query-builder.js
Normal file
387
modules/story-summary/vector/retrieval/query-builder.js
Normal file
@@ -0,0 +1,387 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// query-builder.js - 确定性查询构建器(无 LLM)
|
||||
//
|
||||
// 职责:
|
||||
// 1. 从最近 3 条消息构建 QueryBundle(加权向量段)
|
||||
// 2. 用第一轮召回结果产出 hints 段用于 R2 增强
|
||||
//
|
||||
// 加权向量设计:
|
||||
// - 每条消息独立 embed,得到独立向量
|
||||
// - 按位置分配基础权重(焦点 > 近上下文 > 远上下文)
|
||||
// - 短消息通过 lengthFactor 自动降权(下限 35%)
|
||||
// - recall.js 负责 embed + 归一化 + 加权平均
|
||||
//
|
||||
// 焦点确定:
|
||||
// - pendingUserMessage 存在 → 它是焦点
|
||||
// - 否则 → lastMessages 最后一条是焦点
|
||||
//
|
||||
// 不负责:向量化、检索、rerank
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { getContext } from '../../../../../../../extensions.js';
|
||||
import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText, buildCharacterPools } from './entity-lexicon.js';
|
||||
import { getSummaryStore } from '../../data/store.js';
|
||||
import { filterText } from '../utils/text-filter.js';
|
||||
import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
// Weight constants
// ─────────────────────────────────────────────────────────────────────────

// R1 base weights: [...context(oldest→newest), focus]
// Focus message 55%, nearest context 30%, older context 15%.
export const FOCUS_BASE_WEIGHT = 0.55;
export const CONTEXT_BASE_WEIGHTS = [0.15, 0.30];

// R2 base weights: the focus cedes some weight to the hints segment.
export const FOCUS_BASE_WEIGHT_R2 = 0.45;
export const CONTEXT_BASE_WEIGHTS_R2 = [0.10, 0.20];
export const HINTS_BASE_WEIGHT = 0.25;

// Length penalty: linear decay below 50 chars, floored at 35%.
export const LENGTH_FULL_THRESHOLD = 50;
export const LENGTH_MIN_FACTOR = 0.35;
// Minimum normalized focus share (hard floor enforced by recall.js AFTER
// normalization). Rationale: even a very short focus text must not be
// diluted below this weight.
export const FOCUS_MIN_NORMALIZED_WEIGHT = 0.35;

// ─────────────────────────────────────────────────────────────────────────
// Other constants
// ─────────────────────────────────────────────────────────────────────────

// Caps on R2 hint construction and on the lexical query size.
const MEMORY_HINT_ATOMS_MAX = 5;
const MEMORY_HINT_EVENTS_MAX = 3;
const LEXICAL_TERMS_MAX = 10;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 工具函数
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Normalize a raw chat message for embedding/indexing (kept consistent
 * with chunk-builder / recall): shared filterText pass, then strip
 * [tts:...] tags and <state>...</state> blocks, then trim.
 * @param {string} text
 * @returns {string}
 */
function cleanMessageText(text) {
    const filtered = filterText(text);
    const withoutTts = filtered.replace(/\[tts:[^\]]*\]/gi, '');
    const withoutState = withoutTts.replace(/<state>[\s\S]*?<\/state>/gi, '');
    return withoutState.trim();
}
|
||||
|
||||
/**
 * Strip a trailing floor marker such as " (#12)" or " (#12-15)" from an
 * event summary, then trim. Nullish/falsy input yields ''.
 * @param {string} summary
 * @returns {string}
 */
function cleanSummary(summary) {
    const text = String(summary || '');
    const withoutMarker = text.replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '');
    return withoutMarker.trim();
}
|
||||
|
||||
/**
 * Length penalty factor for a message segment.
 *
 * charCount >= LENGTH_FULL_THRESHOLD (50) → 1.0
 * charCount <= 0                          → LENGTH_MIN_FACTOR (0.35)
 * in between                              → linear interpolation
 *
 * @param {number} charCount - content chars after cleaning (speaker prefix excluded)
 * @returns {number} factor in [LENGTH_MIN_FACTOR, 1.0]
 */
export function computeLengthFactor(charCount) {
    if (charCount <= 0) return LENGTH_MIN_FACTOR;
    if (charCount >= LENGTH_FULL_THRESHOLD) return 1.0;
    const span = 1.0 - LENGTH_MIN_FACTOR;
    return LENGTH_MIN_FACTOR + span * (charCount / LENGTH_FULL_THRESHOLD);
}
|
||||
|
||||
/**
 * Extract the most frequent index tokens from cleaned text, for lexical
 * (MiniSearch) retrieval. Tokens are lowercased; empty tokens dropped.
 *
 * @param {string} text - cleaned text
 * @param {number} [maxTerms] - cap on returned terms
 * @returns {string[]} terms sorted by descending frequency
 */
function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
    if (!text) return [];

    // Count occurrences of each normalized token.
    const counts = new Map();
    for (const raw of tokenizerTokenizeForIndex(text)) {
        const normalized = String(raw || '').toLowerCase();
        if (!normalized) continue;
        counts.set(normalized, (counts.get(normalized) || 0) + 1);
    }

    // Rank by frequency and keep the top maxTerms.
    const ranked = [...counts.entries()].sort((x, y) => y[1] - x[1]);
    return ranked.slice(0, maxTerms).map(([term]) => term);
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 类型定义
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* @typedef {object} QuerySegment
|
||||
* @property {string} text - 待 embed 的文本(含 speaker 前缀,纯自然语言)
|
||||
* @property {number} baseWeight - R1 基础权重
|
||||
* @property {number} charCount - 内容字符数(不含 speaker 前缀,用于 lengthFactor)
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {object} QueryBundle
|
||||
* @property {QuerySegment[]} querySegments - R1 向量段(上下文 oldest→newest,焦点在末尾)
|
||||
* @property {QuerySegment|null} hintsSegment - R2 hints 段(refinement 后填充)
|
||||
* @property {string} rerankQuery - rerank 用的纯自然语言查询(焦点在前)
|
||||
* @property {string[]} lexicalTerms - MiniSearch 查询词
|
||||
* @property {string[]} focusTerms - 焦点词(原 focusEntities)
|
||||
* @property {string[]} focusCharacters - 焦点人物(focusTerms ∩ trustedCharacters)
|
||||
* @property {string[]} focusEntities - Deprecated alias of focusTerms
|
||||
* @property {Set<string>} allEntities - Full entity lexicon (includes non-character entities)
|
||||
* @property {Set<string>} allCharacters - Union of trusted and candidate character pools
|
||||
* @property {Set<string>} trustedCharacters - Clean character pool (main/arcs/name2/L2 participants)
|
||||
* @property {Set<string>} candidateCharacters - Extended character pool from L0 edges.s/t after cleanup
|
||||
* @property {Set<string>} _lexicon - 实体词典(内部使用)
|
||||
* @property {Map<string, string>} _displayMap - 标准化→原词形映射(内部使用)
|
||||
*/
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 内部:消息条目构建
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* @typedef {object} MessageEntry
|
||||
* @property {string} text - speaker:内容(完整文本)
|
||||
* @property {number} charCount - 内容字符数(不含 speaker 前缀)
|
||||
*/
|
||||
|
||||
/**
 * Clean a chat message and build its MessageEntry ("speaker:content").
 * Returns null when the message has no body or cleans down to nothing.
 * @param {object} message - chat message object ({ mes, is_user, name })
 * @param {object} context - { name1, name2 }
 * @returns {MessageEntry|null}
 */
function buildMessageEntry(message, context) {
    const raw = message?.mes;
    if (!raw) return null;

    const content = cleanMessageText(raw);
    if (!content) return null;

    // User messages are attributed to name1; character messages prefer the
    // message's own name, falling back to name2.
    let speaker;
    if (message.is_user) {
        speaker = context.name1 || '用户';
    } else {
        speaker = message.name || context.name2 || '角色';
    }

    return {
        text: `${speaker}:${content}`,
        charCount: content.length,
    };
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 阶段 1:构建 QueryBundle
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Build the initial QueryBundle (deterministic, no LLM).
 *
 * Message layout (K=3):
 *   msg[0] = USER(#N-2)  context  baseWeight = 0.15
 *   msg[1] = AI(#N-1)    context  baseWeight = 0.30
 *   msg[2] = USER(#N)    focus    baseWeight = 0.55
 *
 * Focus selection:
 *   pendingUserMessage present → it is the focus; all lastMessages are context
 *   otherwise                  → lastMessages[-1] is the focus, the rest context
 *
 * @param {object[]} lastMessages - most recent K messages (passed in by recall.js)
 * @param {string|null} pendingUserMessage - user input not yet appended to chat
 * @param {object|null} store - summary store (defaults to getSummaryStore())
 * @param {object|null} context - { name1, name2 } (defaults from getContext())
 * @returns {QueryBundle}
 */
export function buildQueryBundle(lastMessages, pendingUserMessage, store = null, context = null) {
    if (!store) store = getSummaryStore();
    if (!context) {
        const ctx = getContext();
        context = { name1: ctx.name1, name2: ctx.name2 };
    }

    // 1. Entity / character lexicons
    const lexicon = buildEntityLexicon(store, context);
    const displayMap = buildDisplayNameMap(store, context);
    const { trustedCharacters, candidateCharacters, allCharacters } = buildCharacterPools(store, context);

    // 2. Split focus vs context
    const contextEntries = [];
    let focusEntry = null;
    const allCleanTexts = [];

    if (pendingUserMessage) {
        // Pending user input is the focus; every lastMessage is context.
        const pendingClean = cleanMessageText(pendingUserMessage);
        if (pendingClean) {
            const speaker = context.name1 || '用户';
            focusEntry = {
                text: `${speaker}:${pendingClean}`,
                charCount: pendingClean.length,
            };
            allCleanTexts.push(pendingClean);
        }

        for (const m of (lastMessages || [])) {
            const entry = buildMessageEntry(m, context);
            if (entry) {
                contextEntries.push(entry);
                allCleanTexts.push(cleanMessageText(m.mes));
            }
        }
    } else {
        // No pending input → lastMessages[-1] is the focus.
        const msgs = lastMessages || [];

        if (msgs.length > 0) {
            const lastMsg = msgs[msgs.length - 1];
            const entry = buildMessageEntry(lastMsg, context);
            if (entry) {
                focusEntry = entry;
                allCleanTexts.push(cleanMessageText(lastMsg.mes));
            }
        }

        for (let i = 0; i < msgs.length - 1; i++) {
            const entry = buildMessageEntry(msgs[i], context);
            if (entry) {
                contextEntries.push(entry);
                allCleanTexts.push(cleanMessageText(msgs[i].mes));
            }
        }
    }

    // 3. Extract focus terms and focus characters
    const combinedText = allCleanTexts.join(' ');
    const focusTerms = extractEntitiesFromText(combinedText, lexicon, displayMap);
    const focusCharacters = focusTerms.filter(term => trustedCharacters.has(term.toLowerCase()));

    // 4. Build querySegments
    //    Context first (oldest → newest), focus last.
    //    Context weights are tail-aligned against CONTEXT_BASE_WEIGHTS, so with
    //    fewer context messages than weights the larger weights are used.
    const querySegments = [];

    for (let i = 0; i < contextEntries.length; i++) {
        const weightIdx = Math.max(0, CONTEXT_BASE_WEIGHTS.length - contextEntries.length + i);
        querySegments.push({
            text: contextEntries[i].text,
            baseWeight: CONTEXT_BASE_WEIGHTS[weightIdx] || CONTEXT_BASE_WEIGHTS[0],
            charCount: contextEntries[i].charCount,
        });
    }

    if (focusEntry) {
        querySegments.push({
            text: focusEntry.text,
            baseWeight: FOCUS_BASE_WEIGHT,
            charCount: focusEntry.charCount,
        });
    }

    // 5. rerankQuery (focus first, plain natural language)
    const contextLines = contextEntries.map(e => e.text);
    const rerankQuery = focusEntry
        ? [focusEntry.text, ...contextLines].join('\n')
        : contextLines.join('\n');

    // 6. lexicalTerms (entities first, topped up with high-frequency tokens)
    const entityTerms = focusTerms.map(e => e.toLowerCase());
    const textTerms = extractKeyTerms(combinedText);
    const termSet = new Set(entityTerms);
    for (const t of textTerms) {
        if (termSet.size >= LEXICAL_TERMS_MAX) break;
        termSet.add(t);
    }

    return {
        querySegments,
        hintsSegment: null,
        rerankQuery,
        lexicalTerms: Array.from(termSet),
        focusTerms,
        focusCharacters,
        focusEntities: focusTerms, // deprecated alias (compat)
        allEntities: lexicon,
        allCharacters,
        trustedCharacters,
        candidateCharacters,
        _lexicon: lexicon,
        _displayMap: displayMap,
    };
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// 阶段 3:Query Refinement(用第一轮召回结果产出 hints 段)
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Enrich a QueryBundle with first-round recall results (R2 preparation).
 *
 * Mutates the bundle in place (query/rerank auxiliaries only):
 * - hintsSegment: filled from top anchor/event hits (used for R2 weighting)
 * - lexicalTerms: may gain a few keywords drawn from the hints
 * - rerankQuery: untouched (stays focus-first plain natural language)
 *
 * @param {QueryBundle} bundle - original query bundle
 * @param {object[]} anchorHits - first-round L0 hits (similarity-descending)
 * @param {object[]} eventHits - first-round L2 hits (similarity-descending)
 */
export function refineQueryBundle(bundle, anchorHits, eventHits) {
    const hintLines = [];

    // 1. Scene summaries from the strongest L0 hits.
    for (const hit of (anchorHits || []).slice(0, MEMORY_HINT_ATOMS_MAX)) {
        const scene = hit.atom?.semantic || '';
        if (scene) hintLines.push(scene);
    }

    // 2. "title: summary" lines from the strongest L2 hits.
    for (const hit of (eventHits || []).slice(0, MEMORY_HINT_EVENTS_MAX)) {
        const event = hit.event || {};
        const title = String(event.title || '').trim();
        const summary = cleanSummary(event.summary);
        let line;
        if (title && summary) {
            line = `${title}: ${summary}`;
        } else {
            line = title || summary;
        }
        if (line) hintLines.push(line);
    }

    // 3. Build (or clear) the hints segment.
    if (hintLines.length === 0) {
        bundle.hintsSegment = null;
        return;
    }

    const hintsText = hintLines.join('\n');
    bundle.hintsSegment = {
        text: hintsText,
        baseWeight: HINTS_BASE_WEIGHT,
        charCount: hintsText.length,
    };

    // 4. rerankQuery is intentionally left alone: the cross-encoder
    //    receives the pure natural-language query, free of hint noise.

    // 5. Top up lexicalTerms with keywords extracted from the hints.
    const seen = new Set(bundle.lexicalTerms);
    for (const term of extractKeyTerms(hintLines.join(' '), 5)) {
        if (seen.size >= LEXICAL_TERMS_MAX) break;
        if (seen.has(term)) continue;
        seen.add(term);
        bundle.lexicalTerms.push(term);
    }
}
|
||||
1399
modules/story-summary/vector/retrieval/recall.js
Normal file
1399
modules/story-summary/vector/retrieval/recall.js
Normal file
File diff suppressed because it is too large
Load Diff
261
modules/story-summary/vector/storage/chunk-store.js
Normal file
261
modules/story-summary/vector/storage/chunk-store.js
Normal file
@@ -0,0 +1,261 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Story Summary - Chunk Store (L1/L2 storage)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import {
|
||||
metaTable,
|
||||
chunksTable,
|
||||
chunkVectorsTable,
|
||||
eventVectorsTable,
|
||||
CHUNK_MAX_TOKENS,
|
||||
} from '../../data/db.js';
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 工具函数
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Copy a Float32Array's bytes into a standalone ArrayBuffer.
 * Slicing by byteOffset/byteLength keeps this correct for views into a
 * larger shared buffer (only the viewed bytes are copied).
 * @param {Float32Array} arr
 * @returns {ArrayBuffer}
 */
export function float32ToBuffer(arr) {
    const start = arr.byteOffset;
    const end = start + arr.byteLength;
    return arr.buffer.slice(start, end);
}
|
||||
|
||||
/**
 * View an ArrayBuffer as a Float32Array (no copy — wraps the buffer).
 * @param {ArrayBuffer} buffer
 * @returns {Float32Array}
 */
export function bufferToFloat32(buffer) {
    const view = new Float32Array(buffer);
    return view;
}
|
||||
|
||||
/**
 * Deterministic chunk identifier: "c-<floor>-<chunkIdx>".
 * @param {number} floor
 * @param {number} chunkIdx
 * @returns {string}
 */
export function makeChunkId(floor, chunkIdx) {
    return ['c', floor, chunkIdx].join('-');
}
|
||||
|
||||
/**
 * Fast non-cryptographic 32-bit string hash (hash = hash*31 + charCode
 * per UTF-16 unit, wrapped to int32), rendered in base 36. Math.imul
 * reproduces the original ((h << 5) - h) arithmetic exactly mod 2^32.
 * @param {string} text
 * @returns {string} base-36 representation of the signed 32-bit hash
 */
export function hashText(text) {
    let acc = 0;
    for (let i = 0; i < text.length; i += 1) {
        acc = (Math.imul(acc, 31) + text.charCodeAt(i)) | 0;
    }
    return acc.toString(36);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Meta 表操作
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Load the per-chat meta record, creating and persisting a default one
 * on first access.
 * @param {string} chatId
 * @returns {Promise<object>} the stored (or freshly created) meta record
 */
export async function getMeta(chatId) {
    const existing = await metaTable.get(chatId);
    if (existing) return existing;

    const fresh = {
        chatId,
        fingerprint: null,
        lastChunkFloor: -1,
        updatedAt: Date.now(),
    };
    await metaTable.put(fresh);
    return fresh;
}
|
||||
|
||||
/**
 * Patch the per-chat meta record, always stamping updatedAt.
 * @param {string} chatId
 * @param {object} updates - partial fields to merge into the record
 */
export async function updateMeta(chatId, updates) {
    const patch = { ...updates, updatedAt: Date.now() };
    await metaTable.update(chatId, patch);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Chunks 表操作
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Persist a batch of chunks for a chat (bulk upsert).
 * @param {string} chatId
 * @param {object[]} chunks - chunk objects from the chunk builder
 */
export async function saveChunks(chatId, chunks) {
    const records = [];
    for (const chunk of chunks) {
        records.push({
            chatId,
            chunkId: chunk.chunkId,
            floor: chunk.floor,
            chunkIdx: chunk.chunkIdx,
            speaker: chunk.speaker,
            isUser: chunk.isUser,
            text: chunk.text,
            textHash: chunk.textHash,
            createdAt: Date.now(),
        });
    }
    await chunksTable.bulkPut(records);
}
|
||||
|
||||
export async function getAllChunks(chatId) {
  // Every chunk row stored for this chat (no ordering guarantee).
  return chunksTable.where('chatId').equals(chatId).toArray();
}
|
||||
|
||||
export async function getChunksByFloors(chatId, floors) {
  // Batch-fetch chunks for a set of floors via the [chatId+floor] compound index.
  const keys = floors.map(floor => [chatId, floor]);
  return chunksTable.where('[chatId+floor]').anyOf(keys).toArray();
}
|
||||
|
||||
/**
 * Delete all chunks with floor >= fromFloor, together with their cached vectors.
 * Used when messages are regenerated/rolled back from a given floor.
 * @param {string} chatId
 * @param {number} fromFloor - first floor (inclusive) to purge
 */
export async function deleteChunksFromFloor(chatId, fromFloor) {
  // Collect the affected chunk ids first so the vector rows can be removed too.
  const doomed = await chunksTable
    .where('chatId')
    .equals(chatId)
    .filter(c => c.floor >= fromFloor)
    .toArray();
  if (doomed.length === 0) return;

  const chunkIds = doomed.map(c => c.chunkId);

  await chunksTable
    .where('chatId')
    .equals(chatId)
    .filter(c => c.floor >= fromFloor)
    .delete();

  // Vector rows are keyed by [chatId, chunkId]; delete them in parallel
  // instead of awaiting one round-trip per id.
  await Promise.all(chunkIds.map(chunkId => chunkVectorsTable.delete([chatId, chunkId])));
}
|
||||
|
||||
/**
 * Delete the chunks (and their vectors) of one specific floor.
 * @param {string} chatId
 * @param {number} floor
 */
export async function deleteChunksAtFloor(chatId, floor) {
  const doomed = await chunksTable
    .where('[chatId+floor]')
    .equals([chatId, floor])
    .toArray();
  if (doomed.length === 0) return;

  const chunkIds = doomed.map(c => c.chunkId);

  await chunksTable.where('[chatId+floor]').equals([chatId, floor]).delete();

  // Parallel vector deletes instead of one awaited round-trip per id.
  await Promise.all(chunkIds.map(chunkId => chunkVectorsTable.delete([chatId, chunkId])));
}
|
||||
|
||||
/**
 * Drop every chunk row and every chunk-vector row for this chat.
 * Sequential on purpose: chunk text goes first, then its vectors.
 */
export async function clearAllChunks(chatId) {
  await chunksTable.where('chatId').equals(chatId).delete();
  await chunkVectorsTable.where('chatId').equals(chatId).delete();
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// ChunkVectors 表操作
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
export async function saveChunkVectors(chatId, items, fingerprint) {
  // Upsert one vector row per chunk; vectors are stored as raw ArrayBuffers.
  const rows = items.map(({ chunkId, vector }) => ({
    chatId,
    chunkId,
    vector: float32ToBuffer(new Float32Array(vector)),
    dims: vector.length,
    fingerprint,
  }));
  await chunkVectorsTable.bulkPut(rows);
}
|
||||
|
||||
export async function getAllChunkVectors(chatId) {
  // Load every chunk-vector row and rehydrate buffers into Float32Arrays.
  const rows = await chunkVectorsTable.where('chatId').equals(chatId).toArray();
  return rows.map(row => ({ ...row, vector: bufferToFloat32(row.vector) }));
}
|
||||
|
||||
export async function getChunkVectorsByIds(chatId, chunkIds) {
  // Batch-load vectors for specific chunk ids via the [chatId+chunkId] compound index.
  if (!chatId || !chunkIds?.length) return [];

  const keys = chunkIds.map(id => [chatId, id]);
  const rows = await chunkVectorsTable.where('[chatId+chunkId]').anyOf(keys).toArray();

  return rows.map(row => ({
    chunkId: row.chunkId,
    vector: bufferToFloat32(row.vector),
  }));
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// EventVectors 表操作
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
export async function saveEventVectors(chatId, items, fingerprint) {
  // Upsert one vector row per event; vectors are stored as raw ArrayBuffers.
  const rows = items.map(({ eventId, vector }) => ({
    chatId,
    eventId,
    vector: float32ToBuffer(new Float32Array(vector)),
    dims: vector.length,
    fingerprint,
  }));
  await eventVectorsTable.bulkPut(rows);
}
|
||||
|
||||
export async function getAllEventVectors(chatId) {
  // Load every event-vector row and rehydrate buffers into Float32Arrays.
  const rows = await eventVectorsTable.where('chatId').equals(chatId).toArray();
  return rows.map(row => ({ ...row, vector: bufferToFloat32(row.vector) }));
}
|
||||
|
||||
/** Delete every event-vector row belonging to this chat. */
export async function clearEventVectors(chatId) {
  await eventVectorsTable.where('chatId').equals(chatId).delete();
}
|
||||
|
||||
/**
 * Delete event vectors by id; ids that do not exist are ignored.
 * @param {string} chatId
 * @param {string[]} eventIds
 */
export async function deleteEventVectorsByIds(chatId, eventIds) {
  if (!eventIds?.length) return;
  // Rows are keyed by [chatId, eventId]; run the deletes in parallel
  // instead of one awaited round-trip per id.
  await Promise.all(eventIds.map(eventId => eventVectorsTable.delete([chatId, eventId])));
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 统计与工具
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
export async function getStorageStats(chatId) {
  // Snapshot of the meta record plus row counts across all vector tables.
  const countFor = table => table.where('chatId').equals(chatId).count();
  const [meta, chunks, chunkVectors, eventVectors] = await Promise.all([
    getMeta(chatId),
    countFor(chunksTable),
    countFor(chunkVectorsTable),
    countFor(eventVectorsTable),
  ]);

  return {
    fingerprint: meta.fingerprint,
    lastChunkFloor: meta.lastChunkFloor,
    chunks,
    chunkVectors,
    eventVectors,
  };
}
|
||||
|
||||
/** Wipe everything stored for a chat: meta, chunks, chunk vectors, event vectors. */
export async function clearChatData(chatId) {
  await Promise.all([
    metaTable.delete(chatId),
    chunksTable.where('chatId').equals(chatId).delete(),
    chunkVectorsTable.where('chatId').equals(chatId).delete(),
    eventVectorsTable.where('chatId').equals(chatId).delete(),
  ]);
}
|
||||
|
||||
/**
 * Reconcile the stored embedding-engine fingerprint with the current one.
 * On mismatch: drops chunk/event vectors (chunk TEXT is kept — it can be
 * re-embedded), records the new fingerprint, resets lastChunkFloor, and
 * returns false so the caller knows vectors must be rebuilt.
 * Returns true when fingerprints agree, or when none was stored yet
 * (in which case the new one is recorded).
 * NOTE(review): state vectors are not cleared here — presumably the
 * state-store module handles its own fingerprint; confirm.
 */
export async function ensureFingerprintMatch(chatId, newFingerprint) {
  const meta = await getMeta(chatId);
  if (meta.fingerprint && meta.fingerprint !== newFingerprint) {
    await Promise.all([
      chunkVectorsTable.where('chatId').equals(chatId).delete(),
      eventVectorsTable.where('chatId').equals(chatId).delete(),
    ]);
    await updateMeta(chatId, {
      fingerprint: newFingerprint,
      lastChunkFloor: -1,
    });
    return false;
  }
  if (!meta.fingerprint) {
    await updateMeta(chatId, { fingerprint: newFingerprint });
  }
  return true;
}
|
||||
|
||||
export { CHUNK_MAX_TOKENS };
|
||||
266
modules/story-summary/vector/storage/state-store.js
Normal file
266
modules/story-summary/vector/storage/state-store.js
Normal file
@@ -0,0 +1,266 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Story Summary - State Store (L0)
|
||||
// StateAtom 存 chat_metadata(持久化)
|
||||
// StateVector 存 IndexedDB(可重建)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { saveMetadataDebounced } from '../../../../../../../extensions.js';
|
||||
import { chat_metadata } from '../../../../../../../../script.js';
|
||||
import { stateVectorsTable } from '../../data/db.js';
|
||||
import { EXT_ID } from '../../../../core/constants.js';
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
|
||||
const MODULE_ID = 'state-store';
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 工具函数
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
export function float32ToBuffer(arr) {
  // Detach a standalone ArrayBuffer copy of the array's occupied byte range.
  const begin = arr.byteOffset;
  return arr.buffer.slice(begin, begin + arr.byteLength);
}
|
||||
|
||||
export function bufferToFloat32(buffer) {
  // Zero-copy reinterpretation of a raw ArrayBuffer as float32 values.
  const floats = new Float32Array(buffer);
  return floats;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// StateAtom 操作(chat_metadata)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
function ensureStateAtomsArray() {
  // Lazily create chat_metadata.extensions[EXT_ID].stateAtoms and return the live array.
  const ext = (chat_metadata.extensions ||= {});
  const bucket = (ext[EXT_ID] ||= {});
  bucket.stateAtoms ||= [];
  return bucket.stateAtoms;
}
|
||||
|
||||
// L0Index: per-floor status (ok | empty | fail)
function ensureL0Index() {
  // Lazily create the l0Index container inside chat_metadata and return it.
  const ext = (chat_metadata.extensions ||= {});
  const bucket = (ext[EXT_ID] ||= {});
  bucket.l0Index ||= { version: 1, byFloor: {} };
  bucket.l0Index.byFloor ||= {};
  return bucket.l0Index;
}
|
||||
|
||||
/** Expose the lazily-created L0 index ({ version, byFloor }). */
export function getL0Index() {
  return ensureL0Index();
}
|
||||
|
||||
export function getL0FloorStatus(floor) {
  // Per-floor extraction status record, or null when the floor was never processed.
  const { byFloor } = ensureL0Index();
  return byFloor?.[String(floor)] || null;
}
|
||||
|
||||
export function setL0FloorStatus(floor, record) {
  // Store/overwrite the status record for one floor and persist chat metadata.
  const index = ensureL0Index();
  const stamped = { ...record, floor, updatedAt: Date.now() };
  index.byFloor[String(floor)] = stamped;
  saveMetadataDebounced();
}
|
||||
|
||||
export function clearL0Index() {
  // Drop every per-floor status record and persist.
  ensureL0Index().byFloor = {};
  saveMetadataDebounced();
}
|
||||
|
||||
export function deleteL0IndexFromFloor(fromFloor) {
  // Remove status records for every floor >= fromFloor; returns how many were dropped.
  const index = ensureL0Index();
  let removed = 0;
  for (const key of Object.keys(index.byFloor || {})) {
    const floorNum = Number(key);
    if (!Number.isFinite(floorNum) || floorNum < fromFloor) continue;
    delete index.byFloor[key];
    removed += 1;
  }
  if (removed > 0) {
    saveMetadataDebounced();
    xbLog.info(MODULE_ID, `删除 ${removed} 条 L0Index (floor >= ${fromFloor})`);
  }
  return removed;
}
|
||||
|
||||
/**
 * Return the live (mutable) StateAtoms array stored in chat_metadata
 * for the current chat; created empty on first access.
 */
export function getStateAtoms() {
  return ensureStateAtomsArray();
}
|
||||
|
||||
/**
 * Append new StateAtoms, skipping invalid entries and duplicates (by atomId).
 * Persists chat metadata only when something was actually added.
 */
export function saveStateAtoms(atoms) {
  if (!atoms?.length) return;

  const store = ensureStateAtomsArray();
  const seen = new Set(store.map(a => a.atomId));

  let added = 0;
  for (const atom of atoms) {
    // Validity gate: id, non-negative numeric floor and semantic text are mandatory.
    const valid = atom?.atomId && typeof atom.floor === 'number' && atom.floor >= 0 && atom.semantic;
    if (!valid) {
      xbLog.warn(MODULE_ID, `跳过无效 atom: ${atom?.atomId}`);
      continue;
    }
    if (seen.has(atom.atomId)) continue;
    store.push(atom);
    seen.add(atom.atomId);
    added += 1;
  }

  if (added > 0) {
    saveMetadataDebounced();
    xbLog.info(MODULE_ID, `存储 ${added} 个 StateAtom`);
  }
}
|
||||
|
||||
/**
 * Delete StateAtoms at or above the given floor; returns the number removed.
 */
export function deleteStateAtomsFromFloor(floor) {
  const store = ensureStateAtomsArray();
  const kept = store.filter(a => a.floor < floor);
  const removed = store.length - kept.length;

  chat_metadata.extensions[EXT_ID].stateAtoms = kept;

  if (removed > 0) {
    saveMetadataDebounced();
    xbLog.info(MODULE_ID, `删除 ${removed} 个 StateAtom (floor >= ${floor})`);
  }

  return removed;
}
|
||||
|
||||
/**
 * Remove every stored StateAtom; persists and logs only if anything was dropped.
 */
export function clearStateAtoms() {
  const removed = ensureStateAtomsArray().length;

  chat_metadata.extensions[EXT_ID].stateAtoms = [];

  if (removed > 0) {
    saveMetadataDebounced();
    xbLog.info(MODULE_ID, `清空 ${removed} 个 StateAtom`);
  }
}
|
||||
|
||||
/**
 * Number of StateAtoms currently stored for this chat.
 */
export function getStateAtomsCount() {
  return ensureStateAtomsArray().length;
}
|
||||
|
||||
/**
 * Return floors that already have extracted atoms.
 * @returns {Set<number>} set of non-negative floor numbers
 */
export function getExtractedFloors() {
  const floors = new Set();
  for (const atom of ensureStateAtomsArray()) {
    const floor = atom?.floor;
    if (typeof floor === 'number' && floor >= 0) {
      floors.add(floor);
    }
  }
  return floors;
}
|
||||
|
||||
/**
 * Replace all stored StateAtoms wholesale.
 * Fix: previously wrote chat_metadata.extensions[EXT_ID].stateAtoms directly,
 * which throws a TypeError if the nested path was never initialized (e.g. when
 * this is the first state-store call for a fresh chat); ensure the path first.
 */
export function replaceStateAtoms(atoms) {
  const next = Array.isArray(atoms) ? atoms : [];
  ensureStateAtomsArray(); // guarantees chat_metadata.extensions[EXT_ID] exists
  chat_metadata.extensions[EXT_ID].stateAtoms = next;
  saveMetadataDebounced();
  xbLog.info(MODULE_ID, `替换 StateAtoms: ${next.length} 条`);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// StateVector 操作(IndexedDB)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Persist L0 state vectors (semantic vector plus optional relation r-vector)
 * to IndexedDB. No-op on missing chatId or empty input.
 */
export async function saveStateVectors(chatId, items, fingerprint) {
  if (!chatId || !items?.length) return;

  const rows = items.map(({ atomId, floor, vector, rVector }) => {
    const hasR = !!rVector?.length;
    return {
      chatId,
      atomId,
      floor,
      vector: float32ToBuffer(new Float32Array(vector)),
      dims: vector.length,
      rVector: hasR ? float32ToBuffer(new Float32Array(rVector)) : null,
      rDims: hasR ? rVector.length : 0,
      fingerprint,
    };
  });

  await stateVectorsTable.bulkPut(rows);
  xbLog.info(MODULE_ID, `存储 ${rows.length} 个 StateVector`);
}
|
||||
|
||||
/**
 * Load every StateVector row for this chat, rehydrating buffers into Float32Arrays.
 */
export async function getAllStateVectors(chatId) {
  if (!chatId) return [];

  const rows = await stateVectorsTable.where('chatId').equals(chatId).toArray();
  return rows.map(row => ({
    ...row,
    vector: bufferToFloat32(row.vector),
    rVector: row.rVector ? bufferToFloat32(row.rVector) : null,
  }));
}
|
||||
|
||||
/**
 * Delete StateVectors at or above the given floor (rollback/regeneration path).
 */
export async function deleteStateVectorsFromFloor(chatId, floor) {
  if (!chatId) return;

  const deleted = await stateVectorsTable
    .where('chatId')
    .equals(chatId)
    .filter(v => v.floor >= floor)
    .delete();

  if (deleted > 0) {
    xbLog.info(MODULE_ID, `删除 ${deleted} 个 StateVector (floor >= ${floor})`);
  }
}
|
||||
|
||||
/**
 * Delete every StateVector row for this chat.
 */
export async function clearStateVectors(chatId) {
  if (!chatId) return;

  const deleted = await stateVectorsTable.where('chatId').equals(chatId).delete();
  if (deleted > 0) {
    xbLog.info(MODULE_ID, `清空 ${deleted} 个 StateVector`);
  }
}
|
||||
|
||||
/**
 * Count of StateVectors stored for this chat (0 when no chat is open).
 */
export async function getStateVectorsCount(chatId) {
  if (!chatId) return 0;
  return await stateVectorsTable.where('chatId').equals(chatId).count();
}
|
||||
385
modules/story-summary/vector/storage/vector-io.js
Normal file
385
modules/story-summary/vector/storage/vector-io.js
Normal file
@@ -0,0 +1,385 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Vector Import/Export
|
||||
// 向量数据导入导出(当前 chatId 级别)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { zipSync, unzipSync, strToU8, strFromU8 } from '../../../../libs/fflate.mjs';
|
||||
import { getContext } from '../../../../../../../extensions.js';
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import {
|
||||
getMeta,
|
||||
updateMeta,
|
||||
getAllChunks,
|
||||
getAllChunkVectors,
|
||||
getAllEventVectors,
|
||||
saveChunks,
|
||||
saveChunkVectors,
|
||||
clearAllChunks,
|
||||
clearEventVectors,
|
||||
saveEventVectors,
|
||||
} from './chunk-store.js';
|
||||
import {
|
||||
getStateAtoms,
|
||||
saveStateAtoms,
|
||||
clearStateAtoms,
|
||||
getAllStateVectors,
|
||||
saveStateVectors,
|
||||
clearStateVectors,
|
||||
} from './state-store.js';
|
||||
import { getEngineFingerprint } from '../utils/embedder.js';
|
||||
import { getVectorConfig } from '../../data/config.js';
|
||||
|
||||
const MODULE_ID = 'vector-io';
|
||||
const EXPORT_VERSION = 2;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 工具函数
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
function float32ToBytes(vectors, dims) {
  // Flatten vectors row-major into one Float32Array; missing/short entries pad with 0.
  const flat = new Float32Array(vectors.length * dims);
  vectors.forEach((vec, row) => {
    const base = row * dims;
    for (let i = 0; i < dims; i++) {
      flat[base + i] = vec[i] || 0;
    }
  });
  return new Uint8Array(flat.buffer);
}
|
||||
|
||||
/**
 * Expand a packed float32 byte payload into an array of dims-sized vectors.
 * Fix: a Float32Array view requires byteOffset to be 4-byte aligned; bytes
 * coming out of a zip may be an unaligned subarray of a larger buffer, which
 * made the original constructor throw a RangeError. Copy in that case.
 */
function bytesToFloat32(bytes, dims) {
  const view = bytes.byteOffset % 4 === 0
    ? new Float32Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 4)
    : new Float32Array(bytes.slice().buffer); // slice() copies into a fresh, aligned buffer

  const vectors = [];
  for (let i = 0; i < view.length; i += dims) {
    vectors.push(Array.from(view.slice(i, i + dims)));
  }
  return vectors;
}
|
||||
|
||||
function downloadBlob(blob, filename) {
  // Trigger a browser download via a transient object URL and anchor click.
  const url = URL.createObjectURL(blob);
  const anchor = document.createElement('a');
  anchor.href = url;
  anchor.download = filename;
  document.body.appendChild(anchor);
  anchor.click();
  anchor.remove();
  URL.revokeObjectURL(url);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 导出
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Export the current chat's vector data as a downloadable zip.
 * Layout: manifest.json + per-kind .jsonl metadata + raw float32 .bin payloads,
 * where each .jsonl line order matches the corresponding .bin row order.
 * Fix: the completion log previously contained the literal text "$(unknown)"
 * (a broken interpolation); it now logs the generated filename.
 * @param {(msg: string) => void} [onProgress] progress-message callback
 * @returns {Promise<{filename: string, size: number, chunkCount: number, eventCount: number}>}
 * @throws when no chat is open or there is nothing to export
 */
export async function exportVectors(onProgress) {
  const { chatId } = getContext();
  if (!chatId) {
    throw new Error('未打开聊天');
  }

  onProgress?.('读取数据...');

  const meta = await getMeta(chatId);
  const chunks = await getAllChunks(chatId);
  const chunkVectors = await getAllChunkVectors(chatId);
  const eventVectors = await getAllEventVectors(chatId);
  const stateAtoms = getStateAtoms();
  const stateVectors = await getAllStateVectors(chatId);

  if (chunkVectors.length === 0 && eventVectors.length === 0 && stateVectors.length === 0) {
    throw new Error('没有可导出的向量数据');
  }

  // All payloads share one dimensionality; take it from the first vector found.
  const dims = chunkVectors[0]?.vector?.length
    || eventVectors[0]?.vector?.length
    || stateVectors[0]?.vector?.length
    || 0;
  if (dims === 0) {
    throw new Error('无法确定向量维度');
  }

  onProgress?.('构建索引...');

  // Sort by chunkId so jsonl order and .bin order stay in lockstep.
  const sortedChunks = [...chunks].sort((a, b) => a.chunkId.localeCompare(b.chunkId));
  const chunkVectorMap = new Map(chunkVectors.map(cv => [cv.chunkId, cv.vector]));

  // chunks.jsonl: one JSON object per line, mirroring the chunk table row.
  const chunksJsonl = sortedChunks.map(c => JSON.stringify({
    chunkId: c.chunkId,
    floor: c.floor,
    chunkIdx: c.chunkIdx,
    speaker: c.speaker,
    isUser: c.isUser,
    text: c.text,
    textHash: c.textHash,
  })).join('\n');

  // chunk_vectors.bin follows sortedChunks order; missing vectors become zeros.
  const chunkVectorsOrdered = sortedChunks.map(c => chunkVectorMap.get(c.chunkId) || new Array(dims).fill(0));

  onProgress?.('压缩向量...');

  const sortedEventVectors = [...eventVectors].sort((a, b) => a.eventId.localeCompare(b.eventId));
  const eventsJsonl = sortedEventVectors.map(ev => JSON.stringify({
    eventId: ev.eventId,
  })).join('\n');

  const eventVectorsOrdered = sortedEventVectors.map(ev => ev.vector);

  // State vectors: semantic vector plus optional relation r-vector per atom.
  const sortedStateVectors = [...stateVectors].sort((a, b) => String(a.atomId).localeCompare(String(b.atomId)));
  const stateVectorsOrdered = sortedStateVectors.map(v => v.vector);
  const rDims = sortedStateVectors.find(v => v.rVector?.length)?.rVector?.length || dims;
  // Atoms without an r-vector get zero rows so the .bin stays rectangular.
  const stateRVectorsOrdered = sortedStateVectors.map(v =>
    v.rVector?.length ? v.rVector : new Array(rDims).fill(0)
  );
  const stateVectorsJsonl = sortedStateVectors.map(v => JSON.stringify({
    atomId: v.atomId,
    floor: v.floor,
    hasRVector: !!(v.rVector?.length),
    rDims: v.rVector?.length || 0,
  })).join('\n');

  const manifest = {
    version: EXPORT_VERSION,
    exportedAt: Date.now(),
    chatId,
    fingerprint: meta.fingerprint || '',
    dims,
    chunkCount: sortedChunks.length,
    chunkVectorCount: chunkVectors.length,
    eventCount: sortedEventVectors.length,
    stateAtomCount: stateAtoms.length,
    stateVectorCount: stateVectors.length,
    stateRVectorCount: sortedStateVectors.filter(v => v.rVector?.length).length,
    rDims,
    lastChunkFloor: meta.lastChunkFloor ?? -1,
  };

  onProgress?.('打包文件...');

  const zipData = zipSync({
    'manifest.json': strToU8(JSON.stringify(manifest, null, 2)),
    'chunks.jsonl': strToU8(chunksJsonl),
    'chunk_vectors.bin': float32ToBytes(chunkVectorsOrdered, dims),
    'events.jsonl': strToU8(eventsJsonl),
    'event_vectors.bin': float32ToBytes(eventVectorsOrdered, dims),
    'state_atoms.json': strToU8(JSON.stringify(stateAtoms)),
    'state_vectors.jsonl': strToU8(stateVectorsJsonl),
    'state_vectors.bin': stateVectorsOrdered.length
      ? float32ToBytes(stateVectorsOrdered, dims)
      : new Uint8Array(0),
    'state_r_vectors.bin': stateRVectorsOrdered.length
      ? float32ToBytes(stateRVectorsOrdered, rDims)
      : new Uint8Array(0),
  }, { level: 1 }); // low compression level: speed over size

  onProgress?.('下载文件...');

  const timestamp = new Date().toISOString().slice(0, 10).replace(/-/g, '');
  const shortChatId = chatId.slice(0, 8);
  const filename = `vectors_${shortChatId}_${timestamp}.zip`;

  downloadBlob(new Blob([zipData]), filename);

  const sizeMB = (zipData.byteLength / 1024 / 1024).toFixed(2);
  xbLog.info(MODULE_ID, `导出完成: ${filename} (${sizeMB}MB)`);

  return {
    filename,
    size: zipData.byteLength,
    chunkCount: sortedChunks.length,
    eventCount: sortedEventVectors.length,
  };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 导入
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Import vector data previously produced by exportVectors into the CURRENT chat.
 * Replaces all existing chunk/event/state data for this chat.
 * Fix: the unzip failure path discarded the original error; it is now attached
 * as the Error `cause` so the real failure is debuggable.
 * @param {File|Blob} file - exported .zip archive
 * @param {(msg: string) => void} [onProgress] progress-message callback
 * @returns {Promise<{chunkCount: number, eventCount: number, warnings: string[], fingerprintMismatch: boolean}>}
 * @throws on malformed archives, unsupported versions, or count mismatches
 */
export async function importVectors(file, onProgress) {
  const { chatId } = getContext();
  if (!chatId) {
    throw new Error('未打开聊天');
  }

  onProgress?.('读取文件...');

  const arrayBuffer = await file.arrayBuffer();
  const zipData = new Uint8Array(arrayBuffer);

  onProgress?.('解压文件...');

  let unzipped;
  try {
    unzipped = unzipSync(zipData);
  } catch (e) {
    // Keep the original failure for debugging instead of discarding it.
    throw new Error('文件格式错误,无法解压', { cause: e });
  }

  if (!unzipped['manifest.json']) {
    throw new Error('缺少 manifest.json');
  }

  const manifest = JSON.parse(strFromU8(unzipped['manifest.json']));

  if (![1, 2].includes(manifest.version)) {
    throw new Error(`不支持的版本: ${manifest.version}`);
  }

  onProgress?.('校验数据...');

  // Fingerprint mismatch means the vectors came from a different embedding
  // engine; import is still allowed but vectors must be regenerated afterwards.
  const vectorCfg = getVectorConfig();
  const currentFingerprint = vectorCfg ? getEngineFingerprint(vectorCfg) : '';
  const fingerprintMismatch = manifest.fingerprint && currentFingerprint && manifest.fingerprint !== currentFingerprint;

  // chatId mismatch is a warning only: importing another chat's data is allowed.
  const chatIdMismatch = manifest.chatId !== chatId;

  const warnings = [];
  if (fingerprintMismatch) {
    warnings.push(`向量引擎不匹配(文件: ${manifest.fingerprint}, 当前: ${currentFingerprint}),导入后需重新生成`);
  }
  if (chatIdMismatch) {
    warnings.push(`聊天ID不匹配(文件: ${manifest.chatId}, 当前: ${chatId})`);
  }

  onProgress?.('解析数据...');

  // chunks.jsonl + chunk_vectors.bin (row order matches line order)
  const chunksJsonl = unzipped['chunks.jsonl'] ? strFromU8(unzipped['chunks.jsonl']) : '';
  const chunkMetas = chunksJsonl.split('\n').filter(Boolean).map(line => JSON.parse(line));

  const chunkVectorsBytes = unzipped['chunk_vectors.bin'];
  const chunkVectors = chunkVectorsBytes ? bytesToFloat32(chunkVectorsBytes, manifest.dims) : [];

  // events.jsonl + event_vectors.bin
  const eventsJsonl = unzipped['events.jsonl'] ? strFromU8(unzipped['events.jsonl']) : '';
  const eventMetas = eventsJsonl.split('\n').filter(Boolean).map(line => JSON.parse(line));

  const eventVectorsBytes = unzipped['event_vectors.bin'];
  const eventVectors = eventVectorsBytes ? bytesToFloat32(eventVectorsBytes, manifest.dims) : [];

  // L0 state atoms (stored wholesale as one JSON document)
  const stateAtoms = unzipped['state_atoms.json']
    ? JSON.parse(strFromU8(unzipped['state_atoms.json']))
    : [];

  // L0 state-vector metadata + semantic vectors + optional r-vectors
  const stateVectorsJsonl = unzipped['state_vectors.jsonl'] ? strFromU8(unzipped['state_vectors.jsonl']) : '';
  const stateVectorMetas = stateVectorsJsonl.split('\n').filter(Boolean).map(line => JSON.parse(line));

  const stateVectorsBytes = unzipped['state_vectors.bin'];
  const stateVectors = (stateVectorsBytes && stateVectorMetas.length)
    ? bytesToFloat32(stateVectorsBytes, manifest.dims)
    : [];

  const stateRVectorsBytes = unzipped['state_r_vectors.bin'];
  const stateRVectors = (stateRVectorsBytes && stateVectorMetas.length)
    ? bytesToFloat32(stateRVectorsBytes, manifest.rDims || manifest.dims)
    : [];
  // Old exports lack hasRVector flags; detect so zero-padding rows are not
  // mistaken for real r-vectors in newer files.
  const hasRVectorMeta = stateVectorMetas.some(m => typeof m.hasRVector === 'boolean');

  // Every metadata list must align 1:1 with its binary payload.
  if (chunkMetas.length !== chunkVectors.length) {
    throw new Error(`chunk 数量不匹配: 元数据 ${chunkMetas.length}, 向量 ${chunkVectors.length}`);
  }
  if (eventMetas.length !== eventVectors.length) {
    throw new Error(`event 数量不匹配: 元数据 ${eventMetas.length}, 向量 ${eventVectors.length}`);
  }
  if (stateVectorMetas.length !== stateVectors.length) {
    throw new Error(`state 向量数量不匹配: 元数据 ${stateVectorMetas.length}, 向量 ${stateVectors.length}`);
  }
  if (stateRVectors.length > 0 && stateVectorMetas.length !== stateRVectors.length) {
    throw new Error(`state r-vector count mismatch: meta=${stateVectorMetas.length}, vectors=${stateRVectors.length}`);
  }

  onProgress?.('清空旧数据...');

  await clearAllChunks(chatId);
  await clearEventVectors(chatId);
  await clearStateVectors(chatId);
  clearStateAtoms();

  onProgress?.('写入数据...');

  if (chunkMetas.length > 0) {
    const chunksToSave = chunkMetas.map(meta => ({
      chunkId: meta.chunkId,
      floor: meta.floor,
      chunkIdx: meta.chunkIdx,
      speaker: meta.speaker,
      isUser: meta.isUser,
      text: meta.text,
      textHash: meta.textHash,
    }));
    await saveChunks(chatId, chunksToSave);

    const chunkVectorItems = chunkMetas.map((meta, idx) => ({
      chunkId: meta.chunkId,
      vector: chunkVectors[idx],
    }));
    await saveChunkVectors(chatId, chunkVectorItems, manifest.fingerprint);
  }

  if (eventMetas.length > 0) {
    const eventVectorItems = eventMetas.map((meta, idx) => ({
      eventId: meta.eventId,
      vector: eventVectors[idx],
    }));
    await saveEventVectors(chatId, eventVectorItems, manifest.fingerprint);
  }

  if (stateAtoms.length > 0) {
    saveStateAtoms(stateAtoms);
  }

  if (stateVectorMetas.length > 0) {
    const stateVectorItems = stateVectorMetas.map((meta, idx) => ({
      atomId: meta.atomId,
      floor: meta.floor,
      vector: stateVectors[idx],
      // Only trust an r-vector row when the meta flags it (or the file is legacy).
      rVector: (stateRVectors[idx] && (!hasRVectorMeta || meta.hasRVector)) ? stateRVectors[idx] : null,
    }));
    await saveStateVectors(chatId, stateVectorItems, manifest.fingerprint);
  }

  await updateMeta(chatId, {
    fingerprint: manifest.fingerprint,
    lastChunkFloor: manifest.lastChunkFloor,
  });

  xbLog.info(MODULE_ID, `导入完成: ${chunkMetas.length} chunks, ${eventMetas.length} events, ${stateAtoms.length} state atoms`);

  return {
    chunkCount: chunkMetas.length,
    eventCount: eventMetas.length,
    warnings,
    fingerprintMismatch,
  };
}
|
||||
83
modules/story-summary/vector/utils/embedder.js
Normal file
83
modules/story-summary/vector/utils/embedder.js
Normal file
@@ -0,0 +1,83 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Story Summary - Embedder (v2 - 统一硅基)
|
||||
// 所有 embedding 请求转发到 siliconflow.js
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { embed as sfEmbed, getApiKey } from '../llm/siliconflow.js';
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 统一 embed 接口
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Embed texts via the SiliconFlow service (bge-m3).
 * @param {string[]} texts
 * @param {object} [config] - legacy engine config, accepted for backward compatibility but ignored
 * @param {object} [options] - forwarded to the SiliconFlow embed call
 * @returns {Promise<number[][]>} one vector per input text
 */
export async function embed(texts, config, options = {}) {
  // The legacy config parameter is ignored; all requests route through SiliconFlow.
  return await sfEmbed(texts, options);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 指纹(简化版)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
export function getEngineFingerprint(config) {
  // Single fixed engine (SiliconFlow bge-m3, 1024 dims); config is ignored.
  const FIXED_FINGERPRINT = 'siliconflow:bge-m3:1024';
  return FIXED_FINGERPRINT;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 状态检查(简化版)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Local-model status probe, kept for API compatibility.
 * @returns {Promise<{status: string, message: string}>} always "not_supported"
 */
export async function checkLocalModelStatus() {
  // Local models are no longer supported; always report unsupported.
  return { status: 'not_supported', message: '请使用在线服务' };
}
|
||||
|
||||
/** Local models were removed; always false. */
export function isLocalModelLoaded() {
  return false;
}
|
||||
|
||||
/**
 * Local model support was removed; always rejects, directing users to the online service.
 * @throws {Error} unconditionally
 */
export async function downloadLocalModel() {
  throw new Error('本地模型已移除,请使用在线服务');
}
|
||||
|
||||
// No-op: local-model downloads were removed; kept so old callers don't break.
export function cancelDownload() { }
|
||||
|
||||
// No-op: there is no local model cache anymore; kept for API compatibility.
export async function deleteLocalModelCache() { }
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 在线服务测试
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
export async function testOnlineService() {
  // Round-trip a tiny embed request to verify the API key and connectivity.
  if (!getApiKey()) {
    throw new Error('请配置硅基 API Key');
  }

  try {
    const [probe] = await sfEmbed(['测试连接']);
    return { success: true, dims: probe?.length || 0 };
  } catch (e) {
    throw new Error(`连接失败: ${e.message}`);
  }
}
|
||||
|
||||
export async function fetchOnlineModels() {
  // The SiliconFlow embedding model list is fixed; nothing to fetch.
  return ['BAAI/bge-m3'];
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 兼容旧接口
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
// Legacy-compat exports: older callers still import these names.
export const DEFAULT_LOCAL_MODEL = 'bge-m3';

// Local models were removed; kept as an empty map for compatibility.
export const LOCAL_MODELS = {};

// The single supported online embedding provider.
export const ONLINE_PROVIDERS = {
  siliconflow: {
    id: 'siliconflow',
    name: '硅基流动',
    baseUrl: 'https://api.siliconflow.cn',
  },
};
|
||||
64
modules/story-summary/vector/utils/embedder.worker.js
Normal file
64
modules/story-summary/vector/utils/embedder.worker.js
Normal file
@@ -0,0 +1,64 @@
|
||||
// Web Worker: runs transformers.js feature extraction off the main thread.
// Message protocol (all replies echo `requestId` so the host can correlate):
//   'load'  → import transformers.js from CDN and build the pipeline
//   'embed' → vectorize `texts` with the loaded pipeline
//   'check' → report whether a pipeline is loaded and for which model

// Lazily-created transformers.js pipeline (null until 'load' succeeds).
let pipe = null;
// Model id the current pipeline was loaded for.
let currentModelId = null;

self.onmessage = async (e) => {
  const { type, modelId, hfId, texts, requestId } = e.data || {};

  if (type === 'load') {
    try {
      self.postMessage({ type: 'status', status: 'loading', requestId });

      // Dynamic import from CDN keeps transformers.js out of the extension bundle.
      const { pipeline, env } = await import(
        'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2'
      );

      // Always fetch from the hub; no local model paths, no browser cache.
      env.allowLocalModels = false;
      env.useBrowserCache = false;

      pipe = await pipeline('feature-extraction', hfId, {
        progress_callback: (progress) => {
          // Forward download progress (0-100) to the host page.
          if (progress.status === 'progress' && typeof progress.progress === 'number') {
            self.postMessage({ type: 'progress', percent: Math.round(progress.progress), requestId });
          }
        }
      });

      currentModelId = modelId;
      self.postMessage({ type: 'loaded', requestId });
    } catch (err) {
      self.postMessage({ type: 'error', error: err?.message || String(err), requestId });
    }
    return;
  }

  if (type === 'embed') {
    if (!pipe) {
      self.postMessage({ type: 'error', error: '模型未加载', requestId });
      return;
    }

    try {
      const results = [];
      // Embed sequentially so per-text progress can be reported.
      for (let i = 0; i < texts.length; i++) {
        const output = await pipe(texts[i], { pooling: 'mean', normalize: true });
        results.push(Array.from(output.data));
        self.postMessage({ type: 'embed_progress', current: i + 1, total: texts.length, requestId });
      }
      self.postMessage({ type: 'result', vectors: results, requestId });
    } catch (err) {
      self.postMessage({ type: 'error', error: err?.message || String(err), requestId });
    }
    return;
  }

  if (type === 'check') {
    self.postMessage({
      type: 'status',
      loaded: !!pipe,
      modelId: currentModelId,
      requestId
    });
  }
};
|
||||
63
modules/story-summary/vector/utils/text-filter.js
Normal file
63
modules/story-summary/vector/utils/text-filter.js
Normal file
@@ -0,0 +1,63 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Text Filter - 通用文本过滤
|
||||
// 跳过用户定义的「起始→结束」区间
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { getTextFilterRules } from '../../data/config.js';
|
||||
|
||||
/**
 * Escape every character that is special inside a regular expression,
 * so the result can be embedded in a RegExp as a literal match.
 * @param {string} str
 * @returns {string}
 */
function escapeRegex(str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, (ch) => `\\${ch}`);
}
|
||||
|
||||
/**
 * Strip user-configured "start → end" spans from `text`.
 *
 * Rule semantics (all matching is case-insensitive):
 * - start + end : delete every start...end span, boundaries included (non-greedy)
 * - only start  : delete from the first `start` through the end of the text
 * - only end    : delete from the beginning through the first `end` (inclusive)
 * - neither     : rule is skipped
 *
 * @param {string} text
 * @param {Array<{start?: string, end?: string}>} rules
 * @returns {string} filtered text (trimmed), or the input unchanged when
 *                   there is nothing to do
 */
export function applyTextFilterRules(text, rules) {
  if (!text || !rules?.length) return text;

  // Local regex-escape so rule boundaries are always matched literally.
  const quote = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  let output = text;

  for (const rule of rules) {
    const from = rule.start ?? '';
    const to = rule.end ?? '';

    if (!from && !to) continue;

    if (from && to) {
      // Closed span: remove every from...to occurrence, non-greedy so
      // adjacent spans do not merge into one giant deletion.
      const span = new RegExp(quote(from) + '[\\s\\S]*?' + quote(to), 'gi');
      output = output.replace(span, '');
    } else if (from) {
      // Open-ended rule: cut from the first match to the end of the text.
      const at = output.toLowerCase().indexOf(from.toLowerCase());
      if (at !== -1) {
        output = output.slice(0, at);
      }
    } else {
      // Head rule: cut everything up to and including the first match.
      const at = output.toLowerCase().indexOf(to.toLowerCase());
      if (at !== -1) {
        output = output.slice(at + to.length);
      }
    }
  }

  return output.trim();
}
|
||||
|
||||
/**
 * Convenience wrapper: apply the currently-configured filter rules to `text`.
 * @param {string} text
 * @returns {string}
 */
export function filterText(text) {
  const rules = getTextFilterRules();
  return applyTextFilterRules(text, rules);
}
|
||||
749
modules/story-summary/vector/utils/tokenizer.js
Normal file
749
modules/story-summary/vector/utils/tokenizer.js
Normal file
@@ -0,0 +1,749 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// tokenizer.js - 统一分词器
|
||||
//
|
||||
// 职责:
|
||||
// 1. 管理结巴 WASM 生命周期(预加载 / 就绪检测 / 降级)
|
||||
// 2. 实体词典注入(分词前最长匹配保护)
|
||||
// 3. 亚洲文字(CJK + 假名)走结巴,拉丁文字走空格分割
|
||||
// 4. 提供 tokenize(text): string[] 统一接口
|
||||
//
|
||||
// 加载时机:
|
||||
// - 插件初始化时 storySummary.enabled && vectorConfig.enabled → preload()
|
||||
// - 向量开关从 off→on 时 → preload()
|
||||
// - CHAT_CHANGED 时 → injectEntities() + warmup 索引(不负责加载 WASM)
|
||||
//
|
||||
// 降级策略:
|
||||
// - WASM 未就绪时 → 实体保护 + 标点分割(不用 bigram)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { extensionFolderPath } from '../../../../core/constants.js';
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
|
||||
const MODULE_ID = 'tokenizer';
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// WASM 状态机
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Jieba WASM lifecycle states.
 * @enum {string}
 */
const WasmState = {
  IDLE: 'IDLE',
  LOADING: 'LOADING',
  READY: 'READY',
  FAILED: 'FAILED',
};

// Current jieba WASM lifecycle state.
let wasmState = WasmState.IDLE;

/** @type {Promise<void>|null} in-flight load promise (re-entrancy guard) */
let loadingPromise = null;

/** @type {typeof import('../../../../libs/jieba-wasm/jieba_rs_wasm.js')|null} */
let jiebaModule = null;

/** @type {Function|null} cached reference to jieba's cut() */
let jiebaCut = null;

/** @type {Function|null} cached reference to jieba's add_word() */
let jiebaAddWord = null;

/** @type {object|null} TinySegmenter instance (Japanese segmentation) */
let tinySegmenter = null;

// ═══════════════════════════════════════════════════════════════════════════
// Entity dictionary
// ═══════════════════════════════════════════════════════════════════════════

/** @type {string[]} entities sorted by length descending (longest-match first) */
let entityList = [];

/** @type {Set<string>} entities already registered with jieba (avoid duplicate add_word) */
let injectedEntities = new Set();
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 停用词
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
// Tokens filtered out by tokenize()/tokenizeForIndex() before indexing.
const STOP_WORDS = new Set([
  // High-frequency Chinese function words (a couple of entries repeat;
  // harmless, since Set deduplicates)
  '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
  '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
  '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
  '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
  '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
  '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
  '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
  '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
  '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
  '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
  '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
  // Common Japanese function words (≥2 chars, matching TinySegmenter output granularity)
  'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある',
  'なる', 'れる', 'られ', 'られる',
  'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
  'これ', 'それ', 'あれ', 'どれ',
  'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
  'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
  // Common English stop words
  'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
  'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
  'would', 'could', 'should', 'may', 'might', 'can', 'shall',
  'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
  'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
  'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
  'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
  'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
  'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
  'both', 'few', 'more', 'most', 'other', 'some', 'such',
  'only', 'own', 'same', 'just', 'very', 'also', 'about',
]);
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Unicode 分类
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Is `code` a kana character (hiragana or katakana, incl. halfwidth forms)?
 * @param {number} code - char code / code point
 * @returns {boolean}
 */
function isKana(code) {
  if (code >= 0x3040 && code <= 0x309F) return true; // Hiragana
  if (code >= 0x30A0 && code <= 0x30FF) return true; // Katakana
  if (code >= 0x31F0 && code <= 0x31FF) return true; // Katakana Phonetic Extensions
  return code >= 0xFF65 && code <= 0xFF9F;           // Halfwidth Katakana
}
|
||||
|
||||
/**
 * Is `code` a CJK ideograph (excludes kana)?
 * @param {number} code - char code / code point
 * @returns {boolean}
 */
function isCJK(code) {
  if (code >= 0x4E00 && code <= 0x9FFF) return true;   // Unified Ideographs
  if (code >= 0x3400 && code <= 0x4DBF) return true;   // Extension A
  if (code >= 0xF900 && code <= 0xFAFF) return true;   // Compatibility Ideographs
  return code >= 0x20000 && code <= 0x2A6DF;           // Extension B (astral plane)
}
|
||||
|
||||
/**
 * Is `code` an "Asian" character for segmentation purposes — i.e. a CJK
 * ideograph or kana? Used by segmentByScript to route text to the CJK
 * tokenizers.
 * @param {number} code - char code / code point
 * @returns {boolean}
 */
function isAsian(code) {
  if (isCJK(code)) return true;
  return isKana(code);
}
|
||||
|
||||
/**
 * Is `code` a Latin letter or an ASCII digit?
 * @param {number} code - char code / code point
 * @returns {boolean}
 */
function isLatin(code) {
  const upper = code >= 0x41 && code <= 0x5A;     // A-Z
  const lower = code >= 0x61 && code <= 0x7A;     // a-z
  const digit = code >= 0x30 && code <= 0x39;     // 0-9
  const extended = code >= 0xC0 && code <= 0x024F; // Latin-1 Supp + Extended (à á â …)
  return upper || lower || digit || extended;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 文本分段(亚洲 vs 拉丁 vs 其他)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * @typedef {'asian'|'latin'|'other'} SegmentType
 */

/**
 * @typedef {object} TextSegment
 * @property {SegmentType} type - segment type
 * @property {string} text - segment text
 */

/**
 * Split text into runs of same-script characters (asian / latin / other).
 * Consecutive characters of the same class merge into one segment, and
 * 'other' segments that are pure whitespace are dropped.
 *
 * Fix: iterate by code point (for...of) instead of charCodeAt, so
 * supplementary-plane CJK ideographs (U+20000..U+2A6DF, which isCJK
 * explicitly covers) are classified 'asian' rather than falling through
 * as unpaired surrogate code units.
 *
 * @param {string} text
 * @returns {TextSegment[]}
 */
function segmentByScript(text) {
  if (!text) return [];

  const segments = [];
  let currentType = null;
  let buffer = '';

  const flush = () => {
    // Whitespace-only 'other' runs carry no tokens; skip them.
    if (currentType !== null && buffer && (currentType !== 'other' || buffer.trim())) {
      segments.push({ type: currentType, text: buffer });
    }
  };

  for (const ch of text) { // for...of iterates by code point (surrogate-pair safe)
    const code = ch.codePointAt(0);
    let type;
    if (isAsian(code)) {
      type = 'asian';
    } else if (isLatin(code)) {
      type = 'latin';
    } else {
      type = 'other';
    }

    if (type !== currentType) {
      flush();
      currentType = type;
      buffer = ch;
    } else {
      buffer += ch;
    }
  }

  // Flush the trailing segment.
  flush();

  return segments;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 亚洲文字语言检测(中文 vs 日语)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Classify an Asian-script segment as Chinese or Japanese.
 * Japanese prose is typically 40-60% kana, so a kana share above 30%
 * is taken as Japanese; otherwise Chinese. 'other' means the segment
 * contained no CJK/kana characters at all.
 * @param {string} text - Asian-script segment
 * @returns {'zh'|'ja'|'other'}
 */
function detectAsianLanguage(text) {
  let kana = 0;
  let han = 0;

  for (const ch of text) {
    const cp = ch.codePointAt(0);
    if (isKana(cp)) {
      kana += 1;
    } else if (isCJK(cp)) {
      han += 1;
    }
  }

  const counted = kana + han;
  if (counted === 0) return 'other';
  return kana / counted > 0.3 ? 'ja' : 'zh';
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 实体保护(最长匹配占位符替换)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
// Placeholders are built purely from Private Use Area characters so no Latin
// letters leak into tokenizer output; shape: \uE000\uE010<idx>\uE001.
const PLACEHOLDER_PREFIX = '\uE000\uE010';
const PLACEHOLDER_SUFFIX = '\uE001';
|
||||
|
||||
/**
 * Protect dictionary entities in `text` by replacing each occurrence with
 * a unique PUA placeholder so the tokenizer cannot split them.
 *
 * `entityList` is sorted by length descending, so longer entities win
 * when they overlap shorter ones. Matching is case-insensitive; the map
 * stores the text exactly as it appeared in the input.
 *
 * Fix: the lowercase search string is now recomputed after every
 * replacement. Previously it was computed once per entity, but `masked`
 * mutates inside the loop (placeholder length != entity length), so all
 * subsequent match positions came from a stale string and could splice
 * the wrong span of the masked text or skip occurrences.
 *
 * @param {string} text - raw input text
 * @returns {{masked: string, entities: Map<string, string>}}
 *          masked text plus placeholder → original-text map
 */
function maskEntities(text) {
  const entities = new Map();

  if (!entityList.length || !text) {
    return { masked: text, entities };
  }

  let masked = text;
  let idx = 0;

  // entityList is length-descending → longest match takes priority.
  for (const entity of entityList) {
    const lowerEntity = entity.toLowerCase();
    let searchFrom = 0;

    while (true) {
      // Search the CURRENT masked string — it changes on every replacement.
      const pos = masked.toLowerCase().indexOf(lowerEntity, searchFrom);
      if (pos === -1) break;

      // Skip matches adjacent to an existing placeholder (PUA boundary chars).
      const aroundStart = Math.max(0, pos - 4);
      const aroundEnd = Math.min(masked.length, pos + entity.length + 4);
      const around = masked.slice(aroundStart, aroundEnd);
      if (around.includes('\uE000') || around.includes('\uE001')) {
        searchFrom = pos + 1;
        continue;
      }

      const placeholder = `${PLACEHOLDER_PREFIX}${idx}${PLACEHOLDER_SUFFIX}`;
      const originalText = masked.slice(pos, pos + entity.length);
      entities.set(placeholder, originalText);

      masked = masked.slice(0, pos) + placeholder + masked.slice(pos + entity.length);
      idx++;

      // Resume searching right after the inserted placeholder.
      searchFrom = pos + placeholder.length;
    }
  }

  return { masked, entities };
}
|
||||
|
||||
/**
 * Replace PUA placeholders in a token list with their original entities.
 *
 * - A token that IS a placeholder becomes the original entity.
 * - A token merely CONTAINING a placeholder collapses to that entity
 *   (surrounding fragments are discarded).
 * - Tokens with PUA characters but no known placeholder are dropped.
 * - Ordinary tokens pass through unchanged.
 *
 * @param {string[]} tokens
 * @param {Map<string, string>} entities - placeholder → original text
 * @returns {string[]}
 */
function unmaskTokens(tokens, entities) {
  if (!entities.size) return tokens;

  const restored = [];

  for (const token of tokens) {
    // Exact placeholder token → straight lookup.
    if (entities.has(token)) {
      restored.push(entities.get(token));
      continue;
    }

    // No PUA characters at all → ordinary token, keep as-is.
    if (!/[\uE000-\uE0FF]/.test(token)) {
      restored.push(token);
      continue;
    }

    // Token carries PUA chars: recover the first embedded placeholder, if any.
    let recovered = null;
    for (const [placeholder, original] of entities) {
      if (token.includes(placeholder)) {
        recovered = original;
        break;
      }
    }
    if (recovered !== null) {
      restored.push(recovered);
    }
    // else: pure PUA fragment → dropped
  }

  return restored;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 分词:亚洲文字(结巴 / 降级)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Tokenize an Asian-script segment with the jieba WASM tokenizer.
 * Returns [] when jieba is unavailable; falls back to the punctuation
 * splitter if the WASM call itself throws. Tokens shorter than 2 chars
 * are discarded.
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeAsianJieba(text) {
  if (!text || !jiebaCut) return [];

  try {
    const pieces = Array.from(jiebaCut(text, true)); // hmm=true for unseen words
    const tokens = [];
    for (const piece of pieces) {
      const word = String(piece || '').trim();
      if (word.length >= 2) tokens.push(word);
    }
    return tokens;
  } catch (e) {
    xbLog.warn(MODULE_ID, '结巴分词异常,降级处理', e);
    return tokenizeAsianFallback(text);
  }
}
|
||||
|
||||
/**
 * Degraded tokenizer used when jieba WASM is unavailable: split on
 * punctuation/whitespace, keep 2-6 char fragments whole, and chop longer
 * fragments with a 4-char window advancing 2 chars at a time (far
 * sparser than a bigram index), plus the fragment's first 6 chars.
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeAsianFallback(text) {
  if (!text) return [];

  const tokens = [];

  // Split on CJK + ASCII punctuation and whitespace.
  const parts = text.split(/[\s,。!?、;:""''()【】《》…—\-,.!?;:'"()[\]{}<>/\\|@#$%^&*+=~`]+/);

  for (const rawPart of parts) {
    const part = rawPart.trim();
    if (!part) continue;

    if (part.length <= 6) {
      // Single-character fragments carry too little signal — drop them.
      if (part.length >= 2) tokens.push(part);
      continue;
    }

    // Long fragment: sliding 4-char windows, stride 2.
    for (let start = 0; start + 4 <= part.length; start += 2) {
      tokens.push(part.slice(start, start + 4));
    }
    // Also keep the fragment's head for longer-span matching.
    tokens.push(part.slice(0, 6));
  }

  return tokens;
}
|
||||
|
||||
/**
 * Tokenize Japanese text with TinySegmenter when it is loaded (dropping
 * sub-2-char tokens); otherwise — or if segmentation throws — use the
 * punctuation-split fallback.
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeJapanese(text) {
  if (!tinySegmenter) {
    return tokenizeAsianFallback(text);
  }

  try {
    const tokens = [];
    for (const word of tinySegmenter.segment(text)) {
      const w = String(word || '').trim();
      if (w.length >= 2) tokens.push(w);
    }
    return tokens;
  } catch (e) {
    xbLog.warn(MODULE_ID, 'TinySegmenter 分词异常,降级处理', e);
    return tokenizeAsianFallback(text);
  }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 分词:拉丁文字
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Tokenize a Latin-script segment: split on whitespace/punctuation,
 * lowercase each piece, and keep tokens of 3+ characters.
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeLatin(text) {
  if (!text) return [];

  const words = [];
  for (const raw of text.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/)) {
    const word = raw.trim().toLowerCase();
    if (word.length >= 3) words.push(word);
  }
  return words;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 公开接口:preload
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Preload the jieba WASM tokenizer.
 *
 * Safe to call repeatedly: an in-flight load is awaited instead of
 * restarted, and a FAILED state allows a retry on the next call.
 * Also kicks off the (independent, fire-and-forget) TinySegmenter load.
 *
 * @returns {Promise<boolean>} true when jieba is ready after this call
 */
export async function preload() {
  // TinySegmenter loads independently of the jieba state machine.
  // (Deliberately not awaited — Japanese falls back gracefully meanwhile.)
  loadTinySegmenter();

  // Already loaded — nothing to do.
  if (wasmState === WasmState.READY) return true;

  // A load is in flight: wait for it rather than starting another.
  if (wasmState === WasmState.LOADING && loadingPromise) {
    try {
      await loadingPromise;
      return wasmState === WasmState.READY;
    } catch {
      return false;
    }
  }

  // IDLE or FAILED → start (or retry) the load.
  wasmState = WasmState.LOADING;

  const T0 = performance.now();

  loadingPromise = (async () => {
    try {
      // Absolute path (leading /) so the WASM resolves regardless of page URL.
      const wasmPath = `/${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm_bg.wasm`;

      // eslint-disable-next-line no-unsanitized/method
      jiebaModule = await import(
        `/${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm.js`
      );

      // Initialize WASM (newer wasm-bindgen API takes an options object).
      if (typeof jiebaModule.default === 'function') {
        await jiebaModule.default({ module_or_path: wasmPath });
      }

      // Cache function references for the hot tokenize path.
      jiebaCut = jiebaModule.cut;
      jiebaAddWord = jiebaModule.add_word;

      if (typeof jiebaCut !== 'function') {
        throw new Error('jieba cut 函数不存在');
      }

      wasmState = WasmState.READY;

      const elapsed = Math.round(performance.now() - T0);
      xbLog.info(MODULE_ID, `结巴 WASM 加载完成 (${elapsed}ms)`);

      // Entities registered before the WASM was ready must be re-injected now.
      if (entityList.length > 0 && jiebaAddWord) {
        reInjectAllEntities();
      }

      return true;
    } catch (e) {
      wasmState = WasmState.FAILED;
      xbLog.error(MODULE_ID, '结巴 WASM 加载失败', e);
      throw e;
    }
  })();

  try {
    await loadingPromise;
    return true;
  } catch {
    return false;
  } finally {
    // Clear so a FAILED state can retry on the next preload() call.
    loadingPromise = null;
  }
}
|
||||
|
||||
/**
 * Lazily load TinySegmenter for Japanese segmentation (non-blocking;
 * callers fire-and-forget). On failure, Japanese text silently degrades
 * to the punctuation-split fallback tokenizer.
 *
 * NOTE(review): the only guard is `if (tinySegmenter) return;` — two
 * overlapping calls during the initial load will both run the dynamic
 * import. Presumably harmless (module imports are cached), but confirm.
 */
async function loadTinySegmenter() {
  if (tinySegmenter) return;

  try {
    // eslint-disable-next-line no-unsanitized/method
    const mod = await import(
      `/${extensionFolderPath}/libs/tiny-segmenter.js`
    );
    // The library may export the constructor either named or as default.
    const Ctor = mod.TinySegmenter || mod.default;
    tinySegmenter = new Ctor();
    xbLog.info(MODULE_ID, 'TinySegmenter 加载完成');
  } catch (e) {
    xbLog.warn(MODULE_ID, 'TinySegmenter 加载失败,日语将使用降级分词', e);
  }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 公开接口:isReady
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Whether the jieba WASM tokenizer has finished loading.
 * @returns {boolean}
 */
export function isReady() {
  return wasmState === WasmState.READY;
}
|
||||
|
||||
/**
 * Current WASM lifecycle state (IDLE / LOADING / READY / FAILED).
 * @returns {string}
 */
export function getState() {
  return wasmState;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 公开接口:injectEntities
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Replace the entity dictionary used for longest-match protection.
 *
 * Entities are stored as display forms, sorted length-descending; if the
 * jieba WASM is already loaded they are also registered via add_word so
 * jieba keeps them intact.
 *
 * @param {Set<string>} lexicon - normalized entity set
 * @param {Map<string, string>} [displayMap] - normalized → original display form
 */
export function injectEntities(lexicon, displayMap) {
  if (!lexicon?.size) {
    entityList = [];
    return;
  }

  // Resolve display forms; drop 1-char entities (too noisy to protect).
  const collected = [];
  for (const key of lexicon) {
    const display = displayMap?.get(key) || key;
    if (display.length >= 2) collected.push(display);
  }

  // Length-descending so the longest match wins during masking.
  collected.sort((a, b) => b.length - a.length);
  entityList = collected;

  // Register with jieba immediately when it is already loaded.
  if (wasmState === WasmState.READY && jiebaAddWord) {
    injectNewEntitiesToJieba(collected);
  }

  xbLog.info(MODULE_ID, `实体词典更新: ${collected.length} 个实体`);
}
|
||||
|
||||
/**
 * Register entities with jieba via add_word, skipping ones that were
 * already injected. A very high frequency keeps jieba from splitting them.
 * @param {string[]} entities
 */
function injectNewEntitiesToJieba(entities) {
  let added = 0;

  for (const entity of entities) {
    if (injectedEntities.has(entity)) continue;
    try {
      jiebaAddWord(entity, 99999); // high freq → never split apart
      injectedEntities.add(entity);
      added++;
    } catch (e) {
      xbLog.warn(MODULE_ID, `add_word 失败: ${entity}`, e);
    }
  }

  if (added > 0) {
    xbLog.info(MODULE_ID, `注入 ${added} 个新实体到结巴`);
  }
}
|
||||
|
||||
/**
 * Re-register the full entity list with jieba. Called right after the
 * WASM finishes loading, since add_word calls made before that are lost.
 */
function reInjectAllEntities() {
  injectedEntities.clear();
  injectNewEntitiesToJieba(entityList);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 公开接口:tokenize
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Tokenize free text for retrieval.
 *
 * Pipeline: entity masking → script segmentation → per-script tokenizers
 * → placeholder restore (all inside tokenizeCore), followed here by
 * stop-word filtering and case-insensitive dedup. Output tokens keep
 * their original casing.
 *
 * @param {string} text
 * @returns {string[]} filtered, deduplicated tokens
 */
export function tokenize(text) {
  const seen = new Set();
  const tokens = [];

  for (const raw of tokenizeCore(text)) {
    const token = raw.trim();
    const key = token.toLowerCase();

    // Drop empties/1-char tokens, stop words, repeats, and
    // punctuation/symbol-only fragments.
    if (key.length < 2) continue;
    if (STOP_WORDS.has(key)) continue;
    if (seen.has(key)) continue;
    if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(key)) continue;

    seen.add(key);
    tokens.push(token); // original casing preserved
  }

  return tokens;
}
|
||||
|
||||
/**
 * Core tokenization pipeline: entity masking → script segmentation →
 * per-script tokenizers → placeholder restore. No dedup or lowercasing
 * here — callers (tokenize / tokenizeForIndex) apply their own filters.
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeCore(text) {
  if (!text) return [];

  const input = String(text).trim();
  if (!input) return [];

  // 1. Protect dictionary entities behind PUA placeholders.
  const { masked, entities } = maskEntities(input);

  // 2-3. Split by Unicode script and tokenize each run with the right engine.
  const rawTokens = [];
  for (const seg of segmentByScript(masked)) {
    if (seg.type === 'latin') {
      rawTokens.push(...tokenizeLatin(seg.text));
      continue;
    }
    if (seg.type !== 'asian') continue; // 'other' runs carry no tokens

    if (detectAsianLanguage(seg.text) === 'ja') {
      rawTokens.push(...tokenizeJapanese(seg.text));
    } else if (wasmState === WasmState.READY && jiebaCut) {
      rawTokens.push(...tokenizeAsianJieba(seg.text));
    } else {
      rawTokens.push(...tokenizeAsianFallback(seg.text));
    }
  }

  // 4. Swap placeholders back for the original entities.
  return unmaskTokens(rawTokens, entities);
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 公开接口:tokenizeForIndex
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Tokenizer for the MiniSearch index.
 *
 * Unlike tokenize(): everything is lowercased (MiniSearch requires
 * consistent casing) and duplicates are kept (MiniSearch tracks term
 * frequency itself).
 *
 * @param {string} text
 * @returns {string[]}
 */
export function tokenizeForIndex(text) {
  const out = [];

  for (const raw of tokenizeCore(text)) {
    const t = raw.trim().toLowerCase();
    if (t.length < 2) continue;
    if (STOP_WORDS.has(t)) continue;
    if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) continue;
    out.push(t);
  }

  return out;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 公开接口:reset
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Reset the tokenizer's dictionary state (for tests or module unload).
 * The WASM load state is deliberately left alone so a later preload()
 * does not re-download the module.
 */
export function reset() {
  entityList = [];
  injectedEntities.clear();
  // WASM state is NOT reset (avoids a redundant reload).
}
|
||||
Reference in New Issue
Block a user