// File: LittleWhiteBox/modules/story-summary/vector/retrieval/query-builder.js
// Note: this file contains non-ASCII characters (CJK text, box-drawing rules) in comments and string literals.
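// ═══════════════════════════════════════════════════════════════════════════
// query-builder.js - deterministic query builder (no LLM)
//
// Responsibilities:
// 1. Build QueryBundle_v0 from the most recent messages + the entity lexicon
// 2. Enrich it into QueryBundle_v1 using the first-round recall results
//
// Not responsible for: vectorization, retrieval, rerank
// ═══════════════════════════════════════════════════════════════════════════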
import { getContext } from '../../../../../../../extensions.js';
import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText } from './entity-lexicon.js';
import { getSummaryStore } from '../../data/store.js';
import { filterText } from '../utils/text-filter.js';
import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
// ─────────────────────────────────────────────────────────────────────────
// Constants
// ─────────────────────────────────────────────────────────────────────────
// Zero-darkbox policy:
// - No internal truncation. We rely on model-side truncation / provider limits.
// - If provider rejects due to length, we fail loudly and degrade explicitly.
// Max number of first-round anchor hits that refineQueryBundle scans for
// memory hints and for focus-entity expansion.
const MEMORY_HINT_ATOMS_MAX = 5;
// Max number of first-round event hits that refineQueryBundle turns into memory hints.
const MEMORY_HINT_EVENTS_MAX = 3;
// Default result size of extractKeyTerms, and the cap checked while free-text
// terms are merged into lexicalTerms (entity terms are added before the cap is checked).
const LEXICAL_TERMS_MAX = 10;
// ─────────────────────────────────────────────────────────────────────────
// Utility functions
// ─────────────────────────────────────────────────────────────────────────
/**
 * Normalize a raw chat message before it is used to build queries.
 *
 * The text is first passed through `filterText`; then `[tts:...]` markers and
 * `<state>...</state>` blocks are removed (case-insensitively) and the result
 * is trimmed. The original note says this is meant to stay consistent with
 * the cleaning done by chunk-builder / recall (not visible from this file).
 *
 * @param {string} text - Raw message text.
 * @returns {string} The cleaned text.
 */
function cleanMessageText(text) {
    const filtered = filterText(text);
    const withoutTtsMarkers = filtered.replace(/\[tts:[^\]]*\]/gi, '');
    const withoutStateBlocks = withoutTtsMarkers.replace(/<state>[\s\S]*?<\/state>/gi, '');
    return withoutStateBlocks.trim();
}
// NOTE: the former `truncate(text, maxLen)` helper was removed by design
// (zero-darkbox policy: no internal truncation).
/**
 * Strip a trailing floor (message-number) marker such as ` (#12)` or
 * ` (#12-15)` from an event summary, then trim the result.
 *
 * Only a marker at the very end of the text is removed.
 *
 * @param {string} summary - Event summary; falsy values are treated as ''.
 * @returns {string} The summary without the trailing marker.
 */
function cleanSummary(summary) {
    const trailingFloorMarker = /\s*\(#\d+(?:-\d+)?\)\s*$/;
    const raw = summary ? String(summary) : '';
    return raw.replace(trailingFloorMarker, '').trim();
}
/**
 * Pick the most frequent tokens of a text, for lexical retrieval.
 *
 * Tokens come from `tokenizerTokenizeForIndex` (the original note describes it
 * as the shared tokenizer: jieba + entity protection + stop-word filtering;
 * that module is not visible from this file). Each token is lower-cased and
 * counted, and the terms are returned in descending order of frequency.
 *
 * @param {string} text - Already-cleaned text.
 * @param {number} maxTerms - Maximum number of terms to return.
 * @returns {string[]} Lower-cased terms, most frequent first.
 */
function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
    if (!text) {
        return [];
    }

    // Count occurrences per lower-cased token; empty tokens are ignored.
    const counts = new Map();
    for (const rawToken of tokenizerTokenizeForIndex(text)) {
        const term = String(rawToken || '').toLowerCase();
        if (term) {
            counts.set(term, (counts.get(term) ?? 0) + 1);
        }
    }

    const ranked = [...counts].sort(([, countA], [, countB]) => countB - countA);
    return ranked.slice(0, maxTerms).map(([term]) => term);
}
/**
 * Build the query string handed to the reranker.
 *
 * The result is plain text, one part per line, without the [ENTITIES] /
 * [DIALOGUE] section tags used by the embedding query:
 *   1. an optional entity hint line (`关于A、B`);
 *   2. one `speaker:text` line per recent message that is non-empty after cleaning;
 *   3. one `user:text` line for the pending user message, if non-empty after cleaning.
 *
 * @param {string[]} focusEntities - Focus entities; null/undefined is treated as empty.
 * @param {object[]} lastMessages - Most recent chat messages ({ is_user, name, mes }).
 * @param {string|null} pendingUserMessage - User message that is about to be sent.
 * @param {object} context - { name1, name2 }: display names of the user and the character.
 * @returns {string} The parts joined with '\n'; '' when there is nothing to include.
 */
function buildRerankQuery(focusEntities, lastMessages, pendingUserMessage, context) {
    // Fall back to generic labels when a display name is unavailable.
    const userName = context?.name1 || '用户';
    const characterName = context?.name2 || '角色';
    const parts = [];

    // Entity hint
    const entities = focusEntities || [];
    if (entities.length > 0) {
        parts.push(`关于${entities.join('、')}`);
    }

    // Recent dialogue, verbatim (after cleaning)
    for (const m of (lastMessages || [])) {
        const speaker = m.is_user ? userName : (m.name || characterName);
        const clean = cleanMessageText(m.mes || '');
        if (clean) {
            // The separator is required: without it the speaker name runs
            // straight into the utterance (e.g. "Alice你好").
            parts.push(`${speaker}:${clean}`);
        }
    }

    // Pending user message
    if (pendingUserMessage) {
        const clean = cleanMessageText(pendingUserMessage);
        if (clean) {
            parts.push(`${userName}:${clean}`);
        }
    }

    return parts.join('\n');
}
// ─────────────────────────────────────────────────────────────────────────
// QueryBundle type definition (JSDoc)
// ─────────────────────────────────────────────────────────────────────────
/**
 * @typedef {object} QueryBundle
 * @property {string[]} focusEntities - Focus entities (original surface form, name1 excluded)
 * @property {string} queryText_v0 - First-round query text
 * @property {string|null} queryText_v1 - Second-round query text (filled in after refinement)
 * @property {string} rerankQuery - Short query used for reranking
 * @property {string[]} lexicalTerms - MiniSearch query terms
 * @property {Set<string>} _lexicon - Entity lexicon (internal use)
 * @property {Map<string, string>} _displayMap - Normalized form → original surface form (internal use)
 */
// ─────────────────────────────────────────────────────────────────────────
// Stage 1: build QueryBundle_v0
// ─────────────────────────────────────────────────────────────────────────
/**
 * Build the initial query bundle (QueryBundle_v0).
 *
 * Steps: build the entity lexicon and display-name map, clean the recent
 * messages and the pending user message, extract focus entities from the
 * combined cleaned text, then assemble the tagged embedding query
 * (`[ENTITIES]` / `[DIALOGUE]` / `[PENDING_USER]` sections separated by blank
 * lines), the rerank query, and the lexical terms. Nothing is truncated.
 *
 * @param {object[]} lastMessages - Most recent chat messages (the original note says K=2).
 * @param {string|null} pendingUserMessage - User input that is not yet part of the chat.
 * @param {object|null} store - Summary store; fetched via getSummaryStore() when falsy.
 * @param {object|null} context - { name1, name2 }; read from getContext() when falsy.
 * @returns {QueryBundle}
 */
export function buildQueryBundle(lastMessages, pendingUserMessage, store = null, context = null) {
    // Resolve optional dependencies without reassigning the parameters.
    const activeStore = store ? store : getSummaryStore();
    let names = context;
    if (!names) {
        const { name1, name2 } = getContext();
        names = { name1, name2 };
    }

    // 1. Entity lexicon
    const lexicon = buildEntityLexicon(activeStore, names);
    const displayMap = buildDisplayNameMap(activeStore, names);

    // 2. Clean the recent messages; messages that end up empty are skipped.
    const dialogueLines = [];
    const cleanedTexts = [];
    for (const message of (lastMessages || [])) {
        const speaker = message.is_user
            ? (names.name1 || '用户')
            : (message.name || names.name2 || '角色');
        const cleaned = cleanMessageText(message.mes || '');
        if (!cleaned) continue;
        // No floor numbers (the original note says the embedding model does not need them).
        dialogueLines.push(`${speaker}: ${cleaned}`);
        cleanedTexts.push(cleaned);
    }

    // 3. Pending user message
    const pendingClean = pendingUserMessage ? cleanMessageText(pendingUserMessage) : '';
    if (pendingClean) {
        cleanedTexts.push(pendingClean);
    }

    // 4. Focus entities
    const combinedText = cleanedTexts.join(' ');
    const focusEntities = extractEntitiesFromText(combinedText, lexicon, displayMap);

    // 5. queryText_v0: only non-empty sections are emitted.
    const sections = [
        focusEntities.length > 0 ? `[ENTITIES]\n${focusEntities.join('\n')}` : '',
        dialogueLines.length > 0 ? `[DIALOGUE]\n${dialogueLines.join('\n')}` : '',
        pendingClean ? `[PENDING_USER]\n${pendingClean}` : '',
    ];
    const queryText_v0 = sections.filter(Boolean).join('\n\n');

    // 6. rerankQuery is built separately (plain text for the reranker).
    //    When no message survived cleaning, an empty message list is passed.
    const rerankMessages = dialogueLines.length > 0 ? lastMessages : [];
    const rerankQuery = buildRerankQuery(focusEntities, rerankMessages, pendingUserMessage, names);

    // 7. lexicalTerms: entity terms first, then text terms until the cap is reached.
    const termSet = new Set(focusEntities.map((entity) => entity.toLowerCase()));
    for (const term of extractKeyTerms(combinedText)) {
        if (termSet.size >= LEXICAL_TERMS_MAX) break;
        termSet.add(term);
    }

    return {
        focusEntities,
        queryText_v0,
        queryText_v1: null,
        rerankQuery,
        lexicalTerms: [...termSet],
        _lexicon: lexicon,
        _displayMap: displayMap,
    };
}
// ─────────────────────────────────────────────────────────────────────────
// Stage 3: query refinement (enrich with the first-round recall results)
// ─────────────────────────────────────────────────────────────────────────
/**
 * Enrich a QueryBundle in place using the first-round recall results.
 *
 * Effects on `bundle`:
 * - queryText_v1 becomes a `[MEMORY_HINTS]` section followed by queryText_v0
 *   (hints come first), or just queryText_v0 when no hint was collected;
 * - focusEntities may grow with the subject/object of the top anchor hits,
 *   when the normalized value is in the entity lexicon and not already present;
 * - lexicalTerms may grow with key terms extracted from the hints, up to
 *   LEXICAL_TERMS_MAX entries in total;
 * - rerankQuery is left untouched.
 *
 * @param {QueryBundle} bundle - Bundle to refine (mutated).
 * @param {object[]} anchorHits - First-round L0 hits (per the original note, sorted by similarity, descending).
 * @param {object[]} eventHits - First-round L2 hits (per the original note, sorted by similarity, descending).
 * @returns {void}
 */
export function refineQueryBundle(bundle, anchorHits, eventHits) {
    // 1. Memory hints from the top anchor hits (no truncation).
    const topAnchors = (anchorHits || []).slice(0, MEMORY_HINT_ATOMS_MAX);
    const anchorHints = topAnchors
        .map((hit) => hit.atom?.semantic || '')
        .filter(Boolean);

    // 2. Memory hints from the top event hits: "title: summary", or whichever part exists.
    const toEventHint = (hit) => {
        const event = hit.event || {};
        const title = String(event.title || '').trim();
        const summary = cleanSummary(event.summary);
        if (title && summary) {
            return `${title}: ${summary}`;
        }
        return title || summary;
    };
    const eventHints = (eventHits || [])
        .slice(0, MEMORY_HINT_EVENTS_MAX)
        .map(toEventHint)
        .filter(Boolean);

    const hints = [...anchorHints, ...eventHints];
    const hasHints = hints.length > 0;

    // 3. queryText_v1: hints are placed in front of the first-round query.
    bundle.queryText_v1 = hasHints
        ? `[MEMORY_HINTS]\n${hints.join('\n')}\n\n${bundle.queryText_v0}`
        : bundle.queryText_v0;

    // 4. Extend focusEntities from the anchors' subject / object fields.
    const { _lexicon: lexicon, _displayMap: displayMap } = bundle;
    if (lexicon && topAnchors.length > 0) {
        const known = new Set(bundle.focusEntities.map((entity) => entity.toLowerCase()));
        const candidates = topAnchors.flatMap(({ atom }) => (atom ? [atom.subject, atom.object] : []));
        for (const field of candidates) {
            if (!field) continue;
            // Normalize: NFKC, strip zero-width characters, trim, lower-case.
            const norm = String(field).normalize('NFKC').replace(/[\u200B-\u200D\uFEFF]/g, '').trim().toLowerCase();
            if (norm.length < 2 || !lexicon.has(norm) || known.has(norm)) continue;
            known.add(norm);
            // Prefer the mapped display name; otherwise keep the field as given.
            bundle.focusEntities.push(displayMap?.get(norm) || field);
        }
    }

    // 5. rerankQuery intentionally stays as built in stage 1.

    // 6. Extend lexicalTerms with key terms from the hints.
    if (!hasHints) return;
    const hintTerms = extractKeyTerms(hints.join(' '), 5);
    const termSet = new Set(bundle.lexicalTerms);
    for (const term of hintTerms) {
        if (termSet.size >= LEXICAL_TERMS_MAX) break;
        if (termSet.has(term)) continue;
        termSet.add(term);
        bundle.lexicalTerms.push(term);
    }
}