// ═══════════════════════════════════════════════════════════════════════════
// query-builder.js - deterministic query builder (no LLM)
//
// Responsibilities:
// 1. Build a QueryBundle (weighted vector segments) from the last 3 messages
// 2. Produce a hints segment from round-1 recall results, for R2 enhancement
//
// Weighted-vector design:
// - Each message is embedded independently, yielding its own vector
// - Base weights are assigned by position (focus > near context > far context)
// - Short messages are automatically down-weighted via lengthFactor (floor 35%)
// - recall.js handles embedding + normalization + weighted averaging
//
// Focus selection:
// - pendingUserMessage exists → it is the focus
// - otherwise → the last entry of lastMessages is the focus
//
// Not responsible for embedding, retrieval, or rerank
// ═══════════════════════════════════════════════════════════════════════════
import { getContext } from '../../../../../../../extensions.js';
import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText, buildCharacterPools } from './entity-lexicon.js';
import { getLexicalIdfAccessor } from './lexical-index.js';
import { getSummaryStore } from '../../data/store.js';
import { filterText } from '../utils/text-filter.js';
import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
// ─────────────────────────────────────────────────────────────────────────
// Weight constants
// ─────────────────────────────────────────────────────────────────────────
// R1 base weights: [...context (oldest → newest), focus]
// Focus message gets 55%, nearest context 30%, earlier context 15%
export const FOCUS_BASE_WEIGHT = 0.55;
export const CONTEXT_BASE_WEIGHTS = [0.15, 0.30];
// R2 base weights: the focus cedes weight to the hints segment
export const FOCUS_BASE_WEIGHT_R2 = 0.45;
export const CONTEXT_BASE_WEIGHTS_R2 = [0.10, 0.20];
export const HINTS_BASE_WEIGHT = 0.25;
// Length penalty: linear decay below 50 chars, floor 35%
export const LENGTH_FULL_THRESHOLD = 50;
export const LENGTH_MIN_FACTOR = 0.35;
// Minimum normalized focus share (hard floor enforced by recall.js after normalization)
// Semantics: even a very short focus text must not be diluted below this weight
export const FOCUS_MIN_NORMALIZED_WEIGHT = 0.35;
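//
// Illustrative sketch (not this module's code) of how recall.js is expected
// to combine these constants; `combineWeights` is a hypothetical name, and
// the authoritative implementation lives in recall.js:
//
//   function combineWeights(segments /* QuerySegment[], focus last */) {
//     const eff = segments.map(s => s.baseWeight * computeLengthFactor(s.charCount));
//     const sum = eff.reduce((a, b) => a + b, 0);
//     const norm = eff.map(w => w / sum);            // normalize to sum to 1
//     const fi = norm.length - 1;                    // focus segment index
//     if (norm[fi] < FOCUS_MIN_NORMALIZED_WEIGHT) {  // hard floor on the focus
//       const deficit = FOCUS_MIN_NORMALIZED_WEIGHT - norm[fi];
//       const rest = 1 - norm[fi];
//       for (let i = 0; i < fi; i++) norm[i] -= deficit * (norm[i] / rest);
//       norm[fi] = FOCUS_MIN_NORMALIZED_WEIGHT;
//     }
//     return norm; // recall.js averages the unit vectors with these weights
//   }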
// ─────────────────────────────────────────────────────────────────────────
// Other constants
// ─────────────────────────────────────────────────────────────────────────
const MEMORY_HINT_ATOMS_MAX = 5;
const MEMORY_HINT_EVENTS_MAX = 3;
const LEXICAL_TERMS_MAX = 10;
// ─────────────────────────────────────────────────────────────────────────
// Utility functions
// ─────────────────────────────────────────────────────────────────────────
/**
 * Clean message text, consistent with chunk-builder / recall
 * @param {string} text
 * @returns {string}
 */
function cleanMessageText(text) {
return filterText(text)
.replace(/\[tts:[^\]]*\]/gi, '')
.replace(/<state>[\s\S]*?<\/state>/gi, '')
.trim();
}
/**
 * Clean an event summary: strip the trailing floor marker, e.g. "(#12)" or "(#12-15)"
 * @param {string} summary
 * @returns {string}
 */
function cleanSummary(summary) {
return String(summary || '')
.replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '')
.trim();
}
/**
 * Compute the length factor.
 *
 * charCount >= 50 → 1.0
 * charCount = 0   → 0.35
 * Linear interpolation in between.
 *
 * @param {number} charCount - character count of the cleaned content (excluding speaker prefix)
 * @returns {number} 0.35 ~ 1.0
 */
export function computeLengthFactor(charCount) {
if (charCount >= LENGTH_FULL_THRESHOLD) return 1.0;
if (charCount <= 0) return LENGTH_MIN_FACTOR;
return LENGTH_MIN_FACTOR + (1.0 - LENGTH_MIN_FACTOR) * (charCount / LENGTH_FULL_THRESHOLD);
}
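// Worked example: computeLengthFactor(25) = 0.35 + 0.65 * (25 / 50) = 0.675,
// so a 25-char message contributes at roughly two-thirds of its base weight.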
/**
 * Extract high-frequency content words from text, for lexical retrieval.
 *
 * @param {string} text - cleaned text
 * @param {number} maxTerms - maximum number of terms
 * @returns {string[]}
 */
function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
if (!text) return [];
const idfAccessor = getLexicalIdfAccessor();
const tokens = tokenizerTokenizeForIndex(text);
const freq = new Map();
for (const token of tokens) {
const key = String(token || '').toLowerCase();
if (!key) continue;
freq.set(key, (freq.get(key) || 0) + 1);
}
return Array.from(freq.entries())
.map(([term, tf]) => {
const idf = idfAccessor.enabled ? idfAccessor.getIdf(term) : 1;
return { term, tf, score: tf * idf };
})
.sort((a, b) => (b.score - a.score) || (b.tf - a.tf))
.slice(0, maxTerms)
.map(x => x.term);
}
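// Worked example of the tf·idf ranking above (idf values hypothetical):
// a term seen 3× with idf 2.0 scores 6.0 and outranks one seen 5× with
// idf 1.0 (score 5.0); equal scores fall back to comparing raw tf.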
// ─────────────────────────────────────────────────────────────────────────
// Type definitions
// ─────────────────────────────────────────────────────────────────────────
/**
 * @typedef {object} QuerySegment
 * @property {string} text - text to embed (with speaker prefix, plain natural language)
 * @property {number} baseWeight - R1 base weight
 * @property {number} charCount - content character count (excluding speaker prefix), used for lengthFactor
 */
/**
 * @typedef {object} QueryBundle
 * @property {QuerySegment[]} querySegments - R1 vector segments (context oldest → newest, focus last)
 * @property {QuerySegment|null} hintsSegment - R2 hints segment, filled after refinement
 * @property {string} rerankQuery - plain natural-language query for rerank, focus first
 * @property {string[]} lexicalTerms - MiniSearch query terms
 * @property {string[]} focusTerms - focus terms (replaces focusEntities)
 * @property {string[]} focusCharacters - focus characters (focusTerms ∩ trustedCharacters)
 * @property {string[]} focusEntities - Deprecated alias of focusTerms
 * @property {Set<string>} allEntities - Full entity lexicon (includes non-character entities)
 * @property {Set<string>} allCharacters - Union of trusted and candidate character pools
 * @property {Set<string>} trustedCharacters - Clean character pool (main/arcs/name2/L2 participants)
 * @property {Set<string>} candidateCharacters - Extended character pool from L0 edges.s/t after cleanup
 * @property {Set<string>} _lexicon - entity lexicon (internal use)
 * @property {Map<string, string>} _displayMap - normalized → original surface-form map (internal use)
 */
// ─────────────────────────────────────────────────────────────────────────
// Internal: message entry construction
// ─────────────────────────────────────────────────────────────────────────
/**
 * @typedef {object} MessageEntry
 * @property {string} text - full "speaker: content" text
 * @property {number} charCount - content character count (excluding speaker prefix)
 */
/**
 * Clean a message and build an entry
 * @param {object} message - chat message object
 * @param {object} context - { name1, name2 }
 * @returns {MessageEntry|null}
 */
function buildMessageEntry(message, context) {
if (!message?.mes) return null;
const speaker = message.is_user
? (context.name1 || '用户')
: (message.name || context.name2 || '角色');
const clean = cleanMessageText(message.mes);
if (!clean) return null;
return {
text: `${speaker}:${clean}`,
charCount: clean.length,
};
}
// ─────────────────────────────────────────────────────────────────────────
// Phase 1: build the QueryBundle
// ─────────────────────────────────────────────────────────────────────────
/**
 * Build the initial query bundle.
 *
 * Message layout (K=3):
 *   msg[0] = USER(#N-2)  context  baseWeight = 0.15
 *   msg[1] = AI(#N-1)    context  baseWeight = 0.30
 *   msg[2] = USER(#N)    focus    baseWeight = 0.55
 *
 * Focus selection:
 *   pendingUserMessage exists → it is the focus, all lastMessages are context
 *   pendingUserMessage absent → lastMessages[-1] is the focus, the rest are context
 *
 * @param {object[]} lastMessages - last K messages (passed in by recall.js)
 * @param {string|null} pendingUserMessage - user input not yet appended to the chat
 * @param {object|null} store
 * @param {object|null} context - { name1, name2 }
 * @returns {QueryBundle}
 */
export function buildQueryBundle(lastMessages, pendingUserMessage, store = null, context = null) {
if (!store) store = getSummaryStore();
if (!context) {
const ctx = getContext();
context = { name1: ctx.name1, name2: ctx.name2 };
}
// 1. Entity / character lexicons
const lexicon = buildEntityLexicon(store, context);
const displayMap = buildDisplayNameMap(store, context);
const { trustedCharacters, candidateCharacters, allCharacters } = buildCharacterPools(store, context);
// 2. Separate focus from context
const contextEntries = [];
let focusEntry = null;
const allCleanTexts = [];
if (pendingUserMessage) {
// pending is the focus; all lastMessages are context
const pendingClean = cleanMessageText(pendingUserMessage);
if (pendingClean) {
const speaker = context.name1 || '用户';
focusEntry = {
text: `${speaker}:${pendingClean}`,
charCount: pendingClean.length,
};
allCleanTexts.push(pendingClean);
}
for (const m of (lastMessages || [])) {
const entry = buildMessageEntry(m, context);
if (entry) {
contextEntries.push(entry);
allCleanTexts.push(cleanMessageText(m.mes));
}
}
} else {
// no pending → lastMessages[-1] is the focus
const msgs = lastMessages || [];
if (msgs.length > 0) {
const lastMsg = msgs[msgs.length - 1];
const entry = buildMessageEntry(lastMsg, context);
if (entry) {
focusEntry = entry;
allCleanTexts.push(cleanMessageText(lastMsg.mes));
}
}
for (let i = 0; i < msgs.length - 1; i++) {
const entry = buildMessageEntry(msgs[i], context);
if (entry) {
contextEntries.push(entry);
allCleanTexts.push(cleanMessageText(msgs[i].mes));
}
}
}
// 3. Extract focus terms and focus characters
const combinedText = allCleanTexts.join(' ');
const focusTerms = extractEntitiesFromText(combinedText, lexicon, displayMap);
const focusCharacters = focusTerms.filter(term => trustedCharacters.has(term.toLowerCase()));
// 4. Build querySegments
// Context first (oldest → newest), focus last
// Context weights are assigned tail-aligned from CONTEXT_BASE_WEIGHTS
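// Worked example (2 base weights, 1 context entry): weightIdx = max(0, 2 - 1 + 0) = 1,
// so a lone context message takes the "nearest" slot weight 0.30, not 0.15.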
const querySegments = [];
for (let i = 0; i < contextEntries.length; i++) {
const weightIdx = Math.max(0, CONTEXT_BASE_WEIGHTS.length - contextEntries.length + i);
querySegments.push({
text: contextEntries[i].text,
baseWeight: CONTEXT_BASE_WEIGHTS[weightIdx] || CONTEXT_BASE_WEIGHTS[0],
charCount: contextEntries[i].charCount,
});
}
if (focusEntry) {
querySegments.push({
text: focusEntry.text,
baseWeight: FOCUS_BASE_WEIGHT,
charCount: focusEntry.charCount,
});
}
// 5. rerankQuery: focus first, plain natural language
const contextLines = contextEntries.map(e => e.text);
const rerankQuery = focusEntry
? [focusEntry.text, ...contextLines].join('\n')
: contextLines.join('\n');
// 6. lexicalTerms (entities first, topped up with high-frequency content words)
const entityTerms = focusTerms.map(e => e.toLowerCase());
const textTerms = extractKeyTerms(combinedText);
const termSet = new Set(entityTerms);
for (const t of textTerms) {
if (termSet.size >= LEXICAL_TERMS_MAX) break;
termSet.add(t);
}
return {
querySegments,
hintsSegment: null,
rerankQuery,
lexicalTerms: Array.from(termSet),
focusTerms,
focusCharacters,
focusEntities: focusTerms, // deprecated alias (compat)
allEntities: lexicon,
allCharacters,
trustedCharacters,
candidateCharacters,
_lexicon: lexicon,
_displayMap: displayMap,
};
}
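//
// Usage sketch (illustrative; assumes SillyTavern-style chat messages with
// `mes` / `is_user` / `name` fields, as read by buildMessageEntry above):
//
//   const bundle = buildQueryBundle(chat.slice(-3), null);
//   // bundle.querySegments → [context 0.15, context 0.30, focus 0.55]
//   // bundle.rerankQuery   → focus line first, then context lines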
// ─────────────────────────────────────────────────────────────────────────
// Phase 3: query refinement (produce the hints segment from round-1 recall results)
// ─────────────────────────────────────────────────────────────────────────
/**
 * Enhance the QueryBundle with round-1 recall results.
 *
 * Mutates the bundle's query/rerank auxiliaries in place:
 * - hintsSegment: filled with hints, weighted in R2
 * - lexicalTerms: may be extended with keywords from the hints
 * - rerankQuery: unchanged (stays a focus-first plain natural-language query)
 *
 * @param {QueryBundle} bundle - original query bundle
 * @param {object[]} anchorHits - round-1 L0 hits, sorted by similarity descending
 * @param {object[]} eventHits - round-1 L2 hits, sorted by similarity descending
 */
export function refineQueryBundle(bundle, anchorHits, eventHits) {
const hints = [];
// 1. Extract memory hints from top anchorHits
const topAnchors = (anchorHits || []).slice(0, MEMORY_HINT_ATOMS_MAX);
for (const hit of topAnchors) {
const semantic = hit.atom?.semantic || '';
if (semantic) hints.push(semantic);
}
// 2. Extract memory hints from top eventHits
const topEvents = (eventHits || []).slice(0, MEMORY_HINT_EVENTS_MAX);
for (const hit of topEvents) {
const ev = hit.event || {};
const title = String(ev.title || '').trim();
const summary = cleanSummary(ev.summary);
const line = title && summary
? `${title}: ${summary}`
: title || summary;
if (line) hints.push(line);
}
// 3. Build the hintsSegment
if (hints.length > 0) {
const hintsText = hints.join('\n');
bundle.hintsSegment = {
text: hintsText,
baseWeight: HINTS_BASE_WEIGHT,
charCount: hintsText.length,
};
} else {
bundle.hintsSegment = null;
}
// 4. rerankQuery stays unchanged:
// the cross-encoder receives a plain natural-language query, undisturbed by hints
// 5. Extend lexicalTerms
if (hints.length > 0) {
const hintTerms = extractKeyTerms(hints.join(' '), 5);
const termSet = new Set(bundle.lexicalTerms);
for (const t of hintTerms) {
if (termSet.size >= LEXICAL_TERMS_MAX) break;
if (!termSet.has(t)) {
termSet.add(t);
bundle.lexicalTerms.push(t);
}
}
}
}
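//
// End-to-end flow sketch (illustrative; `recallR1` / `recallR2` are hypothetical
// stand-ins for the recall.js entry points this module serves):
//
//   const bundle = buildQueryBundle(lastMessages, pendingUserMessage);
//   const { anchorHits, eventHits } = await recallR1(bundle); // R1: base weights
//   refineQueryBundle(bundle, anchorHits, eventHits);         // fills hintsSegment
//   const hits = await recallR2(bundle);                      // R2: *_R2 weights + hints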