Zero-darkbox query updates and tokenizer improvements

This commit is contained in:
2026-02-09 20:25:26 +08:00
parent 8131d6a15f
commit 0a28539b29
14 changed files with 1771 additions and 175 deletions

View File

@@ -44,6 +44,12 @@ import { runSummaryGeneration } from "./generate/generator.js";
// vector service
import { embed, getEngineFingerprint, testOnlineService } from "./vector/utils/embedder.js";
// tokenizer
import { preload as preloadTokenizer, injectEntities } from "./vector/utils/tokenizer.js";
// entity lexicon
import { buildEntityLexicon, buildDisplayNameMap } from "./vector/retrieval/entity-lexicon.js";
import {
getMeta,
updateMeta,
@@ -82,7 +88,7 @@ import {
// vector io
import { exportVectors, importVectors } from "./vector/storage/vector-io.js";
import { invalidateLexicalIndex } from "./vector/retrieval/lexical-index.js";
import { invalidateLexicalIndex, warmupIndex } from "./vector/retrieval/lexical-index.js";
// ═══════════════════════════════════════════════════════════════════════════
// 常量
@@ -145,6 +151,30 @@ const VECTOR_WARNING_COOLDOWN_MS = 120000; // 2分钟内不重复提醒
const EXT_PROMPT_KEY = "LittleWhiteBox_StorySummary";
const MIN_INJECTION_DEPTH = 2;
// ═══════════════════════════════════════════════════════════════════════════
// 分词器预热
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Guard flag for tokenizer preloading.
 * True while a preload attempt is in flight or after one has succeeded;
 * reset to false when an attempt fails so that a later trigger can retry.
 */
let tokenizerPreloaded = false;

/**
 * Start preloading the tokenizer when the vector feature is enabled.
 *
 * Safe to call repeatedly: calls made while a load is running, or after a
 * successful load, return immediately. A failed load releases the guard, so
 * the next call (for example after the user fixes the path or reloads)
 * tries again.
 */
function maybePreloadTokenizer() {
    if (tokenizerPreloaded) return;

    const vectorCfg = getVectorConfig();
    if (!vectorCfg?.enabled) return;

    // Claim the guard synchronously so overlapping triggers do not start a second load.
    tokenizerPreloaded = true;

    preloadTokenizer()
        .then((ok) => {
            // The preload call reports failure by resolving to false, not by
            // rejecting, so the guard has to be released here as well.
            if (!ok) {
                tokenizerPreloaded = false;
                xbLog.warn(MODULE_ID, "分词器预热失败(将降级运行,可稍后重试)");
            }
        })
        .catch((e) => {
            // Defensive: release the guard on an unexpected rejection too.
            tokenizerPreloaded = false;
            xbLog.warn(MODULE_ID, "分词器预热失败(将降级运行,可稍后重试)", e);
        });
}
// role 映射
const ROLE_MAP = {
system: extension_prompt_roles.SYSTEM,
@@ -499,6 +529,27 @@ async function handleClearVectors() {
xbLog.info(MODULE_ID, "向量数据已清除");
}
// ═══════════════════════════════════════════════════════════════════════════
// 实体词典注入 + 索引预热
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Rebuild the entity lexicon and display-name map from the summary store,
 * hand both to the tokenizer, then invalidate the lexical index and kick off
 * its warm-up. Returns without doing anything when the vector feature is
 * not enabled.
 */
function refreshEntityLexiconAndWarmup() {
    if (!getVectorConfig()?.enabled) {
        return;
    }

    const summaryStore = getSummaryStore();
    const context = getContext();
    const name1 = context.name1;
    const name2 = context.name2;

    const entityLexicon = buildEntityLexicon(summaryStore, { name1, name2 });
    const displayNames = buildDisplayNameMap(summaryStore, { name1, name2 });
    injectEntities(entityLexicon, displayNames);

    // Drop the cached index first, then start the warm-up (the caller is not blocked).
    invalidateLexicalIndex();
    warmupIndex();
}
// ═══════════════════════════════════════════════════════════════════════════
// L2 自动增量向量化(总结完成后调用)
// ═══════════════════════════════════════════════════════════════════════════
@@ -997,6 +1048,9 @@ function handleFrameMessage(event) {
case "VECTOR_GENERATE":
if (data.config) saveVectorConfig(data.config);
// 向量配置变更,可能刚启用,触发预热
maybePreloadTokenizer();
refreshEntityLexiconAndWarmup();
handleGenerateVectors(data.config);
break;
@@ -1085,6 +1139,8 @@ function handleFrameMessage(event) {
case "REQUEST_VECTOR_STATS":
sendVectorStatsToFrame();
// 向量开关可能在 iframe 中被修改,检查是否需要预热
maybePreloadTokenizer();
break;
case "REQUEST_CLEAR": {
@@ -1213,7 +1269,7 @@ async function handleChatChanged() {
const newLength = Array.isArray(chat) ? chat.length : 0;
await rollbackSummaryIfNeeded();
invalidateLexicalIndex();
invalidateLexicalIndex();
initButtonsForAll();
const store = getSummaryStore();
@@ -1230,6 +1286,9 @@ async function handleChatChanged() {
sendVectorStatsToFrame();
}
// 实体词典注入 + 索引预热
refreshEntityLexiconAndWarmup();
setTimeout(() => checkVectorIntegrityAndWarn(), 2000);
}
@@ -1267,6 +1326,9 @@ async function handleMessageReceived() {
applyHideStateDebounced();
setTimeout(() => maybeAutoRunSummary("after_ai"), 1000);
// 新消息后刷新实体词典(可能有新角色)
refreshEntityLexiconAndWarmup();
}
function handleMessageSent() {
@@ -1458,4 +1520,7 @@ jQuery(() => {
if (!getSettings().storySummary?.enabled) return;
registerEvents();
initStateIntegration();
// 条件预热分词器storySummary 已启用,检查 vector 是否也启用)
maybePreloadTokenizer();
});

View File

@@ -78,7 +78,8 @@ export async function rerank(query, documents, options = {}) {
},
body: JSON.stringify({
model: RERANK_MODEL,
query: query.slice(0, 1000), // 限制 query 长度
// Zero-darkbox: do not silently truncate query.
query,
documents: validDocs,
top_n: Math.min(topN, validDocs.length),
return_documents: false,

View File

@@ -4,9 +4,10 @@
// 职责:
// 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
// 2. 提供词法检索接口(专名精确匹配兜底)
// 3. 惰性构建 + 缓存失效机制
// 3. 惰性构建 + 异步预热 + 缓存失效机制
//
// 索引存储:纯内存(不持久化)
// 分词器:统一使用 tokenizer.js结巴 + 实体保护 + 降级)
// 重建时机CHAT_CHANGED / L0提取完成 / L2总结完成
// ═══════════════════════════════════════════════════════════════════════════
@@ -16,6 +17,7 @@ import { getSummaryStore } from '../../data/store.js';
import { getStateAtoms } from '../storage/state-store.js';
import { getAllChunks } from '../storage/chunk-store.js';
import { xbLog } from '../../../../core/debug-core.js';
import { tokenizeForIndex } from '../utils/tokenizer.js';
const MODULE_ID = 'lexical-index';
@@ -23,9 +25,20 @@ const MODULE_ID = 'lexical-index';
// 缓存
// ─────────────────────────────────────────────────────────────────────────
/** @type {MiniSearch|null} */
let cachedIndex = null;
/** @type {string|null} */
let cachedChatId = null;
let cachedFingerprint = null; // atoms.length + chunks.length + events.length 的简单指纹
/** @type {string|null} 数据指纹atoms + chunks + events 数量) */
let cachedFingerprint = null;
/** @type {boolean} 是否正在构建 */
let building = false;
/** @type {Promise<MiniSearch|null>|null} 当前构建 Promise防重入 */
let buildPromise = null;
// ─────────────────────────────────────────────────────────────────────────
// 工具函数
@@ -43,7 +56,7 @@ function cleanSummary(summary) {
}
/**
* 计算缓存指纹(用于判断是否需要重建)
* 计算缓存指纹
* @param {number} atomCount
* @param {number} chunkCount
* @param {number} eventCount
@@ -53,39 +66,27 @@ function computeFingerprint(atomCount, chunkCount, eventCount) {
return `${atomCount}:${chunkCount}:${eventCount}`;
}
/**
 * Give control back to the event loop for one macrotask so that long-running
 * work does not block the UI.
 * @returns {Promise<void>} resolves after a zero-delay timer fires
 */
function yieldToMain() {
    return new Promise((done) => {
        setTimeout(done, 0);
    });
}
// ─────────────────────────────────────────────────────────────────────────
// 索引构建
// 文档收集
// ─────────────────────────────────────────────────────────────────────────
/**
* 构建 MiniSearch 索引
*
* 索引三类文档:
* - L0 atoms: { id: atomId, type: 'atom', floor, text: semantic }
* - L1 chunks: { id: chunkId, type: 'chunk', floor, text: chunk.text }
* - L2 events: { id: eventId, type: 'event', floor: null, text: title + participants + summary }
* 收集所有待索引文档
*
* @param {object[]} atoms - getStateAtoms() 返回值
* @param {object[]} chunks - getAllChunks(chatId) 返回值
* @param {object[]} events - store.json.events
* @returns {MiniSearch}
* @returns {object[]} 文档数组
*/
export function buildLexicalIndex(atoms, chunks, events) {
const T0 = performance.now();
const index = new MiniSearch({
fields: ['text'],
storeFields: ['type', 'floor'],
idField: 'id',
searchOptions: {
boost: { text: 1 },
fuzzy: 0.2,
prefix: true,
},
// 中文友好的 tokenizer按字符 bigram + 空格/标点分词
tokenize: chineseTokenize,
});
function collectDocuments(atoms, chunks, events) {
const docs = [];
// L0 atoms
@@ -129,72 +130,58 @@ export function buildLexicalIndex(atoms, chunks, events) {
});
}
if (docs.length > 0) {
index.addAll(docs);
}
const elapsed = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID, `索引构建完成: ${docs.length} 文档 (atoms=${atoms?.length || 0}, chunks=${chunks?.length || 0}, events=${events?.length || 0}) ${elapsed}ms`);
return index;
return docs;
}
// ─────────────────────────────────────────────────────────────────────────
// 中文 Tokenizer
// 索引构建(分片,不阻塞主线程)
// ─────────────────────────────────────────────────────────────────────────
/** 每批添加的文档数 */
const BUILD_BATCH_SIZE = 500;
/**
* 中文友好的分词器
* 构建 MiniSearch 索引(分片异步)
*
* 策略:
* 1. 连续中文字符 → 滑动 bigram"黄英梅" → "黄英", "英梅"
* 2. 连续非中文字符 → 按空格/标点分割
* 3. 保留完整中文词2-4字作为额外 token
*
* @param {string} text
* @returns {string[]}
* @param {object[]} docs - 文档数组
* @returns {Promise<MiniSearch>}
*/
function chineseTokenize(text) {
if (!text) return [];
async function buildIndexAsync(docs) {
const T0 = performance.now();
const tokens = [];
const s = String(text).toLowerCase();
const index = new MiniSearch({
fields: ['text'],
storeFields: ['type', 'floor'],
idField: 'id',
searchOptions: {
boost: { text: 1 },
fuzzy: 0.2,
prefix: true,
},
tokenize: tokenizeForIndex,
});
// 分离中文段和非中文段
const segments = s.split(/([\u4e00-\u9fff]+)/g);
if (!docs.length) {
return index;
}
for (const seg of segments) {
if (!seg) continue;
// 分片添加,每批 BUILD_BATCH_SIZE 条后让出主线程
for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
index.addAll(batch);
// 中文段bigram + 完整段(如果 2-6 字)
if (/^[\u4e00-\u9fff]+$/.test(seg)) {
// 完整段作为一个 token如果长度合适
if (seg.length >= 2 && seg.length <= 6) {
tokens.push(seg);
}
// bigram
for (let i = 0; i < seg.length - 1; i++) {
tokens.push(seg.slice(i, i + 2));
}
// trigram对 3+ 字的段)
for (let i = 0; i < seg.length - 2; i++) {
tokens.push(seg.slice(i, i + 3));
}
} else {
// 非中文段:按空格/标点分割
const words = seg.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/);
for (const w of words) {
const trimmed = w.trim();
if (trimmed.length >= 2) {
tokens.push(trimmed);
}
}
// 非最后一批时让出主线程
if (i + BUILD_BATCH_SIZE < docs.length) {
await yieldToMain();
}
}
return tokens;
const elapsed = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID,
`索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
);
return index;
}
// ─────────────────────────────────────────────────────────────────────────
@@ -247,6 +234,8 @@ export function searchLexicalIndex(index, terms) {
fuzzy: 0.2,
prefix: true,
combineWith: 'OR',
// 使用与索引相同的分词器
tokenize: tokenizeForIndex,
});
} catch (e) {
xbLog.warn(MODULE_ID, '检索失败', e);
@@ -305,22 +294,17 @@ export function searchLexicalIndex(index, terms) {
}
// ─────────────────────────────────────────────────────────────────────────
// 惰性缓存管理
// 内部构建流程(收集数据 + 构建索引)
// ─────────────────────────────────────────────────────────────────────────
/**
* 获取词法索引(惰性构建 + 缓存)
* 收集数据并构建索引
*
* 如果缓存有效则直接返回;否则自动构建。
* 缓存失效条件chatId 变化 / 数据指纹变化 / 手动 invalidate
*
* @returns {Promise<MiniSearch>}
* @param {string} chatId
* @returns {Promise<{index: MiniSearch, fingerprint: string}>}
*/
export async function getLexicalIndex() {
const { chatId } = getContext();
if (!chatId) return null;
// 收集当前数据
async function collectAndBuild(chatId) {
// 收集数据
const atoms = getStateAtoms() || [];
const store = getSummaryStore();
const events = store?.json?.events || [];
@@ -334,30 +318,118 @@ export async function getLexicalIndex() {
const fp = computeFingerprint(atoms.length, chunks.length, events.length);
// 缓存命中
// 检查是否在收集过程中缓存已被其他调用更新
if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
return { index: cachedIndex, fingerprint: fp };
}
// 收集文档
const docs = collectDocuments(atoms, chunks, events);
// 异步分片构建
const index = await buildIndexAsync(docs);
return { index, fingerprint: fp };
}
// ─────────────────────────────────────────────────────────────────────────
// 公开接口getLexicalIndex惰性获取
// ─────────────────────────────────────────────────────────────────────────
/**
* 获取词法索引(惰性构建 + 缓存)
*
* 如果缓存有效则直接返回;否则自动构建。
* 如果正在构建中,等待构建完成。
*
* @returns {Promise<MiniSearch|null>}
*/
export async function getLexicalIndex() {
const { chatId } = getContext();
if (!chatId) return null;
// 快速路径:如果缓存存在且 chatId 未变,则直接命中
// 指纹校验放到构建流程中完成,避免为指纹而额外读一次 IndexedDB
if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
return cachedIndex;
}
// 重建
xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)}, fp=${fp})`);
// 正在构建中,等待结果
if (building && buildPromise) {
try {
await buildPromise;
if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
return cachedIndex;
}
} catch {
// 构建失败,继续往下重建
}
}
const index = buildLexicalIndex(atoms, chunks, events);
// 需要重建(指纹将在 collectAndBuild 内部计算并写入缓存)
xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)})`);
cachedIndex = index;
cachedChatId = chatId;
cachedFingerprint = fp;
building = true;
buildPromise = collectAndBuild(chatId);
return index;
try {
const { index, fingerprint } = await buildPromise;
// 原子替换缓存
cachedIndex = index;
cachedChatId = chatId;
cachedFingerprint = fingerprint;
return index;
} catch (e) {
xbLog.error(MODULE_ID, '索引构建失败', e);
return null;
} finally {
building = false;
buildPromise = null;
}
}
// ─────────────────────────────────────────────────────────────────────────
// 公开接口warmupIndex异步预建
// ─────────────────────────────────────────────────────────────────────────
/**
* 使缓存失效(下次 getLexicalIndex 时自动重建)
* 异步预建索引
*
* 在 CHAT_CHANGED 时调用,后台构建索引。
* 不阻塞调用方,不返回结果。
* 构建完成后缓存自动更新,后续 getLexicalIndex() 直接命中。
*
* 调用时机:
* - handleChatChanged实体注入后
* - L0 提取完成
* - L2 总结完成
*/
export function warmupIndex() {
    // Nothing to do without an active chat, or while a build is already running.
    const context = getContext();
    const noActiveChat = !context.chatId;
    if (noActiveChat || building) return;

    // Fire-and-forget: the result is not returned to the caller; a rejection
    // is only logged.
    const pendingBuild = getLexicalIndex();
    pendingBuild.catch((err) => {
        xbLog.warn(MODULE_ID, '预热索引失败', err);
    });
}
// ─────────────────────────────────────────────────────────────────────────
// 公开接口invalidateLexicalIndex缓存失效
// ─────────────────────────────────────────────────────────────────────────
/**
* 使缓存失效(下次 getLexicalIndex / warmupIndex 时自动重建)
*
* 调用时机:
* - CHAT_CHANGED
* - L0 提取完成handleAnchorGenerate 完成后)
* - L2 总结完成onComplete 回调中)
* - L0 提取完成
* - L2 总结完成
*/
export function invalidateLexicalIndex() {
if (cachedIndex) {

View File

@@ -16,6 +16,11 @@ export function createMetrics() {
query: {
buildTime: 0,
refineTime: 0,
lengths: {
v0Chars: 0,
v1Chars: null, // null = NA
rerankChars: 0,
},
},
// Anchor (L0 StateAtoms) - 语义锚点
@@ -177,6 +182,13 @@ export function formatMetricsLog(metrics) {
lines.push('════════════════════════════════════════');
lines.push('');
// Query Length
lines.push('[Query Length] 查询长度');
lines.push(`├─ query_v0_chars: ${m.query?.lengths?.v0Chars ?? 0}`);
lines.push(`├─ query_v1_chars: ${m.query?.lengths?.v1Chars == null ? 'NA' : m.query.lengths.v1Chars}`);
lines.push(`└─ rerank_query_chars: ${m.query?.lengths?.rerankChars ?? 0}`);
lines.push('');
// Query Build
lines.push('[Query] 查询构建');
lines.push(`├─ build_time: ${m.query.buildTime}ms`);

View File

@@ -12,36 +12,18 @@ import { getContext } from '../../../../../../../extensions.js';
import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText } from './entity-lexicon.js';
import { getSummaryStore } from '../../data/store.js';
import { filterText } from '../utils/text-filter.js';
import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
// ─────────────────────────────────────────────────────────────────────────
// 常量
// ─────────────────────────────────────────────────────────────────────────
const DIALOGUE_MAX_CHARS = 400;
const PENDING_MAX_CHARS = 400;
const MEMORY_HINT_MAX_CHARS = 100;
// Zero-darkbox policy:
// - No internal truncation. We rely on model-side truncation / provider limits.
// - If provider rejects due to length, we fail loudly and degrade explicitly.
const MEMORY_HINT_ATOMS_MAX = 5;
const MEMORY_HINT_EVENTS_MAX = 3;
const RERANK_QUERY_MAX_CHARS = 500;
const RERANK_SNIPPET_CHARS = 150;
const LEXICAL_TERMS_MAX = 10;
const LEXICAL_TERM_MIN_LEN = 2;
const LEXICAL_TERM_MAX_LEN = 6;
// 中文停用词(高频无意义词)
const STOP_WORDS = new Set([
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
'都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
'你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
'它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
'嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
'把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
'而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
'可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
]);
// ─────────────────────────────────────────────────────────────────────────
// 工具函数
@@ -65,10 +47,7 @@ function cleanMessageText(text) {
* @param {number} maxLen
* @returns {string}
*/
function truncate(text, maxLen) {
if (!text || text.length <= maxLen) return text || '';
return text.slice(0, maxLen) + '…';
}
// truncate removed by design (zero-darkbox)
/**
* 清理事件摘要(移除楼层标记)
@@ -84,8 +63,7 @@ function cleanSummary(summary) {
/**
* 从文本中提取高频实词(用于词法检索)
*
* 策略:按中文字符边界 + 空格/标点分词,取长度 2-6 的片段
* 过滤停用词,按频率排序
* 使用统一分词器(结巴 + 实体保护 + 停用词过滤),按频率排序
*
* @param {string} text - 清洗后的文本
* @param {number} maxTerms - 最大词数
@@ -94,15 +72,15 @@ function cleanSummary(summary) {
function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
if (!text) return [];
// 提取连续中文片段 + 英文单词
const segments = text.match(/[\u4e00-\u9fff]{2,6}|[a-zA-Z]{3,}/g) || [];
// 使用统一分词器(索引用,不去重,保留词频)
const tokens = tokenizerTokenizeForIndex(text);
// 统计词频
const freq = new Map();
for (const seg of segments) {
const s = seg.toLowerCase();
if (s.length < LEXICAL_TERM_MIN_LEN || s.length > LEXICAL_TERM_MAX_LEN) continue;
if (STOP_WORDS.has(s)) continue;
freq.set(s, (freq.get(s) || 0) + 1);
for (const token of tokens) {
const key = String(token || '').toLowerCase();
if (!key) continue;
freq.set(key, (freq.get(key) || 0) + 1);
}
return Array.from(freq.entries())
@@ -160,8 +138,9 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null,
const clean = cleanMessageText(m.mes || '');
if (clean) {
// ★ 修复 A不使用楼层号embedding 模型不需要
dialogueLines.push(`${speaker}: ${truncate(clean, DIALOGUE_MAX_CHARS)}`);
// 不使用楼层号embedding 模型不需要
// 不截断,零暗箱
dialogueLines.push(`${speaker}: ${clean}`);
allCleanText.push(clean);
}
}
@@ -191,30 +170,15 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null,
}
if (pendingClean) {
queryParts.push(`[PENDING_USER]\n${truncate(pendingClean, PENDING_MAX_CHARS)}`);
// 不截断,零暗箱
queryParts.push(`[PENDING_USER]\n${pendingClean}`);
}
const queryText_v0 = queryParts.join('\n\n');
// 6. 构建 rerankQuery(短版
const rerankParts = [];
if (focusEntities.length > 0) {
rerankParts.push(focusEntities.join(' '));
}
for (const m of (lastMessages || [])) {
const clean = cleanMessageText(m.mes || '');
if (clean) {
rerankParts.push(truncate(clean, RERANK_SNIPPET_CHARS));
}
}
if (pendingClean) {
rerankParts.push(truncate(pendingClean, RERANK_SNIPPET_CHARS));
}
const rerankQuery = truncate(rerankParts.join('\n'), RERANK_QUERY_MAX_CHARS);
// 6. rerankQuery 与 embedding query 同源(零暗箱
// 后续 refine 会把它升级为与 queryText_v1 同源。
const rerankQuery = queryText_v0;
// 7. 构建 lexicalTerms
const entityTerms = focusEntities.map(e => e.toLowerCase());
@@ -265,7 +229,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
for (const hit of topAnchors) {
const semantic = hit.atom?.semantic || '';
if (semantic) {
hints.push(truncate(semantic, MEMORY_HINT_MAX_CHARS));
// 不截断,零暗箱
hints.push(semantic);
}
}
@@ -279,13 +244,15 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
? `${title}: ${summary}`
: title || summary;
if (line) {
hints.push(truncate(line, MEMORY_HINT_MAX_CHARS));
// 不截断,零暗箱
hints.push(line);
}
}
// 3. 构建 queryText_v1
// 3. 构建 queryText_v1Hints 前置,最优先)
if (hints.length > 0) {
bundle.queryText_v1 = bundle.queryText_v0 + `\n\n[MEMORY_HINTS]\n${hints.join('\n')}`;
const hintText = `[MEMORY_HINTS]\n${hints.join('\n')}`;
bundle.queryText_v1 = hintText + `\n\n` + bundle.queryText_v0;
} else {
bundle.queryText_v1 = bundle.queryText_v0;
}
@@ -314,17 +281,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
}
}
// 5. 增强 rerankQuery
if (hints.length > 0) {
const hintKeywords = extractKeyTerms(hints.join(' '), 5);
if (hintKeywords.length > 0) {
const addition = hintKeywords.join(' ');
bundle.rerankQuery = truncate(
bundle.rerankQuery + '\n' + addition,
RERANK_QUERY_MAX_CHARS
);
}
}
// 5. rerankQuery 与最终 query 同源(零暗箱)
bundle.rerankQuery = bundle.queryText_v1 || bundle.queryText_v0;
// 6. 增强 lexicalTerms
if (hints.length > 0) {

View File

@@ -782,6 +782,14 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
metrics.query.buildTime = Math.round(performance.now() - T_Build_Start);
metrics.anchor.focusEntities = bundle.focusEntities;
// Query lengths (v0 available here)
if (metrics.query?.lengths) {
metrics.query.lengths.v0Chars = String(bundle.queryText_v0 || '').length;
// v1 not built yet
metrics.query.lengths.v1Chars = null;
metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v0 || '').length;
}
xbLog.info(MODULE_ID,
`Query Build: focus=[${bundle.focusEntities.join(',')}] lexTerms=[${bundle.lexicalTerms.slice(0, 5).join(',')}]`
);
@@ -841,6 +849,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
// 更新 focusEntitiesrefinement 可能扩展了)
metrics.anchor.focusEntities = bundle.focusEntities;
// Query lengths (v1/rerank updated here)
if (metrics.query?.lengths) {
metrics.query.lengths.v1Chars = bundle.queryText_v1 == null ? null : String(bundle.queryText_v1).length;
metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v1 || bundle.queryText_v0 || '').length;
}
xbLog.info(MODULE_ID,
`Refinement: focus=[${bundle.focusEntities.join(',')}] hasV1=${!!bundle.queryText_v1} (${metrics.query.refineTime}ms)`
);

View File

@@ -0,0 +1,650 @@
// ═══════════════════════════════════════════════════════════════════════════
// tokenizer.js - 统一分词器
//
// 职责:
// 1. 管理结巴 WASM 生命周期(预加载 / 就绪检测 / 降级)
// 2. 实体词典注入(分词前最长匹配保护)
// 3. 亚洲文字CJK + 假名)走结巴,拉丁文字走空格分割
// 4. 提供 tokenize(text): string[] 统一接口
//
// 加载时机:
// - 插件初始化时 storySummary.enabled && vectorConfig.enabled → preload()
// - 向量开关从 off→on 时 → preload()
// - CHAT_CHANGED 时 → injectEntities() + warmup 索引(不负责加载 WASM
//
// 降级策略:
// - WASM 未就绪时 → 实体保护 + 标点分割(不用 bigram
// ═══════════════════════════════════════════════════════════════════════════
import { extensionFolderPath } from '../../../../core/constants.js';
import { xbLog } from '../../../../core/debug-core.js';
const MODULE_ID = 'tokenizer';
// ═══════════════════════════════════════════════════════════════════════════
// WASM 状态机
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Load states of the jieba WASM module.
 * @enum {string}
 */
const WasmState = {
IDLE: 'IDLE',
LOADING: 'LOADING',
READY: 'READY',
FAILED: 'FAILED',
};
// Current load state; starts at IDLE and is updated by preload().
let wasmState = WasmState.IDLE;
/** @type {Promise<boolean>|null} Promise of the load currently in flight (re-entrancy guard); null when no load is running */
let loadingPromise = null;
/** @type {typeof import('../../../../libs/jieba-wasm/jieba_rs_wasm.js')|null} Imported jieba module; null until preload() imports it */
let jiebaModule = null;
/** @type {Function|null} Reference to the jieba cut function */
let jiebaCut = null;
/** @type {Function|null} Reference to the jieba add_word function */
let jiebaAddWord = null;
// ═══════════════════════════════════════════════════════════════════════════
// 实体词典
// ═══════════════════════════════════════════════════════════════════════════
/** @type {string[]} Entity list sorted by length, longest first (used for longest-match masking) */
let entityList = [];
/** @type {Set<string>} Entities already passed to jieba add_word (avoids adding the same word twice) */
let injectedEntities = new Set();
// ═══════════════════════════════════════════════════════════════════════════
// 停用词
// ═══════════════════════════════════════════════════════════════════════════
// Stop-word set covering Chinese, Japanese and English function words.
// Entries repeated in the literal (e.g. '自己') are harmless because this is a Set.
const STOP_WORDS = new Set([
// Chinese high-frequency function words
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
'都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
'你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
'它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
'嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
'把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
'而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
'可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
// Japanese particles and common function words
'は', 'が', 'を', 'に', 'で', 'と', 'の', 'も', 'へ', 'や',
'か', 'な', 'よ', 'ね', 'わ', 'だ', 'です', 'ます', 'た', 'て',
'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない',
'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
'これ', 'それ', 'あれ', 'どれ',
// Common English stop words
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
'would', 'could', 'should', 'may', 'might', 'can', 'shall',
'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
'both', 'few', 'more', 'most', 'other', 'some', 'such',
'only', 'own', 'same', 'just', 'very', 'also', 'about',
]);
// ═══════════════════════════════════════════════════════════════════════════
// Unicode 分类
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tell whether a code value falls in one of the Asian script blocks
 * (CJK ideographs and Japanese kana) listed below.
 *
 * @param {number} code - UTF-16 code unit or Unicode code point
 * @returns {boolean} true when the value lies inside any listed block
 */
function isAsian(code) {
    if (code >= 0x4E00 && code <= 0x9FFF) return true; // CJK Unified Ideographs
    if (code >= 0x3400 && code <= 0x4DBF) return true; // CJK Extension A
    if (code >= 0x3040 && code <= 0x309F) return true; // Hiragana
    if (code >= 0x30A0 && code <= 0x30FF) return true; // Katakana
    if (code >= 0x31F0 && code <= 0x31FF) return true; // Katakana Phonetic Extensions
    if (code >= 0xFF65 && code <= 0xFF9F) return true; // Halfwidth Katakana
    if (code >= 0xF900 && code <= 0xFAFF) return true; // CJK Compatibility Ideographs
    // CJK Extension B lies above U+FFFF, so it can only match a full code point.
    return code >= 0x20000 && code <= 0x2A6DF;
}
/**
 * Tell whether a code value is a Latin letter or an ASCII digit.
 *
 * Covers A-Z, a-z, 0-9 and the accented letters in U+00C0..U+024F.
 * The multiplication sign (U+00D7) and division sign (U+00F7) sit inside that
 * range but are operators, not letters, so they are excluded; otherwise text
 * such as "Tom×Jerry" would be kept as one Latin run.
 *
 * @param {number} code - UTF-16 code unit or Unicode code point
 * @returns {boolean} true for Latin letters and ASCII digits
 */
function isLatin(code) {
    if (code >= 0x41 && code <= 0x5A) return true; // A-Z
    if (code >= 0x61 && code <= 0x7A) return true; // a-z
    if (code >= 0x30 && code <= 0x39) return true; // 0-9
    if (code === 0xD7 || code === 0xF7) return false; // × and ÷ are math operators
    return code >= 0xC0 && code <= 0x024F; // Latin-1 letters + Latin Extended-A/B
}
// ═══════════════════════════════════════════════════════════════════════════
// 文本分段(亚洲 vs 拉丁 vs 其他)
// ═══════════════════════════════════════════════════════════════════════════
/**
* @typedef {'asian'|'latin'|'other'} SegmentType
*/
/**
* @typedef {object} TextSegment
* @property {SegmentType} type - 段类型
* @property {string} text - 段文本
*/
/**
 * Split text into runs of the same script class.
 *
 * Each character is classified as 'asian', 'latin' or 'other'; consecutive
 * characters of the same class form one segment. 'other' segments that are
 * whitespace-only are dropped.
 *
 * The text is walked by Unicode code point rather than by UTF-16 code unit,
 * so characters above U+FFFF (stored as surrogate pairs, e.g. CJK Extension B)
 * are classified as a whole instead of as two unclassifiable halves.
 *
 * @param {string} text
 * @returns {TextSegment[]} segments in source order; empty for empty input
 */
function segmentByScript(text) {
    if (!text) return [];

    const segments = [];
    let currentType = null;
    let currentStart = 0;

    // Close the run [currentStart, end) and record it unless it is blank filler.
    const flush = (end) => {
        if (currentType === null || currentStart >= end) return;
        const seg = text.slice(currentStart, end);
        if (currentType !== 'other' || seg.trim()) {
            segments.push({ type: currentType, text: seg });
        }
    };

    let i = 0;
    while (i < text.length) {
        const code = text.codePointAt(i);

        let type;
        if (isAsian(code)) {
            type = 'asian';
        } else if (isLatin(code)) {
            type = 'latin';
        } else {
            type = 'other';
        }

        if (type !== currentType) {
            flush(i);
            currentType = type;
            currentStart = i;
        }

        // Astral code points occupy two UTF-16 code units.
        i += code > 0xFFFF ? 2 : 1;
    }

    // Final run
    flush(text.length);

    return segments;
}
// ═══════════════════════════════════════════════════════════════════════════
// 实体保护(最长匹配占位符替换)
// ═══════════════════════════════════════════════════════════════════════════
// Placeholder delimiters use Unicode Private Use Area (PUA) characters as boundaries,
// to avoid the unpredictable behaviour control characters can cause inside the tokenizer.
const PLACEHOLDER_PREFIX = '\uE000ENT_';
const PLACEHOLDER_SUFFIX = '\uE001';
/**
 * Replace every occurrence of a known entity in `text` with a unique
 * placeholder (longest match first, case-insensitive).
 *
 * All matching is done against the ORIGINAL text and claimed ranges are
 * tracked explicitly; the masked string is assembled once at the end. This
 * keeps match offsets valid after earlier replacements, never matches inside
 * a placeholder, and lets two entities that touch each other both be masked.
 *
 * @param {string} text - original text
 * @returns {{masked: string, entities: Map<string, string>}} masked text plus
 *   a placeholder → original-substring map (original casing preserved)
 */
function maskEntities(text) {
    const entities = new Map();

    if (!entityList.length || !text) {
        return { masked: text, entities };
    }

    // Lower-case one code point at a time and keep any character whose
    // lower-case form has a different length, so that indices in the folded
    // string always line up with indices in `text`.
    const foldCase = (str) =>
        Array.from(str, (ch) => {
            const lower = ch.toLowerCase();
            return lower.length === ch.length ? lower : ch;
        }).join('');

    const haystack = foldCase(text);
    const claimed = new Uint8Array(text.length); // 1 = already covered by a placeholder
    const matches = [];

    // entityList is sorted longest-first, so longer entities claim their span first.
    for (const entity of entityList) {
        const needle = foldCase(String(entity ?? ''));
        if (!needle) continue; // an empty needle would match everywhere

        let searchFrom = 0;
        while (true) {
            const pos = haystack.indexOf(needle, searchFrom);
            if (pos === -1) break;
            const end = pos + needle.length;

            let overlaps = false;
            for (let i = pos; i < end; i++) {
                if (claimed[i]) {
                    overlaps = true;
                    break;
                }
            }
            if (overlaps) {
                searchFrom = pos + 1;
                continue;
            }

            claimed.fill(1, pos, end);
            const placeholder = `${PLACEHOLDER_PREFIX}${matches.length}${PLACEHOLDER_SUFFIX}`;
            entities.set(placeholder, text.slice(pos, end));
            matches.push({ start: pos, end, placeholder });
            searchFrom = end;
        }
    }

    if (!matches.length) {
        return { masked: text, entities };
    }

    // Stitch the masked string together in source order.
    matches.sort((a, b) => a.start - b.start);
    let masked = '';
    let cursor = 0;
    for (const { start, end, placeholder } of matches) {
        masked += text.slice(cursor, start) + placeholder;
        cursor = end;
    }
    masked += text.slice(cursor);

    return { masked, entities };
}
/**
 * Restore placeholders inside a token list to the original entity text.
 *
 * @param {string[]} tokens
 * @param {Map<string, string>} entities - placeholder → original text
 * @returns {string[]} tokens with placeholders restored; the input array
 *   itself is returned when there is nothing to restore
 */
function unmaskTokens(tokens, entities) {
    if (!entities.size) return tokens;

    return tokens.map((token) => {
        // The token is exactly one placeholder.
        if (entities.has(token)) {
            return entities.get(token);
        }

        // The token embeds placeholders (the segmenter may glue a placeholder
        // to neighbouring characters).
        let restored = token;
        for (const [placeholder, original] of entities) {
            if (restored.includes(placeholder)) {
                // Use a replacer function so the entity text is inserted
                // literally; a plain string would have `$&`, `$'`, `$$` etc.
                // interpreted as replacement patterns.
                restored = restored.replace(placeholder, () => original);
            }
        }
        return restored;
    });
}
// ═══════════════════════════════════════════════════════════════════════════
// 分词:亚洲文字(结巴 / 降级)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Segment an Asian-script run with jieba.
 *
 * Returns an empty list when the text is empty or jieba's cut function is
 * not available. Words are trimmed and anything shorter than two characters
 * is discarded. If jieba throws, the fallback tokenizer is used instead.
 *
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeAsianJieba(text) {
    if (!(text && jiebaCut)) {
        return [];
    }

    try {
        const kept = [];
        // Second argument enables jieba's HMM mode.
        for (const raw of Array.from(jiebaCut(text, true))) {
            const word = String(raw || '').trim();
            if (word.length >= 2) {
                kept.push(word);
            }
        }
        return kept;
    } catch (e) {
        xbLog.warn(MODULE_ID, '结巴分词异常,降级处理', e);
        return tokenizeAsianFallback(text);
    }
}
/**
 * Delimiters for the fallback tokenizer: whitespace, ASCII punctuation and
 * the common CJK / full-width punctuation marks (written as escapes so the
 * pattern does not depend on file encoding).
 */
const ASIAN_FALLBACK_DELIMITERS = /[\s\u3001\u3002\u3008-\u3011\u2018\u2019\u201C\u201D\u2026\u2014\uFF01\uFF08\uFF09\uFF0C\uFF0E\uFF1A\uFF1B\uFF1F\-,.!?;:'"()[\]{}<>/\\|@#$%^&*+=~`]+/;

/**
 * Fallback tokenizer used when jieba is unavailable.
 *
 * Splits on punctuation/whitespace, then:
 *  - pieces of 2-6 characters are kept whole;
 *  - longer pieces are cut into 4-character windows advancing by 2 (much
 *    sparser than bigrams), plus one extra window anchored at the end when
 *    the stride would otherwise leave the last character uncovered, plus the
 *    first 6 characters of the piece.
 * Single characters are dropped.
 *
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeAsianFallback(text) {
    if (!text) return [];

    const WINDOW = 4;
    const STEP = 2;
    const tokens = [];

    for (const part of text.split(ASIAN_FALLBACK_DELIMITERS)) {
        const piece = part.trim();
        if (piece.length < 2) continue;

        if (piece.length <= 6) {
            tokens.push(piece);
            continue;
        }

        let lastStart = 0;
        for (let i = 0; i + WINDOW <= piece.length; i += STEP) {
            tokens.push(piece.slice(i, i + WINDOW));
            lastStart = i;
        }
        // Odd-length pieces: the stride stops one character short of the end.
        if (lastStart + WINDOW < piece.length) {
            tokens.push(piece.slice(-WINDOW));
        }
        // Keep the leading 6 characters of the piece as one token.
        tokens.push(piece.slice(0, 6));
    }

    return tokens;
}
// ═══════════════════════════════════════════════════════════════════════════
// 分词:拉丁文字
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tokenize a Latin-script run: split on whitespace and punctuation,
 * lower-case each piece and keep only pieces of three or more characters.
 *
 * @param {string} text
 * @returns {string[]} lower-cased words, in source order
 */
function tokenizeLatin(text) {
    if (!text) return [];

    const words = [];
    for (const piece of text.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/)) {
        const word = piece.trim().toLowerCase();
        if (word.length >= 3) {
            words.push(word);
        }
    }
    return words;
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口preload
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Preload the jieba WASM module.
 *
 * May be called any number of times; concurrent calls share the load that is
 * already in flight. Calling again while in the FAILED state starts a new
 * attempt. Load errors are caught here, so the returned promise resolves to
 * false on failure instead of rejecting.
 *
 * @returns {Promise<boolean>} whether the module is loaded
 */
export async function preload() {
// Already loaded.
if (wasmState === WasmState.READY) return true;
// A load is in flight: wait for it and report its outcome.
if (wasmState === WasmState.LOADING && loadingPromise) {
try {
await loadingPromise;
return wasmState === WasmState.READY;
} catch {
return false;
}
}
// IDLE or FAILED: start a (new) load.
wasmState = WasmState.LOADING;
const T0 = performance.now();
loadingPromise = (async () => {
try {
// Dynamically import the jieba glue module.
// NOTE(review): both paths are built from extensionFolderPath (defined elsewhere);
// confirm it yields a specifier that import() can resolve.
const wasmPath = `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm_bg.wasm`;
// eslint-disable-next-line no-unsanitized/method
jiebaModule = await import(
`${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm.js`
);
// Initialise the WASM binary when the module exposes a default init function.
if (typeof jiebaModule.default === 'function') {
await jiebaModule.default(wasmPath);
}
// Cache function references.
jiebaCut = jiebaModule.cut;
jiebaAddWord = jiebaModule.add_word;
if (typeof jiebaCut !== 'function') {
throw new Error('jieba cut 函数不存在');
}
wasmState = WasmState.READY;
const elapsed = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID, `结巴 WASM 加载完成 (${elapsed}ms)`);
// Entities registered before the module was ready are injected now.
if (entityList.length > 0 && jiebaAddWord) {
reInjectAllEntities();
}
return true;
} catch (e) {
// Record the failure and rethrow so every awaiter of loadingPromise sees it.
wasmState = WasmState.FAILED;
xbLog.error(MODULE_ID, '结巴 WASM 加载失败', e);
throw e;
}
})();
try {
await loadingPromise;
return true;
} catch {
return false;
} finally {
// Clear the in-flight marker so that a later call can start a fresh attempt.
loadingPromise = null;
}
}
// ═══════════════════════════════════════════════════════════════════════════
// Public API: isReady / getState
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Report whether the jieba tokenizer is ready for use.
 *
 * @returns {boolean} true only when the module-level `wasmState` equals `WasmState.READY`
 */
export function isReady() {
return wasmState === WasmState.READY;
}
/**
 * Get the current WASM loading state.
 *
 * @returns {string} the current value of the module-level `wasmState`
 *   (presumably one of the `WasmState` string constants; confirm against the enum definition)
 */
export function getState() {
return wasmState;
}
// ═══════════════════════════════════════════════════════════════════════════
// Public API: injectEntities
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Replace the tokenizer's entity dictionary.
 *
 * Rebuilds the internal entity list (used for longest-match protection) from
 * the display forms of the given entities. If jieba is already loaded, the
 * entities are also registered with it as custom words.
 *
 * An empty or missing lexicon just clears the entity list.
 *
 * @param {Set<string>} lexicon - normalized entity names
 * @param {Map<string, string>} [displayMap] - normalized name → original surface form
 */
export function injectEntities(lexicon, displayMap) {
    if (!lexicon?.size) {
        entityList = [];
        return;
    }

    // Prefer the original surface form, drop forms shorter than two characters,
    // and order longest-first so the longest match wins during masking.
    const surfaceForms = Array.from(lexicon, (key) => displayMap?.get(key) || key)
        .filter((form) => form.length >= 2)
        .sort((left, right) => right.length - left.length);

    entityList = surfaceForms;

    const jiebaUsable = wasmState === WasmState.READY && jiebaAddWord;
    if (jiebaUsable) {
        injectNewEntitiesToJieba(surfaceForms);
    }

    xbLog.info(MODULE_ID, `实体词典更新: ${surfaceForms.length} 个实体`);
}
/**
 * Register entities with jieba incrementally.
 *
 * Entities already recorded in `injectedEntities` are skipped. A failing
 * add_word call is logged as a warning and does not stop the remaining ones.
 *
 * @param {string[]} entities - entity surface forms to register
 */
function injectNewEntitiesToJieba(entities) {
    let added = 0;

    for (const word of entities) {
        if (injectedEntities.has(word)) continue;

        try {
            // A high frequency keeps jieba from splitting the entity apart.
            jiebaAddWord(word, 99999);
        } catch (e) {
            xbLog.warn(MODULE_ID, `add_word 失败: ${word}`, e);
            continue;
        }

        injectedEntities.add(word);
        added += 1;
    }

    if (added > 0) {
        xbLog.info(MODULE_ID, `注入 ${added} 个新实体到结巴`);
    }
}
/**
 * Re-inject every known entity into jieba (preload() calls this right after
 * the WASM finishes loading).
 *
 * The `injectedEntities` bookkeeping is cleared first so that nothing in
 * `entityList` is skipped as "already injected".
 */
function reInjectAllEntities() {
injectedEntities.clear();
injectNewEntitiesToJieba(entityList);
}
// ═══════════════════════════════════════════════════════════════════════════
// Public API: tokenize
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Unified tokenization entry point.
 *
 * Pipeline (steps 1-5 are performed by tokenizeCore):
 *   1. Protect known entities with placeholders (longest match first)
 *   2. Split the text into Asian / Latin segments
 *   3. Asian segments → jieba cut(), or the fallback when jieba is not ready
 *   4. Latin segments → whitespace / punctuation split
 *   5. Restore the placeholders
 *   6. Drop stop words, punctuation-only tokens and case-insensitive duplicates
 *
 * @param {string} text - input text
 * @returns {string[]} unique tokens, trimmed, with their original letter case
 */
export function tokenize(text) {
    // Matches tokens made only of whitespace, control characters, punctuation or symbols.
    const punctuationOnly = /^[\s\x00-\x1F\p{P}\p{S}]+$/u;
    const emitted = new Set();
    const tokens = [];

    for (const raw of tokenizeCore(text)) {
        const surface = raw.trim();
        const key = surface.toLowerCase();

        const rejected =
            key.length < 2 ||
            STOP_WORDS.has(key) ||
            emitted.has(key) ||
            punctuationOnly.test(key);
        if (rejected) continue;

        emitted.add(key);
        tokens.push(surface); // the lower-cased key is used for matching only
    }

    return tokens;
}
/**
 * Core tokenization pipeline: entity protection → script segmentation →
 * per-segment tokenization → placeholder restoration.
 *
 * No de-duplication, lower-casing or stop-word filtering happens here; the
 * public wrappers apply those.
 *
 * @param {string} text - input text (non-string truthy values are coerced with String())
 * @returns {string[]} raw tokens in document order; empty for falsy or blank input
 */
function tokenizeCore(text) {
    if (!text) return [];
    const input = String(text).trim();
    if (!input) return [];

    // 1. Replace known entities with placeholders so they are not split.
    const { masked, entities } = maskEntities(input);

    // 2. Split into script segments.
    const segments = segmentByScript(masked);

    // 3. Tokenize each segment with the matching strategy.
    const rawTokens = [];
    for (const seg of segments) {
        let segTokens;
        if (seg.type === 'asian') {
            // Use jieba when it is loaded, otherwise the fallback tokenizer.
            segTokens = (wasmState === WasmState.READY && jiebaCut)
                ? tokenizeAsianJieba(seg.text)
                : tokenizeAsianFallback(seg.text);
        } else if (seg.type === 'latin') {
            segTokens = tokenizeLatin(seg.text);
        } else {
            // Segments of any other type contribute no tokens.
            continue;
        }

        // Append one by one: `push(...segTokens)` passes every token as a call
        // argument and throws a RangeError once a segment yields more tokens
        // than the engine's argument limit allows.
        for (const tok of segTokens) {
            rawTokens.push(tok);
        }
    }

    // 4. Swap the placeholders back for the original entities.
    return unmaskTokens(rawTokens, entities);
}
// ═══════════════════════════════════════════════════════════════════════════
// Public API: tokenizeForIndex
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tokenizer variant intended for building the MiniSearch index.
 *
 * Differences from tokenize():
 *   - every token is lower-cased, so indexed terms are consistent
 *   - duplicates are kept, leaving term-frequency handling to the index
 *
 * Stop words, tokens shorter than two characters and punctuation-only tokens
 * are still removed.
 *
 * @param {string} text - input text
 * @returns {string[]} lower-cased tokens in document order, duplicates included
 */
export function tokenizeForIndex(text) {
    // Matches tokens made only of whitespace, control characters, punctuation or symbols.
    const punctuationOnly = /^[\s\x00-\x1F\p{P}\p{S}]+$/u;
    const terms = [];

    for (const raw of tokenizeCore(text)) {
        const term = raw.trim().toLowerCase();
        if (term.length < 2) continue;
        if (STOP_WORDS.has(term)) continue;
        if (punctuationOnly.test(term)) continue;
        terms.push(term);
    }

    return terms;
}
// ═══════════════════════════════════════════════════════════════════════════
// Public API: reset
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Reset the tokenizer's entity state.
 *
 * Intended for tests and module teardown. Forgets which entities were injected
 * and empties the entity list. The WASM loading state is left untouched on
 * purpose, so jieba does not have to be loaded again.
 */
export function reset() {
    injectedEntities.clear();
    entityList = [];
}