Add files via upload

This commit is contained in:
RT15548
2026-02-17 22:45:01 +08:00
committed by GitHub
parent 59e7301bf8
commit 4bbc2f9fd5
18 changed files with 5167 additions and 442 deletions

View File

@@ -1,56 +1,62 @@
// ═══════════════════════════════════════════════════════════════════════════
// Story Summary - Config (v2 简化版)
// ═══════════════════════════════════════════════════════════════════════════
import { extension_settings } from "../../../../../../extensions.js"; import { extension_settings } from "../../../../../../extensions.js";
import { EXT_ID } from "../../../core/constants.js"; import { EXT_ID } from "../../../core/constants.js";
import { xbLog } from "../../../core/debug-core.js"; import { xbLog } from "../../../core/debug-core.js";
import { CommonSettingStorage } from "../../../core/server-storage.js"; import { CommonSettingStorage } from "../../../core/server-storage.js";
const MODULE_ID = 'summaryConfig'; const MODULE_ID = "summaryConfig";
const SUMMARY_CONFIG_KEY = 'storySummaryPanelConfig'; const SUMMARY_CONFIG_KEY = "storySummaryPanelConfig";
const DEFAULT_FILTER_RULES = [
{ start: "<think>", end: "</think>" },
{ start: "<thinking>", end: "</thinking>" },
{ start: "```", end: "```" },
];
export function getSettings() { export function getSettings() {
const ext = extension_settings[EXT_ID] ||= {}; const ext = (extension_settings[EXT_ID] ||= {});
ext.storySummary ||= { enabled: true }; ext.storySummary ||= { enabled: true };
return ext; return ext;
} }
const DEFAULT_FILTER_RULES = [
{ start: '<think>', end: '</think>' },
{ start: '<thinking>', end: '</thinking>' },
];
export function getSummaryPanelConfig() { export function getSummaryPanelConfig() {
const defaults = { const defaults = {
api: { provider: 'st', url: '', key: '', model: '', modelCache: [] }, api: { provider: "st", url: "", key: "", model: "", modelCache: [] },
gen: { temperature: null, top_p: null, top_k: null, presence_penalty: null, frequency_penalty: null }, gen: { temperature: null, top_p: null, top_k: null, presence_penalty: null, frequency_penalty: null },
trigger: { trigger: {
enabled: false, enabled: false,
interval: 20, interval: 20,
timing: 'before_user', timing: "before_user",
role: 'system', role: "system",
useStream: true, useStream: true,
maxPerRun: 100, maxPerRun: 100,
wrapperHead: '', wrapperHead: "",
wrapperTail: '', wrapperTail: "",
forceInsertAtEnd: false, forceInsertAtEnd: false,
}, },
textFilterRules: [...DEFAULT_FILTER_RULES],
vector: null, vector: null,
}; };
try { try {
const raw = localStorage.getItem('summary_panel_config'); const raw = localStorage.getItem("summary_panel_config");
if (!raw) return defaults; if (!raw) return defaults;
const parsed = JSON.parse(raw); const parsed = JSON.parse(raw);
const textFilterRules = Array.isArray(parsed.textFilterRules)
? parsed.textFilterRules
: (Array.isArray(parsed.vector?.textFilterRules)
? parsed.vector.textFilterRules
: defaults.textFilterRules);
const result = { const result = {
api: { ...defaults.api, ...(parsed.api || {}) }, api: { ...defaults.api, ...(parsed.api || {}) },
gen: { ...defaults.gen, ...(parsed.gen || {}) }, gen: { ...defaults.gen, ...(parsed.gen || {}) },
trigger: { ...defaults.trigger, ...(parsed.trigger || {}) }, trigger: { ...defaults.trigger, ...(parsed.trigger || {}) },
textFilterRules,
vector: parsed.vector || null,
}; };
if (result.trigger.timing === 'manual') result.trigger.enabled = false; if (result.trigger.timing === "manual") result.trigger.enabled = false;
if (result.trigger.useStream === undefined) result.trigger.useStream = true; if (result.trigger.useStream === undefined) result.trigger.useStream = true;
return result; return result;
@@ -61,35 +67,27 @@ export function getSummaryPanelConfig() {
export function saveSummaryPanelConfig(config) { export function saveSummaryPanelConfig(config) {
try { try {
localStorage.setItem('summary_panel_config', JSON.stringify(config)); localStorage.setItem("summary_panel_config", JSON.stringify(config));
CommonSettingStorage.set(SUMMARY_CONFIG_KEY, config); CommonSettingStorage.set(SUMMARY_CONFIG_KEY, config);
} catch (e) { } catch (e) {
xbLog.error(MODULE_ID, '保存面板配置失败', e); xbLog.error(MODULE_ID, "保存面板配置失败", e);
} }
} }
// ═══════════════════════════════════════════════════════════════════════════
// 向量配置(简化版 - 只需要 key
// ═══════════════════════════════════════════════════════════════════════════
export function getVectorConfig() { export function getVectorConfig() {
try { try {
const raw = localStorage.getItem('summary_panel_config'); const raw = localStorage.getItem("summary_panel_config");
if (!raw) return null; if (!raw) return null;
const parsed = JSON.parse(raw); const parsed = JSON.parse(raw);
const cfg = parsed.vector || null; const cfg = parsed.vector || null;
if (!cfg) return null;
if (cfg && !cfg.textFilterRules) { // Keep vector side normalized to online + siliconflow.
cfg.textFilterRules = [...DEFAULT_FILTER_RULES]; cfg.engine = "online";
}
// 简化:统一使用硅基
if (cfg) {
cfg.engine = 'online';
cfg.online = cfg.online || {}; cfg.online = cfg.online || {};
cfg.online.provider = 'siliconflow'; cfg.online.provider = "siliconflow";
cfg.online.model = 'BAAI/bge-m3'; cfg.online.model = "BAAI/bge-m3";
}
return cfg; return cfg;
} catch { } catch {
@@ -98,31 +96,31 @@ export function getVectorConfig() {
} }
export function getTextFilterRules() { export function getTextFilterRules() {
const cfg = getVectorConfig(); const cfg = getSummaryPanelConfig();
return cfg?.textFilterRules || DEFAULT_FILTER_RULES; return Array.isArray(cfg?.textFilterRules)
? cfg.textFilterRules
: DEFAULT_FILTER_RULES;
} }
export function saveVectorConfig(vectorCfg) { export function saveVectorConfig(vectorCfg) {
try { try {
const raw = localStorage.getItem('summary_panel_config') || '{}'; const raw = localStorage.getItem("summary_panel_config") || "{}";
const parsed = JSON.parse(raw); const parsed = JSON.parse(raw);
// 简化配置
parsed.vector = { parsed.vector = {
enabled: vectorCfg?.enabled || false, enabled: !!vectorCfg?.enabled,
engine: 'online', engine: "online",
online: { online: {
provider: 'siliconflow', provider: "siliconflow",
key: vectorCfg?.online?.key || '', key: vectorCfg?.online?.key || "",
model: 'BAAI/bge-m3', model: "BAAI/bge-m3",
}, },
textFilterRules: vectorCfg?.textFilterRules || DEFAULT_FILTER_RULES,
}; };
localStorage.setItem('summary_panel_config', JSON.stringify(parsed)); localStorage.setItem("summary_panel_config", JSON.stringify(parsed));
CommonSettingStorage.set(SUMMARY_CONFIG_KEY, parsed); CommonSettingStorage.set(SUMMARY_CONFIG_KEY, parsed);
} catch (e) { } catch (e) {
xbLog.error(MODULE_ID, '保存向量配置失败', e); xbLog.error(MODULE_ID, "保存向量配置失败", e);
} }
} }
@@ -130,12 +128,12 @@ export async function loadConfigFromServer() {
try { try {
const savedConfig = await CommonSettingStorage.get(SUMMARY_CONFIG_KEY, null); const savedConfig = await CommonSettingStorage.get(SUMMARY_CONFIG_KEY, null);
if (savedConfig) { if (savedConfig) {
localStorage.setItem('summary_panel_config', JSON.stringify(savedConfig)); localStorage.setItem("summary_panel_config", JSON.stringify(savedConfig));
xbLog.info(MODULE_ID, '已从服务加载面板配置'); xbLog.info(MODULE_ID, "已从服务加载面板配置");
return savedConfig; return savedConfig;
} }
} catch (e) { } catch (e) {
xbLog.warn(MODULE_ID, '加载面板配置失败', e); xbLog.warn(MODULE_ID, "加载面板配置失败", e);
} }
return null; return null;
} }

View File

@@ -5,6 +5,7 @@ import { getContext } from "../../../../../../extensions.js";
import { xbLog } from "../../../core/debug-core.js"; import { xbLog } from "../../../core/debug-core.js";
import { getSummaryStore, saveSummaryStore, addSummarySnapshot, mergeNewData, getFacts } from "../data/store.js"; import { getSummaryStore, saveSummaryStore, addSummarySnapshot, mergeNewData, getFacts } from "../data/store.js";
import { generateSummary, parseSummaryJson } from "./llm.js"; import { generateSummary, parseSummaryJson } from "./llm.js";
import { filterText } from "../vector/utils/text-filter.js";
const MODULE_ID = 'summaryGenerator'; const MODULE_ID = 'summaryGenerator';
const SUMMARY_SESSION_ID = 'xb9'; const SUMMARY_SESSION_ID = 'xb9';
@@ -168,7 +169,8 @@ export function buildIncrementalSlice(targetMesId, lastSummarizedMesId, maxPerRu
const text = slice.map((m, i) => { const text = slice.map((m, i) => {
const speaker = m.name || (m.is_user ? userLabel : charLabel); const speaker = m.name || (m.is_user ? userLabel : charLabel);
return `#${start + i + 1}${speaker}\n${m.mes}`; const filteredMessage = filterText(m.mes || "");
return `#${start + i + 1}${speaker}\n${filteredMessage}`;
}).join('\n\n'); }).join('\n\n');
return { text, count: slice.length, range: `${start + 1}-${end + 1}`, endMesId: end }; return { text, count: slice.length, range: `${start + 1}-${end + 1}`, endMesId: end };

View File

@@ -87,6 +87,7 @@
api: { provider: 'st', url: '', key: '', model: '', modelCache: [] }, api: { provider: 'st', url: '', key: '', model: '', modelCache: [] },
gen: { temperature: null, top_p: null, top_k: null, presence_penalty: null, frequency_penalty: null }, gen: { temperature: null, top_p: null, top_k: null, presence_penalty: null, frequency_penalty: null },
trigger: { enabled: false, interval: 20, timing: 'before_user', role: 'system', useStream: true, maxPerRun: 100, wrapperHead: '', wrapperTail: '', forceInsertAtEnd: false }, trigger: { enabled: false, interval: 20, timing: 'before_user', role: 'system', useStream: true, maxPerRun: 100, wrapperHead: '', wrapperTail: '', forceInsertAtEnd: false },
textFilterRules: [...DEFAULT_FILTER_RULES],
vector: { enabled: false, engine: 'online', local: { modelId: 'bge-small-zh' }, online: { provider: 'siliconflow', url: '', key: '', model: '' } } vector: { enabled: false, engine: 'online', local: { modelId: 'bge-small-zh' }, online: { provider: 'siliconflow', url: '', key: '', model: '' } }
}; };
@@ -123,6 +124,9 @@
Object.assign(config.api, p.api || {}); Object.assign(config.api, p.api || {});
Object.assign(config.gen, p.gen || {}); Object.assign(config.gen, p.gen || {});
Object.assign(config.trigger, p.trigger || {}); Object.assign(config.trigger, p.trigger || {});
config.textFilterRules = Array.isArray(p.textFilterRules)
? p.textFilterRules
: (Array.isArray(p.vector?.textFilterRules) ? p.vector.textFilterRules : [...DEFAULT_FILTER_RULES]);
if (p.vector) config.vector = p.vector; if (p.vector) config.vector = p.vector;
if (config.trigger.timing === 'manual' && config.trigger.enabled) { if (config.trigger.timing === 'manual' && config.trigger.enabled) {
config.trigger.enabled = false; config.trigger.enabled = false;
@@ -137,6 +141,11 @@
Object.assign(config.api, cfg.api || {}); Object.assign(config.api, cfg.api || {});
Object.assign(config.gen, cfg.gen || {}); Object.assign(config.gen, cfg.gen || {});
Object.assign(config.trigger, cfg.trigger || {}); Object.assign(config.trigger, cfg.trigger || {});
config.textFilterRules = Array.isArray(cfg.textFilterRules)
? cfg.textFilterRules
: (Array.isArray(cfg.vector?.textFilterRules)
? cfg.vector.textFilterRules
: (Array.isArray(config.textFilterRules) ? config.textFilterRules : [...DEFAULT_FILTER_RULES]));
if (cfg.vector) config.vector = cfg.vector; if (cfg.vector) config.vector = cfg.vector;
if (config.trigger.timing === 'manual') config.trigger.enabled = false; if (config.trigger.timing === 'manual') config.trigger.enabled = false;
localStorage.setItem('summary_panel_config', JSON.stringify(config)); localStorage.setItem('summary_panel_config', JSON.stringify(config));
@@ -145,7 +154,10 @@
function saveConfig() { function saveConfig() {
try { try {
const settingsOpen = $('settings-modal')?.classList.contains('active'); const settingsOpen = $('settings-modal')?.classList.contains('active');
if (settingsOpen) config.vector = getVectorConfig(); if (settingsOpen) {
config.vector = getVectorConfig();
config.textFilterRules = collectFilterRules();
}
if (!config.vector) { if (!config.vector) {
config.vector = { enabled: false, engine: 'online', online: { provider: 'siliconflow', key: '', model: 'BAAI/bge-m3' } }; config.vector = { enabled: false, engine: 'online', online: { provider: 'siliconflow', key: '', model: 'BAAI/bge-m3' } };
} }
@@ -169,7 +181,6 @@
key: $('vector-api-key')?.value?.trim() || '', key: $('vector-api-key')?.value?.trim() || '',
model: 'BAAI/bge-m3', model: 'BAAI/bge-m3',
}, },
textFilterRules: collectFilterRules(),
}; };
} }
@@ -182,7 +193,6 @@
$('vector-api-key').value = cfg.online.key; $('vector-api-key').value = cfg.online.key;
} }
renderFilterRules(cfg?.textFilterRules || DEFAULT_FILTER_RULES);
} }
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -471,6 +481,7 @@
updateProviderUI(config.api.provider); updateProviderUI(config.api.provider);
if (config.vector) loadVectorConfig(config.vector); if (config.vector) loadVectorConfig(config.vector);
renderFilterRules(Array.isArray(config.textFilterRules) ? config.textFilterRules : DEFAULT_FILTER_RULES);
// Initialize sub-options visibility // Initialize sub-options visibility
const autoSummaryOptions = $('auto-summary-options'); const autoSummaryOptions = $('auto-summary-options');
@@ -520,6 +531,7 @@
config.trigger.wrapperHead = $('trigger-wrapper-head').value; config.trigger.wrapperHead = $('trigger-wrapper-head').value;
config.trigger.wrapperTail = $('trigger-wrapper-tail').value; config.trigger.wrapperTail = $('trigger-wrapper-tail').value;
config.trigger.forceInsertAtEnd = $('trigger-insert-at-end').checked; config.trigger.forceInsertAtEnd = $('trigger-insert-at-end').checked;
config.textFilterRules = collectFilterRules();
config.vector = getVectorConfig(); config.vector = getVectorConfig();
saveConfig(); saveConfig();

View File

@@ -289,6 +289,33 @@
<!-- Trigger Settings --> <!-- Trigger Settings -->
<div class="settings-section"> <div class="settings-section">
<div class="settings-section-title">总结设置</div> <div class="settings-section-title">总结设置</div>
<!-- Filter Rules -->
<div class="settings-collapse" id="filter-rules-collapse"
style="margin-top:0; margin-bottom: 16px;">
<div class="settings-collapse-header" id="filter-rules-toggle">
<span>文本过滤规则 · <strong id="filter-rules-count">0</strong></span>
<svg class="collapse-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2">
<polyline points="6 9 12 15 18 9"></polyline>
</svg>
</div>
<div class="settings-collapse-content hidden" id="filter-rules-content"
style="border-left: 1px solid var(--bdr); border-right: 1px solid var(--bdr); border-bottom: 1px solid var(--bdr); border-radius: 0 0 6px 6px; margin-top: -2px;">
<div class="filter-rules-header">
<p class="settings-hint" style="margin:0">过滤干扰内容(如思考标签)</p>
<button class="btn btn-sm btn-add" id="btn-add-filter-rule">
<svg viewBox="0 0 24 24" width="14" height="14" fill="none"
stroke="currentColor" stroke-width="2">
<line x1="12" y1="5" x2="12" y2="19"></line>
<line x1="5" y1="12" x2="19" y2="12"></line>
</svg> 添加
</button>
</div>
<div id="filter-rules-list" class="filter-rules-list"></div>
</div>
</div>
<div class="settings-row"> <div class="settings-row">
<div class="settings-field"> <div class="settings-field">
<label>注入角色</label> <label>注入角色</label>
@@ -522,36 +549,8 @@
<!-- Tools & Settings --> <!-- Tools & Settings -->
<div> <div>
<div class="neo-tools-header"> <div class="neo-tools-header">
<span>设置与工具</span> <span>导出与导入</span>
<span style="opacity:0.5">///</span>
</div> </div>
<!-- Filter Rules -->
<div class="settings-collapse" id="filter-rules-collapse"
style="margin-top:0; margin-bottom: 16px;">
<div class="settings-collapse-header" id="filter-rules-toggle">
<span>文本过滤规则 · <strong id="filter-rules-count">0</strong></span>
<svg class="collapse-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2">
<polyline points="6 9 12 15 18 9"></polyline>
</svg>
</div>
<div class="settings-collapse-content hidden" id="filter-rules-content"
style="border-left: 1px solid var(--bdr); border-right: 1px solid var(--bdr); border-bottom: 1px solid var(--bdr); border-radius: 0 0 6px 6px; margin-top: -2px;">
<div class="filter-rules-header">
<p class="settings-hint" style="margin:0">过滤干扰内容(如思考标签)</p>
<button class="btn btn-sm btn-add" id="btn-add-filter-rule">
<svg viewBox="0 0 24 24" width="14" height="14" fill="none"
stroke="currentColor" stroke-width="2">
<line x1="12" y1="5" x2="12" y2="19"></line>
<line x1="5" y1="12" x2="19" y2="12"></line>
</svg> 添加
</button>
</div>
<div id="filter-rules-list" class="filter-rules-list"></div>
</div>
</div>
<!-- Import/Export --> <!-- Import/Export -->
<div class="settings-row"> <div class="settings-row">
<div class="settings-field full"> <div class="settings-field full">

View File

@@ -1551,6 +1551,7 @@ async function handleMessageReceived(scheduledChatId) {
// Refresh entity lexicon after new message (new roles may appear) // Refresh entity lexicon after new message (new roles may appear)
refreshEntityLexiconAndWarmup(); refreshEntityLexiconAndWarmup();
scheduleLexicalWarmup(100);
// Auto backfill missing L0 (delay to avoid contention with current floor) // Auto backfill missing L0 (delay to avoid contention with current floor)
setTimeout(() => maybeAutoExtractL0(), 2000); setTimeout(() => maybeAutoExtractL0(), 2000);
@@ -1559,6 +1560,7 @@ async function handleMessageReceived(scheduledChatId) {
function handleMessageSent(scheduledChatId) { function handleMessageSent(scheduledChatId) {
if (isChatStale(scheduledChatId)) return; if (isChatStale(scheduledChatId)) return;
initButtonsForAll(); initButtonsForAll();
scheduleLexicalWarmup(0);
setTimeout(() => maybeAutoRunSummary("before_user"), 1000); setTimeout(() => maybeAutoRunSummary("before_user"), 1000);
} }

View File

@@ -2,7 +2,6 @@
// vector/llm/llm-service.js - 修复 prefill 传递方式 // vector/llm/llm-service.js - 修复 prefill 传递方式
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
import { xbLog } from '../../../../core/debug-core.js'; import { xbLog } from '../../../../core/debug-core.js';
import { getVectorConfig } from '../../data/config.js';
import { getApiKey } from './siliconflow.js'; import { getApiKey } from './siliconflow.js';
const MODULE_ID = 'vector-llm-service'; const MODULE_ID = 'vector-llm-service';

View File

@@ -1,16 +1,3 @@
// ═══════════════════════════════════════════════════════════════════════════
// lexical-index.js - MiniSearch 词法检索索引
//
// 职责:
// 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
// 2. 提供词法检索接口(专名精确匹配兜底)
// 3. 惰性构建 + 异步预热 + 缓存失效机制
//
// 索引存储:纯内存(不持久化)
// 分词器:统一使用 tokenizer.js结巴 + 实体保护 + 降级)
// 重建时机CHAT_CHANGED / L0提取完成 / L2总结完成
// ═══════════════════════════════════════════════════════════════════════════
import MiniSearch from '../../../../libs/minisearch.mjs'; import MiniSearch from '../../../../libs/minisearch.mjs';
import { getContext } from '../../../../../../../extensions.js'; import { getContext } from '../../../../../../../extensions.js';
import { getSummaryStore } from '../../data/store.js'; import { getSummaryStore } from '../../data/store.js';
@@ -20,76 +7,166 @@ import { tokenizeForIndex } from '../utils/tokenizer.js';
const MODULE_ID = 'lexical-index'; const MODULE_ID = 'lexical-index';
// ───────────────────────────────────────────────────────────────────────── // In-memory index cache
// 缓存
// ─────────────────────────────────────────────────────────────────────────
/** @type {MiniSearch|null} */
let cachedIndex = null; let cachedIndex = null;
/** @type {string|null} */
let cachedChatId = null; let cachedChatId = null;
/** @type {string|null} 数据指纹atoms + chunks + events 数量) */
let cachedFingerprint = null; let cachedFingerprint = null;
/** @type {boolean} 是否正在构建 */
let building = false; let building = false;
/** @type {Promise<MiniSearch|null>|null} 当前构建 Promise防重入 */
let buildPromise = null; let buildPromise = null;
/** @type {Map<number, string[]>} floor → 该楼层的 doc IDs仅 L1 chunks */
// floor -> chunk doc ids (L1 only)
let floorDocIds = new Map(); let floorDocIds = new Map();
// ───────────────────────────────────────────────────────────────────────── // IDF stats over lexical docs (L1 chunks + L2 events)
// 工具函数 let termDfMap = new Map();
// ───────────────────────────────────────────────────────────────────────── let docTokenSets = new Map(); // docId -> Set<token>
let lexicalDocCount = 0;
const IDF_MIN = 1.0;
const IDF_MAX = 4.0;
const BUILD_BATCH_SIZE = 500;
/**
* 清理事件摘要(移除楼层标记)
* @param {string} summary
* @returns {string}
*/
function cleanSummary(summary) { function cleanSummary(summary) {
return String(summary || '') return String(summary || '')
.replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '') .replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '')
.trim(); .trim();
} }
/** function fnv1a32(input, seed = 0x811C9DC5) {
* 计算缓存指纹 let hash = seed >>> 0;
* @param {number} chunkCount const text = String(input || '');
* @param {number} eventCount for (let i = 0; i < text.length; i++) {
* @returns {string} hash ^= text.charCodeAt(i);
*/ hash = Math.imul(hash, 0x01000193) >>> 0;
function computeFingerprint(chunkCount, eventCount) { }
return `${chunkCount}:${eventCount}`; return hash >>> 0;
}
function compareDocKeys(a, b) {
const ka = `${a?.type || ''}:${a?.id || ''}`;
const kb = `${b?.type || ''}:${b?.id || ''}`;
if (ka < kb) return -1;
if (ka > kb) return 1;
return 0;
}
function computeFingerprintFromDocs(docs) {
const normalizedDocs = Array.isArray(docs) ? [...docs].sort(compareDocKeys) : [];
let hash = 0x811C9DC5;
for (const doc of normalizedDocs) {
const payload = `${doc?.type || ''}\u001F${doc?.id || ''}\u001F${doc?.floor ?? ''}\u001F${doc?.text || ''}\u001E`;
hash = fnv1a32(payload, hash);
}
return `${normalizedDocs.length}:${(hash >>> 0).toString(16)}`;
} }
/**
* 让出主线程(避免长时间阻塞 UI
* @returns {Promise<void>}
*/
function yieldToMain() { function yieldToMain() {
return new Promise(resolve => setTimeout(resolve, 0)); return new Promise(resolve => setTimeout(resolve, 0));
} }
// ───────────────────────────────────────────────────────────────────────── function clamp(v, min, max) {
// 文档收集 return Math.max(min, Math.min(max, v));
// ───────────────────────────────────────────────────────────────────────── }
function normalizeTerm(term) {
return String(term || '').trim().toLowerCase();
}
function computeIdfFromDf(df, docCount) {
if (!docCount || docCount <= 0) return 1;
const raw = Math.log((docCount + 1) / ((df || 0) + 1)) + 1;
return clamp(raw, IDF_MIN, IDF_MAX);
}
function computeIdf(term) {
const t = normalizeTerm(term);
if (!t || lexicalDocCount <= 0) return 1;
return computeIdfFromDf(termDfMap.get(t) || 0, lexicalDocCount);
}
function extractUniqueTokens(text) {
return new Set(tokenizeForIndex(String(text || '')).map(normalizeTerm).filter(Boolean));
}
function clearIdfState() {
termDfMap = new Map();
docTokenSets = new Map();
lexicalDocCount = 0;
}
function removeDocumentIdf(docId) {
const id = String(docId || '');
if (!id) return;
const tokens = docTokenSets.get(id);
if (!tokens) return;
for (const token of tokens) {
const current = termDfMap.get(token) || 0;
if (current <= 1) {
termDfMap.delete(token);
} else {
termDfMap.set(token, current - 1);
}
}
docTokenSets.delete(id);
lexicalDocCount = Math.max(0, lexicalDocCount - 1);
}
function addDocumentIdf(docId, text) {
const id = String(docId || '');
if (!id) return;
// Replace semantics: remove old token set first if this id already exists.
removeDocumentIdf(id);
const tokens = extractUniqueTokens(text);
docTokenSets.set(id, tokens);
lexicalDocCount += 1;
for (const token of tokens) {
termDfMap.set(token, (termDfMap.get(token) || 0) + 1);
}
}
function rebuildIdfFromDocs(docs) {
clearIdfState();
for (const doc of docs || []) {
const id = String(doc?.id || '');
const text = String(doc?.text || '');
if (!id || !text.trim()) continue;
addDocumentIdf(id, text);
}
}
function buildEventDoc(ev) {
if (!ev?.id) return null;
const parts = [];
if (ev.title) parts.push(ev.title);
if (ev.participants?.length) parts.push(ev.participants.join(' '));
const summary = cleanSummary(ev.summary);
if (summary) parts.push(summary);
const text = parts.join(' ').trim();
if (!text) return null;
return {
id: ev.id,
type: 'event',
floor: null,
text,
};
}
/**
* 收集所有待索引文档
*
* @param {object[]} chunks - getAllChunks(chatId) 返回值
* @param {object[]} events - store.json.events
* @returns {object[]} 文档数组
*/
function collectDocuments(chunks, events) { function collectDocuments(chunks, events) {
const docs = []; const docs = [];
// L1 chunks + 填充 floorDocIds for (const chunk of chunks || []) {
for (const chunk of (chunks || [])) {
if (!chunk?.chunkId || !chunk.text) continue; if (!chunk?.chunkId || !chunk.text) continue;
const floor = chunk.floor ?? -1; const floor = chunk.floor ?? -1;
@@ -101,48 +178,19 @@ function collectDocuments(chunks, events) {
}); });
if (floor >= 0) { if (floor >= 0) {
if (!floorDocIds.has(floor)) { if (!floorDocIds.has(floor)) floorDocIds.set(floor, []);
floorDocIds.set(floor, []);
}
floorDocIds.get(floor).push(chunk.chunkId); floorDocIds.get(floor).push(chunk.chunkId);
} }
} }
// L2 events for (const ev of events || []) {
for (const ev of (events || [])) { const doc = buildEventDoc(ev);
if (!ev?.id) continue; if (doc) docs.push(doc);
const parts = [];
if (ev.title) parts.push(ev.title);
if (ev.participants?.length) parts.push(ev.participants.join(' '));
const summary = cleanSummary(ev.summary);
if (summary) parts.push(summary);
const text = parts.join(' ').trim();
if (!text) continue;
docs.push({
id: ev.id,
type: 'event',
floor: null,
text,
});
} }
return docs; return docs;
} }
// ─────────────────────────────────────────────────────────────────────────
// 索引构建(分片,不阻塞主线程)
// ─────────────────────────────────────────────────────────────────────────
/** 每批添加的文档数 */
const BUILD_BATCH_SIZE = 500;
/**
* 构建 MiniSearch 索引(分片异步)
*
* @param {object[]} docs - 文档数组
* @returns {Promise<MiniSearch>}
*/
async function buildIndexAsync(docs) { async function buildIndexAsync(docs) {
const T0 = performance.now(); const T0 = performance.now();
@@ -158,49 +206,46 @@ async function buildIndexAsync(docs) {
tokenize: tokenizeForIndex, tokenize: tokenizeForIndex,
}); });
if (!docs.length) { if (!docs.length) return index;
return index;
}
// 分片添加,每批 BUILD_BATCH_SIZE 条后让出主线程
for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) { for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
const batch = docs.slice(i, i + BUILD_BATCH_SIZE); const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
index.addAll(batch); index.addAll(batch);
// 非最后一批时让出主线程
if (i + BUILD_BATCH_SIZE < docs.length) { if (i + BUILD_BATCH_SIZE < docs.length) {
await yieldToMain(); await yieldToMain();
} }
} }
const elapsed = Math.round(performance.now() - T0); const elapsed = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID, xbLog.info(MODULE_ID, `Index built: ${docs.length} docs (${elapsed}ms)`);
`索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
);
return index; return index;
} }
// ─────────────────────────────────────────────────────────────────────────
// 检索
// ─────────────────────────────────────────────────────────────────────────
/** /**
* @typedef {object} LexicalSearchResult * @typedef {object} LexicalSearchResult
* @property {string[]} atomIds - 命中的 L0 atom IDs * @property {string[]} atomIds - Reserved for backward compatibility (currently empty).
* @property {Set<number>} atomFloors - 命中的 L0 楼层集合 * @property {Set<number>} atomFloors - Reserved for backward compatibility (currently empty).
* @property {string[]} chunkIds - 命中的 L1 chunk IDs * @property {string[]} chunkIds - Matched L1 chunk ids sorted by weighted lexical score.
* @property {Set<number>} chunkFloors - 命中的 L1 楼层集合 * @property {Set<number>} chunkFloors - Floor ids covered by matched chunks.
* @property {string[]} eventIds - 命中的 L2 event IDs * @property {string[]} eventIds - Matched L2 event ids sorted by weighted lexical score.
* @property {object[]} chunkScores - chunk 命中详情 [{ chunkId, score }] * @property {object[]} chunkScores - Weighted lexical scores for matched chunks.
* @property {number} searchTime - 检索耗时 ms * @property {boolean} idfEnabled - Whether IDF stats are available for weighting.
* @property {number} idfDocCount - Number of lexical docs used to compute IDF.
* @property {Array<{term:string,idf:number}>} topIdfTerms - Top query terms by IDF.
* @property {string[]} queryTerms - Normalized query terms actually searched.
* @property {Record<string, Array<{floor:number, weightedScore:number, chunkId:string}>>} termFloorHits - Chunk-floor hits by term.
* @property {Array<{floor:number, score:number, hitTermsCount:number}>} floorLexScores - Aggregated lexical floor scores (debug).
* @property {number} termSearches - Number of per-term MiniSearch queries executed.
* @property {number} searchTime - Total lexical search time in milliseconds.
*/ */
/** /**
* 在词法索引中检索 * Search lexical index by terms, using per-term MiniSearch and IDF-weighted score aggregation.
* This keeps existing outputs compatible while adding observability fields.
* *
* @param {MiniSearch} index - 索引实例 * @param {MiniSearch} index
* @param {string[]} terms - 查询词列表 * @param {string[]} terms
* @returns {LexicalSearchResult} * @returns {LexicalSearchResult}
*/ */
export function searchLexicalIndex(index, terms) { export function searchLexicalIndex(index, terms) {
@@ -213,6 +258,13 @@ export function searchLexicalIndex(index, terms) {
chunkFloors: new Set(), chunkFloors: new Set(),
eventIds: [], eventIds: [],
chunkScores: [], chunkScores: [],
idfEnabled: lexicalDocCount > 0,
idfDocCount: lexicalDocCount,
topIdfTerms: [],
queryTerms: [],
termFloorHits: {},
floorLexScores: [],
termSearches: 0,
searchTime: 0, searchTime: 0,
}; };
@@ -221,79 +273,111 @@ export function searchLexicalIndex(index, terms) {
return result; return result;
} }
// 用所有 terms 联合查询 const queryTerms = Array.from(new Set((terms || []).map(normalizeTerm).filter(Boolean)));
const queryString = terms.join(' '); result.queryTerms = [...queryTerms];
const weightedScores = new Map(); // docId -> score
const hitMeta = new Map(); // docId -> { type, floor }
const idfPairs = [];
const termFloorHits = new Map(); // term -> [{ floor, weightedScore, chunkId }]
const floorLexAgg = new Map(); // floor -> { score, terms:Set<string> }
let hits; for (const term of queryTerms) {
const idf = computeIdf(term);
idfPairs.push({ term, idf });
let hits = [];
try { try {
hits = index.search(queryString, { hits = index.search(term, {
boost: { text: 1 }, boost: { text: 1 },
fuzzy: 0.2, fuzzy: 0.2,
prefix: true, prefix: true,
combineWith: 'OR', combineWith: 'OR',
// 使用与索引相同的分词器
tokenize: tokenizeForIndex, tokenize: tokenizeForIndex,
}); });
} catch (e) { } catch (e) {
xbLog.warn(MODULE_ID, '检索失败', e); xbLog.warn(MODULE_ID, `Lexical term search failed: ${term}`, e);
result.searchTime = Math.round(performance.now() - T0); continue;
return result;
} }
// 分类结果 result.termSearches += 1;
const chunkIdSet = new Set();
const eventIdSet = new Set();
for (const hit of hits) { for (const hit of hits) {
const type = hit.type; const id = String(hit.id || '');
const id = hit.id; if (!id) continue;
const floor = hit.floor;
switch (type) { const weighted = (hit.score || 0) * idf;
case 'chunk': weightedScores.set(id, (weightedScores.get(id) || 0) + weighted);
if (!chunkIdSet.has(id)) {
chunkIdSet.add(id); if (!hitMeta.has(id)) {
hitMeta.set(id, {
type: hit.type,
floor: hit.floor,
});
}
if (hit.type === 'chunk' && typeof hit.floor === 'number' && hit.floor >= 0) {
if (!termFloorHits.has(term)) termFloorHits.set(term, []);
termFloorHits.get(term).push({
floor: hit.floor,
weightedScore: weighted,
chunkId: id,
});
const floorAgg = floorLexAgg.get(hit.floor) || { score: 0, terms: new Set() };
floorAgg.score += weighted;
floorAgg.terms.add(term);
floorLexAgg.set(hit.floor, floorAgg);
}
}
}
idfPairs.sort((a, b) => b.idf - a.idf);
result.topIdfTerms = idfPairs.slice(0, 5);
result.termFloorHits = Object.fromEntries(
[...termFloorHits.entries()].map(([term, hits]) => [term, hits]),
);
result.floorLexScores = [...floorLexAgg.entries()]
.map(([floor, info]) => ({
floor,
score: Number(info.score.toFixed(6)),
hitTermsCount: info.terms.size,
}))
.sort((a, b) => b.score - a.score);
const sortedHits = Array.from(weightedScores.entries())
.sort((a, b) => b[1] - a[1]);
for (const [id, score] of sortedHits) {
const meta = hitMeta.get(id);
if (!meta) continue;
if (meta.type === 'chunk') {
result.chunkIds.push(id); result.chunkIds.push(id);
result.chunkScores.push({ chunkId: id, score: hit.score }); result.chunkScores.push({ chunkId: id, score });
if (typeof floor === 'number' && floor >= 0) { if (typeof meta.floor === 'number' && meta.floor >= 0) {
result.chunkFloors.add(floor); result.chunkFloors.add(meta.floor);
} }
continue;
} }
break;
case 'event': if (meta.type === 'event') {
if (!eventIdSet.has(id)) {
eventIdSet.add(id);
result.eventIds.push(id); result.eventIds.push(id);
} }
break;
}
} }
result.searchTime = Math.round(performance.now() - T0); result.searchTime = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID, xbLog.info(
`检索完成: terms=[${terms.slice(0, 5).join(',')}] → atoms=${result.atomIds.length} chunks=${result.chunkIds.length} events=${result.eventIds.length} (${result.searchTime}ms)` MODULE_ID,
`Lexical search terms=[${queryTerms.slice(0, 5).join(',')}] chunks=${result.chunkIds.length} events=${result.eventIds.length} termSearches=${result.termSearches} (${result.searchTime}ms)`,
); );
return result; return result;
} }
// ─────────────────────────────────────────────────────────────────────────
// 内部构建流程(收集数据 + 构建索引)
// ─────────────────────────────────────────────────────────────────────────
/**
* 收集数据并构建索引
*
* @param {string} chatId
* @returns {Promise<{index: MiniSearch, fingerprint: string}>}
*/
async function collectAndBuild(chatId) { async function collectAndBuild(chatId) {
// 清空侧索引(全量重建)
floorDocIds = new Map(); floorDocIds = new Map();
// 收集数据(不含 L0 atoms
const store = getSummaryStore(); const store = getSummaryStore();
const events = store?.json?.events || []; const events = store?.json?.events || [];
@@ -301,48 +385,44 @@ async function collectAndBuild(chatId) {
try { try {
chunks = await getAllChunks(chatId); chunks = await getAllChunks(chatId);
} catch (e) { } catch (e) {
xbLog.warn(MODULE_ID, '获取 chunks 失败', e); xbLog.warn(MODULE_ID, 'Failed to load chunks', e);
} }
const fp = computeFingerprint(chunks.length, events.length); const docs = collectDocuments(chunks, events);
const fp = computeFingerprintFromDocs(docs);
// 检查是否在收集过程中缓存已被其他调用更新
if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) { if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
return { index: cachedIndex, fingerprint: fp }; return { index: cachedIndex, fingerprint: fp };
} }
// 收集文档(同时填充 floorDocIds rebuildIdfFromDocs(docs);
const docs = collectDocuments(chunks, events);
// 异步分片构建
const index = await buildIndexAsync(docs); const index = await buildIndexAsync(docs);
return { index, fingerprint: fp }; return { index, fingerprint: fp };
} }
// ─────────────────────────────────────────────────────────────────────────
// 公开接口getLexicalIndex惰性获取
// ─────────────────────────────────────────────────────────────────────────
/** /**
* 获取词法索引(惰性构建 + 缓存) * Expose IDF accessor for query-term selection in query-builder.
* * If index stats are not ready, this gracefully falls back to idf=1.
* 如果缓存有效则直接返回;否则自动构建。
* 如果正在构建中,等待构建完成。
*
* @returns {Promise<MiniSearch|null>}
*/ */
export function getLexicalIdfAccessor() {
return {
enabled: lexicalDocCount > 0,
docCount: lexicalDocCount,
getIdf(term) {
return computeIdf(term);
},
};
}
export async function getLexicalIndex() { export async function getLexicalIndex() {
const { chatId } = getContext(); const { chatId } = getContext();
if (!chatId) return null; if (!chatId) return null;
// 快速路径:如果缓存存在且 chatId 未变,则直接命中
// 指纹校验放到构建流程中完成,避免为指纹而额外读一次 IndexedDB
if (cachedIndex && cachedChatId === chatId && cachedFingerprint) { if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
return cachedIndex; return cachedIndex;
} }
// 正在构建中,等待结果
if (building && buildPromise) { if (building && buildPromise) {
try { try {
await buildPromise; await buildPromise;
@@ -350,27 +430,23 @@ export async function getLexicalIndex() {
return cachedIndex; return cachedIndex;
} }
} catch { } catch {
// 构建失败,继续往下重建 // Continue to rebuild below.
} }
} }
// 需要重建(指纹将在 collectAndBuild 内部计算并写入缓存) xbLog.info(MODULE_ID, `Lexical cache miss; rebuilding (chatId=${chatId.slice(0, 8)})`);
xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)})`);
building = true; building = true;
buildPromise = collectAndBuild(chatId); buildPromise = collectAndBuild(chatId);
try { try {
const { index, fingerprint } = await buildPromise; const { index, fingerprint } = await buildPromise;
// 原子替换缓存
cachedIndex = index; cachedIndex = index;
cachedChatId = chatId; cachedChatId = chatId;
cachedFingerprint = fingerprint; cachedFingerprint = fingerprint;
return index; return index;
} catch (e) { } catch (e) {
xbLog.error(MODULE_ID, '索引构建失败', e); xbLog.error(MODULE_ID, 'Index build failed', e);
return null; return null;
} finally { } finally {
building = false; building = false;
@@ -378,74 +454,29 @@ export async function getLexicalIndex() {
} }
} }
// ─────────────────────────────────────────────────────────────────────────
// 公开接口warmupIndex异步预建
// ─────────────────────────────────────────────────────────────────────────
/**
* 异步预建索引
*
* 在 CHAT_CHANGED 时调用,后台构建索引。
* 不阻塞调用方,不返回结果。
* 构建完成后缓存自动更新,后续 getLexicalIndex() 直接命中。
*
* 调用时机:
* - handleChatChanged实体注入后
* - L0 提取完成
* - L2 总结完成
*/
export function warmupIndex() { export function warmupIndex() {
const { chatId } = getContext(); const { chatId } = getContext();
if (!chatId) return; if (!chatId || building) return;
// 已在构建中,不重复触发
if (building) return;
// fire-and-forget
getLexicalIndex().catch(e => { getLexicalIndex().catch(e => {
xbLog.warn(MODULE_ID, '预热索引失败', e); xbLog.warn(MODULE_ID, 'Warmup failed', e);
}); });
} }
// ─────────────────────────────────────────────────────────────────────────
// 公开接口invalidateLexicalIndex缓存失效
// ─────────────────────────────────────────────────────────────────────────
/**
* 使缓存失效(下次 getLexicalIndex / warmupIndex 时自动重建)
*
* 调用时机:
* - CHAT_CHANGED
* - L0 提取完成
* - L2 总结完成
*/
export function invalidateLexicalIndex() { export function invalidateLexicalIndex() {
if (cachedIndex) { if (cachedIndex) {
xbLog.info(MODULE_ID, '索引缓存已失效'); xbLog.info(MODULE_ID, 'Lexical index cache invalidated');
} }
cachedIndex = null; cachedIndex = null;
cachedChatId = null; cachedChatId = null;
cachedFingerprint = null; cachedFingerprint = null;
floorDocIds = new Map(); floorDocIds = new Map();
clearIdfState();
} }
// ─────────────────────────────────────────────────────────────────────────
// 增量更新接口
// ─────────────────────────────────────────────────────────────────────────
/**
* 为指定楼层添加 L1 chunks 到索引
*
* 先移除该楼层旧文档,再添加新文档。
* 如果索引不存在(缓存失效),静默跳过(下次 getLexicalIndex 全量重建)。
*
* @param {number} floor - 楼层号
* @param {object[]} chunks - chunk 对象列表(需有 chunkId、text、floor
*/
export function addDocumentsForFloor(floor, chunks) { export function addDocumentsForFloor(floor, chunks) {
if (!cachedIndex || !chunks?.length) return; if (!cachedIndex || !chunks?.length) return;
// 先移除旧文档
removeDocumentsByFloor(floor); removeDocumentsByFloor(floor);
const docs = []; const docs = [];
@@ -453,30 +484,29 @@ export function addDocumentsForFloor(floor, chunks) {
for (const chunk of chunks) { for (const chunk of chunks) {
if (!chunk?.chunkId || !chunk.text) continue; if (!chunk?.chunkId || !chunk.text) continue;
docs.push({
const doc = {
id: chunk.chunkId, id: chunk.chunkId,
type: 'chunk', type: 'chunk',
floor: chunk.floor ?? floor, floor: chunk.floor ?? floor,
text: chunk.text, text: chunk.text,
}); };
docs.push(doc);
docIds.push(chunk.chunkId); docIds.push(chunk.chunkId);
} }
if (docs.length > 0) { if (!docs.length) return;
cachedIndex.addAll(docs); cachedIndex.addAll(docs);
floorDocIds.set(floor, docIds); floorDocIds.set(floor, docIds);
xbLog.info(MODULE_ID, `增量添加: floor ${floor}, ${docs.length} 个 chunk`);
} for (const doc of docs) {
addDocumentIdf(doc.id, doc.text);
}
xbLog.info(MODULE_ID, `Incremental add floor=${floor} chunks=${docs.length}`);
} }
/**
* 从索引中移除指定楼层的所有 L1 chunk 文档
*
* 使用 MiniSearch discard()(软删除)。
* 如果索引不存在,静默跳过。
*
* @param {number} floor - 楼层号
*/
export function removeDocumentsByFloor(floor) { export function removeDocumentsByFloor(floor) {
if (!cachedIndex) return; if (!cachedIndex) return;
@@ -487,55 +517,39 @@ export function removeDocumentsByFloor(floor) {
try { try {
cachedIndex.discard(id); cachedIndex.discard(id);
} catch { } catch {
// 文档可能不存在(已被全量重建替换) // Ignore if the doc was already removed/rebuilt.
} }
removeDocumentIdf(id);
} }
floorDocIds.delete(floor); floorDocIds.delete(floor);
xbLog.info(MODULE_ID, `增量移除: floor ${floor}, ${docIds.length} 个文档`); xbLog.info(MODULE_ID, `Incremental remove floor=${floor} chunks=${docIds.length}`);
} }
/**
* 将新 L2 事件添加到索引
*
* 如果事件 ID 已存在,先 discard 再 add覆盖
* 如果索引不存在,静默跳过。
*
* @param {object[]} events - 事件对象列表(需有 id、title、summary 等)
*/
export function addEventDocuments(events) { export function addEventDocuments(events) {
if (!cachedIndex || !events?.length) return; if (!cachedIndex || !events?.length) return;
const docs = []; const docs = [];
for (const ev of events) { for (const ev of events) {
if (!ev?.id) continue; const doc = buildEventDoc(ev);
if (!doc) continue;
const parts = [];
if (ev.title) parts.push(ev.title);
if (ev.participants?.length) parts.push(ev.participants.join(' '));
const summary = cleanSummary(ev.summary);
if (summary) parts.push(summary);
const text = parts.join(' ').trim();
if (!text) continue;
// 覆盖:先尝试移除旧的
try { try {
cachedIndex.discard(ev.id); cachedIndex.discard(doc.id);
} catch { } catch {
// 不存在则忽略 // Ignore if previous document does not exist.
}
removeDocumentIdf(doc.id);
docs.push(doc);
} }
docs.push({ if (!docs.length) return;
id: ev.id,
type: 'event',
floor: null,
text,
});
}
if (docs.length > 0) {
cachedIndex.addAll(docs); cachedIndex.addAll(docs);
xbLog.info(MODULE_ID, `增量添加: ${docs.length} 个事件`); for (const doc of docs) {
addDocumentIdf(doc.id, doc.text);
} }
xbLog.info(MODULE_ID, `Incremental add events=${docs.length}`);
} }

View File

@@ -52,6 +52,10 @@ export function createMetrics() {
eventHits: 0, eventHits: 0,
searchTime: 0, searchTime: 0,
indexReadyTime: 0, indexReadyTime: 0,
idfEnabled: false,
idfDocCount: 0,
topIdfTerms: [],
termSearches: 0,
eventFilteredByDense: 0, eventFilteredByDense: 0,
floorFilteredByDense: 0, floorFilteredByDense: 0,
}, },
@@ -97,6 +101,11 @@ export function createMetrics() {
floorCandidates: 0, floorCandidates: 0,
floorsSelected: 0, floorsSelected: 0,
l0Collected: 0, l0Collected: 0,
mustKeepTermsCount: 0,
mustKeepFloorsCount: 0,
mustKeepFloors: [],
droppedByRerankCount: 0,
lexHitButNotSelected: 0,
rerankApplied: false, rerankApplied: false,
rerankFailed: false, rerankFailed: false,
beforeRerank: 0, beforeRerank: 0,
@@ -274,6 +283,20 @@ export function formatMetricsLog(metrics) {
if (m.lexical.indexReadyTime > 0) { if (m.lexical.indexReadyTime > 0) {
lines.push(`├─ index_ready_time: ${m.lexical.indexReadyTime}ms`); lines.push(`├─ index_ready_time: ${m.lexical.indexReadyTime}ms`);
} }
lines.push(`├─ idf_enabled: ${!!m.lexical.idfEnabled}`);
if (m.lexical.idfDocCount > 0) {
lines.push(`├─ idf_doc_count: ${m.lexical.idfDocCount}`);
}
if ((m.lexical.topIdfTerms || []).length > 0) {
const topIdfText = m.lexical.topIdfTerms
.slice(0, 5)
.map(x => `${x.term}:${x.idf}`)
.join(', ');
lines.push(`├─ top_idf_terms: [${topIdfText}]`);
}
if (m.lexical.termSearches > 0) {
lines.push(`├─ term_searches: ${m.lexical.termSearches}`);
}
if (m.lexical.eventFilteredByDense > 0) { if (m.lexical.eventFilteredByDense > 0) {
lines.push(`├─ event_filtered_by_dense: ${m.lexical.eventFilteredByDense}`); lines.push(`├─ event_filtered_by_dense: ${m.lexical.eventFilteredByDense}`);
} }
@@ -295,6 +318,20 @@ export function formatMetricsLog(metrics) {
lines.push(`└─ time: ${m.fusion.time}ms`); lines.push(`└─ time: ${m.fusion.time}ms`);
lines.push(''); lines.push('');
// Fusion Guard (must-keep lexical floors)
lines.push('[Fusion Guard] Lexical Must-Keep');
lines.push(`├─ must_keep_terms: ${m.evidence.mustKeepTermsCount || 0}`);
lines.push(`├─ must_keep_floors: ${m.evidence.mustKeepFloorsCount || 0}`);
if ((m.evidence.mustKeepFloors || []).length > 0) {
lines.push(`│ └─ floors: [${m.evidence.mustKeepFloors.slice(0, 10).join(', ')}]`);
}
if ((m.evidence.lexHitButNotSelected || 0) > 0) {
lines.push(`└─ lex_hit_but_not_selected: ${m.evidence.lexHitButNotSelected}`);
} else {
lines.push(`└─ lex_hit_but_not_selected: 0`);
}
lines.push('');
// Constraint (L3 Facts) // Constraint (L3 Facts)
lines.push('[Constraint] L3 Facts - 世界约束'); lines.push('[Constraint] L3 Facts - 世界约束');
lines.push(`├─ total: ${m.constraint.total}`); lines.push(`├─ total: ${m.constraint.total}`);
@@ -358,6 +395,9 @@ export function formatMetricsLog(metrics) {
lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank} floors`); lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank} floors`);
lines.push(`│ │ ├─ after: ${m.evidence.afterRerank} floors`); lines.push(`│ │ ├─ after: ${m.evidence.afterRerank} floors`);
lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`); lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`);
if ((m.evidence.droppedByRerankCount || 0) > 0) {
lines.push(`│ ├─ dropped_normal: ${m.evidence.droppedByRerankCount}`);
}
if (m.evidence.rerankScores) { if (m.evidence.rerankScores) {
const rs = m.evidence.rerankScores; const rs = m.evidence.rerankScores;
lines.push(`│ ├─ rerank_scores: min=${rs.min}, max=${rs.max}, mean=${rs.mean}`); lines.push(`│ ├─ rerank_scores: min=${rs.min}, max=${rs.max}, mean=${rs.mean}`);

View File

@@ -20,6 +20,7 @@
import { getContext } from '../../../../../../../extensions.js'; import { getContext } from '../../../../../../../extensions.js';
import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText, buildCharacterPools } from './entity-lexicon.js'; import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText, buildCharacterPools } from './entity-lexicon.js';
import { getLexicalIdfAccessor } from './lexical-index.js';
import { getSummaryStore } from '../../data/store.js'; import { getSummaryStore } from '../../data/store.js';
import { filterText } from '../utils/text-filter.js'; import { filterText } from '../utils/text-filter.js';
import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js'; import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
@@ -106,6 +107,7 @@ export function computeLengthFactor(charCount) {
function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) { function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
if (!text) return []; if (!text) return [];
const idfAccessor = getLexicalIdfAccessor();
const tokens = tokenizerTokenizeForIndex(text); const tokens = tokenizerTokenizeForIndex(text);
const freq = new Map(); const freq = new Map();
for (const token of tokens) { for (const token of tokens) {
@@ -115,9 +117,13 @@ function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
} }
return Array.from(freq.entries()) return Array.from(freq.entries())
.sort((a, b) => b[1] - a[1]) .map(([term, tf]) => {
const idf = idfAccessor.enabled ? idfAccessor.getIdf(term) : 1;
return { term, tf, score: tf * idf };
})
.sort((a, b) => (b.score - a.score) || (b.tf - a.tf))
.slice(0, maxTerms) .slice(0, maxTerms)
.map(([term]) => term); .map(x => x.term);
} }
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────

View File

@@ -42,6 +42,7 @@ import { getLexicalIndex, searchLexicalIndex } from './lexical-index.js';
import { rerankChunks } from '../llm/reranker.js'; import { rerankChunks } from '../llm/reranker.js';
import { createMetrics, calcSimilarityStats } from './metrics.js'; import { createMetrics, calcSimilarityStats } from './metrics.js';
import { diffuseFromSeeds } from './diffusion.js'; import { diffuseFromSeeds } from './diffusion.js';
import { tokenizeForIndex } from '../utils/tokenizer.js';
const MODULE_ID = 'recall'; const MODULE_ID = 'recall';
@@ -81,6 +82,11 @@ const CONFIG = {
RERANK_TOP_N: 20, RERANK_TOP_N: 20,
RERANK_MIN_SCORE: 0.10, RERANK_MIN_SCORE: 0.10,
// Fusion guard: lexical must-keep floors
MUST_KEEP_MAX_FLOORS: 3,
MUST_KEEP_MIN_IDF: 2.2,
MUST_KEEP_CLUSTER_WINDOW: 2,
// 因果链 // 因果链
CAUSAL_CHAIN_MAX_DEPTH: 10, CAUSAL_CHAIN_MAX_DEPTH: 10,
CAUSAL_INJECT_MAX: 30, CAUSAL_INJECT_MAX: 30,
@@ -517,13 +523,107 @@ function fuseByFloor(denseRank, lexRank, cap = CONFIG.FUSION_CAP) {
return { top: scored.slice(0, cap), totalUnique }; return { top: scored.slice(0, cap), totalUnique };
} }
function mapChunkFloorToAiFloor(floor, chat) {
let mapped = Number(floor);
if (!Number.isInteger(mapped) || mapped < 0) return null;
if (chat?.[mapped]?.is_user) {
const aiFloor = mapped + 1;
if (aiFloor < (chat?.length || 0) && !chat?.[aiFloor]?.is_user) {
mapped = aiFloor;
} else {
return null;
}
}
return mapped;
}
function isNonStopwordTerm(term) {
const norm = normalize(term);
if (!norm) return false;
const tokens = tokenizeForIndex(norm).map(normalize);
return tokens.includes(norm);
}
function buildMustKeepFloors(lexicalResult, lexicalTerms, atomFloorSet, chat) {
const out = {
terms: [],
floors: [],
floorSet: new Set(),
lexHitButNotSelected: 0,
};
if (!lexicalResult || !lexicalTerms?.length || !atomFloorSet?.size) return out;
const queryTermSet = new Set((lexicalTerms || []).map(normalize).filter(Boolean));
const topIdfTerms = (lexicalResult.topIdfTerms || [])
.filter(x => {
const term = normalize(x?.term);
if (!term) return false;
if (!queryTermSet.has(term)) return false;
if (term.length < 2) return false;
if (!isNonStopwordTerm(term)) return false;
if ((x?.idf || 0) < CONFIG.MUST_KEEP_MIN_IDF) return false;
const hits = lexicalResult.termFloorHits?.[term];
return Array.isArray(hits) && hits.length > 0;
})
.sort((a, b) => (b.idf || 0) - (a.idf || 0));
if (!topIdfTerms.length) return out;
out.terms = topIdfTerms.map(x => ({ term: normalize(x.term), idf: x.idf || 0 }));
const floorAgg = new Map(); // floor -> { lexHitScore, terms:Set<string> }
for (const { term } of out.terms) {
const hits = lexicalResult.termFloorHits?.[term] || [];
for (const hit of hits) {
const aiFloor = mapChunkFloorToAiFloor(hit.floor, chat);
if (aiFloor == null) continue;
if (!atomFloorSet.has(aiFloor)) continue;
const cur = floorAgg.get(aiFloor) || { lexHitScore: 0, terms: new Set() };
cur.lexHitScore += Number(hit?.weightedScore || 0);
cur.terms.add(term);
floorAgg.set(aiFloor, cur);
}
}
const candidates = [...floorAgg.entries()]
.map(([floor, info]) => {
const termCoverage = info.terms.size;
const finalFloorScore = info.lexHitScore * (1 + 0.2 * Math.max(0, termCoverage - 1));
return {
floor,
score: finalFloorScore,
termCoverage,
terms: [...info.terms],
};
})
.sort((a, b) => b.score - a.score);
out.lexHitButNotSelected = candidates.length;
// Cluster by floor distance and keep the highest score per cluster.
const selected = [];
for (const c of candidates) {
const conflict = selected.some(s => Math.abs(s.floor - c.floor) <= CONFIG.MUST_KEEP_CLUSTER_WINDOW);
if (conflict) continue;
selected.push(c);
if (selected.length >= CONFIG.MUST_KEEP_MAX_FLOORS) break;
}
out.floors = selected;
out.floorSet = new Set(selected.map(x => x.floor));
return out;
}
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// [Stage 6] Floor 融合 + Rerank // [Stage 6] Floor 融合 + Rerank
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexicalResult, metrics) { async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexicalResult, lexicalTerms, metrics) {
const { chatId, chat, name1, name2 } = getContext(); const { chatId, chat, name1, name2 } = getContext();
if (!chatId) return { l0Selected: [], l1ScoredByFloor: new Map() }; if (!chatId) return { l0Selected: [], l1ScoredByFloor: new Map(), mustKeepFloors: [] };
const T_Start = performance.now(); const T_Start = performance.now();
@@ -558,17 +658,8 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
for (const { chunkId, score } of (lexicalResult?.chunkScores || [])) { for (const { chunkId, score } of (lexicalResult?.chunkScores || [])) {
const match = chunkId?.match(/^c-(\d+)-/); const match = chunkId?.match(/^c-(\d+)-/);
if (!match) continue; if (!match) continue;
let floor = parseInt(match[1], 10); const floor = mapChunkFloorToAiFloor(parseInt(match[1], 10), chat);
if (floor == null) continue;
// USER floor → AI floor 映射
if (chat?.[floor]?.is_user) {
const aiFloor = floor + 1;
if (aiFloor < chat.length && !chat[aiFloor]?.is_user) {
floor = aiFloor;
} else {
continue;
}
}
// 预过滤:必须有 L0 atoms // 预过滤:必须有 L0 atoms
if (!atomFloorSet.has(floor)) continue; if (!atomFloorSet.has(floor)) continue;
@@ -600,6 +691,12 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
metrics.lexical.floorFilteredByDense = lexFloorFilteredByDense; metrics.lexical.floorFilteredByDense = lexFloorFilteredByDense;
} }
// ─────────────────────────────────────────────────────────────────
// 6b.5 Fusion Guard: lexical must-keep floors
// ─────────────────────────────────────────────────────────────────
const mustKeep = buildMustKeepFloors(lexicalResult, lexicalTerms, atomFloorSet, chat);
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// 6c. Floor W-RRF 融合 // 6c. Floor W-RRF 融合
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
@@ -617,6 +714,10 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
metrics.fusion.denseAggMethod = 'maxSim'; metrics.fusion.denseAggMethod = 'maxSim';
metrics.fusion.lexDensityBonus = CONFIG.LEX_DENSITY_BONUS; metrics.fusion.lexDensityBonus = CONFIG.LEX_DENSITY_BONUS;
metrics.evidence.floorCandidates = fusedFloors.length; metrics.evidence.floorCandidates = fusedFloors.length;
metrics.evidence.mustKeepTermsCount = mustKeep.terms.length;
metrics.evidence.mustKeepFloorsCount = mustKeep.floors.length;
metrics.evidence.mustKeepFloors = mustKeep.floors.map(x => x.floor).slice(0, 10);
metrics.evidence.lexHitButNotSelected = Math.max(0, mustKeep.lexHitButNotSelected - mustKeep.floors.length);
} }
if (fusedFloors.length === 0) { if (fusedFloors.length === 0) {
@@ -628,7 +729,7 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
metrics.evidence.l1CosineTime = 0; metrics.evidence.l1CosineTime = 0;
metrics.evidence.rerankApplied = false; metrics.evidence.rerankApplied = false;
} }
return { l0Selected: [], l1ScoredByFloor: new Map() }; return { l0Selected: [], l1ScoredByFloor: new Map(), mustKeepFloors: [] };
} }
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
@@ -650,8 +751,10 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
// 6e. 构建 rerank documents每个 floor: USER chunks + AI chunks // 6e. 构建 rerank documents每个 floor: USER chunks + AI chunks
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
const normalFloors = fusedFloors.filter(f => !mustKeep.floorSet.has(f.id));
const rerankCandidates = []; const rerankCandidates = [];
for (const f of fusedFloors) { for (const f of normalFloors) {
const aiFloor = f.id; const aiFloor = f.id;
const userFloor = aiFloor - 1; const userFloor = aiFloor - 1;
@@ -698,6 +801,7 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
metrics.evidence.rerankApplied = true; metrics.evidence.rerankApplied = true;
metrics.evidence.beforeRerank = rerankCandidates.length; metrics.evidence.beforeRerank = rerankCandidates.length;
metrics.evidence.afterRerank = reranked.length; metrics.evidence.afterRerank = reranked.length;
metrics.evidence.droppedByRerankCount = Math.max(0, rerankCandidates.length - reranked.length);
metrics.evidence.rerankFailed = reranked.some(c => c._rerankFailed); metrics.evidence.rerankFailed = reranked.some(c => c._rerankFailed);
metrics.evidence.rerankTime = rerankTime; metrics.evidence.rerankTime = rerankTime;
metrics.timing.evidenceRerank = rerankTime; metrics.timing.evidenceRerank = rerankTime;
@@ -722,9 +826,12 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
// 6g. 收集 L0 atoms // 6g. 收集 L0 atoms
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// 仅保留“真实 dense 命中”的 L0 原子: // Floor-based L0 collection:
// 旧逻辑按 floor 全塞,容易把同层无关原子带进来。 // once a floor is selected by fusion/rerank, L0 atoms come from that floor.
const atomById = new Map(getStateAtoms().map(a => [a.atomId, a])); // Dense anchor hits are used as similarity signals (ranking), not hard admission.
const allAtoms = getStateAtoms();
const atomById = new Map(allAtoms.map(a => [a.atomId, a]));
const anchorSimilarityByAtomId = new Map((anchorHits || []).map(h => [h.atomId, h.similarity || 0]));
const matchedAtomsByFloor = new Map(); const matchedAtomsByFloor = new Map();
for (const hit of (anchorHits || [])) { for (const hit of (anchorHits || [])) {
const atom = hit.atom || atomById.get(hit.atomId); const atom = hit.atom || atomById.get(hit.atomId);
@@ -739,15 +846,42 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
arr.sort((a, b) => b.similarity - a.similarity); arr.sort((a, b) => b.similarity - a.similarity);
} }
const mustKeepMissing = mustKeep.floors
.filter(mf => !reranked.some(r => r.floor === mf.floor))
.map(mf => ({
floor: mf.floor,
_rerankScore: 0.12 + Math.min(0.05, 0.01 * (mf.termCoverage || 1)),
_isMustKeep: true,
}));
const finalFloorItems = [
...reranked.map(r => ({ ...r, _isMustKeep: false })),
...mustKeepMissing,
];
const allAtomsByFloor = new Map();
for (const atom of allAtoms) {
const f = Number(atom?.floor);
if (!Number.isInteger(f) || f < 0) continue;
if (!allAtomsByFloor.has(f)) allAtomsByFloor.set(f, []);
allAtomsByFloor.get(f).push(atom);
}
const l0Selected = []; const l0Selected = [];
for (const item of reranked) { for (const item of finalFloorItems) {
const floor = item.floor; const floor = item.floor;
const rerankScore = item._rerankScore || 0; const rerankScore = Number.isFinite(item?._rerankScore) ? item._rerankScore : 0;
// 仅收集该 floor 中真实命中的 L0 atoms const floorAtoms = allAtomsByFloor.get(floor) || [];
const floorMatchedAtoms = matchedAtomsByFloor.get(floor) || []; floorAtoms.sort((a, b) => {
for (const { atom, similarity } of floorMatchedAtoms) { const sa = anchorSimilarityByAtomId.get(a.atomId) || 0;
const sb = anchorSimilarityByAtomId.get(b.atomId) || 0;
return sb - sa;
});
for (const atom of floorAtoms) {
const similarity = anchorSimilarityByAtomId.get(atom.atomId) || 0;
l0Selected.push({ l0Selected.push({
id: `anchor-${atom.atomId}`, id: `anchor-${atom.atomId}`,
atomId: atom.atomId, atomId: atom.atomId,
@@ -762,7 +896,7 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
} }
if (metrics) { if (metrics) {
metrics.evidence.floorsSelected = reranked.length; metrics.evidence.floorsSelected = finalFloorItems.length;
metrics.evidence.l0Collected = l0Selected.length; metrics.evidence.l0Collected = l0Selected.length;
metrics.evidence.l1Pulled = 0; metrics.evidence.l1Pulled = 0;
@@ -777,10 +911,14 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
} }
xbLog.info(MODULE_ID, xbLog.info(MODULE_ID,
`Evidence: ${denseFloorRank.length} dense floors + ${lexFloorRank.length} lex floors (${lexFloorFilteredByDense} lex filtered by dense) → fusion=${fusedFloors.length} → rerank=${reranked.length} floors → L0=${l0Selected.length} (${totalTime}ms)` `Evidence: ${denseFloorRank.length} dense floors + ${lexFloorRank.length} lex floors (${lexFloorFilteredByDense} lex filtered by dense) → fusion=${fusedFloors.length} → rerank(normal)=${reranked.length} + mustKeep=${mustKeepMissing.length} floors → L0=${l0Selected.length} (${totalTime}ms)`
); );
return { l0Selected, l1ScoredByFloor }; return {
l0Selected,
l1ScoredByFloor,
mustKeepFloors: mustKeep.floors.map(x => x.floor),
};
} }
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -965,6 +1103,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: [], focusEntities: [],
focusTerms: [], focusTerms: [],
focusCharacters: [], focusCharacters: [],
mustKeepFloors: [],
elapsed: metrics.timing.total, elapsed: metrics.timing.total,
logText: 'No events.', logText: 'No events.',
metrics, metrics,
@@ -984,6 +1123,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
: CONFIG.LAST_MESSAGES_K; : CONFIG.LAST_MESSAGES_K;
const lastMessages = getLastMessages(chat, lastMessagesCount, excludeLastAi); const lastMessages = getLastMessages(chat, lastMessagesCount, excludeLastAi);
// Non-blocking preload: keep recall latency stable.
// If not ready yet, query-builder will gracefully fall back to TF terms.
getLexicalIndex().catch((e) => {
xbLog.warn(MODULE_ID, 'Preload lexical index failed; continue with TF fallback', e);
});
const bundle = buildQueryBundle(lastMessages, pendingUserMessage); const bundle = buildQueryBundle(lastMessages, pendingUserMessage);
const focusTerms = bundle.focusTerms || bundle.focusEntities || []; const focusTerms = bundle.focusTerms || bundle.focusEntities || [];
const focusCharacters = bundle.focusCharacters || []; const focusCharacters = bundle.focusCharacters || [];
@@ -1015,6 +1160,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms, focusEntities: focusTerms,
focusTerms, focusTerms,
focusCharacters, focusCharacters,
mustKeepFloors: [],
elapsed: metrics.timing.total, elapsed: metrics.timing.total,
logText: 'No query segments.', logText: 'No query segments.',
metrics, metrics,
@@ -1037,6 +1183,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms, focusEntities: focusTerms,
focusTerms, focusTerms,
focusCharacters, focusCharacters,
mustKeepFloors: [],
elapsed: metrics.timing.total, elapsed: metrics.timing.total,
logText: 'Embedding failed (round 1, after retry).', logText: 'Embedding failed (round 1, after retry).',
metrics, metrics,
@@ -1051,6 +1198,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms, focusEntities: focusTerms,
focusTerms, focusTerms,
focusCharacters, focusCharacters,
mustKeepFloors: [],
elapsed: metrics.timing.total, elapsed: metrics.timing.total,
logText: 'Empty query vectors (round 1).', logText: 'Empty query vectors (round 1).',
metrics, metrics,
@@ -1071,6 +1219,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms, focusEntities: focusTerms,
focusTerms, focusTerms,
focusCharacters, focusCharacters,
mustKeepFloors: [],
elapsed: metrics.timing.total, elapsed: metrics.timing.total,
logText: 'Weighted average produced empty vector.', logText: 'Weighted average produced empty vector.',
metrics, metrics,
@@ -1161,6 +1310,10 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
atomIds: [], atomFloors: new Set(), atomIds: [], atomFloors: new Set(),
chunkIds: [], chunkFloors: new Set(), chunkIds: [], chunkFloors: new Set(),
eventIds: [], chunkScores: [], searchTime: 0, eventIds: [], chunkScores: [], searchTime: 0,
idfEnabled: false, idfDocCount: 0, topIdfTerms: [], termSearches: 0,
queryTerms: [],
termFloorHits: {},
floorLexScores: [],
}; };
let indexReadyTime = 0; let indexReadyTime = 0;
@@ -1184,6 +1337,10 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
metrics.lexical.searchTime = lexicalResult.searchTime || 0; metrics.lexical.searchTime = lexicalResult.searchTime || 0;
metrics.lexical.indexReadyTime = indexReadyTime; metrics.lexical.indexReadyTime = indexReadyTime;
metrics.lexical.terms = bundle.lexicalTerms.slice(0, 10); metrics.lexical.terms = bundle.lexicalTerms.slice(0, 10);
metrics.lexical.idfEnabled = !!lexicalResult.idfEnabled;
metrics.lexical.idfDocCount = lexicalResult.idfDocCount || 0;
metrics.lexical.topIdfTerms = lexicalResult.topIdfTerms || [];
metrics.lexical.termSearches = lexicalResult.termSearches || 0;
} }
// 合并 L2 eventslexical 命中但 dense 未命中的 events // 合并 L2 eventslexical 命中但 dense 未命中的 events
@@ -1238,18 +1395,19 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
} }
xbLog.info(MODULE_ID, xbLog.info(MODULE_ID,
`Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)` `Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} idfEnabled=${lexicalResult.idfEnabled ? 'yes' : 'no'} idfDocs=${lexicalResult.idfDocCount || 0} termSearches=${lexicalResult.termSearches || 0} (indexReady=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`
); );
// ═══════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════
// 阶段 6: Floor 粒度融合 + Rerank + L1 配对 // 阶段 6: Floor 粒度融合 + Rerank + L1 配对
// ═══════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════
const { l0Selected, l1ScoredByFloor } = await locateAndPullEvidence( const { l0Selected, l1ScoredByFloor, mustKeepFloors } = await locateAndPullEvidence(
anchorHits, anchorHits,
queryVector_v1, queryVector_v1,
bundle.rerankQuery, bundle.rerankQuery,
lexicalResult, lexicalResult,
bundle.lexicalTerms,
metrics metrics
); );
@@ -1379,6 +1537,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
console.log(`Round 2 Anchors: ${anchorHits.length} hits → ${anchorFloors_dense.size} floors`); console.log(`Round 2 Anchors: ${anchorHits.length} hits → ${anchorFloors_dense.size} floors`);
console.log(`Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} evtMerged=+${lexicalEventCount} evtFiltered=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} (idx=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`); console.log(`Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} evtMerged=+${lexicalEventCount} evtFiltered=${lexicalEventFilteredByDense} floorFiltered=${metrics.lexical.floorFilteredByDense || 0} (idx=${indexReadyTime}ms search=${lexicalResult.searchTime || 0}ms total=${lexTime}ms)`);
console.log(`Fusion (floor, weighted): dense=${metrics.fusion.denseFloors} lex=${metrics.fusion.lexFloors} → cap=${metrics.fusion.afterCap} (${metrics.fusion.time}ms)`); console.log(`Fusion (floor, weighted): dense=${metrics.fusion.denseFloors} lex=${metrics.fusion.lexFloors} → cap=${metrics.fusion.afterCap} (${metrics.fusion.time}ms)`);
console.log(`Fusion Guard: mustKeepTerms=${metrics.evidence.mustKeepTermsCount || 0} mustKeepFloors=[${(metrics.evidence.mustKeepFloors || []).join(', ')}]`);
console.log(`Floor Rerank: ${metrics.evidence.beforeRerank || 0}${metrics.evidence.floorsSelected || 0} floors → L0=${metrics.evidence.l0Collected || 0} (${metrics.evidence.rerankTime || 0}ms)`); console.log(`Floor Rerank: ${metrics.evidence.beforeRerank || 0}${metrics.evidence.floorsSelected || 0} floors → L0=${metrics.evidence.l0Collected || 0} (${metrics.evidence.rerankTime || 0}ms)`);
console.log(`L1: ${metrics.evidence.l1Pulled || 0} pulled → ${metrics.evidence.l1Attached || 0} attached (${metrics.evidence.l1CosineTime || 0}ms)`); console.log(`L1: ${metrics.evidence.l1Pulled || 0} pulled → ${metrics.evidence.l1Attached || 0} attached (${metrics.evidence.l1CosineTime || 0}ms)`);
console.log(`Events: ${eventHits.length} hits (l0Linked=+${l0LinkedCount}), ${causalChain.length} causal`); console.log(`Events: ${eventHits.length} hits (l0Linked=+${l0LinkedCount}), ${causalChain.length} causal`);
@@ -1393,6 +1552,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
focusEntities: focusTerms, focusEntities: focusTerms,
focusTerms, focusTerms,
focusCharacters, focusCharacters,
mustKeepFloors: mustKeepFloors || [],
elapsed: metrics.timing.total, elapsed: metrics.timing.total,
metrics, metrics,
}; };

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2020 Gene Diaz
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,15 @@
# stopwords sources for story-summary
- Dataset: `stopwords-iso` (npm package, version 1.1.0)
- Repository: https://github.com/stopwords-iso/stopwords-iso
- License: MIT
- Snapshot date: 2026-02-16
- Languages used: `zh`, `ja`, `en`
- Local snapshot files:
- `stopwords-iso.zh.txt`
- `stopwords-iso.ja.txt`
- `stopwords-iso.en.txt`
Generation note:
- `modules/story-summary/vector/utils/stopwords-base.js` is generated from these snapshot files.
- Keep `stopwords-patch.js` for tiny domain overrides only.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,134 @@
あそこ
あっ
あの
あのかた
あの人
あり
あります
ある
あれ
いう
います
いる
うち
および
おり
おります
かつて
から
ここ
こちら
こと
この
これ
これら
さらに
しかし
する
せる
そこ
そして
その
その他
その後
それ
それぞれ
それで
ただし
たち
ため
たり
だっ
だれ
でき
できる
です
では
でも
という
といった
とき
ところ
として
とともに
とも
と共に
どこ
どの
ない
なお
なかっ
ながら
なく
なっ
など
なに
なら
なり
なる
なん
において
における
について
にて
によって
により
による
に対して
に対する
に関する
ので
のみ
ほか
ほとんど
ほど
ます
また
または
まで
もの
ものの
よう
より
られ
られる
れる
及び
彼女
我々
特に
私達
貴方
貴方方

View File

@@ -0,0 +1,794 @@
一个
一些
一何
一切
一则
一方面
一旦
一来
一样
一种
一般
一转眼
万一
上下
不仅
不但
不光
不单
不只
不外乎
不如
不妨
不尽
不尽然
不得
不怕
不惟
不成
不拘
不料
不是
不比
不然
不特
不独
不管
不至于
不若
不论
不过
不问
与其
与其说
与否
与此同时
且不说
且说
两者
个别
为了
为什么
为何
为止
为此
为着
乃至
乃至于
之一
之所以
之类
乌乎
也好
也罢
二来
于是
于是乎
云云
云尔
人们
人家
什么
什么样
介于
仍旧
从此
从而
他人
他们
他们们
以上
以为
以便
以免
以及
以故
以期
以来
以至
以至于
以致
任何
任凭
似的
但凡
但是
何以
何况
何处
何时
余外
作为
你们
使
使得
例如
依据
依照
便于
俺们
倘使
倘或
倘然
倘若
借傥然
假使
假如
假若
先不先
光是
全体
全部
关于
关于具体地说
其一
其中
其二
其他
其余
其它
其次
具体地说
具体说来
兼之
再其次
再则
再有
再者
再者说
再说
况且
几时
凡是
凭借
出于
出来
分别
则甚
别人
别处
别是
别的
别管
别说
前后
前此
前者
加之
加以
即令
即使
即便
即如
即或
即若
又及
及其
及至
反之
反而
反过来
反过来说
受到
另一方面
另外
另悉
只当
只怕
只是
只有
只消
只要
只限
叮咚
可以
可是
可见
各个
各位
各种
各自
同时
后者
向使
向着
否则
吧哒
呜呼
呵呵
呼哧
咱们
哈哈
哎呀
哎哟
哪个
哪些
哪儿
哪天
哪年
哪怕
哪样
哪边
哪里
哼唷
唯有
啪达
啷当
喔唷
嗡嗡
嘎登
嘿嘿
因为
因了
因此
因着
因而
固然
在下
在于
基于
处在
多么
多少
大家
她们
如上
如上所述
如下
如何
如其
如同
如是
如果
如此
如若
始而
孰料
孰知
宁可
宁愿
宁肯
它们
对于
对待
对方
对比
尔后
尔尔
尚且
就是
就是了
就是说
就算
就要
尽管
尽管如此
岂但
已矣
巴巴
并且
庶乎
庶几
开外
开始
归齐
当地
当然
当着
彼时
彼此
得了
怎么
怎么办
怎么样
怎奈
怎样
总之
总的来看
总的来说
总的说来
总而言之
恰恰相反
惟其
慢说
我们
或则
或是
或曰
或者
截至
所以
所在
所幸
所有
才能
打从
抑或
按照
换句话说
换言之
据此
接着
故此
故而
旁人
无宁
无论
既往
既是
既然
时候
是以
是的
替代
有些
有关
有及
有时
有的
朝着
本人
本地
本着
本身
来着
来自
来说
极了
果然
果真
某个
某些
某某
根据
正值
正如
正巧
正是
此地
此处
此外
此时
此次
此间
毋宁
每当
比及
比如
比方
没奈何
沿
沿着
漫说
然则
然后
然而
照着
犹且
犹自
甚且
甚么
甚或
甚而
甚至
甚至于
用来
由于
由是
由此
由此可见
的确
的话
直到
相对而言
省得
眨眼
着呢
矣乎
矣哉
竟而
等到
等等
简言之
类如
紧接着
纵令
纵使
纵然
经过
结果
继之
继后
继而
综上所述
罢了
而且
而况
而后
而外
而已
而是
而言
能否
自个儿
自从
自各儿
自后
自家
自己
自打
自身
至于
至今
至若
般的
若夫
若是
若果
若非
莫不然
莫如
莫若
虽则
虽然
虽说
要不
要不是
要不然
要么
要是
譬喻
譬如
许多
设使
设或
设若
诚如
诚然
说来
诸位
诸如
谁人
谁料
谁知
贼死
赖以
起见
趁着
越是
较之
还是
还有
还要
这一来
这个
这么
这么些
这么样
这么点儿
这些
这会儿
这儿
这就是说
这时
这样
这次
这般
这边
这里
进而
连同
逐步
通过
遵循
遵照
那个
那么
那么些
那么样
那些
那会儿
那儿
那时
那样
那般
那边
那里
鄙人
鉴于
针对
除了
除外
除开
除此之外
除非
随后
随时
随着
难道说
非但
非徒
非特
非独
顺着
首先
︿

View File

@@ -0,0 +1,9 @@
// Domain-specific stopword tuning hooks for the story-summary tokenizer.
// This module is intentionally minimal: only list words that prove to be
// repeatedly noisy (or repeatedly valuable) in real-world logs.

// Words that must always survive stopword filtering, on top of the entity
// names that are already protected dynamically at runtime. Ships empty so
// the plugin carries no hard-coded vocabulary bias by default.
export const KEEP_WORDS = [];

// Extra stopwords layered on top of BASE_STOP_WORDS (the stopwords-iso
// snapshot). Ships empty; add entries only for confirmed domain noise.
export const DOMAIN_STOP_WORDS = [];

View File

@@ -18,6 +18,8 @@
import { extensionFolderPath } from '../../../../core/constants.js'; import { extensionFolderPath } from '../../../../core/constants.js';
import { xbLog } from '../../../../core/debug-core.js'; import { xbLog } from '../../../../core/debug-core.js';
import { BASE_STOP_WORDS } from './stopwords-base.js';
import { DOMAIN_STOP_WORDS, KEEP_WORDS } from './stopwords-patch.js';
const MODULE_ID = 'tokenizer'; const MODULE_ID = 'tokenizer';
@@ -61,44 +63,30 @@ let entityList = [];
/** @type {Set<string>} 已注入结巴的实体(避免重复 add_word */ /** @type {Set<string>} 已注入结巴的实体(避免重复 add_word */
let injectedEntities = new Set(); let injectedEntities = new Set();
let entityKeepSet = new Set();
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// 停用词 // 停用词
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
const STOP_WORDS = new Set([ const STATIC_KEEP_WORDS = new Set((KEEP_WORDS || [])
// 中文高频虚词 .map(w => String(w || '').trim().toLowerCase())
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', .filter(Boolean));
'都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
'你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她', // Standard source only: stopwords-iso snapshot + small domain patch.
'它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦', const EFFECTIVE_STOP_WORDS = new Set(
'嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛', [...BASE_STOP_WORDS, ...DOMAIN_STOP_WORDS]
'把', '被', '让', '给', '从', '', '对', '跟', '比', '但', .map(w => String(w || '').trim().toLowerCase())
'而', '或', '如果', '因为', '所以', '虽然', '但是', '然后', .filter(Boolean),
'可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里', );
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们', function shouldKeepTokenByWhitelist(token) {
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去', const t = String(token || '').trim().toLowerCase();
// 日语常见虚词≥2字匹配 TinySegmenter 产出粒度) if (!t) return false;
'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある', if (STATIC_KEEP_WORDS.has(t)) return true;
'なる', 'れる', 'られ', 'られる', if (entityKeepSet.has(t)) return true;
'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ', return false;
'これ', 'それ', 'あれ', 'どれ', }
'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
// 英文常见停用词
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
'would', 'could', 'should', 'may', 'might', 'can', 'shall',
'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
'both', 'few', 'more', 'most', 'other', 'some', 'such',
'only', 'own', 'same', 'just', 'very', 'also', 'about',
]);
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// Unicode 分类 // Unicode 分类
@@ -571,6 +559,7 @@ export function getState() {
export function injectEntities(lexicon, displayMap) { export function injectEntities(lexicon, displayMap) {
if (!lexicon?.size) { if (!lexicon?.size) {
entityList = []; entityList = [];
entityKeepSet = new Set();
return; return;
} }
@@ -586,6 +575,7 @@ export function injectEntities(lexicon, displayMap) {
// 按长度降序(最长匹配优先) // 按长度降序(最长匹配优先)
entities.sort((a, b) => b.length - a.length); entities.sort((a, b) => b.length - a.length);
entityList = entities; entityList = entities;
entityKeepSet = new Set(entities.map(e => String(e || '').trim().toLowerCase()).filter(Boolean));
// 如果结巴已就绪,注入自定义词 // 如果结巴已就绪,注入自定义词
if (wasmState === WasmState.READY && jiebaAddWord) { if (wasmState === WasmState.READY && jiebaAddWord) {
@@ -656,7 +646,7 @@ export function tokenize(text) {
if (!cleaned) continue; if (!cleaned) continue;
if (cleaned.length < 2) continue; if (cleaned.length < 2) continue;
if (STOP_WORDS.has(cleaned)) continue; if (EFFECTIVE_STOP_WORDS.has(cleaned) && !shouldKeepTokenByWhitelist(cleaned)) continue;
if (seen.has(cleaned)) continue; if (seen.has(cleaned)) continue;
// 过滤纯标点/特殊字符 // 过滤纯标点/特殊字符
@@ -728,7 +718,7 @@ export function tokenizeForIndex(text) {
.map(t => t.trim().toLowerCase()) .map(t => t.trim().toLowerCase())
.filter(t => { .filter(t => {
if (!t || t.length < 2) return false; if (!t || t.length < 2) return false;
if (STOP_WORDS.has(t)) return false; if (EFFECTIVE_STOP_WORDS.has(t) && !shouldKeepTokenByWhitelist(t)) return false;
if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) return false; if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) return false;
return true; return true;
}); });
@@ -744,6 +734,7 @@ export function tokenizeForIndex(text) {
*/ */
export function reset() { export function reset() {
entityList = []; entityList = [];
entityKeepSet = new Set();
injectedEntities.clear(); injectedEntities.clear();
// 不重置 WASM 状态(避免重复加载) // 不重置 WASM 状态(避免重复加载)
} }