feat(recall): add diffusion stage and improve retrieval metrics
This commit is contained in:
@@ -43,18 +43,22 @@ function canNotifyRecallFail() {
|
|||||||
// 预算常量
|
// 预算常量
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const MAIN_BUDGET_MAX = 10000;
|
const SHARED_POOL_MAX = 10000;
|
||||||
const DISTANT_EVIDENCE_MAX = 2500;
|
|
||||||
const RECENT_EVIDENCE_MAX = 5000;
|
|
||||||
const TOTAL_BUDGET_MAX = 15000;
|
|
||||||
const CONSTRAINT_MAX = 2000;
|
const CONSTRAINT_MAX = 2000;
|
||||||
const ARCS_MAX = 1500;
|
const ARCS_MAX = 1500;
|
||||||
|
const EVENT_BUDGET_MAX = 5000;
|
||||||
|
const RELATED_EVENT_MAX = 1000;
|
||||||
|
const SUMMARIZED_EVIDENCE_MAX = 1500;
|
||||||
|
const UNSUMMARIZED_EVIDENCE_MAX = 5000;
|
||||||
const TOP_N_STAR = 5;
|
const TOP_N_STAR = 5;
|
||||||
|
|
||||||
|
// 邻近补挂:未被事件消费的 L0,如果距最近事件 ≤ 此值则补挂
|
||||||
|
const NEARBY_FLOOR_TOLERANCE = 2;
|
||||||
|
|
||||||
// L0 显示文本:分号拼接 vs 多行模式的阈值
|
// L0 显示文本:分号拼接 vs 多行模式的阈值
|
||||||
const L0_JOINED_MAX_LENGTH = 120;
|
const L0_JOINED_MAX_LENGTH = 120;
|
||||||
// 背景证据实体过滤旁通阈值(与事件过滤策略一致)
|
// 背景证据:无实体匹配时保留的最低相似度(与 recall.js CONFIG.EVENT_ENTITY_BYPASS_SIM 保持一致)
|
||||||
const EVIDENCE_ENTITY_BYPASS_SIM = 0.80;
|
const EVIDENCE_ENTITY_BYPASS_SIM = 0.70;
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
// 工具函数
|
// 工具函数
|
||||||
@@ -157,7 +161,7 @@ function collectL0Entities(l0) {
|
|||||||
* 背景证据是否保留(按焦点实体过滤)
|
* 背景证据是否保留(按焦点实体过滤)
|
||||||
* 规则:
|
* 规则:
|
||||||
* 1) 无焦点实体:保留
|
* 1) 无焦点实体:保留
|
||||||
* 2) similarity >= 0.80:保留(旁通)
|
* 2) similarity >= 0.70:保留(旁通)
|
||||||
* 3) who/edges 命中焦点实体:保留
|
* 3) who/edges 命中焦点实体:保留
|
||||||
* 4) 兼容旧数据:semantic 文本包含焦点实体:保留
|
* 4) 兼容旧数据:semantic 文本包含焦点实体:保留
|
||||||
* 否则过滤。
|
* 否则过滤。
|
||||||
@@ -361,7 +365,7 @@ function formatArcLine(arc) {
|
|||||||
*/
|
*/
|
||||||
/**
 * Build the display text for an L0 anchor.
 * Prefers the atom's semantic text, falling back to the raw l0.text;
 * when both are empty (or only whitespace), returns the '(未知锚点)'
 * placeholder so callers always get a non-empty string.
 * @param {object} l0 - L0 entry with an optional atom payload
 * @returns {string} non-empty display string
 */
function buildL0DisplayText(l0) {
  const atom = l0.atom || {};
  return String(atom.semantic || l0.text || '').trim() || '(未知锚点)';
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -474,15 +478,15 @@ function buildEvidenceGroup(floor, l0AtomsForFloor, l1ByFloor) {
|
|||||||
* 格式化一个证据组为文本行数组
|
* 格式化一个证据组为文本行数组
|
||||||
*
|
*
|
||||||
* 短行模式(拼接后 ≤ 120 字):
|
* 短行模式(拼接后 ≤ 120 字):
|
||||||
* › #500 [📌] 薇薇保持跪趴姿势;薇薇展示细节;薇薇与蓝袖之间:被审视
|
* › #500 [📌] 小林整理会议记录;小周补充行动项;两人确认下周安排
|
||||||
* ┌ #499 [蓝袖] ...
|
* ┌ #499 [小周] ...
|
||||||
* › #500 [角色] ...
|
* › #500 [角色] ...
|
||||||
*
|
*
|
||||||
* 长行模式(拼接后 > 120 字):
|
* 长行模式(拼接后 > 120 字):
|
||||||
* › #500 [📌] 薇薇保持跪趴姿势 在书房
|
* › #500 [📌] 小林在图书馆归档旧资料
|
||||||
* │ 薇薇展示肛周细节 在书房
|
* │ 小周核对目录并修正编号
|
||||||
* │ 薇薇与蓝袖之间:身体被审视 在书房
|
* │ 两人讨论借阅规则并更新说明
|
||||||
* ┌ #499 [蓝袖] ...
|
* ┌ #499 [小周] ...
|
||||||
* › #500 [角色] ...
|
* › #500 [角色] ...
|
||||||
*
|
*
|
||||||
* @param {EvidenceGroup} group - 证据组
|
* @param {EvidenceGroup} group - 证据组
|
||||||
@@ -705,7 +709,7 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
const T_Start = performance.now();
|
const T_Start = performance.now();
|
||||||
|
|
||||||
const data = store.json || {};
|
const data = store.json || {};
|
||||||
const total = { used: 0, max: MAIN_BUDGET_MAX };
|
const total = { used: 0, max: SHARED_POOL_MAX };
|
||||||
|
|
||||||
// 从 recallResult 解构
|
// 从 recallResult 解构
|
||||||
const l0Selected = recallResult?.l0Selected || [];
|
const l0Selected = recallResult?.l0Selected || [];
|
||||||
@@ -723,7 +727,7 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
|
|
||||||
// 注入统计
|
// 注入统计
|
||||||
const injectionStats = {
|
const injectionStats = {
|
||||||
budget: { max: TOTAL_BUDGET_MAX, used: 0 },
|
budget: { max: SHARED_POOL_MAX + UNSUMMARIZED_EVIDENCE_MAX, used: 0 },
|
||||||
constraint: { count: 0, tokens: 0, filtered: 0 },
|
constraint: { count: 0, tokens: 0, filtered: 0 },
|
||||||
arc: { count: 0, tokens: 0 },
|
arc: { count: 0, tokens: 0 },
|
||||||
event: { selected: 0, tokens: 0 },
|
event: { selected: 0, tokens: 0 },
|
||||||
@@ -817,6 +821,8 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
const eventHits = (recallResult?.events || []).filter(e => e?.event?.summary);
|
const eventHits = (recallResult?.events || []).filter(e => e?.event?.summary);
|
||||||
|
|
||||||
const candidates = [...eventHits].sort((a, b) => (b.similarity || 0) - (a.similarity || 0));
|
const candidates = [...eventHits].sort((a, b) => (b.similarity || 0) - (a.similarity || 0));
|
||||||
|
const eventBudget = { used: 0, max: Math.min(EVENT_BUDGET_MAX, total.max - total.used) };
|
||||||
|
const relatedBudget = { used: 0, max: RELATED_EVENT_MAX };
|
||||||
|
|
||||||
const selectedDirect = [];
|
const selectedDirect = [];
|
||||||
const selectedRelated = [];
|
const selectedRelated = [];
|
||||||
@@ -825,8 +831,10 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
const e = candidates[candidateRank];
|
const e = candidates[candidateRank];
|
||||||
|
|
||||||
if (total.used >= total.max) break;
|
if (total.used >= total.max) break;
|
||||||
|
if (eventBudget.used >= eventBudget.max) break;
|
||||||
|
|
||||||
const isDirect = e._recallType === "DIRECT";
|
const isDirect = e._recallType === "DIRECT";
|
||||||
|
if (!isDirect && relatedBudget.used >= relatedBudget.max) continue;
|
||||||
|
|
||||||
// 收集该事件范围内的 EvidenceGroup(per-floor)
|
// 收集该事件范围内的 EvidenceGroup(per-floor)
|
||||||
const evidenceGroups = collectEvidenceGroupsForEvent(e.event, l0Selected, l1ByFloor, usedL0Ids);
|
const evidenceGroups = collectEvidenceGroupsForEvent(e.event, l0Selected, l1ByFloor, usedL0Ids);
|
||||||
@@ -873,6 +881,8 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
injectionStats.event.selected++;
|
injectionStats.event.selected++;
|
||||||
injectionStats.event.tokens += costNoEvidence;
|
injectionStats.event.tokens += costNoEvidence;
|
||||||
total.used += costNoEvidence;
|
total.used += costNoEvidence;
|
||||||
|
eventBudget.used += costNoEvidence;
|
||||||
|
if (!isDirect) relatedBudget.used += costNoEvidence;
|
||||||
|
|
||||||
eventDetails.list.push({
|
eventDetails.list.push({
|
||||||
title: e.event?.title || e.event?.id,
|
title: e.event?.title || e.event?.id,
|
||||||
@@ -912,6 +922,8 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
injectionStats.evidence.l0InEvents += l0Count;
|
injectionStats.evidence.l0InEvents += l0Count;
|
||||||
injectionStats.evidence.l1InEvents += l1FloorCount;
|
injectionStats.evidence.l1InEvents += l1FloorCount;
|
||||||
total.used += cost;
|
total.used += cost;
|
||||||
|
eventBudget.used += cost;
|
||||||
|
if (!isDirect) relatedBudget.used += cost;
|
||||||
|
|
||||||
eventDetails.list.push({
|
eventDetails.list.push({
|
||||||
title: e.event?.title || e.event?.id,
|
title: e.event?.title || e.event?.id,
|
||||||
@@ -928,6 +940,73 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
selectedDirect.sort((a, b) => getEventSortKey(a.event) - getEventSortKey(b.event));
|
selectedDirect.sort((a, b) => getEventSortKey(a.event) - getEventSortKey(b.event));
|
||||||
selectedRelated.sort((a, b) => getEventSortKey(a.event) - getEventSortKey(b.event));
|
selectedRelated.sort((a, b) => getEventSortKey(a.event) - getEventSortKey(b.event));
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
|
// 邻近补挂:未被事件消费的 L0,距最近已选事件 ≤ 2 楼则补挂
|
||||||
|
// 每个 L0 只挂最近的一个事件,不扩展事件范围,不产生重叠
|
||||||
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
const allSelectedItems = [...selectedDirect, ...selectedRelated];
|
||||||
|
const nearbyByItem = new Map();
|
||||||
|
|
||||||
|
for (const l0 of l0Selected) {
|
||||||
|
if (usedL0Ids.has(l0.id)) continue;
|
||||||
|
|
||||||
|
let bestItem = null;
|
||||||
|
let bestDistance = Infinity;
|
||||||
|
|
||||||
|
for (const item of allSelectedItems) {
|
||||||
|
const range = parseFloorRange(item.event?.summary);
|
||||||
|
if (!range) continue;
|
||||||
|
|
||||||
|
let distance;
|
||||||
|
if (l0.floor < range.start) distance = range.start - l0.floor;
|
||||||
|
else if (l0.floor > range.end) distance = l0.floor - range.end;
|
||||||
|
else continue;
|
||||||
|
|
||||||
|
if (distance <= NEARBY_FLOOR_TOLERANCE && distance <= bestDistance) {
|
||||||
|
bestDistance = distance;
|
||||||
|
bestItem = item;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bestItem) {
|
||||||
|
if (!nearbyByItem.has(bestItem)) nearbyByItem.set(bestItem, []);
|
||||||
|
nearbyByItem.get(bestItem).push(l0);
|
||||||
|
usedL0Ids.add(l0.id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const [item, nearbyL0s] of nearbyByItem) {
|
||||||
|
const floorMap = groupL0ByFloor(nearbyL0s);
|
||||||
|
|
||||||
|
for (const [floor, l0s] of floorMap) {
|
||||||
|
const group = buildEvidenceGroup(floor, l0s, l1ByFloor);
|
||||||
|
item.evidenceGroups.push(group);
|
||||||
|
}
|
||||||
|
|
||||||
|
item.evidenceGroups.sort((a, b) => a.floor - b.floor);
|
||||||
|
|
||||||
|
const newText = formatEventWithEvidence(item.event, 0, item.evidenceGroups, causalById);
|
||||||
|
const newTokens = estimateTokens(newText);
|
||||||
|
const delta = newTokens - item.tokens;
|
||||||
|
|
||||||
|
if (total.used + delta > total.max) {
|
||||||
|
for (const l0 of nearbyL0s) usedL0Ids.delete(l0.id);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
total.used += delta;
|
||||||
|
eventBudget.used += delta;
|
||||||
|
|
||||||
|
const isDirect = selectedDirect.includes(item);
|
||||||
|
if (!isDirect) relatedBudget.used += delta;
|
||||||
|
|
||||||
|
injectionStats.evidence.l0InEvents += nearbyL0s.length;
|
||||||
|
item.text = newText;
|
||||||
|
item.tokens = newTokens;
|
||||||
|
injectionStats.event.tokens += delta;
|
||||||
|
}
|
||||||
|
|
||||||
// 重新编号 + 星标
|
// 重新编号 + 星标
|
||||||
const directEventTexts = selectedDirect.map((it, i) => {
|
const directEventTexts = selectedDirect.map((it, i) => {
|
||||||
const numbered = renumberEventText(it.text, i + 1);
|
const numbered = renumberEventText(it.text, i + 1);
|
||||||
@@ -964,7 +1043,7 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
const distantL0 = remainingL0.filter(l0 => l0.floor <= lastSummarized);
|
const distantL0 = remainingL0.filter(l0 => l0.floor <= lastSummarized);
|
||||||
|
|
||||||
if (distantL0.length && total.used < total.max) {
|
if (distantL0.length && total.used < total.max) {
|
||||||
const distantBudget = { used: 0, max: Math.min(DISTANT_EVIDENCE_MAX, total.max - total.used) };
|
const distantBudget = { used: 0, max: Math.min(SUMMARIZED_EVIDENCE_MAX, total.max - total.used) };
|
||||||
|
|
||||||
// 按楼层排序(时间顺序)后分组
|
// 按楼层排序(时间顺序)后分组
|
||||||
distantL0.sort((a, b) => a.floor - b.floor);
|
distantL0.sort((a, b) => a.floor - b.floor);
|
||||||
@@ -1006,7 +1085,7 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
.filter(l0 => l0.floor >= recentStart && l0.floor <= recentEnd);
|
.filter(l0 => l0.floor >= recentStart && l0.floor <= recentEnd);
|
||||||
|
|
||||||
if (recentL0.length) {
|
if (recentL0.length) {
|
||||||
const recentBudget = { used: 0, max: RECENT_EVIDENCE_MAX };
|
const recentBudget = { used: 0, max: UNSUMMARIZED_EVIDENCE_MAX };
|
||||||
|
|
||||||
// 按楼层排序后分组
|
// 按楼层排序后分组
|
||||||
recentL0.sort((a, b) => a.floor - b.floor);
|
recentL0.sort((a, b) => a.floor - b.floor);
|
||||||
@@ -1051,10 +1130,10 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
sections.push(`[好像有关的事] 听说过或有点模糊\n\n${assembled.relatedEvents.lines.join("\n\n")}`);
|
sections.push(`[好像有关的事] 听说过或有点模糊\n\n${assembled.relatedEvents.lines.join("\n\n")}`);
|
||||||
}
|
}
|
||||||
if (assembled.distantEvidence.lines.length) {
|
if (assembled.distantEvidence.lines.length) {
|
||||||
sections.push(`[更早以前] 记忆里残留的老画面\n${assembled.distantEvidence.lines.join("\n")}`);
|
sections.push(`[零散记忆] 没归入事件的片段\n${assembled.distantEvidence.lines.join("\n")}`);
|
||||||
}
|
}
|
||||||
if (assembled.recentEvidence.lines.length) {
|
if (assembled.recentEvidence.lines.length) {
|
||||||
sections.push(`[近期] 清晰但还没整理\n${assembled.recentEvidence.lines.join("\n")}`);
|
sections.push(`[新鲜记忆] 还没总结的部分\n${assembled.recentEvidence.lines.join("\n")}`);
|
||||||
}
|
}
|
||||||
if (assembled.arcs.lines.length) {
|
if (assembled.arcs.lines.length) {
|
||||||
sections.push(`[这些人] 他们的弧光\n${assembled.arcs.lines.join("\n")}`);
|
sections.push(`[这些人] 他们的弧光\n${assembled.arcs.lines.join("\n")}`);
|
||||||
@@ -1085,9 +1164,11 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
|
|||||||
metrics.formatting.time = Math.round(performance.now() - T_Format_Start);
|
metrics.formatting.time = Math.round(performance.now() - T_Format_Start);
|
||||||
metrics.timing.formatting = metrics.formatting.time;
|
metrics.timing.formatting = metrics.formatting.time;
|
||||||
|
|
||||||
metrics.budget.total = total.used + (assembled.recentEvidence.tokens || 0);
|
const effectiveTotal = total.used + (assembled.recentEvidence.tokens || 0);
|
||||||
metrics.budget.limit = TOTAL_BUDGET_MAX;
|
const effectiveLimit = SHARED_POOL_MAX + UNSUMMARIZED_EVIDENCE_MAX;
|
||||||
metrics.budget.utilization = Math.round(metrics.budget.total / TOTAL_BUDGET_MAX * 100);
|
metrics.budget.total = effectiveTotal;
|
||||||
|
metrics.budget.limit = effectiveLimit;
|
||||||
|
metrics.budget.utilization = Math.round(effectiveTotal / effectiveLimit * 100);
|
||||||
metrics.budget.breakdown = {
|
metrics.budget.breakdown = {
|
||||||
constraints: assembled.constraints.tokens,
|
constraints: assembled.constraints.tokens,
|
||||||
events: injectionStats.event.tokens,
|
events: injectionStats.event.tokens,
|
||||||
|
|||||||
@@ -199,9 +199,6 @@ function anchorToAtom(anchor, aiFloor, idx) {
|
|||||||
// ═══ 检索层(embedding 的唯一入口) ═══
|
// ═══ 检索层(embedding 的唯一入口) ═══
|
||||||
semantic: scene,
|
semantic: scene,
|
||||||
|
|
||||||
// ═══ 场景数据 ═══
|
|
||||||
scene,
|
|
||||||
|
|
||||||
// ═══ 图结构层(扩散的 key) ═══
|
// ═══ 图结构层(扩散的 key) ═══
|
||||||
who,
|
who,
|
||||||
edges,
|
edges,
|
||||||
|
|||||||
776
modules/story-summary/vector/retrieval/diffusion.js
Normal file
776
modules/story-summary/vector/retrieval/diffusion.js
Normal file
@@ -0,0 +1,776 @@
|
|||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// diffusion.js - PPR Graph Diffusion (Personalized PageRank)
|
||||||
|
//
|
||||||
|
// Spreads activation from seed L0 atoms through entity co-occurrence graph
|
||||||
|
// to discover narratively-connected but semantically-distant memories.
|
||||||
|
//
|
||||||
|
// Pipeline position: recall.js Stage 7.5
|
||||||
|
// Input: seeds (reranked L0 from Stage 6)
|
||||||
|
// Output: additional L0 atoms → merged into l0Selected
|
||||||
|
//
|
||||||
|
// Algorithm:
|
||||||
|
// 1. Build undirected weighted graph over all L0 atoms
|
||||||
|
// Four channels: WHO/WHAT/WHERE/HOW (Jaccard/Overlap/ExactMatch)
|
||||||
|
// 2. Personalized PageRank (Power Iteration)
|
||||||
|
// Seeds weighted by rerankScore — Haveliwala (2002) topic-sensitive variant
|
||||||
|
// α = 0.15 restart probability — Page et al. (1998)
|
||||||
|
// 3. Post-verification (Dense Cosine Gate)
|
||||||
|
// Exclude seeds, cosine ≥ 0.45, final = PPR_norm × cosine ≥ 0.10
|
||||||
|
//
|
||||||
|
// References:
|
||||||
|
// Page et al. "The PageRank Citation Ranking" (1998)
|
||||||
|
// Haveliwala "Topic-Sensitive PageRank" (IEEE TKDE 2003)
|
||||||
|
// Langville & Meyer "Eigenvector Methods for Web IR" (SIAM Review 2005)
|
||||||
|
// Sun et al. "GraftNet" (EMNLP 2018)
|
||||||
|
// Jaccard "Étude comparative de la distribution florale" (1912)
|
||||||
|
// Szymkiewicz "Une contribution statistique" (1934) — Overlap coefficient
|
||||||
|
// Rimmon-Kenan "Narrative Fiction" (2002) — Channel weight rationale
|
||||||
|
//
|
||||||
|
// Core PPR iteration aligned with NetworkX pagerank():
|
||||||
|
// github.com/networkx/networkx — algorithms/link_analysis/pagerank_alg.py
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
import { xbLog } from '../../../../core/debug-core.js';
|
||||||
|
|
||||||
|
const MODULE_ID = 'diffusion';
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// Configuration
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
const CONFIG = {
  // ── Personalized PageRank (Page et al. 1998; GraftNet 2018 uses same values)
  ALPHA: 0.15,      // restart (teleport) probability
  EPSILON: 1e-6,    // L1 convergence threshold
  MAX_ITER: 50,     // hard iteration cap (typically converges in 15-25)

  // ── Edge-weight channel coefficients
  // Rationale: Rimmon-Kenan (2002) hierarchy: characters > events > setting > themes
  GAMMA: {
    who: 0.50,      // entity co-occurrence — Jaccard
    what: 0.25,     // directed pair overlap — Szymkiewicz-Simpson
    where: 0.15,    // location exact match — binary
    how: 0.10,      // dynamics tag co-occurrence — Jaccard
  },

  // ── Post-verification (dense cosine gate)
  COSINE_GATE: 0.45,   // min cosine(queryVector, stateVector)
  SCORE_FLOOR: 0.10,   // min finalScore = PPR_normalized × cosine
  DIFFUSION_CAP: 60,   // max diffused nodes (excluding seeds)
};
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// Utility functions
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
 * Normalize a string for graph-key comparison (Unicode-safe).
 * Applies NFKC folding, strips zero-width characters (ZWSP/ZWNJ/ZWJ/BOM),
 * trims surrounding whitespace, and lowercases.
 * Mirrors the normalization used by recall.js / entity-lexicon.js.
 * @param {*} s - any value; falsy values become ''
 * @returns {string} normalized string
 */
function normalize(s) {
  let text = String(s || '');
  text = text.normalize('NFKC');
  text = text.replace(/[\u200B-\u200D\uFEFF]/g, '');
  return text.trim().toLowerCase();
}
|
||||||
|
|
||||||
|
/**
 * Cosine similarity of two equal-length numeric vectors.
 * @param {number[]|Float64Array} a
 * @param {number[]|Float64Array} b
 * @returns {number} similarity; 0 when either vector is missing/empty,
 *   when lengths differ, or when either norm is zero
 */
function cosineSimilarity(a, b) {
  const len = a?.length ?? 0;
  if (len === 0 || (b?.length ?? 0) !== len) return 0;

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let k = 0; k < len; k++) {
    dot += a[k] * b[k];
    normA += a[k] * a[k];
    normB += b[k] * b[k];
  }

  if (!normA || !normB) return 0;
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// Feature extraction from L0 atoms
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
 * WHO channel: collect the atom's entity names — the `who` list plus both
 * endpoints (s, t) of every interaction edge — as a normalized Set.
 * Values that normalize to '' are dropped.
 * @param {object} atom
 * @returns {Set<string>}
 */
function extractEntities(atom) {
  const entities = new Set();
  const add = (value) => {
    const key = normalize(value);
    if (key) entities.add(key);
  };

  for (const name of (atom.who || [])) add(name);
  for (const edge of (atom.edges || [])) {
    add(edge?.s);
    add(edge?.t);
  }
  return entities;
}
|
||||||
|
|
||||||
|
/**
 * WHAT channel: directed interaction pairs encoded as "s→t" strings
 * (strict direction — A→B and B→A are distinct; option A).
 * Edges missing either endpoint after normalization are skipped.
 * @param {object} atom
 * @returns {Set<string>}
 */
function extractDirectedPairs(atom) {
  const pairs = new Set();
  for (const edge of (atom.edges || [])) {
    const source = normalize(edge?.s);
    const target = normalize(edge?.t);
    if (!source || !target) continue;
    pairs.add(`${source}\u2192${target}`);
  }
  return pairs;
}
|
||||||
|
|
||||||
|
/**
 * WHERE channel: the atom's location, normalized.
 * @param {object} atom
 * @returns {string} normalized location; '' when absent
 */
function extractLocation(atom) {
  const { where } = atom;
  return normalize(where);
}
|
||||||
|
|
||||||
|
/**
 * HOW channel: the atom's dynamics tags as a normalized Set.
 * Tags that normalize to '' are dropped.
 * @param {object} atom
 * @returns {Set<string>}
 */
function extractDynamics(atom) {
  const tags = (atom.dynamics || [])
    .map((tag) => normalize(tag))
    .filter(Boolean);
  return new Set(tags);
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// Set similarity functions
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
 * Jaccard index |A∩B| / |A∪B| (Jaccard 1912).
 * Iterates the smaller set to count the intersection.
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} 0..1; 0 when either set is empty
 */
function jaccard(a, b) {
  if (a.size === 0 || b.size === 0) return 0;

  const small = a.size <= b.size ? a : b;
  const large = small === a ? b : a;

  let shared = 0;
  for (const member of small) {
    if (large.has(member)) shared += 1;
  }

  const unionSize = a.size + b.size - shared;
  return unionSize > 0 ? shared / unionSize : 0;
}
|
||||||
|
|
||||||
|
/**
 * Overlap (Szymkiewicz–Simpson 1934) coefficient |A∩B| / min(|A|,|B|).
 * Preferred over Jaccard for the very small directed-pair sets (1-3),
 * where Jaccard over-penalizes size asymmetry.
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} 0..1; 0 when either set is empty
 */
function overlapCoefficient(a, b) {
  if (a.size === 0 || b.size === 0) return 0;

  const small = a.size <= b.size ? a : b;
  const large = small === a ? b : a;

  let shared = 0;
  for (const member of small) {
    if (large.has(member)) shared += 1;
  }

  return shared / small.size;
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// Graph construction
|
||||||
|
//
|
||||||
|
// Candidate pairs discovered via inverted indices on entities and locations.
|
||||||
|
// Dynamics-only pairs excluded from candidate generation (γ_HOW = 0.10 is
|
||||||
|
// too weak to justify O(N²) blowup from 8-tag combinatorics).
|
||||||
|
// All four channels evaluated for every candidate pair.
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
 * Extract all four channel features (WHO/WHAT/WHERE/HOW) for every atom.
 * @param {object[]} allAtoms
 * @returns {object[]} one record per atom:
 *   { entities: Set, directedPairs: Set, location: string, dynamics: Set }
 */
function extractAllFeatures(allAtoms) {
  const records = [];
  for (const atom of allAtoms) {
    records.push({
      entities: extractEntities(atom),
      directedPairs: extractDirectedPairs(atom),
      location: extractLocation(atom),
      dynamics: extractDynamics(atom),
    });
  }
  return records;
}
|
||||||
|
|
||||||
|
/**
 * Build inverted indices mapping feature values to the atoms containing them.
 * Empty locations are not indexed.
 * @param {object[]} features - per-atom feature records
 * @returns {{ entityIndex: Map<string, number[]>, locationIndex: Map<string, number[]> }}
 */
function buildInvertedIndices(features) {
  const entityIndex = new Map();
  const locationIndex = new Map();

  // Append atomIdx under key, creating the bucket on first use.
  const append = (index, key, atomIdx) => {
    const bucket = index.get(key);
    if (bucket) bucket.push(atomIdx);
    else index.set(key, [atomIdx]);
  };

  features.forEach((feat, atomIdx) => {
    for (const entity of feat.entities) append(entityIndex, entity, atomIdx);
    if (feat.location) append(locationIndex, feat.location, atomIdx);
  });

  return { entityIndex, locationIndex };
}
|
||||||
|
|
||||||
|
/**
 * Emit all unordered candidate pairs co-listed under any index value.
 * Each pair is packed as lo * N + hi (with lo < hi) so the shared Set
 * dedupes pairs discovered through multiple values/indices.
 * @param {Map<string, number[]>} index - value → atom indices
 * @param {Set<number>} pairSet - packed-pair accumulator (mutated)
 * @param {number} N - total atom count (packing base)
 */
function collectPairsFromIndex(index, pairSet, N) {
  for (const bucket of index.values()) {
    const len = bucket.length;
    for (let x = 0; x < len; x++) {
      for (let y = x + 1; y < len; y++) {
        const first = bucket[x];
        const second = bucket[y];
        const lo = first < second ? first : second;
        const hi = first < second ? second : first;
        pairSet.add(lo * N + hi);
      }
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Build the weighted undirected co-occurrence graph over L0 atoms.
 *
 * Candidate pairs come from the entity and location inverted indices
 * (atoms sharing ≥1 entity or the same location). Dynamics-only pairs are
 * deliberately excluded from candidate generation — γ_how is too weak to
 * justify the O(N²) blowup — but all four channels are scored for every
 * candidate pair and combined with the CONFIG.GAMMA coefficients.
 *
 * @param {object[]} allAtoms
 * @returns {{ neighbors: object[][], edgeCount: number, channelStats: object, buildTime: number }}
 *   neighbors[i] = [{ target, weight }, ...] (symmetric adjacency lists)
 */
function buildGraph(allAtoms) {
  const startedAt = performance.now();
  const N = allAtoms.length;

  const features = extractAllFeatures(allAtoms);
  const { entityIndex, locationIndex } = buildInvertedIndices(features);

  // Candidate generation: pairs sharing an entity or a location.
  const candidatePairs = new Set();
  collectPairsFromIndex(entityIndex, candidatePairs, N);
  collectPairsFromIndex(locationIndex, candidatePairs, N);

  // Score all four channels for each candidate pair.
  const neighbors = Array.from({ length: N }, () => []);
  const channelStats = { who: 0, what: 0, where: 0, how: 0 };
  let edgeCount = 0;

  for (const packed of candidatePairs) {
    // Unpack lo * N + hi back into the two atom indices.
    const i = Math.floor(packed / N);
    const j = packed % N;

    const fi = features[i];
    const fj = features[j];

    const wWho = jaccard(fi.entities, fj.entities);
    const wWhat = overlapCoefficient(fi.directedPairs, fj.directedPairs);
    const wWhere = (fi.location && fi.location === fj.location) ? 1.0 : 0.0;
    const wHow = jaccard(fi.dynamics, fj.dynamics);

    const weight =
      CONFIG.GAMMA.who * wWho +
      CONFIG.GAMMA.what * wWhat +
      CONFIG.GAMMA.where * wWhere +
      CONFIG.GAMMA.how * wHow;

    if (weight > 0) {
      neighbors[i].push({ target: j, weight });
      neighbors[j].push({ target: i, weight });
      edgeCount++;

      if (wWho > 0) channelStats.who++;
      if (wWhat > 0) channelStats.what++;
      if (wWhere > 0) channelStats.where++;
      if (wHow > 0) channelStats.how++;
    }
  }

  const buildTime = Math.round(performance.now() - startedAt);

  xbLog.info(MODULE_ID,
    `Graph: ${N} nodes, ${edgeCount} edges ` +
    `(who=${channelStats.who} what=${channelStats.what} ` +
    `where=${channelStats.where} how=${channelStats.how}) ` +
    `(${buildTime}ms)`
  );

  return { neighbors, edgeCount, channelStats, buildTime };
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// PPR: Seed vector construction
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
 * Build the PPR personalization vector from seeds, weighted by rerankScore
 * (falling back to similarity). Seeds whose atomId is absent from idToIdx
 * are ignored; negative scores are clamped to 0. The result is
 * L1-normalized to a probability distribution (all zeros when no seed
 * carries positive mass).
 * Haveliwala (2002): non-uniform personalization improves topic sensitivity.
 *
 * @param {object[]} seeds - seed L0 entries with atomId and rerankScore
 * @param {Map<string, number>} idToIdx - atomId → array index
 * @param {number} N - total node count
 * @returns {Float64Array} personalization vector (sums to 1, or all zeros)
 */
function buildSeedVector(seeds, idToIdx, N) {
  const vec = new Float64Array(N);
  let mass = 0;

  for (const seed of seeds) {
    const nodeIdx = idToIdx.get(seed.atomId);
    if (nodeIdx == null) continue;

    const weight = Math.max(0, seed.rerankScore || seed.similarity || 0);
    vec[nodeIdx] += weight;
    mass += weight;
  }

  // L1 normalize to a probability distribution.
  if (mass > 0) {
    for (let i = 0; i < N; i++) vec[i] /= mass;
  }

  return vec;
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// PPR: Column normalization + dangling node detection
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
 * Column-normalize the adjacency lists into a column-stochastic transition
 * structure: columns[j] holds { target, prob } entries whose probs sum to 1.
 * Nodes whose outgoing weights sum to ≤ 0 are reported as dangling and keep
 * an empty column; powerIteration redistributes their mass onto the
 * personalization vector (Langville & Meyer 2005, §4.1).
 *
 * @param {object[][]} neighbors - neighbors[j] = [{ target, weight }, ...]
 * @param {number} N
 * @returns {{ columns: object[][], dangling: number[] }}
 */
function columnNormalize(neighbors, N) {
  const columns = Array.from({ length: N }, () => []);
  const dangling = [];

  for (let node = 0; node < N; node++) {
    const outEdges = neighbors[node];
    const totalWeight = outEdges.reduce((acc, edge) => acc + edge.weight, 0);

    if (totalWeight <= 0) {
      dangling.push(node);
      continue;
    }

    for (const edge of outEdges) {
      columns[node].push({ target: edge.target, prob: edge.weight / totalWeight });
    }
  }

  return { columns, dangling };
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// PPR: Power Iteration
|
||||||
|
//
|
||||||
|
// Aligned with NetworkX pagerank() (pagerank_alg.py):
|
||||||
|
//
|
||||||
|
// NetworkX "alpha" = damping = our (1 − α)
|
||||||
|
// NetworkX "1-alpha" = teleportation = our α
|
||||||
|
//
|
||||||
|
// Per iteration:
|
||||||
|
// π_new[i] = α·s[i] + (1−α)·( Σ_j W_{ij}·π[j] + dangling_sum·s[i] )
|
||||||
|
//
|
||||||
|
// Convergence: Perron-Frobenius theorem guarantees unique stationary
|
||||||
|
// distribution for irreducible aperiodic column-stochastic matrix.
|
||||||
|
// Rate: ‖π^(t+1) − π^t‖₁ ≤ (1−α)^t (geometric).
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run PPR Power Iteration.
|
||||||
|
*
|
||||||
|
* @param {object[][]} columns - column-normalized transition matrix
|
||||||
|
* @param {Float64Array} s - personalization vector (sums to 1)
|
||||||
|
* @param {number[]} dangling - dangling node indices
|
||||||
|
* @param {number} N - node count
|
||||||
|
* @returns {{ pi: Float64Array, iterations: number, finalError: number }}
|
||||||
|
*/
|
||||||
|
function powerIteration(columns, s, dangling, N) {
  const { ALPHA: alpha, EPSILON: epsilon, MAX_ITER: maxIter } = CONFIG;
  const damping = 1 - alpha; // probability of following an edge instead of teleporting

  // π starts at the personalization vector.
  let pi = Float64Array.from(s);

  let iterations = 0;
  let finalError = 0;

  for (let iter = 0; iter < maxIter; iter++) {
    const next = new Float64Array(N);

    // Probability mass stranded at nodes with no outgoing edges;
    // redistributed along s (Langville & Meyer 2005).
    let danglingMass = 0;
    for (const idx of dangling) danglingMass += pi[idx];

    // Sparse product: next += (1−α) · W · π
    for (let j = 0; j < N; j++) {
      if (pi[j] === 0) continue;

      const scaled = damping * pi[j];
      for (const { target, prob } of columns[j]) {
        next[target] += scaled * prob;
      }
    }

    // Restart plus dangling redistribution: (α + (1−α)·danglingMass) · s[i]
    const restart = alpha + damping * danglingMass;
    for (let i = 0; i < N; i++) {
      next[i] += restart * s[i];
    }

    // L1 distance between successive iterates drives convergence.
    let delta = 0;
    for (let i = 0; i < N; i++) {
      delta += Math.abs(next[i] - pi[i]);
    }

    pi = next;
    iterations = iter + 1;
    finalError = delta;

    if (delta < epsilon) break;
  }

  return { pi, iterations, finalError };
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// Post-verification: Dense Cosine Gate
|
||||||
|
//
|
||||||
|
// PPR measures graph-structural relevance ("same characters").
|
||||||
|
// Cosine gate measures semantic relevance ("related to current topic").
|
||||||
|
// Product combination ensures both dimensions are satisfied
|
||||||
|
// (CombMNZ — Fox & Shaw, TREC-2 1994).
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter PPR-activated nodes by semantic relevance.
|
||||||
|
*
|
||||||
|
* For each non-seed node with PPR > 0:
|
||||||
|
* 1. cosine(queryVector, stateVector) ≥ COSINE_GATE
|
||||||
|
* 2. finalScore = PPR_normalized × cosine ≥ SCORE_FLOOR
|
||||||
|
* 3. Top DIFFUSION_CAP by finalScore
|
||||||
|
*
|
||||||
|
* @param {Float64Array} pi - PPR stationary distribution
|
||||||
|
* @param {string[]} atomIds - index → atomId
|
||||||
|
* @param {Map<string, object>} atomById - atomId → atom object
|
||||||
|
* @param {Set<string>} seedAtomIds - seed atomIds (excluded from output)
|
||||||
|
* @param {Map<string, Float32Array>} vectorMap - atomId → embedding vector
|
||||||
|
* @param {Float32Array|number[]} queryVector - R2 weighted query vector
|
||||||
|
* @returns {{ diffused: object[], gateStats: object }}
|
||||||
|
*/
|
||||||
|
function postVerify(pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector) {
  const N = atomIds.length;
  const gateStats = { passed: 0, filtered: 0, noVector: 0 };

  // Normalization constant: the largest PPR mass on any non-seed node.
  let maxPPR = 0;
  for (let i = 0; i < N; i++) {
    if (pi[i] > 0 && !seedAtomIds.has(atomIds[i]) && pi[i] > maxPPR) {
      maxPPR = pi[i];
    }
  }

  if (maxPPR <= 0) {
    return { diffused: [], gateStats };
  }

  const candidates = [];

  for (let i = 0; i < N; i++) {
    const atomId = atomIds[i];

    // Seeds come back through the normal recall path; zero-mass nodes
    // were never reached by the random walk.
    if (seedAtomIds.has(atomId) || pi[i] <= 0) continue;

    // Cosine verification needs an embedding for the node.
    const vec = vectorMap.get(atomId);
    if (!vec?.length) {
      gateStats.noVector++;
      continue;
    }

    // Semantic gate: discard structurally close but off-topic nodes.
    const cos = cosineSimilarity(queryVector, vec);
    if (cos < CONFIG.COSINE_GATE) {
      gateStats.filtered++;
      continue;
    }

    // Combined score: structural relevance × semantic relevance.
    const pprNormalized = pi[i] / maxPPR;
    const finalScore = pprNormalized * cos;

    if (finalScore < CONFIG.SCORE_FLOOR) {
      gateStats.filtered++;
      continue;
    }

    gateStats.passed++;

    // Counted as passed even if the atom record is missing (nothing to inject).
    const atom = atomById.get(atomId);
    if (!atom) continue;

    candidates.push({
      atomId,
      floor: atom.floor,
      atom,
      finalScore,
      pprScore: pi[i],
      pprNormalized,
      cosine: cos,
    });
  }

  // Keep only the strongest DIFFUSION_CAP candidates.
  candidates.sort((a, b) => b.finalScore - a.finalScore);
  const diffused = candidates.slice(0, CONFIG.DIFFUSION_CAP);

  return { diffused, gateStats };
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// Main entry point
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Spread activation from seed L0 atoms through entity co-occurrence graph.
|
||||||
|
*
|
||||||
|
* Called from recall.js Stage 7.5, after locateAndPullEvidence and before
|
||||||
|
* Causation Trace. Results are merged into l0Selected and consumed by
|
||||||
|
* prompt.js through existing budget/formatting pipeline (zero downstream changes).
|
||||||
|
*
|
||||||
|
* @param {object[]} seeds - l0Selected from recall Stage 6
|
||||||
|
* Each: { atomId, rerankScore, similarity, atom, ... }
|
||||||
|
* @param {object[]} allAtoms - getStateAtoms() result
|
||||||
|
* Each: { atomId, floor, semantic, who, edges, dynamics, where }
|
||||||
|
* @param {object[]} stateVectors - getAllStateVectors() result
|
||||||
|
* Each: { atomId, floor, vector: Float32Array }
|
||||||
|
* @param {Float32Array|number[]} queryVector - R2 weighted query vector
|
||||||
|
* @param {object|null} metrics - metrics object (optional, mutated in-place)
|
||||||
|
* @returns {object[]} Additional L0 atoms for l0Selected
|
||||||
|
* Each: { atomId, floor, atom, finalScore, pprScore, pprNormalized, cosine }
|
||||||
|
*/
|
||||||
|
export function diffuseFromSeeds(seeds, allAtoms, stateVectors, queryVector, metrics) {
  const T0 = performance.now();

  // ─── Early exits ─────────────────────────────────────────────────

  // Nothing to diffuse without seeds, atoms, or a query vector; still
  // write a zeroed diffusion block so metrics consumers see a consistent shape.
  if (!seeds?.length || !allAtoms?.length || !queryVector?.length) {
    fillMetricsEmpty(metrics);
    return [];
  }

  // ─── 1. Build atom index ─────────────────────────────────────────

  // Three parallel views over the same atoms: id→atom, index→id, id→index.
  const atomById = new Map();
  const atomIds = [];
  const idToIdx = new Map();

  for (let i = 0; i < allAtoms.length; i++) {
    const a = allAtoms[i];
    atomById.set(a.atomId, a);
    atomIds.push(a.atomId);
    idToIdx.set(a.atomId, i);
  }

  const N = allAtoms.length;

  // Validate seeds against atom index — seeds whose atoms are no longer
  // present in allAtoms are dropped rather than failing the run.
  const validSeeds = seeds.filter(s => idToIdx.has(s.atomId));
  const seedAtomIds = new Set(validSeeds.map(s => s.atomId));

  if (!validSeeds.length) {
    fillMetricsEmpty(metrics);
    return [];
  }

  // ─── 2. Build graph ──────────────────────────────────────────────

  const graph = buildGraph(allAtoms);

  // Edgeless graph: PPR would only redistribute mass back onto the seeds,
  // so record partial metrics and skip.
  // NOTE(review): fillMetrics ignores a `buildTime` key, so only `time`
  // carries the build duration on this path.
  if (graph.edgeCount === 0) {
    fillMetrics(metrics, {
      seedCount: validSeeds.length,
      graphNodes: N,
      graphEdges: 0,
      channelStats: graph.channelStats,
      time: graph.buildTime,
    });
    xbLog.info(MODULE_ID, 'No graph edges — skipping diffusion');
    return [];
  }

  // ─── 3. Build seed vector ────────────────────────────────────────

  // Personalization vector s over the N nodes (sums to 1).
  const s = buildSeedVector(validSeeds, idToIdx, N);

  // ─── 4. Column normalize ─────────────────────────────────────────

  const { columns, dangling } = columnNormalize(graph.neighbors, N);

  // ─── 5. PPR Power Iteration ──────────────────────────────────────

  const T_PPR = performance.now();
  const { pi, iterations, finalError } = powerIteration(columns, s, dangling, N);
  const pprTime = Math.round(performance.now() - T_PPR);

  // Count activated non-seed nodes (diagnostic only; gating happens below).
  let pprActivated = 0;
  for (let i = 0; i < N; i++) {
    if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) pprActivated++;
  }

  // ─── 6. Post-verification ────────────────────────────────────────

  // Index embeddings by atomId for the cosine gate.
  const vectorMap = new Map();
  for (const sv of (stateVectors || [])) {
    vectorMap.set(sv.atomId, sv.vector);
  }

  const { diffused, gateStats } = postVerify(
    pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector
  );

  // ─── 7. Metrics ──────────────────────────────────────────────────

  const totalTime = Math.round(performance.now() - T0);

  fillMetrics(metrics, {
    seedCount: validSeeds.length,
    graphNodes: N,
    graphEdges: graph.edgeCount,
    channelStats: graph.channelStats,
    buildTime: graph.buildTime,
    iterations,
    convergenceError: finalError,
    pprActivated,
    cosineGatePassed: gateStats.passed,
    cosineGateFiltered: gateStats.filtered,
    cosineGateNoVector: gateStats.noVector,
    finalCount: diffused.length,
    scoreDistribution: diffused.length > 0
      ? calcScoreStats(diffused.map(d => d.finalScore))
      : { min: 0, max: 0, mean: 0 },
    time: totalTime,
  });

  // One-line pipeline summary: seeds → graph → PPR → activation → gate → final.
  xbLog.info(MODULE_ID,
    `Diffusion: ${validSeeds.length} seeds → ` +
    `graph(${N}n/${graph.edgeCount}e) → ` +
    `PPR(${iterations}it, ε=${finalError.toExponential(1)}, ${pprTime}ms) → ` +
    `${pprActivated} activated → ` +
    `gate(${gateStats.passed}\u2713/${gateStats.filtered}\u2717` +
    `${gateStats.noVector ? `/${gateStats.noVector}?` : ''}) → ` +
    `${diffused.length} final (${totalTime}ms)`
  );

  return diffused;
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
// Metrics helpers
|
||||||
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute min/max/mean distribution
|
||||||
|
* @param {number[]} scores
|
||||||
|
* @returns {{ min: number, max: number, mean: number }}
|
||||||
|
*/
|
||||||
|
function calcScoreStats(scores) {
|
||||||
|
if (!scores.length) return { min: 0, max: 0, mean: 0 };
|
||||||
|
const sorted = [...scores].sort((a, b) => a - b);
|
||||||
|
const sum = sorted.reduce((a, b) => a + b, 0);
|
||||||
|
return {
|
||||||
|
min: Number(sorted[0].toFixed(3)),
|
||||||
|
max: Number(sorted[sorted.length - 1].toFixed(3)),
|
||||||
|
mean: Number((sum / sorted.length).toFixed(3)),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fill metrics with empty diffusion block
|
||||||
|
*/
|
||||||
|
/**
 * Fill metrics with an all-zero diffusion block.
 * No-op when metrics is absent.
 */
function fillMetricsEmpty(metrics) {
  if (!metrics) return;

  // Zero out every scalar counter, then attach the structured defaults.
  const zeroed = {};
  for (const key of [
    'seedCount', 'graphNodes', 'graphEdges', 'iterations',
    'convergenceError', 'pprActivated', 'cosineGatePassed',
    'cosineGateFiltered', 'cosineGateNoVector', 'finalCount',
  ]) {
    zeroed[key] = 0;
  }

  metrics.diffusion = {
    ...zeroed,
    scoreDistribution: { min: 0, max: 0, mean: 0 },
    byChannel: { who: 0, what: 0, where: 0, how: 0 },
    time: 0,
  };
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fill metrics with diffusion results
|
||||||
|
*/
|
||||||
|
/**
 * Fill metrics with diffusion results.
 * Missing numeric fields default to 0; missing distributions fall back
 * to zeroed shapes. No-op when metrics is absent.
 */
function fillMetrics(metrics, data) {
  if (!metrics) return;

  const numericFields = [
    'seedCount', 'graphNodes', 'graphEdges', 'iterations',
    'convergenceError', 'pprActivated', 'cosineGatePassed',
    'cosineGateFiltered', 'cosineGateNoVector', 'finalCount', 'time',
  ];

  const block = {};
  for (const field of numericFields) {
    block[field] = data[field] || 0;
  }

  // Structured sub-objects keep their own default shapes.
  block.scoreDistribution = data.scoreDistribution || { min: 0, max: 0, mean: 0 };
  block.byChannel = data.channelStats || { who: 0, what: 0, where: 0, how: 0 };

  metrics.diffusion = block;
}
|
||||||
@@ -8,6 +8,8 @@
|
|||||||
// 硬约束:name1 永不进入词典
|
// 硬约束:name1 永不进入词典
|
||||||
// ═══════════════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
import { getStateAtoms } from '../storage/state-store.js';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 标准化字符串(用于实体匹配)
|
* 标准化字符串(用于实体匹配)
|
||||||
* @param {string} s
|
* @param {string} s
|
||||||
@@ -69,6 +71,14 @@ export function buildEntityLexicon(store, context) {
|
|||||||
add(f.s);
|
add(f.s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 5. L0 atoms 的 who(新角色在 L2 总结前即可进入词典)
|
||||||
|
const atoms = getStateAtoms();
|
||||||
|
for (const atom of atoms) {
|
||||||
|
for (const name of (atom.who || [])) {
|
||||||
|
add(name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ★ 硬约束:删除 name1
|
// ★ 硬约束:删除 name1
|
||||||
if (context?.name1) {
|
if (context?.name1) {
|
||||||
lexicon.delete(normalize(context.name1));
|
lexicon.delete(normalize(context.name1));
|
||||||
@@ -112,6 +122,14 @@ export function buildDisplayNameMap(store, context) {
|
|||||||
if (!f.retracted) register(f.s);
|
if (!f.retracted) register(f.s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 5. L0 atoms 的 who
|
||||||
|
const atoms = getStateAtoms();
|
||||||
|
for (const atom of atoms) {
|
||||||
|
for (const name of (atom.who || [])) {
|
||||||
|
register(name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ★ 硬约束:删除 name1
|
// ★ 硬约束:删除 name1
|
||||||
if (context?.name1) {
|
if (context?.name1) {
|
||||||
map.delete(normalize(context.name1));
|
map.delete(normalize(context.name1));
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ export function createMetrics() {
|
|||||||
inStore: 0,
|
inStore: 0,
|
||||||
considered: 0,
|
considered: 0,
|
||||||
selected: 0,
|
selected: 0,
|
||||||
byRecallType: { direct: 0, related: 0, causal: 0, lexical: 0 },
|
byRecallType: { direct: 0, related: 0, causal: 0, lexical: 0, l0Linked: 0 },
|
||||||
similarityDistribution: { min: 0, max: 0, mean: 0, median: 0 },
|
similarityDistribution: { min: 0, max: 0, mean: 0, median: 0 },
|
||||||
entityFilter: null,
|
entityFilter: null,
|
||||||
causalChainDepth: 0,
|
causalChainDepth: 0,
|
||||||
@@ -112,6 +112,23 @@ export function createMetrics() {
|
|||||||
assemblyTime: 0,
|
assemblyTime: 0,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// Diffusion (PPR Spreading Activation) - 图扩散
|
||||||
|
diffusion: {
|
||||||
|
seedCount: 0,
|
||||||
|
graphNodes: 0,
|
||||||
|
graphEdges: 0,
|
||||||
|
iterations: 0,
|
||||||
|
convergenceError: 0,
|
||||||
|
pprActivated: 0,
|
||||||
|
cosineGatePassed: 0,
|
||||||
|
cosineGateFiltered: 0,
|
||||||
|
cosineGateNoVector: 0,
|
||||||
|
finalCount: 0,
|
||||||
|
scoreDistribution: { min: 0, max: 0, mean: 0 },
|
||||||
|
byChannel: { who: 0, what: 0, where: 0, how: 0 },
|
||||||
|
time: 0,
|
||||||
|
},
|
||||||
|
|
||||||
// Formatting - 格式化
|
// Formatting - 格式化
|
||||||
formatting: {
|
formatting: {
|
||||||
sectionsIncluded: [],
|
sectionsIncluded: [],
|
||||||
@@ -140,6 +157,7 @@ export function createMetrics() {
|
|||||||
evidenceRetrieval: 0,
|
evidenceRetrieval: 0,
|
||||||
evidenceRerank: 0,
|
evidenceRerank: 0,
|
||||||
evidenceAssembly: 0,
|
evidenceAssembly: 0,
|
||||||
|
diffusion: 0,
|
||||||
formatting: 0,
|
formatting: 0,
|
||||||
total: 0,
|
total: 0,
|
||||||
},
|
},
|
||||||
@@ -249,9 +267,6 @@ export function formatMetricsLog(metrics) {
|
|||||||
// Fusion (W-RRF, floor-level)
|
// Fusion (W-RRF, floor-level)
|
||||||
lines.push('[Fusion] W-RRF (floor-level) - 多路融合');
|
lines.push('[Fusion] W-RRF (floor-level) - 多路融合');
|
||||||
lines.push(`├─ dense_floors: ${m.fusion.denseFloors}`);
|
lines.push(`├─ dense_floors: ${m.fusion.denseFloors}`);
|
||||||
if (m.fusion.denseAggMethod) {
|
|
||||||
lines.push(`│ └─ aggregation: ${m.fusion.denseAggMethod}`);
|
|
||||||
}
|
|
||||||
lines.push(`├─ lex_floors: ${m.fusion.lexFloors}`);
|
lines.push(`├─ lex_floors: ${m.fusion.lexFloors}`);
|
||||||
if (m.fusion.lexDensityBonus > 0) {
|
if (m.fusion.lexDensityBonus > 0) {
|
||||||
lines.push(`│ └─ density_bonus: ${m.fusion.lexDensityBonus}`);
|
lines.push(`│ └─ density_bonus: ${m.fusion.lexDensityBonus}`);
|
||||||
@@ -291,7 +306,12 @@ export function formatMetricsLog(metrics) {
|
|||||||
lines.push(`│ ├─ direct: ${m.event.byRecallType.direct}`);
|
lines.push(`│ ├─ direct: ${m.event.byRecallType.direct}`);
|
||||||
lines.push(`│ ├─ related: ${m.event.byRecallType.related}`);
|
lines.push(`│ ├─ related: ${m.event.byRecallType.related}`);
|
||||||
lines.push(`│ ├─ causal: ${m.event.byRecallType.causal}`);
|
lines.push(`│ ├─ causal: ${m.event.byRecallType.causal}`);
|
||||||
|
if (m.event.byRecallType.l0Linked) {
|
||||||
|
lines.push(`│ ├─ lexical: ${m.event.byRecallType.lexical}`);
|
||||||
|
lines.push(`│ └─ l0_linked: ${m.event.byRecallType.l0Linked}`);
|
||||||
|
} else {
|
||||||
lines.push(`│ └─ lexical: ${m.event.byRecallType.lexical}`);
|
lines.push(`│ └─ lexical: ${m.event.byRecallType.lexical}`);
|
||||||
|
}
|
||||||
|
|
||||||
const sim = m.event.similarityDistribution;
|
const sim = m.event.similarityDistribution;
|
||||||
if (sim && sim.max > 0) {
|
if (sim && sim.max > 0) {
|
||||||
@@ -340,6 +360,32 @@ export function formatMetricsLog(metrics) {
|
|||||||
lines.push(`└─ assembly_time: ${m.evidence.assemblyTime}ms`);
|
lines.push(`└─ assembly_time: ${m.evidence.assemblyTime}ms`);
|
||||||
lines.push('');
|
lines.push('');
|
||||||
|
|
||||||
|
// Diffusion (PPR)
|
||||||
|
lines.push('[Diffusion] PPR Spreading Activation');
|
||||||
|
lines.push(`├─ seeds: ${m.diffusion.seedCount}`);
|
||||||
|
lines.push(`├─ graph: ${m.diffusion.graphNodes} nodes, ${m.diffusion.graphEdges} edges`);
|
||||||
|
if (m.diffusion.graphEdges > 0) {
|
||||||
|
const ch = m.diffusion.byChannel || {};
|
||||||
|
lines.push(`│ └─ by_channel: who=${ch.who || 0}, what=${ch.what || 0}, where=${ch.where || 0}, how=${ch.how || 0}`);
|
||||||
|
}
|
||||||
|
if (m.diffusion.iterations > 0) {
|
||||||
|
lines.push(`├─ ppr: ${m.diffusion.iterations} iterations, ε=${Number(m.diffusion.convergenceError).toExponential(1)}`);
|
||||||
|
}
|
||||||
|
lines.push(`├─ activated (excl seeds): ${m.diffusion.pprActivated}`);
|
||||||
|
if (m.diffusion.pprActivated > 0) {
|
||||||
|
lines.push(`├─ cosine_gate: ${m.diffusion.cosineGatePassed} passed, ${m.diffusion.cosineGateFiltered} filtered`);
|
||||||
|
if (m.diffusion.cosineGateNoVector > 0) {
|
||||||
|
lines.push(`│ └─ no_vector: ${m.diffusion.cosineGateNoVector}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lines.push(`├─ final_injected: ${m.diffusion.finalCount}`);
|
||||||
|
if (m.diffusion.finalCount > 0) {
|
||||||
|
const ds = m.diffusion.scoreDistribution;
|
||||||
|
lines.push(`├─ scores: min=${ds.min}, max=${ds.max}, mean=${ds.mean}`);
|
||||||
|
}
|
||||||
|
lines.push(`└─ time: ${m.diffusion.time}ms`);
|
||||||
|
lines.push('');
|
||||||
|
|
||||||
// Formatting
|
// Formatting
|
||||||
lines.push('[Formatting] 格式化');
|
lines.push('[Formatting] 格式化');
|
||||||
lines.push(`├─ sections: [${(m.formatting.sectionsIncluded || []).join(', ')}]`);
|
lines.push(`├─ sections: [${(m.formatting.sectionsIncluded || []).join(', ')}]`);
|
||||||
@@ -372,6 +418,7 @@ export function formatMetricsLog(metrics) {
|
|||||||
lines.push(`├─ evidence_retrieval: ${m.timing.evidenceRetrieval}ms`);
|
lines.push(`├─ evidence_retrieval: ${m.timing.evidenceRetrieval}ms`);
|
||||||
lines.push(`├─ floor_rerank: ${m.timing.evidenceRerank || 0}ms`);
|
lines.push(`├─ floor_rerank: ${m.timing.evidenceRerank || 0}ms`);
|
||||||
lines.push(`├─ l1_cosine: ${m.evidence.l1CosineTime}ms`);
|
lines.push(`├─ l1_cosine: ${m.evidence.l1CosineTime}ms`);
|
||||||
|
lines.push(`├─ diffusion: ${m.timing.diffusion}ms`);
|
||||||
lines.push(`├─ evidence_assembly: ${m.timing.evidenceAssembly}ms`);
|
lines.push(`├─ evidence_assembly: ${m.timing.evidenceAssembly}ms`);
|
||||||
lines.push(`├─ formatting: ${m.timing.formatting}ms`);
|
lines.push(`├─ formatting: ${m.timing.formatting}ms`);
|
||||||
lines.push(`└─ total: ${m.timing.total}ms`);
|
lines.push(`└─ total: ${m.timing.total}ms`);
|
||||||
@@ -578,5 +625,25 @@ export function detectIssues(metrics) {
|
|||||||
issues.push(`Slow L1 cosine scoring (${m.evidence.l1CosineTime}ms) - too many chunks pulled`);
|
issues.push(`Slow L1 cosine scoring (${m.evidence.l1CosineTime}ms) - too many chunks pulled`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
// Diffusion 问题
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
if (m.diffusion.graphEdges === 0 && m.diffusion.seedCount > 0) {
|
||||||
|
issues.push('No diffusion graph edges - atoms may lack who/edges fields');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (m.diffusion.pprActivated > 0 && m.diffusion.cosineGatePassed === 0) {
|
||||||
|
issues.push('All PPR-activated nodes failed cosine gate - graph structure diverged from query semantics');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (m.diffusion.cosineGateNoVector > 5) {
|
||||||
|
issues.push(`${m.diffusion.cosineGateNoVector} PPR nodes missing vectors - L0 vectorization may be incomplete`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (m.diffusion.time > 50) {
|
||||||
|
issues.push(`Slow diffusion (${m.diffusion.time}ms) - graph may be too dense`);
|
||||||
|
}
|
||||||
|
|
||||||
return issues;
|
return issues;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ import {
|
|||||||
import { getLexicalIndex, searchLexicalIndex } from './lexical-index.js';
|
import { getLexicalIndex, searchLexicalIndex } from './lexical-index.js';
|
||||||
import { rerankChunks } from '../llm/reranker.js';
|
import { rerankChunks } from '../llm/reranker.js';
|
||||||
import { createMetrics, calcSimilarityStats } from './metrics.js';
|
import { createMetrics, calcSimilarityStats } from './metrics.js';
|
||||||
|
import { diffuseFromSeeds } from './diffusion.js';
|
||||||
|
|
||||||
const MODULE_ID = 'recall';
|
const MODULE_ID = 'recall';
|
||||||
|
|
||||||
@@ -59,10 +60,10 @@ const CONFIG = {
|
|||||||
EVENT_SELECT_MAX: 50,
|
EVENT_SELECT_MAX: 50,
|
||||||
EVENT_MIN_SIMILARITY: 0.55,
|
EVENT_MIN_SIMILARITY: 0.55,
|
||||||
EVENT_MMR_LAMBDA: 0.72,
|
EVENT_MMR_LAMBDA: 0.72,
|
||||||
EVENT_ENTITY_BYPASS_SIM: 0.80,
|
EVENT_ENTITY_BYPASS_SIM: 0.70,
|
||||||
|
|
||||||
// Lexical Dense 门槛
|
// Lexical Dense 门槛
|
||||||
LEXICAL_EVENT_DENSE_MIN: 0.50,
|
LEXICAL_EVENT_DENSE_MIN: 0.60,
|
||||||
LEXICAL_FLOOR_DENSE_MIN: 0.50,
|
LEXICAL_FLOOR_DENSE_MIN: 0.50,
|
||||||
|
|
||||||
// W-RRF 融合(L0-only)
|
// W-RRF 融合(L0-only)
|
||||||
@@ -71,10 +72,6 @@ const CONFIG = {
|
|||||||
RRF_W_LEX: 0.9,
|
RRF_W_LEX: 0.9,
|
||||||
FUSION_CAP: 60,
|
FUSION_CAP: 60,
|
||||||
|
|
||||||
// Dense floor 聚合权重
|
|
||||||
DENSE_AGG_W_MAX: 0.6,
|
|
||||||
DENSE_AGG_W_MEAN: 0.4,
|
|
||||||
|
|
||||||
// Lexical floor 聚合密度加成
|
// Lexical floor 聚合密度加成
|
||||||
LEX_DENSITY_BONUS: 0.3,
|
LEX_DENSITY_BONUS: 0.3,
|
||||||
|
|
||||||
@@ -102,6 +99,20 @@ function cosineSimilarity(a, b) {
|
|||||||
return nA && nB ? dot / (Math.sqrt(nA) * Math.sqrt(nB)) : 0;
|
return nA && nB ? dot / (Math.sqrt(nA) * Math.sqrt(nB)) : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 从事件 summary 末尾解析楼层范围 (#X) 或 (#X-Y)
|
||||||
|
* @param {string} summary
|
||||||
|
* @returns {{start: number, end: number}|null}
|
||||||
|
*/
|
||||||
|
function parseFloorRange(summary) {
|
||||||
|
if (!summary) return null;
|
||||||
|
const match = String(summary).match(/\(#(\d+)(?:-(\d+))?\)/);
|
||||||
|
if (!match) return null;
|
||||||
|
const start = Math.max(0, parseInt(match[1], 10) - 1);
|
||||||
|
const end = Math.max(0, (match[2] ? parseInt(match[2], 10) : parseInt(match[1], 10)) - 1);
|
||||||
|
return { start, end };
|
||||||
|
}
|
||||||
|
|
||||||
function normalize(s) {
|
function normalize(s) {
|
||||||
return String(s || '')
|
return String(s || '')
|
||||||
.normalize('NFKC')
|
.normalize('NFKC')
|
||||||
@@ -253,19 +264,19 @@ function mmrSelect(candidates, k, lambda, getVector, getScore) {
|
|||||||
async function recallAnchors(queryVector, vectorConfig, metrics) {
|
async function recallAnchors(queryVector, vectorConfig, metrics) {
|
||||||
const { chatId } = getContext();
|
const { chatId } = getContext();
|
||||||
if (!chatId || !queryVector?.length) {
|
if (!chatId || !queryVector?.length) {
|
||||||
return { hits: [], floors: new Set() };
|
return { hits: [], floors: new Set(), stateVectors: [] };
|
||||||
}
|
}
|
||||||
|
|
||||||
const meta = await getMeta(chatId);
|
const meta = await getMeta(chatId);
|
||||||
const fp = getEngineFingerprint(vectorConfig);
|
const fp = getEngineFingerprint(vectorConfig);
|
||||||
if (meta.fingerprint && meta.fingerprint !== fp) {
|
if (meta.fingerprint && meta.fingerprint !== fp) {
|
||||||
xbLog.warn(MODULE_ID, 'Anchor fingerprint 不匹配');
|
xbLog.warn(MODULE_ID, 'Anchor fingerprint 不匹配');
|
||||||
return { hits: [], floors: new Set() };
|
return { hits: [], floors: new Set(), stateVectors: [] };
|
||||||
}
|
}
|
||||||
|
|
||||||
const stateVectors = await getAllStateVectors(chatId);
|
const stateVectors = await getAllStateVectors(chatId);
|
||||||
if (!stateVectors.length) {
|
if (!stateVectors.length) {
|
||||||
return { hits: [], floors: new Set() };
|
return { hits: [], floors: new Set(), stateVectors: [] };
|
||||||
}
|
}
|
||||||
|
|
||||||
const atomsList = getStateAtoms();
|
const atomsList = getStateAtoms();
|
||||||
@@ -298,7 +309,7 @@ async function recallAnchors(queryVector, vectorConfig, metrics) {
|
|||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
return { hits: scored, floors };
|
return { hits: scored, floors, stateVectors };
|
||||||
}
|
}
|
||||||
|
|
||||||
// ═══════════════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════════════
|
||||||
@@ -402,7 +413,7 @@ async function recallEvents(queryVector, allEvents, vectorConfig, focusEntities,
|
|||||||
|
|
||||||
if (metrics) {
|
if (metrics) {
|
||||||
metrics.event.selected = results.length;
|
metrics.event.selected = results.length;
|
||||||
metrics.event.byRecallType = { direct: directCount, related: relatedCount, causal: 0, lexical: 0 };
|
metrics.event.byRecallType = { direct: directCount, related: relatedCount, causal: 0, lexical: 0, l0Linked: 0 };
|
||||||
metrics.event.similarityDistribution = calcSimilarityStats(results.map(r => r.similarity));
|
metrics.event.similarityDistribution = calcSimilarityStats(results.map(r => r.similarity));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -517,23 +528,18 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
|
|||||||
// 6a. Dense floor rank(加权聚合:maxSim×0.6 + meanSim×0.4)
|
// 6a. Dense floor rank(加权聚合:maxSim×0.6 + meanSim×0.4)
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const denseFloorAgg = new Map();
|
const denseFloorMax = new Map();
|
||||||
for (const a of (anchorHits || [])) {
|
for (const a of (anchorHits || [])) {
|
||||||
const cur = denseFloorAgg.get(a.floor);
|
const cur = denseFloorMax.get(a.floor);
|
||||||
if (!cur) {
|
if (!cur || a.similarity > cur) {
|
||||||
denseFloorAgg.set(a.floor, { maxSim: a.similarity, hitCount: 1, sumSim: a.similarity });
|
denseFloorMax.set(a.floor, a.similarity);
|
||||||
} else {
|
|
||||||
cur.maxSim = Math.max(cur.maxSim, a.similarity);
|
|
||||||
cur.hitCount++;
|
|
||||||
cur.sumSim += a.similarity;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const denseFloorRank = [...denseFloorAgg.entries()]
|
const denseFloorRank = [...denseFloorMax.entries()]
|
||||||
.map(([floor, info]) => ({
|
.map(([floor, maxSim]) => ({
|
||||||
id: floor,
|
id: floor,
|
||||||
score: info.maxSim * CONFIG.DENSE_AGG_W_MAX
|
score: maxSim,
|
||||||
+ (info.sumSim / info.hitCount) * CONFIG.DENSE_AGG_W_MEAN,
|
|
||||||
}))
|
}))
|
||||||
.sort((a, b) => b.score - a.score);
|
.sort((a, b) => b.score - a.score);
|
||||||
|
|
||||||
@@ -565,8 +571,8 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
|
|||||||
if (!atomFloorSet.has(floor)) continue;
|
if (!atomFloorSet.has(floor)) continue;
|
||||||
|
|
||||||
// Dense 门槛:lexical floor 必须有最低 dense 相关性
|
// Dense 门槛:lexical floor 必须有最低 dense 相关性
|
||||||
const denseInfo = denseFloorAgg.get(floor);
|
const denseMax = denseFloorMax.get(floor);
|
||||||
if (!denseInfo || denseInfo.maxSim < CONFIG.LEXICAL_FLOOR_DENSE_MIN) {
|
if (!denseMax || denseMax < CONFIG.LEXICAL_FLOOR_DENSE_MIN) {
|
||||||
lexFloorFilteredByDense++;
|
lexFloorFilteredByDense++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -605,7 +611,7 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
|
|||||||
metrics.fusion.totalUnique = totalUnique;
|
metrics.fusion.totalUnique = totalUnique;
|
||||||
metrics.fusion.afterCap = fusedFloors.length;
|
metrics.fusion.afterCap = fusedFloors.length;
|
||||||
metrics.fusion.time = fusionTime;
|
metrics.fusion.time = fusionTime;
|
||||||
metrics.fusion.denseAggMethod = `max×${CONFIG.DENSE_AGG_W_MAX}+mean×${CONFIG.DENSE_AGG_W_MEAN}`;
|
metrics.fusion.denseAggMethod = 'maxSim';
|
||||||
metrics.fusion.lexDensityBonus = CONFIG.LEX_DENSITY_BONUS;
|
metrics.fusion.lexDensityBonus = CONFIG.LEX_DENSITY_BONUS;
|
||||||
metrics.evidence.floorCandidates = fusedFloors.length;
|
metrics.evidence.floorCandidates = fusedFloors.length;
|
||||||
}
|
}
|
||||||
@@ -1060,7 +1066,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const T_R2_Anchor_Start = performance.now();
|
const T_R2_Anchor_Start = performance.now();
|
||||||
const { hits: anchorHits, floors: anchorFloors_dense } = await recallAnchors(queryVector_v1, vectorConfig, metrics);
|
const { hits: anchorHits, floors: anchorFloors_dense, stateVectors: allStateVectors } = await recallAnchors(queryVector_v1, vectorConfig, metrics);
|
||||||
metrics.timing.anchorSearch = Math.round(performance.now() - T_R2_Anchor_Start);
|
metrics.timing.anchorSearch = Math.round(performance.now() - T_R2_Anchor_Start);
|
||||||
|
|
||||||
const T_R2_Event_Start = performance.now();
|
const T_R2_Event_Start = performance.now();
|
||||||
@@ -1108,6 +1114,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
|||||||
const eventIndex = buildEventIndex(allEvents);
|
const eventIndex = buildEventIndex(allEvents);
|
||||||
let lexicalEventCount = 0;
|
let lexicalEventCount = 0;
|
||||||
let lexicalEventFilteredByDense = 0;
|
let lexicalEventFilteredByDense = 0;
|
||||||
|
const focusSetForLexical = new Set((bundle.focusEntities || []).map(normalize));
|
||||||
|
|
||||||
for (const eid of lexicalResult.eventIds) {
|
for (const eid of lexicalResult.eventIds) {
|
||||||
if (existingEventIds.has(eid)) continue;
|
if (existingEventIds.has(eid)) continue;
|
||||||
@@ -1129,16 +1136,59 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 通过门槛,使用实际 dense similarity(而非硬编码 0)
|
// 实体分类:与 Dense 路径统一标准
|
||||||
|
const participants = (ev.participants || []).map(p => normalize(p));
|
||||||
|
const hasEntityMatch = focusSetForLexical.size > 0 && participants.some(p => focusSetForLexical.has(p));
|
||||||
|
|
||||||
eventHits.push({
|
eventHits.push({
|
||||||
event: ev,
|
event: ev,
|
||||||
similarity: sim,
|
similarity: sim,
|
||||||
_recallType: 'LEXICAL',
|
_recallType: hasEntityMatch ? 'DIRECT' : 'RELATED',
|
||||||
});
|
});
|
||||||
existingEventIds.add(eid);
|
existingEventIds.add(eid);
|
||||||
lexicalEventCount++;
|
lexicalEventCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
|
// 阶段 5.5: L0 → L2 反向查找
|
||||||
|
// 已召回的 L0 楼层落在某 L2 事件范围内,但该 L2 自身未被召回
|
||||||
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
const recalledL0Floors = new Set(anchorHits.map(h => h.floor));
|
||||||
|
let l0LinkedCount = 0;
|
||||||
|
|
||||||
|
for (const event of allEvents) {
|
||||||
|
if (existingEventIds.has(event.id)) continue;
|
||||||
|
|
||||||
|
const range = parseFloorRange(event.summary);
|
||||||
|
if (!range) continue;
|
||||||
|
|
||||||
|
let hasOverlap = false;
|
||||||
|
for (const floor of recalledL0Floors) {
|
||||||
|
if (floor >= range.start && floor <= range.end) {
|
||||||
|
hasOverlap = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!hasOverlap) continue;
|
||||||
|
|
||||||
|
// 实体分类:与所有路径统一标准
|
||||||
|
const participants = (event.participants || []).map(p => normalize(p));
|
||||||
|
const hasEntityMatch = focusSetForLexical.size > 0
|
||||||
|
&& participants.some(p => focusSetForLexical.has(p));
|
||||||
|
|
||||||
|
const evVec = eventVectorMap.get(event.id);
|
||||||
|
const sim = evVec?.length ? cosineSimilarity(queryVector_v1, evVec) : 0;
|
||||||
|
|
||||||
|
eventHits.push({
|
||||||
|
event,
|
||||||
|
similarity: sim,
|
||||||
|
_recallType: hasEntityMatch ? 'DIRECT' : 'RELATED',
|
||||||
|
});
|
||||||
|
existingEventIds.add(event.id);
|
||||||
|
l0LinkedCount++;
|
||||||
|
}
|
||||||
|
|
||||||
if (metrics) {
|
if (metrics) {
|
||||||
metrics.lexical.eventFilteredByDense = lexicalEventFilteredByDense;
|
metrics.lexical.eventFilteredByDense = lexicalEventFilteredByDense;
|
||||||
|
|
||||||
@@ -1146,10 +1196,14 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
|||||||
metrics.event.byRecallType.lexical = lexicalEventCount;
|
metrics.event.byRecallType.lexical = lexicalEventCount;
|
||||||
metrics.event.selected += lexicalEventCount;
|
metrics.event.selected += lexicalEventCount;
|
||||||
}
|
}
|
||||||
|
if (l0LinkedCount > 0) {
|
||||||
|
metrics.event.byRecallType.l0Linked = l0LinkedCount;
|
||||||
|
metrics.event.selected += l0LinkedCount;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
xbLog.info(MODULE_ID,
|
xbLog.info(MODULE_ID,
|
||||||
`Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} (${lexTime}ms)`
|
`Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} l0Linked=+${l0LinkedCount} (${lexTime}ms)`
|
||||||
);
|
);
|
||||||
|
|
||||||
// ═══════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
@@ -1164,6 +1218,35 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
|||||||
metrics
|
metrics
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
|
// Stage 7.5: PPR Diffusion Activation
|
||||||
|
//
|
||||||
|
// Spread from reranked seeds through entity co-occurrence graph.
|
||||||
|
// Diffused atoms merge into l0Selected at lower scores than seeds,
|
||||||
|
// consumed by prompt.js through the same budget pipeline.
|
||||||
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
const diffused = diffuseFromSeeds(
|
||||||
|
l0Selected, // seeds (rerank-verified)
|
||||||
|
getStateAtoms(), // all L0 atoms
|
||||||
|
allStateVectors, // all L0 vectors (already read by recallAnchors)
|
||||||
|
queryVector_v1, // R2 query vector (for cosine gate)
|
||||||
|
metrics, // metrics collector
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const da of diffused) {
|
||||||
|
l0Selected.push({
|
||||||
|
id: `diffused-${da.atomId}`,
|
||||||
|
atomId: da.atomId,
|
||||||
|
floor: da.floor,
|
||||||
|
similarity: da.finalScore,
|
||||||
|
rerankScore: da.finalScore,
|
||||||
|
atom: da.atom,
|
||||||
|
text: da.atom.semantic || '',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
metrics.timing.diffusion = metrics.diffusion?.time || 0;
|
||||||
|
|
||||||
// ═══════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
// 阶段 7: Causation Trace
|
// 阶段 7: Causation Trace
|
||||||
// ═══════════════════════════════════════════════════════════════════
|
// ═══════════════════════════════════════════════════════════════════
|
||||||
@@ -1206,6 +1289,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
|||||||
console.log(`Floor Rerank: ${metrics.evidence.beforeRerank || 0} → ${metrics.evidence.floorsSelected || 0} floors → L0=${metrics.evidence.l0Collected || 0} (${metrics.evidence.rerankTime || 0}ms)`);
|
console.log(`Floor Rerank: ${metrics.evidence.beforeRerank || 0} → ${metrics.evidence.floorsSelected || 0} floors → L0=${metrics.evidence.l0Collected || 0} (${metrics.evidence.rerankTime || 0}ms)`);
|
||||||
console.log(`L1: ${metrics.evidence.l1Pulled || 0} pulled → ${metrics.evidence.l1Attached || 0} attached (${metrics.evidence.l1CosineTime || 0}ms)`);
|
console.log(`L1: ${metrics.evidence.l1Pulled || 0} pulled → ${metrics.evidence.l1Attached || 0} attached (${metrics.evidence.l1CosineTime || 0}ms)`);
|
||||||
console.log(`Events: ${eventHits.length} hits, ${causalChain.length} causal`);
|
console.log(`Events: ${eventHits.length} hits, ${causalChain.length} causal`);
|
||||||
|
console.log(`Diffusion: ${metrics.diffusion?.seedCount || 0} seeds → ${metrics.diffusion?.pprActivated || 0} activated → ${metrics.diffusion?.finalCount || 0} final (${metrics.diffusion?.time || 0}ms)`);
|
||||||
console.groupEnd();
|
console.groupEnd();
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
Reference in New Issue
Block a user