feat(recall): add diffusion stage and improve retrieval metrics
This commit is contained in:
@@ -199,9 +199,6 @@ function anchorToAtom(anchor, aiFloor, idx) {
|
||||
// ═══ 检索层(embedding 的唯一入口) ═══
|
||||
semantic: scene,
|
||||
|
||||
// ═══ 场景数据 ═══
|
||||
scene,
|
||||
|
||||
// ═══ 图结构层(扩散的 key) ═══
|
||||
who,
|
||||
edges,
|
||||
|
||||
776
modules/story-summary/vector/retrieval/diffusion.js
Normal file
776
modules/story-summary/vector/retrieval/diffusion.js
Normal file
@@ -0,0 +1,776 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// diffusion.js - PPR Graph Diffusion (Personalized PageRank)
|
||||
//
|
||||
// Spreads activation from seed L0 atoms through entity co-occurrence graph
|
||||
// to discover narratively-connected but semantically-distant memories.
|
||||
//
|
||||
// Pipeline position: recall.js Stage 7.5
|
||||
// Input: seeds (reranked L0 from Stage 6)
|
||||
// Output: additional L0 atoms → merged into l0Selected
|
||||
//
|
||||
// Algorithm:
|
||||
// 1. Build undirected weighted graph over all L0 atoms
|
||||
// Four channels: WHO/WHAT/WHERE/HOW (Jaccard/Overlap/ExactMatch)
|
||||
// 2. Personalized PageRank (Power Iteration)
|
||||
// Seeds weighted by rerankScore — Haveliwala (2002) topic-sensitive variant
|
||||
// α = 0.15 restart probability — Page et al. (1998)
|
||||
// 3. Post-verification (Dense Cosine Gate)
|
||||
// Exclude seeds, cosine ≥ 0.45, final = PPR_norm × cosine ≥ 0.10
|
||||
//
|
||||
// References:
|
||||
// Page et al. "The PageRank Citation Ranking" (1998)
|
||||
// Haveliwala "Topic-Sensitive PageRank" (IEEE TKDE 2003)
|
||||
// Langville & Meyer "Eigenvector Methods for Web IR" (SIAM Review 2005)
|
||||
// Sun et al. "GraftNet" (EMNLP 2018)
|
||||
// Jaccard "Étude comparative de la distribution florale" (1912)
|
||||
// Szymkiewicz "Une contribution statistique" (1934) — Overlap coefficient
|
||||
// Rimmon-Kenan "Narrative Fiction" (2002) — Channel weight rationale
|
||||
//
|
||||
// Core PPR iteration aligned with NetworkX pagerank():
|
||||
// github.com/networkx/networkx — algorithms/link_analysis/pagerank_alg.py
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
|
||||
const MODULE_ID = 'diffusion';

// ═══════════════════════════════════════════════════════════════════════════
// Configuration
// ═══════════════════════════════════════════════════════════════════════════

const CONFIG = {
  // Personalized PageRank parameters (Page et al. 1998; GraftNet 2018
  // operates with the same restart probability).
  ALPHA: 0.15, // restart (teleport) probability
  EPSILON: 1e-6, // L1 convergence threshold for power iteration
  MAX_ITER: 50, // hard iteration cap (typically converges in 15-25)

  // Per-channel edge-weight coefficients. The ordering follows the
  // Rimmon-Kenan (2002) narrative hierarchy:
  // characters > events > setting > themes.
  GAMMA: {
    who: 0.50, // entity co-occurrence — Jaccard
    what: 0.25, // directed pair overlap — Szymkiewicz-Simpson
    where: 0.15, // location exact match — binary
    how: 0.10, // dynamics tag co-occurrence — Jaccard
  },

  // Post-verification (Cosine Gate) thresholds.
  COSINE_GATE: 0.45, // min cosine(queryVector, stateVector)
  SCORE_FLOOR: 0.10, // min finalScore = PPR_normalized × cosine
  DIFFUSION_CAP: 60, // max diffused nodes (excluding seeds)
};
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Utility functions
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Unicode-safe text normalization (matches recall.js / entity-lexicon.js).
 * NFKC-folds the input, strips zero-width characters, trims, and lowercases.
 * Falsy inputs (null/undefined/0/'') normalize to the empty string.
 *
 * @param {*} s - value to normalize; coerced to string
 * @returns {string} normalized text
 */
function normalize(s) {
  const folded = String(s || '').normalize('NFKC');
  const stripped = folded.replace(/[\u200B-\u200D\uFEFF]/g, '');
  return stripped.trim().toLowerCase();
}
|
||||
|
||||
/**
 * Cosine similarity between two equal-length numeric vectors.
 * Returns 0 for missing/empty vectors, mismatched lengths, or zero norms.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} similarity in [-1, 1], or 0 when undefined
 */
function cosineSimilarity(a, b) {
  if (!a?.length || !b?.length || a.length !== b.length) return 0;
  let dot = 0;
  let normA = 0;
  let normB = 0;
  const len = a.length;
  for (let i = 0; i < len; i++) {
    const x = a[i];
    const y = b[i];
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  if (!normA || !normB) return 0;
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
// Feature extraction from L0 atoms
// ═══════════════════════════════════════════════════════════════════════════

/**
 * WHO channel: entity set = who ∪ edges.s ∪ edges.t
 * @param {object} atom
 * @returns {Set<string>}
 */
function extractEntities(atom) {
  const entities = new Set();
  const add = (value) => {
    const key = normalize(value);
    if (key) entities.add(key);
  };
  for (const name of (atom.who || [])) add(name);
  for (const edge of (atom.edges || [])) {
    add(edge?.s);
    add(edge?.t);
  }
  return entities;
}

/**
 * WHAT channel: directed interaction pairs "A→B" (strict direction — option A).
 * Pairs with a missing endpoint are skipped.
 * @param {object} atom
 * @returns {Set<string>}
 */
function extractDirectedPairs(atom) {
  const pairs = new Set();
  for (const edge of (atom.edges || [])) {
    const source = normalize(edge?.s);
    const target = normalize(edge?.t);
    if (source && target) pairs.add(`${source}\u2192${target}`);
  }
  return pairs;
}

/**
 * WHERE channel: normalized location string.
 * @param {object} atom
 * @returns {string} empty string if absent
 */
function extractLocation(atom) {
  return normalize(atom.where);
}

/**
 * HOW channel: dynamics tags set.
 * @param {object} atom
 * @returns {Set<string>}
 */
function extractDynamics(atom) {
  const tags = new Set();
  for (const tag of (atom.dynamics || [])) {
    const key = normalize(tag);
    if (key) tags.add(key);
  }
  return tags;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
// Set similarity functions
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Jaccard index: |A∩B| / |A∪B| (Jaccard 1912).
 * Iterates the smaller set for the intersection count.
 *
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} 0..1
 */
function jaccard(a, b) {
  if (a.size === 0 || b.size === 0) return 0;
  const probe = a.size <= b.size ? a : b;
  const other = probe === a ? b : a;
  let shared = 0;
  for (const item of probe) {
    if (other.has(item)) shared++;
  }
  const unionSize = a.size + b.size - shared;
  return unionSize > 0 ? shared / unionSize : 0;
}
|
||||
|
||||
/**
 * Overlap coefficient: |A∩B| / min(|A|,|B|) (Szymkiewicz-Simpson 1934).
 * Used for directed pairs where set sizes are small (1-3); Jaccard
 * over-penalizes small-set asymmetry.
 *
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} 0..1
 */
function overlapCoefficient(a, b) {
  if (a.size === 0 || b.size === 0) return 0;
  const probe = a.size <= b.size ? a : b;
  const other = probe === a ? b : a;
  let shared = 0;
  for (const item of probe) {
    if (other.has(item)) shared++;
  }
  return shared / probe.size;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
// Graph construction
//
// Candidate pairs discovered via inverted indices on entities and locations.
// Dynamics-only pairs excluded from candidate generation (γ_HOW = 0.10 is
// too weak to justify O(N²) blowup from 8-tag combinatorics).
// All four channels evaluated for every candidate pair.
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Pre-extract the four channel features for every atom.
 * @param {object[]} allAtoms
 * @returns {object[]} one { entities, directedPairs, location, dynamics } per atom
 */
function extractAllFeatures(allAtoms) {
  const features = [];
  for (const atom of allAtoms) {
    features.push({
      entities: extractEntities(atom),
      directedPairs: extractDirectedPairs(atom),
      location: extractLocation(atom),
      dynamics: extractDynamics(atom),
    });
  }
  return features;
}
|
||||
|
||||
/**
 * Build inverted indices: value → list of atom indices.
 * Atoms with an empty location are not indexed by location.
 *
 * @param {object[]} features - per-atom { entities: Set, location: string, ... }
 * @returns {{ entityIndex: Map<string, number[]>, locationIndex: Map<string, number[]> }}
 */
function buildInvertedIndices(features) {
  const entityIndex = new Map();
  const locationIndex = new Map();

  // Append atom index i under key, creating the bucket on first use.
  const append = (index, key, i) => {
    const bucket = index.get(key);
    if (bucket) {
      bucket.push(i);
    } else {
      index.set(key, [i]);
    }
  };

  features.forEach((feat, i) => {
    for (const entity of feat.entities) append(entityIndex, entity, i);
    if (feat.location) append(locationIndex, feat.location, i);
  });

  return { entityIndex, locationIndex };
}
|
||||
|
||||
/**
 * Collect candidate pairs from an inverted index.
 * Each unordered pair (i, j) within a bucket is packed as lo * N + hi
 * so the Set deduplicates pairs discovered via multiple shared values.
 *
 * @param {Map<string, number[]>} index - value → [atomIndex, ...]
 * @param {Set<number>} pairSet - packed-pair collector (mutated in place)
 * @param {number} N - total atom count (packing base)
 */
function collectPairsFromIndex(index, pairSet, N) {
  for (const bucket of index.values()) {
    const len = bucket.length;
    for (let a = 0; a < len - 1; a++) {
      for (let b = a + 1; b < len; b++) {
        const x = bucket[a];
        const y = bucket[b];
        pairSet.add(x < y ? x * N + y : y * N + x);
      }
    }
  }
}
|
||||
|
||||
/**
 * Build the weighted undirected graph over all L0 atoms.
 *
 * Candidate pairs come from the entity and location inverted indices;
 * every candidate is then scored on all four channels and kept when the
 * combined weight is positive. Each kept edge is mirrored into both
 * endpoints' adjacency lists.
 *
 * @param {object[]} allAtoms
 * @returns {{ neighbors: object[][], edgeCount: number, channelStats: object, buildTime: number }}
 */
function buildGraph(allAtoms) {
  const atomCount = allAtoms.length;
  const startedAt = performance.now();

  const feats = extractAllFeatures(allAtoms);
  const { entityIndex, locationIndex } = buildInvertedIndices(feats);

  // Candidate pairs: share ≥1 entity or same location.
  const candidatePairs = new Set();
  collectPairsFromIndex(entityIndex, candidatePairs, atomCount);
  collectPairsFromIndex(locationIndex, candidatePairs, atomCount);

  const neighbors = [];
  for (let i = 0; i < atomCount; i++) neighbors.push([]);

  let edgeCount = 0;
  const channelStats = { who: 0, what: 0, where: 0, how: 0 };

  for (const packed of candidatePairs) {
    // Unpack lo * N + hi back into the two atom indices.
    const i = Math.floor(packed / atomCount);
    const j = packed % atomCount;
    const a = feats[i];
    const b = feats[j];

    const whoW = jaccard(a.entities, b.entities);
    const whatW = overlapCoefficient(a.directedPairs, b.directedPairs);
    const whereW = (a.location && a.location === b.location) ? 1.0 : 0.0;
    const howW = jaccard(a.dynamics, b.dynamics);

    const weight =
      CONFIG.GAMMA.who * whoW +
      CONFIG.GAMMA.what * whatW +
      CONFIG.GAMMA.where * whereW +
      CONFIG.GAMMA.how * howW;

    if (weight <= 0) continue;

    neighbors[i].push({ target: j, weight });
    neighbors[j].push({ target: i, weight });
    edgeCount++;

    if (whoW > 0) channelStats.who++;
    if (whatW > 0) channelStats.what++;
    if (whereW > 0) channelStats.where++;
    if (howW > 0) channelStats.how++;
  }

  const buildTime = Math.round(performance.now() - startedAt);

  xbLog.info(MODULE_ID,
    `Graph: ${atomCount} nodes, ${edgeCount} edges ` +
    `(who=${channelStats.who} what=${channelStats.what} ` +
    `where=${channelStats.where} how=${channelStats.how}) ` +
    `(${buildTime}ms)`
  );

  return { neighbors, edgeCount, channelStats, buildTime };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
// PPR: Seed vector construction
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Build personalization vector s from seeds, weighted by rerankScore.
 * Haveliwala (2002): non-uniform personalization improves topic sensitivity.
 *
 * Fix: when every matched seed resolves to a zero (or clamped-negative)
 * weight, the previous implementation returned the all-zero vector,
 * violating the "sums to 1" contract and silently collapsing PPR to an
 * empty result. We now fall back to a uniform distribution over the
 * matched seed positions, preserving the documented invariant.
 *
 * @param {object[]} seeds - seed L0 entries with atomId and rerankScore
 * @param {Map<string, number>} idToIdx - atomId → array index
 * @param {number} N - total node count
 * @returns {Float64Array} personalization vector (L1-normalized, sums to 1;
 *   all-zero only when no seed matches the index)
 */
function buildSeedVector(seeds, idToIdx, N) {
  const s = new Float64Array(N);
  const matched = []; // indices of seeds found in idToIdx (may repeat)
  let total = 0;

  for (const seed of seeds) {
    const idx = idToIdx.get(seed.atomId);
    if (idx == null) continue;

    matched.push(idx);
    // Prefer rerankScore, fall back to similarity; clamp negatives to 0.
    const score = Math.max(0, seed.rerankScore || seed.similarity || 0);
    s[idx] += score;
    total += score;
  }

  if (total > 0) {
    // L1 normalize to a probability distribution.
    for (let i = 0; i < N; i++) s[i] /= total;
  } else if (matched.length > 0) {
    // Degenerate case: all matched seeds weigh 0 — spread mass uniformly
    // so the vector still sums to 1 and power iteration stays meaningful.
    const uniform = 1 / matched.length;
    for (const idx of matched) s[idx] += uniform;
  }

  return s;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
// PPR: Column normalization + dangling node detection
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Column-normalize the adjacency lists into a transition matrix W.
 *
 * Column j of W: W_{ij} = weight(i,j) / Σ_k weight(k,j)
 * Dangling nodes (no outgoing edges) get an empty column and are listed
 * separately; powerIteration redistributes their mass to the
 * personalization vector s. (Langville & Meyer 2005, §4.1)
 *
 * @param {object[][]} neighbors - neighbors[j] = [{target, weight}, ...]
 * @param {number} N
 * @returns {{ columns: object[][], dangling: number[] }}
 */
function columnNormalize(neighbors, N) {
  const columns = [];
  const dangling = [];

  for (let j = 0; j < N; j++) {
    const normalized = [];
    columns.push(normalized);

    const outEdges = neighbors[j];
    let totalWeight = 0;
    for (const edge of outEdges) totalWeight += edge.weight;

    if (totalWeight <= 0) {
      dangling.push(j);
      continue;
    }

    for (const edge of outEdges) {
      normalized.push({ target: edge.target, prob: edge.weight / totalWeight });
    }
  }

  return { columns, dangling };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
// PPR: Power Iteration
//
// Aligned with NetworkX pagerank() (pagerank_alg.py):
//
//   NetworkX "alpha"   = damping       = our (1 − α)
//   NetworkX "1-alpha" = teleportation = our α
//
// Per iteration:
//   π_new[i] = α·s[i] + (1−α)·( Σ_j W_{ij}·π[j] + dangling_sum·s[i] )
//
// Convergence: Perron-Frobenius theorem guarantees unique stationary
// distribution for irreducible aperiodic column-stochastic matrix.
// Rate: ‖π^(t+1) − π^t‖₁ ≤ (1−α)^t (geometric).
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Run the PPR power iteration until the L1 delta drops below EPSILON
 * or MAX_ITER is reached.
 *
 * @param {object[][]} columns - column-normalized transition matrix
 * @param {Float64Array} s - personalization vector (sums to 1)
 * @param {number[]} dangling - dangling node indices
 * @param {number} N - node count
 * @returns {{ pi: Float64Array, iterations: number, finalError: number }}
 */
function powerIteration(columns, s, dangling, N) {
  const { ALPHA: alpha, EPSILON: epsilon, MAX_ITER: maxIter } = CONFIG;
  const damping = 1 - alpha; // probability of following edges

  // Start from the personalization vector itself.
  let pi = Float64Array.from(s);
  let iterations = 0;
  let finalError = 0;

  for (let iter = 1; iter <= maxIter; iter++) {
    const next = new Float64Array(N);

    // Mass sitting on nodes with no outgoing edges is redistributed
    // to the personalization vector (Langville & Meyer 2005).
    let danglingMass = 0;
    for (const nodeIdx of dangling) danglingMass += pi[nodeIdx];

    // Sparse matrix-vector product: (1−α) · W · π
    for (let j = 0; j < N; j++) {
      const pj = pi[j];
      if (pj === 0) continue;

      const scaled = damping * pj;
      for (const { target, prob } of columns[j]) {
        next[target] += scaled * prob;
      }
    }

    // Restart plus dangling contribution: (α + (1−α)·danglingMass) · s[i]
    const restart = alpha + damping * danglingMass;
    for (let i = 0; i < N; i++) next[i] += restart * s[i];

    // L1 convergence check against the previous iterate.
    let delta = 0;
    for (let i = 0; i < N; i++) delta += Math.abs(next[i] - pi[i]);

    pi = next;
    iterations = iter;
    finalError = delta;

    if (delta < epsilon) break;
  }

  return { pi, iterations, finalError };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
// Post-verification: Dense Cosine Gate
//
// PPR measures graph-structural relevance ("same characters").
// Cosine gate measures semantic relevance ("related to current topic").
// Product combination ensures both dimensions are satisfied
// (CombMNZ — Fox & Shaw, TREC-2 1994).
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Filter PPR-activated nodes by semantic relevance.
 *
 * For each non-seed node with PPR > 0:
 *   1. cosine(queryVector, stateVector) ≥ COSINE_GATE
 *   2. finalScore = PPR_normalized × cosine ≥ SCORE_FLOOR
 *   3. Top DIFFUSION_CAP by finalScore
 *
 * Fix: the atom lookup now happens BEFORE gateStats.passed is
 * incremented, so "passed" always equals the number of candidates
 * actually emitted (previously a missing atomById entry inflated it).
 *
 * @param {Float64Array} pi - PPR stationary distribution
 * @param {string[]} atomIds - index → atomId
 * @param {Map<string, object>} atomById - atomId → atom object
 * @param {Set<string>} seedAtomIds - seed atomIds (excluded from output)
 * @param {Map<string, Float32Array>} vectorMap - atomId → embedding vector
 * @param {Float32Array|number[]} queryVector - R2 weighted query vector
 * @returns {{ diffused: object[], gateStats: object }}
 */
function postVerify(pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector) {
  const N = atomIds.length;
  const gateStats = { passed: 0, filtered: 0, noVector: 0 };

  // Max PPR score among non-seed nodes, used for normalization.
  let maxPPR = 0;
  for (let i = 0; i < N; i++) {
    if (pi[i] > maxPPR && !seedAtomIds.has(atomIds[i])) {
      maxPPR = pi[i];
    }
  }

  // No non-seed activation at all: nothing to verify.
  if (maxPPR <= 0) {
    return { diffused: [], gateStats };
  }

  const candidates = [];

  for (let i = 0; i < N; i++) {
    const atomId = atomIds[i];

    // Skip seeds and zero-probability nodes.
    if (seedAtomIds.has(atomId)) continue;
    if (pi[i] <= 0) continue;

    // Require a state vector for cosine verification.
    const vec = vectorMap.get(atomId);
    if (!vec?.length) {
      gateStats.noVector++;
      continue;
    }

    // Cosine gate.
    const cos = cosineSimilarity(queryVector, vec);
    if (cos < CONFIG.COSINE_GATE) {
      gateStats.filtered++;
      continue;
    }

    // Final score = PPR_normalized × cosine.
    const pprNorm = pi[i] / maxPPR;
    const finalScore = pprNorm * cos;

    if (finalScore < CONFIG.SCORE_FLOOR) {
      gateStats.filtered++;
      continue;
    }

    // Resolve the atom before counting it as passed (see Fix above).
    const atom = atomById.get(atomId);
    if (!atom) continue;

    gateStats.passed++;

    candidates.push({
      atomId,
      floor: atom.floor,
      atom,
      finalScore,
      pprScore: pi[i],
      pprNormalized: pprNorm,
      cosine: cos,
    });
  }

  // Sort by finalScore descending, cap at DIFFUSION_CAP.
  candidates.sort((a, b) => b.finalScore - a.finalScore);
  const diffused = candidates.slice(0, CONFIG.DIFFUSION_CAP);

  return { diffused, gateStats };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
// Main entry point
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Spread activation from seed L0 atoms through entity co-occurrence graph.
 *
 * Called from recall.js Stage 7.5, after locateAndPullEvidence and before
 * Causation Trace. Results are merged into l0Selected and consumed by
 * prompt.js through existing budget/formatting pipeline (zero downstream changes).
 *
 * @param {object[]} seeds - l0Selected from recall Stage 6
 *   Each: { atomId, rerankScore, similarity, atom, ... }
 * @param {object[]} allAtoms - getStateAtoms() result
 *   Each: { atomId, floor, semantic, who, edges, dynamics, where }
 * @param {object[]} stateVectors - getAllStateVectors() result
 *   Each: { atomId, floor, vector: Float32Array }
 * @param {Float32Array|number[]} queryVector - R2 weighted query vector
 * @param {object|null} metrics - metrics object (optional, mutated in-place)
 * @returns {object[]} Additional L0 atoms for l0Selected
 *   Each: { atomId, floor, atom, finalScore, pprScore, pprNormalized, cosine }
 */
export function diffuseFromSeeds(seeds, allAtoms, stateVectors, queryVector, metrics) {
  const startedAt = performance.now();

  // ─── Early exits ─────────────────────────────────────────────────

  if (!seeds?.length || !allAtoms?.length || !queryVector?.length) {
    fillMetricsEmpty(metrics);
    return [];
  }

  // ─── 1. Build atom index ─────────────────────────────────────────

  const atomById = new Map();
  const atomIds = [];
  const idToIdx = new Map();

  allAtoms.forEach((atom, i) => {
    atomById.set(atom.atomId, atom);
    atomIds.push(atom.atomId);
    idToIdx.set(atom.atomId, i);
  });

  const N = allAtoms.length;

  // Only seeds present in the atom index can personalize PPR.
  const validSeeds = seeds.filter((seed) => idToIdx.has(seed.atomId));
  const seedAtomIds = new Set(validSeeds.map((seed) => seed.atomId));

  if (!validSeeds.length) {
    fillMetricsEmpty(metrics);
    return [];
  }

  // ─── 2. Build graph ──────────────────────────────────────────────

  const graph = buildGraph(allAtoms);

  if (graph.edgeCount === 0) {
    fillMetrics(metrics, {
      seedCount: validSeeds.length,
      graphNodes: N,
      graphEdges: 0,
      channelStats: graph.channelStats,
      time: graph.buildTime,
    });
    xbLog.info(MODULE_ID, 'No graph edges — skipping diffusion');
    return [];
  }

  // ─── 3. Build seed vector ────────────────────────────────────────

  const s = buildSeedVector(validSeeds, idToIdx, N);

  // ─── 4. Column normalize ─────────────────────────────────────────

  const { columns, dangling } = columnNormalize(graph.neighbors, N);

  // ─── 5. PPR Power Iteration ──────────────────────────────────────

  const pprStart = performance.now();
  const { pi, iterations, finalError } = powerIteration(columns, s, dangling, N);
  const pprTime = Math.round(performance.now() - pprStart);

  // Count activated non-seed nodes.
  let pprActivated = 0;
  for (let i = 0; i < N; i++) {
    if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) pprActivated++;
  }

  // ─── 6. Post-verification ────────────────────────────────────────

  const vectorMap = new Map();
  for (const sv of (stateVectors || [])) {
    vectorMap.set(sv.atomId, sv.vector);
  }

  const { diffused, gateStats } = postVerify(
    pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector
  );

  // ─── 7. Metrics ──────────────────────────────────────────────────

  const totalTime = Math.round(performance.now() - startedAt);

  fillMetrics(metrics, {
    seedCount: validSeeds.length,
    graphNodes: N,
    graphEdges: graph.edgeCount,
    channelStats: graph.channelStats,
    buildTime: graph.buildTime,
    iterations,
    convergenceError: finalError,
    pprActivated,
    cosineGatePassed: gateStats.passed,
    cosineGateFiltered: gateStats.filtered,
    cosineGateNoVector: gateStats.noVector,
    finalCount: diffused.length,
    scoreDistribution: diffused.length > 0
      ? calcScoreStats(diffused.map((d) => d.finalScore))
      : { min: 0, max: 0, mean: 0 },
    time: totalTime,
  });

  xbLog.info(MODULE_ID,
    `Diffusion: ${validSeeds.length} seeds → ` +
    `graph(${N}n/${graph.edgeCount}e) → ` +
    `PPR(${iterations}it, ε=${finalError.toExponential(1)}, ${pprTime}ms) → ` +
    `${pprActivated} activated → ` +
    `gate(${gateStats.passed}\u2713/${gateStats.filtered}\u2717` +
    `${gateStats.noVector ? `/${gateStats.noVector}?` : ''}) → ` +
    `${diffused.length} final (${totalTime}ms)`
  );

  return diffused;
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Metrics helpers
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Compute min/max/mean distribution
|
||||
* @param {number[]} scores
|
||||
* @returns {{ min: number, max: number, mean: number }}
|
||||
*/
|
||||
function calcScoreStats(scores) {
|
||||
if (!scores.length) return { min: 0, max: 0, mean: 0 };
|
||||
const sorted = [...scores].sort((a, b) => a - b);
|
||||
const sum = sorted.reduce((a, b) => a + b, 0);
|
||||
return {
|
||||
min: Number(sorted[0].toFixed(3)),
|
||||
max: Number(sorted[sorted.length - 1].toFixed(3)),
|
||||
mean: Number((sum / sorted.length).toFixed(3)),
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Fill metrics with an all-zero diffusion block (used on early exits).
 * No-op when metrics is absent.
 * @param {object|null} metrics - mutated in place
 */
function fillMetricsEmpty(metrics) {
  if (!metrics) return;
  metrics.diffusion = {
    seedCount: 0,
    graphNodes: 0,
    graphEdges: 0,
    iterations: 0,
    convergenceError: 0,
    pprActivated: 0,
    cosineGatePassed: 0,
    cosineGateFiltered: 0,
    cosineGateNoVector: 0,
    finalCount: 0,
    scoreDistribution: { min: 0, max: 0, mean: 0 },
    byChannel: { who: 0, what: 0, where: 0, how: 0 },
    time: 0,
  };
}
|
||||
|
||||
/**
 * Fill metrics with diffusion results; missing fields default to zero.
 * No-op when metrics is absent.
 * @param {object|null} metrics - mutated in place
 * @param {object} data - partial diffusion stats
 */
function fillMetrics(metrics, data) {
  if (!metrics) return;

  const num = (value) => value || 0;
  metrics.diffusion = {
    seedCount: num(data.seedCount),
    graphNodes: num(data.graphNodes),
    graphEdges: num(data.graphEdges),
    iterations: num(data.iterations),
    convergenceError: num(data.convergenceError),
    pprActivated: num(data.pprActivated),
    cosineGatePassed: num(data.cosineGatePassed),
    cosineGateFiltered: num(data.cosineGateFiltered),
    cosineGateNoVector: num(data.cosineGateNoVector),
    finalCount: num(data.finalCount),
    scoreDistribution: data.scoreDistribution || { min: 0, max: 0, mean: 0 },
    byChannel: data.channelStats || { who: 0, what: 0, where: 0, how: 0 },
    time: num(data.time),
  };
}
|
||||
@@ -8,6 +8,8 @@
|
||||
// 硬约束:name1 永不进入词典
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { getStateAtoms } from '../storage/state-store.js';
|
||||
|
||||
/**
|
||||
* 标准化字符串(用于实体匹配)
|
||||
* @param {string} s
|
||||
@@ -69,6 +71,14 @@ export function buildEntityLexicon(store, context) {
|
||||
add(f.s);
|
||||
}
|
||||
|
||||
// 5. L0 atoms 的 who(新角色在 L2 总结前即可进入词典)
|
||||
const atoms = getStateAtoms();
|
||||
for (const atom of atoms) {
|
||||
for (const name of (atom.who || [])) {
|
||||
add(name);
|
||||
}
|
||||
}
|
||||
|
||||
// ★ 硬约束:删除 name1
|
||||
if (context?.name1) {
|
||||
lexicon.delete(normalize(context.name1));
|
||||
@@ -112,6 +122,14 @@ export function buildDisplayNameMap(store, context) {
|
||||
if (!f.retracted) register(f.s);
|
||||
}
|
||||
|
||||
// 5. L0 atoms 的 who
|
||||
const atoms = getStateAtoms();
|
||||
for (const atom of atoms) {
|
||||
for (const name of (atom.who || [])) {
|
||||
register(name);
|
||||
}
|
||||
}
|
||||
|
||||
// ★ 硬约束:删除 name1
|
||||
if (context?.name1) {
|
||||
map.delete(normalize(context.name1));
|
||||
|
||||
@@ -78,7 +78,7 @@ export function createMetrics() {
|
||||
inStore: 0,
|
||||
considered: 0,
|
||||
selected: 0,
|
||||
byRecallType: { direct: 0, related: 0, causal: 0, lexical: 0 },
|
||||
byRecallType: { direct: 0, related: 0, causal: 0, lexical: 0, l0Linked: 0 },
|
||||
similarityDistribution: { min: 0, max: 0, mean: 0, median: 0 },
|
||||
entityFilter: null,
|
||||
causalChainDepth: 0,
|
||||
@@ -112,6 +112,23 @@ export function createMetrics() {
|
||||
assemblyTime: 0,
|
||||
},
|
||||
|
||||
// Diffusion (PPR Spreading Activation) - 图扩散
|
||||
diffusion: {
|
||||
seedCount: 0,
|
||||
graphNodes: 0,
|
||||
graphEdges: 0,
|
||||
iterations: 0,
|
||||
convergenceError: 0,
|
||||
pprActivated: 0,
|
||||
cosineGatePassed: 0,
|
||||
cosineGateFiltered: 0,
|
||||
cosineGateNoVector: 0,
|
||||
finalCount: 0,
|
||||
scoreDistribution: { min: 0, max: 0, mean: 0 },
|
||||
byChannel: { who: 0, what: 0, where: 0, how: 0 },
|
||||
time: 0,
|
||||
},
|
||||
|
||||
// Formatting - 格式化
|
||||
formatting: {
|
||||
sectionsIncluded: [],
|
||||
@@ -140,6 +157,7 @@ export function createMetrics() {
|
||||
evidenceRetrieval: 0,
|
||||
evidenceRerank: 0,
|
||||
evidenceAssembly: 0,
|
||||
diffusion: 0,
|
||||
formatting: 0,
|
||||
total: 0,
|
||||
},
|
||||
@@ -249,9 +267,6 @@ export function formatMetricsLog(metrics) {
|
||||
// Fusion (W-RRF, floor-level)
|
||||
lines.push('[Fusion] W-RRF (floor-level) - 多路融合');
|
||||
lines.push(`├─ dense_floors: ${m.fusion.denseFloors}`);
|
||||
if (m.fusion.denseAggMethod) {
|
||||
lines.push(`│ └─ aggregation: ${m.fusion.denseAggMethod}`);
|
||||
}
|
||||
lines.push(`├─ lex_floors: ${m.fusion.lexFloors}`);
|
||||
if (m.fusion.lexDensityBonus > 0) {
|
||||
lines.push(`│ └─ density_bonus: ${m.fusion.lexDensityBonus}`);
|
||||
@@ -291,7 +306,12 @@ export function formatMetricsLog(metrics) {
|
||||
lines.push(`│ ├─ direct: ${m.event.byRecallType.direct}`);
|
||||
lines.push(`│ ├─ related: ${m.event.byRecallType.related}`);
|
||||
lines.push(`│ ├─ causal: ${m.event.byRecallType.causal}`);
|
||||
lines.push(`│ └─ lexical: ${m.event.byRecallType.lexical}`);
|
||||
if (m.event.byRecallType.l0Linked) {
|
||||
lines.push(`│ ├─ lexical: ${m.event.byRecallType.lexical}`);
|
||||
lines.push(`│ └─ l0_linked: ${m.event.byRecallType.l0Linked}`);
|
||||
} else {
|
||||
lines.push(`│ └─ lexical: ${m.event.byRecallType.lexical}`);
|
||||
}
|
||||
|
||||
const sim = m.event.similarityDistribution;
|
||||
if (sim && sim.max > 0) {
|
||||
@@ -340,6 +360,32 @@ export function formatMetricsLog(metrics) {
|
||||
lines.push(`└─ assembly_time: ${m.evidence.assemblyTime}ms`);
|
||||
lines.push('');
|
||||
|
||||
// Diffusion (PPR)
|
||||
lines.push('[Diffusion] PPR Spreading Activation');
|
||||
lines.push(`├─ seeds: ${m.diffusion.seedCount}`);
|
||||
lines.push(`├─ graph: ${m.diffusion.graphNodes} nodes, ${m.diffusion.graphEdges} edges`);
|
||||
if (m.diffusion.graphEdges > 0) {
|
||||
const ch = m.diffusion.byChannel || {};
|
||||
lines.push(`│ └─ by_channel: who=${ch.who || 0}, what=${ch.what || 0}, where=${ch.where || 0}, how=${ch.how || 0}`);
|
||||
}
|
||||
if (m.diffusion.iterations > 0) {
|
||||
lines.push(`├─ ppr: ${m.diffusion.iterations} iterations, ε=${Number(m.diffusion.convergenceError).toExponential(1)}`);
|
||||
}
|
||||
lines.push(`├─ activated (excl seeds): ${m.diffusion.pprActivated}`);
|
||||
if (m.diffusion.pprActivated > 0) {
|
||||
lines.push(`├─ cosine_gate: ${m.diffusion.cosineGatePassed} passed, ${m.diffusion.cosineGateFiltered} filtered`);
|
||||
if (m.diffusion.cosineGateNoVector > 0) {
|
||||
lines.push(`│ └─ no_vector: ${m.diffusion.cosineGateNoVector}`);
|
||||
}
|
||||
}
|
||||
lines.push(`├─ final_injected: ${m.diffusion.finalCount}`);
|
||||
if (m.diffusion.finalCount > 0) {
|
||||
const ds = m.diffusion.scoreDistribution;
|
||||
lines.push(`├─ scores: min=${ds.min}, max=${ds.max}, mean=${ds.mean}`);
|
||||
}
|
||||
lines.push(`└─ time: ${m.diffusion.time}ms`);
|
||||
lines.push('');
|
||||
|
||||
// Formatting
|
||||
lines.push('[Formatting] 格式化');
|
||||
lines.push(`├─ sections: [${(m.formatting.sectionsIncluded || []).join(', ')}]`);
|
||||
@@ -372,6 +418,7 @@ export function formatMetricsLog(metrics) {
|
||||
lines.push(`├─ evidence_retrieval: ${m.timing.evidenceRetrieval}ms`);
|
||||
lines.push(`├─ floor_rerank: ${m.timing.evidenceRerank || 0}ms`);
|
||||
lines.push(`├─ l1_cosine: ${m.evidence.l1CosineTime}ms`);
|
||||
lines.push(`├─ diffusion: ${m.timing.diffusion}ms`);
|
||||
lines.push(`├─ evidence_assembly: ${m.timing.evidenceAssembly}ms`);
|
||||
lines.push(`├─ formatting: ${m.timing.formatting}ms`);
|
||||
lines.push(`└─ total: ${m.timing.total}ms`);
|
||||
@@ -578,5 +625,25 @@ export function detectIssues(metrics) {
|
||||
issues.push(`Slow L1 cosine scoring (${m.evidence.l1CosineTime}ms) - too many chunks pulled`);
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
// Diffusion 问题
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
|
||||
if (m.diffusion.graphEdges === 0 && m.diffusion.seedCount > 0) {
|
||||
issues.push('No diffusion graph edges - atoms may lack who/edges fields');
|
||||
}
|
||||
|
||||
if (m.diffusion.pprActivated > 0 && m.diffusion.cosineGatePassed === 0) {
|
||||
issues.push('All PPR-activated nodes failed cosine gate - graph structure diverged from query semantics');
|
||||
}
|
||||
|
||||
if (m.diffusion.cosineGateNoVector > 5) {
|
||||
issues.push(`${m.diffusion.cosineGateNoVector} PPR nodes missing vectors - L0 vectorization may be incomplete`);
|
||||
}
|
||||
|
||||
if (m.diffusion.time > 50) {
|
||||
issues.push(`Slow diffusion (${m.diffusion.time}ms) - graph may be too dense`);
|
||||
}
|
||||
|
||||
return issues;
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ import {
|
||||
import { getLexicalIndex, searchLexicalIndex } from './lexical-index.js';
|
||||
import { rerankChunks } from '../llm/reranker.js';
|
||||
import { createMetrics, calcSimilarityStats } from './metrics.js';
|
||||
import { diffuseFromSeeds } from './diffusion.js';
|
||||
|
||||
const MODULE_ID = 'recall';
|
||||
|
||||
@@ -59,10 +60,10 @@ const CONFIG = {
|
||||
EVENT_SELECT_MAX: 50,
|
||||
EVENT_MIN_SIMILARITY: 0.55,
|
||||
EVENT_MMR_LAMBDA: 0.72,
|
||||
EVENT_ENTITY_BYPASS_SIM: 0.80,
|
||||
EVENT_ENTITY_BYPASS_SIM: 0.70,
|
||||
|
||||
// Lexical Dense 门槛
|
||||
LEXICAL_EVENT_DENSE_MIN: 0.50,
|
||||
LEXICAL_EVENT_DENSE_MIN: 0.60,
|
||||
LEXICAL_FLOOR_DENSE_MIN: 0.50,
|
||||
|
||||
// W-RRF 融合(L0-only)
|
||||
@@ -71,10 +72,6 @@ const CONFIG = {
|
||||
RRF_W_LEX: 0.9,
|
||||
FUSION_CAP: 60,
|
||||
|
||||
// Dense floor 聚合权重
|
||||
DENSE_AGG_W_MAX: 0.6,
|
||||
DENSE_AGG_W_MEAN: 0.4,
|
||||
|
||||
// Lexical floor 聚合密度加成
|
||||
LEX_DENSITY_BONUS: 0.3,
|
||||
|
||||
@@ -102,6 +99,20 @@ function cosineSimilarity(a, b) {
|
||||
return nA && nB ? dot / (Math.sqrt(nA) * Math.sqrt(nB)) : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* 从事件 summary 末尾解析楼层范围 (#X) 或 (#X-Y)
|
||||
* @param {string} summary
|
||||
* @returns {{start: number, end: number}|null}
|
||||
*/
|
||||
function parseFloorRange(summary) {
|
||||
if (!summary) return null;
|
||||
const match = String(summary).match(/\(#(\d+)(?:-(\d+))?\)/);
|
||||
if (!match) return null;
|
||||
const start = Math.max(0, parseInt(match[1], 10) - 1);
|
||||
const end = Math.max(0, (match[2] ? parseInt(match[2], 10) : parseInt(match[1], 10)) - 1);
|
||||
return { start, end };
|
||||
}
|
||||
|
||||
function normalize(s) {
|
||||
return String(s || '')
|
||||
.normalize('NFKC')
|
||||
@@ -253,19 +264,19 @@ function mmrSelect(candidates, k, lambda, getVector, getScore) {
|
||||
async function recallAnchors(queryVector, vectorConfig, metrics) {
|
||||
const { chatId } = getContext();
|
||||
if (!chatId || !queryVector?.length) {
|
||||
return { hits: [], floors: new Set() };
|
||||
return { hits: [], floors: new Set(), stateVectors: [] };
|
||||
}
|
||||
|
||||
const meta = await getMeta(chatId);
|
||||
const fp = getEngineFingerprint(vectorConfig);
|
||||
if (meta.fingerprint && meta.fingerprint !== fp) {
|
||||
xbLog.warn(MODULE_ID, 'Anchor fingerprint 不匹配');
|
||||
return { hits: [], floors: new Set() };
|
||||
return { hits: [], floors: new Set(), stateVectors: [] };
|
||||
}
|
||||
|
||||
const stateVectors = await getAllStateVectors(chatId);
|
||||
if (!stateVectors.length) {
|
||||
return { hits: [], floors: new Set() };
|
||||
return { hits: [], floors: new Set(), stateVectors: [] };
|
||||
}
|
||||
|
||||
const atomsList = getStateAtoms();
|
||||
@@ -298,7 +309,7 @@ async function recallAnchors(queryVector, vectorConfig, metrics) {
|
||||
}));
|
||||
}
|
||||
|
||||
return { hits: scored, floors };
|
||||
return { hits: scored, floors, stateVectors };
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
@@ -402,7 +413,7 @@ async function recallEvents(queryVector, allEvents, vectorConfig, focusEntities,
|
||||
|
||||
if (metrics) {
|
||||
metrics.event.selected = results.length;
|
||||
metrics.event.byRecallType = { direct: directCount, related: relatedCount, causal: 0, lexical: 0 };
|
||||
metrics.event.byRecallType = { direct: directCount, related: relatedCount, causal: 0, lexical: 0, l0Linked: 0 };
|
||||
metrics.event.similarityDistribution = calcSimilarityStats(results.map(r => r.similarity));
|
||||
}
|
||||
|
||||
@@ -517,23 +528,18 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
|
||||
// 6a. Dense floor rank(加权聚合:maxSim×0.6 + meanSim×0.4)
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
|
||||
const denseFloorAgg = new Map();
|
||||
const denseFloorMax = new Map();
|
||||
for (const a of (anchorHits || [])) {
|
||||
const cur = denseFloorAgg.get(a.floor);
|
||||
if (!cur) {
|
||||
denseFloorAgg.set(a.floor, { maxSim: a.similarity, hitCount: 1, sumSim: a.similarity });
|
||||
} else {
|
||||
cur.maxSim = Math.max(cur.maxSim, a.similarity);
|
||||
cur.hitCount++;
|
||||
cur.sumSim += a.similarity;
|
||||
const cur = denseFloorMax.get(a.floor);
|
||||
if (!cur || a.similarity > cur) {
|
||||
denseFloorMax.set(a.floor, a.similarity);
|
||||
}
|
||||
}
|
||||
|
||||
const denseFloorRank = [...denseFloorAgg.entries()]
|
||||
.map(([floor, info]) => ({
|
||||
const denseFloorRank = [...denseFloorMax.entries()]
|
||||
.map(([floor, maxSim]) => ({
|
||||
id: floor,
|
||||
score: info.maxSim * CONFIG.DENSE_AGG_W_MAX
|
||||
+ (info.sumSim / info.hitCount) * CONFIG.DENSE_AGG_W_MEAN,
|
||||
score: maxSim,
|
||||
}))
|
||||
.sort((a, b) => b.score - a.score);
|
||||
|
||||
@@ -565,8 +571,8 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
|
||||
if (!atomFloorSet.has(floor)) continue;
|
||||
|
||||
// Dense 门槛:lexical floor 必须有最低 dense 相关性
|
||||
const denseInfo = denseFloorAgg.get(floor);
|
||||
if (!denseInfo || denseInfo.maxSim < CONFIG.LEXICAL_FLOOR_DENSE_MIN) {
|
||||
const denseMax = denseFloorMax.get(floor);
|
||||
if (!denseMax || denseMax < CONFIG.LEXICAL_FLOOR_DENSE_MIN) {
|
||||
lexFloorFilteredByDense++;
|
||||
continue;
|
||||
}
|
||||
@@ -605,7 +611,7 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
|
||||
metrics.fusion.totalUnique = totalUnique;
|
||||
metrics.fusion.afterCap = fusedFloors.length;
|
||||
metrics.fusion.time = fusionTime;
|
||||
metrics.fusion.denseAggMethod = `max×${CONFIG.DENSE_AGG_W_MAX}+mean×${CONFIG.DENSE_AGG_W_MEAN}`;
|
||||
metrics.fusion.denseAggMethod = 'maxSim';
|
||||
metrics.fusion.lexDensityBonus = CONFIG.LEX_DENSITY_BONUS;
|
||||
metrics.evidence.floorCandidates = fusedFloors.length;
|
||||
}
|
||||
@@ -1060,7 +1066,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
||||
}
|
||||
|
||||
const T_R2_Anchor_Start = performance.now();
|
||||
const { hits: anchorHits, floors: anchorFloors_dense } = await recallAnchors(queryVector_v1, vectorConfig, metrics);
|
||||
const { hits: anchorHits, floors: anchorFloors_dense, stateVectors: allStateVectors } = await recallAnchors(queryVector_v1, vectorConfig, metrics);
|
||||
metrics.timing.anchorSearch = Math.round(performance.now() - T_R2_Anchor_Start);
|
||||
|
||||
const T_R2_Event_Start = performance.now();
|
||||
@@ -1108,6 +1114,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
||||
const eventIndex = buildEventIndex(allEvents);
|
||||
let lexicalEventCount = 0;
|
||||
let lexicalEventFilteredByDense = 0;
|
||||
const focusSetForLexical = new Set((bundle.focusEntities || []).map(normalize));
|
||||
|
||||
for (const eid of lexicalResult.eventIds) {
|
||||
if (existingEventIds.has(eid)) continue;
|
||||
@@ -1129,16 +1136,59 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 通过门槛,使用实际 dense similarity(而非硬编码 0)
|
||||
// 实体分类:与 Dense 路径统一标准
|
||||
const participants = (ev.participants || []).map(p => normalize(p));
|
||||
const hasEntityMatch = focusSetForLexical.size > 0 && participants.some(p => focusSetForLexical.has(p));
|
||||
|
||||
eventHits.push({
|
||||
event: ev,
|
||||
similarity: sim,
|
||||
_recallType: 'LEXICAL',
|
||||
_recallType: hasEntityMatch ? 'DIRECT' : 'RELATED',
|
||||
});
|
||||
existingEventIds.add(eid);
|
||||
lexicalEventCount++;
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// 阶段 5.5: L0 → L2 反向查找
|
||||
// 已召回的 L0 楼层落在某 L2 事件范围内,但该 L2 自身未被召回
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
const recalledL0Floors = new Set(anchorHits.map(h => h.floor));
|
||||
let l0LinkedCount = 0;
|
||||
|
||||
for (const event of allEvents) {
|
||||
if (existingEventIds.has(event.id)) continue;
|
||||
|
||||
const range = parseFloorRange(event.summary);
|
||||
if (!range) continue;
|
||||
|
||||
let hasOverlap = false;
|
||||
for (const floor of recalledL0Floors) {
|
||||
if (floor >= range.start && floor <= range.end) {
|
||||
hasOverlap = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!hasOverlap) continue;
|
||||
|
||||
// 实体分类:与所有路径统一标准
|
||||
const participants = (event.participants || []).map(p => normalize(p));
|
||||
const hasEntityMatch = focusSetForLexical.size > 0
|
||||
&& participants.some(p => focusSetForLexical.has(p));
|
||||
|
||||
const evVec = eventVectorMap.get(event.id);
|
||||
const sim = evVec?.length ? cosineSimilarity(queryVector_v1, evVec) : 0;
|
||||
|
||||
eventHits.push({
|
||||
event,
|
||||
similarity: sim,
|
||||
_recallType: hasEntityMatch ? 'DIRECT' : 'RELATED',
|
||||
});
|
||||
existingEventIds.add(event.id);
|
||||
l0LinkedCount++;
|
||||
}
|
||||
|
||||
if (metrics) {
|
||||
metrics.lexical.eventFilteredByDense = lexicalEventFilteredByDense;
|
||||
|
||||
@@ -1146,10 +1196,14 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
||||
metrics.event.byRecallType.lexical = lexicalEventCount;
|
||||
metrics.event.selected += lexicalEventCount;
|
||||
}
|
||||
if (l0LinkedCount > 0) {
|
||||
metrics.event.byRecallType.l0Linked = l0LinkedCount;
|
||||
metrics.event.selected += l0LinkedCount;
|
||||
}
|
||||
}
|
||||
|
||||
xbLog.info(MODULE_ID,
|
||||
`Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} (${lexTime}ms)`
|
||||
`Lexical: chunks=${lexicalResult.chunkIds.length} events=${lexicalResult.eventIds.length} mergedEvents=+${lexicalEventCount} filteredByDense=${lexicalEventFilteredByDense} l0Linked=+${l0LinkedCount} (${lexTime}ms)`
|
||||
);
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
@@ -1164,6 +1218,35 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
||||
metrics
|
||||
);
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Stage 7.5: PPR Diffusion Activation
|
||||
//
|
||||
// Spread from reranked seeds through entity co-occurrence graph.
|
||||
// Diffused atoms merge into l0Selected at lower scores than seeds,
|
||||
// consumed by prompt.js through the same budget pipeline.
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
const diffused = diffuseFromSeeds(
|
||||
l0Selected, // seeds (rerank-verified)
|
||||
getStateAtoms(), // all L0 atoms
|
||||
allStateVectors, // all L0 vectors (already read by recallAnchors)
|
||||
queryVector_v1, // R2 query vector (for cosine gate)
|
||||
metrics, // metrics collector
|
||||
);
|
||||
|
||||
for (const da of diffused) {
|
||||
l0Selected.push({
|
||||
id: `diffused-${da.atomId}`,
|
||||
atomId: da.atomId,
|
||||
floor: da.floor,
|
||||
similarity: da.finalScore,
|
||||
rerankScore: da.finalScore,
|
||||
atom: da.atom,
|
||||
text: da.atom.semantic || '',
|
||||
});
|
||||
}
|
||||
metrics.timing.diffusion = metrics.diffusion?.time || 0;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// 阶段 7: Causation Trace
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
@@ -1206,6 +1289,7 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
|
||||
console.log(`Floor Rerank: ${metrics.evidence.beforeRerank || 0} → ${metrics.evidence.floorsSelected || 0} floors → L0=${metrics.evidence.l0Collected || 0} (${metrics.evidence.rerankTime || 0}ms)`);
|
||||
console.log(`L1: ${metrics.evidence.l1Pulled || 0} pulled → ${metrics.evidence.l1Attached || 0} attached (${metrics.evidence.l1CosineTime || 0}ms)`);
|
||||
console.log(`Events: ${eventHits.length} hits, ${causalChain.length} causal`);
|
||||
console.log(`Diffusion: ${metrics.diffusion?.seedCount || 0} seeds → ${metrics.diffusion?.pprActivated || 0} activated → ${metrics.diffusion?.finalCount || 0} final (${metrics.diffusion?.time || 0}ms)`);
|
||||
console.groupEnd();
|
||||
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user