// ═══════════════════════════════════════════════════════════════════════════
// diffusion.js - PPR Graph Diffusion (Personalized PageRank)
//
// Spreads activation from seed L0 atoms through entity co-occurrence graph
// to discover narratively-connected but semantically-distant memories.
//
// Pipeline position: recall.js Stage 7.5
//   Input:  seeds (reranked L0 from Stage 6)
//   Output: additional L0 atoms → merged into l0Selected
//
// Algorithm:
//   1. Build undirected weighted graph over all L0 atoms
//      Four channels: WHO/WHAT/WHERE/HOW (Jaccard/Overlap/ExactMatch)
//   2. Personalized PageRank (Power Iteration)
//      Seeds weighted by rerankScore — Haveliwala (2002) topic-sensitive variant
//      α = 0.15 restart probability — Page et al. (1998)
//   3. Post-verification (Dense Cosine Gate)
//      Exclude seeds, cosine ≥ 0.45, final = PPR_norm × cosine ≥ 0.10
//
// References:
//   Page et al. "The PageRank Citation Ranking" (1998)
//   Haveliwala "Topic-Sensitive PageRank" (IEEE TKDE 2003)
//   Langville & Meyer "Eigenvector Methods for Web IR" (SIAM Review 2005)
//   Sun et al. "GraftNet" (EMNLP 2018)
//   Jaccard "Étude comparative de la distribution florale" (1912)
//   Szymkiewicz "Une contribution statistique" (1934) — Overlap coefficient
//   Rimmon-Kenan "Narrative Fiction" (2002) — Channel weight rationale
//
// Core PPR iteration aligned with NetworkX pagerank():
//   github.com/networkx/networkx — algorithms/link_analysis/pagerank_alg.py
// ═══════════════════════════════════════════════════════════════════════════

import { xbLog } from '../../../../core/debug-core.js';
import { getContext } from '../../../../../../../extensions.js';

const MODULE_ID = 'diffusion';

// ═══════════════════════════════════════════════════════════════════════════
// Configuration
// ═══════════════════════════════════════════════════════════════════════════

const CONFIG = {
    // PPR parameters (Page et al. 1998; GraftNet 2018 uses same values)
    ALPHA: 0.15,        // restart probability
    EPSILON: 1e-6,      // L1 convergence threshold
    MAX_ITER: 50,       // hard iteration cap (typically converges in 15-25)

    // Edge weight channel coefficients
    // Rationale: Rimmon-Kenan (2002) hierarchy: characters > events > setting > themes
    GAMMA: {
        who: 0.50,      // entity co-occurrence — Jaccard
        what: 0.25,     // directed pair overlap — Szymkiewicz-Simpson
        where: 0.15,    // location exact match — binary
        how: 0.10,      // dynamics tag co-occurrence — Jaccard
    },

    // Post-verification (Cosine Gate)
    COSINE_GATE: 0.45,  // min cosine(queryVector, stateVector)
    SCORE_FLOOR: 0.10,  // min finalScore = PPR_normalized × cosine
    DIFFUSION_CAP: 60,  // max diffused nodes (excluding seeds)
};

// ═══════════════════════════════════════════════════════════════════════════
// Utility functions
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Unicode-safe text normalization (matches recall.js / entity-lexicon.js):
 * NFKC fold, strip zero-width characters, trim, lowercase.
 * @param {*} s - any value; non-strings coerced, null/undefined → ''
 * @returns {string}
 */
function normalize(s) {
    return String(s || '')
        .normalize('NFKC')
        .replace(/[\u200B-\u200D\uFEFF]/g, '')
        .trim()
        .toLowerCase();
}

/**
 * Cosine similarity between two vectors.
 * Returns 0 for empty, mismatched-length, or zero-norm inputs.
 * @param {Float32Array|number[]} a
 * @param {Float32Array|number[]} b
 * @returns {number} -1..1 (0 on degenerate input)
 */
function cosineSimilarity(a, b) {
    if (!a?.length || !b?.length || a.length !== b.length) return 0;
    let dot = 0, nA = 0, nB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        nA += a[i] * a[i];
        nB += b[i] * b[i];
    }
    return nA && nB ? dot / (Math.sqrt(nA) * Math.sqrt(nB)) : 0;
}

// ═══════════════════════════════════════════════════════════════════════════
// Feature extraction from L0 atoms
// ═══════════════════════════════════════════════════════════════════════════

/**
 * WHO channel: entity set = who ∪ edges.s ∪ edges.t
 * @param {object} atom
 * @param {Set} excludeEntities - entities to exclude (e.g. name1)
 * @returns {Set}
 */
function extractEntities(atom, excludeEntities = new Set()) {
    const set = new Set();
    for (const w of (atom.who || [])) {
        const n = normalize(w);
        if (n && !excludeEntities.has(n)) set.add(n);
    }
    for (const e of (atom.edges || [])) {
        const s = normalize(e?.s);
        const t = normalize(e?.t);
        if (s && !excludeEntities.has(s)) set.add(s);
        if (t && !excludeEntities.has(t)) set.add(t);
    }
    return set;
}

/**
 * WHAT channel: directed interaction pairs "A→B" (strict direction — option A)
 * @param {object} atom
 * @param {Set} excludeEntities
 * @returns {Set}
 */
function extractDirectedPairs(atom, excludeEntities = new Set()) {
    const set = new Set();
    for (const e of (atom.edges || [])) {
        const s = normalize(e?.s);
        const t = normalize(e?.t);
        if (s && t && !excludeEntities.has(s) && !excludeEntities.has(t)) {
            set.add(`${s}\u2192${t}`);
        }
    }
    return set;
}

/**
 * WHERE channel: normalized location string
 * @param {object} atom
 * @returns {string} empty string if absent
 */
function extractLocation(atom) {
    return normalize(atom.where);
}

/**
 * HOW channel: dynamics tags set
 * @param {object} atom
 * @returns {Set}
 */
function extractDynamics(atom) {
    const set = new Set();
    for (const d of (atom.dynamics || [])) {
        const n = normalize(d);
        if (n) set.add(n);
    }
    return set;
}

// ═══════════════════════════════════════════════════════════════════════════
// Set similarity functions
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Jaccard index: |A∩B| / |A∪B| (Jaccard 1912)
 * Iterates the smaller set for O(min(|A|,|B|)) intersection.
 * @param {Set} a
 * @param {Set} b
 * @returns {number} 0..1
 */
function jaccard(a, b) {
    if (!a.size || !b.size) return 0;
    let inter = 0;
    const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
    for (const x of smaller) {
        if (larger.has(x)) inter++;
    }
    const union = a.size + b.size - inter;
    return union > 0 ? inter / union : 0;
}

/**
 * Overlap coefficient: |A∩B| / min(|A|,|B|) (Szymkiewicz-Simpson 1934)
 * Used for directed pairs where set sizes are small (1-3); Jaccard
 * over-penalizes small-set asymmetry.
 * @param {Set} a
 * @param {Set} b
 * @returns {number} 0..1
 */
function overlapCoefficient(a, b) {
    if (!a.size || !b.size) return 0;
    let inter = 0;
    const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
    for (const x of smaller) {
        if (larger.has(x)) inter++;
    }
    return inter / smaller.size;
}

// ═══════════════════════════════════════════════════════════════════════════
// Graph construction
//
// Candidate pairs discovered via inverted indices on entities and locations.
// Dynamics-only pairs excluded from candidate generation (γ_HOW = 0.10 is
// too weak to justify O(N²) blowup from 8-tag combinatorics).
// All four channels evaluated for every candidate pair.
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Pre-extract features for all atoms
 * @param {object[]} allAtoms
 * @param {Set} excludeEntities
 * @returns {object[]} feature objects with entities/directedPairs/location/dynamics
 */
function extractAllFeatures(allAtoms, excludeEntities = new Set()) {
    return allAtoms.map(atom => ({
        entities: extractEntities(atom, excludeEntities),
        directedPairs: extractDirectedPairs(atom, excludeEntities),
        location: extractLocation(atom),
        dynamics: extractDynamics(atom),
    }));
}

/**
 * Build inverted index: value → list of atom indices
 * @param {object[]} features
 * @returns {{ entityIndex: Map, locationIndex: Map }}
 */
function buildInvertedIndices(features) {
    const entityIndex = new Map();
    const locationIndex = new Map();
    for (let i = 0; i < features.length; i++) {
        for (const e of features[i].entities) {
            if (!entityIndex.has(e)) entityIndex.set(e, []);
            entityIndex.get(e).push(i);
        }
        const loc = features[i].location;
        if (loc) {
            if (!locationIndex.has(loc)) locationIndex.set(loc, []);
            locationIndex.get(loc).push(i);
        }
    }
    return { entityIndex, locationIndex };
}

/**
 * Collect candidate pairs from inverted index
 * Pairs packed as lo*N+hi into a Set for O(1) dedup across indices.
 * (Safe while N² < 2^53; N is an atom count, far below that.)
 * @param {Map} index - value → [atomIndex, ...]
 * @param {Set} pairSet - packed pair collector
 * @param {number} N - total atom count (for pair packing)
 */
function collectPairsFromIndex(index, pairSet, N) {
    for (const indices of index.values()) {
        for (let a = 0; a < indices.length; a++) {
            for (let b = a + 1; b < indices.length; b++) {
                const lo = Math.min(indices[a], indices[b]);
                const hi = Math.max(indices[a], indices[b]);
                pairSet.add(lo * N + hi);
            }
        }
    }
}

/**
 * Build weighted undirected graph over L0 atoms.
 *
 * @param {object[]} allAtoms
 * @param {Set} excludeEntities
 * @returns {{ neighbors: object[][], edgeCount: number, channelStats: object, buildTime: number }}
 */
function buildGraph(allAtoms, excludeEntities = new Set()) {
    const N = allAtoms.length;
    const T0 = performance.now();

    const features = extractAllFeatures(allAtoms, excludeEntities);
    const { entityIndex, locationIndex } = buildInvertedIndices(features);

    // Candidate pairs: share ≥1 entity or same location
    const pairSet = new Set();
    collectPairsFromIndex(entityIndex, pairSet, N);
    collectPairsFromIndex(locationIndex, pairSet, N);

    // Compute four-channel edge weights for all candidates
    const neighbors = Array.from({ length: N }, () => []);
    let edgeCount = 0;
    const channelStats = { who: 0, what: 0, where: 0, how: 0 };

    for (const packed of pairSet) {
        const i = Math.floor(packed / N);
        const j = packed % N;
        const fi = features[i];
        const fj = features[j];

        const wWho = jaccard(fi.entities, fj.entities);
        const wWhat = overlapCoefficient(fi.directedPairs, fj.directedPairs);
        const wWhere = (fi.location && fi.location === fj.location) ? 1.0 : 0.0;
        const wHow = jaccard(fi.dynamics, fj.dynamics);

        const weight =
            CONFIG.GAMMA.who * wWho +
            CONFIG.GAMMA.what * wWhat +
            CONFIG.GAMMA.where * wWhere +
            CONFIG.GAMMA.how * wHow;

        if (weight > 0) {
            neighbors[i].push({ target: j, weight });
            neighbors[j].push({ target: i, weight });
            edgeCount++;
            if (wWho > 0) channelStats.who++;
            if (wWhat > 0) channelStats.what++;
            if (wWhere > 0) channelStats.where++;
            if (wHow > 0) channelStats.how++;
        }
    }

    const buildTime = Math.round(performance.now() - T0);
    xbLog.info(MODULE_ID,
        `Graph: ${N} nodes, ${edgeCount} edges ` +
        `(who=${channelStats.who} what=${channelStats.what} ` +
        `where=${channelStats.where} how=${channelStats.how}) ` +
        `(${buildTime}ms)`
    );

    return { neighbors, edgeCount, channelStats, buildTime };
}

// ═══════════════════════════════════════════════════════════════════════════
// PPR: Seed vector construction
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Build personalization vector s from seeds, weighted by rerankScore.
 * Haveliwala (2002): non-uniform personalization improves topic sensitivity.
 *
 * @param {object[]} seeds - seed L0 entries with atomId and rerankScore
 * @param {Map} idToIdx - atomId → array index
 * @param {number} N - total node count
 * @returns {Float64Array} personalization vector (L1-normalized, sums to 1)
 */
function buildSeedVector(seeds, idToIdx, N) {
    const s = new Float64Array(N);
    let total = 0;
    const matchedIdx = [];

    for (const seed of seeds) {
        const idx = idToIdx.get(seed.atomId);
        if (idx == null) continue;
        // Deliberate ||: a zero/absent rerankScore falls back to similarity.
        const score = Math.max(0, seed.rerankScore || seed.similarity || 0);
        s[idx] += score;
        total += score;
        matchedIdx.push(idx);
    }

    // L1 normalize to probability distribution
    if (total > 0) {
        for (let i = 0; i < N; i++) s[i] /= total;
    } else if (matchedIdx.length > 0) {
        // FIX: previously an all-zero score set left s ≡ 0, so powerIteration
        // started at π = 0 and the whole stage silently produced nothing.
        // Fall back to uniform mass over matched seeds (NetworkX behavior
        // when personalization weights degenerate).
        const u = 1 / matchedIdx.length;
        for (const idx of matchedIdx) s[idx] += u;
    }
    return s;
}

// ═══════════════════════════════════════════════════════════════════════════
// PPR: Column normalization + dangling node detection
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Column-normalize adjacency into transition matrix W.
 *
 * Column j of W: W_{ij} = weight(i,j) / Σ_k weight(k,j)
 * Dangling nodes (no outgoing edges): handled in powerIteration
 * via redistribution to personalization vector s.
 * (Langville & Meyer 2005, §4.1)
 *
 * @param {object[][]} neighbors - neighbors[j] = [{target, weight}, ...]
 * @param {number} N
 * @returns {{ columns: object[][], dangling: number[] }}
 */
function columnNormalize(neighbors, N) {
    const columns = Array.from({ length: N }, () => []);
    const dangling = [];

    for (let j = 0; j < N; j++) {
        const edges = neighbors[j];
        let sum = 0;
        for (let e = 0; e < edges.length; e++) sum += edges[e].weight;

        if (sum <= 0) {
            dangling.push(j);
            continue;
        }
        const col = columns[j];
        for (let e = 0; e < edges.length; e++) {
            col.push({ target: edges[e].target, prob: edges[e].weight / sum });
        }
    }
    return { columns, dangling };
}

// ═══════════════════════════════════════════════════════════════════════════
// PPR: Power Iteration
//
// Aligned with NetworkX pagerank() (pagerank_alg.py):
//
//   NetworkX "alpha"   = damping       = our (1 − α)
//   NetworkX "1-alpha" = teleportation = our α
//
// Per iteration:
//   π_new[i] = α·s[i] + (1−α)·( Σ_j W_{ij}·π[j] + dangling_sum·s[i] )
//
// Convergence: Perron-Frobenius theorem guarantees unique stationary
// distribution for irreducible aperiodic column-stochastic matrix.
// Rate: ‖π^(t+1) − π^t‖₁ ≤ (1−α)^t (geometric).
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Run PPR Power Iteration.
 *
 * @param {object[][]} columns - column-normalized transition matrix
 * @param {Float64Array} s - personalization vector (sums to 1)
 * @param {number[]} dangling - dangling node indices
 * @param {number} N - node count
 * @returns {{ pi: Float64Array, iterations: number, finalError: number }}
 */
function powerIteration(columns, s, dangling, N) {
    const alpha = CONFIG.ALPHA;
    const d = 1 - alpha; // damping factor = prob of following edges
    const epsilon = CONFIG.EPSILON;
    const maxIter = CONFIG.MAX_ITER;

    // Initialize π to personalization vector
    let pi = new Float64Array(N);
    for (let i = 0; i < N; i++) pi[i] = s[i];

    let iterations = 0;
    let finalError = 0;

    for (let iter = 0; iter < maxIter; iter++) {
        const piNew = new Float64Array(N);

        // Dangling mass: probability at nodes with no outgoing edges
        // redistributed to personalization vector (Langville & Meyer 2005)
        let danglingSum = 0;
        for (let k = 0; k < dangling.length; k++) {
            danglingSum += pi[dangling[k]];
        }

        // Sparse matrix-vector product: (1−α) · W · π
        for (let j = 0; j < N; j++) {
            const pj = pi[j];
            if (pj === 0) continue;
            const col = columns[j];
            const dpj = d * pj;
            for (let e = 0; e < col.length; e++) {
                piNew[col[e].target] += dpj * col[e].prob;
            }
        }

        // Restart + dangling contribution:
        //   α · s[i] + (1−α) · danglingSum · s[i]
        const restartCoeff = alpha + d * danglingSum;
        for (let i = 0; i < N; i++) {
            piNew[i] += restartCoeff * s[i];
        }

        // L1 convergence check
        let l1 = 0;
        for (let i = 0; i < N; i++) {
            l1 += Math.abs(piNew[i] - pi[i]);
        }

        pi = piNew;
        iterations = iter + 1;
        finalError = l1;
        if (l1 < epsilon) break;
    }

    return { pi, iterations, finalError };
}

// ═══════════════════════════════════════════════════════════════════════════
// Post-verification: Dense Cosine Gate
//
// PPR measures graph-structural relevance ("same characters").
// Cosine gate measures semantic relevance ("related to current topic").
// Product combination ensures both dimensions are satisfied
// (CombMNZ — Fox & Shaw, TREC-2 1994).
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Filter PPR-activated nodes by semantic relevance.
 *
 * For each non-seed node with PPR > 0:
 *   1. cosine(queryVector, stateVector) ≥ COSINE_GATE
 *   2. finalScore = PPR_normalized × cosine ≥ SCORE_FLOOR
 *   3. Top DIFFUSION_CAP by finalScore
 *
 * @param {Float64Array} pi - PPR stationary distribution
 * @param {string[]} atomIds - index → atomId
 * @param {Map} atomById - atomId → atom object
 * @param {Set} seedAtomIds - seed atomIds (excluded from output)
 * @param {Map} vectorMap - atomId → embedding vector
 * @param {Float32Array|number[]} queryVector - R2 weighted query vector
 * @returns {{ diffused: object[], gateStats: object }}
 */
function postVerify(pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector) {
    const N = atomIds.length;
    const gateStats = { passed: 0, filtered: 0, noVector: 0 };

    // Find max PPR score among non-seed nodes (for normalization)
    let maxPPR = 0;
    for (let i = 0; i < N; i++) {
        if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) {
            if (pi[i] > maxPPR) maxPPR = pi[i];
        }
    }
    if (maxPPR <= 0) {
        return { diffused: [], gateStats };
    }

    const candidates = [];
    for (let i = 0; i < N; i++) {
        const atomId = atomIds[i];

        // Skip seeds and zero-probability nodes
        if (seedAtomIds.has(atomId)) continue;
        if (pi[i] <= 0) continue;

        // Require state vector for cosine verification
        const vec = vectorMap.get(atomId);
        if (!vec?.length) {
            gateStats.noVector++;
            continue;
        }

        // Cosine gate
        const cos = cosineSimilarity(queryVector, vec);
        if (cos < CONFIG.COSINE_GATE) {
            gateStats.filtered++;
            continue;
        }

        // Final score = PPR_normalized × cosine
        const pprNorm = pi[i] / maxPPR;
        const finalScore = pprNorm * cos;
        if (finalScore < CONFIG.SCORE_FLOOR) {
            gateStats.filtered++;
            continue;
        }

        // FIX: resolve the atom BEFORE counting the node as passed —
        // previously a missing atom inflated gateStats.passed while
        // producing no candidate, so metrics disagreed with output.
        const atom = atomById.get(atomId);
        if (!atom) continue;
        gateStats.passed++;

        candidates.push({
            atomId,
            floor: atom.floor,
            atom,
            finalScore,
            pprScore: pi[i],
            pprNormalized: pprNorm,
            cosine: cos,
        });
    }

    // Sort by finalScore descending, cap at DIFFUSION_CAP
    candidates.sort((a, b) => b.finalScore - a.finalScore);
    const diffused = candidates.slice(0, CONFIG.DIFFUSION_CAP);

    return { diffused, gateStats };
}

// ═══════════════════════════════════════════════════════════════════════════
// Main entry point
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Spread activation from seed L0 atoms through entity co-occurrence graph.
 *
 * Called from recall.js Stage 7.5, after locateAndPullEvidence and before
 * Causation Trace. Results are merged into l0Selected and consumed by
 * prompt.js through existing budget/formatting pipeline (zero downstream changes).
 *
 * @param {object[]} seeds - l0Selected from recall Stage 6
 *   Each: { atomId, rerankScore, similarity, atom, ... }
 * @param {object[]} allAtoms - getStateAtoms() result
 *   Each: { atomId, floor, semantic, who, edges, dynamics, where }
 * @param {object[]} stateVectors - getAllStateVectors() result
 *   Each: { atomId, floor, vector: Float32Array }
 * @param {Float32Array|number[]} queryVector - R2 weighted query vector
 * @param {object|null} metrics - metrics object (optional, mutated in-place)
 * @returns {object[]} Additional L0 atoms for l0Selected
 *   Each: { atomId, floor, atom, finalScore, pprScore, pprNormalized, cosine }
 */
export function diffuseFromSeeds(seeds, allAtoms, stateVectors, queryVector, metrics) {
    const T0 = performance.now();

    // ─── Early exits ─────────────────────────────────────────────────
    if (!seeds?.length || !allAtoms?.length || !queryVector?.length) {
        fillMetricsEmpty(metrics);
        return [];
    }

    // Align with entity-lexicon hard rule: exclude name1 from graph features.
    const { name1 } = getContext();
    const excludeEntities = new Set();
    if (name1) excludeEntities.add(normalize(name1));

    // ─── 1. Build atom index ─────────────────────────────────────────
    const atomById = new Map();
    const atomIds = [];
    const idToIdx = new Map();
    for (let i = 0; i < allAtoms.length; i++) {
        const a = allAtoms[i];
        atomById.set(a.atomId, a);
        atomIds.push(a.atomId);
        idToIdx.set(a.atomId, i);
    }
    const N = allAtoms.length;

    // Validate seeds against atom index
    const validSeeds = seeds.filter(s => idToIdx.has(s.atomId));
    const seedAtomIds = new Set(validSeeds.map(s => s.atomId));
    if (!validSeeds.length) {
        fillMetricsEmpty(metrics);
        return [];
    }

    // ─── 2. Build graph ──────────────────────────────────────────────
    const graph = buildGraph(allAtoms, excludeEntities);
    if (graph.edgeCount === 0) {
        fillMetrics(metrics, {
            seedCount: validSeeds.length,
            graphNodes: N,
            graphEdges: 0,
            channelStats: graph.channelStats,
            buildTime: graph.buildTime,
            time: graph.buildTime,
        });
        xbLog.info(MODULE_ID, 'No graph edges — skipping diffusion');
        return [];
    }

    // ─── 3. Build seed vector ────────────────────────────────────────
    const s = buildSeedVector(validSeeds, idToIdx, N);

    // ─── 4. Column normalize ─────────────────────────────────────────
    const { columns, dangling } = columnNormalize(graph.neighbors, N);

    // ─── 5. PPR Power Iteration ──────────────────────────────────────
    const T_PPR = performance.now();
    const { pi, iterations, finalError } = powerIteration(columns, s, dangling, N);
    const pprTime = Math.round(performance.now() - T_PPR);

    // Count activated non-seed nodes
    let pprActivated = 0;
    for (let i = 0; i < N; i++) {
        if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) pprActivated++;
    }

    // ─── 6. Post-verification ────────────────────────────────────────
    const vectorMap = new Map();
    for (const sv of (stateVectors || [])) {
        vectorMap.set(sv.atomId, sv.vector);
    }
    const { diffused, gateStats } = postVerify(
        pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector
    );

    // ─── 7. Metrics ──────────────────────────────────────────────────
    const totalTime = Math.round(performance.now() - T0);
    fillMetrics(metrics, {
        seedCount: validSeeds.length,
        graphNodes: N,
        graphEdges: graph.edgeCount,
        channelStats: graph.channelStats,
        buildTime: graph.buildTime,
        iterations,
        convergenceError: finalError,
        pprActivated,
        cosineGatePassed: gateStats.passed,
        cosineGateFiltered: gateStats.filtered,
        cosineGateNoVector: gateStats.noVector,
        finalCount: diffused.length,
        scoreDistribution: diffused.length > 0
            ? calcScoreStats(diffused.map(d => d.finalScore))
            : { min: 0, max: 0, mean: 0 },
        time: totalTime,
    });

    xbLog.info(MODULE_ID,
        `Diffusion: ${validSeeds.length} seeds → ` +
        `graph(${N}n/${graph.edgeCount}e) → ` +
        `PPR(${iterations}it, ε=${finalError.toExponential(1)}, ${pprTime}ms) → ` +
        `${pprActivated} activated → ` +
        `gate(${gateStats.passed}\u2713/${gateStats.filtered}\u2717` +
        `${gateStats.noVector ? `/${gateStats.noVector}?` : ''}) → ` +
        `${diffused.length} final (${totalTime}ms)`
    );

    return diffused;
}

// ═══════════════════════════════════════════════════════════════════════════
// Metrics helpers
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Compute min/max/mean distribution
 * @param {number[]} scores
 * @returns {{ min: number, max: number, mean: number }}
 */
function calcScoreStats(scores) {
    if (!scores.length) return { min: 0, max: 0, mean: 0 };
    const sorted = [...scores].sort((a, b) => a - b);
    const sum = sorted.reduce((a, b) => a + b, 0);
    return {
        min: Number(sorted[0].toFixed(3)),
        max: Number(sorted[sorted.length - 1].toFixed(3)),
        mean: Number((sum / sorted.length).toFixed(3)),
    };
}

/**
 * Fill metrics with empty diffusion block
 * @param {object|null} metrics - mutated in place; no-op when falsy
 */
function fillMetricsEmpty(metrics) {
    if (!metrics) return;
    metrics.diffusion = {
        seedCount: 0,
        graphNodes: 0,
        graphEdges: 0,
        buildTime: 0,
        iterations: 0,
        convergenceError: 0,
        pprActivated: 0,
        cosineGatePassed: 0,
        cosineGateFiltered: 0,
        cosineGateNoVector: 0,
        finalCount: 0,
        scoreDistribution: { min: 0, max: 0, mean: 0 },
        byChannel: { who: 0, what: 0, where: 0, how: 0 },
        time: 0,
    };
}

/**
 * Fill metrics with diffusion results
 * @param {object|null} metrics - mutated in place; no-op when falsy
 * @param {object} data - partial metrics; missing fields default to 0
 */
function fillMetrics(metrics, data) {
    if (!metrics) return;
    metrics.diffusion = {
        seedCount: data.seedCount || 0,
        graphNodes: data.graphNodes || 0,
        graphEdges: data.graphEdges || 0,
        // FIX: buildTime was computed and passed by the caller but silently
        // dropped here — record it alongside total time.
        buildTime: data.buildTime || 0,
        iterations: data.iterations || 0,
        convergenceError: data.convergenceError || 0,
        pprActivated: data.pprActivated || 0,
        cosineGatePassed: data.cosineGatePassed || 0,
        cosineGateFiltered: data.cosineGateFiltered || 0,
        cosineGateNoVector: data.cosineGateNoVector || 0,
        finalCount: data.finalCount || 0,
        scoreDistribution: data.scoreDistribution || { min: 0, max: 0, mean: 0 },
        byChannel: data.channelStats || { who: 0, what: 0, where: 0, how: 0 },
        time: data.time || 0,
    };
}