Upload LittleWhiteBox extension
This commit is contained in:
928
modules/story-summary/vector/retrieval/diffusion.js
Normal file
928
modules/story-summary/vector/retrieval/diffusion.js
Normal file
@@ -0,0 +1,928 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// diffusion.js - PPR Graph Diffusion (Personalized PageRank)
|
||||
//
|
||||
// Spreads activation from seed L0 atoms through entity co-occurrence graph
|
||||
// to discover narratively-connected but semantically-distant memories.
|
||||
//
|
||||
// Pipeline position: recall.js Stage 7.5
|
||||
// Input: seeds (reranked L0 from Stage 6)
|
||||
// Output: additional L0 atoms → merged into l0Selected
|
||||
//
|
||||
// Algorithm:
|
||||
// 1. Build undirected weighted graph over all L0 atoms
|
||||
// Candidate edges: WHAT + R semantic; WHO/WHERE are reweight-only
|
||||
// 2. Personalized PageRank (Power Iteration)
|
||||
// Seeds weighted by rerankScore — Haveliwala (2002) topic-sensitive variant
|
||||
// α = 0.15 restart probability — Page et al. (1998)
|
||||
// 3. Post-verification (Dense Cosine Gate)
|
||||
// Exclude seeds, cosine ≥ 0.45, final = PPR_norm × cosine ≥ 0.10
|
||||
//
|
||||
// References:
|
||||
// Page et al. "The PageRank Citation Ranking" (1998)
|
||||
// Haveliwala "Topic-Sensitive PageRank" (IEEE TKDE 2003)
|
||||
// Langville & Meyer "Eigenvector Methods for Web IR" (SIAM Review 2005)
|
||||
// Sun et al. "GraftNet" (EMNLP 2018)
|
||||
// Jaccard "Étude comparative de la distribution florale" (1912)
|
||||
// Szymkiewicz "Une contribution statistique" (1934) — Overlap coefficient
|
||||
// Rimmon-Kenan "Narrative Fiction" (2002) — Channel weight rationale
|
||||
//
|
||||
// Core PPR iteration aligned with NetworkX pagerank():
|
||||
// github.com/networkx/networkx — algorithms/link_analysis/pagerank_alg.py
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { xbLog } from '../../../../core/debug-core.js';
|
||||
import { getContext } from '../../../../../../../extensions.js';
|
||||
|
||||
const MODULE_ID = 'diffusion';
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Configuration
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
const CONFIG = {
|
||||
// PPR parameters (Page et al. 1998; GraftNet 2018 uses same values)
|
||||
ALPHA: 0.15, // restart probability
|
||||
EPSILON: 1e-5, // L1 convergence threshold
|
||||
MAX_ITER: 50, // hard iteration cap (typically converges in 15-25)
|
||||
|
||||
// Edge weight channel coefficients
|
||||
// Candidate generation uses WHAT + R semantic only.
|
||||
// WHO/WHERE are reweight-only signals.
|
||||
GAMMA: {
|
||||
what: 0.40, // interaction pair overlap
|
||||
rSem: 0.40, // semantic similarity over edges.r aggregate
|
||||
who: 0.10, // endpoint entity overlap (reweight-only)
|
||||
where: 0.05, // location exact match (reweight-only)
|
||||
time: 0.05, // temporal decay score
|
||||
},
|
||||
// R semantic candidate generation
|
||||
R_SEM_MIN_SIM: 0.62,
|
||||
R_SEM_TOPK: 8,
|
||||
TIME_WINDOW_MAX: 80,
|
||||
TIME_DECAY_DIVISOR: 12,
|
||||
WHERE_MAX_GROUP_SIZE: 16, // skip location-only pair expansion for over-common places
|
||||
WHERE_FREQ_DAMP_PIVOT: 6, // location freq <= pivot keeps full WHERE score
|
||||
WHERE_FREQ_DAMP_MIN: 0.20, // lower bound for damped WHERE contribution
|
||||
|
||||
// Post-verification (Cosine Gate)
|
||||
COSINE_GATE: 0.46, // min cosine(queryVector, stateVector)
|
||||
SCORE_FLOOR: 0.10, // min finalScore = PPR_normalized × cosine
|
||||
DIFFUSION_CAP: 100, // max diffused nodes (excluding seeds)
|
||||
};
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Utility functions
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Unicode-safe text normalization (matches recall.js / entity-lexicon.js)
|
||||
*/
|
||||
function normalize(s) {
|
||||
return String(s || '')
|
||||
.normalize('NFKC')
|
||||
.replace(/[\u200B-\u200D\uFEFF]/g, '')
|
||||
.trim()
|
||||
.toLowerCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* Cosine similarity between two vectors
|
||||
*/
|
||||
function cosineSimilarity(a, b) {
|
||||
if (!a?.length || !b?.length || a.length !== b.length) return 0;
|
||||
let dot = 0, nA = 0, nB = 0;
|
||||
for (let i = 0; i < a.length; i++) {
|
||||
dot += a[i] * b[i];
|
||||
nA += a[i] * a[i];
|
||||
nB += b[i] * b[i];
|
||||
}
|
||||
return nA && nB ? dot / (Math.sqrt(nA) * Math.sqrt(nB)) : 0;
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Feature extraction from L0 atoms
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Endpoint entity set from edges.s/edges.t (used for candidate pair generation).
|
||||
* @param {object} atom
|
||||
* @param {Set<string>} excludeEntities - entities to exclude (e.g. name1)
|
||||
* @returns {Set<string>}
|
||||
*/
|
||||
function extractEntities(atom, excludeEntities = new Set()) {
|
||||
const set = new Set();
|
||||
for (const e of (atom.edges || [])) {
|
||||
const s = normalize(e?.s);
|
||||
const t = normalize(e?.t);
|
||||
if (s && !excludeEntities.has(s)) set.add(s);
|
||||
if (t && !excludeEntities.has(t)) set.add(t);
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* WHAT channel: interaction pairs "A↔B" (direction-insensitive).
|
||||
* @param {object} atom
|
||||
* @param {Set<string>} excludeEntities
|
||||
* @returns {Set<string>}
|
||||
*/
|
||||
function extractInteractionPairs(atom, excludeEntities = new Set()) {
|
||||
const set = new Set();
|
||||
for (const e of (atom.edges || [])) {
|
||||
const s = normalize(e?.s);
|
||||
const t = normalize(e?.t);
|
||||
if (s && t && !excludeEntities.has(s) && !excludeEntities.has(t)) {
|
||||
const pair = [s, t].sort().join('\u2194');
|
||||
set.add(pair);
|
||||
}
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* WHERE channel: normalized location string
|
||||
* @param {object} atom
|
||||
* @returns {string} empty string if absent
|
||||
*/
|
||||
function extractLocation(atom) {
|
||||
return normalize(atom.where);
|
||||
}
|
||||
|
||||
function getFloorDistance(a, b) {
|
||||
const fa = Number(a?.floor || 0);
|
||||
const fb = Number(b?.floor || 0);
|
||||
return Math.abs(fa - fb);
|
||||
}
|
||||
|
||||
function getTimeScore(distance) {
|
||||
return Math.exp(-distance / CONFIG.TIME_DECAY_DIVISOR);
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Set similarity functions
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Jaccard index: |A∩B| / |A∪B| (Jaccard 1912)
|
||||
* @param {Set<string>} a
|
||||
* @param {Set<string>} b
|
||||
* @returns {number} 0..1
|
||||
*/
|
||||
function jaccard(a, b) {
|
||||
if (!a.size || !b.size) return 0;
|
||||
let inter = 0;
|
||||
const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
|
||||
for (const x of smaller) {
|
||||
if (larger.has(x)) inter++;
|
||||
}
|
||||
const union = a.size + b.size - inter;
|
||||
return union > 0 ? inter / union : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Overlap coefficient: |A∩B| / min(|A|,|B|) (Szymkiewicz-Simpson 1934)
|
||||
* Used for directed pairs where set sizes are small (1-3); Jaccard
|
||||
* over-penalizes small-set asymmetry.
|
||||
* @param {Set<string>} a
|
||||
* @param {Set<string>} b
|
||||
* @returns {number} 0..1
|
||||
*/
|
||||
function overlapCoefficient(a, b) {
|
||||
if (!a.size || !b.size) return 0;
|
||||
let inter = 0;
|
||||
const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
|
||||
for (const x of smaller) {
|
||||
if (larger.has(x)) inter++;
|
||||
}
|
||||
return inter / smaller.size;
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Graph construction
|
||||
//
|
||||
// Candidate pairs discovered via WHAT inverted index and R semantic top-k.
|
||||
// WHO/WHERE are reweight-only signals and never create candidate pairs.
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Pre-extract features for all atoms
|
||||
* @param {object[]} allAtoms
|
||||
* @param {Set<string>} excludeEntities
|
||||
* @returns {object[]} feature objects with entities/interactionPairs/location
|
||||
*/
|
||||
function extractAllFeatures(allAtoms, excludeEntities = new Set()) {
|
||||
return allAtoms.map(atom => ({
|
||||
entities: extractEntities(atom, excludeEntities),
|
||||
interactionPairs: extractInteractionPairs(atom, excludeEntities),
|
||||
location: extractLocation(atom),
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Build inverted index: value → list of atom indices
|
||||
* @param {object[]} features
|
||||
* @returns {{ whatIndex: Map, locationFreq: Map }}
|
||||
*/
|
||||
function buildInvertedIndices(features) {
|
||||
const whatIndex = new Map();
|
||||
const locationFreq = new Map();
|
||||
|
||||
for (let i = 0; i < features.length; i++) {
|
||||
for (const pair of features[i].interactionPairs) {
|
||||
if (!whatIndex.has(pair)) whatIndex.set(pair, []);
|
||||
whatIndex.get(pair).push(i);
|
||||
}
|
||||
const loc = features[i].location;
|
||||
if (loc) locationFreq.set(loc, (locationFreq.get(loc) || 0) + 1);
|
||||
}
|
||||
|
||||
return { whatIndex, locationFreq };
|
||||
}
|
||||
|
||||
/**
|
||||
* Collect candidate pairs from inverted index
|
||||
* @param {Map} index - value → [atomIndex, ...]
|
||||
* @param {Set<number>} pairSet - packed pair collector
|
||||
* @param {number} N - total atom count (for pair packing)
|
||||
*/
|
||||
function collectPairsFromIndex(index, pairSet, N) {
|
||||
for (const indices of index.values()) {
|
||||
for (let a = 0; a < indices.length; a++) {
|
||||
for (let b = a + 1; b < indices.length; b++) {
|
||||
const lo = Math.min(indices[a], indices[b]);
|
||||
const hi = Math.max(indices[a], indices[b]);
|
||||
pairSet.add(lo * N + hi);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build weighted undirected graph over L0 atoms.
|
||||
*
|
||||
* @param {object[]} allAtoms
|
||||
* @param {object[]} stateVectors
|
||||
* @param {Set<string>} excludeEntities
|
||||
* @returns {{ neighbors: object[][], edgeCount: number, channelStats: object, buildTime: number }}
|
||||
*/
|
||||
function buildGraph(allAtoms, stateVectors = [], excludeEntities = new Set()) {
|
||||
const N = allAtoms.length;
|
||||
const T0 = performance.now();
|
||||
|
||||
const features = extractAllFeatures(allAtoms, excludeEntities);
|
||||
const { whatIndex, locationFreq } = buildInvertedIndices(features);
|
||||
|
||||
// Candidate pairs: WHAT + R semantic
|
||||
const pairSetByWhat = new Set();
|
||||
const pairSetByRSem = new Set();
|
||||
const rSemByPair = new Map();
|
||||
const pairSet = new Set();
|
||||
collectPairsFromIndex(whatIndex, pairSetByWhat, N);
|
||||
|
||||
const rVectorByAtomId = new Map(
|
||||
(stateVectors || [])
|
||||
.filter(v => v?.atomId && v?.rVector?.length)
|
||||
.map(v => [v.atomId, v.rVector])
|
||||
);
|
||||
const rVectors = allAtoms.map(a => rVectorByAtomId.get(a.atomId) || null);
|
||||
|
||||
const directedNeighbors = Array.from({ length: N }, () => []);
|
||||
let rSemSimSum = 0;
|
||||
let rSemSimCount = 0;
|
||||
let topKPrunedPairs = 0;
|
||||
let timeWindowFilteredPairs = 0;
|
||||
|
||||
// Enumerate only pairs within floor window to avoid O(N^2) full scan.
|
||||
const sortedByFloor = allAtoms
|
||||
.map((atom, idx) => ({ idx, floor: Number(atom?.floor || 0) }))
|
||||
.sort((a, b) => a.floor - b.floor);
|
||||
|
||||
for (let left = 0; left < sortedByFloor.length; left++) {
|
||||
const i = sortedByFloor[left].idx;
|
||||
const baseFloor = sortedByFloor[left].floor;
|
||||
|
||||
for (let right = left + 1; right < sortedByFloor.length; right++) {
|
||||
const floorDelta = sortedByFloor[right].floor - baseFloor;
|
||||
if (floorDelta > CONFIG.TIME_WINDOW_MAX) break;
|
||||
|
||||
const j = sortedByFloor[right].idx;
|
||||
const vi = rVectors[i];
|
||||
const vj = rVectors[j];
|
||||
if (!vi?.length || !vj?.length) continue;
|
||||
|
||||
const sim = cosineSimilarity(vi, vj);
|
||||
if (sim < CONFIG.R_SEM_MIN_SIM) continue;
|
||||
|
||||
directedNeighbors[i].push({ target: j, sim });
|
||||
directedNeighbors[j].push({ target: i, sim });
|
||||
rSemSimSum += sim;
|
||||
rSemSimCount++;
|
||||
}
|
||||
}
|
||||
|
||||
for (let i = 0; i < N; i++) {
|
||||
const arr = directedNeighbors[i];
|
||||
if (!arr.length) continue;
|
||||
arr.sort((a, b) => b.sim - a.sim);
|
||||
if (arr.length > CONFIG.R_SEM_TOPK) {
|
||||
topKPrunedPairs += arr.length - CONFIG.R_SEM_TOPK;
|
||||
}
|
||||
for (const n of arr.slice(0, CONFIG.R_SEM_TOPK)) {
|
||||
const lo = Math.min(i, n.target);
|
||||
const hi = Math.max(i, n.target);
|
||||
const packed = lo * N + hi;
|
||||
pairSetByRSem.add(packed);
|
||||
const prev = rSemByPair.get(packed) || 0;
|
||||
if (n.sim > prev) rSemByPair.set(packed, n.sim);
|
||||
}
|
||||
}
|
||||
for (const p of pairSetByWhat) pairSet.add(p);
|
||||
for (const p of pairSetByRSem) pairSet.add(p);
|
||||
|
||||
// Compute edge weights for all candidates
|
||||
const neighbors = Array.from({ length: N }, () => []);
|
||||
let edgeCount = 0;
|
||||
const channelStats = { what: 0, where: 0, rSem: 0, who: 0 };
|
||||
let reweightWhoUsed = 0;
|
||||
let reweightWhereUsed = 0;
|
||||
|
||||
for (const packed of pairSet) {
|
||||
const i = Math.floor(packed / N);
|
||||
const j = packed % N;
|
||||
|
||||
const distance = getFloorDistance(allAtoms[i], allAtoms[j]);
|
||||
if (distance > CONFIG.TIME_WINDOW_MAX) {
|
||||
timeWindowFilteredPairs++;
|
||||
continue;
|
||||
}
|
||||
const wTime = getTimeScore(distance);
|
||||
|
||||
const fi = features[i];
|
||||
const fj = features[j];
|
||||
|
||||
const wWhat = overlapCoefficient(fi.interactionPairs, fj.interactionPairs);
|
||||
const wRSem = rSemByPair.get(packed) || 0;
|
||||
const wWho = jaccard(fi.entities, fj.entities);
|
||||
let wWhere = 0.0;
|
||||
if (fi.location && fi.location === fj.location) {
|
||||
const freq = locationFreq.get(fi.location) || 1;
|
||||
const damp = Math.max(
|
||||
CONFIG.WHERE_FREQ_DAMP_MIN,
|
||||
Math.min(1, CONFIG.WHERE_FREQ_DAMP_PIVOT / Math.max(1, freq))
|
||||
);
|
||||
wWhere = damp;
|
||||
}
|
||||
|
||||
const weight =
|
||||
CONFIG.GAMMA.what * wWhat +
|
||||
CONFIG.GAMMA.rSem * wRSem +
|
||||
CONFIG.GAMMA.who * wWho +
|
||||
CONFIG.GAMMA.where * wWhere +
|
||||
CONFIG.GAMMA.time * wTime;
|
||||
|
||||
if (weight > 0) {
|
||||
neighbors[i].push({ target: j, weight });
|
||||
neighbors[j].push({ target: i, weight });
|
||||
edgeCount++;
|
||||
|
||||
if (wWhat > 0) channelStats.what++;
|
||||
if (wRSem > 0) channelStats.rSem++;
|
||||
if (wWho > 0) channelStats.who++;
|
||||
if (wWhere > 0) channelStats.where++;
|
||||
if (wWho > 0) reweightWhoUsed++;
|
||||
if (wWhere > 0) reweightWhereUsed++;
|
||||
}
|
||||
}
|
||||
|
||||
const buildTime = Math.round(performance.now() - T0);
|
||||
|
||||
xbLog.info(MODULE_ID,
|
||||
`Graph: ${N} nodes, ${edgeCount} edges ` +
|
||||
`(candidate_by_what=${pairSetByWhat.size} candidate_by_r_sem=${pairSetByRSem.size}) ` +
|
||||
`(what=${channelStats.what} r_sem=${channelStats.rSem} who=${channelStats.who} where=${channelStats.where}) ` +
|
||||
`(reweight_who_used=${reweightWhoUsed} reweight_where_used=${reweightWhereUsed}) ` +
|
||||
`(time_window_filtered=${timeWindowFilteredPairs} topk_pruned=${topKPrunedPairs}) ` +
|
||||
`(${buildTime}ms)`
|
||||
);
|
||||
|
||||
const totalPairs = N > 1 ? (N * (N - 1)) / 2 : 0;
|
||||
const edgeDensity = totalPairs > 0 ? Number((edgeCount / totalPairs * 100).toFixed(2)) : 0;
|
||||
|
||||
return {
|
||||
neighbors,
|
||||
edgeCount,
|
||||
channelStats,
|
||||
buildTime,
|
||||
candidatePairs: pairSet.size,
|
||||
pairsFromWhat: pairSetByWhat.size,
|
||||
pairsFromRSem: pairSetByRSem.size,
|
||||
rSemAvgSim: rSemSimCount ? Number((rSemSimSum / rSemSimCount).toFixed(3)) : 0,
|
||||
timeWindowFilteredPairs,
|
||||
topKPrunedPairs,
|
||||
reweightWhoUsed,
|
||||
reweightWhereUsed,
|
||||
edgeDensity,
|
||||
};
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// PPR: Seed vector construction
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Build personalization vector s from seeds, weighted by rerankScore.
|
||||
* Haveliwala (2002): non-uniform personalization improves topic sensitivity.
|
||||
*
|
||||
* @param {object[]} seeds - seed L0 entries with atomId and rerankScore
|
||||
* @param {Map<string, number>} idToIdx - atomId → array index
|
||||
* @param {number} N - total node count
|
||||
* @returns {Float64Array} personalization vector (L1-normalized, sums to 1)
|
||||
*/
|
||||
function buildSeedVector(seeds, idToIdx, N) {
|
||||
const s = new Float64Array(N);
|
||||
let total = 0;
|
||||
|
||||
for (const seed of seeds) {
|
||||
const idx = idToIdx.get(seed.atomId);
|
||||
if (idx == null) continue;
|
||||
|
||||
const score = Math.max(0, seed.rerankScore || seed.similarity || 0);
|
||||
s[idx] += score;
|
||||
total += score;
|
||||
}
|
||||
|
||||
// L1 normalize to probability distribution
|
||||
if (total > 0) {
|
||||
for (let i = 0; i < N; i++) s[i] /= total;
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// PPR: Column normalization + dangling node detection
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Column-normalize adjacency into transition matrix W.
|
||||
*
|
||||
* Column j of W: W_{ij} = weight(i,j) / Σ_k weight(k,j)
|
||||
* Dangling nodes (no outgoing edges): handled in powerIteration
|
||||
* via redistribution to personalization vector s.
|
||||
* (Langville & Meyer 2005, §4.1)
|
||||
*
|
||||
* @param {object[][]} neighbors - neighbors[j] = [{target, weight}, ...]
|
||||
* @param {number} N
|
||||
* @returns {{ columns: object[][], dangling: number[] }}
|
||||
*/
|
||||
function columnNormalize(neighbors, N) {
|
||||
const columns = Array.from({ length: N }, () => []);
|
||||
const dangling = [];
|
||||
|
||||
for (let j = 0; j < N; j++) {
|
||||
const edges = neighbors[j];
|
||||
|
||||
let sum = 0;
|
||||
for (let e = 0; e < edges.length; e++) sum += edges[e].weight;
|
||||
|
||||
if (sum <= 0) {
|
||||
dangling.push(j);
|
||||
continue;
|
||||
}
|
||||
|
||||
const col = columns[j];
|
||||
for (let e = 0; e < edges.length; e++) {
|
||||
col.push({ target: edges[e].target, prob: edges[e].weight / sum });
|
||||
}
|
||||
}
|
||||
|
||||
return { columns, dangling };
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// PPR: Power Iteration
|
||||
//
|
||||
// Aligned with NetworkX pagerank() (pagerank_alg.py):
|
||||
//
|
||||
// NetworkX "alpha" = damping = our (1 − α)
|
||||
// NetworkX "1-alpha" = teleportation = our α
|
||||
//
|
||||
// Per iteration:
|
||||
// π_new[i] = α·s[i] + (1−α)·( Σ_j W_{ij}·π[j] + dangling_sum·s[i] )
|
||||
//
|
||||
// Convergence: Perron-Frobenius theorem guarantees unique stationary
|
||||
// distribution for irreducible aperiodic column-stochastic matrix.
|
||||
// Rate: ‖π^(t+1) − π^t‖₁ ≤ (1−α)^t (geometric).
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Run PPR Power Iteration.
|
||||
*
|
||||
* @param {object[][]} columns - column-normalized transition matrix
|
||||
* @param {Float64Array} s - personalization vector (sums to 1)
|
||||
* @param {number[]} dangling - dangling node indices
|
||||
* @param {number} N - node count
|
||||
* @returns {{ pi: Float64Array, iterations: number, finalError: number }}
|
||||
*/
|
||||
function powerIteration(columns, s, dangling, N) {
|
||||
const alpha = CONFIG.ALPHA;
|
||||
const d = 1 - alpha; // damping factor = prob of following edges
|
||||
const epsilon = CONFIG.EPSILON;
|
||||
const maxIter = CONFIG.MAX_ITER;
|
||||
|
||||
// Initialize π to personalization vector
|
||||
let pi = new Float64Array(N);
|
||||
for (let i = 0; i < N; i++) pi[i] = s[i];
|
||||
|
||||
let iterations = 0;
|
||||
let finalError = 0;
|
||||
|
||||
for (let iter = 0; iter < maxIter; iter++) {
|
||||
const piNew = new Float64Array(N);
|
||||
|
||||
// Dangling mass: probability at nodes with no outgoing edges
|
||||
// redistributed to personalization vector (Langville & Meyer 2005)
|
||||
let danglingSum = 0;
|
||||
for (let k = 0; k < dangling.length; k++) {
|
||||
danglingSum += pi[dangling[k]];
|
||||
}
|
||||
|
||||
// Sparse matrix-vector product: (1−α) · W · π
|
||||
for (let j = 0; j < N; j++) {
|
||||
const pj = pi[j];
|
||||
if (pj === 0) continue;
|
||||
|
||||
const col = columns[j];
|
||||
const dpj = d * pj;
|
||||
for (let e = 0; e < col.length; e++) {
|
||||
piNew[col[e].target] += dpj * col[e].prob;
|
||||
}
|
||||
}
|
||||
|
||||
// Restart + dangling contribution:
|
||||
// α · s[i] + (1−α) · danglingSum · s[i]
|
||||
const restartCoeff = alpha + d * danglingSum;
|
||||
for (let i = 0; i < N; i++) {
|
||||
piNew[i] += restartCoeff * s[i];
|
||||
}
|
||||
|
||||
// L1 convergence check
|
||||
let l1 = 0;
|
||||
for (let i = 0; i < N; i++) {
|
||||
l1 += Math.abs(piNew[i] - pi[i]);
|
||||
}
|
||||
|
||||
pi = piNew;
|
||||
iterations = iter + 1;
|
||||
finalError = l1;
|
||||
|
||||
if (l1 < epsilon) break;
|
||||
}
|
||||
|
||||
return { pi, iterations, finalError };
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Post-verification: Dense Cosine Gate
|
||||
//
|
||||
// PPR measures graph-structural relevance ("same characters").
|
||||
// Cosine gate measures semantic relevance ("related to current topic").
|
||||
// Product combination ensures both dimensions are satisfied
|
||||
// (CombMNZ — Fox & Shaw, TREC-2 1994).
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Filter PPR-activated nodes by semantic relevance.
|
||||
*
|
||||
* For each non-seed node with PPR > 0:
|
||||
* 1. cosine(queryVector, stateVector) ≥ COSINE_GATE
|
||||
* 2. finalScore = PPR_normalized × cosine ≥ SCORE_FLOOR
|
||||
* 3. Top DIFFUSION_CAP by finalScore
|
||||
*
|
||||
* @param {Float64Array} pi - PPR stationary distribution
|
||||
* @param {string[]} atomIds - index → atomId
|
||||
* @param {Map<string, object>} atomById - atomId → atom object
|
||||
* @param {Set<string>} seedAtomIds - seed atomIds (excluded from output)
|
||||
* @param {Map<string, Float32Array>} vectorMap - atomId → embedding vector
|
||||
* @param {Float32Array|number[]} queryVector - R2 weighted query vector
|
||||
* @returns {{ diffused: object[], gateStats: object }}
|
||||
*/
|
||||
function postVerify(pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector) {
|
||||
const N = atomIds.length;
|
||||
const gateStats = { passed: 0, filtered: 0, noVector: 0 };
|
||||
|
||||
// Find max PPR score among non-seed nodes (for normalization)
|
||||
let maxPPR = 0;
|
||||
for (let i = 0; i < N; i++) {
|
||||
if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) {
|
||||
if (pi[i] > maxPPR) maxPPR = pi[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (maxPPR <= 0) {
|
||||
return { diffused: [], gateStats };
|
||||
}
|
||||
|
||||
const candidates = [];
|
||||
|
||||
for (let i = 0; i < N; i++) {
|
||||
const atomId = atomIds[i];
|
||||
|
||||
// Skip seeds and zero-probability nodes
|
||||
if (seedAtomIds.has(atomId)) continue;
|
||||
if (pi[i] <= 0) continue;
|
||||
|
||||
// Require state vector for cosine verification
|
||||
const vec = vectorMap.get(atomId);
|
||||
if (!vec?.length) {
|
||||
gateStats.noVector++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Cosine gate
|
||||
const cos = cosineSimilarity(queryVector, vec);
|
||||
if (cos < CONFIG.COSINE_GATE) {
|
||||
gateStats.filtered++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Final score = PPR_normalized × cosine
|
||||
const pprNorm = pi[i] / maxPPR;
|
||||
const finalScore = pprNorm * cos;
|
||||
|
||||
if (finalScore < CONFIG.SCORE_FLOOR) {
|
||||
gateStats.filtered++;
|
||||
continue;
|
||||
}
|
||||
|
||||
gateStats.passed++;
|
||||
|
||||
const atom = atomById.get(atomId);
|
||||
if (!atom) continue;
|
||||
|
||||
candidates.push({
|
||||
atomId,
|
||||
floor: atom.floor,
|
||||
atom,
|
||||
finalScore,
|
||||
pprScore: pi[i],
|
||||
pprNormalized: pprNorm,
|
||||
cosine: cos,
|
||||
});
|
||||
}
|
||||
|
||||
// Sort by finalScore descending, cap at DIFFUSION_CAP
|
||||
candidates.sort((a, b) => b.finalScore - a.finalScore);
|
||||
const diffused = candidates.slice(0, CONFIG.DIFFUSION_CAP);
|
||||
|
||||
return { diffused, gateStats };
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Main entry point
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Spread activation from seed L0 atoms through entity co-occurrence graph.
|
||||
*
|
||||
* Called from recall.js Stage 7.5, after locateAndPullEvidence and before
|
||||
* Causation Trace. Results are merged into l0Selected and consumed by
|
||||
* prompt.js through existing budget/formatting pipeline (zero downstream changes).
|
||||
*
|
||||
* @param {object[]} seeds - l0Selected from recall Stage 6
|
||||
* Each: { atomId, rerankScore, similarity, atom, ... }
|
||||
* @param {object[]} allAtoms - getStateAtoms() result
|
||||
* Each: { atomId, floor, semantic, edges, where }
|
||||
* @param {object[]} stateVectors - getAllStateVectors() result
|
||||
* Each: { atomId, floor, vector: Float32Array, rVector?: Float32Array }
|
||||
* @param {Float32Array|number[]} queryVector - R2 weighted query vector
|
||||
* @param {object|null} metrics - metrics object (optional, mutated in-place)
|
||||
* @returns {object[]} Additional L0 atoms for l0Selected
|
||||
* Each: { atomId, floor, atom, finalScore, pprScore, pprNormalized, cosine }
|
||||
*/
|
||||
export function diffuseFromSeeds(seeds, allAtoms, stateVectors, queryVector, metrics) {
|
||||
const T0 = performance.now();
|
||||
|
||||
// ─── Early exits ─────────────────────────────────────────────────
|
||||
|
||||
if (!seeds?.length || !allAtoms?.length || !queryVector?.length) {
|
||||
fillMetricsEmpty(metrics);
|
||||
return [];
|
||||
}
|
||||
|
||||
// Align with entity-lexicon hard rule: exclude name1 from graph features.
|
||||
const { name1 } = getContext();
|
||||
const excludeEntities = new Set();
|
||||
if (name1) excludeEntities.add(normalize(name1));
|
||||
|
||||
// ─── 1. Build atom index ─────────────────────────────────────────
|
||||
|
||||
const atomById = new Map();
|
||||
const atomIds = [];
|
||||
const idToIdx = new Map();
|
||||
|
||||
for (let i = 0; i < allAtoms.length; i++) {
|
||||
const a = allAtoms[i];
|
||||
atomById.set(a.atomId, a);
|
||||
atomIds.push(a.atomId);
|
||||
idToIdx.set(a.atomId, i);
|
||||
}
|
||||
|
||||
const N = allAtoms.length;
|
||||
|
||||
// Validate seeds against atom index
|
||||
const validSeeds = seeds.filter(s => idToIdx.has(s.atomId));
|
||||
const seedAtomIds = new Set(validSeeds.map(s => s.atomId));
|
||||
|
||||
if (!validSeeds.length) {
|
||||
fillMetricsEmpty(metrics);
|
||||
return [];
|
||||
}
|
||||
|
||||
// ─── 2. Build graph ──────────────────────────────────────────────
|
||||
|
||||
const graph = buildGraph(allAtoms, stateVectors, excludeEntities);
|
||||
|
||||
if (graph.edgeCount === 0) {
|
||||
fillMetrics(metrics, {
|
||||
seedCount: validSeeds.length,
|
||||
graphNodes: N,
|
||||
graphEdges: 0,
|
||||
channelStats: graph.channelStats,
|
||||
candidatePairs: graph.candidatePairs,
|
||||
pairsFromWhat: graph.pairsFromWhat,
|
||||
pairsFromRSem: graph.pairsFromRSem,
|
||||
rSemAvgSim: graph.rSemAvgSim,
|
||||
timeWindowFilteredPairs: graph.timeWindowFilteredPairs,
|
||||
topKPrunedPairs: graph.topKPrunedPairs,
|
||||
edgeDensity: graph.edgeDensity,
|
||||
reweightWhoUsed: graph.reweightWhoUsed,
|
||||
reweightWhereUsed: graph.reweightWhereUsed,
|
||||
time: graph.buildTime,
|
||||
});
|
||||
xbLog.info(MODULE_ID, 'No graph edges — skipping diffusion');
|
||||
return [];
|
||||
}
|
||||
|
||||
// ─── 3. Build seed vector ────────────────────────────────────────
|
||||
|
||||
const s = buildSeedVector(validSeeds, idToIdx, N);
|
||||
|
||||
// ─── 4. Column normalize ─────────────────────────────────────────
|
||||
|
||||
const { columns, dangling } = columnNormalize(graph.neighbors, N);
|
||||
|
||||
// ─── 5. PPR Power Iteration ──────────────────────────────────────
|
||||
|
||||
const T_PPR = performance.now();
|
||||
const { pi, iterations, finalError } = powerIteration(columns, s, dangling, N);
|
||||
const pprTime = Math.round(performance.now() - T_PPR);
|
||||
|
||||
// Count activated non-seed nodes
|
||||
let pprActivated = 0;
|
||||
for (let i = 0; i < N; i++) {
|
||||
if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) pprActivated++;
|
||||
}
|
||||
|
||||
// ─── 6. Post-verification ────────────────────────────────────────
|
||||
|
||||
const vectorMap = new Map();
|
||||
for (const sv of (stateVectors || [])) {
|
||||
vectorMap.set(sv.atomId, sv.vector);
|
||||
}
|
||||
|
||||
const { diffused, gateStats } = postVerify(
|
||||
pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector
|
||||
);
|
||||
|
||||
// ─── 7. Metrics ──────────────────────────────────────────────────
|
||||
|
||||
const totalTime = Math.round(performance.now() - T0);
|
||||
|
||||
fillMetrics(metrics, {
|
||||
seedCount: validSeeds.length,
|
||||
graphNodes: N,
|
||||
graphEdges: graph.edgeCount,
|
||||
channelStats: graph.channelStats,
|
||||
candidatePairs: graph.candidatePairs,
|
||||
pairsFromWhat: graph.pairsFromWhat,
|
||||
pairsFromRSem: graph.pairsFromRSem,
|
||||
rSemAvgSim: graph.rSemAvgSim,
|
||||
timeWindowFilteredPairs: graph.timeWindowFilteredPairs,
|
||||
topKPrunedPairs: graph.topKPrunedPairs,
|
||||
edgeDensity: graph.edgeDensity,
|
||||
reweightWhoUsed: graph.reweightWhoUsed,
|
||||
reweightWhereUsed: graph.reweightWhereUsed,
|
||||
buildTime: graph.buildTime,
|
||||
iterations,
|
||||
convergenceError: finalError,
|
||||
pprActivated,
|
||||
cosineGatePassed: gateStats.passed,
|
||||
cosineGateFiltered: gateStats.filtered,
|
||||
cosineGateNoVector: gateStats.noVector,
|
||||
postGatePassRate: pprActivated > 0
|
||||
? Math.round((gateStats.passed / pprActivated) * 100)
|
||||
: 0,
|
||||
finalCount: diffused.length,
|
||||
scoreDistribution: diffused.length > 0
|
||||
? calcScoreStats(diffused.map(d => d.finalScore))
|
||||
: { min: 0, max: 0, mean: 0 },
|
||||
time: totalTime,
|
||||
});
|
||||
|
||||
xbLog.info(MODULE_ID,
|
||||
`Diffusion: ${validSeeds.length} seeds → ` +
|
||||
`graph(${N}n/${graph.edgeCount}e) → ` +
|
||||
`PPR(${iterations}it, ε=${finalError.toExponential(1)}, ${pprTime}ms) → ` +
|
||||
`${pprActivated} activated → ` +
|
||||
`gate(${gateStats.passed}\u2713/${gateStats.filtered}\u2717` +
|
||||
`${gateStats.noVector ? `/${gateStats.noVector}?` : ''}) → ` +
|
||||
`${diffused.length} final (${totalTime}ms)`
|
||||
);
|
||||
|
||||
return diffused;
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Metrics helpers
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* Compute min/max/mean distribution
|
||||
* @param {number[]} scores
|
||||
* @returns {{ min: number, max: number, mean: number }}
|
||||
*/
|
||||
function calcScoreStats(scores) {
|
||||
if (!scores.length) return { min: 0, max: 0, mean: 0 };
|
||||
const sorted = [...scores].sort((a, b) => a - b);
|
||||
const sum = sorted.reduce((a, b) => a + b, 0);
|
||||
return {
|
||||
min: Number(sorted[0].toFixed(3)),
|
||||
max: Number(sorted[sorted.length - 1].toFixed(3)),
|
||||
mean: Number((sum / sorted.length).toFixed(3)),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Fill metrics with empty diffusion block
|
||||
*/
|
||||
function fillMetricsEmpty(metrics) {
|
||||
if (!metrics) return;
|
||||
metrics.diffusion = {
|
||||
seedCount: 0,
|
||||
graphNodes: 0,
|
||||
graphEdges: 0,
|
||||
iterations: 0,
|
||||
convergenceError: 0,
|
||||
pprActivated: 0,
|
||||
cosineGatePassed: 0,
|
||||
cosineGateFiltered: 0,
|
||||
cosineGateNoVector: 0,
|
||||
finalCount: 0,
|
||||
scoreDistribution: { min: 0, max: 0, mean: 0 },
|
||||
byChannel: { what: 0, where: 0, rSem: 0, who: 0 },
|
||||
candidatePairs: 0,
|
||||
pairsFromWhat: 0,
|
||||
pairsFromRSem: 0,
|
||||
rSemAvgSim: 0,
|
||||
timeWindowFilteredPairs: 0,
|
||||
topKPrunedPairs: 0,
|
||||
edgeDensity: 0,
|
||||
reweightWhoUsed: 0,
|
||||
reweightWhereUsed: 0,
|
||||
postGatePassRate: 0,
|
||||
time: 0,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Fill metrics with diffusion results
|
||||
*/
|
||||
function fillMetrics(metrics, data) {
|
||||
if (!metrics) return;
|
||||
metrics.diffusion = {
|
||||
seedCount: data.seedCount || 0,
|
||||
graphNodes: data.graphNodes || 0,
|
||||
graphEdges: data.graphEdges || 0,
|
||||
iterations: data.iterations || 0,
|
||||
convergenceError: data.convergenceError || 0,
|
||||
pprActivated: data.pprActivated || 0,
|
||||
cosineGatePassed: data.cosineGatePassed || 0,
|
||||
cosineGateFiltered: data.cosineGateFiltered || 0,
|
||||
cosineGateNoVector: data.cosineGateNoVector || 0,
|
||||
postGatePassRate: data.postGatePassRate || 0,
|
||||
finalCount: data.finalCount || 0,
|
||||
scoreDistribution: data.scoreDistribution || { min: 0, max: 0, mean: 0 },
|
||||
byChannel: data.channelStats || { what: 0, where: 0, rSem: 0, who: 0 },
|
||||
candidatePairs: data.candidatePairs || 0,
|
||||
pairsFromWhat: data.pairsFromWhat || 0,
|
||||
pairsFromRSem: data.pairsFromRSem || 0,
|
||||
rSemAvgSim: data.rSemAvgSim || 0,
|
||||
timeWindowFilteredPairs: data.timeWindowFilteredPairs || 0,
|
||||
topKPrunedPairs: data.topKPrunedPairs || 0,
|
||||
edgeDensity: data.edgeDensity || 0,
|
||||
reweightWhoUsed: data.reweightWhoUsed || 0,
|
||||
reweightWhereUsed: data.reweightWhereUsed || 0,
|
||||
time: data.time || 0,
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user