// ═══════════════════════════════════════════════════════════════════════════
// diffusion.js - PPR Graph Diffusion (Personalized PageRank)
//
// Spreads activation from seed L0 atoms through the entity co-occurrence graph
// to discover narratively connected but semantically distant memories.
//
// Pipeline position: recall.js Stage 7.5
//   Input:  seeds (reranked L0 from Stage 6)
//   Output: additional L0 atoms → merged into l0Selected
//
// Algorithm:
//   1. Build undirected weighted graph over all L0 atoms
//      Candidate edges: WHAT + R semantic; WHO/WHERE are reweight-only
//   2. Personalized PageRank (Power Iteration)
//      Seeds weighted by rerankScore — Haveliwala (2002) topic-sensitive variant
//      α = 0.15 restart probability — Page et al. (1998)
//   3. Post-verification (Dense Cosine Gate)
//      Exclude seeds, cosine ≥ 0.46, final = PPR_norm × cosine ≥ 0.10
//
// References:
//   Page et al. "The PageRank Citation Ranking" (1998)
//   Haveliwala "Topic-Sensitive PageRank" (IEEE TKDE 2003)
//   Langville & Meyer "Eigenvector Methods for Web IR" (SIAM Review 2005)
//   Sun et al. "GraftNet" (EMNLP 2018)
//   Jaccard "Étude comparative de la distribution florale" (1912)
//   Szymkiewicz "Une contribution statistique" (1934) — Overlap coefficient
//   Rimmon-Kenan "Narrative Fiction" (2002) — Channel weight rationale
//
// Core PPR iteration aligned with NetworkX pagerank():
//   github.com/networkx/networkx — algorithms/link_analysis/pagerank_alg.py
// ═══════════════════════════════════════════════════════════════════════════

import { xbLog } from '../../../../core/debug-core.js';
import { getContext } from '../../../../../../../extensions.js';

const MODULE_ID = 'diffusion';

// ═══════════════════════════════════════════════════════════════════════════
// Configuration
// ═══════════════════════════════════════════════════════════════════════════

const CONFIG = {
  // PPR parameters (Page et al. 1998; GraftNet 2018 uses same values)
  ALPHA: 0.15,    // restart probability
  EPSILON: 1e-6,  // L1 convergence threshold
  MAX_ITER: 50,   // hard iteration cap (typically converges in 15-25)

  // Edge weight channel coefficients.
  // Candidate generation uses WHAT + R semantic only.
  // WHO/WHERE are reweight-only signals.
  GAMMA: {
    what: 0.40,   // interaction pair overlap
    rSem: 0.40,   // semantic similarity over edges.r aggregate
    who: 0.10,    // endpoint entity overlap (reweight-only)
    where: 0.05,  // location exact match (reweight-only)
    time: 0.05,   // temporal decay score
  },
  // R semantic candidate generation
  R_SEM_MIN_SIM: 0.62,
  R_SEM_TOPK: 8,
  TIME_WINDOW_MAX: 80,
  TIME_DECAY_DIVISOR: 12,
  WHERE_MAX_GROUP_SIZE: 16,   // skip location-only pair expansion for over-common places (not referenced in this module)
  WHERE_FREQ_DAMP_PIVOT: 6,   // location freq <= pivot keeps full WHERE score
  WHERE_FREQ_DAMP_MIN: 0.20,  // lower bound for damped WHERE contribution

  // Post-verification (Cosine Gate)
  COSINE_GATE: 0.46,    // min cosine(queryVector, stateVector)
  SCORE_FLOOR: 0.10,    // min finalScore = PPR_normalized × cosine
  DIFFUSION_CAP: 100,   // max diffused nodes (excluding seeds)
};

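// Worked example of the channel mix above (illustrative numbers only; the atom
// contents are hypothetical, not taken from any real chat). Two atoms that
// share one of their two interaction pairs (wWhat = 0.5), have R-semantic
// cosine 0.70, whose entity sets overlap in 2 of 4 distinct entities
// (wWho = 0.5), that share a location seen 10 times
// (wWhere = max(0.20, min(1, 6/10)) = 0.6), and that sit 12 floors apart
// (wTime = e^(-12/12) ≈ 0.368) combine to:
//   weight = 0.40·0.5 + 0.40·0.70 + 0.10·0.5 + 0.05·0.6 + 0.05·0.368 ≈ 0.578
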
// ═══════════════════════════════════════════════════════════════════════════
// Utility functions
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Unicode-safe text normalization (matches recall.js / entity-lexicon.js)
 */
function normalize(s) {
  return String(s || '')
    .normalize('NFKC')
    .replace(/[\u200B-\u200D\uFEFF]/g, '')
    .trim()
    .toLowerCase();
}

/**
 * Cosine similarity between two vectors
 */
function cosineSimilarity(a, b) {
  if (!a?.length || !b?.length || a.length !== b.length) return 0;
  let dot = 0, nA = 0, nB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    nA += a[i] * a[i];
    nB += b[i] * b[i];
  }
  return nA && nB ? dot / (Math.sqrt(nA) * Math.sqrt(nB)) : 0;
}

// ═══════════════════════════════════════════════════════════════════════════
// Feature extraction from L0 atoms
// ═══════════════════════════════════════════════════════════════════════════

/**
 * WHO channel: endpoint entity set from edges.s/edges.t
 * (reweight-only; entities never generate candidate pairs).
 * @param {object} atom
 * @param {Set<string>} excludeEntities - entities to exclude (e.g. name1)
 * @returns {Set<string>}
 */
function extractEntities(atom, excludeEntities = new Set()) {
  const set = new Set();
  for (const e of (atom.edges || [])) {
    const s = normalize(e?.s);
    const t = normalize(e?.t);
    if (s && !excludeEntities.has(s)) set.add(s);
    if (t && !excludeEntities.has(t)) set.add(t);
  }
  return set;
}

/**
 * WHAT channel: interaction pairs "A↔B" (direction-insensitive).
 * @param {object} atom
 * @param {Set<string>} excludeEntities
 * @returns {Set<string>}
 */
function extractInteractionPairs(atom, excludeEntities = new Set()) {
  const set = new Set();
  for (const e of (atom.edges || [])) {
    const s = normalize(e?.s);
    const t = normalize(e?.t);
    if (s && t && !excludeEntities.has(s) && !excludeEntities.has(t)) {
      const pair = [s, t].sort().join('\u2194');
      set.add(pair);
    }
  }
  return set;
}

/**
 * WHERE channel: normalized location string
 * @param {object} atom
 * @returns {string} empty string if absent
 */
function extractLocation(atom) {
  return normalize(atom.where);
}

/**
 * Absolute floor distance between two atoms.
 */
function getFloorDistance(a, b) {
  const fa = Number(a?.floor || 0);
  const fb = Number(b?.floor || 0);
  return Math.abs(fa - fb);
}

/**
 * Temporal decay score: 1 at distance 0, e⁻¹ at TIME_DECAY_DIVISOR floors.
 */
function getTimeScore(distance) {
  return Math.exp(-distance / CONFIG.TIME_DECAY_DIVISOR);
}

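// Illustrative feature extraction (hypothetical atom shape, and assuming no
// entity here is the excluded name1): an atom with
// edges [{s:'Alice', t:'Bob'}, {s:'Bob', t:'Carol'}] yields
//   entities         = {'alice', 'bob', 'carol'}   (WHO, reweight-only)
//   interactionPairs = {'alice↔bob', 'bob↔carol'}  (WHAT)
// A reversed edge {s:'Bob', t:'Alice'} maps to the same 'alice↔bob' pair,
// which is what makes the WHAT channel direction-insensitive.
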
// ═══════════════════════════════════════════════════════════════════════════
// Set similarity functions
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Jaccard index: |A∩B| / |A∪B| (Jaccard 1912)
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} 0..1
 */
function jaccard(a, b) {
  if (!a.size || !b.size) return 0;
  let inter = 0;
  const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
  for (const x of smaller) {
    if (larger.has(x)) inter++;
  }
  const union = a.size + b.size - inter;
  return union > 0 ? inter / union : 0;
}

/**
 * Overlap coefficient: |A∩B| / min(|A|,|B|) (Szymkiewicz-Simpson 1934)
 * Used for interaction-pair sets, which are small (1-3 elements); Jaccard
 * over-penalizes asymmetry between such small sets.
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} 0..1
 */
function overlapCoefficient(a, b) {
  if (!a.size || !b.size) return 0;
  let inter = 0;
  const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
  for (const x of smaller) {
    if (larger.has(x)) inter++;
  }
  return inter / smaller.size;
}

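// Why two similarity measures (hypothetical sets): for A = {x} and B = {x, y, z},
//   jaccard(A, B)            = 1/3 ≈ 0.33  (penalizes the size mismatch)
//   overlapCoefficient(A, B) = 1/1 = 1.0   (the smaller set is fully contained)
// The WHAT channel uses the overlap coefficient because interaction-pair sets
// are tiny; the WHO channel uses Jaccard over the larger entity sets.
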
// ═══════════════════════════════════════════════════════════════════════════
// Graph construction
//
// Candidate pairs discovered via WHAT inverted index and R semantic top-k.
// WHO/WHERE are reweight-only signals and never create candidate pairs.
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Pre-extract features for all atoms
 * @param {object[]} allAtoms
 * @param {Set<string>} excludeEntities
 * @returns {object[]} feature objects with entities/interactionPairs/location
 */
function extractAllFeatures(allAtoms, excludeEntities = new Set()) {
  return allAtoms.map(atom => ({
    entities: extractEntities(atom, excludeEntities),
    interactionPairs: extractInteractionPairs(atom, excludeEntities),
    location: extractLocation(atom),
  }));
}

/**
 * Build inverted indices: interaction pair → atom indices, location → frequency
 * @param {object[]} features
 * @returns {{ whatIndex: Map, locationFreq: Map }}
 */
function buildInvertedIndices(features) {
  const whatIndex = new Map();
  const locationFreq = new Map();

  for (let i = 0; i < features.length; i++) {
    for (const pair of features[i].interactionPairs) {
      if (!whatIndex.has(pair)) whatIndex.set(pair, []);
      whatIndex.get(pair).push(i);
    }
    const loc = features[i].location;
    if (loc) locationFreq.set(loc, (locationFreq.get(loc) || 0) + 1);
  }

  return { whatIndex, locationFreq };
}

/**
 * Collect candidate pairs from inverted index
 * @param {Map} index - value → [atomIndex, ...]
 * @param {Set<number>} pairSet - packed pair collector
 * @param {number} N - total atom count (for pair packing)
 */
function collectPairsFromIndex(index, pairSet, N) {
  for (const indices of index.values()) {
    for (let a = 0; a < indices.length; a++) {
      for (let b = a + 1; b < indices.length; b++) {
        const lo = Math.min(indices[a], indices[b]);
        const hi = Math.max(indices[a], indices[b]);
        pairSet.add(lo * N + hi);
      }
    }
  }
}

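// Pair packing sketch (illustrative numbers): with N = 1000 atoms, the
// unordered pair (3, 17) packs to 3 * 1000 + 17 = 3017; buildGraph() recovers
// it as i = Math.floor(3017 / 1000) = 3 and j = 3017 % 1000 = 17. Packing a
// pair into a single integer lets a Set deduplicate candidates cheaply.
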
/**
 * Build weighted undirected graph over L0 atoms.
 *
 * @param {object[]} allAtoms
 * @param {object[]} stateVectors
 * @param {Set<string>} excludeEntities
 * @returns {{ neighbors: object[][], edgeCount: number, channelStats: object, buildTime: number }}
 */
function buildGraph(allAtoms, stateVectors = [], excludeEntities = new Set()) {
  const N = allAtoms.length;
  const T0 = performance.now();

  const features = extractAllFeatures(allAtoms, excludeEntities);
  const { whatIndex, locationFreq } = buildInvertedIndices(features);

  // Candidate pairs: WHAT + R semantic
  const pairSetByWhat = new Set();
  const pairSetByRSem = new Set();
  const rSemByPair = new Map();
  const pairSet = new Set();
  collectPairsFromIndex(whatIndex, pairSetByWhat, N);

  const rVectorByAtomId = new Map(
    (stateVectors || [])
      .filter(v => v?.atomId && v?.rVector?.length)
      .map(v => [v.atomId, v.rVector])
  );
  const rVectors = allAtoms.map(a => rVectorByAtomId.get(a.atomId) || null);

  const directedNeighbors = Array.from({ length: N }, () => []);
  let rSemSimSum = 0;
  let rSemSimCount = 0;
  let topKPrunedPairs = 0;
  let timeWindowFilteredPairs = 0;

  // Enumerate only pairs within floor window to avoid O(N^2) full scan.
  const sortedByFloor = allAtoms
    .map((atom, idx) => ({ idx, floor: Number(atom?.floor || 0) }))
    .sort((a, b) => a.floor - b.floor);

  for (let left = 0; left < sortedByFloor.length; left++) {
    const i = sortedByFloor[left].idx;
    const baseFloor = sortedByFloor[left].floor;

    for (let right = left + 1; right < sortedByFloor.length; right++) {
      const floorDelta = sortedByFloor[right].floor - baseFloor;
      if (floorDelta > CONFIG.TIME_WINDOW_MAX) break;

      const j = sortedByFloor[right].idx;
      const vi = rVectors[i];
      const vj = rVectors[j];
      if (!vi?.length || !vj?.length) continue;

      const sim = cosineSimilarity(vi, vj);
      if (sim < CONFIG.R_SEM_MIN_SIM) continue;

      directedNeighbors[i].push({ target: j, sim });
      directedNeighbors[j].push({ target: i, sim });
      rSemSimSum += sim;
      rSemSimCount++;
    }
  }

  for (let i = 0; i < N; i++) {
    const arr = directedNeighbors[i];
    if (!arr.length) continue;
    arr.sort((a, b) => b.sim - a.sim);
    if (arr.length > CONFIG.R_SEM_TOPK) {
      topKPrunedPairs += arr.length - CONFIG.R_SEM_TOPK;
    }
    for (const n of arr.slice(0, CONFIG.R_SEM_TOPK)) {
      const lo = Math.min(i, n.target);
      const hi = Math.max(i, n.target);
      const packed = lo * N + hi;
      pairSetByRSem.add(packed);
      const prev = rSemByPair.get(packed) || 0;
      if (n.sim > prev) rSemByPair.set(packed, n.sim);
    }
  }
  for (const p of pairSetByWhat) pairSet.add(p);
  for (const p of pairSetByRSem) pairSet.add(p);

  // Compute edge weights for all candidates
  const neighbors = Array.from({ length: N }, () => []);
  let edgeCount = 0;
  const channelStats = { what: 0, where: 0, rSem: 0, who: 0 };
  let reweightWhoUsed = 0;
  let reweightWhereUsed = 0;

  for (const packed of pairSet) {
    const i = Math.floor(packed / N);
    const j = packed % N;

    const distance = getFloorDistance(allAtoms[i], allAtoms[j]);
    if (distance > CONFIG.TIME_WINDOW_MAX) {
      timeWindowFilteredPairs++;
      continue;
    }
    const wTime = getTimeScore(distance);

    const fi = features[i];
    const fj = features[j];

    const wWhat = overlapCoefficient(fi.interactionPairs, fj.interactionPairs);
    const wRSem = rSemByPair.get(packed) || 0;
    const wWho = jaccard(fi.entities, fj.entities);
    let wWhere = 0.0;
    if (fi.location && fi.location === fj.location) {
      const freq = locationFreq.get(fi.location) || 1;
      const damp = Math.max(
        CONFIG.WHERE_FREQ_DAMP_MIN,
        Math.min(1, CONFIG.WHERE_FREQ_DAMP_PIVOT / Math.max(1, freq))
      );
      wWhere = damp;
    }

    const weight =
      CONFIG.GAMMA.what * wWhat +
      CONFIG.GAMMA.rSem * wRSem +
      CONFIG.GAMMA.who * wWho +
      CONFIG.GAMMA.where * wWhere +
      CONFIG.GAMMA.time * wTime;

    if (weight > 0) {
      neighbors[i].push({ target: j, weight });
      neighbors[j].push({ target: i, weight });
      edgeCount++;

      if (wWhat > 0) channelStats.what++;
      if (wRSem > 0) channelStats.rSem++;
      if (wWho > 0) channelStats.who++;
      if (wWhere > 0) channelStats.where++;
      if (wWho > 0) reweightWhoUsed++;
      if (wWhere > 0) reweightWhereUsed++;
    }
  }

  const buildTime = Math.round(performance.now() - T0);

  xbLog.info(MODULE_ID,
    `Graph: ${N} nodes, ${edgeCount} edges ` +
    `(candidate_by_what=${pairSetByWhat.size} candidate_by_r_sem=${pairSetByRSem.size}) ` +
    `(what=${channelStats.what} r_sem=${channelStats.rSem} who=${channelStats.who} where=${channelStats.where}) ` +
    `(reweight_who_used=${reweightWhoUsed} reweight_where_used=${reweightWhereUsed}) ` +
    `(time_window_filtered=${timeWindowFilteredPairs} topk_pruned=${topKPrunedPairs}) ` +
    `(${buildTime}ms)`
  );

  const totalPairs = N > 1 ? (N * (N - 1)) / 2 : 0;
  const edgeDensity = totalPairs > 0 ? Number((edgeCount / totalPairs * 100).toFixed(2)) : 0;

  return {
    neighbors,
    edgeCount,
    channelStats,
    buildTime,
    candidatePairs: pairSet.size,
    pairsFromWhat: pairSetByWhat.size,
    pairsFromRSem: pairSetByRSem.size,
    rSemAvgSim: rSemSimCount ? Number((rSemSimSum / rSemSimCount).toFixed(3)) : 0,
    timeWindowFilteredPairs,
    topKPrunedPairs,
    reweightWhoUsed,
    reweightWhereUsed,
    edgeDensity,
  };
}

// ═══════════════════════════════════════════════════════════════════════════
// PPR: Seed vector construction
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Build personalization vector s from seeds, weighted by rerankScore.
 * Haveliwala (2002): non-uniform personalization improves topic sensitivity.
 *
 * @param {object[]} seeds - seed L0 entries with atomId and rerankScore
 * @param {Map<string, number>} idToIdx - atomId → array index
 * @param {number} N - total node count
 * @returns {Float64Array} personalization vector (L1-normalized, sums to 1)
 */
function buildSeedVector(seeds, idToIdx, N) {
  const s = new Float64Array(N);
  let total = 0;

  for (const seed of seeds) {
    const idx = idToIdx.get(seed.atomId);
    if (idx == null) continue;

    const score = Math.max(0, seed.rerankScore || seed.similarity || 0);
    s[idx] += score;
    total += score;
  }

  // L1 normalize to probability distribution
  if (total > 0) {
    for (let i = 0; i < N; i++) s[i] /= total;
  }

  return s;
}

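// Example (hypothetical seeds): two valid seeds with rerankScores 0.9 and 0.3
// produce s = [0.9, 0.3, 0, ...] / 1.2 = [0.75, 0.25, 0, ...], so the
// higher-ranked seed receives three times the restart probability.
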
// ═══════════════════════════════════════════════════════════════════════════
// PPR: Column normalization + dangling node detection
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Column-normalize adjacency into transition matrix W.
 *
 * Column j of W: W_{ij} = weight(i,j) / Σ_k weight(k,j)
 * Dangling nodes (no outgoing edges): handled in powerIteration
 * via redistribution to personalization vector s.
 * (Langville & Meyer 2005, §4.1)
 *
 * @param {object[][]} neighbors - neighbors[j] = [{target, weight}, ...]
 * @param {number} N
 * @returns {{ columns: object[][], dangling: number[] }}
 */
function columnNormalize(neighbors, N) {
  const columns = Array.from({ length: N }, () => []);
  const dangling = [];

  for (let j = 0; j < N; j++) {
    const edges = neighbors[j];

    let sum = 0;
    for (let e = 0; e < edges.length; e++) sum += edges[e].weight;

    if (sum <= 0) {
      dangling.push(j);
      continue;
    }

    const col = columns[j];
    for (let e = 0; e < edges.length; e++) {
      col.push({ target: edges[e].target, prob: edges[e].weight / sum });
    }
  }

  return { columns, dangling };
}

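// Example (illustrative weights): a node j with incident weights 0.6 and 0.2
// gets column probabilities 0.75 and 0.25. An isolated node contributes an
// empty column and is listed in `dangling`, so powerIteration() re-routes any
// probability mass that lands on it back through the personalization vector s.
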
// ═══════════════════════════════════════════════════════════════════════════
// PPR: Power Iteration
//
// Aligned with NetworkX pagerank() (pagerank_alg.py):
//
//   NetworkX "alpha"   = damping       = our (1 − α)
//   NetworkX "1-alpha" = teleportation = our α
//
// Per iteration:
//   π_new[i] = α·s[i] + (1−α)·( Σ_j W_{ij}·π[j] + dangling_sum·s[i] )
//
// Convergence: the α-restart makes the chain irreducible and aperiodic on the
// states reachable from the seeds, so Perron-Frobenius guarantees a unique
// stationary distribution.
// Rate: ‖π^(t+1) − π^(t)‖₁ ≤ (1−α)·‖π^(t) − π^(t−1)‖₁, i.e. geometric.
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Run PPR Power Iteration.
 *
 * @param {object[][]} columns - column-normalized transition matrix
 * @param {Float64Array} s - personalization vector (sums to 1)
 * @param {number[]} dangling - dangling node indices
 * @param {number} N - node count
 * @returns {{ pi: Float64Array, iterations: number, finalError: number }}
 */
function powerIteration(columns, s, dangling, N) {
  const alpha = CONFIG.ALPHA;
  const d = 1 - alpha;  // damping factor = prob of following edges
  const epsilon = CONFIG.EPSILON;
  const maxIter = CONFIG.MAX_ITER;

  // Initialize π to personalization vector
  let pi = new Float64Array(N);
  for (let i = 0; i < N; i++) pi[i] = s[i];

  let iterations = 0;
  let finalError = 0;

  for (let iter = 0; iter < maxIter; iter++) {
    const piNew = new Float64Array(N);

    // Dangling mass: probability at nodes with no outgoing edges
    // redistributed to personalization vector (Langville & Meyer 2005)
    let danglingSum = 0;
    for (let k = 0; k < dangling.length; k++) {
      danglingSum += pi[dangling[k]];
    }

    // Sparse matrix-vector product: (1−α) · W · π
    for (let j = 0; j < N; j++) {
      const pj = pi[j];
      if (pj === 0) continue;

      const col = columns[j];
      const dpj = d * pj;
      for (let e = 0; e < col.length; e++) {
        piNew[col[e].target] += dpj * col[e].prob;
      }
    }

    // Restart + dangling contribution:
    //   α · s[i] + (1−α) · danglingSum · s[i]
    const restartCoeff = alpha + d * danglingSum;
    for (let i = 0; i < N; i++) {
      piNew[i] += restartCoeff * s[i];
    }

    // L1 convergence check
    let l1 = 0;
    for (let i = 0; i < N; i++) {
      l1 += Math.abs(piNew[i] - pi[i]);
    }

    pi = piNew;
    iterations = iter + 1;
    finalError = l1;

    if (l1 < epsilon) break;
  }

  return { pi, iterations, finalError };
}

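// Sanity check on a toy graph (worked by hand; not part of the pipeline):
// two nodes A and B joined by one edge, personalization s = [1, 0], no
// dangling nodes. The fixed point of π = α·s + (1−α)·W·π with α = 0.15 is
//   π_A = 0.15 + 0.85·π_B,  π_B = 0.85·π_A  →  π_A ≈ 0.541, π_B ≈ 0.459,
// and the loop above approaches it geometrically, each step multiplying the
// remaining L1 error by at most (1−α) = 0.85.
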
// ═══════════════════════════════════════════════════════════════════════════
// Post-verification: Dense Cosine Gate
//
// PPR measures graph-structural relevance ("same characters").
// Cosine gate measures semantic relevance ("related to current topic").
// The product combination requires both dimensions to be satisfied — a
// multiplicative score fusion in the spirit of Fox & Shaw's combination
// methods (TREC-2, 1994).
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Filter PPR-activated nodes by semantic relevance.
 *
 * For each non-seed node with PPR > 0:
 *   1. cosine(queryVector, stateVector) ≥ COSINE_GATE
 *   2. finalScore = PPR_normalized × cosine ≥ SCORE_FLOOR
 *   3. Top DIFFUSION_CAP by finalScore
 *
 * @param {Float64Array} pi - PPR stationary distribution
 * @param {string[]} atomIds - index → atomId
 * @param {Map<string, object>} atomById - atomId → atom object
 * @param {Set<string>} seedAtomIds - seed atomIds (excluded from output)
 * @param {Map<string, Float32Array>} vectorMap - atomId → embedding vector
 * @param {Float32Array|number[]} queryVector - R2 weighted query vector
 * @returns {{ diffused: object[], gateStats: object }}
 */
function postVerify(pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector) {
  const N = atomIds.length;
  const gateStats = { passed: 0, filtered: 0, noVector: 0 };

  // Find max PPR score among non-seed nodes (for normalization)
  let maxPPR = 0;
  for (let i = 0; i < N; i++) {
    if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) {
      if (pi[i] > maxPPR) maxPPR = pi[i];
    }
  }

  if (maxPPR <= 0) {
    return { diffused: [], gateStats };
  }

  const candidates = [];

  for (let i = 0; i < N; i++) {
    const atomId = atomIds[i];

    // Skip seeds and zero-probability nodes
    if (seedAtomIds.has(atomId)) continue;
    if (pi[i] <= 0) continue;

    // Require state vector for cosine verification
    const vec = vectorMap.get(atomId);
    if (!vec?.length) {
      gateStats.noVector++;
      continue;
    }

    // Cosine gate
    const cos = cosineSimilarity(queryVector, vec);
    if (cos < CONFIG.COSINE_GATE) {
      gateStats.filtered++;
      continue;
    }

    // Final score = PPR_normalized × cosine
    const pprNorm = pi[i] / maxPPR;
    const finalScore = pprNorm * cos;

    if (finalScore < CONFIG.SCORE_FLOOR) {
      gateStats.filtered++;
      continue;
    }

    // Resolve the atom before counting it as passed, so gateStats.passed
    // matches the number of candidates actually emitted.
    const atom = atomById.get(atomId);
    if (!atom) continue;

    gateStats.passed++;

    candidates.push({
      atomId,
      floor: atom.floor,
      atom,
      finalScore,
      pprScore: pi[i],
      pprNormalized: pprNorm,
      cosine: cos,
    });
  }

  // Sort by finalScore descending, cap at DIFFUSION_CAP
  candidates.sort((a, b) => b.finalScore - a.finalScore);
  const diffused = candidates.slice(0, CONFIG.DIFFUSION_CAP);

  return { diffused, gateStats };
}

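// Gate arithmetic (illustrative numbers): a non-seed node with pi = 0.02 when
// maxPPR = 0.05 has pprNorm = 0.4; with cosine 0.50 (≥ COSINE_GATE 0.46) its
// finalScore = 0.4 × 0.50 = 0.20 ≥ SCORE_FLOOR 0.10, so it survives both
// checks. The same node at cosine 0.20 is dropped by the cosine gate alone.
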
// ═══════════════════════════════════════════════════════════════════════════
// Main entry point
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Spread activation from seed L0 atoms through entity co-occurrence graph.
 *
 * Called from recall.js Stage 7.5, after locateAndPullEvidence and before
 * Causation Trace. Results are merged into l0Selected and consumed by
 * prompt.js through the existing budget/formatting pipeline (zero downstream changes).
 *
 * @param {object[]} seeds - l0Selected from recall Stage 6
 *   Each: { atomId, rerankScore, similarity, atom, ... }
 * @param {object[]} allAtoms - getStateAtoms() result
 *   Each: { atomId, floor, semantic, edges, where }
 * @param {object[]} stateVectors - getAllStateVectors() result
 *   Each: { atomId, floor, vector: Float32Array, rVector?: Float32Array }
 * @param {Float32Array|number[]} queryVector - R2 weighted query vector
 * @param {object|null} metrics - metrics object (optional, mutated in-place)
 * @returns {object[]} Additional L0 atoms for l0Selected
 *   Each: { atomId, floor, atom, finalScore, pprScore, pprNormalized, cosine }
 */
export function diffuseFromSeeds(seeds, allAtoms, stateVectors, queryVector, metrics) {
  const T0 = performance.now();

  // ─── Early exits ─────────────────────────────────────────────────

  if (!seeds?.length || !allAtoms?.length || !queryVector?.length) {
    fillMetricsEmpty(metrics);
    return [];
  }

  // Align with entity-lexicon hard rule: exclude name1 from graph features.
  const { name1 } = getContext();
  const excludeEntities = new Set();
  if (name1) excludeEntities.add(normalize(name1));

  // ─── 1. Build atom index ─────────────────────────────────────────

  const atomById = new Map();
  const atomIds = [];
  const idToIdx = new Map();

  for (let i = 0; i < allAtoms.length; i++) {
    const a = allAtoms[i];
    atomById.set(a.atomId, a);
    atomIds.push(a.atomId);
    idToIdx.set(a.atomId, i);
  }

  const N = allAtoms.length;

  // Validate seeds against atom index
  const validSeeds = seeds.filter(s => idToIdx.has(s.atomId));
  const seedAtomIds = new Set(validSeeds.map(s => s.atomId));

  if (!validSeeds.length) {
    fillMetricsEmpty(metrics);
    return [];
  }

  // ─── 2. Build graph ──────────────────────────────────────────────

  const graph = buildGraph(allAtoms, stateVectors, excludeEntities);

  if (graph.edgeCount === 0) {
    fillMetrics(metrics, {
      seedCount: validSeeds.length,
      graphNodes: N,
      graphEdges: 0,
      channelStats: graph.channelStats,
      candidatePairs: graph.candidatePairs,
      pairsFromWhat: graph.pairsFromWhat,
      pairsFromRSem: graph.pairsFromRSem,
      rSemAvgSim: graph.rSemAvgSim,
      timeWindowFilteredPairs: graph.timeWindowFilteredPairs,
      topKPrunedPairs: graph.topKPrunedPairs,
      edgeDensity: graph.edgeDensity,
      reweightWhoUsed: graph.reweightWhoUsed,
      reweightWhereUsed: graph.reweightWhereUsed,
      time: graph.buildTime,
    });
    xbLog.info(MODULE_ID, 'No graph edges — skipping diffusion');
    return [];
  }

  // ─── 3. Build seed vector ────────────────────────────────────────

  const s = buildSeedVector(validSeeds, idToIdx, N);

  // ─── 4. Column normalize ─────────────────────────────────────────

  const { columns, dangling } = columnNormalize(graph.neighbors, N);

  // ─── 5. PPR Power Iteration ──────────────────────────────────────

  const T_PPR = performance.now();
  const { pi, iterations, finalError } = powerIteration(columns, s, dangling, N);
  const pprTime = Math.round(performance.now() - T_PPR);

  // Count activated non-seed nodes
  let pprActivated = 0;
  for (let i = 0; i < N; i++) {
    if (pi[i] > 0 && !seedAtomIds.has(atomIds[i])) pprActivated++;
  }

  // ─── 6. Post-verification ────────────────────────────────────────

  const vectorMap = new Map();
  for (const sv of (stateVectors || [])) {
    vectorMap.set(sv.atomId, sv.vector);
  }

  const { diffused, gateStats } = postVerify(
    pi, atomIds, atomById, seedAtomIds, vectorMap, queryVector
  );

  // ─── 7. Metrics ──────────────────────────────────────────────────

  const totalTime = Math.round(performance.now() - T0);

  fillMetrics(metrics, {
    seedCount: validSeeds.length,
    graphNodes: N,
    graphEdges: graph.edgeCount,
    channelStats: graph.channelStats,
    candidatePairs: graph.candidatePairs,
    pairsFromWhat: graph.pairsFromWhat,
    pairsFromRSem: graph.pairsFromRSem,
    rSemAvgSim: graph.rSemAvgSim,
    timeWindowFilteredPairs: graph.timeWindowFilteredPairs,
    topKPrunedPairs: graph.topKPrunedPairs,
    edgeDensity: graph.edgeDensity,
    reweightWhoUsed: graph.reweightWhoUsed,
    reweightWhereUsed: graph.reweightWhereUsed,
    buildTime: graph.buildTime,
    iterations,
    convergenceError: finalError,
    pprActivated,
    cosineGatePassed: gateStats.passed,
    cosineGateFiltered: gateStats.filtered,
    cosineGateNoVector: gateStats.noVector,
    postGatePassRate: pprActivated > 0
      ? Math.round((gateStats.passed / pprActivated) * 100)
      : 0,
    finalCount: diffused.length,
    scoreDistribution: diffused.length > 0
      ? calcScoreStats(diffused.map(d => d.finalScore))
      : { min: 0, max: 0, mean: 0 },
    time: totalTime,
  });

  xbLog.info(MODULE_ID,
    `Diffusion: ${validSeeds.length} seeds → ` +
    `graph(${N}n/${graph.edgeCount}e) → ` +
    `PPR(${iterations}it, ε=${finalError.toExponential(1)}, ${pprTime}ms) → ` +
    `${pprActivated} activated → ` +
    `gate(${gateStats.passed}\u2713/${gateStats.filtered}\u2717` +
    `${gateStats.noVector ? `/${gateStats.noVector}?` : ''}) → ` +
    `${diffused.length} final (${totalTime}ms)`
  );

  return diffused;
}

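// Illustrative call site (hypothetical wiring — the real integration lives in
// recall.js Stage 7.5 and is not reproduced here):
//
//   const diffused = diffuseFromSeeds(
//     l0Selected,            // reranked seeds from Stage 6
//     getStateAtoms(),       // all L0 atoms
//     getAllStateVectors(),  // state + R vectors
//     queryVector,           // R2 weighted query vector
//     metrics,               // optional; diffusion block is written in-place
//   );
//   for (const d of diffused) l0Selected.push(d);
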
// ═══════════════════════════════════════════════════════════════════════════
// Metrics helpers
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Compute min/max/mean distribution
 * @param {number[]} scores
 * @returns {{ min: number, max: number, mean: number }}
 */
function calcScoreStats(scores) {
  if (!scores.length) return { min: 0, max: 0, mean: 0 };
  const sorted = [...scores].sort((a, b) => a - b);
  const sum = sorted.reduce((a, b) => a + b, 0);
  return {
    min: Number(sorted[0].toFixed(3)),
    max: Number(sorted[sorted.length - 1].toFixed(3)),
    mean: Number((sum / sorted.length).toFixed(3)),
  };
}

/**
 * Fill metrics with empty diffusion block
 */
function fillMetricsEmpty(metrics) {
  if (!metrics) return;
  metrics.diffusion = {
    seedCount: 0,
    graphNodes: 0,
    graphEdges: 0,
    iterations: 0,
    convergenceError: 0,
    pprActivated: 0,
    cosineGatePassed: 0,
    cosineGateFiltered: 0,
    cosineGateNoVector: 0,
    finalCount: 0,
    scoreDistribution: { min: 0, max: 0, mean: 0 },
    byChannel: { what: 0, where: 0, rSem: 0, who: 0 },
    candidatePairs: 0,
    pairsFromWhat: 0,
    pairsFromRSem: 0,
    rSemAvgSim: 0,
    timeWindowFilteredPairs: 0,
    topKPrunedPairs: 0,
    edgeDensity: 0,
    reweightWhoUsed: 0,
    reweightWhereUsed: 0,
    postGatePassRate: 0,
    time: 0,
  };
}

/**
 * Fill metrics with diffusion results
 */
function fillMetrics(metrics, data) {
  if (!metrics) return;
  metrics.diffusion = {
    seedCount: data.seedCount || 0,
    graphNodes: data.graphNodes || 0,
    graphEdges: data.graphEdges || 0,
    iterations: data.iterations || 0,
    convergenceError: data.convergenceError || 0,
    pprActivated: data.pprActivated || 0,
    cosineGatePassed: data.cosineGatePassed || 0,
    cosineGateFiltered: data.cosineGateFiltered || 0,
    cosineGateNoVector: data.cosineGateNoVector || 0,
    postGatePassRate: data.postGatePassRate || 0,
    finalCount: data.finalCount || 0,
    scoreDistribution: data.scoreDistribution || { min: 0, max: 0, mean: 0 },
    byChannel: data.channelStats || { what: 0, where: 0, rSem: 0, who: 0 },
    candidatePairs: data.candidatePairs || 0,
    pairsFromWhat: data.pairsFromWhat || 0,
    pairsFromRSem: data.pairsFromRSem || 0,
    rSemAvgSim: data.rSemAvgSim || 0,
    timeWindowFilteredPairs: data.timeWindowFilteredPairs || 0,
    topKPrunedPairs: data.topKPrunedPairs || 0,
    edgeDensity: data.edgeDensity || 0,
    reweightWhoUsed: data.reweightWhoUsed || 0,
    reweightWhereUsed: data.reweightWhereUsed || 0,
    time: data.time || 0,
  };
}