fix(recall): keep only matched atoms and filter evidence by focus entities

2026-02-12 00:05:19 +08:00
parent 9f279d902f
commit 111cd081f6
2 changed files with 80 additions and 15 deletions
--- a/modules/story-summary/generate/prompt.js
+++ b/modules/story-summary/generate/prompt.js
@@ -53,6 +53,8 @@ const TOP_N_STAR = 5;
 // L0 display text: length threshold for choosing between semicolon-joined and multi-line mode
 const L0_JOINED_MAX_LENGTH = 120;
 // Similarity bypass threshold for background-evidence entity filtering (kept consistent with the event filtering strategy)
 const EVIDENCE_ENTITY_BYPASS_SIM = 0.80;
 // ─────────────────────────────────────────────────────────────────────────────
 // 工具函数
@@ -123,6 +125,63 @@ function normalize(s) {
        .toLowerCase();
 }
 /**
 * Build the set of normalized entity names referenced by an L0 item
 * (used when filtering background evidence by focus entities).
 * Reads the new structure (`who` / `edges[].s` / `edges[].t`) and also the
 * legacy `subject` / `object` fields. The atom is taken from `l0.atom`,
 * falling back to `l0._atom`.
 * @param {object} l0 - L0 item carrying an atom
 * @returns {Set<string>} normalized, non-empty entity names
 */
 function collectL0Entities(l0) {
    const source = l0?.atom || l0?._atom || {};

    // Gather raw candidates first, in the order: who, edge endpoints, legacy fields.
    const candidates = [...(source.who || [])];
    for (const edge of (source.edges || [])) {
        candidates.push(edge?.s, edge?.t);
    }
    // Legacy data compatibility
    candidates.push(source.subject, source.object);

    const entities = new Set();
    for (const raw of candidates) {
        const key = normalize(raw);
        // Skip anything that normalizes to an empty value.
        if (key) entities.add(key);
    }
    return entities;
 }
 /**
 * Decide whether a background-evidence L0 item is kept, filtering by focus entities.
 * Rules, checked in order:
 * 1) No focus entities: keep.
 * 2) similarity >= EVIDENCE_ENTITY_BYPASS_SIM: keep (bypass).
 * 3) An entity collected from the atom (who / edges / legacy subject, object)
 *    equals a focus entity: keep.
 * 4) Legacy fallback: the normalized semantic text contains a focus entity: keep.
 * Otherwise the item is filtered out.
 * @param {object} l0 - L0 item (atom under `atom` or `_atom`, optional `similarity` and `text`)
 * @param {Set<string>} focusSet - normalized focus entity names
 * @returns {boolean} true when the item should be kept
 */
 function shouldKeepEvidenceL0(l0, focusSet) {
    if (!focusSet?.size) return true;
    if ((l0?.similarity || 0) >= EVIDENCE_ENTITY_BYPASS_SIM) return true;

    const entities = collectL0Entities(l0);
    for (const f of focusSet) {
        if (entities.has(f)) return true;
    }

    // Legacy fallback: substring match against the semantic text.
    // Resolve the atom the same way collectL0Entities does (atom, then _atom),
    // so items that carry their atom under `_atom` are not silently skipped.
    const atom = l0?.atom || l0?._atom;
    const textNorm = normalize(atom?.semantic || l0?.text || '');
    for (const f of focusSet) {
        // Guard against empty strings: ''.includes('') would match everything.
        if (f && textNorm.includes(f)) return true;
    }

    return false;
 }
 /**
 * 获取事件排序键
 * @param {object} event - 事件对象
@@ -894,8 +953,11 @@ async function buildVectorPrompt(store, recallResult, causalById, focusEntities,
    const keepVisible = store.keepVisibleCount ?? 3;
    // 收集未被事件消费的 L0，按 rerankScore 降序
    const focusSetForEvidence = new Set((focusEntities || []).map(normalize).filter(Boolean));
    const remainingL0 = l0Selected
        .filter(l0 => !usedL0Ids.has(l0.id))
        .filter(l0 => shouldKeepEvidenceL0(l0, focusSetForEvidence))
        .sort((a, b) => (b.rerankScore || 0) - (a.rerankScore || 0));
    // 远期：floor <= lastSummarized
--- a/modules/story-summary/vector/retrieval/recall.js
+++ b/modules/story-summary/vector/retrieval/recall.js
@@ -723,18 +723,21 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
    // 6g. 收集 L0 atoms + L1 top-1 配对
    // ─────────────────────────────────────────────────────────────────
-    const atomsList = getStateAtoms();
+    // 仅保留“真实 dense 命中”的 L0 原子：
-    const atomsByFloor = new Map();
+    // 旧逻辑按 floor 全塞，容易把同层无关原子带进来。
-    for (const atom of atomsList) {
+    const atomById = new Map(getStateAtoms().map(a => [a.atomId, a]));
-        if (typeof atom.floor !== 'number' || atom.floor < 0) continue;
+    const matchedAtomsByFloor = new Map();
-        if (!atomsByFloor.has(atom.floor)) atomsByFloor.set(atom.floor, []);
+    for (const hit of (anchorHits || [])) {
-        atomsByFloor.get(atom.floor).push(atom);
+        const atom = hit.atom || atomById.get(hit.atomId);
        if (!atom) continue;
        if (!matchedAtomsByFloor.has(hit.floor)) matchedAtomsByFloor.set(hit.floor, []);
        matchedAtomsByFloor.get(hit.floor).push({
            atom,
            similarity: hit.similarity,
        });
    }
-
+    for (const arr of matchedAtomsByFloor.values()) {
-    const denseFloorMaxMap = new Map();
+        arr.sort((a, b) => b.similarity - a.similarity);
    for (const a of (anchorHits || [])) {
        const cur = denseFloorMaxMap.get(a.floor) || 0;
        if (a.similarity > cur) denseFloorMaxMap.set(a.floor, a.similarity);
    }
    const l0Selected = [];
@@ -744,15 +747,15 @@ async function locateAndPullEvidence(anchorHits, queryVector, rerankQuery, lexic
    for (const item of reranked) {
        const floor = item.floor;
        const rerankScore = item._rerankScore || 0;
        const denseSim = denseFloorMaxMap.get(floor) || 0;
-        const floorAtoms = atomsByFloor.get(floor) || [];
+        // 仅收集该 floor 中真实命中的 L0 atoms
-        for (const atom of floorAtoms) {
+        const floorMatchedAtoms = matchedAtomsByFloor.get(floor) || [];
        for (const { atom, similarity } of floorMatchedAtoms) {
            l0Selected.push({
                id: `anchor-${atom.atomId}`,
                atomId: atom.atomId,
                floor: atom.floor,
-                similarity: denseSim,
+                similarity,
                rerankScore,
                atom,
                text: atom.semantic || '',