// LittleWhiteBox/modules/story-summary/vector/retrieval/entity-lexicon.js

// ═══════════════════════════════════════════════════════════════════════════
// entity-lexicon.js - 实体词典（确定性，无 LLM）
//
// 职责：
// 1. 从已有结构化存储构建可信实体词典
// 2. 从文本中提取命中的实体
//
// 硬约束：name1 永不进入词典
// ═══════════════════════════════════════════════════════════════════════════

import { getStateAtoms } from '../storage/state-store.js';

/**
 * Canonicalise a value so that entity names can be compared reliably.
 *
 * Steps, in order: coerce to a string (any falsy input becomes ''), apply
 * Unicode NFKC normalization, remove the code points U+200B–U+200D and
 * U+FEFF, trim surrounding whitespace, and lower-case the result.
 *
 * @param {string} s - Raw name or text.
 * @returns {string} Canonical form; '' when the input is falsy.
 */
function normalize(s) {
    const raw = s ? String(s) : '';
    const compat = raw.normalize('NFKC');
    const visible = compat.replace(/[\u200B-\u200D\uFEFF]/g, '');
    return visible.trim().toLowerCase();
}

/**
 * Build the entity lexicon (deterministic, no LLM involved).
 *
 * Sources, visited in this order:
 *   1. store.json.characters.main — confirmed main characters; each entry is
 *      either a plain string or an object carrying a `name`
 *   2. store.json.arcs[].name     — arc subjects
 *   3. context.name2              — the current character
 *   4. store.json.facts[].s       — subjects of facts not flagged `retracted`
 *   5. getStateAtoms()[].who[]    — lets new characters enter the lexicon
 *      before they have been summarised
 *
 * Every name goes through normalize(); results shorter than two UTF-16 code
 * units are dropped. Missing containers, non-array containers and null
 * entries are skipped instead of throwing, so one malformed record cannot
 * abort the whole build.
 *
 * Hard constraint: normalize(context.name1) is never part of the result.
 *
 * @param {object} store   - Summary store; only `store.json` is read.
 * @param {object} context - { name1: string, name2: string }
 * @returns {Set<string>} Normalized entity names.
 */
export function buildEntityLexicon(store, context) {
    const lexicon = new Set();

    // Stored data may be absent or malformed: anything that is not an array
    // is treated as an empty list.
    const listOf = (value) => (Array.isArray(value) ? value : []);

    // Adds a name once normalized; names that normalize to fewer than two
    // code units are ignored.
    const add = (raw) => {
        const n = normalize(raw);
        if (n.length >= 2) lexicon.add(n);
    };

    // 1. Main characters (string entries or { name } objects).
    for (const m of listOf(store?.json?.characters?.main)) {
        add(typeof m === 'string' ? m : m?.name);
    }

    // 2. Arc subjects.
    for (const a of listOf(store?.json?.arcs)) {
        add(a?.name);
    }

    // 3. Current character; a missing name2 normalizes to '' and is ignored.
    add(context?.name2);

    // 4. Fact subjects, skipping retracted facts and null entries.
    for (const f of listOf(store?.json?.facts)) {
        if (!f || f.retracted) continue;
        add(f.s);
    }

    // 5. `who` lists of the state atoms.
    for (const atom of listOf(getStateAtoms())) {
        for (const name of listOf(atom?.who)) {
            add(name);
        }
    }

    // Hard constraint: name1 must never be in the lexicon, whichever source
    // introduced it.
    if (context?.name1) {
        lexicon.delete(normalize(context.name1));
    }

    return lexicon;
}

/**
 * Build a lookup from normalized name to its original (display) spelling, so
 * that lexicon entries can be turned back into the form they were written in.
 *
 * Sources are visited in the same order as in buildEntityLexicon:
 * store.json.characters.main, store.json.arcs[].name, context.name2,
 * store.json.facts[].s (facts flagged `retracted` are skipped), then
 * getStateAtoms()[].who[]. When several spellings normalize to the same key,
 * the first one seen wins. Names that normalize to fewer than two UTF-16
 * code units are ignored.
 *
 * Missing containers, non-array containers and null entries are skipped
 * instead of throwing.
 *
 * Hard constraint: the key normalize(context.name1) is never present.
 *
 * @param {object} store   - Summary store; only `store.json` is read.
 * @param {object} context - { name1: string, name2: string }
 * @returns {Map<string, string>} normalize(name) → trimmed original spelling.
 */
export function buildDisplayNameMap(store, context) {
    const map = new Map();

    // Stored data may be absent or malformed: anything that is not an array
    // is treated as an empty list.
    const listOf = (value) => (Array.isArray(value) ? value : []);

    // Records the first spelling seen for each normalized key.
    const register = (raw) => {
        const n = normalize(raw);
        if (n.length >= 2 && !map.has(n)) {
            map.set(n, String(raw).trim());
        }
    };

    // 1. Main characters (string entries or { name } objects).
    for (const m of listOf(store?.json?.characters?.main)) {
        register(typeof m === 'string' ? m : m?.name);
    }

    // 2. Arc subjects.
    for (const a of listOf(store?.json?.arcs)) {
        register(a?.name);
    }

    // 3. Current character; a missing name2 normalizes to '' and is ignored.
    register(context?.name2);

    // 4. Fact subjects, skipping retracted facts and null entries.
    for (const f of listOf(store?.json?.facts)) {
        if (!f || f.retracted) continue;
        register(f.s);
    }

    // 5. `who` lists of the state atoms.
    for (const atom of listOf(getStateAtoms())) {
        for (const name of listOf(atom?.who)) {
            register(name);
        }
    }

    // Hard constraint: name1 must never be resolvable through this map.
    if (context?.name1) {
        map.delete(normalize(context.name1));
    }

    return map;
}

/**
 * Find which lexicon entities occur in a piece of text.
 *
 * The text is normalized with normalize() and every lexicon entry is tested
 * as a plain substring of it, so matching is case-insensitive. Each matching
 * entity is reported once, in lexicon iteration order, using its display
 * spelling from `displayMap` when one exists and the normalized entity
 * itself otherwise.
 *
 * @param {string} text - Cleaned text to scan.
 * @param {Set<string>} lexicon - Normalized entity names.
 * @param {Map<string, string>} displayMap - normalize(name) → original spelling.
 * @returns {string[]} Matched entities; [] when the text or the lexicon is empty.
 */
export function extractEntitiesFromText(text, lexicon, displayMap) {
    const hasInput = Boolean(text) && Boolean(lexicon?.size);
    if (!hasInput) return [];

    const haystack = normalize(text);

    // Collect matches first; the Set keeps first-seen order and drops repeats.
    const matched = new Set();
    for (const entity of lexicon) {
        if (haystack.includes(entity)) matched.add(entity);
    }

    // Prefer the original spelling over the normalized key.
    return Array.from(matched, (entity) => displayMap?.get(entity) || entity);
}