Files
LittleWhiteBox/modules/story-summary/vector/retrieval/entity-lexicon.js

170 lines
4.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ═══════════════════════════════════════════════════════════════════════════
// entity-lexicon.js - 实体词典(确定性,无 LLM
//
// 职责:
// 1. 从已有结构化存储构建可信实体词典
// 2. 从文本中提取命中的实体
//
// 硬约束name1 永不进入词典
// ═══════════════════════════════════════════════════════════════════════════
import { getStateAtoms } from '../storage/state-store.js';
/**
* 标准化字符串(用于实体匹配)
* @param {string} s
* @returns {string}
*/
function normalize(s) {
return String(s || '')
.normalize('NFKC')
.replace(/[\u200B-\u200D\uFEFF]/g, '')
.trim()
.toLowerCase();
}
/**
* 构建实体词典
*
* 来源(按可信度):
* 1. store.json.characters.main — 已确认主要角色
* 2. store.json.arcs[].name — 弧光对象
* 3. context.name2 — 当前角色
* 4. store.json.facts[].s — L3 事实主语
*
* 硬约束:永远排除 normalize(context.name1)
*
* @param {object} store - getSummaryStore() 返回值
* @param {object} context - { name1: string, name2: string }
* @returns {Set<string>} 标准化后的实体集合
*/
export function buildEntityLexicon(store, context) {
const lexicon = new Set();
// 内部辅助:添加非空实体
const add = (raw) => {
const n = normalize(raw);
if (n && n.length >= 2) lexicon.add(n);
};
// 1. 主要角色
const main = store?.json?.characters?.main || [];
for (const m of main) {
add(typeof m === 'string' ? m : m.name);
}
// 2. 弧光角色
const arcs = store?.json?.arcs || [];
for (const a of arcs) {
add(a.name);
}
// 3. 当前角色 name2
if (context?.name2) {
add(context.name2);
}
// 4. L3 facts 主语
const facts = store?.json?.facts || [];
for (const f of facts) {
if (f.retracted) continue;
add(f.s);
}
// 5. L0 atoms 的 who新角色在 L2 总结前即可进入词典)
const atoms = getStateAtoms();
for (const atom of atoms) {
for (const name of (atom.who || [])) {
add(name);
}
}
// ★ 硬约束:删除 name1
if (context?.name1) {
lexicon.delete(normalize(context.name1));
}
return lexicon;
}
/**
* 构建"原词形 → 标准化"映射表
* 用于从 lexicon 反查原始显示名
*
* @param {object} store
* @param {object} context
* @returns {Map<string, string>} normalize(name) → 原词形
*/
export function buildDisplayNameMap(store, context) {
const map = new Map();
const register = (raw) => {
const n = normalize(raw);
if (n && n.length >= 2 && !map.has(n)) {
map.set(n, String(raw).trim());
}
};
const main = store?.json?.characters?.main || [];
for (const m of main) {
register(typeof m === 'string' ? m : m.name);
}
const arcs = store?.json?.arcs || [];
for (const a of arcs) {
register(a.name);
}
if (context?.name2) register(context.name2);
const facts = store?.json?.facts || [];
for (const f of facts) {
if (!f.retracted) register(f.s);
}
// 5. L0 atoms 的 who
const atoms = getStateAtoms();
for (const atom of atoms) {
for (const name of (atom.who || [])) {
register(name);
}
}
// ★ 硬约束:删除 name1
if (context?.name1) {
map.delete(normalize(context.name1));
}
return map;
}
/**
* 从文本中提取命中的实体
*
* 逻辑:遍历词典,检查文本中是否包含(不区分大小写)
* 返回命中的实体原词形(去重)
*
* @param {string} text - 清洗后的文本
* @param {Set<string>} lexicon - 标准化后的实体集合
* @param {Map<string, string>} displayMap - normalize → 原词形
* @returns {string[]} 命中的实体(原词形)
*/
export function extractEntitiesFromText(text, lexicon, displayMap) {
if (!text || !lexicon?.size) return [];
const textNorm = normalize(text);
const hits = [];
const seen = new Set();
for (const entity of lexicon) {
if (textNorm.includes(entity) && !seen.has(entity)) {
seen.add(entity);
// 优先返回原词形
const display = displayMap?.get(entity) || entity;
hits.push(display);
}
}
return hits;
}