// ============================================================================ // atom-extraction.js - L0 场景锚点提取（v2 - 场景摘要 + 图结构） // // 设计依据： // - BGE-M3 (BAAI, 2024): 自然语言段落检索精度最高 → semantic = 纯自然语言 // - TransE (Bordes, 2013): s/t/r 三元组方向性 → edges 格式 // // 每楼层 1-2 个场景锚点（非碎片原子），60-100 字场景摘要 // ============================================================================ import { callLLM, parseJson } from './llm-service.js'; import { xbLog } from '../../../../core/debug-core.js'; import { filterText } from '../utils/text-filter.js'; const MODULE_ID = 'atom-extraction'; const CONCURRENCY = 10; const RETRY_COUNT = 2; const RETRY_DELAY = 500; const DEFAULT_TIMEOUT = 40000; const STAGGER_DELAY = 80; const DEBUG_RAW_PREVIEW_LEN = 800; let batchCancelled = false; export function cancelBatchExtraction() { batchCancelled = true; } export function isBatchCancelled() { return batchCancelled; } // ============================================================================ // L0 提取 Prompt // ============================================================================ const SYSTEM_PROMPT = `你是场景摘要器。从一轮对话中提取1-2个场景锚点，用于语义检索和关系追踪。输入格式： ... ... 只输出严格JSON： {"anchors":[ { "scene": "60-100字完整场景描述", "edges": [{"s":"施事方","t":"受事方","r":"互动行为"}], "where": "地点" } ]} ## scene 写法 - 纯自然语言，像旁白或日记，不要任何标签/标记/枚举值 - 必须包含：角色名、动作、情感氛围、关键细节 - 读者只看 scene 就能复原这一幕 - 60-100字，信息密集但流畅 ## edges（关系三元组） - s=施事方 t=受事方 r=互动行为（建议 6-12 字，最多 20 字） - s/t 必须是参与互动的角色正式名称，不用代词或别称 - 只从正文内容中识别角色名，不要把标签名（如 user、assistant）当作角色 - r 使用动作模板短语：“动作+对象/结果”（例：“提出交易条件”、“拒绝对方请求”、“当众揭露秘密”、“安抚对方情绪”） - r 不要写人名，不要复述整句，不要写心理描写或评价词 - r 正例（合格）：提出交易条件、拒绝对方请求、当众揭露秘密、安抚对方情绪、强行打断发言、转移谈话焦点 - r 反例（不合格）：我觉得她现在很害怕、他突然非常生气地大喊起来、user开始说话、assistant解释了很多细节 - 每个锚点 1-3 条 ## where - 场景地点，无明确地点时空字符串 ## 数量规则 - 最多2个。1个够时不凑2个 - 明显场景切换（地点/时间/对象变化）时才2个 - 同一场景不拆分 - 无角色互动时返回 {"anchors":[]} ## 示例输入：艾拉在火山口举起圣剑刺穿古龙心脏，龙血溅满她的铠甲，她跪倒在地痛哭输出： {"anchors":[{"scene":"火山口上艾拉举起圣剑刺穿古龙的心脏，龙血溅满铠甲，古龙轰然倒地，艾拉跪倒在滚烫的岩石上痛哭，完成了她不得不做的弑杀","edges":[{"s":"艾拉","t":"古龙","r":"以圣剑刺穿心脏"}],"where":"火山口"}]}`; // ============================================================================ // 睡眠工具 // ============================================================================ const sleep = (ms) => new Promise(r => setTimeout(r, ms)); function previewText(text, maxLen = DEBUG_RAW_PREVIEW_LEN) { const raw = String(text ?? '').replace(/\s+/g, ' ').trim(); if (!raw) return '(empty)'; return raw.length > maxLen ? `${raw.slice(0, maxLen)} ...(truncated)` : raw; } const ACTION_STRIP_WORDS = [ '突然', '非常', '有些', '有点', '轻轻', '悄悄', '缓缓', '立刻', '马上', '然后', '并且', '而且', '开始', '继续', '再次', '正在', ]; function clamp(v, min, max) { return Math.max(min, Math.min(max, v)); } function sanitizeActionPhrase(raw) { let text = String(raw || '') .normalize('NFKC') .replace(/[\u200B-\u200D\uFEFF]/g, '') .trim(); if (!text) return ''; text = text .replace(/[，。！？、；：,.!?;:"'“”‘’()（）[\]{}<>《》]/g, '') .replace(/\s+/g, ''); for (const word of ACTION_STRIP_WORDS) { text = text.replaceAll(word, ''); } text = text.replace(/(地|得|了|着|过)+$/g, ''); if (text.length < 2) return ''; if (text.length > 12) text = text.slice(0, 12); return text; } function calcAtomQuality(scene, edges, where) { const sceneLen = String(scene || '').length; const sceneScore = clamp(sceneLen / 80, 0, 1); const edgeScore = clamp((edges?.length || 0) / 3, 0, 1); const whereScore = where ? 1 : 0; const quality = 0.55 * sceneScore + 0.35 * edgeScore + 0.10 * whereScore; return Number(quality.toFixed(3)); } // ============================================================================ // 清洗与构建 // ============================================================================ /** * 清洗 edges 三元组 * @param {object[]} raw * @returns {object[]} */ function sanitizeEdges(raw) { if (!Array.isArray(raw)) return []; return raw .filter(e => e && typeof e === 'object') .map(e => ({ s: String(e.s || '').trim(), t: String(e.t || '').trim(), r: sanitizeActionPhrase(e.r), })) .filter(e => e.s && e.t && e.r) .slice(0, 3); } /** * 将解析后的 anchor 转换为 atom 存储对象 * * semantic = scene（纯自然语言，直接用于 embedding） * * @param {object} anchor - LLM 输出的 anchor 对象 * @param {number} aiFloor - AI 消息楼层号 * @param {number} idx - 同楼层序号（0 或 1） * @returns {object|null} atom 对象 */ function anchorToAtom(anchor, aiFloor, idx) { const scene = String(anchor.scene || '').trim(); if (!scene) return null; // scene 过短（< 15 字）可能是噪音 if (scene.length < 15) return null; const edges = sanitizeEdges(anchor.edges); const where = String(anchor.where || '').trim(); const quality = calcAtomQuality(scene, edges, where); return { atomId: `atom-${aiFloor}-${idx}`, floor: aiFloor, source: 'ai', // ═══ 检索层（embedding 的唯一入口） ═══ semantic: scene, // ═══ 图结构层（扩散的 key） ═══ edges, where, quality, }; } // ============================================================================ // 单轮提取（带重试） // ============================================================================ async function extractAtomsForRoundWithRetry(userMessage, aiMessage, aiFloor, options = {}) { const { timeout = DEFAULT_TIMEOUT } = options; if (!aiMessage?.mes?.trim()) return []; const parts = []; const userName = userMessage?.name || '用户'; if (userMessage?.mes?.trim()) { const userText = filterText(userMessage.mes); parts.push(`\n${userText}\n`); } const aiText = filterText(aiMessage.mes); parts.push(`\n${aiText}\n`); const input = `\n${parts.join('\n')}\n\n请读取上述内容，提取 1-2 个场景锚点，并严格按 JSON 输出。\n不要解释，不要续写，不要角色扮演，不要输出 JSON 以外的任何内容。`; for (let attempt = 0; attempt <= RETRY_COUNT; attempt++) { if (batchCancelled) return []; try { const response = await callLLM([ { role: 'system', content: SYSTEM_PROMPT }, { role: 'user', content: input }, ], { temperature: 0.3, max_tokens: 600, timeout, }); const rawText = String(response || ''); xbLog.info(MODULE_ID, `floor ${aiFloor} attempt ${attempt} rawText(len=${rawText.length}): ${previewText(rawText)}`); if (!rawText.trim()) { if (attempt < RETRY_COUNT) { await sleep(RETRY_DELAY); continue; } return null; } xbLog.info(MODULE_ID, `floor ${aiFloor} attempt ${attempt} parseSource(len=${rawText.length}): ${previewText(rawText)}`); let parsed; try { parsed = parseJson(rawText); } catch (e) { xbLog.warn(MODULE_ID, `floor ${aiFloor} JSON解析失败 (attempt ${attempt})`); if (attempt < RETRY_COUNT) { await sleep(RETRY_DELAY); continue; } return null; } // 兼容：优先 anchors，回退 atoms const rawAnchors = parsed?.anchors; if (!rawAnchors || !Array.isArray(rawAnchors)) { xbLog.warn(MODULE_ID, `floor ${aiFloor} attempt ${attempt} 缺少有效 anchors，parsed=${previewText(JSON.stringify(parsed))}`); if (attempt < RETRY_COUNT) { await sleep(RETRY_DELAY); continue; } return null; } // 转换为 atom 存储格式（最多 2 个） const atoms = rawAnchors .slice(0, 2) .map((a, idx) => anchorToAtom(a, aiFloor, idx)) .filter(Boolean); xbLog.info(MODULE_ID, `floor ${aiFloor} attempt ${attempt} anchors=${rawAnchors.length} atoms=${atoms.length}`); if (rawAnchors.length === 0) { return []; } return atoms; } catch (e) { if (batchCancelled) return null; if (attempt < RETRY_COUNT) { await sleep(RETRY_DELAY * (attempt + 1)); continue; } xbLog.error(MODULE_ID, `floor ${aiFloor} 失败`, e); return null; } } return null; } export async function extractAtomsForRound(userMessage, aiMessage, aiFloor, options = {}) { return extractAtomsForRoundWithRetry(userMessage, aiMessage, aiFloor, options); } // ============================================================================ // 批量提取 // ============================================================================ export async function batchExtractAtoms(chat, onProgress) { if (!chat?.length) return []; batchCancelled = false; const pairs = []; for (let i = 0; i < chat.length; i++) { if (!chat[i].is_user) { const userMsg = (i > 0 && chat[i - 1]?.is_user) ? chat[i - 1] : null; pairs.push({ userMsg, aiMsg: chat[i], aiFloor: i }); } } if (!pairs.length) return []; const allAtoms = []; let completed = 0; let failed = 0; for (let i = 0; i < pairs.length; i += CONCURRENCY) { if (batchCancelled) break; const batch = pairs.slice(i, i + CONCURRENCY); if (i === 0) { const promises = batch.map((pair, idx) => (async () => { await sleep(idx * STAGGER_DELAY); if (batchCancelled) return; try { const atoms = await extractAtomsForRoundWithRetry( pair.userMsg, pair.aiMsg, pair.aiFloor, { timeout: DEFAULT_TIMEOUT } ); if (atoms?.length) { allAtoms.push(...atoms); } else if (atoms === null) { failed++; } } catch { failed++; } completed++; onProgress?.(completed, pairs.length, failed); })()); await Promise.all(promises); } else { const promises = batch.map(pair => extractAtomsForRoundWithRetry( pair.userMsg, pair.aiMsg, pair.aiFloor, { timeout: DEFAULT_TIMEOUT } ) .then(atoms => { if (batchCancelled) return; if (atoms?.length) { allAtoms.push(...atoms); } else if (atoms === null) { failed++; } completed++; onProgress?.(completed, pairs.length, failed); }) .catch(() => { if (batchCancelled) return; failed++; completed++; onProgress?.(completed, pairs.length, failed); }) ); await Promise.all(promises); } if (i + CONCURRENCY < pairs.length && !batchCancelled) { await sleep(30); } } xbLog.info(MODULE_ID, `批量提取完成: ${allAtoms.length} atoms, ${failed} 失败`); return allAtoms; }