Files
LittleWhiteBox/modules/story-summary/vector/llm/atom-extraction.js

362 lines
12 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ============================================================================
// atom-extraction.js - L0 叙事锚点提取(三层 themes 版)
// ============================================================================
import { callLLM, parseJson } from './llm-service.js';
import { xbLog } from '../../../../core/debug-core.js';
import { filterText } from '../utils/text-filter.js';
// Module tag passed as the first argument to every xbLog call in this file.
const MODULE_ID = 'atom-extraction';
// Number of rounds handed to the LLM per batch in batchExtractAtoms (batch slice size).
const CONCURRENCY = 10;
// Extra attempts after the first one: a round is tried at most RETRY_COUNT + 1 times.
const RETRY_COUNT = 2;
// Base wait between attempts, in milliseconds (fed to sleep/setTimeout).
const RETRY_DELAY = 500;
// Timeout forwarded to callLLM — presumably milliseconds; confirm against llm-service.js.
const DEFAULT_TIMEOUT = 20000;
// Milliseconds between the start times of consecutive requests in the first batch.
const STAGGER_DELAY = 80;
// Module-wide cancellation flag: set by cancelBatchExtraction(), cleared when
// batchExtractAtoms() starts, and polled by the extraction loops.
let batchCancelled = false;
/**
 * Request cancellation of the extraction that is currently running.
 *
 * Only raises the module-level flag; in-flight work notices it the next time
 * it polls the flag.
 *
 * @returns {void}
 */
export function cancelBatchExtraction() {
    batchCancelled = true;
}
/**
 * Report whether cancellation has been requested.
 *
 * @returns {boolean} Current value of the module-level cancellation flag.
 */
export function isBatchCancelled() {
    return batchCancelled;
}
// ============================================================================
// L0 提取 Prompt(三层 themes)
// ============================================================================
// System prompt (written in Chinese) sent as the `system` message of every
// extraction request. It asks the model to pull 4-8 retrieval anchors out of a
// single <round> and to answer with strict JSON of the form
// {"atoms":[{t,s,o,v,l,f,th:{fn,pt,kw}}]}:
//   t  - anchor type: emo | act | rev | dec | ten | loc
//   s/o/v/l - subject / object / predicate / location
//   f  - source: "u" (user) or "a" (character)
//   th - three theme layers: fn (narrative function), pt (interaction
//        pattern), kw (free-form keywords)
// The text is runtime data: any edit changes what the model is asked to do.
const SYSTEM_PROMPT = `你是叙事锚点提取器。从一轮对话中提取4-8个关键锚点用于后续语义检索。
输入格式:
<round>
<user name="用户名">...</user>
<assistant name="角色名">...</assistant>
</round>
只输出严格JSON
{"atoms":[{"t":"类型","s":"主体","o":"客体","v":"谓词","l":"地点","f":"来源","th":{"fn":[],"pt":[],"kw":[]}}]}
## 类型t
- emo: 情绪状态变化
- act: 关键动作/行为
- rev: 揭示/发现/真相
- dec: 决定/承诺/宣言
- ten: 冲突/张力/对立
- loc: 场景/地点变化
## 字段说明
- s: 主体(必填)
- o: 客体(可空)
- v: 谓词15字内必填
- l: 地点(可空)
- f: "u"=用户 / "a"=角色(必填)
- th: 主题标签(必填,结构化对象)
## th 三层结构
fn叙事功能1-2个枚举
establish=建立设定 | escalate=升级加剧 | reveal=揭示发现 | challenge=挑战试探
commit=承诺锁定 | conflict=冲突对抗 | resolve=解决收束 | transform=转变逆转
bond=连接羁绊 | break=断裂破坏
pt互动模式1-3个枚举
power_down=上对下 | power_up=下对上 | power_equal=对等 | power_contest=争夺
asymmetric=信息不对称 | witnessed=有观众 | secluded=隔绝私密
ritual=仪式正式 | routine=日常惯例 | triangular=三方介入
kw具体关键词1-3个自由格式
## 示例输出
{"atoms":[
{"t":"act","s":"艾拉","o":"古龙","v":"用圣剑刺穿心脏","l":"火山口","f":"a",
"th":{"fn":["commit"],"pt":["power_down","ritual"],"kw":["战斗","牺牲"]}},
{"t":"emo","s":"林夏","o":"陆远","v":"意识到自己喜欢他","l":"","f":"a",
"th":{"fn":["reveal","escalate"],"pt":["asymmetric","secluded"],"kw":["心动","暗恋"]}},
{"t":"dec","s":"凯尔","o":"王国","v":"放弃王位继承权","l":"王座厅","f":"a",
"th":{"fn":["commit","break"],"pt":["ritual","witnessed"],"kw":["抉择","自由"]}},
{"t":"rev","s":"","o":"","v":"管家其实是间谍","l":"","f":"a",
"th":{"fn":["reveal"],"pt":["asymmetric"],"kw":["背叛","真相"]}},
{"t":"ten","s":"兄弟二人","o":"","v":"为遗产反目","l":"","f":"a",
"th":{"fn":["conflict","break"],"pt":["power_contest"],"kw":["冲突","亲情破裂"]}}
]}
规则:
- 只提取对未来检索有价值的锚点
- fn 回答"这在故事里推动了什么"
- pt 回答"这是什么结构的互动"
- kw 用于细粒度检索
- 无明显锚点时返回 {"atoms":[]}`;
// Opening of the expected JSON answer. It is sent as the trailing `assistant`
// message of each request and prepended to the model's reply before parsing.
const JSON_PREFILL = '{"atoms":[';
// ============================================================================
// Semantic 构建
// ============================================================================
/**
 * Render one raw LLM atom as a single-line "semantic" string.
 *
 * Layout: `<type> …type-specific body… [tag/tag/…]`, where the tag list is the
 * concatenation of the three theme layers (fn, pt, kw) with falsy entries
 * removed, and is omitted entirely when no tag remains.
 *
 * @param {object} atom     Raw atom with short keys: t (type), s (subject),
 *                          o (object), v (predicate), l (location),
 *                          f (source), th ({fn, pt, kw}).
 * @param {string} userName Subject fallback when `s` is empty and `f === 'u'`.
 * @param {string} aiName   Subject fallback when `s` is empty otherwise.
 * @returns {string} The rendered semantic line.
 */
function buildSemantic(atom, userName, aiName) {
    const kind = atom.t || 'act';
    const who = atom.s || (atom.f === 'u' ? userName : aiName);
    const target = atom.o || '';
    const predicate = atom.v || '';
    const place = atom.l || '';

    // Flatten the three theme layers; layers that are not arrays are ignored.
    const themeLayers = atom.th || {};
    const labels = ['fn', 'pt', 'kw']
        .flatMap((layer) => (Array.isArray(themeLayers[layer]) ? themeLayers[layer] : []))
        .filter(Boolean);

    const head = `<${kind}>`;
    // NOTE(review): the location is glued on with no separator, so it runs
    // straight into the preceding token. Kept as-is to preserve output.
    const placeSuffix = place ? `${place}` : '';
    const targetSuffix = target ? ` -> ${target}` : '';
    const themeSuffix = labels.length > 0 ? ` [${labels.join('/')}]` : '';

    // Shared layout of 'emo' and 'dec': subject -> predicate (towards object).
    const directed = () => (target
        ? `${head} ${who} -> ${predicate} (对${target})${placeSuffix}`
        : `${head} ${who} -> ${predicate}${placeSuffix}`);

    // Layout of 'act', also used for any unrecognised type.
    const generic = () => `${head} ${who} -> ${predicate}${targetSuffix}${placeSuffix}`;

    const renderers = new Map([
        ['emo', directed],
        ['dec', directed],
        ['act', generic],
        ['rev', () => (target
            ? `${head} 揭示: ${predicate} (关于${target})${placeSuffix}`
            : `${head} 揭示: ${predicate}${placeSuffix}`)],
        ['ten', () => (target
            ? `${head} ${who} <-> ${target}: ${predicate}${placeSuffix}`
            : `${head} ${who}: ${predicate}${placeSuffix}`)],
        ['loc', () => (place
            ? `${head} 场景: ${place} - ${predicate}`
            : `${head} 场景: ${predicate}`)],
    ]);

    const render = renderers.get(kind) ?? generic;
    return render() + themeSuffix;
}
// ============================================================================
// 睡眠工具
// ============================================================================
/**
 * Promise-based delay built on setTimeout.
 *
 * @param {number} ms Delay in milliseconds.
 * @returns {Promise<void>} Settles once the timer fires.
 */
const sleep = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
});
// ============================================================================
// 单轮提取(带重试)
// ============================================================================
/**
 * Extract narrative anchors ("atoms") from one user/assistant round through
 * the LLM, retrying up to RETRY_COUNT additional times.
 *
 * Retry policy:
 *   - empty reply, unparseable JSON or a reply without an `atoms` array:
 *     wait RETRY_DELAY ms, then try again;
 *   - callLLM throwing: wait RETRY_DELAY * (attempt + 1) ms, then try again.
 *
 * @param {{name?: string, mes?: string}|null|undefined} userMessage
 *        User message of the round; it may be absent.
 * @param {{name?: string, mes?: string}|null|undefined} aiMessage
 *        Assistant message of the round.
 * @param {number} aiFloor Index of the assistant message; used in atom ids and logs.
 * @param {{timeout?: number}} [options] `timeout` is forwarded to callLLM
 *        (defaults to DEFAULT_TIMEOUT).
 * @returns {Promise<Array<object>|null>}
 *          `[]` when the assistant message is blank or cancellation was seen
 *          before an attempt started; the normalised atoms on success; `null`
 *          when every attempt failed or cancellation was seen while handling
 *          a thrown error.
 */
async function extractAtomsForRoundWithRetry(userMessage, aiMessage, aiFloor, options = {}) {
    const { timeout = DEFAULT_TIMEOUT } = options;
    if (!aiMessage?.mes?.trim()) return [];

    const userName = userMessage?.name || '用户';
    const aiName = aiMessage.name || '角色';

    const parts = [];
    if (userMessage?.mes?.trim()) {
        const userText = filterText(userMessage.mes);
        parts.push(`<user name="${userName}">\n${userText}\n</user>`);
    }
    const aiText = filterText(aiMessage.mes);
    parts.push(`<assistant name="${aiName}">\n${aiText}\n</assistant>`);
    const input = `<round>\n${parts.join('\n')}\n</round>`;

    // Theme layers come from untrusted LLM output: anything that is not an
    // array is replaced by an empty list, mirroring what buildSemantic accepts.
    const toList = (value) => (Array.isArray(value) ? value : []);

    for (let attempt = 0; attempt <= RETRY_COUNT; attempt++) {
        if (batchCancelled) return [];
        const canRetry = attempt < RETRY_COUNT;

        try {
            const response = await callLLM([
                { role: 'system', content: SYSTEM_PROMPT },
                { role: 'user', content: input },
                { role: 'assistant', content: JSON_PREFILL },
            ], {
                temperature: 0.2,
                max_tokens: 1000,
                timeout,
            });

            const rawText = String(response || '');
            if (!rawText.trim()) {
                if (!canRetry) return null;
                await sleep(RETRY_DELAY);
                continue;
            }

            // A model may ignore the assistant prefill and answer with the
            // complete object. Prepending the prefill again would produce
            // `{"atoms":[{"atoms":[…` and waste every retry, so the reply is
            // parsed as-is in that case. A genuine continuation starts with an
            // atom object or `]`, never with the "atoms" key.
            const isCompleteObject = /^\s*\{\s*"atoms"\s*:/.test(rawText);
            const fullJson = isCompleteObject ? rawText : JSON_PREFILL + rawText;

            let parsed;
            try {
                parsed = parseJson(fullJson);
            } catch {
                xbLog.warn(MODULE_ID, `floor ${aiFloor} JSON解析失败`);
                if (!canRetry) return null;
                await sleep(RETRY_DELAY);
                continue;
            }

            if (!Array.isArray(parsed?.atoms)) {
                if (!canRetry) return null;
                await sleep(RETRY_DELAY);
                continue;
            }

            // Keep only atoms that carry both a type and a predicate; ids are
            // numbered after filtering.
            return parsed.atoms
                .filter((a) => a?.t && a?.v)
                .map((a, idx) => {
                    const th = a.th && typeof a.th === 'object' ? a.th : {};
                    return {
                        atomId: `atom-${aiFloor}-${idx}`,
                        floor: aiFloor,
                        type: a.t,
                        subject: a.s || null,
                        object: a.o || null,
                        value: String(a.v).slice(0, 50),
                        location: a.l || null,
                        source: a.f === 'u' ? 'user' : 'ai',
                        themes: { fn: toList(th.fn), pt: toList(th.pt), kw: toList(th.kw) },
                        semantic: buildSemantic(a, userName, aiName),
                    };
                });
        } catch (e) {
            if (batchCancelled) return null;
            if (canRetry) {
                // Linear back-off: 1x, 2x, … the base delay.
                await sleep(RETRY_DELAY * (attempt + 1));
                continue;
            }
            xbLog.error(MODULE_ID, `floor ${aiFloor} 失败`, e);
            return null;
        }
    }

    // Safety net; every iteration above either returns or continues.
    return null;
}
/**
 * Public single-round entry point; delegates to the retrying extractor.
 *
 * @param {object|null} userMessage User message of the round (may be null).
 * @param {object} aiMessage        Assistant message of the round.
 * @param {number} aiFloor          Index of the assistant message.
 * @param {object} [options]        Passed through unchanged.
 * @returns {Promise<Array<object>|null>} Whatever the retrying extractor yields.
 */
export async function extractAtomsForRound(userMessage, aiMessage, aiFloor, options = {}) {
    const atoms = await extractAtomsForRoundWithRetry(userMessage, aiMessage, aiFloor, options);
    return atoms;
}
// ============================================================================
// 批量提取
// ============================================================================
/**
 * Extract atoms for every assistant message in a chat, CONCURRENCY rounds at a
 * time.
 *
 * Each non-user message forms a round together with the message directly
 * before it, when that message is a user message. Requests of the first batch
 * are started STAGGER_DELAY ms apart; later batches start all at once, with a
 * 30 ms pause between batches.
 *
 * Cancellation (see cancelBatchExtraction) is handled identically for every
 * batch: a round that finishes after cancellation is neither counted,
 * reported through `onProgress`, nor added to the result.
 *
 * @param {Array<{is_user?: boolean, name?: string, mes?: string}>} chat
 *        Chat log; empty slots are skipped.
 * @param {(completed: number, total: number, failed: number) => void} [onProgress]
 *        Called after each round that finished before cancellation.
 * @returns {Promise<Array<object>>} Atoms of all successful rounds, ordered by
 *          floor (partial when cancelled).
 */
export async function batchExtractAtoms(chat, onProgress) {
    if (!chat?.length) return [];
    batchCancelled = false;

    const pairs = [];
    for (let i = 0; i < chat.length; i++) {
        const msg = chat[i];
        if (!msg || msg.is_user) continue;
        const userMsg = (i > 0 && chat[i - 1]?.is_user) ? chat[i - 1] : null;
        pairs.push({ userMsg, aiMsg: msg, aiFloor: i });
    }
    if (!pairs.length) return [];

    const total = pairs.length;
    const allAtoms = [];
    let completed = 0;
    let failed = 0;

    // Run one round and update the counters. Always resolves to an array so
    // batch results can be concatenated in floor order.
    const runPair = async (pair, startDelay) => {
        if (startDelay > 0) await sleep(startDelay);
        if (batchCancelled) return [];

        let atoms;
        try {
            atoms = await extractAtomsForRoundWithRetry(
                pair.userMsg,
                pair.aiMsg,
                pair.aiFloor,
                { timeout: DEFAULT_TIMEOUT }
            );
        } catch (e) {
            xbLog.error(MODULE_ID, `floor ${pair.aiFloor} 失败`, e);
            atoms = null;
        }

        // A round that ends after cancellation is dropped, not counted as failed.
        if (batchCancelled) return [];

        if (atoms === null) failed++;
        completed++;
        onProgress?.(completed, total, failed);
        return atoms ?? [];
    };

    for (let start = 0; start < total; start += CONCURRENCY) {
        if (batchCancelled) break;

        const batch = pairs.slice(start, start + CONCURRENCY);
        // Only the very first batch is staggered.
        const stagger = start === 0 ? STAGGER_DELAY : 0;
        const batchResults = await Promise.all(
            batch.map((pair, idx) => runPair(pair, idx * stagger))
        );

        // Promise.all keeps input order, so atoms end up sorted by floor.
        for (const atoms of batchResults) {
            allAtoms.push(...atoms);
        }

        if (start + CONCURRENCY < total && !batchCancelled) {
            await sleep(30);
        }
    }

    xbLog.info(MODULE_ID, `批量提取完成: ${allAtoms.length} atoms, ${failed} 失败`);
    return allAtoms;
}