Files
LittleWhiteBox/modules/story-summary/vector/chunk-builder.js

369 lines
14 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ═══════════════════════════════════════════════════════════════════════════
// Story Summary - Chunk Builder
// Standard RAG chunking: ~200 tokens per chunk
// ═══════════════════════════════════════════════════════════════════════════
import { getContext } from '../../../../../../extensions.js';
import {
getMeta,
updateMeta,
saveChunks,
saveChunkVectors,
clearAllChunks,
deleteChunksFromFloor,
deleteChunksAtFloor,
makeChunkId,
hashText,
CHUNK_MAX_TOKENS,
} from './chunk-store.js';
import { embed, getEngineFingerprint } from './embedder.js';
import { xbLog } from '../../../core/debug-core.js';
import { filterText } from './text-filter.js';
// Tag passed as the first argument to every xbLog call made from this file.
const MODULE_ID = 'chunk-builder';
// ═══════════════════════════════════════════════════════════════════════════
// Token estimation
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Rough token estimate for a piece of text.
 *
 * Each UTF-16 unit in the CJK Unified Ideographs range (U+4E00..U+9FFF)
 * counts as one token; every other UTF-16 unit counts as a quarter token.
 * The total is rounded up.
 *
 * @param {string} text - Text to measure; any falsy value yields 0.
 * @returns {number} Estimated token count.
 */
function estimateTokens(text) {
  if (!text) {
    return 0;
  }
  // Whatever survives stripping the ideographs is the "other" portion.
  const nonCjkLength = text.replace(/[\u4e00-\u9fff]/g, '').length;
  const cjkCount = text.length - nonCjkLength;
  return Math.ceil(cjkCount + nonCjkLength / 4);
}
/**
 * Split text into trimmed, non-empty sentences.
 *
 * A boundary is placed
 *  - right after an ideographic full stop (U+3002), an ASCII or fullwidth
 *    exclamation/question mark (U+FF01 / U+FF1F), or a newline; and
 *  - right after `.`, `!` or `?` when it is followed by a whitespace character.
 *
 * The terminator stays attached to the sentence it ends. A bare `.` without
 * trailing whitespace (e.g. "v1.2") does not split.
 *
 * @param {string} text - Text to split; any falsy value yields [].
 * @returns {string[]} Sentences in source order, each trimmed, none empty.
 */
function splitSentences(text) {
  if (!text) return [];
  // Escapes are used for the CJK punctuation so the pattern cannot be
  // silently altered by tools that normalise "ambiguous" Unicode characters.
  const boundary = /(?<=[\u3002!?\uFF01\uFF1F\n])|(?<=[.!?]\s)/;
  return text
    .split(boundary)
    .map((part) => part.trim())
    .filter((part) => part.length > 0);
}
// ═══════════════════════════════════════════════════════════════════════════
// Chunk splitting
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Split one chat message into chunks of at most ~`maxTokens` estimated tokens.
 *
 * Steps:
 *  1. Run the text through `filterText`, strip `[tts:...]` markers, trim.
 *  2. If the cleaned text fits the budget, return it as a single chunk.
 *  3. Otherwise pack whole sentences (from `splitSentences`) greedily. Chunk
 *     text is rebuilt with the whitespace that separated the sentences in the
 *     cleaned text, so "A. B." is not collapsed into "A.B.". That separating
 *     whitespace is not counted against the budget.
 *  4. A single sentence larger than the budget is cut into budget-sized
 *     pieces on code-point boundaries.
 *
 * @param {number} floor - Index of the message in the chat.
 * @param {{mes?: string, name?: string, is_user?: boolean}} message - Chat message.
 * @param {number} [maxTokens=CHUNK_MAX_TOKENS] - Token budget per chunk. A value
 *   that is not a positive number falls back to CHUNK_MAX_TOKENS (a zero or
 *   negative budget can never be satisfied).
 * @returns {Array<{chunkId: *, floor: number, chunkIdx: number, speaker: string,
 *   isUser: boolean, text: string, textHash: *}>} Chunks in order; [] when the
 *   cleaned text is empty.
 */
export function chunkMessage(floor, message, maxTokens = CHUNK_MAX_TOKENS) {
  const text = message.mes || '';
  const speaker = message.name || (message.is_user ? '用户' : '角色');
  const isUser = !!message.is_user;

  // filterText is a project helper (presumably applies the user-configured
  // filter rules); the [tts:...] marker removal is hard-coded here.
  const cleanText = filterText(text)
    .replace(/\[tts:[^\]]*\]/gi, '')
    .trim();
  if (!cleanText) return [];

  const limit = typeof maxTokens === 'number' && maxTokens > 0 ? maxTokens : CHUNK_MAX_TOKENS;

  const chunks = [];
  const pushChunk = (chunkText) => {
    const chunkIdx = chunks.length;
    chunks.push({
      chunkId: makeChunkId(floor, chunkIdx),
      floor,
      chunkIdx,
      speaker,
      isUser,
      text: chunkText,
      textHash: hashText(chunkText),
    });
  };

  if (estimateTokens(cleanText) <= limit) {
    pushChunk(cleanText);
    return chunks;
  }

  // `pending` alternates sentence, gap, sentence, ... where each gap is the
  // original whitespace found between two neighbouring sentences.
  let pending = [];
  let pendingTokens = 0;
  const flushPending = () => {
    if (pending.length === 0) return;
    pushChunk(pending.join(''));
    pending = [];
    pendingTokens = 0;
  };

  let cursor = 0;
  for (const sent of splitSentences(cleanText)) {
    // Sentences are trimmed, in-order substrings of cleanText, so the first
    // match at or after the cursor is this sentence's real position.
    const at = cleanText.indexOf(sent, cursor);
    const gap = at >= 0 ? cleanText.slice(cursor, at) : '';
    if (at >= 0) cursor = at + sent.length;

    const sentTokens = estimateTokens(sent);

    if (sentTokens > limit) {
      // Too large to fit any chunk: emit what we have, then hard-slice it.
      flushPending();
      for (const piece of sliceByTokenBudget(sent, limit)) {
        const trimmed = piece.trim();
        if (trimmed) pushChunk(trimmed);
      }
      continue;
    }

    if (pending.length > 0 && pendingTokens + sentTokens > limit) {
      flushPending();
    }
    if (pending.length > 0) pending.push(gap);
    pending.push(sent);
    pendingTokens += sentTokens;
  }
  flushPending();

  return chunks;
}

/**
 * Cut `text` into consecutive pieces whose estimated cost stays within `limit`.
 *
 * The per-character cost mirrors estimateTokens (1 per CJK ideograph in
 * U+4E00..U+9FFF, 1/4 per other UTF-16 unit). Iteration is by code point, so a
 * surrogate pair is never split. Every piece holds at least one code point,
 * which guarantees progress even for a tiny `limit`.
 *
 * @param {string} text - Text to slice.
 * @param {number} limit - Positive token budget per piece.
 * @returns {string[]} Pieces that concatenate back to `text`.
 */
function sliceByTokenBudget(text, limit) {
  const cjk = /[\u4e00-\u9fff]/;
  const pieces = [];
  let piece = '';
  let cost = 0;
  for (const ch of text) {
    const chCost = cjk.test(ch) ? 1 : ch.length / 4;
    if (piece && cost + chCost > limit) {
      pieces.push(piece);
      piece = '';
      cost = 0;
    }
    piece += ch;
    cost += chCost;
  }
  if (piece) pieces.push(piece);
  return pieces;
}
// ═══════════════════════════════════════════════════════════════════════════
// Build status
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Report how many floors (messages) of the current chat have been chunked.
 *
 * `builtFloors` is derived from the stored `lastChunkFloor` marker
 * (`lastChunkFloor + 1`); `pending` is the remaining floor count, never
 * negative. The result always has the same four keys; with no active chat,
 * or when the marker is missing, `lastChunkFloor` is -1.
 *
 * @returns {Promise<{totalFloors: number, builtFloors: number,
 *   lastChunkFloor: number, pending: number}>}
 */
export async function getChunkBuildStatus() {
  const { chat, chatId } = getContext();
  if (!chatId) {
    return { totalFloors: 0, builtFloors: 0, lastChunkFloor: -1, pending: 0 };
  }

  const meta = await getMeta(chatId);
  // A missing marker means "nothing built yet" rather than NaN.
  const lastChunkFloor = meta?.lastChunkFloor ?? -1;
  const totalFloors = chat?.length || 0;
  const builtFloors = lastChunkFloor + 1;

  return {
    totalFloors,
    builtFloors,
    lastChunkFloor,
    pending: Math.max(0, totalFloors - builtFloors),
  };
}
// ═══════════════════════════════════════════════════════════════════════════
// Full build
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Rebuild all chunks and chunk vectors for the current chat from scratch.
 *
 * Clears the stored chunks, resets the meta marker, chunks every floor,
 * saves the chunks, then embeds the chunk texts in batches (5 per batch for
 * the 'local' engine, 20 otherwise). A failed batch is logged and counted;
 * its chunks are left without vectors and the build continues.
 *
 * @param {object} [options]
 * @param {(completed: number, total: number) => void} [options.onProgress]
 *   Called after each successful batch.
 * @param {() => boolean} [options.shouldCancel] - Polled before each batch and
 *   once after the loop; when it returns true the build stops and nothing
 *   embedded so far is saved.
 * @param {object} [options.vectorConfig] - Engine configuration forwarded to
 *   `getEngineFingerprint` and `embed`.
 * @returns {Promise<{built: number, errors: number}>} `built` is the number of
 *   vectors saved (or the number embedded so far when cancelled); `errors` is
 *   the number of failed batches.
 */
export async function buildAllChunks(options = {}) {
  const { onProgress, shouldCancel, vectorConfig } = options;
  const { chat, chatId } = getContext();
  if (!chatId || !chat?.length) {
    return { built: 0, errors: 0 };
  }

  const fingerprint = getEngineFingerprint(vectorConfig);
  await clearAllChunks(chatId);
  await updateMeta(chatId, { lastChunkFloor: -1, fingerprint });

  // Snapshot the floor count: `chat` may grow or shrink while we await the
  // embedder, and only the floors chunked here may be marked as built.
  const floorCount = chat.length;
  const allChunks = [];
  for (let floor = 0; floor < floorCount; floor++) {
    allChunks.push(...chunkMessage(floor, chat[floor]));
  }

  if (allChunks.length === 0) {
    // Nothing to embed; still record the floors as processed, matching what
    // buildIncrementalChunks does in the same situation.
    await updateMeta(chatId, { lastChunkFloor: floorCount - 1 });
    return { built: 0, errors: 0 };
  }

  xbLog.info(MODULE_ID, `开始构建 ${allChunks.length} 个 chunks(${floorCount} 层楼)`);
  await saveChunks(chatId, allChunks);

  const texts = allChunks.map((c) => c.text);
  const batchSize = vectorConfig?.engine === 'local' ? 5 : 20;
  let completed = 0;
  let errors = 0;
  // Pre-sized and written by index so a failure part-way through a batch can
  // never shift vectors onto the wrong chunks.
  const allVectors = new Array(texts.length).fill(null);

  for (let i = 0; i < texts.length; i += batchSize) {
    if (shouldCancel?.()) break;
    const batch = texts.slice(i, i + batchSize);
    try {
      const vectors = await embed(batch, vectorConfig);
      if (vectors?.length !== batch.length) {
        throw new Error(`embed returned ${vectors?.length} vectors for ${batch.length} texts`);
      }
      for (let j = 0; j < batch.length; j++) {
        allVectors[i + j] = vectors[j];
      }
      completed += batch.length;
      onProgress?.(completed, texts.length);
    } catch (e) {
      xbLog.error(MODULE_ID, `批次 ${i}/${texts.length} 向量化失败`, e);
      errors++;
    }
  }

  if (shouldCancel?.()) {
    return { built: completed, errors };
  }

  const vectorItems = allChunks
    .map((chunk, idx) => (allVectors[idx] ? { chunkId: chunk.chunkId, vector: allVectors[idx] } : null))
    .filter(Boolean);

  if (vectorItems.length > 0) {
    await saveChunkVectors(chatId, vectorItems, fingerprint);
  }

  // Never mark floors that arrived during the build, nor floors that no
  // longer exist.
  await updateMeta(chatId, { lastChunkFloor: Math.min(floorCount, chat.length) - 1 });
  xbLog.info(MODULE_ID, `构建完成:${vectorItems.length} 个向量,${errors} 个错误`);
  return { built: vectorItems.length, errors };
}
// ═══════════════════════════════════════════════════════════════════════════
// Incremental build
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Chunk and embed only the floors after the stored `lastChunkFloor` marker.
 *
 * Skips the build when the stored engine fingerprint differs from the current
 * one. All new chunk texts are embedded in a single `embed` call; if that
 * call fails the chunks stay saved without vectors and the marker is not
 * advanced, so the next call retries the same floors.
 *
 * @param {object} [options]
 * @param {object} [options.vectorConfig] - Engine configuration forwarded to
 *   `getEngineFingerprint` and `embed`.
 * @returns {Promise<{built: number}>} Number of vectors saved.
 */
export async function buildIncrementalChunks(options = {}) {
  const { vectorConfig } = options;
  const { chat, chatId } = getContext();
  if (!chatId || !chat?.length) {
    return { built: 0 };
  }

  const meta = await getMeta(chatId);
  const fingerprint = getEngineFingerprint(vectorConfig);
  if (meta.fingerprint && meta.fingerprint !== fingerprint) {
    xbLog.warn(MODULE_ID, '引擎指纹不匹配,跳过增量构建');
    return { built: 0 };
  }

  // Snapshot the floor count: `chat` may change while we await the embedder,
  // and only the floors chunked here may be marked as built.
  const floorCount = chat.length;
  // A missing marker means nothing has been built yet.
  const startFloor = (meta.lastChunkFloor ?? -1) + 1;
  if (startFloor >= floorCount) {
    return { built: 0 };
  }

  xbLog.info(MODULE_ID, `增量构建 ${startFloor} - ${floorCount - 1}`);

  const newChunks = [];
  for (let floor = startFloor; floor < floorCount; floor++) {
    newChunks.push(...chunkMessage(floor, chat[floor]));
  }

  if (newChunks.length === 0) {
    await updateMeta(chatId, { lastChunkFloor: floorCount - 1 });
    return { built: 0 };
  }

  await saveChunks(chatId, newChunks);

  try {
    const vectors = await embed(newChunks.map((c) => c.text), vectorConfig);
    if (vectors?.length !== newChunks.length) {
      throw new Error(`embed returned ${vectors?.length} vectors for ${newChunks.length} texts`);
    }
    const vectorItems = newChunks.map((chunk, idx) => ({
      chunkId: chunk.chunkId,
      vector: vectors[idx],
    }));
    await saveChunkVectors(chatId, vectorItems, fingerprint);
    // Never mark floors that arrived during the build, nor floors that no
    // longer exist.
    await updateMeta(chatId, { lastChunkFloor: Math.min(floorCount, chat.length) - 1 });
    return { built: vectorItems.length };
  } catch (e) {
    xbLog.error(MODULE_ID, '增量向量化失败', e);
    return { built: 0 };
  }
}
// ═══════════════════════════════════════════════════════════════════════════
// L1 sync (called when messages change)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Sync after messages were deleted: remove every chunk with
 * floor >= `newLength` and pull the `lastChunkFloor` marker back so it does
 * not point past the end of the chat.
 *
 * The marker is only ever lowered here. If fewer floors had been built than
 * remain in the chat, it is left where it was, so the unbuilt floors are
 * still picked up by the next incremental build.
 *
 * @param {string} chatId - Chat whose chunk store is updated.
 * @param {number} newLength - Chat length after the deletion.
 * @returns {Promise<void>}
 */
export async function syncOnMessageDeleted(chatId, newLength) {
  if (!chatId || newLength < 0) return;

  await deleteChunksFromFloor(chatId, newLength);

  const meta = await getMeta(chatId);
  const builtThrough = meta?.lastChunkFloor ?? -1;
  await updateMeta(chatId, { lastChunkFloor: Math.min(builtThrough, newLength - 1) });

  xbLog.info(MODULE_ID, `消息删除同步:删除 floor >= ${newLength}`);
}
/**
 * Sync after a swipe: remove the chunks of `lastFloor` and pull the
 * `lastChunkFloor` marker back below it so the floor is rebuilt later.
 *
 * The marker is only ever lowered here; if earlier floors were never built
 * it is left where it was, so they are not reported as built.
 *
 * @param {string} chatId - Chat whose chunk store is updated.
 * @param {number} lastFloor - Index of the swiped (last) message.
 * @returns {Promise<void>}
 */
export async function syncOnMessageSwiped(chatId, lastFloor) {
  if (!chatId || lastFloor < 0) return;

  await deleteChunksAtFloor(chatId, lastFloor);

  const meta = await getMeta(chatId);
  const builtThrough = meta?.lastChunkFloor ?? -1;
  await updateMeta(chatId, { lastChunkFloor: Math.min(builtThrough, lastFloor - 1) });

  xbLog.info(MODULE_ID, `swipe 同步:删除 floor ${lastFloor}`);
}
/**
 * Sync after a new message: drop the old chunks of `lastFloor`, re-chunk the
 * message, save the chunks and embed them.
 *
 * Does nothing when vectors are disabled, or when the 'local' engine is
 * selected but its model is not loaded (to avoid triggering a download or an
 * error). The `lastChunkFloor` marker is advanced to `lastFloor` only when
 * every earlier floor is already built (marker >= lastFloor - 1); otherwise
 * it is left alone so the next incremental build still covers the gap.
 * Embedding failures are logged, not thrown.
 *
 * @param {string} chatId - Chat whose chunk store is updated.
 * @param {number} lastFloor - Index of the received message.
 * @param {object} message - The chat message to chunk.
 * @param {object} vectorConfig - Engine configuration; must have `enabled` set.
 * @returns {Promise<void>}
 */
export async function syncOnMessageReceived(chatId, lastFloor, message, vectorConfig) {
  if (!chatId || lastFloor < 0 || !message) return;
  if (!vectorConfig?.enabled) return;

  // Skip when the local model is not loaded.
  if (vectorConfig.engine === "local") {
    const { isLocalModelLoaded, DEFAULT_LOCAL_MODEL } = await import("./embedder.js");
    const modelId = vectorConfig.local?.modelId || DEFAULT_LOCAL_MODEL;
    if (!isLocalModelLoaded(modelId)) return;
  }

  // Remove whatever was stored for this floor before rebuilding it.
  await deleteChunksAtFloor(chatId, lastFloor);

  const chunks = chunkMessage(lastFloor, message);
  if (chunks.length === 0) return;

  await saveChunks(chatId, chunks);

  const fingerprint = getEngineFingerprint(vectorConfig);
  const texts = chunks.map((c) => c.text);

  try {
    const vectors = await embed(texts, vectorConfig);
    if (vectors?.length !== chunks.length) {
      throw new Error(`embed returned ${vectors?.length} vectors for ${chunks.length} texts`);
    }
    const items = chunks.map((c, i) => ({ chunkId: c.chunkId, vector: vectors[i] }));
    await saveChunkVectors(chatId, items, fingerprint);

    // Only advance the marker when it stays contiguous with what was built.
    const meta = await getMeta(chatId);
    const builtThrough = meta?.lastChunkFloor ?? -1;
    if (builtThrough >= lastFloor - 1) {
      await updateMeta(chatId, { lastChunkFloor: lastFloor });
    }

    xbLog.info(MODULE_ID, `消息同步:重建 floor ${lastFloor}, ${chunks.length} 个 chunk`);
  } catch (e) {
    xbLog.error(MODULE_ID, `消息同步失败floor ${lastFloor}`, e);
  }
}