2026-01-26 01:16:35 +08:00
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
// Story Summary - Chunk Builder
|
|
|
|
|
|
// 标准 RAG chunking: ~200 tokens per chunk
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
|
|
|
|
|
|
import { getContext } from '../../../../../../extensions.js';
|
|
|
|
|
|
import {
|
|
|
|
|
|
getMeta,
|
|
|
|
|
|
updateMeta,
|
|
|
|
|
|
saveChunks,
|
|
|
|
|
|
saveChunkVectors,
|
|
|
|
|
|
clearAllChunks,
|
|
|
|
|
|
deleteChunksFromFloor,
|
|
|
|
|
|
deleteChunksAtFloor,
|
|
|
|
|
|
makeChunkId,
|
|
|
|
|
|
hashText,
|
|
|
|
|
|
CHUNK_MAX_TOKENS,
|
|
|
|
|
|
} from './chunk-store.js';
|
|
|
|
|
|
import { embed, getEngineFingerprint } from './embedder.js';
|
|
|
|
|
|
import { xbLog } from '../../../core/debug-core.js';
|
2026-01-29 17:02:51 +08:00
|
|
|
|
import { filterText } from './text-filter.js';
|
2026-01-26 01:16:35 +08:00
|
|
|
|
|
|
|
|
|
|
// Module tag passed as the first argument to every xbLog call in this file.
const MODULE_ID = 'chunk-builder';
|
|
|
|
|
|
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
// Token 估算
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Cheap token estimate for mixed Chinese / non-Chinese text.
 *
 * Each code unit in U+4E00..U+9FFF counts as one token; every other code
 * unit counts as a quarter of a token. The sum is rounded up.
 *
 * @param {string} text Text to measure; any falsy value yields 0.
 * @returns {number} Estimated token count.
 */
function estimateTokens(text) {
    if (!text) {
        return 0;
    }

    const cjkMatches = text.match(/[\u4e00-\u9fff]/g);
    const cjkCount = cjkMatches === null ? 0 : cjkMatches.length;
    const remainingCount = text.length - cjkCount;

    return Math.ceil(cjkCount + remainingCount / 4);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Splits text into trimmed, non-empty sentences.
 *
 * A boundary is placed:
 *  - directly after a CJK full stop (U+3002), a full-width exclamation or
 *    question mark (U+FF01 / U+FF1F), an ASCII `!` or `?`, or a newline;
 *  - after an ASCII `.`, `!` or `?` that is followed by one whitespace
 *    character (so "3.14" is not split, but "End. Next" is).
 *
 * The full-width `!` / `?` terminators matter for Chinese text: without
 * them an exclamatory or interrogative passage stays one long "sentence".
 *
 * @param {string} text Text to split; any falsy value yields an empty array.
 * @returns {string[]} Sentences in source order, each trimmed, none empty.
 */
function splitSentences(text) {
    if (!text) return [];

    const parts = text.split(/(?<=[\u3002\uff01\uff1f!?\n])|(?<=[.!?]\s)/);

    // Boundaries can produce whitespace-only fragments (e.g. the space after "!"); drop them.
    return parts.map((part) => part.trim()).filter((part) => part.length > 0);
}
|
|
|
|
|
|
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
// Chunk 切分
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Splits one chat message into chunks of roughly `maxTokens` estimated tokens.
 *
 * The message text is passed through `filterText` (user-defined filter rules),
 * `[tts:...]` markers are stripped, and the result is trimmed. If the cleaned
 * text fits the budget it becomes a single chunk. Otherwise the sentences from
 * `splitSentences` are packed greedily; a sentence that exceeds the budget on
 * its own is emitted as fixed-size character slices.
 *
 * Sentences packed into the same chunk keep the whitespace that separated them
 * in the cleaned text, matching the single-chunk path which stores the cleaned
 * text untouched.
 *
 * @param {number} floor Message position; callers in this file pass the index into `chat`.
 * @param {{mes?: string, name?: string, is_user?: boolean}} message Chat message; a falsy message yields `[]`.
 * @param {number} [maxTokens=CHUNK_MAX_TOKENS] Estimated-token budget per chunk.
 *        Zero, negative or NaN values fall back to CHUNK_MAX_TOKENS.
 * @returns {Array<{chunkId: *, floor: number, chunkIdx: number, speaker: string, isUser: boolean, text: string, textHash: *}>}
 *          Chunks in source order; empty when nothing is left after cleaning.
 */
export function chunkMessage(floor, message, maxTokens = CHUNK_MAX_TOKENS) {
    if (!message) return [];

    const speaker = message.name || (message.is_user ? '用户' : '角色');
    const isUser = !!message.is_user;

    // 1. Apply the user-defined filter rules.
    // 2. Remove TTS markers (hard-coded).
    const cleanText = filterText(message.mes || '')
        .replace(/\[tts:[^\]]*\]/gi, '')
        .trim();

    if (!cleanText) return [];

    // A non-positive or NaN budget would make the slice step below non-positive
    // and the slicing loop would never terminate.
    const tokenLimit = maxTokens > 0 ? maxTokens : CHUNK_MAX_TOKENS;

    const chunks = [];
    const pushChunk = (chunkText) => {
        const chunkIdx = chunks.length;
        chunks.push({
            chunkId: makeChunkId(floor, chunkIdx),
            floor,
            chunkIdx,
            speaker,
            isUser,
            text: chunkText,
            textHash: hashText(chunkText),
        });
    };

    if (estimateTokens(cleanText) <= tokenLimit) {
        pushChunk(cleanText);
        return chunks;
    }

    // Character width of each slice of an oversized sentence; always a whole number >= 1.
    const sliceSize = Math.max(1, Math.floor(tokenLimit * 2));

    let pendingText = '';
    let pendingTokens = 0;
    let cursor = 0; // offset in cleanText just past the last located sentence

    const flushPending = () => {
        if (!pendingText) return;
        pushChunk(pendingText);
        pendingText = '';
        pendingTokens = 0;
    };

    for (const sent of splitSentences(cleanText)) {
        // Locate the sentence in the cleaned text to recover the whitespace that
        // preceded it; fall back to no separator if it cannot be located.
        let gap = '';
        const at = cleanText.indexOf(sent, cursor);
        if (at !== -1) {
            const between = cleanText.slice(cursor, at);
            if (between.trim() === '') gap = between;
            cursor = at + sent.length;
        }

        const sentTokens = estimateTokens(sent);

        if (sentTokens > tokenLimit) {
            // Oversized sentence: close the pending chunk, then hard-slice by characters.
            flushPending();

            let start = 0;
            while (start < sent.length) {
                let end = Math.min(start + sliceSize, sent.length);
                // Do not cut between the two halves of a surrogate pair.
                if (end < sent.length) {
                    const before = sent.charCodeAt(end - 1);
                    const after = sent.charCodeAt(end);
                    const splitsPair = before >= 0xd800 && before <= 0xdbff
                        && after >= 0xdc00 && after <= 0xdfff;
                    if (splitsPair) end += 1;
                }
                pushChunk(sent.slice(start, end));
                start = end;
            }
            continue;
        }

        if (pendingText && pendingTokens + sentTokens > tokenLimit) {
            flushPending();
        }

        pendingText += pendingText ? gap + sent : sent;
        pendingTokens += sentTokens;
    }

    flushPending();

    return chunks;
}
|
|
|
|
|
|
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
// 构建状态
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Reports how many floors (chat messages) of the current chat have been chunked.
 *
 * Reads the current chat from `getContext()` and the stored `lastChunkFloor`
 * from `getMeta(chatId)`. A missing `lastChunkFloor` is treated as -1, the
 * same value `buildAllChunks` writes before a rebuild, so the counts never
 * become NaN.
 *
 * @returns {Promise<{totalFloors: number, builtFloors: number, lastChunkFloor: number, pending: number}>}
 *          `pending` is clamped at 0. When no chat is open every count is 0
 *          and `lastChunkFloor` is -1.
 */
export async function getChunkBuildStatus() {
    const { chat, chatId } = getContext();
    if (!chatId) {
        // Same shape as the normal return so callers can read lastChunkFloor unconditionally.
        return { totalFloors: 0, builtFloors: 0, lastChunkFloor: -1, pending: 0 };
    }

    const meta = await getMeta(chatId);
    const lastChunkFloor = meta?.lastChunkFloor ?? -1;
    const totalFloors = chat?.length || 0;
    const builtFloors = lastChunkFloor + 1;

    return {
        totalFloors,
        builtFloors,
        lastChunkFloor,
        pending: Math.max(0, totalFloors - builtFloors),
    };
}
|
|
|
|
|
|
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
// 全量构建
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Rebuilds all chunks and chunk vectors for the current chat from scratch.
 *
 * Steps: clear stored chunks, reset `lastChunkFloor` to -1 (storing the engine
 * fingerprint), chunk every message, save the chunks, embed them in batches
 * (5 texts per batch for the `local` engine, 20 otherwise), save the vectors
 * that were produced, and finally record the last chunked floor.
 *
 * A failed batch is logged and counted in `errors`; its chunks are saved
 * without vectors and the build continues.
 *
 * @param {object} [options]
 * @param {(completed: number, total: number) => void} [options.onProgress]
 *        Called after each successfully embedded batch. An exception thrown by
 *        it is logged and counted as an error, but the batch's vectors are kept.
 * @param {() => boolean} [options.shouldCancel] Polled before every batch and
 *        once after the loop; when it returns true no vectors are saved and
 *        `lastChunkFloor` stays at -1.
 * @param {object} [options.vectorConfig] Passed to `getEngineFingerprint` and `embed`.
 * @returns {Promise<{built: number, errors: number}>} `built` is the number of
 *          vectors saved (or texts embedded so far when cancelled); `errors`
 *          is the number of failed batches.
 */
export async function buildAllChunks(options = {}) {
    const { onProgress, shouldCancel, vectorConfig } = options;

    const { chat, chatId } = getContext();
    if (!chatId || !chat?.length) {
        return { built: 0, errors: 0 };
    }

    const fingerprint = getEngineFingerprint(vectorConfig);

    await clearAllChunks(chatId);
    await updateMeta(chatId, { lastChunkFloor: -1, fingerprint });

    const allChunks = [];
    for (let floor = 0; floor < chat.length; floor++) {
        allChunks.push(...chunkMessage(floor, chat[floor]));
    }

    // Snapshot taken in the same synchronous step as the chunking above: `chat`
    // may be a live array that grows while we await the embedder (TODO confirm
    // against getContext), and floors added later were never chunked here.
    const floorCount = chat.length;

    if (allChunks.length === 0) {
        // Nothing to embed, but the floors were processed; record that, as
        // buildIncrementalChunks does in the same situation.
        await updateMeta(chatId, { lastChunkFloor: floorCount - 1 });
        return { built: 0, errors: 0 };
    }

    xbLog.info(MODULE_ID, `开始构建 ${allChunks.length} 个 chunks(${floorCount} 层楼)`);

    await saveChunks(chatId, allChunks);

    const texts = allChunks.map((chunk) => chunk.text);
    const batchSize = vectorConfig?.engine === 'local' ? 5 : 20;

    let completed = 0;
    let errors = 0;
    // Pre-sized and written by index so a failing batch can never shift the
    // vectors of later batches onto the wrong chunks.
    const allVectors = new Array(texts.length).fill(null);

    for (let i = 0; i < texts.length; i += batchSize) {
        if (shouldCancel?.()) break;

        const batch = texts.slice(i, i + batchSize);

        try {
            const vectors = await embed(batch, vectorConfig);
            if (!Array.isArray(vectors) || vectors.length !== batch.length) {
                throw new Error(`embed returned ${vectors?.length} vectors for ${batch.length} texts`);
            }
            vectors.forEach((vector, offset) => {
                allVectors[i + offset] = vector;
            });
            completed += batch.length;
            onProgress?.(completed, texts.length);
        } catch (e) {
            xbLog.error(MODULE_ID, `批次 ${i}/${texts.length} 向量化失败`, e);
            errors++;
        }
    }

    if (shouldCancel?.()) {
        return { built: completed, errors };
    }

    const vectorItems = [];
    allChunks.forEach((chunk, idx) => {
        const vector = allVectors[idx];
        if (vector) {
            vectorItems.push({ chunkId: chunk.chunkId, vector });
        }
    });

    if (vectorItems.length > 0) {
        await saveChunkVectors(chatId, vectorItems, fingerprint);
    }

    await updateMeta(chatId, { lastChunkFloor: floorCount - 1 });

    xbLog.info(MODULE_ID, `构建完成:${vectorItems.length} 个向量,${errors} 个错误`);

    return { built: vectorItems.length, errors };
}
|
|
|
|
|
|
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
// 增量构建
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Chunks and embeds only the floors after the stored `lastChunkFloor`.
 *
 * Skips the build when the stored engine fingerprint differs from the
 * current one. New chunks are saved first, then embedded in a single `embed`
 * call; `lastChunkFloor` is advanced only after the vectors were saved, so a
 * failed embedding is retried by the next call.
 *
 * @param {object} [options]
 * @param {object} [options.vectorConfig] Passed to `getEngineFingerprint` and `embed`.
 * @returns {Promise<{built: number}>} Number of vectors saved; 0 when nothing
 *          was pending, the fingerprint mismatched, or embedding failed.
 */
export async function buildIncrementalChunks(options = {}) {
    const { vectorConfig } = options;

    const { chat, chatId } = getContext();
    if (!chatId || !chat?.length) {
        return { built: 0 };
    }

    const meta = await getMeta(chatId);
    const fingerprint = getEngineFingerprint(vectorConfig);

    if (meta.fingerprint && meta.fingerprint !== fingerprint) {
        xbLog.warn(MODULE_ID, '引擎指纹不匹配,跳过增量构建');
        return { built: 0 };
    }

    // A missing lastChunkFloor means nothing was built yet; without the fallback
    // the NaN start floor would skip the loop and mark every floor as built.
    const startFloor = (meta.lastChunkFloor ?? -1) + 1;

    // Snapshot: `chat` may be a live array that grows while we await below
    // (TODO confirm against getContext); later floors are not chunked here.
    const floorCount = chat.length;
    if (startFloor >= floorCount) {
        return { built: 0 };
    }

    xbLog.info(MODULE_ID, `增量构建 ${startFloor} - ${floorCount - 1} 层`);

    const newChunks = [];
    for (let floor = startFloor; floor < floorCount; floor++) {
        newChunks.push(...chunkMessage(floor, chat[floor]));
    }

    if (newChunks.length === 0) {
        await updateMeta(chatId, { lastChunkFloor: floorCount - 1 });
        return { built: 0 };
    }

    await saveChunks(chatId, newChunks);

    const texts = newChunks.map((chunk) => chunk.text);

    try {
        const vectors = await embed(texts, vectorConfig);
        if (!Array.isArray(vectors) || vectors.length !== texts.length) {
            throw new Error(`embed returned ${vectors?.length} vectors for ${texts.length} texts`);
        }

        const vectorItems = newChunks.map((chunk, idx) => ({
            chunkId: chunk.chunkId,
            vector: vectors[idx],
        }));
        await saveChunkVectors(chatId, vectorItems, fingerprint);
        await updateMeta(chatId, { lastChunkFloor: floorCount - 1 });

        return { built: vectorItems.length };
    } catch (e) {
        xbLog.error(MODULE_ID, '增量向量化失败', e);
        return { built: 0 };
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
// L1 同步(消息变化时调用)
|
|
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Sync after messages were deleted: removes every chunk with
 * floor >= newLength and pulls `lastChunkFloor` back so it never points past
 * the end of the shortened chat.
 *
 * The marker is only ever lowered. If floors below `newLength` were not built
 * yet, the stored value is kept so a later incremental build still covers them.
 *
 * @param {string} chatId Chat whose chunks are synced; falsy values are ignored.
 * @param {number} newLength Chat length after the deletion; negative values are ignored.
 * @returns {Promise<void>}
 */
export async function syncOnMessageDeleted(chatId, newLength) {
    if (!chatId || newLength < 0) return;

    await deleteChunksFromFloor(chatId, newLength);

    const meta = await getMeta(chatId);
    const previousFloor = meta?.lastChunkFloor ?? -1;
    await updateMeta(chatId, { lastChunkFloor: Math.min(previousFloor, newLength - 1) });

    xbLog.info(MODULE_ID, `消息删除同步:删除 floor >= ${newLength}`);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Sync after a swipe: removes the chunks of the swiped floor and moves
 * `lastChunkFloor` back so the floor is rebuilt later.
 *
 * The marker is only ever lowered. If earlier floors were not built yet, the
 * stored value is kept so a later incremental build still covers them.
 *
 * @param {string} chatId Chat whose chunks are synced; falsy values are ignored.
 * @param {number} lastFloor Floor that was swiped; negative values are ignored.
 * @returns {Promise<void>}
 */
export async function syncOnMessageSwiped(chatId, lastFloor) {
    if (!chatId || lastFloor < 0) return;

    await deleteChunksAtFloor(chatId, lastFloor);

    const meta = await getMeta(chatId);
    const previousFloor = meta?.lastChunkFloor ?? -1;
    await updateMeta(chatId, { lastChunkFloor: Math.min(previousFloor, lastFloor - 1) });

    xbLog.info(MODULE_ID, `swipe 同步:删除 floor ${lastFloor}`);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Sync after a new message: deletes the chunks stored for `lastFloor`,
 * re-chunks the message, saves the chunks and embeds them.
 *
 * Does nothing when vectors are disabled, or when the `local` engine is
 * selected but its model is not loaded yet.
 *
 * `lastChunkFloor` is advanced to `lastFloor` only when every earlier floor is
 * already built (stored marker >= lastFloor - 1). Otherwise it is left alone
 * so a later incremental build still covers the unbuilt floors in between.
 * Embedding failures are logged and leave the marker unchanged.
 *
 * @param {string} chatId Chat whose chunks are synced.
 * @param {number} lastFloor Floor of the received message.
 * @param {object} message Message passed to `chunkMessage`.
 * @param {object} vectorConfig Must have `enabled` truthy; passed to `getEngineFingerprint` and `embed`.
 * @returns {Promise<void>}
 */
export async function syncOnMessageReceived(chatId, lastFloor, message, vectorConfig) {
    if (!chatId || lastFloor < 0 || !message) return;
    if (!vectorConfig?.enabled) return;

    // Skip when the local model is not loaded (avoids unintentionally
    // triggering a download or an error).
    if (vectorConfig.engine === "local") {
        const { isLocalModelLoaded, DEFAULT_LOCAL_MODEL } = await import("./embedder.js");
        const modelId = vectorConfig.local?.modelId || DEFAULT_LOCAL_MODEL;
        if (!isLocalModelLoaded(modelId)) return;
    }

    // Drop whatever was stored for this floor before rebuilding it.
    await deleteChunksAtFloor(chatId, lastFloor);

    const chunks = chunkMessage(lastFloor, message);
    if (chunks.length === 0) return;

    await saveChunks(chatId, chunks);

    const fingerprint = getEngineFingerprint(vectorConfig);
    const texts = chunks.map((chunk) => chunk.text);

    try {
        const vectors = await embed(texts, vectorConfig);
        if (!Array.isArray(vectors) || vectors.length !== texts.length) {
            throw new Error(`embed returned ${vectors?.length} vectors for ${texts.length} texts`);
        }

        const items = chunks.map((chunk, idx) => ({ chunkId: chunk.chunkId, vector: vectors[idx] }));
        await saveChunkVectors(chatId, items, fingerprint);

        // Only move the marker when there is no unbuilt gap below this floor.
        const meta = await getMeta(chatId);
        const previousFloor = meta?.lastChunkFloor ?? -1;
        if (previousFloor >= lastFloor - 1) {
            await updateMeta(chatId, { lastChunkFloor: lastFloor });
        }

        xbLog.info(MODULE_ID, `消息同步:重建 floor ${lastFloor},${chunks.length} 个 chunk`);
    } catch (e) {
        xbLog.error(MODULE_ID, `消息同步失败:floor ${lastFloor}`, e);
    }
}
|