Add vector IO and text filtering
This commit is contained in:
@@ -18,6 +18,7 @@ import {
|
||||
} from './chunk-store.js';
|
||||
import { embed, getEngineFingerprint } from './embedder.js';
|
||||
import { xbLog } from '../../../core/debug-core.js';
|
||||
import { filterText } from './text-filter.js';
|
||||
|
||||
const MODULE_ID = 'chunk-builder';
|
||||
|
||||
@@ -47,9 +48,9 @@ export function chunkMessage(floor, message, maxTokens = CHUNK_MAX_TOKENS) {
|
||||
const speaker = message.name || (message.is_user ? '用户' : '角色');
|
||||
const isUser = !!message.is_user;
|
||||
|
||||
const cleanText = text
|
||||
.replace(/<think>[\s\S]*?<\/think>/gi, '')
|
||||
.replace(/<thinking>[\s\S]*?<\/thinking>/gi, '')
|
||||
// 1. 应用用户自定义过滤规则
|
||||
// 2. 移除 TTS 标记(硬编码)
|
||||
const cleanText = filterText(text)
|
||||
.replace(/\[tts:[^\]]*\]/gi, '')
|
||||
.trim();
|
||||
|
||||
|
||||
@@ -7,10 +7,11 @@
|
||||
// - floor 稀疏去重
|
||||
|
||||
import { getAllEventVectors, getAllChunkVectors, getChunksByFloors, getMeta } from './chunk-store.js';
|
||||
import { embed, getEngineFingerprint } from './embedder.js';
|
||||
import { xbLog } from '../../../core/debug-core.js';
|
||||
import { getContext } from '../../../../../../extensions.js';
|
||||
import { getSummaryStore } from '../data/store.js';
|
||||
import { embed, getEngineFingerprint } from './embedder.js';
|
||||
import { xbLog } from '../../../core/debug-core.js';
|
||||
import { getContext } from '../../../../../../extensions.js';
|
||||
import { getSummaryStore } from '../data/store.js';
|
||||
import { filterText } from './text-filter.js';
|
||||
|
||||
const MODULE_ID = 'recall';
|
||||
|
||||
@@ -139,13 +140,11 @@ function normalize(s) {
|
||||
return String(s || '').normalize('NFKC').replace(/[\u200B-\u200D\uFEFF]/g, '').trim();
|
||||
}
|
||||
|
||||
/**
 * Remove model "thinking" sections and TTS markers from a message.
 *
 * @param {*} text - raw message text; falsy values are treated as ''.
 * @returns {string} the text without <think>/<thinking> blocks and [tts:...] tags, trimmed.
 */
function stripNoise(text) {
    const noisePatterns = [
        /<think>[\s\S]*?<\/think>/gi,
        /<thinking>[\s\S]*?<\/thinking>/gi,
        /\[tts:[^\]]*\]/gi,
    ];

    let cleaned = String(text || '');
    for (const pattern of noisePatterns) {
        cleaned = cleaned.replace(pattern, '');
    }
    return cleaned.trim();
}
|
||||
/**
 * Prepare a chat message for recall queries.
 *
 * @param {*} text - raw message text; falsy values are treated as ''.
 * @returns {string} the filtered text, trimmed.
 */
function cleanForRecall(text) {
    // Coerce first: messages without a `mes` field would otherwise make
    // `filterText(undefined).replace(...)` throw.
    const source = String(text || '');

    // 1. Apply the user-defined filter rules.
    // 2. Remove TTS markers (hard-coded).
    return filterText(source).replace(/\[tts:[^\]]*\]/gi, '').trim();
}
|
||||
|
||||
function buildExpDecayWeights(n, beta) {
|
||||
const last = n - 1;
|
||||
@@ -180,8 +179,8 @@ function buildQuerySegments(chat, count, excludeLastAi, pendingUserMessage = nul
|
||||
|
||||
return messages.slice(-count).map((m, idx, arr) => {
|
||||
const speaker = m.name || (m.is_user ? '用户' : '角色');
|
||||
const clean = stripNoise(m.mes);
|
||||
if (!clean) return '';
|
||||
const clean = cleanForRecall(m.mes);
|
||||
if (!clean) return '';
|
||||
const limit = idx === arr.length - 1 ? CONFIG.QUERY_MAX_CHARS : CONFIG.QUERY_CONTEXT_CHARS;
|
||||
return `${speaker}: ${clean.slice(0, limit)}`;
|
||||
}).filter(Boolean);
|
||||
@@ -773,7 +772,7 @@ export function buildQueryText(chat, count = 2, excludeLastAi = false) {
|
||||
messages = messages.slice(0, -1);
|
||||
}
|
||||
|
||||
return messages.slice(-count).map(m => {
|
||||
return messages.slice(-count).map(m => {
|
||||
const text = cleanForRecall(m.mes);
|
||||
const speaker = m.name || (m.is_user ? '用户' : '角色');
|
||||
return `${speaker}: ${text.slice(0, 500)}`;
|
||||
|
||||
63
modules/story-summary/vector/text-filter.js
Normal file
63
modules/story-summary/vector/text-filter.js
Normal file
@@ -0,0 +1,63 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Text Filter - 通用文本过滤
|
||||
// 跳过用户定义的「起始→结束」区间
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { getTextFilterRules } from '../data/config.js';
|
||||
|
||||
/**
 * Escape every regular-expression metacharacter in a string so it can be
 * embedded in a RegExp and matched literally.
 *
 * @param {string} str - literal text.
 * @returns {string} the text with each special character prefixed by a backslash.
 */
function escapeRegex(str) {
    const specialChars = /[.*+?^${}()|[\]\\]/g;
    return str.replace(specialChars, (ch) => `\\${ch}`);
}
|
||||
|
||||
/**
 * Apply "start → end" removal rules to a piece of text.
 *
 * Rule semantics (markers are matched case-insensitively):
 * - start + end: remove every start...end span, boundaries included (non-greedy).
 * - empty start + end: remove from the beginning through the first `end` (inclusive).
 * - start + empty end: remove from the first `start` to the end of the text.
 * - both empty: the rule is skipped.
 *
 * @param {*} text - text to filter; null/undefined become '' so callers can
 *   safely chain string methods on the result.
 * @param {Array<{start?: string, end?: string}>} rules - rules applied in order.
 * @returns {string} the filtered text, trimmed when at least one rule was supplied;
 *   returned untouched when there are no rules.
 */
export function applyTextFilterRules(text, rules) {
    const source = text == null ? '' : String(text);
    if (!source || !rules?.length) return source;

    let result = source;

    for (const rule of rules) {
        // Tolerate null entries and non-string markers coming from stored config.
        const start = String(rule?.start ?? '');
        const end = String(rule?.end ?? '');

        if (!start && !end) continue;

        if (start && end) {
            // Bounded span: drop every start...end occurrence, non-greedy.
            const range = new RegExp(
                `${escapeRegex(start)}[\\s\\S]*?${escapeRegex(end)}`,
                'gi'
            );
            result = result.replace(range, '');
        } else if (start) {
            // Open-ended: cut from the first `start` to the end of the text.
            // A case-insensitive regex is searched on the original string because
            // toLowerCase() can change string length, which would shift the offset.
            const at = result.search(new RegExp(escapeRegex(start), 'i'));
            if (at !== -1) {
                result = result.slice(0, at);
            }
        } else {
            // Open-started: cut from the beginning through the first `end`.
            const hit = new RegExp(escapeRegex(end), 'i').exec(result);
            if (hit) {
                result = result.slice(hit.index + hit[0].length);
            }
        }
    }

    return result.trim();
}
|
||||
|
||||
/**
 * Convenience wrapper: filter text with the rules from the current configuration.
 *
 * @param {string} text - text to filter.
 * @returns {string} whatever applyTextFilterRules returns for the configured rules.
 */
export function filterText(text) {
    const activeRules = getTextFilterRules();
    return applyTextFilterRules(text, activeRules);
}
|
||||
301
modules/story-summary/vector/vector-io.js
Normal file
301
modules/story-summary/vector/vector-io.js
Normal file
@@ -0,0 +1,301 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Vector Import/Export
|
||||
// 向量数据导入导出(当前 chatId 级别)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import { zipSync, unzipSync, strToU8, strFromU8 } from '../../../libs/fflate.mjs';
|
||||
import { getContext } from '../../../../../../extensions.js';
|
||||
import { xbLog } from '../../../core/debug-core.js';
|
||||
import {
|
||||
getMeta,
|
||||
updateMeta,
|
||||
getAllChunks,
|
||||
getAllChunkVectors,
|
||||
getAllEventVectors,
|
||||
saveChunks,
|
||||
saveChunkVectors,
|
||||
clearAllChunks,
|
||||
clearEventVectors,
|
||||
saveEventVectors,
|
||||
} from './chunk-store.js';
|
||||
import { getEngineFingerprint } from './embedder.js';
|
||||
import { getVectorConfig } from '../data/config.js';
|
||||
|
||||
const MODULE_ID = 'vector-io';
|
||||
const EXPORT_VERSION = 1;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 工具函数
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Pack a list of vectors into one contiguous Float32 buffer (platform byte order).
 *
 * Each vector occupies exactly `dims` floats: shorter vectors are zero-padded,
 * longer ones are truncated, and missing (null/undefined) vectors become all
 * zeros instead of throwing. Falsy components (including NaN) are written as 0.
 *
 * @param {Array<ArrayLike<number>|null|undefined>} vectors - vectors to pack.
 * @param {number} dims - number of floats written per vector.
 * @returns {Uint8Array} byte view over the packed floats (vectors.length * dims * 4 bytes).
 */
function float32ToBytes(vectors, dims) {
    const packed = new Float32Array(vectors.length * dims);

    let offset = 0;
    for (const vec of vectors) {
        for (let i = 0; i < dims; i++) {
            packed[offset++] = vec?.[i] || 0;
        }
    }

    return new Uint8Array(packed.buffer);
}
|
||||
|
||||
/**
 * Split a packed Float32 byte buffer into vectors of `dims` numbers each.
 *
 * @param {Uint8Array} bytes - packed floats, as produced by float32ToBytes.
 * @param {number} dims - number of floats per vector.
 * @returns {number[][]} one plain array per vector; [] for an empty buffer.
 * @throws {Error} if the buffer is non-empty and `dims` is not a positive integer,
 *   or the byte length is not a whole number of vectors.
 */
function bytesToFloat32(bytes, dims) {
    if (!bytes?.byteLength) return [];

    // dims may come from an imported manifest: 0 would make the loop below
    // spin forever, so reject anything that is not a positive integer.
    if (!Number.isInteger(dims) || dims <= 0) {
        throw new Error(`无效的向量维度: ${dims}`);
    }

    const floatSize = Float32Array.BYTES_PER_ELEMENT;
    if (bytes.byteLength % (dims * floatSize) !== 0) {
        throw new Error(`向量数据长度 ${bytes.byteLength} 与维度 ${dims} 不匹配`);
    }

    // A Float32Array view requires a 4-byte aligned offset; copy when the
    // incoming view sits at an unaligned position inside a larger buffer.
    const aligned = bytes.byteOffset % floatSize === 0 ? bytes : bytes.slice();
    const view = new Float32Array(aligned.buffer, aligned.byteOffset, aligned.byteLength / floatSize);

    const vectors = [];
    for (let i = 0; i < view.length; i += dims) {
        vectors.push(Array.from(view.subarray(i, i + dims)));
    }

    return vectors;
}
|
||||
|
||||
/**
 * Trigger a browser download of a Blob under the given file name.
 *
 * @param {Blob} blob - data to download.
 * @param {string} filename - suggested file name.
 */
function downloadBlob(blob, filename) {
    const url = URL.createObjectURL(blob);
    const anchor = document.createElement('a');
    anchor.href = url;
    anchor.download = filename;
    document.body.appendChild(anchor);

    try {
        anchor.click();
    } finally {
        document.body.removeChild(anchor);
        // Revoke on a later task rather than synchronously: revoking right after
        // click() can abort the download in some browsers. The delay is a heuristic.
        setTimeout(() => URL.revokeObjectURL(url), 1000);
    }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 导出
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Export the vector data of the currently open chat as a zip download.
 *
 * The archive contains manifest.json, chunks.jsonl, chunk_vectors.bin,
 * events.jsonl and event_vectors.bin. Rows in each .bin file follow the same
 * order as the lines of the matching .jsonl file (sorted by id).
 *
 * @param {(message: string) => void} [onProgress] - optional progress callback.
 * @returns {Promise<{filename: string, size: number, chunkCount: number, eventCount: number}>}
 * @throws {Error} if no chat is open, there is nothing to export, or the
 *   vector dimension cannot be determined.
 */
export async function exportVectors(onProgress) {
    const { chatId } = getContext();
    if (!chatId) {
        throw new Error('未打开聊天');
    }

    onProgress?.('读取数据...');

    // Tolerate a missing meta record; the manifest falls back to defaults below.
    const meta = (await getMeta(chatId)) ?? {};
    const chunks = await getAllChunks(chatId);
    const chunkVectors = await getAllChunkVectors(chatId);
    const eventVectors = await getAllEventVectors(chatId);

    if (chunks.length === 0 && eventVectors.length === 0) {
        throw new Error('没有可导出的向量数据');
    }

    // Determine the dimension from the first available vector.
    const dims = chunkVectors[0]?.vector?.length || eventVectors[0]?.vector?.length || 0;
    if (dims === 0) {
        throw new Error('无法确定向量维度');
    }

    const zeroVector = () => new Array(dims).fill(0);

    onProgress?.('构建索引...');

    // Chunk index, sorted by chunkId so metadata and vectors share one order.
    const sortedChunks = [...chunks].sort((a, b) => a.chunkId.localeCompare(b.chunkId));
    const chunkVectorMap = new Map(chunkVectors.map(cv => [cv.chunkId, cv.vector]));

    // chunks.jsonl
    const chunksJsonl = sortedChunks.map(c => JSON.stringify({
        chunkId: c.chunkId,
        floor: c.floor,
        chunkIdx: c.chunkIdx,
        speaker: c.speaker,
        isUser: c.isUser,
        text: c.text,
        textHash: c.textHash,
    })).join('\n');

    // chunk_vectors.bin (same order as sortedChunks; chunks without a vector get zeros)
    const chunkVectorsOrdered = sortedChunks.map(c => chunkVectorMap.get(c.chunkId) || zeroVector());

    onProgress?.('压缩向量...');

    // Event index, sorted by eventId.
    const sortedEventVectors = [...eventVectors].sort((a, b) => a.eventId.localeCompare(b.eventId));
    const eventsJsonl = sortedEventVectors.map(ev => JSON.stringify({
        eventId: ev.eventId,
    })).join('\n');

    // event_vectors.bin — zero-fill records lacking a vector, matching the chunk path,
    // so one malformed record cannot abort the whole export.
    const eventVectorsOrdered = sortedEventVectors.map(ev => ev.vector || zeroVector());

    // manifest
    const manifest = {
        version: EXPORT_VERSION,
        exportedAt: Date.now(),
        chatId,
        fingerprint: meta.fingerprint || '',
        dims,
        chunkCount: sortedChunks.length,
        chunkVectorCount: chunkVectors.length,
        eventCount: sortedEventVectors.length,
        lastChunkFloor: meta.lastChunkFloor ?? -1,
    };

    onProgress?.('打包文件...');

    // Build the zip; low compression level favours speed.
    const zipData = zipSync({
        'manifest.json': strToU8(JSON.stringify(manifest, null, 2)),
        'chunks.jsonl': strToU8(chunksJsonl),
        'chunk_vectors.bin': float32ToBytes(chunkVectorsOrdered, dims),
        'events.jsonl': strToU8(eventsJsonl),
        'event_vectors.bin': float32ToBytes(eventVectorsOrdered, dims),
    }, { level: 1 });

    onProgress?.('下载文件...');

    // File name: vectors_<first 8 chars of chatId>_<YYYYMMDD, UTC>.zip
    const timestamp = new Date().toISOString().slice(0, 10).replace(/-/g, '');
    const shortChatId = chatId.slice(0, 8);
    const filename = `vectors_${shortChatId}_${timestamp}.zip`;

    downloadBlob(new Blob([zipData], { type: 'application/zip' }), filename);

    const sizeMB = (zipData.byteLength / 1024 / 1024).toFixed(2);
    xbLog.info(MODULE_ID, `导出完成: ${filename} (${sizeMB}MB)`);

    return {
        filename,
        size: zipData.byteLength,
        chunkCount: sortedChunks.length,
        eventCount: sortedEventVectors.length,
    };
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 导入
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Parse one JSON-lines archive entry into an array of objects.
 * Returns [] when the entry is absent; throws a labelled Error on bad JSON.
 */
function parseJsonlEntry(entry, label) {
    if (!entry) return [];
    try {
        return strFromU8(entry).split('\n').filter(Boolean).map(line => JSON.parse(line));
    } catch (e) {
        throw new Error(`${label} 解析失败`, { cause: e });
    }
}

/**
 * Import vector data from an exported zip into the currently open chat,
 * replacing the chat's existing chunks and vectors.
 *
 * Fingerprint and chatId mismatches are reported as warnings but do not block
 * the import. All validation happens before existing data is cleared.
 *
 * @param {Blob|File} file - zip archive produced by exportVectors.
 * @param {(message: string) => void} [onProgress] - optional progress callback.
 * @returns {Promise<{chunkCount: number, eventCount: number, warnings: string[], fingerprintMismatch: boolean}>}
 * @throws {Error} if no chat is open, or the archive is unreadable, malformed,
 *   of an unsupported version, or internally inconsistent.
 */
export async function importVectors(file, onProgress) {
    const { chatId } = getContext();
    if (!chatId) {
        throw new Error('未打开聊天');
    }

    onProgress?.('读取文件...');

    const arrayBuffer = await file.arrayBuffer();
    const zipData = new Uint8Array(arrayBuffer);

    onProgress?.('解压文件...');

    let unzipped;
    try {
        unzipped = unzipSync(zipData);
    } catch (e) {
        throw new Error('文件格式错误,无法解压', { cause: e });
    }

    // Read the manifest.
    if (!unzipped['manifest.json']) {
        throw new Error('缺少 manifest.json');
    }

    let manifest;
    try {
        manifest = JSON.parse(strFromU8(unzipped['manifest.json']));
    } catch (e) {
        throw new Error('manifest.json 解析失败', { cause: e });
    }

    if (manifest?.version !== EXPORT_VERSION) {
        throw new Error(`不支持的版本: ${manifest?.version}`);
    }

    // dims drives how the .bin files are sliced; a zero or non-integer value
    // from a damaged file must be rejected before any decoding happens.
    if (!Number.isInteger(manifest.dims) || manifest.dims <= 0) {
        throw new Error(`无效的向量维度: ${manifest.dims}`);
    }

    onProgress?.('校验数据...');

    // Fingerprint check.
    const vectorCfg = getVectorConfig();
    const currentFingerprint = vectorCfg ? getEngineFingerprint(vectorCfg) : '';
    const fingerprintMismatch = Boolean(
        manifest.fingerprint && currentFingerprint && manifest.fingerprint !== currentFingerprint
    );

    // chatId check (warn but allow).
    const chatIdMismatch = manifest.chatId !== chatId;

    const warnings = [];
    if (fingerprintMismatch) {
        warnings.push(`向量引擎不匹配(文件: ${manifest.fingerprint}, 当前: ${currentFingerprint}),导入后需重新生成`);
    }
    if (chatIdMismatch) {
        warnings.push(`聊天ID不匹配(文件: ${manifest.chatId}, 当前: ${chatId})`);
    }

    onProgress?.('解析数据...');

    // Chunks and their vectors.
    const chunkMetas = parseJsonlEntry(unzipped['chunks.jsonl'], 'chunks.jsonl');
    const chunkVectorsBytes = unzipped['chunk_vectors.bin'];
    const chunkVectors = chunkVectorsBytes ? bytesToFloat32(chunkVectorsBytes, manifest.dims) : [];

    // Events and their vectors.
    const eventMetas = parseJsonlEntry(unzipped['events.jsonl'], 'events.jsonl');
    const eventVectorsBytes = unzipped['event_vectors.bin'];
    const eventVectors = eventVectorsBytes ? bytesToFloat32(eventVectorsBytes, manifest.dims) : [];

    // Metadata and vector counts must agree.
    if (chunkMetas.length !== chunkVectors.length) {
        throw new Error(`chunk 数量不匹配: 元数据 ${chunkMetas.length}, 向量 ${chunkVectors.length}`);
    }
    if (eventMetas.length !== eventVectors.length) {
        throw new Error(`event 数量不匹配: 元数据 ${eventMetas.length}, 向量 ${eventVectors.length}`);
    }

    onProgress?.('清空旧数据...');

    // Clear the current data.
    await clearAllChunks(chatId);
    await clearEventVectors(chatId);

    onProgress?.('写入数据...');

    // Write chunks.
    if (chunkMetas.length > 0) {
        const chunksToSave = chunkMetas.map(meta => ({
            chunkId: meta.chunkId,
            floor: meta.floor,
            chunkIdx: meta.chunkIdx,
            speaker: meta.speaker,
            isUser: meta.isUser,
            text: meta.text,
            textHash: meta.textHash,
        }));
        await saveChunks(chatId, chunksToSave);

        // Write chunk vectors (row idx of the .bin matches line idx of the .jsonl).
        const chunkVectorItems = chunkMetas.map((meta, idx) => ({
            chunkId: meta.chunkId,
            vector: chunkVectors[idx],
        }));
        await saveChunkVectors(chatId, chunkVectorItems, manifest.fingerprint);
    }

    // Write event vectors.
    if (eventMetas.length > 0) {
        const eventVectorItems = eventMetas.map((meta, idx) => ({
            eventId: meta.eventId,
            vector: eventVectors[idx],
        }));
        await saveEventVectors(chatId, eventVectorItems, manifest.fingerprint);
    }

    // Update meta; fall back to -1 like the exporter when the field is absent.
    await updateMeta(chatId, {
        fingerprint: manifest.fingerprint,
        lastChunkFloor: manifest.lastChunkFloor ?? -1,
    });

    xbLog.info(MODULE_ID, `导入完成: ${chunkMetas.length} chunks, ${eventMetas.length} events`);

    return {
        chunkCount: chunkMetas.length,
        eventCount: eventMetas.length,
        warnings,
        fingerprintMismatch,
    };
}
|
||||
Reference in New Issue
Block a user