Files
LittleWhiteBox/modules/story-summary/vector/vector-io.js

302 lines
11 KiB
JavaScript
Raw Normal View History

2026-01-29 17:02:51 +08:00
// ═══════════════════════════════════════════════════════════════════════════
// Vector Import/Export
// 向量数据导入导出(当前 chatId 级别)
// ═══════════════════════════════════════════════════════════════════════════
import { zipSync, unzipSync, strToU8, strFromU8 } from '../../../libs/fflate.mjs';
import { getContext } from '../../../../../../extensions.js';
import { xbLog } from '../../../core/debug-core.js';
import {
getMeta,
updateMeta,
getAllChunks,
getAllChunkVectors,
getAllEventVectors,
saveChunks,
saveChunkVectors,
clearAllChunks,
clearEventVectors,
saveEventVectors,
} from './chunk-store.js';
import { getEngineFingerprint } from './embedder.js';
import { getVectorConfig } from '../data/config.js';
const MODULE_ID = 'vector-io';
const EXPORT_VERSION = 1;
// ═══════════════════════════════════════════════════════════════════════════
// 工具函数
// ═══════════════════════════════════════════════════════════════════════════
function float32ToBytes(vectors, dims) {
const totalFloats = vectors.length * dims;
const buffer = new ArrayBuffer(totalFloats * 4);
const view = new Float32Array(buffer);
let offset = 0;
for (const vec of vectors) {
for (let i = 0; i < dims; i++) {
view[offset++] = vec[i] || 0;
}
}
return new Uint8Array(buffer);
}
function bytesToFloat32(bytes, dims) {
const view = new Float32Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 4);
const vectors = [];
for (let i = 0; i < view.length; i += dims) {
vectors.push(Array.from(view.slice(i, i + dims)));
}
return vectors;
}
function downloadBlob(blob, filename) {
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = filename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
}
// ═══════════════════════════════════════════════════════════════════════════
// 导出
// ═══════════════════════════════════════════════════════════════════════════
export async function exportVectors(onProgress) {
const { chatId } = getContext();
if (!chatId) {
throw new Error('未打开聊天');
}
onProgress?.('读取数据...');
const meta = await getMeta(chatId);
const chunks = await getAllChunks(chatId);
const chunkVectors = await getAllChunkVectors(chatId);
const eventVectors = await getAllEventVectors(chatId);
if (chunks.length === 0 && eventVectors.length === 0) {
throw new Error('没有可导出的向量数据');
}
// 确定维度
const dims = chunkVectors[0]?.vector?.length || eventVectors[0]?.vector?.length || 0;
if (dims === 0) {
throw new Error('无法确定向量维度');
}
onProgress?.('构建索引...');
// 构建 chunk 索引(按 chunkId 排序保证顺序一致)
const sortedChunks = [...chunks].sort((a, b) => a.chunkId.localeCompare(b.chunkId));
const chunkVectorMap = new Map(chunkVectors.map(cv => [cv.chunkId, cv.vector]));
// chunks.jsonl
const chunksJsonl = sortedChunks.map(c => JSON.stringify({
chunkId: c.chunkId,
floor: c.floor,
chunkIdx: c.chunkIdx,
speaker: c.speaker,
isUser: c.isUser,
text: c.text,
textHash: c.textHash,
})).join('\n');
// chunk_vectors.bin按 sortedChunks 顺序)
const chunkVectorsOrdered = sortedChunks.map(c => chunkVectorMap.get(c.chunkId) || new Array(dims).fill(0));
onProgress?.('压缩向量...');
// 构建 event 索引
const sortedEventVectors = [...eventVectors].sort((a, b) => a.eventId.localeCompare(b.eventId));
const eventsJsonl = sortedEventVectors.map(ev => JSON.stringify({
eventId: ev.eventId,
})).join('\n');
// event_vectors.bin
const eventVectorsOrdered = sortedEventVectors.map(ev => ev.vector);
// manifest
const manifest = {
version: EXPORT_VERSION,
exportedAt: Date.now(),
chatId,
fingerprint: meta.fingerprint || '',
dims,
chunkCount: sortedChunks.length,
chunkVectorCount: chunkVectors.length,
eventCount: sortedEventVectors.length,
lastChunkFloor: meta.lastChunkFloor ?? -1,
};
onProgress?.('打包文件...');
// 打包 zip
const zipData = zipSync({
'manifest.json': strToU8(JSON.stringify(manifest, null, 2)),
'chunks.jsonl': strToU8(chunksJsonl),
'chunk_vectors.bin': float32ToBytes(chunkVectorsOrdered, dims),
'events.jsonl': strToU8(eventsJsonl),
'event_vectors.bin': float32ToBytes(eventVectorsOrdered, dims),
}, { level: 1 }); // 降低压缩级别,速度优先
onProgress?.('下载文件...');
// 生成文件名
const timestamp = new Date().toISOString().slice(0, 10).replace(/-/g, '');
const shortChatId = chatId.slice(0, 8);
const filename = `vectors_${shortChatId}_${timestamp}.zip`;
downloadBlob(new Blob([zipData]), filename);
const sizeMB = (zipData.byteLength / 1024 / 1024).toFixed(2);
xbLog.info(MODULE_ID, `导出完成: ${filename} (${sizeMB}MB)`);
return {
filename,
size: zipData.byteLength,
chunkCount: sortedChunks.length,
eventCount: sortedEventVectors.length,
};
}
// ═══════════════════════════════════════════════════════════════════════════
// 导入
// ═══════════════════════════════════════════════════════════════════════════
export async function importVectors(file, onProgress) {
const { chatId } = getContext();
if (!chatId) {
throw new Error('未打开聊天');
}
onProgress?.('读取文件...');
const arrayBuffer = await file.arrayBuffer();
const zipData = new Uint8Array(arrayBuffer);
onProgress?.('解压文件...');
let unzipped;
try {
unzipped = unzipSync(zipData);
} catch (e) {
throw new Error('文件格式错误,无法解压');
}
// 读取 manifest
if (!unzipped['manifest.json']) {
throw new Error('缺少 manifest.json');
}
const manifest = JSON.parse(strFromU8(unzipped['manifest.json']));
if (manifest.version !== EXPORT_VERSION) {
throw new Error(`不支持的版本: ${manifest.version}`);
}
onProgress?.('校验数据...');
// 校验 fingerprint
const vectorCfg = getVectorConfig();
const currentFingerprint = vectorCfg ? getEngineFingerprint(vectorCfg) : '';
const fingerprintMismatch = manifest.fingerprint && currentFingerprint && manifest.fingerprint !== currentFingerprint;
// chatId 校验(警告但允许)
const chatIdMismatch = manifest.chatId !== chatId;
const warnings = [];
if (fingerprintMismatch) {
warnings.push(`向量引擎不匹配(文件: ${manifest.fingerprint}, 当前: ${currentFingerprint}),导入后需重新生成`);
}
if (chatIdMismatch) {
warnings.push(`聊天ID不匹配文件: ${manifest.chatId}, 当前: ${chatId}`);
}
onProgress?.('解析数据...');
// 解析 chunks
const chunksJsonl = unzipped['chunks.jsonl'] ? strFromU8(unzipped['chunks.jsonl']) : '';
const chunkMetas = chunksJsonl.split('\n').filter(Boolean).map(line => JSON.parse(line));
// 解析 chunk vectors
const chunkVectorsBytes = unzipped['chunk_vectors.bin'];
const chunkVectors = chunkVectorsBytes ? bytesToFloat32(chunkVectorsBytes, manifest.dims) : [];
// 解析 events
const eventsJsonl = unzipped['events.jsonl'] ? strFromU8(unzipped['events.jsonl']) : '';
const eventMetas = eventsJsonl.split('\n').filter(Boolean).map(line => JSON.parse(line));
// 解析 event vectors
const eventVectorsBytes = unzipped['event_vectors.bin'];
const eventVectors = eventVectorsBytes ? bytesToFloat32(eventVectorsBytes, manifest.dims) : [];
// 校验数量
if (chunkMetas.length !== chunkVectors.length) {
throw new Error(`chunk 数量不匹配: 元数据 ${chunkMetas.length}, 向量 ${chunkVectors.length}`);
}
if (eventMetas.length !== eventVectors.length) {
throw new Error(`event 数量不匹配: 元数据 ${eventMetas.length}, 向量 ${eventVectors.length}`);
}
onProgress?.('清空旧数据...');
// 清空当前数据
await clearAllChunks(chatId);
await clearEventVectors(chatId);
onProgress?.('写入数据...');
// 写入 chunks
if (chunkMetas.length > 0) {
const chunksToSave = chunkMetas.map(meta => ({
chunkId: meta.chunkId,
floor: meta.floor,
chunkIdx: meta.chunkIdx,
speaker: meta.speaker,
isUser: meta.isUser,
text: meta.text,
textHash: meta.textHash,
}));
await saveChunks(chatId, chunksToSave);
// 写入 chunk vectors
const chunkVectorItems = chunkMetas.map((meta, idx) => ({
chunkId: meta.chunkId,
vector: chunkVectors[idx],
}));
await saveChunkVectors(chatId, chunkVectorItems, manifest.fingerprint);
}
// 写入 event vectors
if (eventMetas.length > 0) {
const eventVectorItems = eventMetas.map((meta, idx) => ({
eventId: meta.eventId,
vector: eventVectors[idx],
}));
await saveEventVectors(chatId, eventVectorItems, manifest.fingerprint);
}
// 更新 meta
await updateMeta(chatId, {
fingerprint: manifest.fingerprint,
lastChunkFloor: manifest.lastChunkFloor,
});
xbLog.info(MODULE_ID, `导入完成: ${chunkMetas.length} chunks, ${eventMetas.length} events`);
return {
chunkCount: chunkMetas.length,
eventCount: eventMetas.length,
warnings,
fingerprintMismatch,
};
}