Improve rerank failure handling and tokenizer JP support
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Story Summary - 主入口
|
||||
//
|
||||
// 稳定目标:
|
||||
@@ -107,16 +107,43 @@ const MESSAGE_EVENT = "message";
|
||||
// 状态变量
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
let summaryGenerating = false;
|
||||
let overlayCreated = false;
|
||||
let frameReady = false;
|
||||
let currentMesId = null;
|
||||
let pendingFrameMessages = [];
|
||||
let eventsRegistered = false;
|
||||
let vectorGenerating = false;
|
||||
let vectorCancelled = false;
|
||||
let vectorAbortController = null;
|
||||
let anchorGenerating = false;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// TaskGuard — 互斥任务管理(summary / vector / anchor)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * TaskGuard — mutual-exclusion registry for named tasks.
 *
 * Each task name can be held by at most one owner at a time; different names
 * do not block each other. In this file it is instantiated once as `guard`
 * and used with the names 'summary', 'vector' and 'anchor'.
 */
class TaskGuard {
    /** Names of the tasks that are currently held. */
    #running = new Set();

    /**
     * Try to mark `taskName` as running.
     *
     * @param {string} taskName - Name of the task to lock.
     * @returns {(() => void) | null} A release function when the lock was
     *   obtained, or `null` when a task with that name is already running.
     *   The release function is idempotent: only its first call removes the
     *   name, so a stale release cannot unlock a later acquisition of the
     *   same name.
     */
    acquire(taskName) {
        if (this.#running.has(taskName)) return null;
        this.#running.add(taskName);

        let released = false;
        return () => {
            if (!released) {
                released = true;
                this.#running.delete(taskName);
            }
        };
    }

    /**
     * @param {string} taskName - Name of the task to query.
     * @returns {boolean} `true` while `taskName` is held.
     */
    isRunning(taskName) {
        return this.#running.has(taskName);
    }

    /**
     * @param {...string} taskNames - Task names to query.
     * @returns {boolean} `true` when at least one of the given names is held
     *   (`false` when called without arguments).
     */
    isAnyRunning(...taskNames) {
        return taskNames.some(t => this.#running.has(t));
    }
}
|
||||
|
||||
const guard = new TaskGuard();
|
||||
|
||||
// 用户消息缓存(解决 GENERATION_STARTED 时 chat 尚未包含用户消息的问题)
|
||||
let lastSentUserMessage = null;
|
||||
@@ -219,13 +246,12 @@ async function unhideAllMessages() {
|
||||
// 生成状态管理
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
function setSummaryGenerating(flag) {
|
||||
summaryGenerating = !!flag;
|
||||
postToFrame({ type: "GENERATION_STATE", isGenerating: summaryGenerating });
|
||||
function isSummaryGenerating() {
|
||||
return guard.isRunning('summary');
|
||||
}
|
||||
|
||||
function isSummaryGenerating() {
|
||||
return summaryGenerating;
|
||||
function notifySummaryState() {
|
||||
postToFrame({ type: "GENERATION_STATE", isGenerating: guard.isRunning('summary') });
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
@@ -300,38 +326,35 @@ async function sendAnchorStatsToFrame() {
|
||||
}
|
||||
|
||||
async function handleAnchorGenerate() {
|
||||
if (anchorGenerating) return;
|
||||
|
||||
const vectorCfg = getVectorConfig();
|
||||
if (!vectorCfg?.enabled) {
|
||||
await executeSlashCommand("/echo severity=warning 请先启用向量检索");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!vectorCfg.online?.key) {
|
||||
postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" });
|
||||
return;
|
||||
}
|
||||
|
||||
const { chatId, chat } = getContext();
|
||||
if (!chatId || !chat?.length) return;
|
||||
|
||||
anchorGenerating = true;
|
||||
|
||||
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "分析中..." });
|
||||
const release = guard.acquire('anchor');
|
||||
if (!release) return;
|
||||
|
||||
try {
|
||||
// Phase 1: L0 提取 + Phase 2: L0 向量化(在 incrementalExtractAtoms 内部完成)
|
||||
const vectorCfg = getVectorConfig();
|
||||
if (!vectorCfg?.enabled) {
|
||||
await executeSlashCommand("/echo severity=warning 请先启用向量检索");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!vectorCfg.online?.key) {
|
||||
postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" });
|
||||
return;
|
||||
}
|
||||
|
||||
const { chatId, chat } = getContext();
|
||||
if (!chatId || !chat?.length) return;
|
||||
|
||||
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "分析中..." });
|
||||
|
||||
await incrementalExtractAtoms(chatId, chat, (message, current, total) => {
|
||||
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current, total, message });
|
||||
});
|
||||
|
||||
// Phase 3: 处理 pending L1 Chunks
|
||||
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "向量化 L1..." });
|
||||
await buildIncrementalChunks({ vectorConfig: vectorCfg });
|
||||
|
||||
|
||||
invalidateLexicalIndex();
|
||||
|
||||
|
||||
await sendAnchorStatsToFrame();
|
||||
await sendVectorStatsToFrame();
|
||||
|
||||
@@ -340,7 +363,7 @@ async function handleAnchorGenerate() {
|
||||
xbLog.error(MODULE_ID, "记忆锚点生成失败", e);
|
||||
await executeSlashCommand(`/echo severity=error 记忆锚点生成失败:${e.message}`);
|
||||
} finally {
|
||||
anchorGenerating = false;
|
||||
release();
|
||||
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 });
|
||||
}
|
||||
}
|
||||
@@ -359,7 +382,6 @@ async function handleAnchorClear() {
|
||||
|
||||
function handleAnchorCancel() {
|
||||
cancelL0Extraction();
|
||||
anchorGenerating = false;
|
||||
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 });
|
||||
}
|
||||
|
||||
@@ -378,142 +400,159 @@ async function handleTestOnlineService(provider, config) {
|
||||
}
|
||||
|
||||
async function handleGenerateVectors(vectorCfg) {
|
||||
if (vectorGenerating) return;
|
||||
const release = guard.acquire('vector');
|
||||
if (!release) return;
|
||||
|
||||
if (!vectorCfg?.enabled) {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
|
||||
return;
|
||||
}
|
||||
try {
|
||||
if (!vectorCfg?.enabled) {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
|
||||
return;
|
||||
}
|
||||
|
||||
const { chatId, chat } = getContext();
|
||||
if (!chatId || !chat?.length) return;
|
||||
const { chatId, chat } = getContext();
|
||||
if (!chatId || !chat?.length) return;
|
||||
|
||||
if (!vectorCfg.online?.key) {
|
||||
postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" });
|
||||
return;
|
||||
}
|
||||
if (!vectorCfg.online?.key) {
|
||||
postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" });
|
||||
return;
|
||||
}
|
||||
|
||||
vectorGenerating = true;
|
||||
vectorCancelled = false;
|
||||
vectorAbortController = new AbortController();
|
||||
vectorCancelled = false;
|
||||
vectorAbortController = new AbortController();
|
||||
|
||||
const fingerprint = getEngineFingerprint(vectorCfg);
|
||||
const batchSize = 20;
|
||||
const fingerprint = getEngineFingerprint(vectorCfg);
|
||||
const batchSize = 20;
|
||||
|
||||
await clearAllChunks(chatId);
|
||||
await clearEventVectors(chatId);
|
||||
await clearStateVectors(chatId);
|
||||
await updateMeta(chatId, { lastChunkFloor: -1, fingerprint });
|
||||
await clearAllChunks(chatId);
|
||||
await clearEventVectors(chatId);
|
||||
await clearStateVectors(chatId);
|
||||
await updateMeta(chatId, { lastChunkFloor: -1, fingerprint });
|
||||
|
||||
const atoms = getStateAtoms();
|
||||
if (!atoms.length) {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: 0, message: "L0 为空,跳过" });
|
||||
} else {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: atoms.length, message: "L0 向量化..." });
|
||||
const atoms = getStateAtoms();
|
||||
if (!atoms.length) {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: 0, message: "L0 为空,跳过" });
|
||||
} else {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: atoms.length, message: "L0 向量化..." });
|
||||
|
||||
let l0Completed = 0;
|
||||
for (let i = 0; i < atoms.length; i += batchSize) {
|
||||
if (vectorCancelled) break;
|
||||
let l0Completed = 0;
|
||||
for (let i = 0; i < atoms.length; i += batchSize) {
|
||||
if (vectorCancelled) break;
|
||||
|
||||
const batch = atoms.slice(i, i + batchSize);
|
||||
const texts = batch.map(a => a.semantic);
|
||||
try {
|
||||
const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
|
||||
const items = batch.map((a, j) => ({
|
||||
atomId: a.atomId,
|
||||
floor: a.floor,
|
||||
vector: vectors[j],
|
||||
}));
|
||||
await saveStateVectors(chatId, items, fingerprint);
|
||||
l0Completed += batch.length;
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: l0Completed, total: atoms.length });
|
||||
} catch (e) {
|
||||
if (e?.name === "AbortError") break;
|
||||
xbLog.error(MODULE_ID, "L0 向量化失败", e);
|
||||
vectorCancelled = true;
|
||||
break;
|
||||
const batch = atoms.slice(i, i + batchSize);
|
||||
const texts = batch.map(a => a.semantic);
|
||||
try {
|
||||
const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
|
||||
const items = batch.map((a, j) => ({
|
||||
atomId: a.atomId,
|
||||
floor: a.floor,
|
||||
vector: vectors[j],
|
||||
}));
|
||||
await saveStateVectors(chatId, items, fingerprint);
|
||||
l0Completed += batch.length;
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: l0Completed, total: atoms.length });
|
||||
} catch (e) {
|
||||
if (e?.name === "AbortError") break;
|
||||
xbLog.error(MODULE_ID, "L0 向量化失败", e);
|
||||
vectorCancelled = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vectorCancelled) {
|
||||
vectorGenerating = false;
|
||||
return;
|
||||
}
|
||||
if (vectorCancelled) return;
|
||||
|
||||
const allChunks = [];
|
||||
for (let floor = 0; floor < chat.length; floor++) {
|
||||
const chunks = chunkMessage(floor, chat[floor]);
|
||||
allChunks.push(...chunks);
|
||||
}
|
||||
const allChunks = [];
|
||||
for (let floor = 0; floor < chat.length; floor++) {
|
||||
if (vectorCancelled) break;
|
||||
|
||||
if (allChunks.length > 0) {
|
||||
await saveChunks(chatId, allChunks);
|
||||
}
|
||||
const message = chat[floor];
|
||||
if (!message) continue;
|
||||
|
||||
const l1Texts = allChunks.map(c => c.text);
|
||||
const store = getSummaryStore();
|
||||
const events = store?.json?.events || [];
|
||||
const { chunks, status } = chunkMessage(message, floor, vectorCfg, true);
|
||||
if (status === "skip") continue;
|
||||
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: l1Texts.length });
|
||||
|
||||
const l1Vectors = [];
|
||||
let completed = 0;
|
||||
for (let i = 0; i < l1Texts.length; i += batchSize) {
|
||||
if (vectorCancelled) break;
|
||||
|
||||
const batch = l1Texts.slice(i, i + batchSize);
|
||||
try {
|
||||
const vectors = await embed(batch, vectorCfg, { signal: vectorAbortController.signal });
|
||||
l1Vectors.push(...vectors);
|
||||
completed += batch.length;
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: completed, total: l1Texts.length });
|
||||
} catch (e) {
|
||||
if (e?.name === 'AbortError') break;
|
||||
xbLog.error(MODULE_ID, 'L1 向量化失败', e);
|
||||
vectorCancelled = true;
|
||||
break;
|
||||
allChunks.push(...chunks);
|
||||
}
|
||||
}
|
||||
|
||||
if (!vectorCancelled && l1Vectors.length > 0) {
|
||||
const items = allChunks.map((c, i) => ({ chunkId: c.chunkId, vector: l1Vectors[i] })).filter(x => x.vector);
|
||||
await saveChunkVectors(chatId, items, fingerprint);
|
||||
await updateMeta(chatId, { lastChunkFloor: chat.length - 1 });
|
||||
}
|
||||
let l1Vectors = [];
|
||||
if (!allChunks.length) {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: 0, message: "L1 为空,跳过" });
|
||||
} else {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: allChunks.length, message: "L1 向量化..." });
|
||||
await saveChunks(chatId, allChunks);
|
||||
|
||||
const l2Pairs = events
|
||||
.map(e => ({ id: e.id, text: `${e.title || ''} ${e.summary || ''}`.trim() }))
|
||||
.filter(p => p.text);
|
||||
let l1Completed = 0;
|
||||
for (let i = 0; i < allChunks.length; i += batchSize) {
|
||||
if (vectorCancelled) break;
|
||||
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: l2Pairs.length });
|
||||
let l2Completed = 0;
|
||||
for (let i = 0; i < l2Pairs.length; i += batchSize) {
|
||||
if (vectorCancelled) break;
|
||||
|
||||
const batch = l2Pairs.slice(i, i + batchSize);
|
||||
try {
|
||||
const vectors = await embed(batch.map(p => p.text), vectorCfg, { signal: vectorAbortController.signal });
|
||||
const items = batch.map((p, j) => ({ eventId: p.id, vector: vectors[j] }));
|
||||
await saveEventVectorsToDb(chatId, items, fingerprint);
|
||||
l2Completed += batch.length;
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: l2Completed, total: l2Pairs.length });
|
||||
} catch (e) {
|
||||
if (e?.name === 'AbortError') break;
|
||||
xbLog.error(MODULE_ID, 'L2 向量化失败', e);
|
||||
vectorCancelled = true;
|
||||
break;
|
||||
const batch = allChunks.slice(i, i + batchSize);
|
||||
const texts = batch.map(c => c.text);
|
||||
try {
|
||||
const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
|
||||
const items = batch.map((c, j) => ({
|
||||
chunkId: c.chunkId,
|
||||
vector: vectors[j],
|
||||
}));
|
||||
await saveChunkVectors(chatId, items, fingerprint);
|
||||
l1Vectors = l1Vectors.concat(items);
|
||||
l1Completed += batch.length;
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: l1Completed, total: allChunks.length });
|
||||
} catch (e) {
|
||||
if (e?.name === "AbortError") break;
|
||||
xbLog.error(MODULE_ID, "L1 向量化失败", e);
|
||||
vectorCancelled = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vectorCancelled) return;
|
||||
|
||||
const store = getSummaryStore();
|
||||
const events = store?.json?.events || [];
|
||||
|
||||
const l2Pairs = events
|
||||
.map((e) => ({ id: e.id, text: `${e.title || ""} ${e.summary || ""}`.trim() }))
|
||||
.filter((p) => p.text);
|
||||
|
||||
if (!l2Pairs.length) {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: 0, message: "L2 为空,跳过" });
|
||||
} else {
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: l2Pairs.length, message: "L2 向量化..." });
|
||||
|
||||
let l2Completed = 0;
|
||||
for (let i = 0; i < l2Pairs.length; i += batchSize) {
|
||||
if (vectorCancelled) break;
|
||||
|
||||
const batch = l2Pairs.slice(i, i + batchSize);
|
||||
const texts = batch.map(p => p.text);
|
||||
try {
|
||||
const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
|
||||
const items = batch.map((p, idx) => ({
|
||||
eventId: p.id,
|
||||
vector: vectors[idx],
|
||||
}));
|
||||
await saveEventVectorsToDb(chatId, items, fingerprint);
|
||||
l2Completed += batch.length;
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: l2Completed, total: l2Pairs.length });
|
||||
} catch (e) {
|
||||
if (e?.name === "AbortError") break;
|
||||
xbLog.error(MODULE_ID, "L2 向量化失败", e);
|
||||
vectorCancelled = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
|
||||
await sendVectorStatsToFrame();
|
||||
|
||||
xbLog.info(MODULE_ID, `向量生成完成: L0=${atoms.length}, L1=${l1Vectors.length}, L2=${l2Pairs.length}`);
|
||||
} finally {
|
||||
release();
|
||||
vectorCancelled = false;
|
||||
vectorAbortController = null;
|
||||
}
|
||||
|
||||
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
|
||||
await sendVectorStatsToFrame();
|
||||
|
||||
vectorGenerating = false;
|
||||
vectorCancelled = false;
|
||||
vectorAbortController = null;
|
||||
|
||||
xbLog.info(MODULE_ID, `向量生成完成: L0=${atoms.length}, L1=${l1Vectors.length}, L2=${l2Pairs.length}`);
|
||||
}
|
||||
|
||||
async function handleClearVectors() {
|
||||
@@ -529,52 +568,6 @@ async function handleClearVectors() {
|
||||
xbLog.info(MODULE_ID, "向量数据已清除");
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// L0 自动补提取(每收到新消息后检查并补提取缺失楼层)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
async function maybeAutoExtractL0() {
|
||||
const vectorCfg = getVectorConfig();
|
||||
if (!vectorCfg?.enabled) return;
|
||||
if (anchorGenerating || vectorGenerating) return;
|
||||
|
||||
const { chatId, chat } = getContext();
|
||||
if (!chatId || !chat?.length) return;
|
||||
|
||||
const stats = await getAnchorStats();
|
||||
if (stats.pending <= 0) return;
|
||||
|
||||
anchorGenerating = true;
|
||||
|
||||
try {
|
||||
await incrementalExtractAtoms(chatId, chat, null, { maxFloors: 20 });
|
||||
|
||||
// 为新提取的 L0 楼层构建 L1 chunks
|
||||
await buildIncrementalChunks({ vectorConfig: vectorCfg });
|
||||
|
||||
invalidateLexicalIndex();
|
||||
|
||||
await sendAnchorStatsToFrame();
|
||||
await sendVectorStatsToFrame();
|
||||
|
||||
xbLog.info(MODULE_ID, "自动 L0 补提取完成");
|
||||
} catch (e) {
|
||||
xbLog.error(MODULE_ID, "自动 L0 补提取失败", e);
|
||||
} finally {
|
||||
anchorGenerating = false;
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Embedding 连接预热
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
function warmupEmbeddingConnection() {
|
||||
const vectorCfg = getVectorConfig();
|
||||
if (!vectorCfg?.enabled) return;
|
||||
embed(['.'], vectorCfg, { timeout: 5000 }).catch(() => {});
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 实体词典注入 + 索引预热
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
@@ -597,9 +590,52 @@ function refreshEntityLexiconAndWarmup() {
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// L2 自动增量向量化(总结完成后调用)
|
||||
// L0 自动补提取(每收到新消息后检查并补提取缺失楼层)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Automatic L0 back-fill: when vector retrieval is enabled and the anchor
 * stats report pending floors, incrementally extract L0 atoms, build the
 * matching L1 chunks, invalidate the lexical index and push fresh stats to
 * the frame.
 *
 * The work runs under the 'anchor' task lock and is skipped entirely while an
 * 'anchor' or 'vector' task is running. Failures are logged, not rethrown.
 *
 * @returns {Promise<void>}
 */
async function maybeAutoExtractL0() {
    const vectorCfg = getVectorConfig();
    if (!vectorCfg?.enabled) return;
    if (guard.isAnyRunning('anchor', 'vector')) return;

    const { chatId, chat } = getContext();
    if (!chatId || !chat?.length) return;

    const stats = await getAnchorStats();
    if (stats.pending <= 0) return;

    // The await above yields to the event loop, so a full vector rebuild may
    // have started since the first check. acquire('anchor') only detects a
    // competing 'anchor' task, so 'vector' is re-checked here; there is no
    // await between this check and the acquire.
    if (guard.isRunning('vector')) return;

    const release = guard.acquire('anchor');
    if (!release) return;

    try {
        // NOTE(review): maxFloors presumably caps how many floors one
        // automatic run processes — confirm against incrementalExtractAtoms.
        await incrementalExtractAtoms(chatId, chat, null, { maxFloors: 20 });

        // Build L1 chunks for the floors whose L0 atoms were just extracted.
        await buildIncrementalChunks({ vectorConfig: vectorCfg });

        invalidateLexicalIndex();

        await sendAnchorStatsToFrame();
        await sendVectorStatsToFrame();

        xbLog.info(MODULE_ID, "自动 L0 补提取完成");
    } catch (e) {
        xbLog.error(MODULE_ID, "自动 L0 补提取失败", e);
    } finally {
        release();
    }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Embedding 连接预热
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Warm up the embedding connection by sending one tiny request.
 *
 * Does nothing when vector retrieval is disabled. The request is
 * fire-and-forget: its result is unused and the function returns
 * synchronously without waiting for it.
 */
function warmupEmbeddingConnection() {
    const vectorCfg = getVectorConfig();
    if (!vectorCfg?.enabled) return;

    // Deliberate best-effort: a failed warm-up is ignored so it does not
    // surface as an unhandled rejection.
    // NOTE(review): timeout is presumably in milliseconds — confirm in embed().
    embed(['.'], vectorCfg, { timeout: 5000 }).catch(() => {});
}
|
||||
|
||||
async function autoVectorizeNewEvents(newEventIds) {
|
||||
if (!newEventIds?.length) return;
|
||||
|
||||
@@ -902,7 +938,7 @@ function openPanelForMessage(mesId) {
|
||||
|
||||
sendFrameBaseData(store, totalFloors);
|
||||
sendFrameFullData(store, totalFloors);
|
||||
setSummaryGenerating(summaryGenerating);
|
||||
notifySummaryState();
|
||||
|
||||
sendVectorConfigToFrame();
|
||||
sendVectorStatsToFrame();
|
||||
@@ -990,36 +1026,40 @@ async function maybeAutoRunSummary(reason) {
|
||||
}
|
||||
|
||||
async function autoRunSummaryWithRetry(targetMesId, configForRun) {
|
||||
setSummaryGenerating(true);
|
||||
const release = guard.acquire('summary');
|
||||
if (!release) return;
|
||||
notifySummaryState();
|
||||
|
||||
for (let attempt = 1; attempt <= 3; attempt++) {
|
||||
const result = await runSummaryGeneration(targetMesId, configForRun, {
|
||||
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
|
||||
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
|
||||
onComplete: async ({ merged, endMesId, newEventIds }) => {
|
||||
const store = getSummaryStore();
|
||||
postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) });
|
||||
|
||||
invalidateLexicalIndex();
|
||||
|
||||
applyHideStateDebounced();
|
||||
updateFrameStatsAfterSummary(endMesId, store.json || {});
|
||||
try {
|
||||
for (let attempt = 1; attempt <= 3; attempt++) {
|
||||
const result = await runSummaryGeneration(targetMesId, configForRun, {
|
||||
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
|
||||
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
|
||||
onComplete: async ({ merged, endMesId, newEventIds }) => {
|
||||
const store = getSummaryStore();
|
||||
postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) });
|
||||
|
||||
// L2 自动增量向量化
|
||||
await autoVectorizeNewEvents(newEventIds);
|
||||
},
|
||||
});
|
||||
invalidateLexicalIndex();
|
||||
|
||||
if (result.success) {
|
||||
setSummaryGenerating(false);
|
||||
return;
|
||||
applyHideStateDebounced();
|
||||
updateFrameStatsAfterSummary(endMesId, store.json || {});
|
||||
|
||||
await autoVectorizeNewEvents(newEventIds);
|
||||
},
|
||||
});
|
||||
|
||||
if (result.success) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (attempt < 3) await sleep(1000);
|
||||
}
|
||||
|
||||
if (attempt < 3) await sleep(1000);
|
||||
await executeSlashCommand("/echo severity=error 剧情总结失败(已自动重试 3 次)。请稍后再试。");
|
||||
} finally {
|
||||
release();
|
||||
notifySummaryState();
|
||||
}
|
||||
|
||||
setSummaryGenerating(false);
|
||||
await executeSlashCommand("/echo severity=error 剧情总结失败(已自动重试 3 次)。请稍后再试。");
|
||||
}
|
||||
|
||||
function updateFrameStatsAfterSummary(endMesId, merged) {
|
||||
@@ -1055,7 +1095,7 @@ function handleFrameMessage(event) {
|
||||
case "FRAME_READY": {
|
||||
frameReady = true;
|
||||
flushPendingFrameMessages();
|
||||
setSummaryGenerating(summaryGenerating);
|
||||
notifySummaryState();
|
||||
sendSavedConfigToFrame();
|
||||
sendVectorConfigToFrame();
|
||||
sendVectorStatsToFrame();
|
||||
@@ -1084,7 +1124,7 @@ function handleFrameMessage(event) {
|
||||
|
||||
case "REQUEST_CANCEL":
|
||||
window.xiaobaixStreamingGeneration?.cancel?.("xb9");
|
||||
setSummaryGenerating(false);
|
||||
postToFrame({ type: "GENERATION_STATE", isGenerating: false });
|
||||
postToFrame({ type: "SUMMARY_STATUS", statusText: "已停止" });
|
||||
break;
|
||||
|
||||
@@ -1282,26 +1322,30 @@ async function handleManualGenerate(mesId, config) {
|
||||
return;
|
||||
}
|
||||
|
||||
setSummaryGenerating(true);
|
||||
const release = guard.acquire('summary');
|
||||
if (!release) return;
|
||||
notifySummaryState();
|
||||
|
||||
await runSummaryGeneration(mesId, config, {
|
||||
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
|
||||
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
|
||||
onComplete: async ({ merged, endMesId, newEventIds }) => {
|
||||
const store = getSummaryStore();
|
||||
postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) });
|
||||
|
||||
invalidateLexicalIndex();
|
||||
|
||||
applyHideStateDebounced();
|
||||
updateFrameStatsAfterSummary(endMesId, store.json || {});
|
||||
try {
|
||||
await runSummaryGeneration(mesId, config, {
|
||||
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
|
||||
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
|
||||
onComplete: async ({ merged, endMesId, newEventIds }) => {
|
||||
const store = getSummaryStore();
|
||||
postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) });
|
||||
|
||||
// L2 自动增量向量化
|
||||
await autoVectorizeNewEvents(newEventIds);
|
||||
},
|
||||
});
|
||||
invalidateLexicalIndex();
|
||||
|
||||
setSummaryGenerating(false);
|
||||
applyHideStateDebounced();
|
||||
updateFrameStatsAfterSummary(endMesId, store.json || {});
|
||||
|
||||
await autoVectorizeNewEvents(newEventIds);
|
||||
},
|
||||
});
|
||||
} finally {
|
||||
release();
|
||||
notifySummaryState();
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
@@ -1390,7 +1434,7 @@ async function handleMessageReceived() {
|
||||
initButtonsForAll();
|
||||
|
||||
// 向量全量生成中时跳过 L1 sync(避免竞争写入)
|
||||
if (vectorGenerating) return;
|
||||
if (guard.isRunning('vector')) return;
|
||||
|
||||
await syncOnMessageReceived(chatId, lastFloor, message, vectorConfig, () => {
|
||||
sendAnchorStatsToFrame();
|
||||
@@ -1529,6 +1573,22 @@ async function handleGenerationStarted(type, _params, isDryRun) {
|
||||
// 事件注册
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Event listeners kept as stable references.
 *
 * registerEvents() passes these to eventSource.on() and unregisterEvents()
 * passes the same function objects to eventSource.off(), so both sides use
 * one shared listener per event rather than a fresh inline arrow function
 * at each call site.
 *
 * Most entries defer their handler with setTimeout (delay in milliseconds);
 * the recall handler and the generation hooks are invoked directly.
 */
const boundHandlers = {
    chatChanged: () => setTimeout(handleChatChanged, 80),
    messageDeleted: () => setTimeout(handleMessageDeleted, 50),
    messageReceived: () => setTimeout(handleMessageReceived, 150),
    messageSent: () => setTimeout(handleMessageSent, 150),
    // Second MESSAGE_SENT listener; runs synchronously, without a delay.
    messageSentRecall: handleMessageSentForRecall,
    messageSwiped: () => setTimeout(handleMessageSwiped, 100),
    // MESSAGE_UPDATED and MESSAGE_EDITED both route to handleMessageUpdated.
    messageUpdated: () => setTimeout(handleMessageUpdated, 100),
    messageEdited: () => setTimeout(handleMessageUpdated, 100),
    userRendered: (data) => setTimeout(() => handleMessageRendered(data), 50),
    charRendered: (data) => setTimeout(() => handleMessageRendered(data), 50),
    genStarted: handleGenerationStarted,
    // Both generation-stopped and generation-ended clear the extension prompt.
    genStopped: clearExtensionPrompt,
    genEnded: clearExtensionPrompt,
};
|
||||
|
||||
function registerEvents() {
|
||||
if (eventsRegistered) return;
|
||||
eventsRegistered = true;
|
||||
@@ -1551,31 +1611,45 @@ function registerEvents() {
|
||||
|
||||
initButtonsForAll();
|
||||
|
||||
eventSource.on(event_types.CHAT_CHANGED, () => setTimeout(handleChatChanged, 80));
|
||||
eventSource.on(event_types.MESSAGE_DELETED, () => setTimeout(handleMessageDeleted, 50));
|
||||
eventSource.on(event_types.MESSAGE_RECEIVED, () => setTimeout(handleMessageReceived, 150));
|
||||
eventSource.on(event_types.MESSAGE_SENT, () => setTimeout(handleMessageSent, 150));
|
||||
eventSource.on(event_types.MESSAGE_SENT, handleMessageSentForRecall);
|
||||
eventSource.on(event_types.MESSAGE_SWIPED, () => setTimeout(handleMessageSwiped, 100));
|
||||
eventSource.on(event_types.MESSAGE_UPDATED, () => setTimeout(handleMessageUpdated, 100));
|
||||
eventSource.on(event_types.MESSAGE_EDITED, () => setTimeout(handleMessageUpdated, 100));
|
||||
eventSource.on(event_types.USER_MESSAGE_RENDERED, (data) => setTimeout(() => handleMessageRendered(data), 50));
|
||||
eventSource.on(event_types.CHARACTER_MESSAGE_RENDERED, (data) => setTimeout(() => handleMessageRendered(data), 50));
|
||||
eventSource.on(event_types.CHAT_CHANGED, boundHandlers.chatChanged);
|
||||
eventSource.on(event_types.MESSAGE_DELETED, boundHandlers.messageDeleted);
|
||||
eventSource.on(event_types.MESSAGE_RECEIVED, boundHandlers.messageReceived);
|
||||
eventSource.on(event_types.MESSAGE_SENT, boundHandlers.messageSent);
|
||||
eventSource.on(event_types.MESSAGE_SENT, boundHandlers.messageSentRecall);
|
||||
eventSource.on(event_types.MESSAGE_SWIPED, boundHandlers.messageSwiped);
|
||||
eventSource.on(event_types.MESSAGE_UPDATED, boundHandlers.messageUpdated);
|
||||
eventSource.on(event_types.MESSAGE_EDITED, boundHandlers.messageEdited);
|
||||
eventSource.on(event_types.USER_MESSAGE_RENDERED, boundHandlers.userRendered);
|
||||
eventSource.on(event_types.CHARACTER_MESSAGE_RENDERED, boundHandlers.charRendered);
|
||||
|
||||
// 用户输入捕获(原生捕获阶段)
|
||||
document.addEventListener("pointerdown", onSendPointerdown, true);
|
||||
document.addEventListener("keydown", onSendKeydown, true);
|
||||
|
||||
// 注入链路
|
||||
eventSource.on(event_types.GENERATION_STARTED, handleGenerationStarted);
|
||||
eventSource.on(event_types.GENERATION_STOPPED, clearExtensionPrompt);
|
||||
eventSource.on(event_types.GENERATION_ENDED, clearExtensionPrompt);
|
||||
eventSource.on(event_types.GENERATION_STARTED, boundHandlers.genStarted);
|
||||
eventSource.on(event_types.GENERATION_STOPPED, boundHandlers.genStopped);
|
||||
eventSource.on(event_types.GENERATION_ENDED, boundHandlers.genEnded);
|
||||
}
|
||||
|
||||
function unregisterEvents() {
|
||||
CacheRegistry.unregister(MODULE_ID);
|
||||
eventsRegistered = false;
|
||||
|
||||
eventSource.off(event_types.CHAT_CHANGED, boundHandlers.chatChanged);
|
||||
eventSource.off(event_types.MESSAGE_DELETED, boundHandlers.messageDeleted);
|
||||
eventSource.off(event_types.MESSAGE_RECEIVED, boundHandlers.messageReceived);
|
||||
eventSource.off(event_types.MESSAGE_SENT, boundHandlers.messageSent);
|
||||
eventSource.off(event_types.MESSAGE_SENT, boundHandlers.messageSentRecall);
|
||||
eventSource.off(event_types.MESSAGE_SWIPED, boundHandlers.messageSwiped);
|
||||
eventSource.off(event_types.MESSAGE_UPDATED, boundHandlers.messageUpdated);
|
||||
eventSource.off(event_types.MESSAGE_EDITED, boundHandlers.messageEdited);
|
||||
eventSource.off(event_types.USER_MESSAGE_RENDERED, boundHandlers.userRendered);
|
||||
eventSource.off(event_types.CHARACTER_MESSAGE_RENDERED, boundHandlers.charRendered);
|
||||
eventSource.off(event_types.GENERATION_STARTED, boundHandlers.genStarted);
|
||||
eventSource.off(event_types.GENERATION_STOPPED, boundHandlers.genStopped);
|
||||
eventSource.off(event_types.GENERATION_ENDED, boundHandlers.genEnded);
|
||||
|
||||
$(".xiaobaix-story-summary-btn").remove();
|
||||
hideOverlay();
|
||||
|
||||
|
||||
@@ -28,17 +28,17 @@ export async function rerank(query, documents, options = {}) {
|
||||
|
||||
if (!query?.trim()) {
|
||||
xbLog.warn(MODULE_ID, 'query 为空,跳过 rerank');
|
||||
return documents.map((_, i) => ({ index: i, relevance_score: 0.5 }));
|
||||
return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
|
||||
}
|
||||
|
||||
if (!documents?.length) {
|
||||
return [];
|
||||
return { results: [], failed: false };
|
||||
}
|
||||
|
||||
const key = getApiKey();
|
||||
if (!key) {
|
||||
xbLog.warn(MODULE_ID, '未配置 API Key,跳过 rerank');
|
||||
return documents.map((_, i) => ({ index: i, relevance_score: 0.5 }));
|
||||
return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
|
||||
}
|
||||
|
||||
// 截断超长文档列表
|
||||
@@ -61,7 +61,7 @@ export async function rerank(query, documents, options = {}) {
|
||||
|
||||
if (!validDocs.length) {
|
||||
xbLog.warn(MODULE_ID, '无有效文档,跳过 rerank');
|
||||
return [];
|
||||
return { results: [], failed: false };
|
||||
}
|
||||
|
||||
const controller = new AbortController();
|
||||
@@ -106,7 +106,7 @@ export async function rerank(query, documents, options = {}) {
|
||||
const elapsed = Math.round(performance.now() - T0);
|
||||
xbLog.info(MODULE_ID, `Rerank 完成: ${validDocs.length} docs → ${results.length} selected (${elapsed}ms)`);
|
||||
|
||||
return mapped;
|
||||
return { results: mapped, failed: false };
|
||||
|
||||
} catch (e) {
|
||||
clearTimeout(timeoutId);
|
||||
@@ -118,10 +118,13 @@ export async function rerank(query, documents, options = {}) {
|
||||
}
|
||||
|
||||
// 降级:返回原顺序,分数均匀分布
|
||||
return documents.slice(0, topN).map((_, i) => ({
|
||||
index: i,
|
||||
relevance_score: 1 - (i / documents.length) * 0.5,
|
||||
}));
|
||||
return {
|
||||
results: documents.slice(0, topN).map((_, i) => ({
|
||||
index: i,
|
||||
relevance_score: 0,
|
||||
})),
|
||||
failed: true,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,30 +141,38 @@ export async function rerankChunks(query, chunks, options = {}) {
|
||||
|
||||
if (!chunks?.length) return [];
|
||||
if (chunks.length <= topN) {
|
||||
// 数量不超限,仍然 rerank 以获取分数,但不过滤
|
||||
const texts = chunks.map(c => c.text || c.semantic || '');
|
||||
const results = await rerank(query, texts, { topN: chunks.length, ...options });
|
||||
|
||||
const { results, failed } = await rerank(query, texts, { topN: chunks.length, ...options });
|
||||
|
||||
if (failed) {
|
||||
return chunks.map(c => ({ ...c, _rerankScore: 0, _rerankFailed: true }));
|
||||
}
|
||||
|
||||
const scoreMap = new Map(results.map(r => [r.index, r.relevance_score]));
|
||||
return chunks.map((c, i) => ({
|
||||
...c,
|
||||
_rerankScore: scoreMap.get(i) ?? 0.5,
|
||||
_rerankScore: scoreMap.get(i) ?? 0,
|
||||
})).sort((a, b) => b._rerankScore - a._rerankScore);
|
||||
}
|
||||
|
||||
const texts = chunks.map(c => c.text || c.semantic || '');
|
||||
const results = await rerank(query, texts, { topN, ...options });
|
||||
const { results, failed } = await rerank(query, texts, { topN, ...options });
|
||||
|
||||
// 过滤低分 + 排序
|
||||
const selected = results
|
||||
if (failed) {
|
||||
return chunks.slice(0, topN).map(c => ({
|
||||
...c,
|
||||
_rerankScore: 0,
|
||||
_rerankFailed: true,
|
||||
}));
|
||||
}
|
||||
|
||||
return results
|
||||
.filter(r => r.relevance_score >= minScore)
|
||||
.sort((a, b) => b.relevance_score - a.relevance_score)
|
||||
.map(r => ({
|
||||
...chunks[r.index],
|
||||
_rerankScore: r.relevance_score,
|
||||
}));
|
||||
|
||||
return selected;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -174,7 +185,7 @@ export async function testRerankService() {
|
||||
}
|
||||
|
||||
try {
|
||||
const results = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 });
|
||||
const { results } = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 });
|
||||
return {
|
||||
success: true,
|
||||
message: `连接成功,返回 ${results.length} 个结果`,
|
||||
|
||||
@@ -86,6 +86,7 @@ export function createMetrics() {
|
||||
l0Candidates: 0, // W-RRF 融合后的 L0 候选数
|
||||
l0Selected: 0, // rerank 后选中的 L0 数
|
||||
rerankApplied: false,
|
||||
rerankFailed: false,
|
||||
beforeRerank: 0,
|
||||
afterRerank: 0,
|
||||
rerankTime: 0,
|
||||
@@ -283,6 +284,9 @@ export function formatMetricsLog(metrics) {
|
||||
|
||||
if (m.evidence.rerankApplied) {
|
||||
lines.push(`│ ├─ rerank_applied: true`);
|
||||
if (m.evidence.rerankFailed) {
|
||||
lines.push(`│ ├─ rerank_failed: ⚠ YES (using fusion order)`);
|
||||
}
|
||||
lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank}`);
|
||||
lines.push(`│ │ ├─ after: ${m.evidence.afterRerank}`);
|
||||
lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`);
|
||||
@@ -489,6 +493,10 @@ export function detectIssues(metrics) {
|
||||
// L1 挂载问题
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
|
||||
if (m.evidence.rerankFailed) {
|
||||
issues.push('Rerank API failed — using fusion rank order as fallback, relevance scores are zero');
|
||||
}
|
||||
|
||||
if (m.evidence.l0Selected > 0 && m.evidence.l1Pulled === 0) {
|
||||
issues.push('Zero L1 chunks pulled - L1 vectors may not exist or DB read failed');
|
||||
}
|
||||
|
||||
@@ -631,6 +631,7 @@ async function locateAndPullEvidence(anchorHits, anchorFloors, queryVector, rera
|
||||
metrics.evidence.rerankApplied = true;
|
||||
metrics.evidence.beforeRerank = rerankCandidates.length;
|
||||
metrics.evidence.afterRerank = rerankedL0.length;
|
||||
metrics.evidence.rerankFailed = rerankedL0.some(c => c._rerankFailed);
|
||||
metrics.evidence.l0Selected = rerankedL0.length;
|
||||
metrics.evidence.rerankTime = rerankTime;
|
||||
metrics.timing.evidenceRerank = rerankTime;
|
||||
|
||||
@@ -49,6 +49,9 @@ let jiebaCut = null;
|
||||
/** @type {Function|null} jieba add_word 函数引用 */
|
||||
let jiebaAddWord = null;
|
||||
|
||||
/** @type {object|null} TinySegmenter 实例 */
|
||||
let tinySegmenter = null;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 实体词典
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
@@ -76,12 +79,13 @@ const STOP_WORDS = new Set([
|
||||
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
|
||||
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
|
||||
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
|
||||
// 日语助词 + 常见虚词
|
||||
'は', 'が', 'を', 'に', 'で', 'と', 'の', 'も', 'へ', 'や',
|
||||
'か', 'な', 'よ', 'ね', 'わ', 'だ', 'です', 'ます', 'た', 'て',
|
||||
'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない',
|
||||
// 日语常见虚词(≥2字,匹配 TinySegmenter 产出粒度)
|
||||
'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある',
|
||||
'なる', 'れる', 'られ', 'られる',
|
||||
'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
|
||||
'これ', 'それ', 'あれ', 'どれ',
|
||||
'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
|
||||
'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
|
||||
// 英文常见停用词
|
||||
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
|
||||
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
|
||||
@@ -100,6 +104,34 @@ const STOP_WORDS = new Set([
|
||||
// Unicode 分类
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Tests whether a code point is kana (hiragana or katakana).
 *
 * @param {number} code - Unicode code point (e.g. the result of `codePointAt(0)`)
 * @returns {boolean} true when `code` falls inside one of the kana ranges below
 */
function isKana(code) {
    return (
        (code >= 0x3040 && code <= 0x309F) || // Hiragana
        (code >= 0x30A0 && code <= 0x30FF) || // Katakana
        (code >= 0x31F0 && code <= 0x31FF) || // Katakana Phonetic Extensions
        (code >= 0xFF65 && code <= 0xFF9F)    // Halfwidth Katakana
    );
}
|
||||
|
||||
/**
 * Tests whether a code point is a CJK ideograph (kana are NOT included).
 *
 * @param {number} code - Unicode code point. Pass a full code point
 *   (`codePointAt`), not a UTF-16 unit (`charCodeAt`): the Extension B range
 *   lies above 0xFFFF and can never match a single UTF-16 code unit.
 * @returns {boolean} true when `code` falls inside one of the ideograph ranges below
 */
function isCJK(code) {
    return (
        (code >= 0x4E00 && code <= 0x9FFF) ||   // CJK Unified Ideographs
        (code >= 0x3400 && code <= 0x4DBF) ||   // CJK Extension A
        (code >= 0xF900 && code <= 0xFAFF) ||   // CJK Compatibility Ideographs
        (code >= 0x20000 && code <= 0x2A6DF)    // CJK Extension B
    );
}
|
||||
|
||||
/**
|
||||
* 判断字符是否为亚洲文字(CJK + 假名)
|
||||
* @param {number} code - charCode
|
||||
@@ -107,14 +139,7 @@ const STOP_WORDS = new Set([
|
||||
*/
|
||||
function isAsian(code) {
    // Asian script = CJK ideograph or kana. The range tables live in
    // isCJK / isKana so the three predicates cannot drift apart.
    return isCJK(code) || isKana(code);
}
|
||||
|
||||
@@ -195,6 +220,31 @@ function segmentByScript(text) {
|
||||
return segments;
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 亚洲文字语言检测(中文 vs 日语)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Guesses the language of an Asian-script text segment.
 *
 * Characters are classified as kana or CJK ideographs; everything else is
 * ignored. When kana make up more than 30% of the classified characters the
 * segment is treated as Japanese (per the original author's note, Japanese
 * text usually contains 40-60% kana); otherwise it is treated as Chinese.
 *
 * @param {string} text - Asian-script segment
 * @returns {'zh'|'ja'|'other'} 'other' when the segment contains neither kana
 *   nor CJK ideographs (including empty or missing input)
 */
function detectAsianLanguage(text) {
    // Nothing to classify; also avoids iterating over null/undefined.
    if (!text) return 'other';

    const KANA_RATIO_THRESHOLD = 0.3;

    let kanaCount = 0;
    let cjkCount = 0;
    // for...of walks code points, so astral ideographs are counted once.
    for (const ch of text) {
        const code = ch.codePointAt(0);
        if (isKana(code)) {
            kanaCount++;
        } else if (isCJK(code)) {
            cjkCount++;
        }
    }

    const total = kanaCount + cjkCount;
    if (total === 0) return 'other';
    return (kanaCount / total) > KANA_RATIO_THRESHOLD ? 'ja' : 'zh';
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 实体保护(最长匹配占位符替换)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
@@ -343,6 +393,26 @@ function tokenizeAsianFallback(text) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
/**
 * Tokenizes a Japanese text segment with TinySegmenter.
 *
 * Falls back to `tokenizeAsianFallback` when the segmenter has not been
 * loaded or when segmentation throws.
 *
 * @param {string} text - Japanese text segment
 * @returns {string[]} trimmed tokens; tokens shorter than 2 characters are dropped
 */
function tokenizeJapanese(text) {
    // Segmenter not loaded (yet, or load failed): use the generic fallback.
    if (!tinySegmenter) {
        return tokenizeAsianFallback(text);
    }

    try {
        return tinySegmenter
            .segment(text)
            .map(w => String(w || '').trim())
            // Keep only tokens of length >= 2.
            .filter(w => w.length >= 2);
    } catch (e) {
        xbLog.warn(MODULE_ID, 'TinySegmenter 分词异常,降级处理', e);
        return tokenizeAsianFallback(text);
    }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 分词:拉丁文字
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
@@ -374,6 +444,9 @@ function tokenizeLatin(text) {
|
||||
* @returns {Promise<boolean>} 是否加载成功
|
||||
*/
|
||||
export async function preload() {
|
||||
// TinySegmenter 独立于结巴状态(内部有防重入)
|
||||
loadTinySegmenter();
|
||||
|
||||
// 已就绪
|
||||
if (wasmState === WasmState.READY) return true;
|
||||
|
||||
@@ -443,6 +516,25 @@ export async function preload() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Loads TinySegmenter on demand and stores the instance in `tinySegmenter`.
 *
 * Does nothing when an instance already exists. Any failure (import error,
 * missing export, constructor error) is logged as a warning and swallowed,
 * leaving `tinySegmenter` unset.
 *
 * @returns {Promise<void>} resolves once the load attempt has finished; never rejects
 */
async function loadTinySegmenter() {
    if (tinySegmenter) return;

    try {
        // eslint-disable-next-line no-unsanitized/method
        const mod = await import(
            `/${extensionFolderPath}/libs/tiny-segmenter.js`
        );
        // Accept either a named or a default export of the constructor.
        const Ctor = mod.TinySegmenter || mod.default;
        if (typeof Ctor !== 'function') {
            // Surface a clear reason instead of an opaque "Ctor is not a constructor".
            throw new TypeError('tiny-segmenter.js does not export a TinySegmenter constructor');
        }
        tinySegmenter = new Ctor();
        xbLog.info(MODULE_ID, 'TinySegmenter 加载完成');
    } catch (e) {
        xbLog.warn(MODULE_ID, 'TinySegmenter 加载失败,日语将使用降级分词', e);
    }
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// 公开接口:isReady
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
@@ -598,7 +690,10 @@ function tokenizeCore(text) {
|
||||
const rawTokens = [];
|
||||
for (const seg of segments) {
|
||||
if (seg.type === 'asian') {
|
||||
if (wasmState === WasmState.READY && jiebaCut) {
|
||||
const lang = detectAsianLanguage(seg.text);
|
||||
if (lang === 'ja') {
|
||||
rawTokens.push(...tokenizeJapanese(seg.text));
|
||||
} else if (wasmState === WasmState.READY && jiebaCut) {
|
||||
rawTokens.push(...tokenizeAsianJieba(seg.text));
|
||||
} else {
|
||||
rawTokens.push(...tokenizeAsianFallback(seg.text));
|
||||
|
||||
Reference in New Issue
Block a user