Improve rerank failure handling and tokenizer JP support

This commit is contained in:
2026-02-10 17:52:09 +08:00
parent fbf34815bb
commit 062df60570
7 changed files with 655 additions and 285 deletions

View File

@@ -28,17 +28,17 @@ export async function rerank(query, documents, options = {}) {
if (!query?.trim()) {
xbLog.warn(MODULE_ID, 'query 为空,跳过 rerank');
return documents.map((_, i) => ({ index: i, relevance_score: 0.5 }));
return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
}
if (!documents?.length) {
return [];
return { results: [], failed: false };
}
const key = getApiKey();
if (!key) {
xbLog.warn(MODULE_ID, '未配置 API Key跳过 rerank');
return documents.map((_, i) => ({ index: i, relevance_score: 0.5 }));
return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
}
// 截断超长文档列表
@@ -61,7 +61,7 @@ export async function rerank(query, documents, options = {}) {
if (!validDocs.length) {
xbLog.warn(MODULE_ID, '无有效文档,跳过 rerank');
return [];
return { results: [], failed: false };
}
const controller = new AbortController();
@@ -106,7 +106,7 @@ export async function rerank(query, documents, options = {}) {
const elapsed = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID, `Rerank 完成: ${validDocs.length} docs → ${results.length} selected (${elapsed}ms)`);
return mapped;
return { results: mapped, failed: false };
} catch (e) {
clearTimeout(timeoutId);
@@ -118,10 +118,13 @@ export async function rerank(query, documents, options = {}) {
}
// 降级:返回原顺序,分数均匀分布
return documents.slice(0, topN).map((_, i) => ({
index: i,
relevance_score: 1 - (i / documents.length) * 0.5,
}));
return {
results: documents.slice(0, topN).map((_, i) => ({
index: i,
relevance_score: 0,
})),
failed: true,
};
}
}
@@ -138,30 +141,38 @@ export async function rerankChunks(query, chunks, options = {}) {
if (!chunks?.length) return [];
if (chunks.length <= topN) {
// 数量不超限,仍然 rerank 以获取分数,但不过滤
const texts = chunks.map(c => c.text || c.semantic || '');
const results = await rerank(query, texts, { topN: chunks.length, ...options });
const { results, failed } = await rerank(query, texts, { topN: chunks.length, ...options });
if (failed) {
return chunks.map(c => ({ ...c, _rerankScore: 0, _rerankFailed: true }));
}
const scoreMap = new Map(results.map(r => [r.index, r.relevance_score]));
return chunks.map((c, i) => ({
...c,
_rerankScore: scoreMap.get(i) ?? 0.5,
_rerankScore: scoreMap.get(i) ?? 0,
})).sort((a, b) => b._rerankScore - a._rerankScore);
}
const texts = chunks.map(c => c.text || c.semantic || '');
const results = await rerank(query, texts, { topN, ...options });
const { results, failed } = await rerank(query, texts, { topN, ...options });
// 过滤低分 + 排序
const selected = results
if (failed) {
return chunks.slice(0, topN).map(c => ({
...c,
_rerankScore: 0,
_rerankFailed: true,
}));
}
return results
.filter(r => r.relevance_score >= minScore)
.sort((a, b) => b.relevance_score - a.relevance_score)
.map(r => ({
...chunks[r.index],
_rerankScore: r.relevance_score,
}));
return selected;
}
/**
@@ -174,7 +185,7 @@ export async function testRerankService() {
}
try {
const results = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 });
const { results } = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 });
return {
success: true,
message: `连接成功,返回 ${results.length} 个结果`,

View File

@@ -86,6 +86,7 @@ export function createMetrics() {
l0Candidates: 0, // W-RRF 融合后的 L0 候选数
l0Selected: 0, // rerank 后选中的 L0 数
rerankApplied: false,
rerankFailed: false,
beforeRerank: 0,
afterRerank: 0,
rerankTime: 0,
@@ -283,6 +284,9 @@ export function formatMetricsLog(metrics) {
if (m.evidence.rerankApplied) {
lines.push(`│ ├─ rerank_applied: true`);
if (m.evidence.rerankFailed) {
lines.push(`│ ├─ rerank_failed: ⚠ YES (using fusion order)`);
}
lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank}`);
lines.push(`│ │ ├─ after: ${m.evidence.afterRerank}`);
lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`);
@@ -489,6 +493,10 @@ export function detectIssues(metrics) {
// L1 挂载问题
// ─────────────────────────────────────────────────────────────────
if (m.evidence.rerankFailed) {
issues.push('Rerank API failed — using fusion rank order as fallback, relevance scores are zero');
}
if (m.evidence.l0Selected > 0 && m.evidence.l1Pulled === 0) {
issues.push('Zero L1 chunks pulled - L1 vectors may not exist or DB read failed');
}

View File

@@ -631,6 +631,7 @@ async function locateAndPullEvidence(anchorHits, anchorFloors, queryVector, rera
metrics.evidence.rerankApplied = true;
metrics.evidence.beforeRerank = rerankCandidates.length;
metrics.evidence.afterRerank = rerankedL0.length;
metrics.evidence.rerankFailed = rerankedL0.some(c => c._rerankFailed);
metrics.evidence.l0Selected = rerankedL0.length;
metrics.evidence.rerankTime = rerankTime;
metrics.timing.evidenceRerank = rerankTime;

View File

@@ -49,6 +49,9 @@ let jiebaCut = null;
/** @type {Function|null} jieba add_word 函数引用 */
let jiebaAddWord = null;
/** @type {object|null} TinySegmenter 实例 */
let tinySegmenter = null;
// ═══════════════════════════════════════════════════════════════════════════
// 实体词典
// ═══════════════════════════════════════════════════════════════════════════
@@ -76,12 +79,13 @@ const STOP_WORDS = new Set([
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
// 日语助词 + 常见虚词
'は', 'が', 'を', '', '', '', '', '', '', '',
'か', 'な', 'よ', '', '', 'だ', 'です', 'ます', 'た', 'て',
'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない',
// 日语常见虚词≥2字匹配 TinySegmenter 产出粒度)
'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある',
'なる', 'れる', 'られ', 'られる',
'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
'これ', 'それ', 'あれ', 'どれ',
'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
// 英文常见停用词
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
@@ -100,6 +104,34 @@ const STOP_WORDS = new Set([
// Unicode 分类
// ═══════════════════════════════════════════════════════════════════════════
/**
* 判断字符是否为假名(平假名 + 片假名)
* @param {number} code - charCode
* @returns {boolean}
*/
/**
 * Test whether a Unicode code point belongs to a kana script
 * (hiragana, katakana, katakana phonetic extensions, halfwidth katakana).
 * @param {number} code - Unicode code point
 * @returns {boolean}
 */
function isKana(code) {
  if (code >= 0x3040 && code <= 0x309F) return true; // Hiragana
  if (code >= 0x30A0 && code <= 0x30FF) return true; // Katakana
  if (code >= 0x31F0 && code <= 0x31FF) return true; // Katakana Phonetic Extensions
  return code >= 0xFF65 && code <= 0xFF9F;           // Halfwidth Katakana
}
/**
* 判断字符是否为 CJK 汉字(不含假名)
* @param {number} code - charCode
* @returns {boolean}
*/
/**
 * Test whether a Unicode code point is a CJK ideograph (kana excluded).
 *
 * Note: the Extension B range (0x20000+) lies outside the BMP, so callers
 * must pass full code points (codePointAt), not UTF-16 charCodes.
 *
 * @param {number} code - Unicode code point
 * @returns {boolean}
 */
function isCJK(code) {
  const ranges = [
    [0x4E00, 0x9FFF],   // CJK Unified Ideographs
    [0x3400, 0x4DBF],   // Extension A
    [0xF900, 0xFAFF],   // Compatibility Ideographs
    [0x20000, 0x2A6DF], // Extension B (astral plane)
  ];
  return ranges.some(([lo, hi]) => code >= lo && code <= hi);
}
/**
* 判断字符是否为亚洲文字CJK + 假名)
* @param {number} code - charCode
@@ -107,14 +139,7 @@ const STOP_WORDS = new Set([
*/
function isAsian(code) {
return (
(code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs
(code >= 0x3400 && code <= 0x4DBF) || // CJK Extension A
(code >= 0x3040 && code <= 0x309F) || // Hiragana
(code >= 0x30A0 && code <= 0x30FF) || // Katakana
(code >= 0x31F0 && code <= 0x31FF) || // Katakana Phonetic Extensions
(code >= 0xFF65 && code <= 0xFF9F) || // Halfwidth Katakana
(code >= 0xF900 && code <= 0xFAFF) || // CJK Compatibility Ideographs
(code >= 0x20000 && code <= 0x2A6DF) // CJK Extension B
isCJK(code) || isKana(code)
);
}
@@ -195,6 +220,31 @@ function segmentByScript(text) {
return segments;
}
// ═══════════════════════════════════════════════════════════════════════════
// 亚洲文字语言检测(中文 vs 日语)
// ═══════════════════════════════════════════════════════════════════════════
/**
* 检测亚洲文字段的语言
*
* 假名占比 > 30% 判定为日语(日语文本中假名通常占 40-60%
*
* @param {string} text - 亚洲文字段
* @returns {'zh'|'ja'|'other'}
*/
/**
 * Detect the language of an Asian-script text segment.
 *
 * Japanese prose is typically 40-60% kana, so when kana make up more
 * than 30% of the counted kana+ideograph characters the segment is
 * classified as Japanese; otherwise as Chinese.
 *
 * @param {string} text - Asian-script segment
 * @returns {'zh'|'ja'|'other'} 'other' when no kana/ideographs were counted
 */
function detectAsianLanguage(text) {
  let kana = 0;
  let han = 0;
  for (const ch of text) {
    const cp = ch.codePointAt(0);
    if (isKana(cp)) {
      kana += 1;
    } else if (isCJK(cp)) {
      han += 1;
    }
  }
  const counted = kana + han;
  if (counted === 0) return 'other';
  return (kana / counted) > 0.3 ? 'ja' : 'zh';
}
// ═══════════════════════════════════════════════════════════════════════════
// 实体保护(最长匹配占位符替换)
// ═══════════════════════════════════════════════════════════════════════════
@@ -343,6 +393,26 @@ function tokenizeAsianFallback(text) {
return tokens;
}
/**
* 用 TinySegmenter 处理日语文字段
* @param {string} text
* @returns {string[]}
*/
/**
 * Segment a Japanese text run with TinySegmenter.
 *
 * Falls back to tokenizeAsianFallback when the segmenter has not been
 * loaded yet or throws. Tokens shorter than 2 characters are dropped,
 * matching the granularity of the Japanese stop-word list.
 *
 * @param {string} text - Japanese text segment
 * @returns {string[]} tokens of length >= 2
 */
function tokenizeJapanese(text) {
  if (!tinySegmenter) {
    return tokenizeAsianFallback(text);
  }
  try {
    const pieces = tinySegmenter.segment(text);
    const tokens = [];
    for (const piece of pieces) {
      const word = String(piece || '').trim();
      if (word.length >= 2) {
        tokens.push(word);
      }
    }
    return tokens;
  } catch (e) {
    xbLog.warn(MODULE_ID, 'TinySegmenter 分词异常,降级处理', e);
    return tokenizeAsianFallback(text);
  }
}
// ═══════════════════════════════════════════════════════════════════════════
// 分词:拉丁文字
// ═══════════════════════════════════════════════════════════════════════════
@@ -374,6 +444,9 @@ function tokenizeLatin(text) {
* @returns {Promise<boolean>} 是否加载成功
*/
export async function preload() {
// TinySegmenter 独立于结巴状态(内部有防重入)
loadTinySegmenter();
// 已就绪
if (wasmState === WasmState.READY) return true;
@@ -443,6 +516,25 @@ export async function preload() {
}
}
/**
* 加载 TinySegmenter懒加载不阻塞
*/
/**
 * Lazy-load TinySegmenter (non-blocking; on failure Japanese tokenization
 * degrades to the fallback segmenter).
 *
 * Re-entry guard: `tinySegmenter` is only assigned after the await, so a
 * bare instance check does not stop concurrent callers from starting
 * duplicate dynamic imports. While a load is in flight, its promise is
 * cached on the function and shared by all callers; it is cleared on
 * completion so a failed attempt can be retried later.
 *
 * @returns {Promise<void>}
 */
async function loadTinySegmenter() {
  if (tinySegmenter) return;
  if (loadTinySegmenter._inflight) {
    await loadTinySegmenter._inflight;
    return;
  }
  loadTinySegmenter._inflight = (async () => {
    try {
      // eslint-disable-next-line no-unsanitized/method
      const mod = await import(
        `/${extensionFolderPath}/libs/tiny-segmenter.js`
      );
      const Ctor = mod.TinySegmenter || mod.default;
      tinySegmenter = new Ctor();
      xbLog.info(MODULE_ID, 'TinySegmenter 加载完成');
    } catch (e) {
      xbLog.warn(MODULE_ID, 'TinySegmenter 加载失败,日语将使用降级分词', e);
    } finally {
      // Allow a later retry if this attempt failed to set tinySegmenter.
      loadTinySegmenter._inflight = null;
    }
  })();
  await loadTinySegmenter._inflight;
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口isReady
// ═══════════════════════════════════════════════════════════════════════════
@@ -598,7 +690,10 @@ function tokenizeCore(text) {
const rawTokens = [];
for (const seg of segments) {
if (seg.type === 'asian') {
if (wasmState === WasmState.READY && jiebaCut) {
const lang = detectAsianLanguage(seg.text);
if (lang === 'ja') {
rawTokens.push(...tokenizeJapanese(seg.text));
} else if (wasmState === WasmState.READY && jiebaCut) {
rawTokens.push(...tokenizeAsianJieba(seg.text));
} else {
rawTokens.push(...tokenizeAsianFallback(seg.text));