Prevent placeholder token leaks

This commit is contained in:
2026-02-09 21:24:32 +08:00
parent 8edb0cf006
commit da1e3088eb

View File

@@ -199,8 +199,8 @@ function segmentByScript(text) {
// 实体保护(最长匹配占位符替换) // 实体保护(最长匹配占位符替换)
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// 使用 Unicode Private Use Area (PUA) 字符作为边界,避免控制字符在分词器中产生不可控行为 // 使用 PUA 字符序列作为占位符,避免拉丁字母泄漏到分词结果
const PLACEHOLDER_PREFIX = '\uE000ENT_'; const PLACEHOLDER_PREFIX = '\uE000\uE010';
const PLACEHOLDER_SUFFIX = '\uE001'; const PLACEHOLDER_SUFFIX = '\uE001';
/** /**
@@ -264,21 +264,25 @@ function maskEntities(text) {
function unmaskTokens(tokens, entities) { function unmaskTokens(tokens, entities) {
if (!entities.size) return tokens; if (!entities.size) return tokens;
return tokens.map(token => { return tokens.flatMap(token => {
// token 本身就是一个占位符 // token 本身就是一个完整占位符
if (entities.has(token)) { if (entities.has(token)) {
return entities.get(token); return [entities.get(token)];
} }
// token 中包含占位符(结巴可能把占位符和其他字符连在一起) // token 中包含 PUA 字符 → 检查是否包含完整占位符
let result = token; if (/[\uE000-\uE0FF]/.test(token)) {
for (const [placeholder, original] of entities) { for (const [placeholder, original] of entities) {
if (result.includes(placeholder)) { if (token.includes(placeholder)) {
result = result.replace(placeholder, original); return [original];
}
} }
// 纯 PUA 碎片,丢弃
return [];
} }
return result; // 普通 token,原样保留
return [token];
}); });
} }