Prevent placeholder token leaks
This commit is contained in:
@@ -199,8 +199,8 @@ function segmentByScript(text) {
|
||||
// 实体保护(最长匹配占位符替换)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
// Placeholder boundaries use Unicode Private Use Area (PUA) characters rather than
// control characters, which can behave unpredictably inside the tokenizer.
// The prefix is a pure PUA sequence (no Latin letters) so that no part of a
// placeholder can leak into the tokenization result as an ordinary word.
const PLACEHOLDER_PREFIX = '\uE000\uE010';
const PLACEHOLDER_SUFFIX = '\uE001';
|
||||
|
||||
/**
|
||||
@@ -264,21 +264,25 @@ function maskEntities(text) {
|
||||
/**
 * Restores masked entities in a tokenizer's output.
 *
 * Each token is handled as follows:
 *  - a token that is exactly a known placeholder becomes the original entity;
 *  - a token without any PUA character (U+E000–U+E0FF) is kept unchanged;
 *  - a token that mixes PUA characters with other text (the tokenizer may glue a
 *    placeholder to neighbouring characters, or cut a placeholder apart) is
 *    scanned left to right: every complete placeholder is replaced by its
 *    entity, ordinary text is kept as separate tokens, and leftover PUA
 *    fragments are dropped so they never leak into the result.
 *
 * @param {string[]} tokens - Tokens produced from the masked text.
 * @param {Map<string, string>} entities - Map from placeholder string to the
 *   original entity text (presumably the map built by maskEntities — confirm
 *   against the caller).
 * @returns {string[]} Tokens with placeholders restored. When `entities` is
 *   empty the input array itself is returned.
 */
function unmaskTokens(tokens, entities) {
  if (!entities.size) return tokens;

  // Characters reserved for placeholders; a non-global regex, so `.test` is stateless.
  const puaChar = /[\uE000-\uE0FF]/;

  // Longest placeholder first, so a placeholder that happens to be a prefix of
  // another one can never shadow the longer match. Empty keys are skipped
  // because they would match everywhere without advancing the scan.
  const placeholders = [...entities.keys()]
    .filter((key) => key.length > 0)
    .sort((a, b) => b.length - a.length);

  return tokens.flatMap((token) => {
    // Fast path: the token is exactly one placeholder.
    if (entities.has(token)) {
      return [entities.get(token)];
    }

    // Ordinary token: keep as is.
    if (!puaChar.test(token)) {
      return [token];
    }

    const restored = [];
    let plain = '';
    const flushPlain = () => {
      if (plain) {
        restored.push(plain);
        plain = '';
      }
    };

    let pos = 0;
    while (pos < token.length) {
      const hit = placeholders.find((key) => token.startsWith(key, pos));
      if (hit !== undefined) {
        // Complete placeholder: emit the entity as its own token.
        flushPlain();
        restored.push(entities.get(hit));
        pos += hit.length;
        continue;
      }

      const ch = token[pos];
      if (puaChar.test(ch)) {
        // Stray PUA fragment: drop it, but treat it as a token boundary.
        flushPlain();
      } else {
        plain += ch;
      }
      pos += 1;
    }
    flushPlain();

    return restored;
  });
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user