Prevent placeholder token leaks
This commit is contained in:
@@ -199,8 +199,8 @@ function segmentByScript(text) {
|
|||||||
// Entity protection (longest-match placeholder substitution)
// ═══════════════════════════════════════════════════════════════════════════

// Placeholders are built purely from Unicode Private Use Area (PUA) characters,
// so no Latin letters from the placeholder itself can leak into the
// tokenizer's output.
const PLACEHOLDER_PREFIX = '\uE000\uE010';
const PLACEHOLDER_SUFFIX = '\uE001';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -264,21 +264,25 @@ function maskEntities(text) {
|
|||||||
/**
 * Restores masked entities in a list of tokens produced by the tokenizer.
 *
 * Each token is handled as follows:
 *   - a token that is exactly a placeholder becomes the entity it stands for;
 *   - a token without any PUA character (U+E000–U+E0FF) is kept unchanged;
 *   - a token that mixes placeholders, ordinary text and/or stray PUA
 *     characters is split: every complete placeholder becomes its own entity
 *     token, the ordinary text around it is kept (with PUA characters
 *     stripped), and pure PUA fragments are dropped so they never leak.
 *
 * @param {string[]} tokens - Tokens that may contain placeholders.
 * @param {Map<string, string>} entities - Maps each placeholder to the
 *     original entity it replaced.
 * @returns {string[]} Tokens with placeholders restored. When `entities` is
 *     empty the input array itself is returned.
 */
function unmaskTokens(tokens, entities) {
    if (!entities.size) return tokens;

    const puaChar = /[\uE000-\uE0FF]/;
    const puaRun = /[\uE000-\uE0FF]+/g;

    // Longest first, so a placeholder that happens to be a prefix of a longer
    // one can never pre-empt it. Empty keys are ignored: they would match at
    // every position without consuming any input.
    const placeholders = [...entities.keys()]
        .filter(key => key.length > 0)
        .sort((a, b) => b.length - a.length);

    return tokens.flatMap(token => {
        // Fast path: the token is exactly one placeholder.
        if (entities.has(token)) {
            return [entities.get(token)];
        }

        // Ordinary token, keep as is.
        if (!puaChar.test(token)) {
            return [token];
        }

        // The tokenizer glued a placeholder (or a fragment of one) to other
        // characters. Walk the token and split it into its parts.
        const restored = [];
        let pending = '';
        const flushPending = () => {
            // Leftover PUA characters are placeholder debris, never real text.
            const cleaned = pending.replace(puaRun, '');
            if (cleaned) restored.push(cleaned);
            pending = '';
        };

        let pos = 0;
        while (pos < token.length) {
            const hit = placeholders.find(p => token.startsWith(p, pos));
            if (hit !== undefined) {
                flushPending();
                restored.push(entities.get(hit));
                pos += hit.length;
            } else {
                pending += token[pos];
                pos += 1;
            }
        }
        flushPending();

        return restored;
    });
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user