// Post-patch state of modules/story-summary/vector/utils/tokenizer.js
// (hunks at original lines 199 and 264).

// ═══════════════════════════════════════════════════════════════════════════
// Entity protection (longest-match placeholder replacement)
// ═══════════════════════════════════════════════════════════════════════════

// Placeholders are delimited by Private Use Area (PUA) characters only, so no
// Latin letters from the placeholder itself can leak into tokenizer output.
const PLACEHOLDER_PREFIX = '\uE000\uE010';
const PLACEHOLDER_SUFFIX = '\uE001';

// Matches any character from the PUA range used for placeholder delimiters.
// Deliberately not global: `.test()` must stay stateless between calls.
const PLACEHOLDER_PUA_CHAR = /[\uE000-\uE0FF]/;

/**
 * Splits a single token that contains PUA characters into restored pieces.
 *
 * Every complete placeholder found in the token is replaced by its original
 * entity text and emitted as its own piece. Text between placeholders is kept
 * as a separate piece only when it is non-empty and free of PUA characters;
 * anything still carrying PUA characters is a broken placeholder fragment and
 * is discarded.
 *
 * @param {string} token - Token known to contain at least one PUA character.
 * @param {Map<string, string>} entities - Placeholder → original entity text.
 * @returns {string[]} Restored pieces in their original left-to-right order.
 */
function expandMaskedToken(token, entities) {
    const pieces = [];
    const keepPlain = (text) => {
        if (text && !PLACEHOLDER_PUA_CHAR.test(text)) {
            pieces.push(text);
        }
    };

    let rest = token;
    while (rest) {
        // Find the left-most placeholder; on a tie prefer the longest key.
        let hitAt = -1;
        let hitKey = '';
        for (const placeholder of entities.keys()) {
            // An empty key would match at index 0 forever and never shrink `rest`.
            if (typeof placeholder !== 'string' || placeholder.length === 0) continue;
            const at = rest.indexOf(placeholder);
            if (at === -1) continue;
            const isBetter =
                hitAt === -1 ||
                at < hitAt ||
                (at === hitAt && placeholder.length > hitKey.length);
            if (isBetter) {
                hitAt = at;
                hitKey = placeholder;
            }
        }

        if (hitAt === -1) {
            keepPlain(rest);
            break;
        }

        keepPlain(rest.slice(0, hitAt));
        pieces.push(entities.get(hitKey));
        rest = rest.slice(hitAt + hitKey.length);
    }

    return pieces;
}

/**
 * Restores original entity text in a token list produced from masked text.
 *
 * - A token that is exactly a placeholder becomes the original entity.
 * - A token that contains PUA characters is split around every complete
 *   placeholder it holds: each entity becomes its own token and adjacent
 *   ordinary text is preserved as separate tokens (the tokenizer may glue a
 *   placeholder to neighbouring characters). Leftover PUA fragments are dropped.
 * - Any other token is passed through unchanged.
 *
 * @param {string[]} tokens - Tokens produced by the tokenizer.
 * @param {Map<string, string>} entities - Placeholder → original entity text.
 * @returns {string[]} Tokens with placeholders restored; the input array itself
 *   is returned when `entities` is empty.
 */
function unmaskTokens(tokens, entities) {
    if (!entities.size) return tokens;

    return tokens.flatMap((token) => {
        // The token is exactly one complete placeholder.
        if (entities.has(token)) {
            return [entities.get(token)];
        }

        // Ordinary token: keep as is.
        if (!PLACEHOLDER_PUA_CHAR.test(token)) {
            return [token];
        }

        // Token carries PUA characters: restore every complete placeholder,
        // keep clean neighbouring text, drop broken fragments.
        return expandMaskedToken(token, entities);
    });
}