diff --git a/libs/jieba-wasm/LICENSE b/libs/jieba-wasm/LICENSE new file mode 100644 index 0000000..5a792c4 --- /dev/null +++ b/libs/jieba-wasm/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2018 fengkx + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/libs/jieba-wasm/README.md b/libs/jieba-wasm/README.md new file mode 100644 index 0000000..a438327 --- /dev/null +++ b/libs/jieba-wasm/README.md @@ -0,0 +1,134 @@ +# jieba-wasm + +> [jieba-rs](https://github.com/messense/jieba-rs) 的 wasm binding + +_编译成 WASM 摆脱编译 Node Addon 的烦恼_ + +# Usage +## Node.js +```js +const { + cut, + cut_all, + cut_for_search, + tokenize, + add_word, +} = require("jieba-wasm"); +cut("中华人民共和国武汉市长江大桥", true); +// [ '中华人民共和国', '武汉市', '长江大桥' ] +cut_all("中华人民共和国武汉市长江大桥", true); +/* +[ + '中', '中华', + '中华人民', '中华人民共和国', + '华', '华人', + '人', '人民', + '人民共和国', '民', + '共', '共和', + '共和国', '和', + '国', '武', + '武汉', '武汉市', + '汉', '市', + '市长', '长', + '长江', '长江大桥', + '江', '大', + '大桥', '桥' +] +*/ +cut_for_search("中华人民共和国武汉市长江大桥", true); +/* +[ + '中华', '华人', + '人民', '共和', + '共和国', '中华人民共和国', + '武汉', '武汉市', + '长江', '大桥', + '长江大桥' +] +*/ +tokenize("中华人民共和国武汉市长江大桥", "default", true); +/* +[ + { word: '中华人民共和国', start: 0, end: 7 }, + { word: '武汉市', start: 7, end: 10 }, + { word: '长江大桥', start: 10, end: 14 } +] +*/ +tokenize("中华人民共和国武汉市长江大桥", "search", true); +/* +[ + { word: '中华', start: 0, end: 2 }, + { word: '华人', start: 1, end: 3 }, + { word: '人民', start: 2, end: 4 }, + { word: '共和', start: 4, end: 6 }, + { word: '共和国', start: 4, end: 7 }, + { word: '中华人民共和国', start: 0, end: 7 }, + { word: '武汉', start: 7, end: 9 }, + { word: '武汉市', start: 7, end: 10 }, + { word: '长江', start: 10, end: 12 }, + { word: '大桥', start: 12, end: 14 }, + { word: '长江大桥', start: 10, end: 14 } +] +*/ + +cut("桥大江长市汉武的省北湖国和共民人华中"); +/* +[ + '桥', '大江', '长', + '市', '汉', '武', + '的', '省', '北湖', + '国', '和', '共', + '民', '人', '华中' +] +*/ +["桥大江长", "市汉武", "省北湖", "国和共民人华中"].map((word) => { + add_word(word); +}); +cut("桥大江长市汉武的省北湖国和共民人华中"); +// ["桥大江长", "市汉武", "的", "省北湖", "国和共民人华中"]; + +with_dict("自动借书机 1 n"); // 导入自定义字典,词条格式:词语 词频 词性(可选),以换行符分隔 +cut("你好我是一个自动借书机"); +// ["你好", "我", "是", "一个", "自动借书机"]; +``` + +## Browser +```ts +import init, { cut } from 'jieba-wasm'; + +// 重要:使用前必须初始化 
+await init(); + +cut("中华人民共和国武汉市长江大桥", true); +// [ '中华人民共和国', '武汉市', '长江大桥' ] +``` + +# 示例 Demo + +## 安装依赖 + +安装 wasm-bindgen 和 wasm-opt + +```bash +cargo install wasm-bindgen-cli --locked +cargo install wasm-opt --locked +``` + +## 前期准备 + +首先保证存在 rust 环境,然后运行以下命令 +```bash +npm run build:cargo +npm run build +``` + +## 运行浏览器端示例 +```bash +cd demo/web +npm install +npm run dev +``` + +# Prior Art + +https://github.com/messense/jieba-rs diff --git a/libs/jieba-wasm/jieba_rs_wasm.d.ts b/libs/jieba-wasm/jieba_rs_wasm.d.ts new file mode 100644 index 0000000..fa7c50a --- /dev/null +++ b/libs/jieba-wasm/jieba_rs_wasm.d.ts @@ -0,0 +1,73 @@ +/* tslint:disable */ +/* eslint-disable */ +export function cut(text: string, hmm?: boolean | null): string[]; +export function cut_all(text: string): string[]; +export function cut_for_search(text: string, hmm?: boolean | null): string[]; +export function tokenize(text: string, mode: string, hmm?: boolean | null): Token[]; +export function add_word(word: string, freq?: number | null, tag?: string | null): number; +export function tag(sentence: string, hmm?: boolean | null): Tag[]; +export function with_dict(dict: string): void; + +/** Represents a single token with its word and position. */ +export interface Token { + word: string; + start: number; + end: number; +} + +/** Represents a single word and its part-of-speech tag. 
*/ +export interface Tag { + word: string; + tag: string; +} + + + +export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module; + +export interface InitOutput { + readonly memory: WebAssembly.Memory; + readonly cut: (a: number, b: number, c: number) => [number, number]; + readonly cut_all: (a: number, b: number) => [number, number]; + readonly cut_for_search: (a: number, b: number, c: number) => [number, number]; + readonly tokenize: (a: number, b: number, c: number, d: number, e: number) => [number, number, number, number]; + readonly add_word: (a: number, b: number, c: number, d: number, e: number) => number; + readonly tag: (a: number, b: number, c: number) => [number, number]; + readonly with_dict: (a: number, b: number) => [number, number]; + readonly rust_zstd_wasm_shim_qsort: (a: number, b: number, c: number, d: number) => void; + readonly rust_zstd_wasm_shim_malloc: (a: number) => number; + readonly rust_zstd_wasm_shim_memcmp: (a: number, b: number, c: number) => number; + readonly rust_zstd_wasm_shim_calloc: (a: number, b: number) => number; + readonly rust_zstd_wasm_shim_free: (a: number) => void; + readonly rust_zstd_wasm_shim_memcpy: (a: number, b: number, c: number) => number; + readonly rust_zstd_wasm_shim_memmove: (a: number, b: number, c: number) => number; + readonly rust_zstd_wasm_shim_memset: (a: number, b: number, c: number) => number; + readonly __wbindgen_malloc: (a: number, b: number) => number; + readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number; + readonly __wbindgen_export_2: WebAssembly.Table; + readonly __externref_drop_slice: (a: number, b: number) => void; + readonly __wbindgen_free: (a: number, b: number, c: number) => void; + readonly __externref_table_dealloc: (a: number) => void; + readonly __wbindgen_start: () => void; +} + +export type SyncInitInput = BufferSource | WebAssembly.Module; +/** +* Instantiates the given `module`, which can either be bytes or +* a 
precompiled `WebAssembly.Module`. +* +* @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated. +* +* @returns {InitOutput} +*/ +export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput; + +/** +* If `module_or_path` is {RequestInfo} or {URL}, makes a request and +* for everything else, calls `WebAssembly.instantiate` directly. +* +* @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated. +* +* @returns {Promise<InitOutput>} +*/ +export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>; diff --git a/libs/jieba-wasm/jieba_rs_wasm.js b/libs/jieba-wasm/jieba_rs_wasm.js new file mode 100644 index 0000000..7281ce6 --- /dev/null +++ b/libs/jieba-wasm/jieba_rs_wasm.js @@ -0,0 +1,438 @@ +let wasm; + +let cachedUint8ArrayMemory0 = null; + +function getUint8ArrayMemory0() { + if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) { + cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer); + } + return cachedUint8ArrayMemory0; +} + +let cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : { decode: () => { throw Error('TextDecoder not available') } } ); + +if (typeof TextDecoder !== 'undefined') { cachedTextDecoder.decode(); }; + +const MAX_SAFARI_DECODE_BYTES = 2146435072; +let numBytesDecoded = 0; +function decodeText(ptr, len) { + numBytesDecoded += len; + if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) { + cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? 
new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : { decode: () => { throw Error('TextDecoder not available') } } ); + cachedTextDecoder.decode(); + numBytesDecoded = len; + } + return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len)); +} + +function getStringFromWasm0(ptr, len) { + ptr = ptr >>> 0; + return decodeText(ptr, len); +} + +function debugString(val) { + // primitive types + const type = typeof val; + if (type == 'number' || type == 'boolean' || val == null) { + return `${val}`; + } + if (type == 'string') { + return `"${val}"`; + } + if (type == 'symbol') { + const description = val.description; + if (description == null) { + return 'Symbol'; + } else { + return `Symbol(${description})`; + } + } + if (type == 'function') { + const name = val.name; + if (typeof name == 'string' && name.length > 0) { + return `Function(${name})`; + } else { + return 'Function'; + } + } + // objects + if (Array.isArray(val)) { + const length = val.length; + let debug = '['; + if (length > 0) { + debug += debugString(val[0]); + } + for(let i = 1; i < length; i++) { + debug += ', ' + debugString(val[i]); + } + debug += ']'; + return debug; + } + // Test for built-in + const builtInMatches = /\[object ([^\]]+)\]/.exec(toString.call(val)); + let className; + if (builtInMatches && builtInMatches.length > 1) { + className = builtInMatches[1]; + } else { + // Failed to match the standard '[object ClassName]' + return toString.call(val); + } + if (className == 'Object') { + // we're a user defined class or Object + // JSON.stringify avoids problems with cycles, and is generally much + // easier than looping through ownProperties of `val`. + try { + return 'Object(' + JSON.stringify(val) + ')'; + } catch (_) { + return 'Object'; + } + } + // errors + if (val instanceof Error) { + return `${val.name}: ${val.message}\n${val.stack}`; + } + // TODO we could test for more things here, like `Set`s and `Map`s. 
+ return className; +} + +let WASM_VECTOR_LEN = 0; + +const cachedTextEncoder = (typeof TextEncoder !== 'undefined' ? new TextEncoder('utf-8') : { encode: () => { throw Error('TextEncoder not available') } } ); + +const encodeString = (typeof cachedTextEncoder.encodeInto === 'function' + ? function (arg, view) { + return cachedTextEncoder.encodeInto(arg, view); +} + : function (arg, view) { + const buf = cachedTextEncoder.encode(arg); + view.set(buf); + return { + read: arg.length, + written: buf.length + }; +}); + +function passStringToWasm0(arg, malloc, realloc) { + + if (realloc === undefined) { + const buf = cachedTextEncoder.encode(arg); + const ptr = malloc(buf.length, 1) >>> 0; + getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf); + WASM_VECTOR_LEN = buf.length; + return ptr; + } + + let len = arg.length; + let ptr = malloc(len, 1) >>> 0; + + const mem = getUint8ArrayMemory0(); + + let offset = 0; + + for (; offset < len; offset++) { + const code = arg.charCodeAt(offset); + if (code > 0x7F) break; + mem[ptr + offset] = code; + } + + if (offset !== len) { + if (offset !== 0) { + arg = arg.slice(offset); + } + ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0; + const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len); + const ret = encodeString(arg, view); + + offset += ret.written; + ptr = realloc(ptr, len, offset, 1) >>> 0; + } + + WASM_VECTOR_LEN = offset; + return ptr; +} + +let cachedDataViewMemory0 = null; + +function getDataViewMemory0() { + if (cachedDataViewMemory0 === null || cachedDataViewMemory0.buffer.detached === true || (cachedDataViewMemory0.buffer.detached === undefined && cachedDataViewMemory0.buffer !== wasm.memory.buffer)) { + cachedDataViewMemory0 = new DataView(wasm.memory.buffer); + } + return cachedDataViewMemory0; +} + +function isLikeNone(x) { + return x === undefined || x === null; +} + +function getArrayJsValueFromWasm0(ptr, len) { + ptr = ptr >>> 0; + const mem = getDataViewMemory0(); + 
const result = []; + for (let i = ptr; i < ptr + 4 * len; i += 4) { + result.push(wasm.__wbindgen_export_2.get(mem.getUint32(i, true))); + } + wasm.__externref_drop_slice(ptr, len); + return result; +} +/** + * @param {string} text + * @param {boolean | null} [hmm] + * @returns {string[]} + */ +export function cut(text, hmm) { + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.cut(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0); + var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v2; +} + +/** + * @param {string} text + * @returns {string[]} + */ +export function cut_all(text) { + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.cut_all(ptr0, len0); + var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v2; +} + +/** + * @param {string} text + * @param {boolean | null} [hmm] + * @returns {string[]} + */ +export function cut_for_search(text, hmm) { + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.cut_for_search(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 
1 : 0); + var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v2; +} + +function takeFromExternrefTable0(idx) { + const value = wasm.__wbindgen_export_2.get(idx); + wasm.__externref_table_dealloc(idx); + return value; +} +/** + * @param {string} text + * @param {string} mode + * @param {boolean | null} [hmm] + * @returns {Token[]} + */ +export function tokenize(text, mode, hmm) { + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ptr1 = passStringToWasm0(mode, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len1 = WASM_VECTOR_LEN; + const ret = wasm.tokenize(ptr0, len0, ptr1, len1, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0); + if (ret[3]) { + throw takeFromExternrefTable0(ret[2]); + } + var v3 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v3; +} + +/** + * @param {string} word + * @param {number | null} [freq] + * @param {string | null} [tag] + * @returns {number} + */ +export function add_word(word, freq, tag) { + const ptr0 = passStringToWasm0(word, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + var ptr1 = isLikeNone(tag) ? 0 : passStringToWasm0(tag, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + var len1 = WASM_VECTOR_LEN; + const ret = wasm.add_word(ptr0, len0, isLikeNone(freq) ? 0x100000001 : (freq) >>> 0, ptr1, len1); + return ret >>> 0; +} + +/** + * @param {string} sentence + * @param {boolean | null} [hmm] + * @returns {Tag[]} + */ +export function tag(sentence, hmm) { + const ptr0 = passStringToWasm0(sentence, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.tag(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 
1 : 0); + var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v2; +} + +/** + * @param {string} dict + */ +export function with_dict(dict) { + const ptr0 = passStringToWasm0(dict, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.with_dict(ptr0, len0); + if (ret[1]) { + throw takeFromExternrefTable0(ret[0]); + } +} + +const EXPECTED_RESPONSE_TYPES = new Set(['basic', 'cors', 'default']); + +async function __wbg_load(module, imports) { + if (typeof Response === 'function' && module instanceof Response) { + if (typeof WebAssembly.instantiateStreaming === 'function') { + try { + return await WebAssembly.instantiateStreaming(module, imports); + + } catch (e) { + const validResponse = module.ok && EXPECTED_RESPONSE_TYPES.has(module.type); + + if (validResponse && module.headers.get('Content-Type') !== 'application/wasm') { + console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. 
Original error:\n", e); + + } else { + throw e; + } + } + } + + const bytes = await module.arrayBuffer(); + return await WebAssembly.instantiate(bytes, imports); + + } else { + const instance = await WebAssembly.instantiate(module, imports); + + if (instance instanceof WebAssembly.Instance) { + return { instance, module }; + + } else { + return instance; + } + } +} + +function __wbg_get_imports() { + const imports = {}; + imports.wbg = {}; + imports.wbg.__wbg_Error_0497d5bdba9362e5 = function(arg0, arg1) { + const ret = Error(getStringFromWasm0(arg0, arg1)); + return ret; + }; + imports.wbg.__wbg_new_07b483f72211fd66 = function() { + const ret = new Object(); + return ret; + }; + imports.wbg.__wbg_set_3f1d0b984ed272ed = function(arg0, arg1, arg2) { + arg0[arg1] = arg2; + }; + imports.wbg.__wbindgen_bigint_from_u64 = function(arg0) { + const ret = BigInt.asUintN(64, arg0); + return ret; + }; + imports.wbg.__wbindgen_debug_string = function(arg0, arg1) { + const ret = debugString(arg1); + const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len1 = WASM_VECTOR_LEN; + getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true); + getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true); + }; + imports.wbg.__wbindgen_init_externref_table = function() { + const table = wasm.__wbindgen_export_2; + const offset = table.grow(4); + table.set(0, undefined); + table.set(offset + 0, undefined); + table.set(offset + 1, null); + table.set(offset + 2, true); + table.set(offset + 3, false); + ; + }; + imports.wbg.__wbindgen_number_new = function(arg0) { + const ret = arg0; + return ret; + }; + imports.wbg.__wbindgen_string_new = function(arg0, arg1) { + const ret = getStringFromWasm0(arg0, arg1); + return ret; + }; + imports.wbg.__wbindgen_throw = function(arg0, arg1) { + throw new Error(getStringFromWasm0(arg0, arg1)); + }; + + return imports; +} + +function __wbg_init_memory(imports, memory) { + +} + +function __wbg_finalize_init(instance, 
module) { + wasm = instance.exports; + __wbg_init.__wbindgen_wasm_module = module; + cachedDataViewMemory0 = null; + cachedUint8ArrayMemory0 = null; + + + wasm.__wbindgen_start(); + return wasm; +} + +function initSync(module) { + if (wasm !== undefined) return wasm; + + + if (typeof module !== 'undefined') { + if (Object.getPrototypeOf(module) === Object.prototype) { + ({module} = module) + } else { + console.warn('using deprecated parameters for `initSync()`; pass a single object instead') + } + } + + const imports = __wbg_get_imports(); + + __wbg_init_memory(imports); + + if (!(module instanceof WebAssembly.Module)) { + module = new WebAssembly.Module(module); + } + + const instance = new WebAssembly.Instance(module, imports); + + return __wbg_finalize_init(instance, module); +} + +async function __wbg_init(module_or_path) { + if (wasm !== undefined) return wasm; + + + if (typeof module_or_path !== 'undefined') { + if (Object.getPrototypeOf(module_or_path) === Object.prototype) { + ({module_or_path} = module_or_path) + } else { + console.warn('using deprecated parameters for the initialization function; pass a single object instead') + } + } + + if (typeof module_or_path === 'undefined') { + module_or_path = new URL('jieba_rs_wasm_bg.wasm', import.meta.url); + } + const imports = __wbg_get_imports(); + + if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) { + module_or_path = fetch(module_or_path); + } + + __wbg_init_memory(imports); + + const { instance, module } = await __wbg_load(await module_or_path, imports); + + return __wbg_finalize_init(instance, module); +} + +export { initSync }; +export default __wbg_init; diff --git a/libs/jieba-wasm/jieba_rs_wasm_bg.wasm b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm new file mode 100644 index 0000000..92df1dc Binary files /dev/null and b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm differ diff --git 
a/libs/jieba-wasm/jieba_rs_wasm_bg.wasm.d.ts b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm.d.ts new file mode 100644 index 0000000..ab7e1cd --- /dev/null +++ b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm.d.ts @@ -0,0 +1,25 @@ +/* tslint:disable */ +/* eslint-disable */ +export const memory: WebAssembly.Memory; +export const cut: (a: number, b: number, c: number) => [number, number]; +export const cut_all: (a: number, b: number) => [number, number]; +export const cut_for_search: (a: number, b: number, c: number) => [number, number]; +export const tokenize: (a: number, b: number, c: number, d: number, e: number) => [number, number, number, number]; +export const add_word: (a: number, b: number, c: number, d: number, e: number) => number; +export const tag: (a: number, b: number, c: number) => [number, number]; +export const with_dict: (a: number, b: number) => [number, number]; +export const rust_zstd_wasm_shim_qsort: (a: number, b: number, c: number, d: number) => void; +export const rust_zstd_wasm_shim_malloc: (a: number) => number; +export const rust_zstd_wasm_shim_memcmp: (a: number, b: number, c: number) => number; +export const rust_zstd_wasm_shim_calloc: (a: number, b: number) => number; +export const rust_zstd_wasm_shim_free: (a: number) => void; +export const rust_zstd_wasm_shim_memcpy: (a: number, b: number, c: number) => number; +export const rust_zstd_wasm_shim_memmove: (a: number, b: number, c: number) => number; +export const rust_zstd_wasm_shim_memset: (a: number, b: number, c: number) => number; +export const __wbindgen_malloc: (a: number, b: number) => number; +export const __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number; +export const __wbindgen_export_2: WebAssembly.Table; +export const __externref_drop_slice: (a: number, b: number) => void; +export const __wbindgen_free: (a: number, b: number, c: number) => void; +export const __externref_table_dealloc: (a: number) => void; +export const __wbindgen_start: () => void; diff --git 
a/libs/jieba-wasm/package.json b/libs/jieba-wasm/package.json new file mode 100644 index 0000000..cea989b --- /dev/null +++ b/libs/jieba-wasm/package.json @@ -0,0 +1,129 @@ +{ + "name": "jieba-wasm", + "version": "2.4.0", + "description": "WASM binding to jieba-rs", + "main": "./pkg/nodejs/jieba_rs_wasm.js", + "types": "./pkg/nodejs/jieba_rs_wasm.d.ts", + "exports": { + ".": { + "node": { + "types": "./pkg/nodejs/jieba_rs_wasm.d.ts", + "default": "./pkg/nodejs/jieba_rs_wasm.js" + }, + "deno": { + "types": "./pkg/deno/jieba_rs_wasm.d.ts", + "default": "./pkg/deno/jieba_rs_wasm.js" + }, + "browser": { + "types": "./pkg/web/jieba_rs_wasm.d.ts", + "default": "./pkg/web/jieba_rs_wasm.js" + }, + "import": { + "types": "./pkg/web/jieba_rs_wasm.d.ts", + "default": "./pkg/web/jieba_rs_wasm.js" + }, + "require": { + "types": "./pkg/nodejs/jieba_rs_wasm.d.ts", + "default": "./pkg/nodejs/jieba_rs_wasm.js" + } + }, + "./web": { + "types": "./pkg/web/jieba_rs_wasm.d.ts", + "default": "./pkg/web/jieba_rs_wasm.js" + }, + "./node": { + "types": "./pkg/nodejs/jieba_rs_wasm.d.ts", + "default": "./pkg/nodejs/jieba_rs_wasm.js" + }, + "./deno": { + "types": "./pkg/deno/jieba_rs_wasm.d.ts", + "default": "./pkg/deno/jieba_rs_wasm.js" + } + }, + "directories": { + "test": "tests" + }, + "scripts": { + "build": "wireit", + "build:cargo": "wireit", + "build:bundler": "wireit", + "build:nodejs": "wireit", + "build:deno": "wireit", + "build:web": "wireit", + "build:opt": "wireit", + "test": "echo \"Error: no test specified\" && exit 1" + }, + "wireit": { + "build:cargo": { + "command": "cargo build --release --target wasm32-unknown-unknown" + }, + "build:bundler": { + "command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/bundler --target bundler", + "dependencies": [ + "build:cargo" + ] + }, + "build:nodejs": { + "command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/nodejs --target nodejs", + 
"dependencies": [ + "build:cargo" + ] + }, + "build:deno": { + "command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/deno --target deno", + "dependencies": [ + "build:cargo" + ] + }, + "build:web": { + "command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/web --target web", + "dependencies": [ + "build:cargo" + ] + }, + "build": { + "dependencies": [ + "build:cargo", + "build:bundler", + "build:nodejs", + "build:deno", + "build:web", + "build:opt" + ] + }, + "build:opt": { + "command": "node scripts/opt.js", + "dependencies": [ + "build:cargo", + "build:bundler", + "build:nodejs", + "build:deno", + "build:web" + ] + } + }, + "files": [ + "pkg/**/*" + ], + "repository": { + "type": "git", + "url": "git+https://github.com/fengkx/jieba-wasm.git" + }, + "keywords": [ + "wasm", + "jieba", + "chinese", + "segment", + "中文分词" + ], + "author": "fengkx", + "license": "MIT", + "bugs": { + "url": "https://github.com/fengkx/jieba-wasm/issues" + }, + "homepage": "https://github.com/fengkx/jieba-wasm#readme", + "devDependencies": { + "@jsdevtools/ez-spawn": "^3.0.4", + "wireit": "^0.14.4" + } +} diff --git a/modules/story-summary/story-summary.js b/modules/story-summary/story-summary.js index cdced02..4b5649b 100644 --- a/modules/story-summary/story-summary.js +++ b/modules/story-summary/story-summary.js @@ -44,6 +44,12 @@ import { runSummaryGeneration } from "./generate/generator.js"; // vector service import { embed, getEngineFingerprint, testOnlineService } from "./vector/utils/embedder.js"; +// tokenizer +import { preload as preloadTokenizer, injectEntities } from "./vector/utils/tokenizer.js"; + +// entity lexicon +import { buildEntityLexicon, buildDisplayNameMap } from "./vector/retrieval/entity-lexicon.js"; + import { getMeta, updateMeta, @@ -82,7 +88,7 @@ import { // vector io import { exportVectors, importVectors } from "./vector/storage/vector-io.js"; -import { invalidateLexicalIndex 
} from "./vector/retrieval/lexical-index.js"; +import { invalidateLexicalIndex, warmupIndex } from "./vector/retrieval/lexical-index.js"; // ═══════════════════════════════════════════════════════════════════════════ // 常量 @@ -145,6 +151,30 @@ const VECTOR_WARNING_COOLDOWN_MS = 120000; // 2分钟内不重复提醒 const EXT_PROMPT_KEY = "LittleWhiteBox_StorySummary"; const MIN_INJECTION_DEPTH = 2; +// ═══════════════════════════════════════════════════════════════════════════ +// 分词器预热 +// ═══════════════════════════════════════════════════════════════════════════ + +/** True while a tokenizer preload is in flight or after it succeeded; cleared on failure so a later call can retry. */ +let tokenizerPreloaded = false; +/** Fire-and-forget tokenizer preload: a no-op when the flag is already set or the vector config is not enabled. */ +function maybePreloadTokenizer() { + if (tokenizerPreloaded) return; + + const vectorCfg = getVectorConfig(); + if (!vectorCfg?.enabled) return; + + tokenizerPreloaded = true; + preloadTokenizer() + .then((ok) => { + if (!ok) tokenizerPreloaded = false; + }) + .catch((e) => { + tokenizerPreloaded = false; // clear the latch so a later call can retry (e.g. after the user fixes the path or reloads) + xbLog.warn(MODULE_ID, "分词器预热失败(将降级运行,可稍后重试)", e); + }); +} + // role 映射 const ROLE_MAP = { system: extension_prompt_roles.SYSTEM, @@ -499,6 +529,27 @@ async function handleClearVectors() { xbLog.info(MODULE_ID, "向量数据已清除"); } +// ═══════════════════════════════════════════════════════════════════════════ +// 实体词典注入 + 索引预热 +// ═══════════════════════════════════════════════════════════════════════════ + +function refreshEntityLexiconAndWarmup() { + const vectorCfg = getVectorConfig(); + if (!vectorCfg?.enabled) return; + + const store = getSummaryStore(); + const { name1, name2 } = getContext(); + + const lexicon = buildEntityLexicon(store, { name1, name2 }); + const displayMap = buildDisplayNameMap(store, { name1, name2 }); + + injectEntities(lexicon, displayMap); + + // 异步预建词法索引(不阻塞) + invalidateLexicalIndex(); + warmupIndex(); +} + // ═══════════════════════════════════════════════════════════════════════════ // L2 自动增量向量化(总结完成后调用) // ═══════════════════════════════════════════════════════════════════════════ @@ -997,6 +1048,9 @@ function 
handleFrameMessage(event) { case "VECTOR_GENERATE": if (data.config) saveVectorConfig(data.config); + // 向量配置变更,可能刚启用,触发预热 + maybePreloadTokenizer(); + refreshEntityLexiconAndWarmup(); handleGenerateVectors(data.config); break; @@ -1085,6 +1139,8 @@ function handleFrameMessage(event) { case "REQUEST_VECTOR_STATS": sendVectorStatsToFrame(); + // 向量开关可能在 iframe 中被修改,检查是否需要预热 + maybePreloadTokenizer(); break; case "REQUEST_CLEAR": { @@ -1213,7 +1269,7 @@ async function handleChatChanged() { const newLength = Array.isArray(chat) ? chat.length : 0; await rollbackSummaryIfNeeded(); - invalidateLexicalIndex(); + invalidateLexicalIndex(); initButtonsForAll(); const store = getSummaryStore(); @@ -1230,6 +1286,9 @@ async function handleChatChanged() { sendVectorStatsToFrame(); } + // 实体词典注入 + 索引预热 + refreshEntityLexiconAndWarmup(); + setTimeout(() => checkVectorIntegrityAndWarn(), 2000); } @@ -1267,6 +1326,9 @@ async function handleMessageReceived() { applyHideStateDebounced(); setTimeout(() => maybeAutoRunSummary("after_ai"), 1000); + + // 新消息后刷新实体词典(可能有新角色) + refreshEntityLexiconAndWarmup(); } function handleMessageSent() { @@ -1458,4 +1520,7 @@ jQuery(() => { if (!getSettings().storySummary?.enabled) return; registerEvents(); initStateIntegration(); + + // 条件预热分词器(storySummary 已启用,检查 vector 是否也启用) + maybePreloadTokenizer(); }); diff --git a/modules/story-summary/vector/llm/reranker.js b/modules/story-summary/vector/llm/reranker.js index e070013..e7e68b9 100644 --- a/modules/story-summary/vector/llm/reranker.js +++ b/modules/story-summary/vector/llm/reranker.js @@ -78,7 +78,8 @@ export async function rerank(query, documents, options = {}) { }, body: JSON.stringify({ model: RERANK_MODEL, - query: query.slice(0, 1000), // 限制 query 长度 + // Zero-darkbox: do not silently truncate query. 
+ query, documents: validDocs, top_n: Math.min(topN, validDocs.length), return_documents: false, diff --git a/modules/story-summary/vector/retrieval/lexical-index.js b/modules/story-summary/vector/retrieval/lexical-index.js index fdad5e6..c0861e1 100644 --- a/modules/story-summary/vector/retrieval/lexical-index.js +++ b/modules/story-summary/vector/retrieval/lexical-index.js @@ -4,9 +4,10 @@ // 职责: // 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引 // 2. 提供词法检索接口(专名精确匹配兜底) -// 3. 惰性构建 + 缓存失效机制 +// 3. 惰性构建 + 异步预热 + 缓存失效机制 // // 索引存储:纯内存(不持久化) +// 分词器:统一使用 tokenizer.js(结巴 + 实体保护 + 降级) // 重建时机:CHAT_CHANGED / L0提取完成 / L2总结完成 // ═══════════════════════════════════════════════════════════════════════════ @@ -16,6 +17,7 @@ import { getSummaryStore } from '../../data/store.js'; import { getStateAtoms } from '../storage/state-store.js'; import { getAllChunks } from '../storage/chunk-store.js'; import { xbLog } from '../../../../core/debug-core.js'; +import { tokenizeForIndex } from '../utils/tokenizer.js'; const MODULE_ID = 'lexical-index'; @@ -23,9 +25,20 @@ const MODULE_ID = 'lexical-index'; // 缓存 // ───────────────────────────────────────────────────────────────────────── +/** @type {MiniSearch|null} */ let cachedIndex = null; + +/** @type {string|null} */ let cachedChatId = null; -let cachedFingerprint = null; // atoms.length + chunks.length + events.length 的简单指纹 + +/** @type {string|null} 数据指纹(atoms + chunks + events 数量) */ +let cachedFingerprint = null; + +/** @type {boolean} 是否正在构建 */ +let building = false; + +/** @type {Promise|null} 当前构建 Promise(防重入) */ +let buildPromise = null; // ───────────────────────────────────────────────────────────────────────── // 工具函数 @@ -43,7 +56,7 @@ function cleanSummary(summary) { } /** - * 计算缓存指纹(用于判断是否需要重建) + * 计算缓存指纹 * @param {number} atomCount * @param {number} chunkCount * @param {number} eventCount @@ -53,39 +66,27 @@ function computeFingerprint(atomCount, chunkCount, eventCount) { return 
`${atomCount}:${chunkCount}:${eventCount}`; } +/** + * 让出主线程(避免长时间阻塞 UI) + * @returns {Promise} + */ +function yieldToMain() { + return new Promise(resolve => setTimeout(resolve, 0)); +} + // ───────────────────────────────────────────────────────────────────────── -// 索引构建 +// 文档收集 // ───────────────────────────────────────────────────────────────────────── /** - * 构建 MiniSearch 索引 - * - * 索引三类文档: - * - L0 atoms: { id: atomId, type: 'atom', floor, text: semantic } - * - L1 chunks: { id: chunkId, type: 'chunk', floor, text: chunk.text } - * - L2 events: { id: eventId, type: 'event', floor: null, text: title + participants + summary } + * 收集所有待索引文档 * * @param {object[]} atoms - getStateAtoms() 返回值 * @param {object[]} chunks - getAllChunks(chatId) 返回值 * @param {object[]} events - store.json.events - * @returns {MiniSearch} + * @returns {object[]} 文档数组 */ -export function buildLexicalIndex(atoms, chunks, events) { - const T0 = performance.now(); - - const index = new MiniSearch({ - fields: ['text'], - storeFields: ['type', 'floor'], - idField: 'id', - searchOptions: { - boost: { text: 1 }, - fuzzy: 0.2, - prefix: true, - }, - // 中文友好的 tokenizer:按字符 bigram + 空格/标点分词 - tokenize: chineseTokenize, - }); - +function collectDocuments(atoms, chunks, events) { const docs = []; // L0 atoms @@ -129,72 +130,58 @@ export function buildLexicalIndex(atoms, chunks, events) { }); } - if (docs.length > 0) { - index.addAll(docs); - } - - const elapsed = Math.round(performance.now() - T0); - xbLog.info(MODULE_ID, `索引构建完成: ${docs.length} 文档 (atoms=${atoms?.length || 0}, chunks=${chunks?.length || 0}, events=${events?.length || 0}) ${elapsed}ms`); - - return index; + return docs; } // ───────────────────────────────────────────────────────────────────────── -// 中文 Tokenizer +// 索引构建(分片,不阻塞主线程) // ───────────────────────────────────────────────────────────────────────── +/** 每批添加的文档数 */ +const BUILD_BATCH_SIZE = 500; + /** - * 中文友好的分词器 + * 构建 MiniSearch 索引(分片异步) * - * 策略: - * 1. 
连续中文字符 → 滑动 bigram("黄英梅" → "黄英", "英梅") - * 2. 连续非中文字符 → 按空格/标点分割 - * 3. 保留完整中文词(2-4字)作为额外 token - * - * @param {string} text - * @returns {string[]} + * @param {object[]} docs - 文档数组 + * @returns {Promise} */ -function chineseTokenize(text) { - if (!text) return []; +async function buildIndexAsync(docs) { + const T0 = performance.now(); - const tokens = []; - const s = String(text).toLowerCase(); + const index = new MiniSearch({ + fields: ['text'], + storeFields: ['type', 'floor'], + idField: 'id', + searchOptions: { + boost: { text: 1 }, + fuzzy: 0.2, + prefix: true, + }, + tokenize: tokenizeForIndex, + }); - // 分离中文段和非中文段 - const segments = s.split(/([\u4e00-\u9fff]+)/g); + if (!docs.length) { + return index; + } - for (const seg of segments) { - if (!seg) continue; + // 分片添加,每批 BUILD_BATCH_SIZE 条后让出主线程 + for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) { + const batch = docs.slice(i, i + BUILD_BATCH_SIZE); + index.addAll(batch); - // 中文段:bigram + 完整段(如果 2-6 字) - if (/^[\u4e00-\u9fff]+$/.test(seg)) { - // 完整段作为一个 token(如果长度合适) - if (seg.length >= 2 && seg.length <= 6) { - tokens.push(seg); - } - - // bigram - for (let i = 0; i < seg.length - 1; i++) { - tokens.push(seg.slice(i, i + 2)); - } - - // trigram(对 3+ 字的段) - for (let i = 0; i < seg.length - 2; i++) { - tokens.push(seg.slice(i, i + 3)); - } - } else { - // 非中文段:按空格/标点分割 -const words = seg.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/); - for (const w of words) { - const trimmed = w.trim(); - if (trimmed.length >= 2) { - tokens.push(trimmed); - } - } + // 非最后一批时让出主线程 + if (i + BUILD_BATCH_SIZE < docs.length) { + await yieldToMain(); } } - return tokens; + const elapsed = Math.round(performance.now() - T0); + xbLog.info(MODULE_ID, + `索引构建完成: ${docs.length} 文档 (${elapsed}ms)` + ); + + return index; } // ───────────────────────────────────────────────────────────────────────── @@ -247,6 +234,8 @@ export function searchLexicalIndex(index, terms) { fuzzy: 0.2, prefix: true, combineWith: 'OR', + // 
使用与索引相同的分词器 + tokenize: tokenizeForIndex, }); } catch (e) { xbLog.warn(MODULE_ID, '检索失败', e); @@ -305,22 +294,17 @@ export function searchLexicalIndex(index, terms) { } // ───────────────────────────────────────────────────────────────────────── -// 惰性缓存管理 +// 内部构建流程(收集数据 + 构建索引) // ───────────────────────────────────────────────────────────────────────── /** - * 获取词法索引(惰性构建 + 缓存) + * 收集数据并构建索引 * - * 如果缓存有效则直接返回;否则自动构建。 - * 缓存失效条件:chatId 变化 / 数据指纹变化 / 手动 invalidate - * - * @returns {Promise} + * @param {string} chatId + * @returns {Promise<{index: MiniSearch, fingerprint: string}>} */ -export async function getLexicalIndex() { - const { chatId } = getContext(); - if (!chatId) return null; - - // 收集当前数据 +async function collectAndBuild(chatId) { + // 收集数据 const atoms = getStateAtoms() || []; const store = getSummaryStore(); const events = store?.json?.events || []; @@ -334,30 +318,118 @@ export async function getLexicalIndex() { const fp = computeFingerprint(atoms.length, chunks.length, events.length); - // 缓存命中 + // 检查是否在收集过程中缓存已被其他调用更新 if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) { + return { index: cachedIndex, fingerprint: fp }; + } + + // 收集文档 + const docs = collectDocuments(atoms, chunks, events); + + // 异步分片构建 + const index = await buildIndexAsync(docs); + + return { index, fingerprint: fp }; +} + +// ───────────────────────────────────────────────────────────────────────── +// 公开接口:getLexicalIndex(惰性获取) +// ───────────────────────────────────────────────────────────────────────── + +/** + * 获取词法索引(惰性构建 + 缓存) + * + * 如果缓存有效则直接返回;否则自动构建。 + * 如果正在构建中,等待构建完成。 + * + * @returns {Promise} + */ +export async function getLexicalIndex() { + const { chatId } = getContext(); + if (!chatId) return null; + + // 快速路径:如果缓存存在且 chatId 未变,则直接命中 + // 指纹校验放到构建流程中完成,避免为指纹而额外读一次 IndexedDB + if (cachedIndex && cachedChatId === chatId && cachedFingerprint) { return cachedIndex; } - // 重建 - xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)}, 
fp=${fp})`); + // 正在构建中,等待结果 + if (building && buildPromise) { + try { + await buildPromise; + if (cachedIndex && cachedChatId === chatId && cachedFingerprint) { + return cachedIndex; + } + } catch { + // 构建失败,继续往下重建 + } + } - const index = buildLexicalIndex(atoms, chunks, events); + // 需要重建(指纹将在 collectAndBuild 内部计算并写入缓存) + xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)})`); - cachedIndex = index; - cachedChatId = chatId; - cachedFingerprint = fp; + building = true; + buildPromise = collectAndBuild(chatId); - return index; + try { + const { index, fingerprint } = await buildPromise; + + // 原子替换缓存 + cachedIndex = index; + cachedChatId = chatId; + cachedFingerprint = fingerprint; + + return index; + } catch (e) { + xbLog.error(MODULE_ID, '索引构建失败', e); + return null; + } finally { + building = false; + buildPromise = null; + } } +// ───────────────────────────────────────────────────────────────────────── +// 公开接口:warmupIndex(异步预建) +// ───────────────────────────────────────────────────────────────────────── + /** - * 使缓存失效(下次 getLexicalIndex 时自动重建) + * 异步预建索引 + * + * 在 CHAT_CHANGED 时调用,后台构建索引。 + * 不阻塞调用方,不返回结果。 + * 构建完成后缓存自动更新,后续 getLexicalIndex() 直接命中。 + * + * 调用时机: + * - handleChatChanged(实体注入后) + * - L0 提取完成 + * - L2 总结完成 + */ +export function warmupIndex() { + const { chatId } = getContext(); + if (!chatId) return; + + // 已在构建中,不重复触发 + if (building) return; + + // fire-and-forget + getLexicalIndex().catch(e => { + xbLog.warn(MODULE_ID, '预热索引失败', e); + }); +} + +// ───────────────────────────────────────────────────────────────────────── +// 公开接口:invalidateLexicalIndex(缓存失效) +// ───────────────────────────────────────────────────────────────────────── + +/** + * 使缓存失效(下次 getLexicalIndex / warmupIndex 时自动重建) * * 调用时机: * - CHAT_CHANGED - * - L0 提取完成(handleAnchorGenerate 完成后) - * - L2 总结完成(onComplete 回调中) + * - L0 提取完成 + * - L2 总结完成 */ export function invalidateLexicalIndex() { if (cachedIndex) { diff --git 
a/modules/story-summary/vector/retrieval/metrics.js b/modules/story-summary/vector/retrieval/metrics.js index 09f66cf..d877eda 100644 --- a/modules/story-summary/vector/retrieval/metrics.js +++ b/modules/story-summary/vector/retrieval/metrics.js @@ -16,6 +16,11 @@ export function createMetrics() { query: { buildTime: 0, refineTime: 0, + lengths: { + v0Chars: 0, + v1Chars: null, // null = NA + rerankChars: 0, + }, }, // Anchor (L0 StateAtoms) - 语义锚点 @@ -177,6 +182,13 @@ export function formatMetricsLog(metrics) { lines.push('════════════════════════════════════════'); lines.push(''); + // Query Length + lines.push('[Query Length] 查询长度'); + lines.push(`├─ query_v0_chars: ${m.query?.lengths?.v0Chars ?? 0}`); + lines.push(`├─ query_v1_chars: ${m.query?.lengths?.v1Chars == null ? 'NA' : m.query.lengths.v1Chars}`); + lines.push(`└─ rerank_query_chars: ${m.query?.lengths?.rerankChars ?? 0}`); + lines.push(''); + // Query Build lines.push('[Query] 查询构建'); lines.push(`├─ build_time: ${m.query.buildTime}ms`); diff --git a/modules/story-summary/vector/retrieval/query-builder.js b/modules/story-summary/vector/retrieval/query-builder.js index b99af6d..d6f80b2 100644 --- a/modules/story-summary/vector/retrieval/query-builder.js +++ b/modules/story-summary/vector/retrieval/query-builder.js @@ -12,36 +12,18 @@ import { getContext } from '../../../../../../../extensions.js'; import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText } from './entity-lexicon.js'; import { getSummaryStore } from '../../data/store.js'; import { filterText } from '../utils/text-filter.js'; +import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js'; // ───────────────────────────────────────────────────────────────────────── // 常量 // ───────────────────────────────────────────────────────────────────────── -const DIALOGUE_MAX_CHARS = 400; -const PENDING_MAX_CHARS = 400; -const MEMORY_HINT_MAX_CHARS = 100; +// Zero-darkbox policy: +// - No internal truncation. 
We rely on model-side truncation / provider limits. +// - If provider rejects due to length, we fail loudly and degrade explicitly. const MEMORY_HINT_ATOMS_MAX = 5; const MEMORY_HINT_EVENTS_MAX = 3; -const RERANK_QUERY_MAX_CHARS = 500; -const RERANK_SNIPPET_CHARS = 150; const LEXICAL_TERMS_MAX = 10; -const LEXICAL_TERM_MIN_LEN = 2; -const LEXICAL_TERM_MAX_LEN = 6; - -// 中文停用词(高频无意义词) -const STOP_WORDS = new Set([ - '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', - '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', - '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她', - '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦', - '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛', - '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但', - '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后', - '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里', - '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道', - '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们', - '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去', -]); // ───────────────────────────────────────────────────────────────────────── // 工具函数 @@ -65,10 +47,7 @@ function cleanMessageText(text) { * @param {number} maxLen * @returns {string} */ -function truncate(text, maxLen) { - if (!text || text.length <= maxLen) return text || ''; - return text.slice(0, maxLen) + '…'; -} +// truncate removed by design (zero-darkbox) /** * 清理事件摘要(移除楼层标记) @@ -84,8 +63,7 @@ function cleanSummary(summary) { /** * 从文本中提取高频实词(用于词法检索) * - * 策略:按中文字符边界 + 空格/标点分词,取长度 2-6 的片段 - * 过滤停用词,按频率排序 + * 使用统一分词器(结巴 + 实体保护 + 停用词过滤),按频率排序 * * @param {string} text - 清洗后的文本 * @param {number} maxTerms - 最大词数 @@ -94,15 +72,15 @@ function cleanSummary(summary) { function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) { if (!text) return []; - // 提取连续中文片段 + 英文单词 - const segments = text.match(/[\u4e00-\u9fff]{2,6}|[a-zA-Z]{3,}/g) || []; + // 使用统一分词器(索引用,不去重,保留词频) + const tokens = tokenizerTokenizeForIndex(text); + // 统计词频 const freq = new Map(); - for (const seg of segments) { - const s = 
seg.toLowerCase(); - if (s.length < LEXICAL_TERM_MIN_LEN || s.length > LEXICAL_TERM_MAX_LEN) continue; - if (STOP_WORDS.has(s)) continue; - freq.set(s, (freq.get(s) || 0) + 1); + for (const token of tokens) { + const key = String(token || '').toLowerCase(); + if (!key) continue; + freq.set(key, (freq.get(key) || 0) + 1); } return Array.from(freq.entries()) @@ -160,8 +138,9 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null, const clean = cleanMessageText(m.mes || ''); if (clean) { - // ★ 修复 A:不使用楼层号,embedding 模型不需要 - dialogueLines.push(`${speaker}: ${truncate(clean, DIALOGUE_MAX_CHARS)}`); + // 不使用楼层号,embedding 模型不需要 + // 不截断,零暗箱 + dialogueLines.push(`${speaker}: ${clean}`); allCleanText.push(clean); } } @@ -191,30 +170,15 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null, } if (pendingClean) { - queryParts.push(`[PENDING_USER]\n${truncate(pendingClean, PENDING_MAX_CHARS)}`); + // 不截断,零暗箱 + queryParts.push(`[PENDING_USER]\n${pendingClean}`); } const queryText_v0 = queryParts.join('\n\n'); - // 6. 构建 rerankQuery(短版) - const rerankParts = []; - - if (focusEntities.length > 0) { - rerankParts.push(focusEntities.join(' ')); - } - - for (const m of (lastMessages || [])) { - const clean = cleanMessageText(m.mes || ''); - if (clean) { - rerankParts.push(truncate(clean, RERANK_SNIPPET_CHARS)); - } - } - - if (pendingClean) { - rerankParts.push(truncate(pendingClean, RERANK_SNIPPET_CHARS)); - } - - const rerankQuery = truncate(rerankParts.join('\n'), RERANK_QUERY_MAX_CHARS); + // 6. rerankQuery 与 embedding query 同源(零暗箱) + // 后续 refine 会把它升级为与 queryText_v1 同源。 + const rerankQuery = queryText_v0; // 7. 
构建 lexicalTerms const entityTerms = focusEntities.map(e => e.toLowerCase()); @@ -265,7 +229,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) { for (const hit of topAnchors) { const semantic = hit.atom?.semantic || ''; if (semantic) { - hints.push(truncate(semantic, MEMORY_HINT_MAX_CHARS)); + // 不截断,零暗箱 + hints.push(semantic); } } @@ -279,13 +244,15 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) { ? `${title}: ${summary}` : title || summary; if (line) { - hints.push(truncate(line, MEMORY_HINT_MAX_CHARS)); + // 不截断,零暗箱 + hints.push(line); } } - // 3. 构建 queryText_v1 + // 3. 构建 queryText_v1(Hints 前置,最优先) if (hints.length > 0) { - bundle.queryText_v1 = bundle.queryText_v0 + `\n\n[MEMORY_HINTS]\n${hints.join('\n')}`; + const hintText = `[MEMORY_HINTS]\n${hints.join('\n')}`; + bundle.queryText_v1 = hintText + `\n\n` + bundle.queryText_v0; } else { bundle.queryText_v1 = bundle.queryText_v0; } @@ -314,17 +281,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) { } } - // 5. 增强 rerankQuery - if (hints.length > 0) { - const hintKeywords = extractKeyTerms(hints.join(' '), 5); - if (hintKeywords.length > 0) { - const addition = hintKeywords.join(' '); - bundle.rerankQuery = truncate( - bundle.rerankQuery + '\n' + addition, - RERANK_QUERY_MAX_CHARS - ); - } - } + // 5. rerankQuery 与最终 query 同源(零暗箱) + bundle.rerankQuery = bundle.queryText_v1 || bundle.queryText_v0; // 6. 
增强 lexicalTerms if (hints.length > 0) { diff --git a/modules/story-summary/vector/retrieval/recall.js b/modules/story-summary/vector/retrieval/recall.js index c1d76ec..c7aea5e 100644 --- a/modules/story-summary/vector/retrieval/recall.js +++ b/modules/story-summary/vector/retrieval/recall.js @@ -782,6 +782,14 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) { metrics.query.buildTime = Math.round(performance.now() - T_Build_Start); metrics.anchor.focusEntities = bundle.focusEntities; + // Query lengths (v0 available here) + if (metrics.query?.lengths) { + metrics.query.lengths.v0Chars = String(bundle.queryText_v0 || '').length; + // v1 not built yet + metrics.query.lengths.v1Chars = null; + metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v0 || '').length; + } + xbLog.info(MODULE_ID, `Query Build: focus=[${bundle.focusEntities.join(',')}] lexTerms=[${bundle.lexicalTerms.slice(0, 5).join(',')}]` ); @@ -841,6 +849,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) { // 更新 focusEntities(refinement 可能扩展了) metrics.anchor.focusEntities = bundle.focusEntities; + // Query lengths (v1/rerank updated here) + if (metrics.query?.lengths) { + metrics.query.lengths.v1Chars = bundle.queryText_v1 == null ? null : String(bundle.queryText_v1).length; + metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v1 || bundle.queryText_v0 || '').length; + } + xbLog.info(MODULE_ID, `Refinement: focus=[${bundle.focusEntities.join(',')}] hasV1=${!!bundle.queryText_v1} (${metrics.query.refineTime}ms)` ); diff --git a/modules/story-summary/vector/utils/tokenizer.js b/modules/story-summary/vector/utils/tokenizer.js new file mode 100644 index 0000000..7f918fd --- /dev/null +++ b/modules/story-summary/vector/utils/tokenizer.js @@ -0,0 +1,650 @@ +// ═══════════════════════════════════════════════════════════════════════════ +// tokenizer.js - 统一分词器 +// +// 职责: +// 1. 
管理结巴 WASM 生命周期(预加载 / 就绪检测 / 降级) +// 2. 实体词典注入(分词前最长匹配保护) +// 3. 亚洲文字(CJK + 假名)走结巴,拉丁文字走空格分割 +// 4. 提供 tokenize(text): string[] 统一接口 +// +// 加载时机: +// - 插件初始化时 storySummary.enabled && vectorConfig.enabled → preload() +// - 向量开关从 off→on 时 → preload() +// - CHAT_CHANGED 时 → injectEntities() + warmup 索引(不负责加载 WASM) +// +// 降级策略: +// - WASM 未就绪时 → 实体保护 + 标点分割(不用 bigram) +// ═══════════════════════════════════════════════════════════════════════════ + +import { extensionFolderPath } from '../../../../core/constants.js'; +import { xbLog } from '../../../../core/debug-core.js'; + +const MODULE_ID = 'tokenizer'; + +// ═══════════════════════════════════════════════════════════════════════════ +// WASM 状态机 +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * @enum {string} + */ +const WasmState = { + IDLE: 'IDLE', + LOADING: 'LOADING', + READY: 'READY', + FAILED: 'FAILED', +}; + +let wasmState = WasmState.IDLE; + +/** @type {Promise|null} 当前加载 Promise(防重入) */ +let loadingPromise = null; + +/** @type {typeof import('../../../../libs/jieba-wasm/jieba_rs_wasm.js')|null} */ +let jiebaModule = null; + +/** @type {Function|null} jieba cut 函数引用 */ +let jiebaCut = null; + +/** @type {Function|null} jieba add_word 函数引用 */ +let jiebaAddWord = null; + +// ═══════════════════════════════════════════════════════════════════════════ +// 实体词典 +// ═══════════════════════════════════════════════════════════════════════════ + +/** @type {string[]} 按长度降序排列的实体列表(用于最长匹配) */ +let entityList = []; + +/** @type {Set} 已注入结巴的实体(避免重复 add_word) */ +let injectedEntities = new Set(); + +// ═══════════════════════════════════════════════════════════════════════════ +// 停用词 +// ═══════════════════════════════════════════════════════════════════════════ + +const STOP_WORDS = new Set([ + // 中文高频虚词 + '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', + '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', + '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她', + '它', '吗', '什么', 
'那', '里', '来', '吧', '呢', '啊', '哦', + '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛', + '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但', + '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后', + '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里', + '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道', + '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们', + '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去', + // 日语助词 + 常见虚词 + 'は', 'が', 'を', 'に', 'で', 'と', 'の', 'も', 'へ', 'や', + 'か', 'な', 'よ', 'ね', 'わ', 'だ', 'です', 'ます', 'た', 'て', + 'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない', + 'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ', + 'これ', 'それ', 'あれ', 'どれ', + // 英文常见停用词 + 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', + 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', + 'would', 'could', 'should', 'may', 'might', 'can', 'shall', + 'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet', + 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', + 'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them', + 'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your', + 'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which', + 'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every', + 'both', 'few', 'more', 'most', 'other', 'some', 'such', + 'only', 'own', 'same', 'just', 'very', 'also', 'about', +]); + +// ═══════════════════════════════════════════════════════════════════════════ +// Unicode 分类 +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 判断字符是否为亚洲文字(CJK + 假名) + * @param {number} code - charCode + * @returns {boolean} + */ +function isAsian(code) { + return ( + (code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs + (code >= 0x3400 && code <= 0x4DBF) || // CJK Extension A + (code >= 0x3040 && code <= 0x309F) || // Hiragana + (code >= 0x30A0 && code <= 0x30FF) || // Katakana + (code >= 0x31F0 && code <= 0x31FF) || // Katakana Phonetic Extensions + (code >= 0xFF65 && code <= 0xFF9F) || // Halfwidth Katakana + (code >= 
0xF900 && code <= 0xFAFF) || // CJK Compatibility Ideographs + (code >= 0x20000 && code <= 0x2A6DF) // CJK Extension B + ); +} + +/** + * 判断字符是否为拉丁字母或数字 + * @param {number} code - charCode + * @returns {boolean} + */ +function isLatin(code) { + return ( + (code >= 0x41 && code <= 0x5A) || // A-Z + (code >= 0x61 && code <= 0x7A) || // a-z + (code >= 0x30 && code <= 0x39) || // 0-9 + (code >= 0xC0 && code <= 0x024F) // Latin Extended (àáâ 等) + ); +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 文本分段(亚洲 vs 拉丁 vs 其他) +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * @typedef {'asian'|'latin'|'other'} SegmentType + */ + +/** + * @typedef {object} TextSegment + * @property {SegmentType} type - 段类型 + * @property {string} text - 段文本 + */ + +/** + * 将文本按 Unicode 脚本分段 + * 连续的同类字符归为一段 + * + * @param {string} text + * @returns {TextSegment[]} + */ +function segmentByScript(text) { + if (!text) return []; + + const segments = []; + let currentType = null; + let currentStart = 0; + + for (let i = 0; i < text.length; i++) { + const code = text.charCodeAt(i); + let type; + + if (isAsian(code)) { + type = 'asian'; + } else if (isLatin(code)) { + type = 'latin'; + } else { + type = 'other'; + } + + if (type !== currentType) { + if (currentType !== null && currentStart < i) { + const seg = text.slice(currentStart, i); + if (currentType !== 'other' || seg.trim()) { + segments.push({ type: currentType, text: seg }); + } + } + currentType = type; + currentStart = i; + } + } + + // 最后一段 + if (currentStart < text.length) { + const seg = text.slice(currentStart); + if (currentType !== 'other' || seg.trim()) { + segments.push({ type: currentType, text: seg }); + } + } + + return segments; +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 实体保护(最长匹配占位符替换) +// ═══════════════════════════════════════════════════════════════════════════ + +// 使用 Unicode Private Use Area (PUA) 
字符作为边界,避免控制字符在分词器中产生不可控行为 +const PLACEHOLDER_PREFIX = '\uE000ENT_'; +const PLACEHOLDER_SUFFIX = '\uE001'; + +/** + * 在文本中执行实体最长匹配,替换为占位符 + * + * @param {string} text - 原始文本 + * @returns {{masked: string, entities: Map}} masked 文本 + 占位符→原文映射 + */ +function maskEntities(text) { + const entities = new Map(); + + if (!entityList.length || !text) { + return { masked: text, entities }; + } + + let masked = text; + let idx = 0; + + // entityList 已按长度降序排列,保证最长匹配优先 + for (const entity of entityList) { + // 大小写不敏感搜索 + const lowerMasked = masked.toLowerCase(); + const lowerEntity = entity.toLowerCase(); + let searchFrom = 0; + + while (true) { + const pos = lowerMasked.indexOf(lowerEntity, searchFrom); + if (pos === -1) break; + + // 已被占位符覆盖则跳过(检查前后是否存在 PUA 边界字符) + const aroundStart = Math.max(0, pos - 4); + const aroundEnd = Math.min(masked.length, pos + entity.length + 4); + const around = masked.slice(aroundStart, aroundEnd); + if (around.includes('\uE000') || around.includes('\uE001')) { + searchFrom = pos + 1; + continue; + } + + const placeholder = `${PLACEHOLDER_PREFIX}${idx}${PLACEHOLDER_SUFFIX}`; + const originalText = masked.slice(pos, pos + entity.length); + entities.set(placeholder, originalText); + + masked = masked.slice(0, pos) + placeholder + masked.slice(pos + entity.length); + idx++; + + // 更新搜索位置(跳过占位符) + searchFrom = pos + placeholder.length; + } + } + + return { masked, entities }; +} + +/** + * 将 token 数组中的占位符还原为原始实体 + * + * @param {string[]} tokens + * @param {Map} entities - 占位符→原文映射 + * @returns {string[]} + */ +function unmaskTokens(tokens, entities) { + if (!entities.size) return tokens; + + return tokens.map(token => { + // token 本身就是一个占位符 + if (entities.has(token)) { + return entities.get(token); + } + + // token 中包含占位符(结巴可能把占位符和其他字符连在一起) + let result = token; + for (const [placeholder, original] of entities) { + if (result.includes(placeholder)) { + result = result.replace(placeholder, original); + } + } + + return result; + }); +} + +// 
═══════════════════════════════════════════════════════════════════════════ +// 分词:亚洲文字(结巴 / 降级) +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 用结巴分词处理亚洲文字段 + * @param {string} text + * @returns {string[]} + */ +function tokenizeAsianJieba(text) { + if (!text || !jiebaCut) return []; + + try { + const words = jiebaCut(text, true); // hmm=true + return Array.from(words) + .map(w => String(w || '').trim()) + .filter(w => w.length >= 2); + } catch (e) { + xbLog.warn(MODULE_ID, '结巴分词异常,降级处理', e); + return tokenizeAsianFallback(text); + } +} + +/** + * 降级分词:标点/空格分割 + 保留 2-6 字 CJK 片段 + * 不使用 bigram,避免索引膨胀 + * + * @param {string} text + * @returns {string[]} + */ +function tokenizeAsianFallback(text) { + if (!text) return []; + + const tokens = []; + + // 按标点和空格分割 + const parts = text.split(/[\s,。!?、;:""''()【】《》…—\-,.!?;:'"()[\]{}<>/\\|@#$%^&*+=~`]+/); + + for (const part of parts) { + const trimmed = part.trim(); + if (!trimmed) continue; + + if (trimmed.length >= 2 && trimmed.length <= 6) { + tokens.push(trimmed); + } else if (trimmed.length > 6) { + // 长片段按 4 字滑窗切分(比 bigram 稀疏得多) + for (let i = 0; i <= trimmed.length - 4; i += 2) { + tokens.push(trimmed.slice(i, i + 4)); + } + // 保留完整片段的前 6 字 + tokens.push(trimmed.slice(0, 6)); + } + } + + return tokens; +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 分词:拉丁文字 +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 拉丁文字分词:空格/标点分割 + * @param {string} text + * @returns {string[]} + */ +function tokenizeLatin(text) { + if (!text) return []; + + return text + .split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/) + .map(w => w.trim().toLowerCase()) + .filter(w => w.length >= 3); +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 公开接口:preload +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 预加载结巴 WASM + * + * 可多次调用,内部防重入。 + * FAILED 
状态下再次调用会重试。 + * + * @returns {Promise} 是否加载成功 + */ +export async function preload() { + // 已就绪 + if (wasmState === WasmState.READY) return true; + + // 正在加载,等待结果 + if (wasmState === WasmState.LOADING && loadingPromise) { + try { + await loadingPromise; + return wasmState === WasmState.READY; + } catch { + return false; + } + } + + // IDLE 或 FAILED → 开始加载 + wasmState = WasmState.LOADING; + + const T0 = performance.now(); + + loadingPromise = (async () => { + try { + // 动态 import 结巴模块 + const wasmPath = `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm_bg.wasm`; + + // eslint-disable-next-line no-unsanitized/method + jiebaModule = await import( + `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm.js` + ); + + // 初始化 WASM + if (typeof jiebaModule.default === 'function') { + await jiebaModule.default(wasmPath); + } + + // 缓存函数引用 + jiebaCut = jiebaModule.cut; + jiebaAddWord = jiebaModule.add_word; + + if (typeof jiebaCut !== 'function') { + throw new Error('jieba cut 函数不存在'); + } + + wasmState = WasmState.READY; + + const elapsed = Math.round(performance.now() - T0); + xbLog.info(MODULE_ID, `结巴 WASM 加载完成 (${elapsed}ms)`); + + // 如果有待注入的实体,补做 + if (entityList.length > 0 && jiebaAddWord) { + reInjectAllEntities(); + } + + return true; + } catch (e) { + wasmState = WasmState.FAILED; + xbLog.error(MODULE_ID, '结巴 WASM 加载失败', e); + throw e; + } + })(); + + try { + await loadingPromise; + return true; + } catch { + return false; + } finally { + loadingPromise = null; + } +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 公开接口:isReady +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 检查结巴是否已就绪 + * @returns {boolean} + */ +export function isReady() { + return wasmState === WasmState.READY; +} + +/** + * 获取当前 WASM 状态 + * @returns {string} + */ +export function getState() { + return wasmState; +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 
公开接口:injectEntities +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 注入实体词典 + * + * 更新内部实体列表(用于最长匹配保护) + * 如果结巴已就绪,同时调用 add_word 注入 + * + * @param {Set} lexicon - 标准化后的实体集合 + * @param {Map} [displayMap] - normalize→原词形映射 + */ +export function injectEntities(lexicon, displayMap) { + if (!lexicon?.size) { + entityList = []; + return; + } + + // 构建实体列表:使用原词形(displayMap),按长度降序排列 + const entities = []; + for (const normalized of lexicon) { + const display = displayMap?.get(normalized) || normalized; + if (display.length >= 2) { + entities.push(display); + } + } + + // 按长度降序(最长匹配优先) + entities.sort((a, b) => b.length - a.length); + entityList = entities; + + // 如果结巴已就绪,注入自定义词 + if (wasmState === WasmState.READY && jiebaAddWord) { + injectNewEntitiesToJieba(entities); + } + + xbLog.info(MODULE_ID, `实体词典更新: ${entities.length} 个实体`); +} + +/** + * 将新实体注入结巴(增量,跳过已注入的) + * @param {string[]} entities + */ +function injectNewEntitiesToJieba(entities) { + let count = 0; + for (const entity of entities) { + if (!injectedEntities.has(entity)) { + try { + // freq 设高保证不被切碎 + jiebaAddWord(entity, 99999); + injectedEntities.add(entity); + count++; + } catch (e) { + xbLog.warn(MODULE_ID, `add_word 失败: ${entity}`, e); + } + } + } + if (count > 0) { + xbLog.info(MODULE_ID, `注入 ${count} 个新实体到结巴`); + } +} + +/** + * 重新注入所有实体(WASM 刚加载完时调用) + */ +function reInjectAllEntities() { + injectedEntities.clear(); + injectNewEntitiesToJieba(entityList); +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 公开接口:tokenize +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 统一分词接口 + * + * 流程: + * 1. 实体最长匹配 → 占位符保护 + * 2. 按 Unicode 脚本分段(亚洲 vs 拉丁) + * 3. 亚洲段 → 结巴 cut()(或降级) + * 4. 拉丁段 → 空格/标点分割 + * 5. 还原占位符 + * 6. 过滤停用词 + 去重 + * + * @param {string} text - 输入文本 + * @returns {string[]} token 数组 + */ +export function tokenize(text) { + const restored = tokenizeCore(text); + + // 5. 
过滤停用词 + 去重 + 清理 + const seen = new Set(); + const result = []; + + for (const token of restored) { + const cleaned = token.trim().toLowerCase(); + + if (!cleaned) continue; + if (cleaned.length < 2) continue; + if (STOP_WORDS.has(cleaned)) continue; + if (seen.has(cleaned)) continue; + + // 过滤纯标点/特殊字符 + if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(cleaned)) continue; + + seen.add(cleaned); + result.push(token.trim()); // 保留原始大小写 + } + + return result; +} + +/** + * 内核分词流程(不去重、不 lower、仅完成:实体保护→分段→分词→还原) + * @param {string} text + * @returns {string[]} + */ +function tokenizeCore(text) { + if (!text) return []; + + const input = String(text).trim(); + if (!input) return []; + + // 1. 实体保护 + const { masked, entities } = maskEntities(input); + + // 2. 分段 + const segments = segmentByScript(masked); + + // 3. 分段分词 + const rawTokens = []; + for (const seg of segments) { + if (seg.type === 'asian') { + if (wasmState === WasmState.READY && jiebaCut) { + rawTokens.push(...tokenizeAsianJieba(seg.text)); + } else { + rawTokens.push(...tokenizeAsianFallback(seg.text)); + } + } else if (seg.type === 'latin') { + rawTokens.push(...tokenizeLatin(seg.text)); + } + } + + // 4. 
还原占位符 + return unmaskTokens(rawTokens, entities); +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 公开接口:tokenizeForIndex +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * MiniSearch 索引专用分词 + * + * 与 tokenize() 的区别: + * - 全部转小写(MiniSearch 内部需要一致性) + * - 不去重(MiniSearch 自己处理词频) + * + * @param {string} text + * @returns {string[]} + */ +export function tokenizeForIndex(text) { + const restored = tokenizeCore(text); + + return restored + .map(t => t.trim().toLowerCase()) + .filter(t => { + if (!t || t.length < 2) return false; + if (STOP_WORDS.has(t)) return false; + if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) return false; + return true; + }); +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 公开接口:reset +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 重置分词器状态 + * 用于测试或模块卸载 + */ +export function reset() { + entityList = []; + injectedEntities.clear(); + // 不重置 WASM 状态(避免重复加载) +}