Zero-darkbox query updates and tokenizer improvements

2026-02-09 20:25:26 +08:00
parent 8131d6a15f
commit 0a28539b29
14 changed files with 1771 additions and 175 deletions
--- a/libs/jieba-wasm/LICENSE
+++ b/libs/jieba-wasm/LICENSE
@@ -0,0 +1,25 @@
+Copyright (c) 2018 fengkx <liangkx8237@gmail.com>
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- a/libs/jieba-wasm/README.md
+++ b/libs/jieba-wasm/README.md
@@ -0,0 +1,134 @@
+# jieba-wasm
+
+> [jieba-rs](https://github.com/messense/jieba-rs) 的 wasm binding
+
+_编译成 WASM 摆脱编译 Node Addon 的烦恼_
+
+# Usage
+## Node.js
+```js
+const {
+  cut,
+  cut_all,
+  cut_for_search,
+  tokenize,
+  add_word,
+  with_dict,
+} = require("jieba-wasm");
+cut("中华人民共和国武汉市长江大桥", true);
+// [ '中华人民共和国', '武汉市', '长江大桥' ]
+cut_all("中华人民共和国武汉市长江大桥", true);
+/*
+[
+  '中',         '中华',
+  '中华人民',   '中华人民共和国',
+  '华',         '华人',
+  '人',         '人民',
+  '人民共和国', '民',
+  '共',         '共和',
+  '共和国',     '和',
+  '国',         '武',
+  '武汉',       '武汉市',
+  '汉',         '市',
+  '市长',       '长',
+  '长江',       '长江大桥',
+  '江',         '大',
+  '大桥',       '桥'
+]
+*/
+cut_for_search("中华人民共和国武汉市长江大桥", true);
+/*
+[
+  '中华',     '华人',
+  '人民',     '共和',
+  '共和国',   '中华人民共和国',
+  '武汉',     '武汉市',
+  '长江',     '大桥',
+  '长江大桥'
+]
+*/
+tokenize("中华人民共和国武汉市长江大桥", "default", true);
+/*
+[
+  { word: '中华人民共和国', start: 0, end: 7 },
+  { word: '武汉市', start: 7, end: 10 },
+  { word: '长江大桥', start: 10, end: 14 }
+]
+*/
+tokenize("中华人民共和国武汉市长江大桥", "search", true);
+/*
+[
+  { word: '中华', start: 0, end: 2 },
+  { word: '华人', start: 1, end: 3 },
+  { word: '人民', start: 2, end: 4 },
+  { word: '共和', start: 4, end: 6 },
+  { word: '共和国', start: 4, end: 7 },
+  { word: '中华人民共和国', start: 0, end: 7 },
+  { word: '武汉', start: 7, end: 9 },
+  { word: '武汉市', start: 7, end: 10 },
+  { word: '长江', start: 10, end: 12 },
+  { word: '大桥', start: 12, end: 14 },
+  { word: '长江大桥', start: 10, end: 14 }
+]
+*/
+
+cut("桥大江长市汉武的省北湖国和共民人华中");
+/*
+[
+  '桥', '大江', '长',
+  '市', '汉',   '武',
+  '的', '省',   '北湖',
+  '国', '和',   '共',
+  '民', '人',   '华中'
+]
+*/
+["桥大江长", "市汉武", "省北湖", "国和共民人华中"].map((word) => {
+  add_word(word);
+});
+cut("桥大江长市汉武的省北湖国和共民人华中");
+// ["桥大江长", "市汉武", "的", "省北湖", "国和共民人华中"];
+
+with_dict("自动借书机 1 n"); // 导入自定义字典，词条格式：词语 词频 词性（可选），以换行符分隔
+cut("你好我是一个自动借书机");
+// ["你好", "我", "是", "一个", "自动借书机"];
+```
+
+## Browser
+```ts
+import init, { cut } from 'jieba-wasm';
+
+// 重要：使用前必须初始化
+await init();
+
+cut("中华人民共和国武汉市长江大桥", true);
+// [ '中华人民共和国', '武汉市', '长江大桥' ]
+```
+
+# 示例 Demo
+
+## 安装依赖
+
+安装 wasm-bindgen 和 wasm-opt
+
+```bash
+cargo install wasm-bindgen-cli --locked
+cargo install wasm-opt --locked
+```
+
+## 前期准备
+
+首先保证存在 rust 环境，然后运行以下命令
+```bash
+npm run build:cargo
+npm run build
+```
+
+## 运行浏览器端示例
+```bash
+cd demo/web
+npm install
+npm run dev
+```
+
+# Prior Art
+
+https://github.com/messense/jieba-rs
--- a/libs/jieba-wasm/jieba_rs_wasm.d.ts
+++ b/libs/jieba-wasm/jieba_rs_wasm.d.ts
@@ -0,0 +1,73 @@
+/* tslint:disable */
+/* eslint-disable */
+export function cut(text: string, hmm?: boolean | null): string[];
+export function cut_all(text: string): string[];
+export function cut_for_search(text: string, hmm?: boolean | null): string[];
+export function tokenize(text: string, mode: string, hmm?: boolean | null): Token[];
+export function add_word(word: string, freq?: number | null, tag?: string | null): number;
+export function tag(sentence: string, hmm?: boolean | null): Tag[];
+export function with_dict(dict: string): void;
+
+/** Represents a single token with its word and position. */
+export interface Token {
+    word: string;
+    start: number;
+    end: number;
+}
+
+/** Represents a single word and its part-of-speech tag. */
+export interface Tag {
+    word: string;
+    tag: string;
+}
+
+
+
+export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
+
+export interface InitOutput {
+  readonly memory: WebAssembly.Memory;
+  readonly cut: (a: number, b: number, c: number) => [number, number];
+  readonly cut_all: (a: number, b: number) => [number, number];
+  readonly cut_for_search: (a: number, b: number, c: number) => [number, number];
+  readonly tokenize: (a: number, b: number, c: number, d: number, e: number) => [number, number, number, number];
+  readonly add_word: (a: number, b: number, c: number, d: number, e: number) => number;
+  readonly tag: (a: number, b: number, c: number) => [number, number];
+  readonly with_dict: (a: number, b: number) => [number, number];
+  readonly rust_zstd_wasm_shim_qsort: (a: number, b: number, c: number, d: number) => void;
+  readonly rust_zstd_wasm_shim_malloc: (a: number) => number;
+  readonly rust_zstd_wasm_shim_memcmp: (a: number, b: number, c: number) => number;
+  readonly rust_zstd_wasm_shim_calloc: (a: number, b: number) => number;
+  readonly rust_zstd_wasm_shim_free: (a: number) => void;
+  readonly rust_zstd_wasm_shim_memcpy: (a: number, b: number, c: number) => number;
+  readonly rust_zstd_wasm_shim_memmove: (a: number, b: number, c: number) => number;
+  readonly rust_zstd_wasm_shim_memset: (a: number, b: number, c: number) => number;
+  readonly __wbindgen_malloc: (a: number, b: number) => number;
+  readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
+  readonly __wbindgen_export_2: WebAssembly.Table;
+  readonly __externref_drop_slice: (a: number, b: number) => void;
+  readonly __wbindgen_free: (a: number, b: number, c: number) => void;
+  readonly __externref_table_dealloc: (a: number) => void;
+  readonly __wbindgen_start: () => void;
+}
+
+export type SyncInitInput = BufferSource | WebAssembly.Module;
+/**
+* Instantiates the given `module`, which can either be bytes or
+* a precompiled `WebAssembly.Module`.
+*
+* @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
+*
+* @returns {InitOutput}
+*/
+export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
+
+/**
+* If `module_or_path` is {RequestInfo} or {URL}, makes a request and
+* for everything else, calls `WebAssembly.instantiate` directly.
+*
+* @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
+*
+* @returns {Promise<InitOutput>}
+*/
+export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;
--- a/libs/jieba-wasm/jieba_rs_wasm.js
+++ b/libs/jieba-wasm/jieba_rs_wasm.js
@@ -0,0 +1,438 @@
+let wasm;
+
+let cachedUint8ArrayMemory0 = null;
+
+function getUint8ArrayMemory0() {
+    if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) {
+        cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
+    }
+    return cachedUint8ArrayMemory0;
+}
+
+let cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : { decode: () => { throw Error('TextDecoder not available') } } );
+
+if (typeof TextDecoder !== 'undefined') { cachedTextDecoder.decode(); };
+
+const MAX_SAFARI_DECODE_BYTES = 2146435072;
+let numBytesDecoded = 0;
+/**
+ * Decode `len` bytes of wasm memory, starting at byte offset `ptr`, into a JS string
+ * using the shared UTF-8 decoder (created with `fatal: true`).
+ *
+ * A running total of decoded bytes is kept in `numBytesDecoded`. Once that total reaches
+ * MAX_SAFARI_DECODE_BYTES, the shared decoder is replaced with a new instance and the
+ * total restarts at `len`.
+ * NOTE(review): the constant's name suggests this works around a Safari decoder limit —
+ * confirm against the wasm-bindgen source.
+ */
+function decodeText(ptr, len) {
+    numBytesDecoded += len;
+    if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) {
+        cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : { decode: () => { throw Error('TextDecoder not available') } } );
+        cachedTextDecoder.decode();
+        numBytesDecoded = len;
+    }
+    return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len));
+}
+
+function getStringFromWasm0(ptr, len) {
+    ptr = ptr >>> 0;
+    return decodeText(ptr, len);
+}
+
+function debugString(val) {
+    // primitive types
+    const type = typeof val;
+    if (type == 'number' || type == 'boolean' || val == null) {
+        return  `${val}`;
+    }
+    if (type == 'string') {
+        return `"${val}"`;
+    }
+    if (type == 'symbol') {
+        const description = val.description;
+        if (description == null) {
+            return 'Symbol';
+        } else {
+            return `Symbol(${description})`;
+        }
+    }
+    if (type == 'function') {
+        const name = val.name;
+        if (typeof name == 'string' && name.length > 0) {
+            return `Function(${name})`;
+        } else {
+            return 'Function';
+        }
+    }
+    // objects
+    if (Array.isArray(val)) {
+        const length = val.length;
+        let debug = '[';
+        if (length > 0) {
+            debug += debugString(val[0]);
+        }
+        for(let i = 1; i < length; i++) {
+            debug += ', ' + debugString(val[i]);
+        }
+        debug += ']';
+        return debug;
+    }
+    // Test for built-in
+    const builtInMatches = /\[object ([^\]]+)\]/.exec(toString.call(val));
+    let className;
+    if (builtInMatches && builtInMatches.length > 1) {
+        className = builtInMatches[1];
+    } else {
+        // Failed to match the standard '[object ClassName]'
+        return toString.call(val);
+    }
+    if (className == 'Object') {
+        // we're a user defined class or Object
+        // JSON.stringify avoids problems with cycles, and is generally much
+        // easier than looping through ownProperties of `val`.
+        try {
+            return 'Object(' + JSON.stringify(val) + ')';
+        } catch (_) {
+            return 'Object';
+        }
+    }
+    // errors
+    if (val instanceof Error) {
+        return `${val.name}: ${val.message}\n${val.stack}`;
+    }
+    // TODO we could test for more things here, like `Set`s and `Map`s.
+    return className;
+}
+
+let WASM_VECTOR_LEN = 0;
+
+const cachedTextEncoder = (typeof TextEncoder !== 'undefined' ? new TextEncoder('utf-8') : { encode: () => { throw Error('TextEncoder not available') } } );
+
+const encodeString = (typeof cachedTextEncoder.encodeInto === 'function'
+    ? function (arg, view) {
+    return cachedTextEncoder.encodeInto(arg, view);
+}
+    : function (arg, view) {
+    const buf = cachedTextEncoder.encode(arg);
+    view.set(buf);
+    return {
+        read: arg.length,
+        written: buf.length
+    };
+});
+
+/**
+ * Copy the JS string `arg` into wasm memory as UTF-8 and return the pointer to it.
+ * The number of bytes written is reported through the module-level `WASM_VECTOR_LEN`,
+ * which callers read immediately after this call.
+ *
+ * @param {string} arg - string to copy
+ * @param {Function} malloc - allocator, called as `malloc(size, 1)`
+ * @param {Function} [realloc] - reallocator, called as `realloc(ptr, oldSize, newSize, 1)`;
+ *   when omitted, the whole string is encoded first and copied in a single allocation
+ * @returns {number} pointer (as an unsigned 32-bit value) to the encoded bytes
+ */
+function passStringToWasm0(arg, malloc, realloc) {
+
+    if (realloc === undefined) {
+        const buf = cachedTextEncoder.encode(arg);
+        const ptr = malloc(buf.length, 1) >>> 0;
+        getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf);
+        WASM_VECTOR_LEN = buf.length;
+        return ptr;
+    }
+
+    // Start with one byte per UTF-16 code unit, which is exact if the string is all ASCII.
+    let len = arg.length;
+    let ptr = malloc(len, 1) >>> 0;
+
+    const mem = getUint8ArrayMemory0();
+
+    let offset = 0;
+
+    // Fast path: write leading ASCII characters directly, stopping at the first non-ASCII one.
+    for (; offset < len; offset++) {
+        const code = arg.charCodeAt(offset);
+        if (code > 0x7F) break;
+        mem[ptr + offset] = code;
+    }
+
+    if (offset !== len) {
+        if (offset !== 0) {
+            arg = arg.slice(offset);
+        }
+        // Grow to fit the remainder at 3 bytes per UTF-16 code unit; `len` is reassigned
+        // inside the argument list, after its old value has been passed as the old size.
+        ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
+        // The memory view is fetched again here rather than reusing `mem`.
+        const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len);
+        const ret = encodeString(arg, view);
+
+        offset += ret.written;
+        // Shrink the allocation down to the number of bytes actually written.
+        ptr = realloc(ptr, len, offset, 1) >>> 0;
+    }
+
+    WASM_VECTOR_LEN = offset;
+    return ptr;
+}
+
+let cachedDataViewMemory0 = null;
+
+function getDataViewMemory0() {
+    if (cachedDataViewMemory0 === null || cachedDataViewMemory0.buffer.detached === true || (cachedDataViewMemory0.buffer.detached === undefined && cachedDataViewMemory0.buffer !== wasm.memory.buffer)) {
+        cachedDataViewMemory0 = new DataView(wasm.memory.buffer);
+    }
+    return cachedDataViewMemory0;
+}
+
+function isLikeNone(x) {
+    return x === undefined || x === null;
+}
+
+/**
+ * Build a JS array from `len` entries stored in wasm memory starting at `ptr`.
+ *
+ * Each entry is a little-endian unsigned 32-bit value that is used as an index into the
+ * `wasm.__wbindgen_export_2` table; the value found there is pushed onto the result.
+ * After all entries are read, `wasm.__externref_drop_slice(ptr, len)` is called.
+ *
+ * @param {number} ptr - byte offset of the first entry (coerced to unsigned 32-bit)
+ * @param {number} len - number of 4-byte entries
+ * @returns {Array} the values looked up from the table, in order
+ */
+function getArrayJsValueFromWasm0(ptr, len) {
+    ptr = ptr >>> 0;
+    const mem = getDataViewMemory0();
+    const result = [];
+    for (let i = ptr; i < ptr + 4 * len; i += 4) {
+        result.push(wasm.__wbindgen_export_2.get(mem.getUint32(i, true)));
+    }
+    wasm.__externref_drop_slice(ptr, len);
+    return result;
+}
+/**
+ * @param {string} text
+ * @param {boolean | null} [hmm]
+ * @returns {string[]}
+ */
+export function cut(text, hmm) {
+    const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ret = wasm.cut(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0);
+    var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v2;
+}
+
+/**
+ * @param {string} text
+ * @returns {string[]}
+ */
+export function cut_all(text) {
+    const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ret = wasm.cut_all(ptr0, len0);
+    var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v2;
+}
+
+/**
+ * @param {string} text
+ * @param {boolean | null} [hmm]
+ * @returns {string[]}
+ */
+export function cut_for_search(text, hmm) {
+    const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ret = wasm.cut_for_search(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0);
+    var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v2;
+}
+
+function takeFromExternrefTable0(idx) {
+    const value = wasm.__wbindgen_export_2.get(idx);
+    wasm.__externref_table_dealloc(idx);
+    return value;
+}
+/**
+ * @param {string} text
+ * @param {string} mode
+ * @param {boolean | null} [hmm]
+ * @returns {Token[]}
+ */
+export function tokenize(text, mode, hmm) {
+    const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ptr1 = passStringToWasm0(mode, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len1 = WASM_VECTOR_LEN;
+    const ret = wasm.tokenize(ptr0, len0, ptr1, len1, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0);
+    if (ret[3]) {
+        throw takeFromExternrefTable0(ret[2]);
+    }
+    var v3 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v3;
+}
+
+/**
+ * @param {string} word
+ * @param {number | null} [freq]
+ * @param {string | null} [tag]
+ * @returns {number}
+ */
+export function add_word(word, freq, tag) {
+    const ptr0 = passStringToWasm0(word, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    var ptr1 = isLikeNone(tag) ? 0 : passStringToWasm0(tag, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    var len1 = WASM_VECTOR_LEN;
+    const ret = wasm.add_word(ptr0, len0, isLikeNone(freq) ? 0x100000001 : (freq) >>> 0, ptr1, len1);
+    return ret >>> 0;
+}
+
+/**
+ * @param {string} sentence
+ * @param {boolean | null} [hmm]
+ * @returns {Tag[]}
+ */
+export function tag(sentence, hmm) {
+    const ptr0 = passStringToWasm0(sentence, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ret = wasm.tag(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0);
+    var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v2;
+}
+
+/**
+ * @param {string} dict
+ */
+export function with_dict(dict) {
+    const ptr0 = passStringToWasm0(dict, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ret = wasm.with_dict(ptr0, len0);
+    if (ret[1]) {
+        throw takeFromExternrefTable0(ret[0]);
+    }
+}
+
+const EXPECTED_RESPONSE_TYPES = new Set(['basic', 'cors', 'default']);
+
+/**
+ * Instantiate the wasm module from either a `Response` or anything accepted by
+ * `WebAssembly.instantiate`.
+ *
+ * For a `Response`, streaming instantiation is tried first when available. If it fails and
+ * the response is OK, of an expected type, and not served as `application/wasm`, a warning
+ * is logged and the bytes are instantiated via `arrayBuffer()` instead; any other failure
+ * is rethrown.
+ *
+ * @param {*} module - a `Response`, or a value passed straight to `WebAssembly.instantiate`
+ * @param {object} imports - import object handed to the instantiation call
+ * @returns {Promise<object>} the instantiation result; when `WebAssembly.instantiate`
+ *   yields a bare `WebAssembly.Instance`, it is wrapped as `{ instance, module }`
+ */
+async function __wbg_load(module, imports) {
+    if (typeof Response === 'function' && module instanceof Response) {
+        if (typeof WebAssembly.instantiateStreaming === 'function') {
+            try {
+                return await WebAssembly.instantiateStreaming(module, imports);
+
+            } catch (e) {
+                const validResponse = module.ok && EXPECTED_RESPONSE_TYPES.has(module.type);
+
+                if (validResponse && module.headers.get('Content-Type') !== 'application/wasm') {
+                    console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. Original error:\n", e);
+
+                } else {
+                    throw e;
+                }
+            }
+        }
+
+        // Reached when streaming is unavailable or the warning branch above was taken.
+        const bytes = await module.arrayBuffer();
+        return await WebAssembly.instantiate(bytes, imports);
+
+    } else {
+        const instance = await WebAssembly.instantiate(module, imports);
+
+        if (instance instanceof WebAssembly.Instance) {
+            return { instance, module };
+
+        } else {
+            return instance;
+        }
+    }
+}
+
+function __wbg_get_imports() {
+    const imports = {};
+    imports.wbg = {};
+    imports.wbg.__wbg_Error_0497d5bdba9362e5 = function(arg0, arg1) {
+        const ret = Error(getStringFromWasm0(arg0, arg1));
+        return ret;
+    };
+    imports.wbg.__wbg_new_07b483f72211fd66 = function() {
+        const ret = new Object();
+        return ret;
+    };
+    imports.wbg.__wbg_set_3f1d0b984ed272ed = function(arg0, arg1, arg2) {
+        arg0[arg1] = arg2;
+    };
+    imports.wbg.__wbindgen_bigint_from_u64 = function(arg0) {
+        const ret = BigInt.asUintN(64, arg0);
+        return ret;
+    };
+    imports.wbg.__wbindgen_debug_string = function(arg0, arg1) {
+        const ret = debugString(arg1);
+        const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+        const len1 = WASM_VECTOR_LEN;
+        getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true);
+        getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true);
+    };
+    imports.wbg.__wbindgen_init_externref_table = function() {
+        const table = wasm.__wbindgen_export_2;
+        const offset = table.grow(4);
+        table.set(0, undefined);
+        table.set(offset + 0, undefined);
+        table.set(offset + 1, null);
+        table.set(offset + 2, true);
+        table.set(offset + 3, false);
+        ;
+    };
+    imports.wbg.__wbindgen_number_new = function(arg0) {
+        const ret = arg0;
+        return ret;
+    };
+    imports.wbg.__wbindgen_string_new = function(arg0, arg1) {
+        const ret = getStringFromWasm0(arg0, arg1);
+        return ret;
+    };
+    imports.wbg.__wbindgen_throw = function(arg0, arg1) {
+        throw new Error(getStringFromWasm0(arg0, arg1));
+    };
+
+    return imports;
+}
+
+function __wbg_init_memory(imports, memory) {
+
+}
+
+function __wbg_finalize_init(instance, module) {
+    wasm = instance.exports;
+    __wbg_init.__wbindgen_wasm_module = module;
+    cachedDataViewMemory0 = null;
+    cachedUint8ArrayMemory0 = null;
+
+
+    wasm.__wbindgen_start();
+    return wasm;
+}
+
+function initSync(module) {
+    if (wasm !== undefined) return wasm;
+
+
+    if (typeof module !== 'undefined') {
+        if (Object.getPrototypeOf(module) === Object.prototype) {
+            ({module} = module)
+        } else {
+            console.warn('using deprecated parameters for `initSync()`; pass a single object instead')
+        }
+    }
+
+    const imports = __wbg_get_imports();
+
+    __wbg_init_memory(imports);
+
+    if (!(module instanceof WebAssembly.Module)) {
+        module = new WebAssembly.Module(module);
+    }
+
+    const instance = new WebAssembly.Instance(module, imports);
+
+    return __wbg_finalize_init(instance, module);
+}
+
+/**
+ * Asynchronous initializer (the module's default export). Loads and instantiates the wasm
+ * module once; later calls return the already-initialized exports.
+ *
+ * @param {*} [module_or_path] - preferably `{ module_or_path }`; passing the value directly
+ *   still works but logs a deprecation warning. When omitted, `jieba_rs_wasm_bg.wasm` is
+ *   resolved relative to this module's URL.
+ * @returns {Promise<object>} the wasm instance exports
+ */
+async function __wbg_init(module_or_path) {
+    if (wasm !== undefined) return wasm;
+
+
+    if (typeof module_or_path !== 'undefined') {
+        // Only a plain object literal is treated as the `{ module_or_path }` options form.
+        if (Object.getPrototypeOf(module_or_path) === Object.prototype) {
+            ({module_or_path} = module_or_path)
+        } else {
+            console.warn('using deprecated parameters for the initialization function; pass a single object instead')
+        }
+    }
+
+    if (typeof module_or_path === 'undefined') {
+        module_or_path = new URL('jieba_rs_wasm_bg.wasm', import.meta.url);
+    }
+    const imports = __wbg_get_imports();
+
+    // Strings, Requests and URLs are fetched; other inputs are passed to the loader as given.
+    if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) {
+        module_or_path = fetch(module_or_path);
+    }
+
+    __wbg_init_memory(imports);
+
+    const { instance, module } = await __wbg_load(await module_or_path, imports);
+
+    return __wbg_finalize_init(instance, module);
+}
+
+export { initSync };
+export default __wbg_init;
--- a/libs/jieba-wasm/jieba_rs_wasm_bg.wasm
+++ b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm
--- a/libs/jieba-wasm/jieba_rs_wasm_bg.wasm.d.ts
+++ b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm.d.ts
@@ -0,0 +1,25 @@
+/* tslint:disable */
+/* eslint-disable */
+export const memory: WebAssembly.Memory;
+export const cut: (a: number, b: number, c: number) => [number, number];
+export const cut_all: (a: number, b: number) => [number, number];
+export const cut_for_search: (a: number, b: number, c: number) => [number, number];
+export const tokenize: (a: number, b: number, c: number, d: number, e: number) => [number, number, number, number];
+export const add_word: (a: number, b: number, c: number, d: number, e: number) => number;
+export const tag: (a: number, b: number, c: number) => [number, number];
+export const with_dict: (a: number, b: number) => [number, number];
+export const rust_zstd_wasm_shim_qsort: (a: number, b: number, c: number, d: number) => void;
+export const rust_zstd_wasm_shim_malloc: (a: number) => number;
+export const rust_zstd_wasm_shim_memcmp: (a: number, b: number, c: number) => number;
+export const rust_zstd_wasm_shim_calloc: (a: number, b: number) => number;
+export const rust_zstd_wasm_shim_free: (a: number) => void;
+export const rust_zstd_wasm_shim_memcpy: (a: number, b: number, c: number) => number;
+export const rust_zstd_wasm_shim_memmove: (a: number, b: number, c: number) => number;
+export const rust_zstd_wasm_shim_memset: (a: number, b: number, c: number) => number;
+export const __wbindgen_malloc: (a: number, b: number) => number;
+export const __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
+export const __wbindgen_export_2: WebAssembly.Table;
+export const __externref_drop_slice: (a: number, b: number) => void;
+export const __wbindgen_free: (a: number, b: number, c: number) => void;
+export const __externref_table_dealloc: (a: number) => void;
+export const __wbindgen_start: () => void;
--- a/libs/jieba-wasm/package.json
+++ b/libs/jieba-wasm/package.json
@@ -0,0 +1,129 @@
+{
+  "name": "jieba-wasm",
+  "version": "2.4.0",
+  "description": "WASM binding to jieba-rs",
+  "main": "./pkg/nodejs/jieba_rs_wasm.js",
+  "types": "./pkg/nodejs/jieba_rs_wasm.d.ts",
+  "exports": {
+    ".": {
+      "node": {
+        "types": "./pkg/nodejs/jieba_rs_wasm.d.ts",
+        "default": "./pkg/nodejs/jieba_rs_wasm.js"
+      },
+      "deno": {
+        "types": "./pkg/deno/jieba_rs_wasm.d.ts",
+        "default": "./pkg/deno/jieba_rs_wasm.js"
+      },
+      "browser": {
+        "types": "./pkg/web/jieba_rs_wasm.d.ts",
+        "default": "./pkg/web/jieba_rs_wasm.js"
+      },
+      "import": {
+        "types": "./pkg/web/jieba_rs_wasm.d.ts",
+        "default": "./pkg/web/jieba_rs_wasm.js"
+      },
+      "require": {
+        "types": "./pkg/nodejs/jieba_rs_wasm.d.ts",
+        "default": "./pkg/nodejs/jieba_rs_wasm.js"
+      }
+    },
+    "./web": {
+      "types": "./pkg/web/jieba_rs_wasm.d.ts",
+      "default": "./pkg/web/jieba_rs_wasm.js"
+    },
+    "./node": {
+      "types": "./pkg/nodejs/jieba_rs_wasm.d.ts",
+      "default": "./pkg/nodejs/jieba_rs_wasm.js"
+    },
+    "./deno": {
+      "types": "./pkg/deno/jieba_rs_wasm.d.ts",
+      "default": "./pkg/deno/jieba_rs_wasm.js"
+    }
+  },
+  "directories": {
+    "test": "tests"
+  },
+  "scripts": {
+    "build": "wireit",
+    "build:cargo": "wireit",
+    "build:bundler": "wireit",
+    "build:nodejs": "wireit",
+    "build:deno": "wireit",
+    "build:web": "wireit",
+    "build:opt": "wireit",
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "wireit": {
+    "build:cargo": {
+      "command": "cargo build --release --target wasm32-unknown-unknown"
+    },
+    "build:bundler": {
+      "command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/bundler --target bundler",
+      "dependencies": [
+        "build:cargo"
+      ]
+    },
+    "build:nodejs": {
+      "command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/nodejs --target nodejs",
+      "dependencies": [
+        "build:cargo"
+      ]
+    },
+    "build:deno": {
+      "command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/deno --target deno",
+      "dependencies": [
+        "build:cargo"
+      ]
+    },
+    "build:web": {
+      "command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/web --target web",
+      "dependencies": [
+        "build:cargo"
+      ]
+    },
+    "build": {
+      "dependencies": [
+        "build:cargo",
+        "build:bundler",
+        "build:nodejs",
+        "build:deno",
+        "build:web",
+        "build:opt"
+      ]
+    },
+    "build:opt": {
+      "command": "node scripts/opt.js",
+      "dependencies": [
+        "build:cargo",
+        "build:bundler",
+        "build:nodejs",
+        "build:deno",
+        "build:web"
+      ]
+    }
+  },
+  "files": [
+    "pkg/**/*"
+  ],
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/fengkx/jieba-wasm.git"
+  },
+  "keywords": [
+    "wasm",
+    "jieba",
+    "chinese",
+    "segment",
+    "中文分词"
+  ],
+  "author": "fengkx",
+  "license": "MIT",
+  "bugs": {
+    "url": "https://github.com/fengkx/jieba-wasm/issues"
+  },
+  "homepage": "https://github.com/fengkx/jieba-wasm#readme",
+  "devDependencies": {
+    "@jsdevtools/ez-spawn": "^3.0.4",
+    "wireit": "^0.14.4"
+  }
+}
--- a/modules/story-summary/story-summary.js
+++ b/modules/story-summary/story-summary.js
@@ -44,6 +44,12 @@ import { runSummaryGeneration } from "./generate/generator.js";
 // vector service
 import { embed, getEngineFingerprint, testOnlineService } from "./vector/utils/embedder.js";

+// tokenizer
+import { preload as preloadTokenizer, injectEntities } from "./vector/utils/tokenizer.js";
+
+// entity lexicon
+import { buildEntityLexicon, buildDisplayNameMap } from "./vector/retrieval/entity-lexicon.js";
+
 import {
    getMeta,
    updateMeta,
@@ -82,7 +88,7 @@ import {
 // vector io
 import { exportVectors, importVectors } from "./vector/storage/vector-io.js";

-import { invalidateLexicalIndex } from "./vector/retrieval/lexical-index.js";
+import { invalidateLexicalIndex, warmupIndex } from "./vector/retrieval/lexical-index.js";

 // ═══════════════════════════════════════════════════════════════════════════
 // 常量
@@ -145,6 +151,30 @@ const VECTOR_WARNING_COOLDOWN_MS = 120000; // 2分钟内不重复提醒
 const EXT_PROMPT_KEY = "LittleWhiteBox_StorySummary";
 const MIN_INJECTION_DEPTH = 2;

+// ═══════════════════════════════════════════════════════════════════════════
+// 分词器预热
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Tokenizer preload guard: true while a preload is in flight or after one has succeeded.
+ * It is lowered again when a preload fails, so that a later trigger can retry.
+ */
+let tokenizerPreloaded = false;
+
+/**
+ * Start preloading the tokenizer, but only when the vector feature is enabled and no
+ * preload is already in flight or completed.
+ *
+ * Fire-and-forget: the preload promise is handled here and callers await nothing.
+ */
+function maybePreloadTokenizer() {
+    if (tokenizerPreloaded) return;
+
+    const vectorCfg = getVectorConfig();
+    if (!vectorCfg?.enabled) return;
+
+    // Raise the guard before the async call so repeated triggers do not start parallel preloads.
+    tokenizerPreloaded = true;
+    preloadTokenizer()
+        .then((ok) => {
+            // A falsy result is treated as "not preloaded": lower the guard to allow a retry.
+            if (!ok) tokenizerPreloaded = false;
+        })
+        .catch((e) => {
+            // Lower the guard so a later trigger can retry (e.g. after the user fixes the path or reloads).
+            tokenizerPreloaded = false;
+            xbLog.warn(MODULE_ID, "分词器预热失败（将降级运行，可稍后重试）", e);
+        });
+}
+
 // role 映射
 const ROLE_MAP = {
    system: extension_prompt_roles.SYSTEM,
@@ -499,6 +529,27 @@ async function handleClearVectors() {
    xbLog.info(MODULE_ID, "向量数据已清除");
 }

+// ═══════════════════════════════════════════════════════════════════════════
+// 实体词典注入 + 索引预热
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Rebuild the entity lexicon and display-name map from the current summary store and the
+ * two context names, hand both to the tokenizer, then discard the cached lexical index
+ * and start warming a new one. Does nothing unless the vector config is enabled.
+ */
+function refreshEntityLexiconAndWarmup() {
+    if (!getVectorConfig()?.enabled) {
+        return;
+    }
+
+    const summaryStore = getSummaryStore();
+    const { name1, name2 } = getContext();
+    // Each builder receives its own fresh names object.
+    const makeNames = () => ({ name1, name2 });
+
+    injectEntities(
+        buildEntityLexicon(summaryStore, makeNames()),
+        buildDisplayNameMap(summaryStore, makeNames()),
+    );
+
+    // Drop the cached index, then kick off a rebuild; the warm-up result is not awaited here.
+    invalidateLexicalIndex();
+    warmupIndex();
+}
+
 // ═══════════════════════════════════════════════════════════════════════════
 // L2 自动增量向量化（总结完成后调用）
 // ═══════════════════════════════════════════════════════════════════════════
@@ -997,6 +1048,9 @@ function handleFrameMessage(event) {

        case "VECTOR_GENERATE":
            if (data.config) saveVectorConfig(data.config);
+            // 向量配置变更，可能刚启用，触发预热
+            maybePreloadTokenizer();
+            refreshEntityLexiconAndWarmup();
            handleGenerateVectors(data.config);
            break;

@@ -1085,6 +1139,8 @@ function handleFrameMessage(event) {

        case "REQUEST_VECTOR_STATS":
            sendVectorStatsToFrame();
+            // 向量开关可能在 iframe 中被修改，检查是否需要预热
+            maybePreloadTokenizer();
            break;

        case "REQUEST_CLEAR": {
@@ -1213,7 +1269,7 @@ async function handleChatChanged() {
    const newLength = Array.isArray(chat) ? chat.length : 0;

    await rollbackSummaryIfNeeded();
-    invalidateLexicalIndex();    
+    invalidateLexicalIndex();
    initButtonsForAll();

    const store = getSummaryStore();
@@ -1230,6 +1286,9 @@ async function handleChatChanged() {
        sendVectorStatsToFrame();
    }

+    // 实体词典注入 + 索引预热
+    refreshEntityLexiconAndWarmup();
+
    setTimeout(() => checkVectorIntegrityAndWarn(), 2000);
 }

@@ -1267,6 +1326,9 @@ async function handleMessageReceived() {

    applyHideStateDebounced();
    setTimeout(() => maybeAutoRunSummary("after_ai"), 1000);
+
+    // 新消息后刷新实体词典（可能有新角色）
+    refreshEntityLexiconAndWarmup();
 }

 function handleMessageSent() {
@@ -1458,4 +1520,7 @@ jQuery(() => {
    if (!getSettings().storySummary?.enabled) return;
    registerEvents();
    initStateIntegration();
+
+    // 条件预热分词器（storySummary 已启用，检查 vector 是否也启用）
+    maybePreloadTokenizer();
 });
--- a/modules/story-summary/vector/llm/reranker.js
+++ b/modules/story-summary/vector/llm/reranker.js
@@ -78,7 +78,8 @@ export async function rerank(query, documents, options = {}) {
            },
            body: JSON.stringify({
                model: RERANK_MODEL,
-                query: query.slice(0, 1000),  // 限制 query 长度
+                // Zero-darkbox: do not silently truncate query.
+                query,
                documents: validDocs,
                top_n: Math.min(topN, validDocs.length),
                return_documents: false,
--- a/modules/story-summary/vector/retrieval/lexical-index.js
+++ b/modules/story-summary/vector/retrieval/lexical-index.js
@@ -4,9 +4,10 @@
 // 职责：
 // 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
 // 2. 提供词法检索接口（专名精确匹配兜底）
-// 3. 惰性构建 + 缓存失效机制
+// 3. 惰性构建 + 异步预热 + 缓存失效机制
 //
 // 索引存储：纯内存（不持久化）
+// 分词器：统一使用 tokenizer.js（结巴 + 实体保护 + 降级）
 // 重建时机：CHAT_CHANGED / L0提取完成 / L2总结完成
 // ═══════════════════════════════════════════════════════════════════════════

@@ -16,6 +17,7 @@ import { getSummaryStore } from '../../data/store.js';
 import { getStateAtoms } from '../storage/state-store.js';
 import { getAllChunks } from '../storage/chunk-store.js';
 import { xbLog } from '../../../../core/debug-core.js';
+import { tokenizeForIndex } from '../utils/tokenizer.js';

 const MODULE_ID = 'lexical-index';

@@ -23,9 +25,20 @@ const MODULE_ID = 'lexical-index';
 // 缓存
 // ─────────────────────────────────────────────────────────────────────────

+/** @type {MiniSearch|null} */
 let cachedIndex = null;
+
+/** @type {string|null} */
 let cachedChatId = null;
-let cachedFingerprint = null; // atoms.length + chunks.length + events.length 的简单指纹
+
+/** @type {string|null} 数据指纹（atoms + chunks + events 数量） */
+let cachedFingerprint = null;
+
+/** @type {boolean} 是否正在构建 */
+let building = false;
+
+/** @type {Promise<MiniSearch|null>|null} 当前构建 Promise（防重入） */
+let buildPromise = null;

 // ─────────────────────────────────────────────────────────────────────────
 // 工具函数
@@ -43,7 +56,7 @@ function cleanSummary(summary) {
 }

 /**
- * 计算缓存指纹（用于判断是否需要重建）
+ * 计算缓存指纹
 * @param {number} atomCount
 * @param {number} chunkCount
 * @param {number} eventCount
@@ -53,39 +66,27 @@ function computeFingerprint(atomCount, chunkCount, eventCount) {
    return `${atomCount}:${chunkCount}:${eventCount}`;
 }

+/**
+ * Yield to the event loop so a long-running build does not block the UI.
+ * @returns {Promise<void>} resolves on a zero-delay timer
+ */
+function yieldToMain() {
+    return new Promise(resolve => setTimeout(resolve, 0));
+}
+
 // ─────────────────────────────────────────────────────────────────────────
-// 索引构建
+// 文档收集
 // ─────────────────────────────────────────────────────────────────────────

 /**
- * 构建 MiniSearch 索引
- *
- * 索引三类文档：
- * - L0 atoms: { id: atomId, type: 'atom', floor, text: semantic }
- * - L1 chunks: { id: chunkId, type: 'chunk', floor, text: chunk.text }
- * - L2 events: { id: eventId, type: 'event', floor: null, text: title + participants + summary }
+ * 收集所有待索引文档
 *
 * @param {object[]} atoms  - getStateAtoms() 返回值
 * @param {object[]} chunks - getAllChunks(chatId) 返回值
 * @param {object[]} events - store.json.events
- * @returns {MiniSearch}
+ * @returns {object[]} 文档数组
 */
-export function buildLexicalIndex(atoms, chunks, events) {
-    const T0 = performance.now();
-
-    const index = new MiniSearch({
-        fields: ['text'],
-        storeFields: ['type', 'floor'],
-        idField: 'id',
-        searchOptions: {
-            boost: { text: 1 },
-            fuzzy: 0.2,
-            prefix: true,
-        },
-        // 中文友好的 tokenizer：按字符 bigram + 空格/标点分词
-        tokenize: chineseTokenize,
-    });
-
+function collectDocuments(atoms, chunks, events) {
    const docs = [];

    // L0 atoms
@@ -129,72 +130,58 @@ export function buildLexicalIndex(atoms, chunks, events) {
        });
    }

-    if (docs.length > 0) {
-        index.addAll(docs);
-    }
-
-    const elapsed = Math.round(performance.now() - T0);
-    xbLog.info(MODULE_ID, `索引构建完成: ${docs.length} 文档 (atoms=${atoms?.length || 0}, chunks=${chunks?.length || 0}, events=${events?.length || 0}) ${elapsed}ms`);
-
-    return index;
+    return docs;
 }

 // ─────────────────────────────────────────────────────────────────────────
-// 中文 Tokenizer
+// 索引构建（分片，不阻塞主线程）
 // ─────────────────────────────────────────────────────────────────────────

+/** 每批添加的文档数 */
+const BUILD_BATCH_SIZE = 500;
+
 /**
- * 中文友好的分词器
+ * 构建 MiniSearch 索引（分片异步）
 *
- * 策略：
- * 1. 连续中文字符 → 滑动 bigram（"黄英梅" → "黄英", "英梅"）
- * 2. 连续非中文字符 → 按空格/标点分割
- * 3. 保留完整中文词（2-4字）作为额外 token
- *
- * @param {string} text
- * @returns {string[]}
+ * @param {object[]} docs - 文档数组
+ * @returns {Promise<MiniSearch>}
 */
-function chineseTokenize(text) {
-    if (!text) return [];
+async function buildIndexAsync(docs) {
+    const T0 = performance.now();

-    const tokens = [];
-    const s = String(text).toLowerCase();
+    const index = new MiniSearch({
+        fields: ['text'],
+        storeFields: ['type', 'floor'],
+        idField: 'id',
+        searchOptions: {
+            boost: { text: 1 },
+            fuzzy: 0.2,
+            prefix: true,
+        },
+        tokenize: tokenizeForIndex,
+    });

-    // 分离中文段和非中文段
-    const segments = s.split(/([\u4e00-\u9fff]+)/g);
+    if (!docs.length) {
+        return index;
+    }

-    for (const seg of segments) {
-        if (!seg) continue;
+    // 分片添加，每批 BUILD_BATCH_SIZE 条后让出主线程
+    for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
+        const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
+        index.addAll(batch);

-        // 中文段：bigram + 完整段（如果 2-6 字）
-        if (/^[\u4e00-\u9fff]+$/.test(seg)) {
-            // 完整段作为一个 token（如果长度合适）
-            if (seg.length >= 2 && seg.length <= 6) {
-                tokens.push(seg);
-            }
-
-            // bigram
-            for (let i = 0; i < seg.length - 1; i++) {
-                tokens.push(seg.slice(i, i + 2));
-            }
-
-            // trigram（对 3+ 字的段）
-            for (let i = 0; i < seg.length - 2; i++) {
-                tokens.push(seg.slice(i, i + 3));
-            }
-        } else {
-            // 非中文段：按空格/标点分割
-const words = seg.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/);
-            for (const w of words) {
-                const trimmed = w.trim();
-                if (trimmed.length >= 2) {
-                    tokens.push(trimmed);
-                }
-            }
+        // 非最后一批时让出主线程
+        if (i + BUILD_BATCH_SIZE < docs.length) {
+            await yieldToMain();
        }
    }

-    return tokens;
+    const elapsed = Math.round(performance.now() - T0);
+    xbLog.info(MODULE_ID,
+        `索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
+    );
+
+    return index;
 }

 // ─────────────────────────────────────────────────────────────────────────
@@ -247,6 +234,8 @@ export function searchLexicalIndex(index, terms) {
            fuzzy: 0.2,
            prefix: true,
            combineWith: 'OR',
+            // 使用与索引相同的分词器
+            tokenize: tokenizeForIndex,
        });
    } catch (e) {
        xbLog.warn(MODULE_ID, '检索失败', e);
@@ -305,22 +294,17 @@ export function searchLexicalIndex(index, terms) {
 }

 // ─────────────────────────────────────────────────────────────────────────
-// 惰性缓存管理
+// 内部构建流程（收集数据 + 构建索引）
 // ─────────────────────────────────────────────────────────────────────────

 /**
- * 获取词法索引（惰性构建 + 缓存）
+ * 收集数据并构建索引
 *
- * 如果缓存有效则直接返回；否则自动构建。
- * 缓存失效条件：chatId 变化 / 数据指纹变化 / 手动 invalidate
- *
- * @returns {Promise<MiniSearch>}
+ * @param {string} chatId
+ * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
 */
-export async function getLexicalIndex() {
-    const { chatId } = getContext();
-    if (!chatId) return null;
-
-    // 收集当前数据
+async function collectAndBuild(chatId) {
+    // 收集数据
    const atoms = getStateAtoms() || [];
    const store = getSummaryStore();
    const events = store?.json?.events || [];
@@ -334,30 +318,118 @@ export async function getLexicalIndex() {

    const fp = computeFingerprint(atoms.length, chunks.length, events.length);

-    // 缓存命中
+    // 检查是否在收集过程中缓存已被其他调用更新
    if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
+        return { index: cachedIndex, fingerprint: fp };
+    }
+
+    // 收集文档
+    const docs = collectDocuments(atoms, chunks, events);
+
+    // 异步分片构建
+    const index = await buildIndexAsync(docs);
+
+    return { index, fingerprint: fp };
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：getLexicalIndex（惰性获取）
+// ─────────────────────────────────────────────────────────────────────────
+
+/**
+ * Return the lexical index for the current chat (lazy build + cache).
+ *
+ * Returns the cached index when one exists for this chatId; otherwise builds one.
+ * If a build is already in flight, waits for it and re-checks the cache first.
+ *
+ * @returns {Promise<MiniSearch|null>} null when there is no chatId or the build throws
+ */
+export async function getLexicalIndex() {
+    const { chatId } = getContext();
+    if (!chatId) return null;
+
+    // Fast path: a cached index for the same chatId is a hit (the fingerprint is only checked for presence)
+    // Fingerprint validation is left to the build path, to avoid an extra IndexedDB read just for the fingerprint
+    if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
         return cachedIndex;
     }

-    // 重建
-    xbLog.info(MODULE_ID, `缓存失效，重建索引 (chatId=${chatId.slice(0, 8)}, fp=${fp})`);
+    // A build is in flight: wait for it, then re-check the cache
+    if (building && buildPromise) {
+        try {
+            await buildPromise;
+            if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
+                return cachedIndex;
+            }
+        } catch {
+            // That build failed; fall through and rebuild
+        }
+    }

-    const index = buildLexicalIndex(atoms, chunks, events);
+    // Rebuild needed (collectAndBuild returns the fingerprint, which is stored in the cache below)
+    xbLog.info(MODULE_ID, `缓存失效，重建索引 (chatId=${chatId.slice(0, 8)})`);

-    cachedIndex = index;
-    cachedChatId = chatId;
-    cachedFingerprint = fp;
+    building = true;
+    buildPromise = collectAndBuild(chatId);

-    return index;
+    try {
+        const { index, fingerprint } = await buildPromise;
+
+        // Swap all three cache fields together (no await in between)
+        cachedIndex = index;
+        cachedChatId = chatId;
+        cachedFingerprint = fingerprint;
+
+        return index;
+    } catch (e) {
+        xbLog.error(MODULE_ID, '索引构建失败', e);
+        return null;
+    } finally {
+        building = false;
+        buildPromise = null;
+    }
 }

+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：warmupIndex（异步预建）
+// ─────────────────────────────────────────────────────────────────────────
+
 /**
- * 使缓存失效（下次 getLexicalIndex 时自动重建）
+ * 异步预建索引
+ *
+ * 在 CHAT_CHANGED 时调用，后台构建索引。
+ * 不阻塞调用方，不返回结果。
+ * 构建完成后缓存自动更新，后续 getLexicalIndex() 直接命中。
+ *
+ * 调用时机：
+ * - handleChatChanged（实体注入后）
+ * - L0 提取完成
+ * - L2 总结完成
+ */
+export function warmupIndex() {
+    const { chatId } = getContext();
+    if (!chatId) return;
+
+    // Build already in flight: do not start another. NOTE(review): it may predate the latest invalidation — confirm.
+    if (building) return;
+
+    // fire-and-forget
+    getLexicalIndex().catch(e => {
+        xbLog.warn(MODULE_ID, '预热索引失败', e);
+    });
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+// 公开接口：invalidateLexicalIndex（缓存失效）
+// ─────────────────────────────────────────────────────────────────────────
+
+/**
+ * 使缓存失效（下次 getLexicalIndex / warmupIndex 时自动重建）
 *
 * 调用时机：
 * - CHAT_CHANGED
- * - L0 提取完成（handleAnchorGenerate 完成后）
- * - L2 总结完成（onComplete 回调中）
+ * - L0 提取完成
+ * - L2 总结完成
 */
 export function invalidateLexicalIndex() {
    if (cachedIndex) {
--- a/modules/story-summary/vector/retrieval/metrics.js
+++ b/modules/story-summary/vector/retrieval/metrics.js
@@ -16,6 +16,11 @@ export function createMetrics() {
        query: {
            buildTime: 0,
            refineTime: 0,
+            lengths: {
+                v0Chars: 0,
+                v1Chars: null,     // null = NA
+                rerankChars: 0,
+            },
        },

        // Anchor (L0 StateAtoms) - 语义锚点
@@ -177,6 +182,13 @@ export function formatMetricsLog(metrics) {
    lines.push('════════════════════════════════════════');
    lines.push('');

+    // Query Length
+    lines.push('[Query Length] 查询长度');
+    lines.push(`├─ query_v0_chars: ${m.query?.lengths?.v0Chars ?? 0}`);
+    lines.push(`├─ query_v1_chars: ${m.query?.lengths?.v1Chars == null ? 'NA' : m.query.lengths.v1Chars}`);
+    lines.push(`└─ rerank_query_chars: ${m.query?.lengths?.rerankChars ?? 0}`);
+    lines.push('');
+
    // Query Build
    lines.push('[Query] 查询构建');
    lines.push(`├─ build_time: ${m.query.buildTime}ms`);
--- a/modules/story-summary/vector/retrieval/query-builder.js
+++ b/modules/story-summary/vector/retrieval/query-builder.js
@@ -12,36 +12,18 @@ import { getContext } from '../../../../../../../extensions.js';
 import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText } from './entity-lexicon.js';
 import { getSummaryStore } from '../../data/store.js';
 import { filterText } from '../utils/text-filter.js';
+import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';

 // ─────────────────────────────────────────────────────────────────────────
 // 常量
 // ─────────────────────────────────────────────────────────────────────────

-const DIALOGUE_MAX_CHARS = 400;
-const PENDING_MAX_CHARS = 400;
-const MEMORY_HINT_MAX_CHARS = 100;
+// Zero-darkbox policy:
+// - No internal truncation. We rely on model-side truncation / provider limits.
+// - If provider rejects due to length, we fail loudly and degrade explicitly.
 const MEMORY_HINT_ATOMS_MAX = 5;
 const MEMORY_HINT_EVENTS_MAX = 3;
-const RERANK_QUERY_MAX_CHARS = 500;
-const RERANK_SNIPPET_CHARS = 150;
 const LEXICAL_TERMS_MAX = 10;
-const LEXICAL_TERM_MIN_LEN = 2;
-const LEXICAL_TERM_MAX_LEN = 6;
-
-// 中文停用词（高频无意义词）
-const STOP_WORDS = new Set([
-    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
-    '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
-    '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
-    '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
-    '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
-    '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
-    '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
-    '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
-    '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
-    '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
-    '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
-]);

 // ─────────────────────────────────────────────────────────────────────────
 // 工具函数
@@ -65,10 +47,7 @@ function cleanMessageText(text) {
 * @param {number} maxLen
 * @returns {string}
 */
-function truncate(text, maxLen) {
-    if (!text || text.length <= maxLen) return text || '';
-    return text.slice(0, maxLen) + '…';
-}
+// truncate removed by design (zero-darkbox)

 /**
 * 清理事件摘要（移除楼层标记）
@@ -84,8 +63,7 @@ function cleanSummary(summary) {
 /**
 * 从文本中提取高频实词（用于词法检索）
 *
- * 策略：按中文字符边界 + 空格/标点分词，取长度 2-6 的片段
- * 过滤停用词，按频率排序
+ * 使用统一分词器（结巴 + 实体保护 + 停用词过滤），按频率排序
 *
 * @param {string} text - 清洗后的文本
 * @param {number} maxTerms - 最大词数
@@ -94,15 +72,15 @@ function cleanSummary(summary) {
 function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
    if (!text) return [];

-    // 提取连续中文片段 + 英文单词
-    const segments = text.match(/[\u4e00-\u9fff]{2,6}|[a-zA-Z]{3,}/g) || [];
+    // 使用统一分词器（索引用，不去重，保留词频）
+    const tokens = tokenizerTokenizeForIndex(text);

+    // 统计词频
    const freq = new Map();
-    for (const seg of segments) {
-        const s = seg.toLowerCase();
-        if (s.length < LEXICAL_TERM_MIN_LEN || s.length > LEXICAL_TERM_MAX_LEN) continue;
-        if (STOP_WORDS.has(s)) continue;
-        freq.set(s, (freq.get(s) || 0) + 1);
+    for (const token of tokens) {
+        const key = String(token || '').toLowerCase();
+        if (!key) continue;
+        freq.set(key, (freq.get(key) || 0) + 1);
    }

    return Array.from(freq.entries())
@@ -160,8 +138,9 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null,
        const clean = cleanMessageText(m.mes || '');

        if (clean) {
-            // ★ 修复 A：不使用楼层号，embedding 模型不需要
-            dialogueLines.push(`${speaker}: ${truncate(clean, DIALOGUE_MAX_CHARS)}`);
+            // 不使用楼层号，embedding 模型不需要
+            // 不截断，零暗箱
+            dialogueLines.push(`${speaker}: ${clean}`);
            allCleanText.push(clean);
        }
    }
@@ -191,30 +170,15 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null,
    }

    if (pendingClean) {
-        queryParts.push(`[PENDING_USER]\n${truncate(pendingClean, PENDING_MAX_CHARS)}`);
+        // 不截断，零暗箱
+        queryParts.push(`[PENDING_USER]\n${pendingClean}`);
    }

    const queryText_v0 = queryParts.join('\n\n');

-    // 6. 构建 rerankQuery（短版）
-    const rerankParts = [];
-
-    if (focusEntities.length > 0) {
-        rerankParts.push(focusEntities.join(' '));
-    }
-
-    for (const m of (lastMessages || [])) {
-        const clean = cleanMessageText(m.mes || '');
-        if (clean) {
-            rerankParts.push(truncate(clean, RERANK_SNIPPET_CHARS));
-        }
-    }
-
-    if (pendingClean) {
-        rerankParts.push(truncate(pendingClean, RERANK_SNIPPET_CHARS));
-    }
-
-    const rerankQuery = truncate(rerankParts.join('\n'), RERANK_QUERY_MAX_CHARS);
+    // 6. rerankQuery 与 embedding query 同源（零暗箱）
+    // 后续 refine 会把它升级为与 queryText_v1 同源。
+    const rerankQuery = queryText_v0;

    // 7. 构建 lexicalTerms
    const entityTerms = focusEntities.map(e => e.toLowerCase());
@@ -265,7 +229,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
    for (const hit of topAnchors) {
        const semantic = hit.atom?.semantic || '';
        if (semantic) {
-            hints.push(truncate(semantic, MEMORY_HINT_MAX_CHARS));
+            // 不截断，零暗箱
+            hints.push(semantic);
        }
    }

@@ -279,13 +244,15 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
            ? `${title}: ${summary}`
            : title || summary;
        if (line) {
-            hints.push(truncate(line, MEMORY_HINT_MAX_CHARS));
+            // 不截断，零暗箱
+            hints.push(line);
        }
    }

-    // 3. 构建 queryText_v1
+    // 3. 构建 queryText_v1（Hints 前置，最优先）
    if (hints.length > 0) {
-        bundle.queryText_v1 = bundle.queryText_v0 + `\n\n[MEMORY_HINTS]\n${hints.join('\n')}`;
+        const hintText = `[MEMORY_HINTS]\n${hints.join('\n')}`;
+        bundle.queryText_v1 = hintText + `\n\n` + bundle.queryText_v0;
    } else {
        bundle.queryText_v1 = bundle.queryText_v0;
    }
@@ -314,17 +281,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
        }
    }

-    // 5. 增强 rerankQuery
-    if (hints.length > 0) {
-        const hintKeywords = extractKeyTerms(hints.join(' '), 5);
-        if (hintKeywords.length > 0) {
-            const addition = hintKeywords.join(' ');
-            bundle.rerankQuery = truncate(
-                bundle.rerankQuery + '\n' + addition,
-                RERANK_QUERY_MAX_CHARS
-            );
-        }
-    }
+    // 5. rerankQuery 与最终 query 同源（零暗箱）
+    bundle.rerankQuery = bundle.queryText_v1 || bundle.queryText_v0;

    // 6. 增强 lexicalTerms
    if (hints.length > 0) {
--- a/modules/story-summary/vector/retrieval/recall.js
+++ b/modules/story-summary/vector/retrieval/recall.js
@@ -782,6 +782,14 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
    metrics.query.buildTime = Math.round(performance.now() - T_Build_Start);
    metrics.anchor.focusEntities = bundle.focusEntities;

+    // Query lengths (v0 available here)
+    if (metrics.query?.lengths) {
+        metrics.query.lengths.v0Chars = String(bundle.queryText_v0 || '').length;
+        // v1 not built yet
+        metrics.query.lengths.v1Chars = null;
+        metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v0 || '').length;
+    }
+
    xbLog.info(MODULE_ID,
        `Query Build: focus=[${bundle.focusEntities.join(',')}] lexTerms=[${bundle.lexicalTerms.slice(0, 5).join(',')}]`
    );
@@ -841,6 +849,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
    // 更新 focusEntities（refinement 可能扩展了）
    metrics.anchor.focusEntities = bundle.focusEntities;

+    // Query lengths (v1/rerank updated here)
+    if (metrics.query?.lengths) {
+        metrics.query.lengths.v1Chars = bundle.queryText_v1 == null ? null : String(bundle.queryText_v1).length;
+        metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v1 || bundle.queryText_v0 || '').length;
+    }
+
    xbLog.info(MODULE_ID,
        `Refinement: focus=[${bundle.focusEntities.join(',')}] hasV1=${!!bundle.queryText_v1} (${metrics.query.refineTime}ms)`
    );
--- a/modules/story-summary/vector/utils/tokenizer.js
+++ b/modules/story-summary/vector/utils/tokenizer.js
@@ -0,0 +1,650 @@
+// ═══════════════════════════════════════════════════════════════════════════
+// tokenizer.js - 统一分词器
+//
+// 职责：
+// 1. 管理结巴 WASM 生命周期（预加载 / 就绪检测 / 降级）
+// 2. 实体词典注入（分词前最长匹配保护）
+// 3. 亚洲文字（CJK + 假名）走结巴，拉丁文字走空格分割
+// 4. 提供 tokenize(text): string[] 统一接口
+//
+// 加载时机：
+// - 插件初始化时 storySummary.enabled && vectorConfig.enabled → preload()
+// - 向量开关从 off→on 时 → preload()
+// - CHAT_CHANGED 时 → injectEntities() + warmup 索引（不负责加载 WASM）
+//
+// 降级策略：
+// - WASM 未就绪时 → 实体保护 + 标点分割（不用 bigram）
+// ═══════════════════════════════════════════════════════════════════════════
+
+import { extensionFolderPath } from '../../../../core/constants.js';
+import { xbLog } from '../../../../core/debug-core.js';
+
+const MODULE_ID = 'tokenizer';
+
+// ═══════════════════════════════════════════════════════════════════════════
+// WASM 状态机
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * @enum {string}
+ */
+const WasmState = {
+    IDLE: 'IDLE',
+    LOADING: 'LOADING',
+    READY: 'READY',
+    FAILED: 'FAILED',
+};
+
+let wasmState = WasmState.IDLE;
+
+/** @type {Promise<boolean>|null} In-flight load promise (re-entrancy guard) */
+let loadingPromise = null;
+
+/** @type {typeof import('../../../../libs/jieba-wasm/jieba_rs_wasm.js')|null} */
+let jiebaModule = null;
+
+/** @type {Function|null} Reference to jieba's cut function */
+let jiebaCut = null;
+
+/** @type {Function|null} Reference to jieba's add_word function */
+let jiebaAddWord = null;
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 实体词典
+// ═══════════════════════════════════════════════════════════════════════════
+
+/** @type {string[]} Entity list sorted by length, longest first (used for longest-match) */
+let entityList = [];
+
+/** @type {Set<string>} Entities already injected into jieba (avoids repeated add_word) */
+let injectedEntities = new Set();
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 停用词
+// ═══════════════════════════════════════════════════════════════════════════
+
+const STOP_WORDS = new Set([
+    // High-frequency Chinese function words
+    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
+    '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
+    '你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
+    '它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
+    '嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
+    '把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
+    '而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
+    '可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
+    '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
+    '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
+    '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
+    // Japanese particles and common function words
+    'は', 'が', 'を', 'に', 'で', 'と', 'の', 'も', 'へ', 'や',
+    'か', 'な', 'よ', 'ね', 'わ', 'だ', 'です', 'ます', 'た', 'て',
+    'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない',
+    'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
+    'これ', 'それ', 'あれ', 'どれ',
+    // Common English stop words
+    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
+    'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+    'would', 'could', 'should', 'may', 'might', 'can', 'shall',
+    'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
+    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
+    'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
+    'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
+    'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
+    'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
+    'both', 'few', 'more', 'most', 'other', 'some', 'such',
+    'only', 'own', 'same', 'just', 'very', 'also', 'about',
+]);
+
+// ═══════════════════════════════════════════════════════════════════════════
+// Unicode 分类
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Whether a character is Asian script (CJK ideographs or kana).
+ * @param {number} code - character code; ranges above 0xFFFF only match a full code point (codePointAt), not a UTF-16 unit (charCodeAt)
+ * @returns {boolean}
+ */
+function isAsian(code) {
+    return (
+        (code >= 0x4E00 && code <= 0x9FFF) ||   // CJK Unified Ideographs
+        (code >= 0x3400 && code <= 0x4DBF) ||   // CJK Extension A
+        (code >= 0x3040 && code <= 0x309F) ||   // Hiragana
+        (code >= 0x30A0 && code <= 0x30FF) ||   // Katakana
+        (code >= 0x31F0 && code <= 0x31FF) ||   // Katakana Phonetic Extensions
+        (code >= 0xFF65 && code <= 0xFF9F) ||   // Halfwidth Katakana
+        (code >= 0xF900 && code <= 0xFAFF) ||   // CJK Compatibility Ideographs
+        (code >= 0x20000 && code <= 0x2A6DF)    // CJK Extension B
+    );
+}
+
+/**
+ * Whether a character is a Latin letter or an ASCII digit.
+ * @param {number} code - character code
+ * @returns {boolean}
+ */
+function isLatin(code) {
+    return (
+        (code >= 0x41 && code <= 0x5A) ||       // A-Z
+        (code >= 0x61 && code <= 0x7A) ||       // a-z
+        (code >= 0x30 && code <= 0x39) ||       // 0-9
+        (code >= 0xC0 && code <= 0x024F)        // Latin Extended (àáâ etc.); the range also spans × (U+00D7) and ÷ (U+00F7)
+    );
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 文本分段（亚洲 vs 拉丁 vs 其他）
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * @typedef {'asian'|'latin'|'other'} SegmentType
+ */
+
+/**
+ * @typedef {object} TextSegment
+ * @property {SegmentType} type - 段类型
+ * @property {string} text - 段文本
+ */
+
+/**
+ * 将文本按 Unicode 脚本分段
+ * 连续的同类字符归为一段
+ *
+ * @param {string} text
+ * @returns {TextSegment[]}
+ */
+function segmentByScript(text) {
+    if (!text) return [];
+
+    const segments = [];
+    let currentType = null;
+    let currentStart = 0;
+
+    for (let i = 0; i < text.length; i++) {
+        const code = text.charCodeAt(i);
+        let type;
+
+        if (isAsian(code)) {
+            type = 'asian';
+        } else if (isLatin(code)) {
+            type = 'latin';
+        } else {
+            type = 'other';
+        }
+
+        if (type !== currentType) {
+            if (currentType !== null && currentStart < i) {
+                const seg = text.slice(currentStart, i);
+                if (currentType !== 'other' || seg.trim()) {
+                    segments.push({ type: currentType, text: seg });
+                }
+            }
+            currentType = type;
+            currentStart = i;
+        }
+    }
+
+    // 最后一段
+    if (currentStart < text.length) {
+        const seg = text.slice(currentStart);
+        if (currentType !== 'other' || seg.trim()) {
+            segments.push({ type: currentType, text: seg });
+        }
+    }
+
+    return segments;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 实体保护（最长匹配占位符替换）
+// ═══════════════════════════════════════════════════════════════════════════
+
+// Placeholders are delimited by Unicode Private Use Area (PUA) characters, to avoid the unpredictable tokenizer behavior that control characters cause
+const PLACEHOLDER_PREFIX = '\uE000ENT_';
+const PLACEHOLDER_SUFFIX = '\uE001';
+
+/**
+ * 在文本中执行实体最长匹配，替换为占位符
+ *
+ * @param {string} text - 原始文本
+ * @returns {{masked: string, entities: Map<string, string>}} masked 文本 + 占位符→原文映射
+ */
+function maskEntities(text) {
+    const entities = new Map();
+
+    if (!entityList.length || !text) {
+        return { masked: text, entities };
+    }
+
+    let masked = text;
+    let idx = 0;
+
+    // entityList 已按长度降序排列，保证最长匹配优先
+    for (const entity of entityList) {
+        // 大小写不敏感搜索
+        const lowerMasked = masked.toLowerCase();
+        const lowerEntity = entity.toLowerCase();
+        let searchFrom = 0;
+
+        while (true) {
+            const pos = lowerMasked.indexOf(lowerEntity, searchFrom);
+            if (pos === -1) break;
+
+            // 已被占位符覆盖则跳过（检查前后是否存在 PUA 边界字符）
+            const aroundStart = Math.max(0, pos - 4);
+            const aroundEnd = Math.min(masked.length, pos + entity.length + 4);
+            const around = masked.slice(aroundStart, aroundEnd);
+            if (around.includes('\uE000') || around.includes('\uE001')) {
+                searchFrom = pos + 1;
+                continue;
+            }
+
+            const placeholder = `${PLACEHOLDER_PREFIX}${idx}${PLACEHOLDER_SUFFIX}`;
+            const originalText = masked.slice(pos, pos + entity.length);
+            entities.set(placeholder, originalText);
+
+            masked = masked.slice(0, pos) + placeholder + masked.slice(pos + entity.length);
+            idx++;
+
+            // 更新搜索位置（跳过占位符）
+            searchFrom = pos + placeholder.length;
+        }
+    }
+
+    return { masked, entities };
+}
+
+/**
+ * Restore placeholders in a token array to their original entity text.
+ *
+ * @param {string[]} tokens
+ * @param {Map<string, string>} entities - placeholder→original-text map
+ * @returns {string[]}
+ */
+function unmaskTokens(tokens, entities) {
+    if (!entities.size) return tokens;
+
+    return tokens.map(token => {
+        // The token is exactly one placeholder
+        if (entities.has(token)) {
+            return entities.get(token);
+        }
+
+        // The token contains a placeholder (jieba may glue a placeholder to neighbouring characters)
+        let result = token;
+        for (const [placeholder, original] of entities) {
+            if (result.includes(placeholder)) {
+                result = result.replace(placeholder, original);
+            }
+        }
+
+        return result;
+    });
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 分词：亚洲文字（结巴 / 降级）
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Tokenize an Asian-script segment with jieba.
+ * @param {string} text
+ * @returns {string[]} trimmed tokens of length >= 2; [] when text is empty or jiebaCut is not set
+ */
+function tokenizeAsianJieba(text) {
+    if (!text || !jiebaCut) return [];
+
+    try {
+        const words = jiebaCut(text, true); // hmm=true
+        return Array.from(words)
+            .map(w => String(w || '').trim())
+            .filter(w => w.length >= 2);
+    } catch (e) {
+        xbLog.warn(MODULE_ID, '结巴分词异常，降级处理', e);
+        return tokenizeAsianFallback(text);
+    }
+}
+
+/**
+ * Fallback tokenizer: split on punctuation/whitespace and keep fragments of 2-6 chars (1-char fragments are dropped).
+ * Does not use bigrams, to avoid index bloat.
+ *
+ * @param {string} text
+ * @returns {string[]}
+ */
+function tokenizeAsianFallback(text) {
+    if (!text) return [];
+
+    const tokens = [];
+
+    // Split on punctuation and whitespace
+    const parts = text.split(/[\s，。！？、；：""''（）【】《》…—\-,.!?;:'"()[\]{}<>/\\|@#$%^&*+=~`]+/);
+
+    for (const part of parts) {
+        const trimmed = part.trim();
+        if (!trimmed) continue;
+
+        if (trimmed.length >= 2 && trimmed.length <= 6) {
+            tokens.push(trimmed);
+        } else if (trimmed.length > 6) {
+            // Long fragment: 4-char windows with stride 2 (much sparser than bigrams); for odd lengths the last char falls outside every window
+            for (let i = 0; i <= trimmed.length - 4; i += 2) {
+                tokens.push(trimmed.slice(i, i + 4));
+            }
+            // Also keep the first 6 chars of the fragment
+            tokens.push(trimmed.slice(0, 6));
+        }
+    }
+
+    return tokens;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 分词：拉丁文字
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Tokenize Latin-script text: split on whitespace/punctuation, lowercase, keep tokens of length >= 3.
+ * @param {string} text
+ * @returns {string[]}
+ */
+function tokenizeLatin(text) {
+    if (!text) return [];
+
+    return text
+        .split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/)
+        .map(w => w.trim().toLowerCase())
+        .filter(w => w.length >= 3);
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：preload
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Preload the jieba WASM module.
+ *
+ * Safe to call repeatedly; a call made while a load is in flight waits for that load.
+ * Calling again in the FAILED state retries the load.
+ *
+ * @returns {Promise<boolean>} whether the load succeeded
+ */
+export async function preload() {
+    // Already loaded
+    if (wasmState === WasmState.READY) return true;
+
+    // A load is in flight: wait for its outcome
+    if (wasmState === WasmState.LOADING && loadingPromise) {
+        try {
+            await loadingPromise;
+            return wasmState === WasmState.READY;
+        } catch {
+            return false;
+        }
+    }
+
+    // IDLE or FAILED → start loading
+    wasmState = WasmState.LOADING;
+
+    const T0 = performance.now();
+
+    loadingPromise = (async () => {
+        try {
+            // Dynamically import the jieba module
+            const wasmPath = `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm_bg.wasm`;
+
+            // eslint-disable-next-line no-unsanitized/method
+            jiebaModule = await import(
+                `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm.js`
+            );
+
+            // Initialise the WASM binary (only when the module exposes a default init function)
+            if (typeof jiebaModule.default === 'function') {
+                await jiebaModule.default(wasmPath);
+            }
+
+            // Cache function references
+            jiebaCut = jiebaModule.cut;
+            jiebaAddWord = jiebaModule.add_word;
+
+            if (typeof jiebaCut !== 'function') {
+                throw new Error('jieba cut 函数不存在');
+            }
+
+            wasmState = WasmState.READY;
+
+            const elapsed = Math.round(performance.now() - T0);
+            xbLog.info(MODULE_ID, `结巴 WASM 加载完成 (${elapsed}ms)`);
+
+            // If entities are already registered, inject them now via reInjectAllEntities()
+            if (entityList.length > 0 && jiebaAddWord) {
+                reInjectAllEntities();
+            }
+
+            return true;
+        } catch (e) {
+            wasmState = WasmState.FAILED;
+            xbLog.error(MODULE_ID, '结巴 WASM 加载失败', e);
+            throw e;
+        }
+    })();
+
+    try {
+        await loadingPromise;
+        return true;
+    } catch {
+        return false;
+    } finally {
+        loadingPromise = null;
+    }
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：isReady
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Report whether jieba is ready for use.
+ * @returns {boolean} true when the WASM state is READY
+ */
+export function isReady() {
+    return wasmState === WasmState.READY;
+}
+
+/**
+ * Get the current WASM loading state.
+ * @returns {string} the current value of the module-level state variable
+ */
+export function getState() {
+    return wasmState;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：injectEntities
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * 注入实体词典
+ *
+ * 更新内部实体列表（用于最长匹配保护）
+ * 如果结巴已就绪，同时调用 add_word 注入
+ *
+ * @param {Set<string>} lexicon - 标准化后的实体集合
+ * @param {Map<string, string>} [displayMap] - normalize→原词形映射
+ */
+export function injectEntities(lexicon, displayMap) {
+    if (!lexicon?.size) {
+        entityList = [];
+        return;
+    }
+
+    // 构建实体列表：使用原词形（displayMap），按长度降序排列
+    const entities = [];
+    for (const normalized of lexicon) {
+        const display = displayMap?.get(normalized) || normalized;
+        if (display.length >= 2) {
+            entities.push(display);
+        }
+    }
+
+    // 按长度降序（最长匹配优先）
+    entities.sort((a, b) => b.length - a.length);
+    entityList = entities;
+
+    // 如果结巴已就绪，注入自定义词
+    if (wasmState === WasmState.READY && jiebaAddWord) {
+        injectNewEntitiesToJieba(entities);
+    }
+
+    xbLog.info(MODULE_ID, `实体词典更新: ${entities.length} 个实体`);
+}
+
+/**
+ * 将新实体注入结巴（增量，跳过已注入的）
+ * @param {string[]} entities
+ */
+function injectNewEntitiesToJieba(entities) {
+    let count = 0;
+    for (const entity of entities) {
+        if (!injectedEntities.has(entity)) {
+            try {
+                // freq 设高保证不被切碎
+                jiebaAddWord(entity, 99999);
+                injectedEntities.add(entity);
+                count++;
+            } catch (e) {
+                xbLog.warn(MODULE_ID, `add_word 失败: ${entity}`, e);
+            }
+        }
+    }
+    if (count > 0) {
+        xbLog.info(MODULE_ID, `注入 ${count} 个新实体到结巴`);
+    }
+}
+
+/**
+ * Re-inject every known entity into jieba (called right after the WASM
+ * module finishes loading).
+ */
+function reInjectAllEntities() {
+    // Forget what was injected before so every entry in entityList is sent again.
+    injectedEntities.clear();
+    injectNewEntitiesToJieba(entityList);
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：tokenize
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * 统一分词接口
+ *
+ * 流程：
+ * 1. 实体最长匹配 → 占位符保护
+ * 2. 按 Unicode 脚本分段（亚洲 vs 拉丁）
+ * 3. 亚洲段 → 结巴 cut()（或降级）
+ * 4. 拉丁段 → 空格/标点分割
+ * 5. 还原占位符
+ * 6. 过滤停用词 + 去重
+ *
+ * @param {string} text - 输入文本
+ * @returns {string[]} token 数组
+ */
+export function tokenize(text) {
+    const restored = tokenizeCore(text);
+
+    // 5. 过滤停用词 + 去重 + 清理
+    const seen = new Set();
+    const result = [];
+
+    for (const token of restored) {
+        const cleaned = token.trim().toLowerCase();
+
+        if (!cleaned) continue;
+        if (cleaned.length < 2) continue;
+        if (STOP_WORDS.has(cleaned)) continue;
+        if (seen.has(cleaned)) continue;
+
+        // 过滤纯标点/特殊字符
+        if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(cleaned)) continue;
+
+        seen.add(cleaned);
+        result.push(token.trim()); // 保留原始大小写
+    }
+
+    return result;
+}
+
+/**
+ * 内核分词流程（不去重、不 lower、仅完成：实体保护→分段→分词→还原）
+ * @param {string} text
+ * @returns {string[]}
+ */
+function tokenizeCore(text) {
+    if (!text) return [];
+
+    const input = String(text).trim();
+    if (!input) return [];
+
+    // 1. 实体保护
+    const { masked, entities } = maskEntities(input);
+
+    // 2. 分段
+    const segments = segmentByScript(masked);
+
+    // 3. 分段分词
+    const rawTokens = [];
+    for (const seg of segments) {
+        if (seg.type === 'asian') {
+            if (wasmState === WasmState.READY && jiebaCut) {
+                rawTokens.push(...tokenizeAsianJieba(seg.text));
+            } else {
+                rawTokens.push(...tokenizeAsianFallback(seg.text));
+            }
+        } else if (seg.type === 'latin') {
+            rawTokens.push(...tokenizeLatin(seg.text));
+        }
+    }
+
+    // 4. 还原占位符
+    return unmaskTokens(rawTokens, entities);
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：tokenizeForIndex
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * MiniSearch 索引专用分词
+ *
+ * 与 tokenize() 的区别：
+ * - 全部转小写（MiniSearch 内部需要一致性）
+ * - 不去重（MiniSearch 自己处理词频）
+ *
+ * @param {string} text
+ * @returns {string[]}
+ */
+export function tokenizeForIndex(text) {
+    const restored = tokenizeCore(text);
+
+    return restored
+        .map(t => t.trim().toLowerCase())
+        .filter(t => {
+            if (!t || t.length < 2) return false;
+            if (STOP_WORDS.has(t)) return false;
+            if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) return false;
+            return true;
+        });
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// 公开接口：reset
+// ═══════════════════════════════════════════════════════════════════════════
+
+/**
+ * Reset the tokenizer's entity state.
+ * Intended for tests or module teardown.
+ */
+export function reset() {
+    entityList = [];
+    injectedEntities.clear();
+    // The WASM state is deliberately left untouched (avoids loading it again)
+}