Zero-darkbox query updates and tokenizer improvements

This commit is contained in:
2026-02-09 20:25:26 +08:00
parent 8131d6a15f
commit 0a28539b29
14 changed files with 1771 additions and 175 deletions

25
libs/jieba-wasm/LICENSE Normal file
View File

@@ -0,0 +1,25 @@
Copyright (c) 2018 fengkx <liangkx8237@gmail.com>
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

134
libs/jieba-wasm/README.md Normal file
View File

@@ -0,0 +1,134 @@
# jieba-wasm
> [jieba-rs](https://github.com/messense/jieba-rs) 的 wasm binding
_编译成 WASM 摆脱编译 Node Addon 的烦恼_
# Usage
## Node.js
```js
const {
cut,
cut_all,
cut_for_search,
tokenize,
add_word,
with_dict,
} = require("jieba-wasm");
cut("中华人民共和国武汉市长江大桥", true);
// [ '中华人民共和国', '武汉市', '长江大桥' ]
cut_all("中华人民共和国武汉市长江大桥", true);
/*
[
'中', '中华',
'中华人民', '中华人民共和国',
'华', '华人',
'人', '人民',
'人民共和国', '民',
'共', '共和',
'共和国', '和',
'国', '武',
'武汉', '武汉市',
'汉', '市',
'市长', '长',
'长江', '长江大桥',
'江', '大',
'大桥', '桥'
]
*/
cut_for_search("中华人民共和国武汉市长江大桥", true);
/*
[
'中华', '华人',
'人民', '共和',
'共和国', '中华人民共和国',
'武汉', '武汉市',
'长江', '大桥',
'长江大桥'
]
*/
tokenize("中华人民共和国武汉市长江大桥", "default", true);
/*
[
{ word: '中华人民共和国', start: 0, end: 7 },
{ word: '武汉市', start: 7, end: 10 },
{ word: '长江大桥', start: 10, end: 14 }
]
*/
tokenize("中华人民共和国武汉市长江大桥", "search", true);
/*
[
{ word: '中华', start: 0, end: 2 },
{ word: '华人', start: 1, end: 3 },
{ word: '人民', start: 2, end: 4 },
{ word: '共和', start: 4, end: 6 },
{ word: '共和国', start: 4, end: 7 },
{ word: '中华人民共和国', start: 0, end: 7 },
{ word: '武汉', start: 7, end: 9 },
{ word: '武汉市', start: 7, end: 10 },
{ word: '长江', start: 10, end: 12 },
{ word: '大桥', start: 12, end: 14 },
{ word: '长江大桥', start: 10, end: 14 }
]
*/
cut("桥大江长市汉武的省北湖国和共民人华中");
/*
[
'桥', '大江', '长',
'市', '汉', '武',
'的', '省', '北湖',
'国', '和', '共',
'民', '人', '华中'
]
*/
["桥大江长", "市汉武", "省北湖", "国和共民人华中"].map((word) => {
add_word(word);
});
cut("桥大江长市汉武的省北湖国和共民人华中");
// ["桥大江长", "市汉武", "的", "省北湖", "国和共民人华中"];
with_dict("自动借书机 1 n"); // 导入自定义字典,词条格式:词语 词频 词性(可选),以换行符分隔
cut("你好我是一个自动借书机");
// ["你好", "我", "是", "一个", "自动借书机"];
```
## Browser
```ts
import init, { cut } from 'jieba-wasm';
// 重要:使用前必须初始化
await init();
cut("中华人民共和国武汉市长江大桥", true);
// [ '中华人民共和国', '武汉市', '长江大桥' ]
```
# 示例 Demo
## 安装依赖
安装 wasm-bindgen 和 wasm-opt
```bash
cargo install wasm-bindgen-cli --locked
cargo install wasm-opt --locked
```
## 前期准备
首先保证存在 rust 环境,然后运行以下命令
```bash
npm run build:cargo
npm run build
```
## 运行浏览器端示例
```bash
cd demo/web
npm install
npm run dev
```
# Prior Art
https://github.com/messense/jieba-rs

73
libs/jieba-wasm/jieba_rs_wasm.d.ts vendored Normal file
View File

@@ -0,0 +1,73 @@
/* tslint:disable */
/* eslint-disable */
/**
 * Segments `text` into a list of words.
 *
 * @param text - Text to segment.
 * @param hmm - Optional flag forwarded to the WASM side; `null`/`undefined` is forwarded as "not provided".
 *   NOTE(review): presumably toggles jieba's HMM-based detection of unknown words — confirm against jieba-rs.
 * @returns The words, in the order produced by the segmenter.
 */
export function cut(text: string, hmm?: boolean | null): string[];
/**
 * Segments `text` in "cut all" mode; the README example shows every candidate word being returned,
 * including overlapping ones.
 *
 * @param text - Text to segment.
 * @returns The candidate words.
 */
export function cut_all(text: string): string[];
/**
 * Segments `text` in search mode; the README example shows long words being returned together
 * with the shorter words they contain.
 *
 * @param text - Text to segment.
 * @param hmm - Optional flag forwarded to the WASM side; `null`/`undefined` is forwarded as "not provided".
 * @returns The words produced for search indexing.
 */
export function cut_for_search(text: string, hmm?: boolean | null): string[];
/**
 * Segments `text` and reports each word together with its position.
 *
 * @param text - Text to segment.
 * @param mode - Tokenization mode; the README demonstrates `"default"` and `"search"`.
 * @param hmm - Optional flag forwarded to the WASM side; `null`/`undefined` is forwarded as "not provided".
 * @returns One `Token` per word.
 * @throws Whatever error value the WASM side reports (the JS wrapper rethrows it).
 */
export function tokenize(text: string, mode: string, hmm?: boolean | null): Token[];
/**
 * Adds `word` to the segmenter's dictionary.
 *
 * @param word - Word to add.
 * @param freq - Optional frequency; `null`/`undefined` is forwarded as "not provided".
 * @param tag - Optional tag for the word; `null`/`undefined` is forwarded as "not provided".
 * @returns An unsigned 32-bit number reported by the WASM side.
 *   NOTE(review): presumably the frequency stored for the word — confirm against jieba-rs.
 */
export function add_word(word: string, freq?: number | null, tag?: string | null): number;
/**
 * Segments `sentence` and pairs every word with a tag.
 *
 * @param sentence - Text to segment and tag.
 * @param hmm - Optional flag forwarded to the WASM side; `null`/`undefined` is forwarded as "not provided".
 * @returns One `Tag` per word.
 */
export function tag(sentence: string, hmm?: boolean | null): Tag[];
/**
 * Loads a custom dictionary. Per the README, entries are newline-separated and formatted as
 * `word frequency [tag]`.
 *
 * @param dict - Dictionary text.
 * @throws Whatever error value the WASM side reports (the JS wrapper rethrows it).
 */
export function with_dict(dict: string): void;
/** Represents a single token with its word and position. */
export interface Token {
/** The token text. */
word: string;
/** Start offset of the token; the README example counts in characters, starting at 0. */
start: number;
/** End offset of the token; exclusive in the README example (a 7-character word at 0 ends at 7). */
end: number;
}
/** Represents a single word and its part-of-speech tag. */
export interface Tag {
/** The word text. */
word: string;
/** The part-of-speech tag assigned to `word`. */
tag: string;
}
/** Anything the async initializer accepts: something fetchable, a `Response`, raw bytes, or a compiled module. */
export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
/**
 * Raw exports of the instantiated WASM module, as returned by the initializers.
 * These are the low-level entry points working on numeric arguments (the JS glue passes pointers
 * and lengths into linear memory); application code normally calls the typed wrappers
 * (`cut`, `tokenize`, ...) instead.
 */
export interface InitOutput {
readonly memory: WebAssembly.Memory;
readonly cut: (a: number, b: number, c: number) => [number, number];
readonly cut_all: (a: number, b: number) => [number, number];
readonly cut_for_search: (a: number, b: number, c: number) => [number, number];
readonly tokenize: (a: number, b: number, c: number, d: number, e: number) => [number, number, number, number];
readonly add_word: (a: number, b: number, c: number, d: number, e: number) => number;
readonly tag: (a: number, b: number, c: number) => [number, number];
readonly with_dict: (a: number, b: number) => [number, number];
readonly rust_zstd_wasm_shim_qsort: (a: number, b: number, c: number, d: number) => void;
readonly rust_zstd_wasm_shim_malloc: (a: number) => number;
readonly rust_zstd_wasm_shim_memcmp: (a: number, b: number, c: number) => number;
readonly rust_zstd_wasm_shim_calloc: (a: number, b: number) => number;
readonly rust_zstd_wasm_shim_free: (a: number) => void;
readonly rust_zstd_wasm_shim_memcpy: (a: number, b: number, c: number) => number;
readonly rust_zstd_wasm_shim_memmove: (a: number, b: number, c: number) => number;
readonly rust_zstd_wasm_shim_memset: (a: number, b: number, c: number) => number;
readonly __wbindgen_malloc: (a: number, b: number) => number;
readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
readonly __wbindgen_export_2: WebAssembly.Table;
readonly __externref_drop_slice: (a: number, b: number) => void;
readonly __wbindgen_free: (a: number, b: number, c: number) => void;
readonly __externref_table_dealloc: (a: number) => void;
readonly __wbindgen_start: () => void;
}
/** Inputs accepted by the synchronous initializer: raw bytes or an already compiled module. */
export type SyncInitInput = BufferSource | WebAssembly.Module;
/**
* Instantiates the given `module`, which can either be bytes or
* a precompiled `WebAssembly.Module`.
*
* If the module has already been initialized, the existing exports are returned
* and the argument is ignored.
*
* @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
*
* @returns {InitOutput}
*/
export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
/**
* If `module_or_path` is {RequestInfo} or {URL}, makes a request and
* for everything else, calls `WebAssembly.instantiate` directly.
*
* When `module_or_path` is omitted, `jieba_rs_wasm_bg.wasm` is resolved relative
* to this module's own URL. If the module has already been initialized, the
* existing exports are returned and the argument is ignored.
*
* @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
*
* @returns {Promise<InitOutput>}
*/
export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;

View File

@@ -0,0 +1,438 @@
// Exports of the instantiated WASM module; remains undefined until one of the initializers has run.
let wasm;
// Lazily created byte view over the WASM linear memory.
let cachedUint8ArrayMemory0 = null;

/**
 * Returns a `Uint8Array` view over the WASM linear memory.
 *
 * The cached view is reused unless it is missing or has zero length, in which case a new
 * view is created from the current `wasm.memory.buffer`.
 * NOTE(review): a zero-length view presumably means the old buffer was detached by memory growth.
 *
 * @returns {Uint8Array}
 */
function getUint8ArrayMemory0() {
    const current = cachedUint8ArrayMemory0;
    if (current !== null && current.byteLength !== 0) {
        return current;
    }
    cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
    return cachedUint8ArrayMemory0;
}
/**
 * Creates the strict UTF-8 decoder, or a stand-in whose `decode` throws when
 * `TextDecoder` does not exist in this environment.
 */
function createTextDecoder() {
    if (typeof TextDecoder === 'undefined') {
        return { decode: () => { throw Error('TextDecoder not available') } };
    }
    return new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
}

let cachedTextDecoder = createTextDecoder();
// Exercise the real decoder once, right after creating it.
if (typeof TextDecoder !== 'undefined') {
    cachedTextDecoder.decode();
}

// Byte budget after which the decoder instance is replaced.
// NOTE(review): the name suggests this works around a Safari decoder limit — confirm upstream.
const MAX_SAFARI_DECODE_BYTES = 2146435072;
// Running total of bytes decoded with the current decoder instance.
let numBytesDecoded = 0;

/**
 * Decodes `len` bytes of WASM memory starting at `ptr` as UTF-8.
 *
 * @param {number} ptr - Byte offset into linear memory.
 * @param {number} len - Number of bytes to decode.
 * @returns {string}
 */
function decodeText(ptr, len) {
    numBytesDecoded += len;
    if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) {
        // Budget exhausted: switch to a fresh decoder and restart the count at this call's size.
        cachedTextDecoder = createTextDecoder();
        cachedTextDecoder.decode();
        numBytesDecoded = len;
    }
    const bytes = getUint8ArrayMemory0().subarray(ptr, ptr + len);
    return cachedTextDecoder.decode(bytes);
}
/**
 * Reads a UTF-8 string of `len` bytes from WASM memory.
 *
 * @param {number} ptr - Pointer as received from WASM; coerced to an unsigned 32-bit value.
 * @param {number} len - Length in bytes.
 * @returns {string}
 */
function getStringFromWasm0(ptr, len) {
    const unsignedPtr = ptr >>> 0;
    return decodeText(unsignedPtr, len);
}
function debugString(val) {
// primitive types
const type = typeof val;
if (type == 'number' || type == 'boolean' || val == null) {
return `${val}`;
}
if (type == 'string') {
return `"${val}"`;
}
if (type == 'symbol') {
const description = val.description;
if (description == null) {
return 'Symbol';
} else {
return `Symbol(${description})`;
}
}
if (type == 'function') {
const name = val.name;
if (typeof name == 'string' && name.length > 0) {
return `Function(${name})`;
} else {
return 'Function';
}
}
// objects
if (Array.isArray(val)) {
const length = val.length;
let debug = '[';
if (length > 0) {
debug += debugString(val[0]);
}
for(let i = 1; i < length; i++) {
debug += ', ' + debugString(val[i]);
}
debug += ']';
return debug;
}
// Test for built-in
const builtInMatches = /\[object ([^\]]+)\]/.exec(toString.call(val));
let className;
if (builtInMatches && builtInMatches.length > 1) {
className = builtInMatches[1];
} else {
// Failed to match the standard '[object ClassName]'
return toString.call(val);
}
if (className == 'Object') {
// we're a user defined class or Object
// JSON.stringify avoids problems with cycles, and is generally much
// easier than looping through ownProperties of `val`.
try {
return 'Object(' + JSON.stringify(val) + ')';
} catch (_) {
return 'Object';
}
}
// errors
if (val instanceof Error) {
return `${val.name}: ${val.message}\n${val.stack}`;
}
// TODO we could test for more things here, like `Set`s and `Map`s.
return className;
}
// Byte length of the string most recently written into WASM memory by the glue code.
let WASM_VECTOR_LEN = 0;

// Shared encoder, or a stand-in whose `encode` throws when `TextEncoder` is missing.
const cachedTextEncoder = (typeof TextEncoder !== 'undefined' ? new TextEncoder('utf-8') : { encode: () => { throw Error('TextEncoder not available') } } );

/**
 * Encodes `arg` as UTF-8 into `view` and reports `{ read, written }`.
 *
 * Uses the encoder's `encodeInto` when it exists; otherwise encodes into a temporary
 * buffer, copies it into `view`, and reports the string length as `read` and the buffer
 * length as `written`.
 */
const encodeString = typeof cachedTextEncoder.encodeInto === 'function'
    ? (arg, view) => cachedTextEncoder.encodeInto(arg, view)
    : (arg, view) => {
        const encoded = cachedTextEncoder.encode(arg);
        view.set(encoded);
        return { read: arg.length, written: encoded.length };
    };
/**
 * Copies the JS string `arg` into WASM memory as UTF-8 and returns the pointer to it.
 * The number of bytes written is published through `WASM_VECTOR_LEN`.
 *
 * @param {string} arg - String to copy.
 * @param {Function} malloc - Allocator, called as `malloc(size, 1)`.
 * @param {Function} [realloc] - Reallocator, called as `realloc(ptr, oldSize, newSize, 1)`.
 *   When omitted, the string is fully encoded first and an exact-size buffer is allocated.
 * @returns {number} Unsigned pointer to the encoded bytes.
 */
function passStringToWasm0(arg, malloc, realloc) {
// No realloc available: encode up front and allocate exactly what is needed.
if (realloc === undefined) {
const buf = cachedTextEncoder.encode(arg);
const ptr = malloc(buf.length, 1) >>> 0;
getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf);
WASM_VECTOR_LEN = buf.length;
return ptr;
}
// Start with one byte per UTF-16 code unit, which is exact for pure-ASCII input.
let len = arg.length;
let ptr = malloc(len, 1) >>> 0;
const mem = getUint8ArrayMemory0();
let offset = 0;
// Copy the leading ASCII run byte by byte; stop at the first code unit above 0x7F.
for (; offset < len; offset++) {
const code = arg.charCodeAt(offset);
if (code > 0x7F) break;
mem[ptr + offset] = code;
}
// A non-ASCII code unit was hit: encode the remainder with the text encoder.
if (offset !== len) {
if (offset !== 0) {
arg = arg.slice(offset);
}
// Grow to the bytes already written plus 3 bytes per remaining code unit;
// `len` is updated to the new size inside the argument list.
ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
// Re-fetch the memory view here instead of reusing `mem`.
const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len);
const ret = encodeString(arg, view);
offset += ret.written;
// Shrink the allocation to the number of bytes actually written.
ptr = realloc(ptr, len, offset, 1) >>> 0;
}
WASM_VECTOR_LEN = offset;
return ptr;
}
// Lazily created DataView over the WASM linear memory.
let cachedDataViewMemory0 = null;

/**
 * Returns a `DataView` over the WASM linear memory.
 *
 * A new view is created when none is cached, when the cached view's buffer reports
 * `detached === true`, or — where `detached` is not available — when the cached buffer
 * is no longer the module's current memory buffer.
 *
 * @returns {DataView}
 */
function getDataViewMemory0() {
    let stale = cachedDataViewMemory0 === null;
    if (!stale) {
        const buffer = cachedDataViewMemory0.buffer;
        stale = buffer.detached === true
            || (buffer.detached === undefined && buffer !== wasm.memory.buffer);
    }
    if (stale) {
        cachedDataViewMemory0 = new DataView(wasm.memory.buffer);
    }
    return cachedDataViewMemory0;
}
/**
 * Tells whether `x` should be treated as an absent optional value.
 *
 * @param {*} x
 * @returns {boolean} `true` only for `null` and `undefined`.
 */
function isLikeNone(x) {
    if (x === null) {
        return true;
    }
    return x === undefined;
}
/**
 * Collects `len` JS values referenced by a slice in WASM memory.
 *
 * Each slot is a little-endian 32-bit index, 4 bytes apart, that is looked up in the
 * `__wbindgen_export_2` table. After reading, the slice is handed to
 * `__externref_drop_slice`.
 *
 * @param {number} ptr - Pointer to the first slot; coerced to an unsigned 32-bit value.
 * @param {number} len - Number of slots.
 * @returns {Array<*>}
 */
function getArrayJsValueFromWasm0(ptr, len) {
    const base = ptr >>> 0;
    const view = getDataViewMemory0();
    const values = [];
    for (let slot = 0; slot < len; slot++) {
        const tableIndex = view.getUint32(base + 4 * slot, true);
        values.push(wasm.__wbindgen_export_2.get(tableIndex));
    }
    wasm.__externref_drop_slice(base, len);
    return values;
}
/**
 * Segments `text` into words.
 *
 * @param {string} text - Text to segment.
 * @param {boolean | null} [hmm] - Optional flag; `null`/`undefined` is sent to WASM as the sentinel `0xFFFFFF`.
 * @returns {string[]}
 */
export function cut(text, hmm) {
    const textPtr = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    const textLen = WASM_VECTOR_LEN;
    let hmmArg;
    if (isLikeNone(hmm)) {
        hmmArg = 0xFFFFFF;
    } else {
        hmmArg = hmm ? 1 : 0;
    }
    const ret = wasm.cut(textPtr, textLen, hmmArg);
    const resultPtr = ret[0];
    const resultLen = ret[1];
    // Copy the values out before releasing the WASM-side slice (4 bytes per entry).
    const words = getArrayJsValueFromWasm0(resultPtr, resultLen).slice();
    wasm.__wbindgen_free(resultPtr, resultLen * 4, 4);
    return words;
}
/**
 * Segments `text` in "cut all" mode.
 *
 * @param {string} text - Text to segment.
 * @returns {string[]}
 */
export function cut_all(text) {
    const textPtr = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    const textLen = WASM_VECTOR_LEN;
    const ret = wasm.cut_all(textPtr, textLen);
    const resultPtr = ret[0];
    const resultLen = ret[1];
    // Copy the values out before releasing the WASM-side slice (4 bytes per entry).
    const words = getArrayJsValueFromWasm0(resultPtr, resultLen).slice();
    wasm.__wbindgen_free(resultPtr, resultLen * 4, 4);
    return words;
}
/**
 * Segments `text` in search mode.
 *
 * @param {string} text - Text to segment.
 * @param {boolean | null} [hmm] - Optional flag; `null`/`undefined` is sent to WASM as the sentinel `0xFFFFFF`.
 * @returns {string[]}
 */
export function cut_for_search(text, hmm) {
    const textPtr = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    const textLen = WASM_VECTOR_LEN;
    let hmmArg;
    if (isLikeNone(hmm)) {
        hmmArg = 0xFFFFFF;
    } else {
        hmmArg = hmm ? 1 : 0;
    }
    const ret = wasm.cut_for_search(textPtr, textLen, hmmArg);
    const resultPtr = ret[0];
    const resultLen = ret[1];
    // Copy the values out before releasing the WASM-side slice (4 bytes per entry).
    const words = getArrayJsValueFromWasm0(resultPtr, resultLen).slice();
    wasm.__wbindgen_free(resultPtr, resultLen * 4, 4);
    return words;
}
/**
 * Reads the value stored at `idx` in the `__wbindgen_export_2` table and then
 * releases that slot through `__externref_table_dealloc`.
 *
 * @param {number} idx - Table index.
 * @returns {*} The value that was stored in the slot.
 */
function takeFromExternrefTable0(idx) {
    const taken = wasm.__wbindgen_export_2.get(idx);
    wasm.__externref_table_dealloc(idx);
    return taken;
}
/**
 * Segments `text` and returns each word with its position.
 *
 * @param {string} text - Text to segment.
 * @param {string} mode - Tokenization mode name.
 * @param {boolean | null} [hmm] - Optional flag; `null`/`undefined` is sent to WASM as the sentinel `0xFFFFFF`.
 * @returns {Token[]}
 * @throws The error value taken from the externref table when WASM flags a failure.
 */
export function tokenize(text, mode, hmm) {
    const textPtr = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    const textLen = WASM_VECTOR_LEN;
    const modePtr = passStringToWasm0(mode, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    const modeLen = WASM_VECTOR_LEN;
    let hmmArg;
    if (isLikeNone(hmm)) {
        hmmArg = 0xFFFFFF;
    } else {
        hmmArg = hmm ? 1 : 0;
    }
    const ret = wasm.tokenize(textPtr, textLen, modePtr, modeLen, hmmArg);
    // Slot 3 is the error flag; slot 2 then holds the table index of the error value.
    const failed = ret[3];
    if (failed) {
        throw takeFromExternrefTable0(ret[2]);
    }
    const resultPtr = ret[0];
    const resultLen = ret[1];
    // Copy the values out before releasing the WASM-side slice (4 bytes per entry).
    const tokens = getArrayJsValueFromWasm0(resultPtr, resultLen).slice();
    wasm.__wbindgen_free(resultPtr, resultLen * 4, 4);
    return tokens;
}
/**
 * Adds `word` to the segmenter's dictionary.
 *
 * @param {string} word - Word to add.
 * @param {number | null} [freq] - Optional frequency; `null`/`undefined` is sent as the sentinel `0x100000001`,
 *   anything else is coerced to an unsigned 32-bit value.
 * @param {string | null} [tag] - Optional tag; `null`/`undefined` is sent as pointer `0`.
 * @returns {number} The WASM result coerced to an unsigned 32-bit value.
 */
export function add_word(word, freq, tag) {
    const wordPtr = passStringToWasm0(word, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    const wordLen = WASM_VECTOR_LEN;
    let tagPtr = 0;
    if (!isLikeNone(tag)) {
        tagPtr = passStringToWasm0(tag, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    }
    // Read after the optional tag write; with no tag this is still the word's length.
    const tagLen = WASM_VECTOR_LEN;
    const freqArg = isLikeNone(freq) ? 0x100000001 : (freq) >>> 0;
    const result = wasm.add_word(wordPtr, wordLen, freqArg, tagPtr, tagLen);
    return result >>> 0;
}
/**
 * Segments `sentence` and pairs every word with a tag.
 *
 * @param {string} sentence - Text to segment and tag.
 * @param {boolean | null} [hmm] - Optional flag; `null`/`undefined` is sent to WASM as the sentinel `0xFFFFFF`.
 * @returns {Tag[]}
 */
export function tag(sentence, hmm) {
    const sentencePtr = passStringToWasm0(sentence, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    const sentenceLen = WASM_VECTOR_LEN;
    let hmmArg;
    if (isLikeNone(hmm)) {
        hmmArg = 0xFFFFFF;
    } else {
        hmmArg = hmm ? 1 : 0;
    }
    const ret = wasm.tag(sentencePtr, sentenceLen, hmmArg);
    const resultPtr = ret[0];
    const resultLen = ret[1];
    // Copy the values out before releasing the WASM-side slice (4 bytes per entry).
    const tagged = getArrayJsValueFromWasm0(resultPtr, resultLen).slice();
    wasm.__wbindgen_free(resultPtr, resultLen * 4, 4);
    return tagged;
}
/**
 * Loads a custom dictionary from its text form.
 *
 * @param {string} dict - Dictionary text.
 * @throws The error value taken from the externref table when WASM flags a failure.
 */
export function with_dict(dict) {
    const dictPtr = passStringToWasm0(dict, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
    const dictLen = WASM_VECTOR_LEN;
    const ret = wasm.with_dict(dictPtr, dictLen);
    // Slot 1 is the error flag; slot 0 then holds the table index of the error value.
    const failed = ret[1];
    if (failed) {
        throw takeFromExternrefTable0(ret[0]);
    }
}
// Response types for which a failed streaming instantiation may be retried from bytes.
const EXPECTED_RESPONSE_TYPES = new Set(['basic', 'cors', 'default']);
/**
 * Instantiates the WASM module from `module`.
 *
 * @param {*} module - A `Response`, or anything `WebAssembly.instantiate` accepts.
 * @param {object} imports - Import object passed to the instantiation call.
 * @returns {Promise<{instance: WebAssembly.Instance, module: *}>} For the non-`Response`
 *   path, a bare `Instance` result is wrapped together with the input `module`.
 */
async function __wbg_load(module, imports) {
if (typeof Response === 'function' && module instanceof Response) {
if (typeof WebAssembly.instantiateStreaming === 'function') {
try {
return await WebAssembly.instantiateStreaming(module, imports);
} catch (e) {
// Only swallow the failure when the response itself is fine but has the wrong
// Content-Type; execution then falls through to the byte-based path below.
const validResponse = module.ok && EXPECTED_RESPONSE_TYPES.has(module.type);
if (validResponse && module.headers.get('Content-Type') !== 'application/wasm') {
console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. Original error:\n", e);
} else {
throw e;
}
}
}
// Streaming is unavailable or was abandoned: read the whole body and instantiate from bytes.
const bytes = await module.arrayBuffer();
return await WebAssembly.instantiate(bytes, imports);
} else {
const instance = await WebAssembly.instantiate(module, imports);
// Normalize the result to the `{ instance, module }` shape.
if (instance instanceof WebAssembly.Instance) {
return { instance, module };
} else {
return instance;
}
}
}
/**
 * Builds the import object handed to the WASM module: a single `wbg` namespace
 * containing the JS functions the module calls back into.
 *
 * @returns {{wbg: object}}
 */
function __wbg_get_imports() {
    const wbg = {
        // Creates an Error from a UTF-8 message stored in WASM memory.
        __wbg_Error_0497d5bdba9362e5: (arg0, arg1) => Error(getStringFromWasm0(arg0, arg1)),
        // Creates an empty object.
        __wbg_new_07b483f72211fd66: () => ({}),
        // Assigns a property on a JS value.
        __wbg_set_3f1d0b984ed272ed: (arg0, arg1, arg2) => {
            arg0[arg1] = arg2;
        },
        // Wraps the argument to an unsigned 64-bit BigInt.
        __wbindgen_bigint_from_u64: (arg0) => BigInt.asUintN(64, arg0),
        // Writes the debug description of `arg1` into WASM memory and stores
        // the (pointer, length) pair at `arg0`.
        __wbindgen_debug_string: (arg0, arg1) => {
            const description = debugString(arg1);
            const descPtr = passStringToWasm0(description, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
            const descLen = WASM_VECTOR_LEN;
            getDataViewMemory0().setInt32(arg0 + 4 * 1, descLen, true);
            getDataViewMemory0().setInt32(arg0 + 4 * 0, descPtr, true);
        },
        // Grows the externref table by four slots and seeds the fixed entries.
        __wbindgen_init_externref_table: () => {
            const table = wasm.__wbindgen_export_2;
            const offset = table.grow(4);
            table.set(0, undefined);
            table.set(offset + 0, undefined);
            table.set(offset + 1, null);
            table.set(offset + 2, true);
            table.set(offset + 3, false);
        },
        // Passes a number through unchanged.
        __wbindgen_number_new: (arg0) => arg0,
        // Reads a UTF-8 string from WASM memory.
        __wbindgen_string_new: (arg0, arg1) => getStringFromWasm0(arg0, arg1),
        // Throws an Error whose message is read from WASM memory.
        __wbindgen_throw: (arg0, arg1) => {
            throw new Error(getStringFromWasm0(arg0, arg1));
        },
    };
    return { wbg };
}
/**
 * Memory setup hook invoked by both initializers before instantiation.
 * The body is empty, so the call currently has no effect.
 *
 * @param {object} imports - Import object (unused).
 * @param {*} [memory] - Unused.
 */
function __wbg_init_memory(imports, memory) {
}
/**
 * Completes initialization after the module has been instantiated: publishes the
 * instance's exports as `wasm`, remembers the compiled module on `__wbg_init`, discards
 * the cached memory views, and runs the module's start function.
 *
 * @param {WebAssembly.Instance} instance
 * @param {*} module - The compiled module, stored as `__wbg_init.__wbindgen_wasm_module`.
 * @returns {object} The instance's exports.
 */
function __wbg_finalize_init(instance, module) {
    // Views created against an earlier memory must not be reused.
    cachedUint8ArrayMemory0 = null;
    cachedDataViewMemory0 = null;
    __wbg_init.__wbindgen_wasm_module = module;
    wasm = instance.exports;
    wasm.__wbindgen_start();
    return wasm;
}
/**
 * Synchronously instantiates the WASM module.
 *
 * Returns the existing exports right away when initialization already happened.
 *
 * @param {{ module: * } | *} module - Preferably `{ module }`, where `module` is raw bytes or a
 *   `WebAssembly.Module`; passing that value directly still works but logs a deprecation warning.
 * @returns {object} The module's exports.
 */
function initSync(module) {
    if (wasm !== undefined) {
        return wasm;
    }
    let source = module;
    if (typeof source !== 'undefined') {
        // A plain object is the options bag; anything else is the deprecated direct form.
        const isOptionsBag = Object.getPrototypeOf(source) === Object.prototype;
        if (isOptionsBag) {
            source = source.module;
        } else {
            console.warn('using deprecated parameters for `initSync()`; pass a single object instead')
        }
    }
    const imports = __wbg_get_imports();
    __wbg_init_memory(imports);
    // Compile first unless a compiled module was supplied.
    const compiled = source instanceof WebAssembly.Module ? source : new WebAssembly.Module(source);
    const instance = new WebAssembly.Instance(compiled, imports);
    return __wbg_finalize_init(instance, compiled);
}
/**
 * Asynchronously loads and instantiates the WASM module.
 *
 * Returns the existing exports right away when initialization already happened.
 *
 * @param {*} [module_or_path] - Preferably `{ module_or_path }`; passing the value directly
 *   still works but logs a deprecation warning. Strings, `Request`s and `URL`s are fetched;
 *   when omitted, `jieba_rs_wasm_bg.wasm` next to this module is used.
 * @returns {Promise<object>} The module's exports.
 */
async function __wbg_init(module_or_path) {
if (wasm !== undefined) return wasm;
if (typeof module_or_path !== 'undefined') {
// A plain object is the options bag; anything else is the deprecated direct form.
if (Object.getPrototypeOf(module_or_path) === Object.prototype) {
({module_or_path} = module_or_path)
} else {
console.warn('using deprecated parameters for the initialization function; pass a single object instead')
}
}
// Default to the .wasm file that sits next to this module.
if (typeof module_or_path === 'undefined') {
module_or_path = new URL('jieba_rs_wasm_bg.wasm', import.meta.url);
}
const imports = __wbg_get_imports();
// Turn anything URL-like into a pending fetch; it is awaited below.
if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) {
module_or_path = fetch(module_or_path);
}
__wbg_init_memory(imports);
const { instance, module } = await __wbg_load(await module_or_path, imports);
return __wbg_finalize_init(instance, module);
}
export { initSync };
export default __wbg_init;

Binary file not shown.

View File

@@ -0,0 +1,25 @@
/* tslint:disable */
/* eslint-disable */
// Raw exports of the WASM binary. All entry points take and return plain numbers;
// the list matches the members of `InitOutput` in `jieba_rs_wasm.d.ts`.
export const memory: WebAssembly.Memory;
export const cut: (a: number, b: number, c: number) => [number, number];
export const cut_all: (a: number, b: number) => [number, number];
export const cut_for_search: (a: number, b: number, c: number) => [number, number];
export const tokenize: (a: number, b: number, c: number, d: number, e: number) => [number, number, number, number];
export const add_word: (a: number, b: number, c: number, d: number, e: number) => number;
export const tag: (a: number, b: number, c: number) => [number, number];
export const with_dict: (a: number, b: number) => [number, number];
export const rust_zstd_wasm_shim_qsort: (a: number, b: number, c: number, d: number) => void;
export const rust_zstd_wasm_shim_malloc: (a: number) => number;
export const rust_zstd_wasm_shim_memcmp: (a: number, b: number, c: number) => number;
export const rust_zstd_wasm_shim_calloc: (a: number, b: number) => number;
export const rust_zstd_wasm_shim_free: (a: number) => void;
export const rust_zstd_wasm_shim_memcpy: (a: number, b: number, c: number) => number;
export const rust_zstd_wasm_shim_memmove: (a: number, b: number, c: number) => number;
export const rust_zstd_wasm_shim_memset: (a: number, b: number, c: number) => number;
export const __wbindgen_malloc: (a: number, b: number) => number;
export const __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
export const __wbindgen_export_2: WebAssembly.Table;
export const __externref_drop_slice: (a: number, b: number) => void;
export const __wbindgen_free: (a: number, b: number, c: number) => void;
export const __externref_table_dealloc: (a: number) => void;
export const __wbindgen_start: () => void;

View File

@@ -0,0 +1,129 @@
{
"name": "jieba-wasm",
"version": "2.4.0",
"description": "WASM binding to jieba-rs",
"main": "./pkg/nodejs/jieba_rs_wasm.js",
"types": "./pkg/nodejs/jieba_rs_wasm.d.ts",
"exports": {
".": {
"node": {
"types": "./pkg/nodejs/jieba_rs_wasm.d.ts",
"default": "./pkg/nodejs/jieba_rs_wasm.js"
},
"deno": {
"types": "./pkg/deno/jieba_rs_wasm.d.ts",
"default": "./pkg/deno/jieba_rs_wasm.js"
},
"browser": {
"types": "./pkg/web/jieba_rs_wasm.d.ts",
"default": "./pkg/web/jieba_rs_wasm.js"
},
"import": {
"types": "./pkg/web/jieba_rs_wasm.d.ts",
"default": "./pkg/web/jieba_rs_wasm.js"
},
"require": {
"types": "./pkg/nodejs/jieba_rs_wasm.d.ts",
"default": "./pkg/nodejs/jieba_rs_wasm.js"
}
},
"./web": {
"types": "./pkg/web/jieba_rs_wasm.d.ts",
"default": "./pkg/web/jieba_rs_wasm.js"
},
"./node": {
"types": "./pkg/nodejs/jieba_rs_wasm.d.ts",
"default": "./pkg/nodejs/jieba_rs_wasm.js"
},
"./deno": {
"types": "./pkg/deno/jieba_rs_wasm.d.ts",
"default": "./pkg/deno/jieba_rs_wasm.js"
}
},
"directories": {
"test": "tests"
},
"scripts": {
"build": "wireit",
"build:cargo": "wireit",
"build:bundler": "wireit",
"build:nodejs": "wireit",
"build:deno": "wireit",
"build:web": "wireit",
"build:opt": "wireit",
"test": "echo \"Error: no test specified\" && exit 1"
},
"wireit": {
"build:cargo": {
"command": "cargo build --release --target wasm32-unknown-unknown"
},
"build:bundler": {
"command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/bundler --target bundler",
"dependencies": [
"build:cargo"
]
},
"build:nodejs": {
"command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/nodejs --target nodejs",
"dependencies": [
"build:cargo"
]
},
"build:deno": {
"command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/deno --target deno",
"dependencies": [
"build:cargo"
]
},
"build:web": {
"command": "wasm-bindgen target/wasm32-unknown-unknown/release/jieba_rs_wasm.wasm --out-dir ./pkg/web --target web",
"dependencies": [
"build:cargo"
]
},
"build": {
"dependencies": [
"build:cargo",
"build:bundler",
"build:nodejs",
"build:deno",
"build:web",
"build:opt"
]
},
"build:opt": {
"command": "node scripts/opt.js",
"dependencies": [
"build:cargo",
"build:bundler",
"build:nodejs",
"build:deno",
"build:web"
]
}
},
"files": [
"pkg/**/*"
],
"repository": {
"type": "git",
"url": "git+https://github.com/fengkx/jieba-wasm.git"
},
"keywords": [
"wasm",
"jieba",
"chinese",
"segment",
"中文分词"
],
"author": "fengkx",
"license": "MIT",
"bugs": {
"url": "https://github.com/fengkx/jieba-wasm/issues"
},
"homepage": "https://github.com/fengkx/jieba-wasm#readme",
"devDependencies": {
"@jsdevtools/ez-spawn": "^3.0.4",
"wireit": "^0.14.4"
}
}

View File

@@ -44,6 +44,12 @@ import { runSummaryGeneration } from "./generate/generator.js";
// vector service // vector service
import { embed, getEngineFingerprint, testOnlineService } from "./vector/utils/embedder.js"; import { embed, getEngineFingerprint, testOnlineService } from "./vector/utils/embedder.js";
// tokenizer
import { preload as preloadTokenizer, injectEntities } from "./vector/utils/tokenizer.js";
// entity lexicon
import { buildEntityLexicon, buildDisplayNameMap } from "./vector/retrieval/entity-lexicon.js";
import { import {
getMeta, getMeta,
updateMeta, updateMeta,
@@ -82,7 +88,7 @@ import {
// vector io // vector io
import { exportVectors, importVectors } from "./vector/storage/vector-io.js"; import { exportVectors, importVectors } from "./vector/storage/vector-io.js";
import { invalidateLexicalIndex } from "./vector/retrieval/lexical-index.js"; import { invalidateLexicalIndex, warmupIndex } from "./vector/retrieval/lexical-index.js";
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// 常量 // 常量
@@ -145,6 +151,30 @@ const VECTOR_WARNING_COOLDOWN_MS = 120000; // 2分钟内不重复提醒
const EXT_PROMPT_KEY = "LittleWhiteBox_StorySummary"; const EXT_PROMPT_KEY = "LittleWhiteBox_StorySummary";
const MIN_INJECTION_DEPTH = 2; const MIN_INJECTION_DEPTH = 2;
// ═══════════════════════════════════════════════════════════════════════════
// 分词器预热
// ═══════════════════════════════════════════════════════════════════════════
/** True while a tokenizer preload is in flight or after one has succeeded. */
let tokenizerPreloaded = false;

/**
 * Starts preloading the tokenizer when the vector feature is enabled.
 *
 * Does nothing while a preload is already running or has succeeded. When the preload
 * reports a falsy result or rejects, the flag is released so that a later call can
 * retry; previously the flag stayed set after a failure, which made the retry promised
 * by the warning message impossible.
 */
function maybePreloadTokenizer() {
    if (tokenizerPreloaded) return;
    const vectorCfg = getVectorConfig();
    if (!vectorCfg?.enabled) return;
    // Claim the flag before starting so overlapping calls do not launch a second preload.
    tokenizerPreloaded = true;
    preloadTokenizer()
        .then((ok) => {
            // Keep the flag only on success; otherwise allow another attempt later.
            if (!ok) tokenizerPreloaded = false;
        })
        .catch((e) => {
            // Release the flag so a later trigger can retry (e.g. after the user fixes the path or reloads).
            tokenizerPreloaded = false;
            xbLog.warn(MODULE_ID, "分词器预热失败(将降级运行,可稍后重试)", e);
        });
}
// role 映射 // role 映射
const ROLE_MAP = { const ROLE_MAP = {
system: extension_prompt_roles.SYSTEM, system: extension_prompt_roles.SYSTEM,
@@ -499,6 +529,27 @@ async function handleClearVectors() {
xbLog.info(MODULE_ID, "向量数据已清除"); xbLog.info(MODULE_ID, "向量数据已清除");
} }
// ═══════════════════════════════════════════════════════════════════════════
// 实体词典注入 + 索引预热
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Rebuilds the entity lexicon and display-name map from the summary store, injects both
 * into the tokenizer, then invalidates the lexical index and starts warming it up again.
 * Does nothing unless the vector feature is enabled.
 */
function refreshEntityLexiconAndWarmup() {
    const vectorCfg = getVectorConfig();
    const vectorEnabled = Boolean(vectorCfg?.enabled);
    if (!vectorEnabled) {
        return;
    }

    const store = getSummaryStore();
    const context = getContext();
    const name1 = context.name1;
    const name2 = context.name2;

    const lexicon = buildEntityLexicon(store, { name1, name2 });
    const displayMap = buildDisplayNameMap(store, { name1, name2 });
    injectEntities(lexicon, displayMap);

    // Drop the current lexical index and kick off a rebuild without waiting for it.
    invalidateLexicalIndex();
    warmupIndex();
}
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// L2 自动增量向量化(总结完成后调用) // L2 自动增量向量化(总结完成后调用)
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -997,6 +1048,9 @@ function handleFrameMessage(event) {
case "VECTOR_GENERATE": case "VECTOR_GENERATE":
if (data.config) saveVectorConfig(data.config); if (data.config) saveVectorConfig(data.config);
// 向量配置变更,可能刚启用,触发预热
maybePreloadTokenizer();
refreshEntityLexiconAndWarmup();
handleGenerateVectors(data.config); handleGenerateVectors(data.config);
break; break;
@@ -1085,6 +1139,8 @@ function handleFrameMessage(event) {
case "REQUEST_VECTOR_STATS": case "REQUEST_VECTOR_STATS":
sendVectorStatsToFrame(); sendVectorStatsToFrame();
// 向量开关可能在 iframe 中被修改,检查是否需要预热
maybePreloadTokenizer();
break; break;
case "REQUEST_CLEAR": { case "REQUEST_CLEAR": {
@@ -1230,6 +1286,9 @@ async function handleChatChanged() {
sendVectorStatsToFrame(); sendVectorStatsToFrame();
} }
// 实体词典注入 + 索引预热
refreshEntityLexiconAndWarmup();
setTimeout(() => checkVectorIntegrityAndWarn(), 2000); setTimeout(() => checkVectorIntegrityAndWarn(), 2000);
} }
@@ -1267,6 +1326,9 @@ async function handleMessageReceived() {
applyHideStateDebounced(); applyHideStateDebounced();
setTimeout(() => maybeAutoRunSummary("after_ai"), 1000); setTimeout(() => maybeAutoRunSummary("after_ai"), 1000);
// 新消息后刷新实体词典(可能有新角色)
refreshEntityLexiconAndWarmup();
} }
function handleMessageSent() { function handleMessageSent() {
@@ -1458,4 +1520,7 @@ jQuery(() => {
if (!getSettings().storySummary?.enabled) return; if (!getSettings().storySummary?.enabled) return;
registerEvents(); registerEvents();
initStateIntegration(); initStateIntegration();
// 条件预热分词器storySummary 已启用,检查 vector 是否也启用)
maybePreloadTokenizer();
}); });

View File

@@ -78,7 +78,8 @@ export async function rerank(query, documents, options = {}) {
}, },
body: JSON.stringify({ body: JSON.stringify({
model: RERANK_MODEL, model: RERANK_MODEL,
query: query.slice(0, 1000), // 限制 query 长度 // Zero-darkbox: do not silently truncate query.
query,
documents: validDocs, documents: validDocs,
top_n: Math.min(topN, validDocs.length), top_n: Math.min(topN, validDocs.length),
return_documents: false, return_documents: false,

View File

@@ -4,9 +4,10 @@
// 职责: // 职责:
// 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引 // 1. 对 L0 atoms + L1 chunks + L2 events 建立词法索引
// 2. 提供词法检索接口(专名精确匹配兜底) // 2. 提供词法检索接口(专名精确匹配兜底)
// 3. 惰性构建 + 缓存失效机制 // 3. 惰性构建 + 异步预热 + 缓存失效机制
// //
// 索引存储:纯内存(不持久化) // 索引存储:纯内存(不持久化)
// 分词器:统一使用 tokenizer.js结巴 + 实体保护 + 降级)
// 重建时机CHAT_CHANGED / L0提取完成 / L2总结完成 // 重建时机CHAT_CHANGED / L0提取完成 / L2总结完成
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -16,6 +17,7 @@ import { getSummaryStore } from '../../data/store.js';
import { getStateAtoms } from '../storage/state-store.js'; import { getStateAtoms } from '../storage/state-store.js';
import { getAllChunks } from '../storage/chunk-store.js'; import { getAllChunks } from '../storage/chunk-store.js';
import { xbLog } from '../../../../core/debug-core.js'; import { xbLog } from '../../../../core/debug-core.js';
import { tokenizeForIndex } from '../utils/tokenizer.js';
const MODULE_ID = 'lexical-index'; const MODULE_ID = 'lexical-index';
@@ -23,9 +25,20 @@ const MODULE_ID = 'lexical-index';
// 缓存 // 缓存
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
/** @type {MiniSearch|null} */
let cachedIndex = null; let cachedIndex = null;
/** @type {string|null} */
let cachedChatId = null; let cachedChatId = null;
let cachedFingerprint = null; // atoms.length + chunks.length + events.length 的简单指纹
/** @type {string|null} 数据指纹atoms + chunks + events 数量) */
let cachedFingerprint = null;
/** @type {boolean} 是否正在构建 */
let building = false;
/** @type {Promise<MiniSearch|null>|null} 当前构建 Promise防重入 */
let buildPromise = null;
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
// 工具函数 // 工具函数
@@ -43,7 +56,7 @@ function cleanSummary(summary) {
} }
/** /**
* 计算缓存指纹(用于判断是否需要重建) * 计算缓存指纹
* @param {number} atomCount * @param {number} atomCount
* @param {number} chunkCount * @param {number} chunkCount
* @param {number} eventCount * @param {number} eventCount
@@ -53,39 +66,27 @@ function computeFingerprint(atomCount, chunkCount, eventCount) {
return `${atomCount}:${chunkCount}:${eventCount}`; return `${atomCount}:${chunkCount}:${eventCount}`;
} }
/**
 * Yields control back to the event loop so long-running work does not block the UI.
 * The returned promise settles from a zero-delay timer.
 * @returns {Promise<void>}
 */
function yieldToMain() {
    return new Promise((resolve) => {
        setTimeout(resolve, 0);
    });
}
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
// 索引构建 // 文档收集
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
/** /**
* 构建 MiniSearch 索引 * 收集所有待索引文档
*
* 索引三类文档:
* - L0 atoms: { id: atomId, type: 'atom', floor, text: semantic }
* - L1 chunks: { id: chunkId, type: 'chunk', floor, text: chunk.text }
* - L2 events: { id: eventId, type: 'event', floor: null, text: title + participants + summary }
* *
* @param {object[]} atoms - getStateAtoms() 返回值 * @param {object[]} atoms - getStateAtoms() 返回值
* @param {object[]} chunks - getAllChunks(chatId) 返回值 * @param {object[]} chunks - getAllChunks(chatId) 返回值
* @param {object[]} events - store.json.events * @param {object[]} events - store.json.events
* @returns {MiniSearch} * @returns {object[]} 文档数组
*/ */
export function buildLexicalIndex(atoms, chunks, events) { function collectDocuments(atoms, chunks, events) {
const T0 = performance.now();
const index = new MiniSearch({
fields: ['text'],
storeFields: ['type', 'floor'],
idField: 'id',
searchOptions: {
boost: { text: 1 },
fuzzy: 0.2,
prefix: true,
},
// 中文友好的 tokenizer按字符 bigram + 空格/标点分词
tokenize: chineseTokenize,
});
const docs = []; const docs = [];
// L0 atoms // L0 atoms
@@ -129,72 +130,58 @@ export function buildLexicalIndex(atoms, chunks, events) {
}); });
} }
if (docs.length > 0) { return docs;
index.addAll(docs);
} }
const elapsed = Math.round(performance.now() - T0); // ─────────────────────────────────────────────────────────────────────────
xbLog.info(MODULE_ID, `索引构建完成: ${docs.length} 文档 (atoms=${atoms?.length || 0}, chunks=${chunks?.length || 0}, events=${events?.length || 0}) ${elapsed}ms`); // 索引构建(分片,不阻塞主线程)
// ─────────────────────────────────────────────────────────────────────────
/** 每批添加的文档数 */
const BUILD_BATCH_SIZE = 500;
/**
* 构建 MiniSearch 索引(分片异步)
*
* @param {object[]} docs - 文档数组
* @returns {Promise<MiniSearch>}
*/
async function buildIndexAsync(docs) {
const T0 = performance.now();
const index = new MiniSearch({
fields: ['text'],
storeFields: ['type', 'floor'],
idField: 'id',
searchOptions: {
boost: { text: 1 },
fuzzy: 0.2,
prefix: true,
},
tokenize: tokenizeForIndex,
});
if (!docs.length) {
return index; return index;
} }
// ───────────────────────────────────────────────────────────────────────── // 分片添加,每批 BUILD_BATCH_SIZE 条后让出主线程
// 中文 Tokenizer for (let i = 0; i < docs.length; i += BUILD_BATCH_SIZE) {
// ───────────────────────────────────────────────────────────────────────── const batch = docs.slice(i, i + BUILD_BATCH_SIZE);
index.addAll(batch);
/** // 非最后一批时让出主线程
* 中文友好的分词器 if (i + BUILD_BATCH_SIZE < docs.length) {
* await yieldToMain();
* 策略:
* 1. 连续中文字符 → 滑动 bigram"黄英梅" → "黄英", "英梅"
* 2. 连续非中文字符 → 按空格/标点分割
* 3. 保留完整中文词2-4字作为额外 token
*
* @param {string} text
* @returns {string[]}
*/
function chineseTokenize(text) {
if (!text) return [];
const tokens = [];
const s = String(text).toLowerCase();
// 分离中文段和非中文段
const segments = s.split(/([\u4e00-\u9fff]+)/g);
for (const seg of segments) {
if (!seg) continue;
// 中文段bigram + 完整段(如果 2-6 字)
if (/^[\u4e00-\u9fff]+$/.test(seg)) {
// 完整段作为一个 token如果长度合适
if (seg.length >= 2 && seg.length <= 6) {
tokens.push(seg);
}
// bigram
for (let i = 0; i < seg.length - 1; i++) {
tokens.push(seg.slice(i, i + 2));
}
// trigram对 3+ 字的段)
for (let i = 0; i < seg.length - 2; i++) {
tokens.push(seg.slice(i, i + 3));
}
} else {
// 非中文段:按空格/标点分割
const words = seg.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/);
for (const w of words) {
const trimmed = w.trim();
if (trimmed.length >= 2) {
tokens.push(trimmed);
}
}
} }
} }
return tokens; const elapsed = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID,
`索引构建完成: ${docs.length} 文档 (${elapsed}ms)`
);
return index;
} }
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
@@ -247,6 +234,8 @@ export function searchLexicalIndex(index, terms) {
fuzzy: 0.2, fuzzy: 0.2,
prefix: true, prefix: true,
combineWith: 'OR', combineWith: 'OR',
// 使用与索引相同的分词器
tokenize: tokenizeForIndex,
}); });
} catch (e) { } catch (e) {
xbLog.warn(MODULE_ID, '检索失败', e); xbLog.warn(MODULE_ID, '检索失败', e);
@@ -305,22 +294,17 @@ export function searchLexicalIndex(index, terms) {
} }
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
// 惰性缓存管理 // 内部构建流程(收集数据 + 构建索引)
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
/** /**
* 获取词法索引(惰性构建 + 缓存) * 收集数据并构建索引
* *
* 如果缓存有效则直接返回;否则自动构建。 * @param {string} chatId
* 缓存失效条件chatId 变化 / 数据指纹变化 / 手动 invalidate * @returns {Promise<{index: MiniSearch, fingerprint: string}>}
*
* @returns {Promise<MiniSearch>}
*/ */
export async function getLexicalIndex() { async function collectAndBuild(chatId) {
const { chatId } = getContext(); // 收集数据
if (!chatId) return null;
// 收集当前数据
const atoms = getStateAtoms() || []; const atoms = getStateAtoms() || [];
const store = getSummaryStore(); const store = getSummaryStore();
const events = store?.json?.events || []; const events = store?.json?.events || [];
@@ -334,30 +318,118 @@ export async function getLexicalIndex() {
const fp = computeFingerprint(atoms.length, chunks.length, events.length); const fp = computeFingerprint(atoms.length, chunks.length, events.length);
// 缓存命中 // 检查是否在收集过程中缓存已被其他调用更新
if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) { if (cachedIndex && cachedChatId === chatId && cachedFingerprint === fp) {
return { index: cachedIndex, fingerprint: fp };
}
// 收集文档
const docs = collectDocuments(atoms, chunks, events);
// 异步分片构建
const index = await buildIndexAsync(docs);
return { index, fingerprint: fp };
}
// ─────────────────────────────────────────────────────────────────────────
// 公开接口getLexicalIndex惰性获取
// ─────────────────────────────────────────────────────────────────────────
/**
* 获取词法索引(惰性构建 + 缓存)
*
* 如果缓存有效则直接返回;否则自动构建。
* 如果正在构建中,等待构建完成。
*
* @returns {Promise<MiniSearch|null>}
*/
export async function getLexicalIndex() {
const { chatId } = getContext();
if (!chatId) return null;
// 快速路径:如果缓存存在且 chatId 未变,则直接命中
// 指纹校验放到构建流程中完成,避免为指纹而额外读一次 IndexedDB
if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
return cachedIndex; return cachedIndex;
} }
// 重建 // 正在构建中,等待结果
xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)}, fp=${fp})`); if (building && buildPromise) {
try {
const index = buildLexicalIndex(atoms, chunks, events); await buildPromise;
if (cachedIndex && cachedChatId === chatId && cachedFingerprint) {
cachedIndex = index; return cachedIndex;
cachedChatId = chatId; }
cachedFingerprint = fp; } catch {
// 构建失败,继续往下重建
return index; }
} }
// 需要重建(指纹将在 collectAndBuild 内部计算并写入缓存)
xbLog.info(MODULE_ID, `缓存失效,重建索引 (chatId=${chatId.slice(0, 8)})`);
building = true;
buildPromise = collectAndBuild(chatId);
try {
const { index, fingerprint } = await buildPromise;
// 原子替换缓存
cachedIndex = index;
cachedChatId = chatId;
cachedFingerprint = fingerprint;
return index;
} catch (e) {
xbLog.error(MODULE_ID, '索引构建失败', e);
return null;
} finally {
building = false;
buildPromise = null;
}
}
// ─────────────────────────────────────────────────────────────────────────
// 公开接口warmupIndex异步预建
// ─────────────────────────────────────────────────────────────────────────
/** /**
* 使缓存失效(下次 getLexicalIndex 时自动重建) * 异步预建索引
*
* 在 CHAT_CHANGED 时调用,后台构建索引。
* 不阻塞调用方,不返回结果。
* 构建完成后缓存自动更新,后续 getLexicalIndex() 直接命中。
*
* 调用时机:
* - handleChatChanged实体注入后
* - L0 提取完成
* - L2 总结完成
*/
export function warmupIndex() {
    const { chatId } = getContext();

    // Nothing to warm up without an active chat; a running build must not be triggered twice.
    if (!chatId || building) return;

    // Fire-and-forget: the caller is not blocked and receives no result.
    getLexicalIndex().catch((err) => {
        xbLog.warn(MODULE_ID, '预热索引失败', err);
    });
}
// ─────────────────────────────────────────────────────────────────────────
// 公开接口invalidateLexicalIndex缓存失效
// ─────────────────────────────────────────────────────────────────────────
/**
* 使缓存失效(下次 getLexicalIndex / warmupIndex 时自动重建)
* *
* 调用时机: * 调用时机:
* - CHAT_CHANGED * - CHAT_CHANGED
* - L0 提取完成handleAnchorGenerate 完成后) * - L0 提取完成
* - L2 总结完成onComplete 回调中) * - L2 总结完成
*/ */
export function invalidateLexicalIndex() { export function invalidateLexicalIndex() {
if (cachedIndex) { if (cachedIndex) {

View File

@@ -16,6 +16,11 @@ export function createMetrics() {
query: { query: {
buildTime: 0, buildTime: 0,
refineTime: 0, refineTime: 0,
lengths: {
v0Chars: 0,
v1Chars: null, // null = NA
rerankChars: 0,
},
}, },
// Anchor (L0 StateAtoms) - 语义锚点 // Anchor (L0 StateAtoms) - 语义锚点
@@ -177,6 +182,13 @@ export function formatMetricsLog(metrics) {
lines.push('════════════════════════════════════════'); lines.push('════════════════════════════════════════');
lines.push(''); lines.push('');
// Query Length
lines.push('[Query Length] 查询长度');
lines.push(`├─ query_v0_chars: ${m.query?.lengths?.v0Chars ?? 0}`);
lines.push(`├─ query_v1_chars: ${m.query?.lengths?.v1Chars == null ? 'NA' : m.query.lengths.v1Chars}`);
lines.push(`└─ rerank_query_chars: ${m.query?.lengths?.rerankChars ?? 0}`);
lines.push('');
// Query Build // Query Build
lines.push('[Query] 查询构建'); lines.push('[Query] 查询构建');
lines.push(`├─ build_time: ${m.query.buildTime}ms`); lines.push(`├─ build_time: ${m.query.buildTime}ms`);

View File

@@ -12,36 +12,18 @@ import { getContext } from '../../../../../../../extensions.js';
import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText } from './entity-lexicon.js'; import { buildEntityLexicon, buildDisplayNameMap, extractEntitiesFromText } from './entity-lexicon.js';
import { getSummaryStore } from '../../data/store.js'; import { getSummaryStore } from '../../data/store.js';
import { filterText } from '../utils/text-filter.js'; import { filterText } from '../utils/text-filter.js';
import { tokenizeForIndex as tokenizerTokenizeForIndex } from '../utils/tokenizer.js';
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
// 常量 // 常量
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
const DIALOGUE_MAX_CHARS = 400; // Zero-darkbox policy:
const PENDING_MAX_CHARS = 400; // - No internal truncation. We rely on model-side truncation / provider limits.
const MEMORY_HINT_MAX_CHARS = 100; // - If provider rejects due to length, we fail loudly and degrade explicitly.
const MEMORY_HINT_ATOMS_MAX = 5; const MEMORY_HINT_ATOMS_MAX = 5;
const MEMORY_HINT_EVENTS_MAX = 3; const MEMORY_HINT_EVENTS_MAX = 3;
const RERANK_QUERY_MAX_CHARS = 500;
const RERANK_SNIPPET_CHARS = 150;
const LEXICAL_TERMS_MAX = 10; const LEXICAL_TERMS_MAX = 10;
const LEXICAL_TERM_MIN_LEN = 2;
const LEXICAL_TERM_MAX_LEN = 6;
// 中文停用词(高频无意义词)
const STOP_WORDS = new Set([
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
'都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
'你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
'它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
'嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
'把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
'而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
'可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
]);
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
// 工具函数 // 工具函数
@@ -65,10 +47,7 @@ function cleanMessageText(text) {
* @param {number} maxLen * @param {number} maxLen
* @returns {string} * @returns {string}
*/ */
function truncate(text, maxLen) { // truncate removed by design (zero-darkbox)
if (!text || text.length <= maxLen) return text || '';
return text.slice(0, maxLen) + '…';
}
/** /**
* 清理事件摘要(移除楼层标记) * 清理事件摘要(移除楼层标记)
@@ -84,8 +63,7 @@ function cleanSummary(summary) {
/** /**
* 从文本中提取高频实词(用于词法检索) * 从文本中提取高频实词(用于词法检索)
* *
* 策略:按中文字符边界 + 空格/标点分词,取长度 2-6 的片段 * 使用统一分词器(结巴 + 实体保护 + 停用词过滤),按频率排序
* 过滤停用词,按频率排序
* *
* @param {string} text - 清洗后的文本 * @param {string} text - 清洗后的文本
* @param {number} maxTerms - 最大词数 * @param {number} maxTerms - 最大词数
@@ -94,15 +72,15 @@ function cleanSummary(summary) {
function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) { function extractKeyTerms(text, maxTerms = LEXICAL_TERMS_MAX) {
if (!text) return []; if (!text) return [];
// 提取连续中文片段 + 英文单词 // 使用统一分词器(索引用,不去重,保留词频)
const segments = text.match(/[\u4e00-\u9fff]{2,6}|[a-zA-Z]{3,}/g) || []; const tokens = tokenizerTokenizeForIndex(text);
// 统计词频
const freq = new Map(); const freq = new Map();
for (const seg of segments) { for (const token of tokens) {
const s = seg.toLowerCase(); const key = String(token || '').toLowerCase();
if (s.length < LEXICAL_TERM_MIN_LEN || s.length > LEXICAL_TERM_MAX_LEN) continue; if (!key) continue;
if (STOP_WORDS.has(s)) continue; freq.set(key, (freq.get(key) || 0) + 1);
freq.set(s, (freq.get(s) || 0) + 1);
} }
return Array.from(freq.entries()) return Array.from(freq.entries())
@@ -160,8 +138,9 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null,
const clean = cleanMessageText(m.mes || ''); const clean = cleanMessageText(m.mes || '');
if (clean) { if (clean) {
// ★ 修复 A不使用楼层号embedding 模型不需要 // 不使用楼层号embedding 模型不需要
dialogueLines.push(`${speaker}: ${truncate(clean, DIALOGUE_MAX_CHARS)}`); // 不截断,零暗箱
dialogueLines.push(`${speaker}: ${clean}`);
allCleanText.push(clean); allCleanText.push(clean);
} }
} }
@@ -191,30 +170,15 @@ export function buildQueryBundle(lastMessages, pendingUserMessage, store = null,
} }
if (pendingClean) { if (pendingClean) {
queryParts.push(`[PENDING_USER]\n${truncate(pendingClean, PENDING_MAX_CHARS)}`); // 不截断,零暗箱
queryParts.push(`[PENDING_USER]\n${pendingClean}`);
} }
const queryText_v0 = queryParts.join('\n\n'); const queryText_v0 = queryParts.join('\n\n');
// 6. 构建 rerankQuery(短版 // 6. rerankQuery 与 embedding query 同源(零暗箱
const rerankParts = []; // 后续 refine 会把它升级为与 queryText_v1 同源。
const rerankQuery = queryText_v0;
if (focusEntities.length > 0) {
rerankParts.push(focusEntities.join(' '));
}
for (const m of (lastMessages || [])) {
const clean = cleanMessageText(m.mes || '');
if (clean) {
rerankParts.push(truncate(clean, RERANK_SNIPPET_CHARS));
}
}
if (pendingClean) {
rerankParts.push(truncate(pendingClean, RERANK_SNIPPET_CHARS));
}
const rerankQuery = truncate(rerankParts.join('\n'), RERANK_QUERY_MAX_CHARS);
// 7. 构建 lexicalTerms // 7. 构建 lexicalTerms
const entityTerms = focusEntities.map(e => e.toLowerCase()); const entityTerms = focusEntities.map(e => e.toLowerCase());
@@ -265,7 +229,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
for (const hit of topAnchors) { for (const hit of topAnchors) {
const semantic = hit.atom?.semantic || ''; const semantic = hit.atom?.semantic || '';
if (semantic) { if (semantic) {
hints.push(truncate(semantic, MEMORY_HINT_MAX_CHARS)); // 不截断,零暗箱
hints.push(semantic);
} }
} }
@@ -279,13 +244,15 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
? `${title}: ${summary}` ? `${title}: ${summary}`
: title || summary; : title || summary;
if (line) { if (line) {
hints.push(truncate(line, MEMORY_HINT_MAX_CHARS)); // 不截断,零暗箱
hints.push(line);
} }
} }
// 3. 构建 queryText_v1 // 3. 构建 queryText_v1Hints 前置,最优先)
if (hints.length > 0) { if (hints.length > 0) {
bundle.queryText_v1 = bundle.queryText_v0 + `\n\n[MEMORY_HINTS]\n${hints.join('\n')}`; const hintText = `[MEMORY_HINTS]\n${hints.join('\n')}`;
bundle.queryText_v1 = hintText + `\n\n` + bundle.queryText_v0;
} else { } else {
bundle.queryText_v1 = bundle.queryText_v0; bundle.queryText_v1 = bundle.queryText_v0;
} }
@@ -314,17 +281,8 @@ export function refineQueryBundle(bundle, anchorHits, eventHits) {
} }
} }
// 5. 增强 rerankQuery // 5. rerankQuery 与最终 query 同源(零暗箱)
if (hints.length > 0) { bundle.rerankQuery = bundle.queryText_v1 || bundle.queryText_v0;
const hintKeywords = extractKeyTerms(hints.join(' '), 5);
if (hintKeywords.length > 0) {
const addition = hintKeywords.join(' ');
bundle.rerankQuery = truncate(
bundle.rerankQuery + '\n' + addition,
RERANK_QUERY_MAX_CHARS
);
}
}
// 6. 增强 lexicalTerms // 6. 增强 lexicalTerms
if (hints.length > 0) { if (hints.length > 0) {

View File

@@ -782,6 +782,14 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
metrics.query.buildTime = Math.round(performance.now() - T_Build_Start); metrics.query.buildTime = Math.round(performance.now() - T_Build_Start);
metrics.anchor.focusEntities = bundle.focusEntities; metrics.anchor.focusEntities = bundle.focusEntities;
// Query lengths (v0 available here)
if (metrics.query?.lengths) {
metrics.query.lengths.v0Chars = String(bundle.queryText_v0 || '').length;
// v1 not built yet
metrics.query.lengths.v1Chars = null;
metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v0 || '').length;
}
xbLog.info(MODULE_ID, xbLog.info(MODULE_ID,
`Query Build: focus=[${bundle.focusEntities.join(',')}] lexTerms=[${bundle.lexicalTerms.slice(0, 5).join(',')}]` `Query Build: focus=[${bundle.focusEntities.join(',')}] lexTerms=[${bundle.lexicalTerms.slice(0, 5).join(',')}]`
); );
@@ -841,6 +849,12 @@ export async function recallMemory(allEvents, vectorConfig, options = {}) {
// 更新 focusEntitiesrefinement 可能扩展了) // 更新 focusEntitiesrefinement 可能扩展了)
metrics.anchor.focusEntities = bundle.focusEntities; metrics.anchor.focusEntities = bundle.focusEntities;
// Query lengths (v1/rerank updated here)
if (metrics.query?.lengths) {
metrics.query.lengths.v1Chars = bundle.queryText_v1 == null ? null : String(bundle.queryText_v1).length;
metrics.query.lengths.rerankChars = String(bundle.rerankQuery || bundle.queryText_v1 || bundle.queryText_v0 || '').length;
}
xbLog.info(MODULE_ID, xbLog.info(MODULE_ID,
`Refinement: focus=[${bundle.focusEntities.join(',')}] hasV1=${!!bundle.queryText_v1} (${metrics.query.refineTime}ms)` `Refinement: focus=[${bundle.focusEntities.join(',')}] hasV1=${!!bundle.queryText_v1} (${metrics.query.refineTime}ms)`
); );

View File

@@ -0,0 +1,650 @@
// ═══════════════════════════════════════════════════════════════════════════
// tokenizer.js - 统一分词器
//
// 职责:
// 1. 管理结巴 WASM 生命周期(预加载 / 就绪检测 / 降级)
// 2. 实体词典注入(分词前最长匹配保护)
// 3. 亚洲文字CJK + 假名)走结巴,拉丁文字走空格分割
// 4. 提供 tokenize(text): string[] 统一接口
//
// 加载时机:
// - 插件初始化时 storySummary.enabled && vectorConfig.enabled → preload()
// - 向量开关从 off→on 时 → preload()
// - CHAT_CHANGED 时 → injectEntities() + warmup 索引(不负责加载 WASM
//
// 降级策略:
// - WASM 未就绪时 → 实体保护 + 标点分割(不用 bigram
// ═══════════════════════════════════════════════════════════════════════════
import { extensionFolderPath } from '../../../../core/constants.js';
import { xbLog } from '../../../../core/debug-core.js';
const MODULE_ID = 'tokenizer';
// ═══════════════════════════════════════════════════════════════════════════
// WASM 状态机
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Lifecycle states of the jieba WASM module.
 * @enum {string}
 */
const WasmState = {
IDLE: 'IDLE',
LOADING: 'LOADING',
READY: 'READY',
FAILED: 'FAILED',
};
// Current lifecycle state; starts at IDLE and is advanced by preload().
let wasmState = WasmState.IDLE;
/** @type {Promise<boolean>|null} In-flight load promise (re-entrancy guard); null when no load is running */
let loadingPromise = null;
/** @type {typeof import('../../../../libs/jieba-wasm/jieba_rs_wasm.js')|null} */
let jiebaModule = null;
/** @type {Function|null} Reference to jieba's cut function */
let jiebaCut = null;
/** @type {Function|null} Reference to jieba's add_word function */
let jiebaAddWord = null;
// ═══════════════════════════════════════════════════════════════════════════
// 实体词典
// ═══════════════════════════════════════════════════════════════════════════
/** @type {string[]} Entity list sorted by length, longest first (used for longest-match masking) */
let entityList = [];
/** @type {Set<string>} Entities already passed to jieba (avoids repeated add_word calls) */
let injectedEntities = new Set();
// ═══════════════════════════════════════════════════════════════════════════
// 停用词
// ═══════════════════════════════════════════════════════════════════════════
// Stop words removed from tokenizer output (looked up after lowercasing).
const STOP_WORDS = new Set([
// High-frequency Chinese function words
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
'都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
'你', '会', '着', '没有', '看', '好', '自己', '这', '他', '她',
'它', '吗', '什么', '那', '里', '来', '吧', '呢', '啊', '哦',
'嗯', '呀', '哈', '嘿', '喂', '哎', '唉', '哇', '呃', '嘛',
'把', '被', '让', '给', '从', '向', '对', '跟', '比', '但',
'而', '或', '如果', '因为', '所以', '虽然', '但是', '然后',
'可以', '这样', '那样', '怎么', '为什么', '什么样', '哪里',
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
// Japanese particles and common function words
'は', 'が', 'を', 'に', 'で', 'と', 'の', 'も', 'へ', 'や',
'か', 'な', 'よ', 'ね', 'わ', 'だ', 'です', 'ます', 'た', 'て',
'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない',
'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
'これ', 'それ', 'あれ', 'どれ',
// Common English stop words
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
'would', 'could', 'should', 'may', 'might', 'can', 'shall',
'and', 'but', 'or', 'not', 'no', 'nor', 'so', 'yet',
'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
'it', 'its', 'he', 'she', 'his', 'her', 'they', 'them',
'this', 'that', 'these', 'those', 'i', 'me', 'my', 'you', 'your',
'we', 'our', 'if', 'then', 'than', 'when', 'what', 'which',
'who', 'how', 'where', 'there', 'here', 'all', 'each', 'every',
'both', 'few', 'more', 'most', 'other', 'some', 'such',
'only', 'own', 'same', 'just', 'very', 'also', 'about',
]);
// ═══════════════════════════════════════════════════════════════════════════
// Unicode 分类
// ═══════════════════════════════════════════════════════════════════════════
/** Inclusive [low, high] ranges that isAsian treats as Asian script. */
const ASIAN_RANGES = [
    [0x4E00, 0x9FFF],   // CJK Unified Ideographs
    [0x3400, 0x4DBF],   // CJK Extension A
    [0x3040, 0x309F],   // Hiragana
    [0x30A0, 0x30FF],   // Katakana
    [0x31F0, 0x31FF],   // Katakana Phonetic Extensions
    [0xFF65, 0xFF9F],   // Halfwidth Katakana
    [0xF900, 0xFAFF],   // CJK Compatibility Ideographs
    [0x20000, 0x2A6DF], // CJK Extension B
];

/**
 * Tell whether a character belongs to an Asian script (CJK ideographs or kana).
 *
 * The Extension B range lies above 0xFFFF, so it can only match when the caller
 * passes a full code point; a UTF-16 code unit never reaches it.
 *
 * @param {number} code - numeric character value to classify
 * @returns {boolean} true when the value falls inside one of the listed ranges
 */
function isAsian(code) {
    for (const [low, high] of ASIAN_RANGES) {
        if (code >= low && code <= high) return true;
    }
    return false;
}
/**
 * Tell whether a character is a Latin letter or an ASCII digit.
 *
 * Covers A-Z, a-z, 0-9 and the accented letters in U+00C0..U+024F.
 * The multiplication sign (U+00D7) and division sign (U+00F7) sit inside that
 * range but are math symbols, not letters, so they are excluded.
 *
 * @param {number} code - numeric character value to classify
 * @returns {boolean} true for Latin letters and ASCII digits
 */
function isLatin(code) {
    if (code === 0xD7 || code === 0xF7) return false; // × and ÷ are symbols
    return (
        (code >= 0x41 && code <= 0x5A) ||   // A-Z
        (code >= 0x61 && code <= 0x7A) ||   // a-z
        (code >= 0x30 && code <= 0x39) ||   // 0-9
        (code >= 0xC0 && code <= 0x024F)    // accented Latin letters (àáâ …)
    );
}
// ═══════════════════════════════════════════════════════════════════════════
// 文本分段(亚洲 vs 拉丁 vs 其他)
// ═══════════════════════════════════════════════════════════════════════════
/**
* @typedef {'asian'|'latin'|'other'} SegmentType
*/
/**
* @typedef {object} TextSegment
* @property {SegmentType} type - 段类型
* @property {string} text - 段文本
*/
/**
* 将文本按 Unicode 脚本分段
* 连续的同类字符归为一段
*
* @param {string} text
* @returns {TextSegment[]}
*/
function segmentByScript(text) {
if (!text) return [];
const segments = [];
let currentType = null;
let currentStart = 0;
for (let i = 0; i < text.length; i++) {
const code = text.charCodeAt(i);
let type;
if (isAsian(code)) {
type = 'asian';
} else if (isLatin(code)) {
type = 'latin';
} else {
type = 'other';
}
if (type !== currentType) {
if (currentType !== null && currentStart < i) {
const seg = text.slice(currentStart, i);
if (currentType !== 'other' || seg.trim()) {
segments.push({ type: currentType, text: seg });
}
}
currentType = type;
currentStart = i;
}
}
// 最后一段
if (currentStart < text.length) {
const seg = text.slice(currentStart);
if (currentType !== 'other' || seg.trim()) {
segments.push({ type: currentType, text: seg });
}
}
return segments;
}
// ═══════════════════════════════════════════════════════════════════════════
// 实体保护(最长匹配占位符替换)
// ═══════════════════════════════════════════════════════════════════════════
// 使用 Unicode Private Use Area (PUA) 字符作为边界,避免控制字符在分词器中产生不可控行为
const PLACEHOLDER_PREFIX = '\uE000ENT_';
const PLACEHOLDER_SUFFIX = '\uE001';
/**
* 在文本中执行实体最长匹配,替换为占位符
*
* @param {string} text - 原始文本
* @returns {{masked: string, entities: Map<string, string>}} masked 文本 + 占位符→原文映射
*/
function maskEntities(text) {
const entities = new Map();
if (!entityList.length || !text) {
return { masked: text, entities };
}
let masked = text;
let idx = 0;
// entityList 已按长度降序排列,保证最长匹配优先
for (const entity of entityList) {
// 大小写不敏感搜索
const lowerMasked = masked.toLowerCase();
const lowerEntity = entity.toLowerCase();
let searchFrom = 0;
while (true) {
const pos = lowerMasked.indexOf(lowerEntity, searchFrom);
if (pos === -1) break;
// 已被占位符覆盖则跳过(检查前后是否存在 PUA 边界字符)
const aroundStart = Math.max(0, pos - 4);
const aroundEnd = Math.min(masked.length, pos + entity.length + 4);
const around = masked.slice(aroundStart, aroundEnd);
if (around.includes('\uE000') || around.includes('\uE001')) {
searchFrom = pos + 1;
continue;
}
const placeholder = `${PLACEHOLDER_PREFIX}${idx}${PLACEHOLDER_SUFFIX}`;
const originalText = masked.slice(pos, pos + entity.length);
entities.set(placeholder, originalText);
masked = masked.slice(0, pos) + placeholder + masked.slice(pos + entity.length);
idx++;
// 更新搜索位置(跳过占位符)
searchFrom = pos + placeholder.length;
}
}
return { masked, entities };
}
/**
* 将 token 数组中的占位符还原为原始实体
*
* @param {string[]} tokens
* @param {Map<string, string>} entities - 占位符→原文映射
* @returns {string[]}
*/
function unmaskTokens(tokens, entities) {
if (!entities.size) return tokens;
return tokens.map(token => {
// token 本身就是一个占位符
if (entities.has(token)) {
return entities.get(token);
}
// token 中包含占位符(结巴可能把占位符和其他字符连在一起)
let result = token;
for (const [placeholder, original] of entities) {
if (result.includes(placeholder)) {
result = result.replace(placeholder, original);
}
}
return result;
});
}
// ═══════════════════════════════════════════════════════════════════════════
// 分词:亚洲文字(结巴 / 降级)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tokenize an Asian-script segment with jieba.
 *
 * Calls jieba's cut with HMM enabled, trims every word and keeps those of at
 * least two characters. If jieba throws, a warning is logged and the fallback
 * tokenizer is used for this text instead.
 *
 * @param {string} text
 * @returns {string[]} empty when text is empty or jieba's cut is not available
 */
function tokenizeAsianJieba(text) {
    if (!text || !jiebaCut) return [];

    try {
        const kept = [];
        for (const piece of Array.from(jiebaCut(text, true))) { // second argument: hmm=true
            const word = String(piece || '').trim();
            if (word.length >= 2) kept.push(word);
        }
        return kept;
    } catch (e) {
        xbLog.warn(MODULE_ID, '结巴分词异常,降级处理', e);
        return tokenizeAsianFallback(text);
    }
}
/**
* 降级分词:标点/空格分割 + 保留 2-6 字 CJK 片段
* 不使用 bigram避免索引膨胀
*
* @param {string} text
* @returns {string[]}
*/
function tokenizeAsianFallback(text) {
if (!text) return [];
const tokens = [];
// 按标点和空格分割
const parts = text.split(/[\s""''()【】《》…—\-,.!?;:'"()[\]{}<>/\\|@#$%^&*+=~`]+/);
for (const part of parts) {
const trimmed = part.trim();
if (!trimmed) continue;
if (trimmed.length >= 2 && trimmed.length <= 6) {
tokens.push(trimmed);
} else if (trimmed.length > 6) {
// 长片段按 4 字滑窗切分(比 bigram 稀疏得多)
for (let i = 0; i <= trimmed.length - 4; i += 2) {
tokens.push(trimmed.slice(i, i + 4));
}
// 保留完整片段的前 6 字
tokens.push(trimmed.slice(0, 6));
}
}
return tokens;
}
// ═══════════════════════════════════════════════════════════════════════════
// 分词:拉丁文字
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tokenize Latin-script text by splitting on whitespace and punctuation.
 *
 * Every piece is trimmed and lowercased; pieces shorter than three characters
 * are discarded.
 *
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeLatin(text) {
    if (!text) return [];

    const words = [];
    for (const piece of text.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*+=~`]+/)) {
        const word = piece.trim().toLowerCase();
        if (word.length >= 3) {
            words.push(word);
        }
    }
    return words;
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口preload
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Preload the jieba WASM module.
 *
 * Safe to call repeatedly: a call made while a load is in flight waits for
 * that load instead of starting another one. Calling again after a failure
 * (state FAILED) starts a fresh attempt.
 *
 * @returns {Promise<boolean>} true when the module is ready, false when loading failed
 */
export async function preload() {
    // Already loaded
    if (wasmState === WasmState.READY) return true;

    // A load is running: wait for it and report how it ended
    if (wasmState === WasmState.LOADING && loadingPromise) {
        try {
            await loadingPromise;
        } catch {
            return false;
        }
        return wasmState === WasmState.READY;
    }

    // IDLE or FAILED → start loading
    wasmState = WasmState.LOADING;
    const T0 = performance.now();

    const load = async () => {
        try {
            // Dynamically import the jieba module
            const wasmPath = `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm_bg.wasm`;
            // eslint-disable-next-line no-unsanitized/method
            jiebaModule = await import(
                `${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm.js`
            );

            // Initialise the WASM binary when the module exposes an init function
            if (typeof jiebaModule.default === 'function') {
                await jiebaModule.default(wasmPath);
            }

            // Cache function references
            jiebaCut = jiebaModule.cut;
            jiebaAddWord = jiebaModule.add_word;
            if (typeof jiebaCut !== 'function') {
                throw new Error('jieba cut 函数不存在');
            }

            wasmState = WasmState.READY;
            const elapsed = Math.round(performance.now() - T0);
            xbLog.info(MODULE_ID, `结巴 WASM 加载完成 (${elapsed}ms)`);

            // Entities registered before the module was ready are injected now
            if (entityList.length > 0 && jiebaAddWord) {
                reInjectAllEntities();
            }
            return true;
        } catch (e) {
            wasmState = WasmState.FAILED;
            xbLog.error(MODULE_ID, '结巴 WASM 加载失败', e);
            throw e;
        }
    };

    loadingPromise = load();

    try {
        await loadingPromise;
        return true;
    } catch {
        return false;
    } finally {
        loadingPromise = null;
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口isReady
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Report whether jieba is ready for use.
 * @returns {boolean} true only when the WASM state is READY
 */
export function isReady() {
return wasmState === WasmState.READY;
}
/**
 * Get the current WASM lifecycle state.
 * @returns {string} one of the WasmState values
 */
export function getState() {
return wasmState;
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口injectEntities
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Install the entity dictionary.
 *
 * Rebuilds the internal entity list used for longest-match masking. When jieba
 * is already loaded, entities not yet known to it are also passed to add_word.
 * An empty or missing lexicon clears the list.
 *
 * @param {Set<string>} lexicon - normalized entity names
 * @param {Map<string, string>} [displayMap] - normalized name → original surface form
 */
export function injectEntities(lexicon, displayMap) {
    if (!lexicon?.size) {
        entityList = [];
        return;
    }

    // Prefer the original surface form, drop names shorter than two characters,
    // and order longest-first so longer names win during matching.
    const entities = Array.from(lexicon, (normalized) => displayMap?.get(normalized) || normalized)
        .filter((display) => display.length >= 2)
        .sort((a, b) => b.length - a.length);

    entityList = entities;

    // Feed custom words to jieba if it is already loaded
    const jiebaUsable = wasmState === WasmState.READY && jiebaAddWord;
    if (jiebaUsable) {
        injectNewEntitiesToJieba(entities);
    }

    xbLog.info(MODULE_ID, `实体词典更新: ${entities.length} 个实体`);
}
/**
 * Add entities to jieba's dictionary incrementally, skipping those already added.
 *
 * A failure for one entity is logged as a warning and does not stop the rest.
 *
 * @param {string[]} entities
 */
function injectNewEntitiesToJieba(entities) {
    let added = 0;

    for (const name of entities) {
        if (injectedEntities.has(name)) continue;

        try {
            // A high frequency keeps jieba from splitting the word apart
            jiebaAddWord(name, 99999);
            injectedEntities.add(name);
            added += 1;
        } catch (e) {
            xbLog.warn(MODULE_ID, `add_word 失败: ${name}`, e);
        }
    }

    if (added > 0) {
        xbLog.info(MODULE_ID, `注入 ${added} 个新实体到结巴`);
    }
}
/**
 * Re-inject every known entity into jieba (called right after the WASM module finishes loading).
 * The record of injected entities is cleared first, so each entry of entityList is passed to add_word again.
 */
function reInjectAllEntities() {
injectedEntities.clear();
injectNewEntitiesToJieba(entityList);
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口tokenize
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Unified tokenization entry point.
 *
 * Runs the core pipeline (entity masking → script segmentation → per-script
 * tokenization → placeholder restoration), then cleans the result:
 *   - drops empty tokens, tokens shorter than two characters, stop words and
 *     tokens made only of whitespace / control characters / punctuation / symbols;
 *   - removes duplicates case-insensitively, keeping the first occurrence.
 * Returned tokens keep their original casing (trimmed).
 *
 * @param {string} text - input text
 * @returns {string[]} token array
 */
export function tokenize(text) {
    // lowercased key → first trimmed token seen; a Map preserves insertion order
    const firstSeen = new Map();

    for (const token of tokenizeCore(text)) {
        const original = token.trim();
        const key = original.toLowerCase();

        const tooShort = !key || key.length < 2;
        if (tooShort || STOP_WORDS.has(key) || firstSeen.has(key)) continue;

        // Skip tokens consisting purely of punctuation / special characters
        if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(key)) continue;

        firstSeen.set(key, original);
    }

    return Array.from(firstSeen.values());
}
/** Matches one entity placeholder produced by maskEntities; the capture group keeps it in split() output. */
const ENTITY_PLACEHOLDER_SPLIT = /(\uE000ENT_\d+\uE001)/;

/**
 * Core tokenization pipeline: entity masking → segmentation → per-script
 * tokenization → placeholder restoration. No deduplication, no lowercasing.
 *
 * The masked text is first cut at placeholder boundaries. Placeholders are
 * passed through as atomic tokens and only the text between them is segmented
 * by script. Feeding a placeholder to segmentByScript would shred it
 * (PUA char → 'other', "ENT" → 'latin', "_" → 'other', digits → 'latin'),
 * so it could never be restored: the entity would vanish and a stray "ent"
 * token would be emitted in its place.
 *
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeCore(text) {
    if (!text) return [];

    const input = String(text).trim();
    if (!input) return [];

    // 1. Protect entities
    const { masked, entities } = maskEntities(input);

    // 2. Isolate placeholders so they bypass script segmentation
    const pieces = entities.size > 0 ? masked.split(ENTITY_PLACEHOLDER_SPLIT) : [masked];

    const useJieba = wasmState === WasmState.READY && jiebaCut;

    // 3. Segment and tokenize the text between placeholders
    const rawTokens = [];
    for (const piece of pieces) {
        if (!piece) continue;

        if (entities.has(piece)) {
            rawTokens.push(piece); // restored in step 4
            continue;
        }

        for (const seg of segmentByScript(piece)) {
            if (seg.type === 'asian') {
                if (useJieba) {
                    rawTokens.push(...tokenizeAsianJieba(seg.text));
                } else {
                    rawTokens.push(...tokenizeAsianFallback(seg.text));
                }
            } else if (seg.type === 'latin') {
                rawTokens.push(...tokenizeLatin(seg.text));
            }
            // 'other' segments (punctuation, symbols) produce no tokens
        }
    }

    // 4. Restore placeholders
    return unmaskTokens(rawTokens, entities);
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口tokenizeForIndex
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tokenization variant for the MiniSearch index.
 *
 * Differences from tokenize():
 *   - every token is lowercased;
 *   - duplicates are kept.
 * Empty tokens, tokens shorter than two characters, stop words and tokens made
 * only of whitespace / control characters / punctuation / symbols are removed.
 *
 * @param {string} text
 * @returns {string[]}
 */
export function tokenizeForIndex(text) {
    const indexed = [];

    for (const raw of tokenizeCore(text)) {
        const t = raw.trim().toLowerCase();

        if (!t || t.length < 2) continue;
        if (STOP_WORDS.has(t)) continue;
        if (/^[\s\x00-\x1F\p{P}\p{S}]+$/u.test(t)) continue;

        indexed.push(t);
    }

    return indexed;
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口reset
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Reset the tokenizer's entity state.
 * Intended for tests or module teardown (per the original note).
 */
export function reset() {
entityList = [];
injectedEntities.clear();
// The WASM state is deliberately left untouched (avoids loading the module again)
}