diff --git a/core/debug-core.js b/core/debug-core.js index 0abc5d0..f05ea36 100644 --- a/core/debug-core.js +++ b/core/debug-core.js @@ -1,4 +1,4 @@ -import { EventCenter } from "./event-manager.js"; +import { EventCenter } from "./event-manager.js"; const DEFAULT_MAX_LOGS = 200; @@ -110,14 +110,14 @@ class LoggerCore { }); } - info(moduleId, message) { - this._log("info", moduleId, message, null); + info(moduleId, ...args) { + const msg = args.map(a => (typeof a === 'string' ? a : safeStringify(a))).join(' '); + this._log('info', moduleId, msg, null); } - - warn(moduleId, message) { - this._log("warn", moduleId, message, null); + warn(moduleId, ...args) { + const msg = args.map(a => (typeof a === 'string' ? a : safeStringify(a))).join(' '); + this._log('warn', moduleId, msg, null); } - error(moduleId, message, err) { this._log("error", moduleId, message, err || null); } diff --git a/libs/jieba-wasm/jieba_rs_wasm.js b/libs/jieba-wasm/jieba_rs_wasm.js new file mode 100644 index 0000000..7281ce6 --- /dev/null +++ b/libs/jieba-wasm/jieba_rs_wasm.js @@ -0,0 +1,438 @@ +let wasm; + +let cachedUint8ArrayMemory0 = null; + +function getUint8ArrayMemory0() { + if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) { + cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer); + } + return cachedUint8ArrayMemory0; +} + +let cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? new TextDecoder('utf-8', { ignoreBOM: true, fatal: true }) : { decode: () => { throw Error('TextDecoder not available') } } ); + +if (typeof TextDecoder !== 'undefined') { cachedTextDecoder.decode(); }; + +const MAX_SAFARI_DECODE_BYTES = 2146435072; +let numBytesDecoded = 0; +function decodeText(ptr, len) { + numBytesDecoded += len; + if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) { + cachedTextDecoder = (typeof TextDecoder !== 'undefined' ? 
/**
 * Build a short, human-readable description of an arbitrary JS value.
 *
 * Output shapes:
 *   - numbers, booleans, null, undefined -> their template-literal form (`${value}`)
 *   - strings                            -> the string wrapped in double quotes
 *   - symbols                            -> `Symbol` or `Symbol(description)`
 *   - functions                          -> `Function` or `Function(name)`
 *   - arrays                             -> `[a, b, c]`, each element described recursively
 *   - plain objects                      -> `Object(<JSON>)`, or `Object` when JSON.stringify throws
 *   - Error instances                    -> `name: message\nstack`
 *   - anything else                      -> the class name taken from `[object ClassName]`
 *
 * An array that (directly or indirectly) contains itself is rendered as
 * `[Circular]` at the point of the back-reference instead of recursing
 * until the call stack overflows.
 *
 * @param {*} val - value to describe
 * @returns {string} description of `val`
 */
function debugString(val) {
    // Arrays currently being rendered higher up the call chain. Only ancestors
    // are tracked, so the same array appearing twice side by side is still
    // printed in full both times.
    const activeArrays = new Set();

    const describe = (value) => {
        // primitive types
        const type = typeof value;
        if (type === 'number' || type === 'boolean' || value == null) {
            return `${value}`;
        }
        if (type === 'string') {
            return `"${value}"`;
        }
        if (type === 'symbol') {
            const description = value.description;
            return description == null ? 'Symbol' : `Symbol(${description})`;
        }
        if (type === 'function') {
            const name = value.name;
            return typeof name === 'string' && name.length > 0 ? `Function(${name})` : 'Function';
        }

        // arrays
        if (Array.isArray(value)) {
            if (activeArrays.has(value)) {
                return '[Circular]';
            }
            activeArrays.add(value);
            try {
                // Index loop (not map) so holes in sparse arrays read as `undefined`.
                const parts = [];
                for (let i = 0; i < value.length; i++) {
                    parts.push(describe(value[i]));
                }
                return `[${parts.join(', ')}]`;
            } finally {
                activeArrays.delete(value);
            }
        }

        // Test for built-in
        const tag = Object.prototype.toString.call(value);
        const builtInMatches = /\[object ([^\]]+)\]/.exec(tag);
        if (!builtInMatches || builtInMatches.length <= 1) {
            // Failed to match the standard '[object ClassName]'
            return tag;
        }
        const className = builtInMatches[1];

        if (className === 'Object') {
            // User-defined class or plain object. JSON.stringify throws on
            // cycles (and BigInt fields); fall back to the bare label then.
            try {
                return 'Object(' + JSON.stringify(value) + ')';
            } catch (_) {
                return 'Object';
            }
        }

        // errors
        if (value instanceof Error) {
            return `${value.name}: ${value.message}\n${value.stack}`;
        }

        // TODO we could test for more things here, like `Set`s and `Map`s.
        return className;
    };

    return describe(val);
}
const result = []; + for (let i = ptr; i < ptr + 4 * len; i += 4) { + result.push(wasm.__wbindgen_export_2.get(mem.getUint32(i, true))); + } + wasm.__externref_drop_slice(ptr, len); + return result; +} +/** + * @param {string} text + * @param {boolean | null} [hmm] + * @returns {string[]} + */ +export function cut(text, hmm) { + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.cut(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0); + var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v2; +} + +/** + * @param {string} text + * @returns {string[]} + */ +export function cut_all(text) { + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.cut_all(ptr0, len0); + var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v2; +} + +/** + * @param {string} text + * @param {boolean | null} [hmm] + * @returns {string[]} + */ +export function cut_for_search(text, hmm) { + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.cut_for_search(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 
1 : 0); + var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v2; +} + +function takeFromExternrefTable0(idx) { + const value = wasm.__wbindgen_export_2.get(idx); + wasm.__externref_table_dealloc(idx); + return value; +} +/** + * @param {string} text + * @param {string} mode + * @param {boolean | null} [hmm] + * @returns {Token[]} + */ +export function tokenize(text, mode, hmm) { + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ptr1 = passStringToWasm0(mode, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len1 = WASM_VECTOR_LEN; + const ret = wasm.tokenize(ptr0, len0, ptr1, len1, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0); + if (ret[3]) { + throw takeFromExternrefTable0(ret[2]); + } + var v3 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v3; +} + +/** + * @param {string} word + * @param {number | null} [freq] + * @param {string | null} [tag] + * @returns {number} + */ +export function add_word(word, freq, tag) { + const ptr0 = passStringToWasm0(word, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + var ptr1 = isLikeNone(tag) ? 0 : passStringToWasm0(tag, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + var len1 = WASM_VECTOR_LEN; + const ret = wasm.add_word(ptr0, len0, isLikeNone(freq) ? 0x100000001 : (freq) >>> 0, ptr1, len1); + return ret >>> 0; +} + +/** + * @param {string} sentence + * @param {boolean | null} [hmm] + * @returns {Tag[]} + */ +export function tag(sentence, hmm) { + const ptr0 = passStringToWasm0(sentence, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.tag(ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 
1 : 0); + var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice(); + wasm.__wbindgen_free(ret[0], ret[1] * 4, 4); + return v2; +} + +/** + * @param {string} dict + */ +export function with_dict(dict) { + const ptr0 = passStringToWasm0(dict, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.with_dict(ptr0, len0); + if (ret[1]) { + throw takeFromExternrefTable0(ret[0]); + } +} + +const EXPECTED_RESPONSE_TYPES = new Set(['basic', 'cors', 'default']); + +async function __wbg_load(module, imports) { + if (typeof Response === 'function' && module instanceof Response) { + if (typeof WebAssembly.instantiateStreaming === 'function') { + try { + return await WebAssembly.instantiateStreaming(module, imports); + + } catch (e) { + const validResponse = module.ok && EXPECTED_RESPONSE_TYPES.has(module.type); + + if (validResponse && module.headers.get('Content-Type') !== 'application/wasm') { + console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. 
Original error:\n", e); + + } else { + throw e; + } + } + } + + const bytes = await module.arrayBuffer(); + return await WebAssembly.instantiate(bytes, imports); + + } else { + const instance = await WebAssembly.instantiate(module, imports); + + if (instance instanceof WebAssembly.Instance) { + return { instance, module }; + + } else { + return instance; + } + } +} + +function __wbg_get_imports() { + const imports = {}; + imports.wbg = {}; + imports.wbg.__wbg_Error_0497d5bdba9362e5 = function(arg0, arg1) { + const ret = Error(getStringFromWasm0(arg0, arg1)); + return ret; + }; + imports.wbg.__wbg_new_07b483f72211fd66 = function() { + const ret = new Object(); + return ret; + }; + imports.wbg.__wbg_set_3f1d0b984ed272ed = function(arg0, arg1, arg2) { + arg0[arg1] = arg2; + }; + imports.wbg.__wbindgen_bigint_from_u64 = function(arg0) { + const ret = BigInt.asUintN(64, arg0); + return ret; + }; + imports.wbg.__wbindgen_debug_string = function(arg0, arg1) { + const ret = debugString(arg1); + const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len1 = WASM_VECTOR_LEN; + getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true); + getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true); + }; + imports.wbg.__wbindgen_init_externref_table = function() { + const table = wasm.__wbindgen_export_2; + const offset = table.grow(4); + table.set(0, undefined); + table.set(offset + 0, undefined); + table.set(offset + 1, null); + table.set(offset + 2, true); + table.set(offset + 3, false); + ; + }; + imports.wbg.__wbindgen_number_new = function(arg0) { + const ret = arg0; + return ret; + }; + imports.wbg.__wbindgen_string_new = function(arg0, arg1) { + const ret = getStringFromWasm0(arg0, arg1); + return ret; + }; + imports.wbg.__wbindgen_throw = function(arg0, arg1) { + throw new Error(getStringFromWasm0(arg0, arg1)); + }; + + return imports; +} + +function __wbg_init_memory(imports, memory) { + +} + +function __wbg_finalize_init(instance, 
module) { + wasm = instance.exports; + __wbg_init.__wbindgen_wasm_module = module; + cachedDataViewMemory0 = null; + cachedUint8ArrayMemory0 = null; + + + wasm.__wbindgen_start(); + return wasm; +} + +function initSync(module) { + if (wasm !== undefined) return wasm; + + + if (typeof module !== 'undefined') { + if (Object.getPrototypeOf(module) === Object.prototype) { + ({module} = module) + } else { + console.warn('using deprecated parameters for `initSync()`; pass a single object instead') + } + } + + const imports = __wbg_get_imports(); + + __wbg_init_memory(imports); + + if (!(module instanceof WebAssembly.Module)) { + module = new WebAssembly.Module(module); + } + + const instance = new WebAssembly.Instance(module, imports); + + return __wbg_finalize_init(instance, module); +} + +async function __wbg_init(module_or_path) { + if (wasm !== undefined) return wasm; + + + if (typeof module_or_path !== 'undefined') { + if (Object.getPrototypeOf(module_or_path) === Object.prototype) { + ({module_or_path} = module_or_path) + } else { + console.warn('using deprecated parameters for the initialization function; pass a single object instead') + } + } + + if (typeof module_or_path === 'undefined') { + module_or_path = new URL('jieba_rs_wasm_bg.wasm', import.meta.url); + } + const imports = __wbg_get_imports(); + + if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) { + module_or_path = fetch(module_or_path); + } + + __wbg_init_memory(imports); + + const { instance, module } = await __wbg_load(await module_or_path, imports); + + return __wbg_finalize_init(instance, module); +} + +export { initSync }; +export default __wbg_init; diff --git a/libs/jieba-wasm/jieba_rs_wasm_bg.js b/libs/jieba-wasm/jieba_rs_wasm_bg.js new file mode 100644 index 0000000..790c302 --- /dev/null +++ b/libs/jieba-wasm/jieba_rs_wasm_bg.js @@ -0,0 +1,372 @@ +let wasm; +export 
/**
 * Build a short, human-readable description of an arbitrary JS value.
 *
 * Output shapes:
 *   - numbers, booleans, null, undefined -> their template-literal form (`${value}`)
 *   - strings                            -> the string wrapped in double quotes
 *   - symbols                            -> `Symbol` or `Symbol(description)`
 *   - functions                          -> `Function` or `Function(name)`
 *   - arrays                             -> `[a, b, c]`, each element described recursively
 *   - plain objects                      -> `Object(<JSON>)`, or `Object` when JSON.stringify throws
 *   - Error instances                    -> `name: message\nstack`
 *   - anything else                      -> the class name taken from `[object ClassName]`,
 *                                           or the raw tag string when that pattern does not match
 *
 * Never throws for unusual inputs: a tag that does not match
 * `[object ClassName]` is returned as-is, and an array that contains itself
 * is rendered as `[Circular]` at the back-reference.
 *
 * @param {*} val - value to describe
 * @returns {string} description of `val`
 */
function debugString(val) {
    // Arrays currently being rendered higher up the call chain. Only ancestors
    // are tracked, so a repeated (non-cyclic) reference is still printed fully.
    const activeArrays = new Set();

    const describe = (value) => {
        // primitive types
        const type = typeof value;
        if (type === 'number' || type === 'boolean' || value == null) {
            return `${value}`;
        }
        if (type === 'string') {
            return `"${value}"`;
        }
        if (type === 'symbol') {
            const description = value.description;
            return description == null ? 'Symbol' : `Symbol(${description})`;
        }
        if (type === 'function') {
            const name = value.name;
            return typeof name === 'string' && name.length > 0 ? `Function(${name})` : 'Function';
        }

        // arrays
        if (Array.isArray(value)) {
            if (activeArrays.has(value)) {
                return '[Circular]';
            }
            activeArrays.add(value);
            try {
                // Index loop (not map) so holes in sparse arrays read as `undefined`.
                const parts = [];
                for (let i = 0; i < value.length; i++) {
                    parts.push(describe(value[i]));
                }
                return `[${parts.join(', ')}]`;
            } finally {
                activeArrays.delete(value);
            }
        }

        // Test for built-in
        const tag = Object.prototype.toString.call(value);
        const builtInMatches = /\[object ([^\]]+)\]/.exec(tag);
        // exec() returns null when the tag has no class name (for example an
        // empty Symbol.toStringTag yields "[object ]"), so guard before
        // touching `.length`.
        if (!builtInMatches || builtInMatches.length <= 1) {
            // Failed to match the standard '[object ClassName]'
            return tag;
        }
        const className = builtInMatches[1];

        if (className === 'Object') {
            // User-defined class or plain object. JSON.stringify throws on
            // cycles (and BigInt fields); fall back to the bare label then.
            try {
                return 'Object(' + JSON.stringify(value) + ')';
            } catch (_) {
                return 'Object';
            }
        }

        // errors
        if (value instanceof Error) {
            return `${value.name}: ${value.message}\n${value.stack}`;
        }

        // TODO we could test for more things here, like `Set`s and `Map`s.
        return className;
    };

    return describe(val);
}
function (arg, view) { + return cachedTextEncoder.encodeInto(arg, view); +} + : function (arg, view) { + const buf = cachedTextEncoder.encode(arg); + view.set(buf); + return { + read: arg.length, + written: buf.length + }; +}); + +function passStringToWasm0(arg, malloc, realloc) { + + if (realloc === undefined) { + const buf = cachedTextEncoder.encode(arg); + const ptr = malloc(buf.length, 1) >>> 0; + getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf); + WASM_VECTOR_LEN = buf.length; + return ptr; + } + + let len = arg.length; + let ptr = malloc(len, 1) >>> 0; + + const mem = getUint8ArrayMemory0(); + + let offset = 0; + + for (; offset < len; offset++) { + const code = arg.charCodeAt(offset); + if (code > 0x7F) break; + mem[ptr + offset] = code; + } + + if (offset !== len) { + if (offset !== 0) { + arg = arg.slice(offset); + } + ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0; + const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len); + const ret = encodeString(arg, view); + + offset += ret.written; + ptr = realloc(ptr, len, offset, 1) >>> 0; + } + + WASM_VECTOR_LEN = offset; + return ptr; +} + +let cachedDataViewMemory0 = null; + +function getDataViewMemory0() { + if (cachedDataViewMemory0 === null || cachedDataViewMemory0.buffer.detached === true || (cachedDataViewMemory0.buffer.detached === undefined && cachedDataViewMemory0.buffer !== wasm.memory.buffer)) { + cachedDataViewMemory0 = new DataView(wasm.memory.buffer); + } + return cachedDataViewMemory0; +} + +function isLikeNone(x) { + return x === undefined || x === null; +} + +function getArrayJsValueFromWasm0(ptr, len) { + ptr = ptr >>> 0; + const mem = getDataViewMemory0(); + const result = []; + for (let i = ptr; i < ptr + 4 * len; i += 4) { + result.push(takeObject(mem.getUint32(i, true))); + } + return result; +} +/** + * @param {string} text + * @param {boolean | undefined} [hmm] + * @returns {any[]} + */ +export function cut(text, hmm) { + try { + const retptr 
= wasm.__wbindgen_add_to_stack_pointer(-16); + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + wasm.cut(retptr, ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0); + var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true); + var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true); + var v2 = getArrayJsValueFromWasm0(r0, r1).slice(); + wasm.__wbindgen_free(r0, r1 * 4, 4); + return v2; + } finally { + wasm.__wbindgen_add_to_stack_pointer(16); + } +} + +/** + * @param {string} text + * @returns {any[]} + */ +export function cut_all(text) { + try { + const retptr = wasm.__wbindgen_add_to_stack_pointer(-16); + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + wasm.cut_all(retptr, ptr0, len0); + var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true); + var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true); + var v2 = getArrayJsValueFromWasm0(r0, r1).slice(); + wasm.__wbindgen_free(r0, r1 * 4, 4); + return v2; + } finally { + wasm.__wbindgen_add_to_stack_pointer(16); + } +} + +/** + * @param {string} text + * @param {boolean | undefined} [hmm] + * @returns {any[]} + */ +export function cut_for_search(text, hmm) { + try { + const retptr = wasm.__wbindgen_add_to_stack_pointer(-16); + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + wasm.cut_for_search(retptr, ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 
1 : 0); + var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true); + var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true); + var v2 = getArrayJsValueFromWasm0(r0, r1).slice(); + wasm.__wbindgen_free(r0, r1 * 4, 4); + return v2; + } finally { + wasm.__wbindgen_add_to_stack_pointer(16); + } +} + +/** + * @param {string} text + * @param {string} mode + * @param {boolean | undefined} [hmm] + * @returns {any[]} + */ +export function tokenize(text, mode, hmm) { + try { + const retptr = wasm.__wbindgen_add_to_stack_pointer(-16); + const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + const ptr1 = passStringToWasm0(mode, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len1 = WASM_VECTOR_LEN; + wasm.tokenize(retptr, ptr0, len0, ptr1, len1, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0); + var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true); + var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true); + var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true); + var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true); + if (r3) { + throw takeObject(r2); + } + var v3 = getArrayJsValueFromWasm0(r0, r1).slice(); + wasm.__wbindgen_free(r0, r1 * 4, 4); + return v3; + } finally { + wasm.__wbindgen_add_to_stack_pointer(16); + } +} + +/** + * @param {string} word + * @param {number | undefined} [freq] + * @param {string | undefined} [tag] + * @returns {number} + */ +export function add_word(word, freq, tag) { + const ptr0 = passStringToWasm0(word, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + var ptr1 = isLikeNone(tag) ? 0 : passStringToWasm0(tag, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + var len1 = WASM_VECTOR_LEN; + const ret = wasm.add_word(ptr0, len0, !isLikeNone(freq), isLikeNone(freq) ? 
0 : freq, ptr1, len1); + return ret >>> 0; +} + +/** + * @param {string} sentence + * @param {boolean | undefined} [hmm] + * @returns {any[]} + */ +export function tag(sentence, hmm) { + try { + const retptr = wasm.__wbindgen_add_to_stack_pointer(-16); + const ptr0 = passStringToWasm0(sentence, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len0 = WASM_VECTOR_LEN; + wasm.tag(retptr, ptr0, len0, isLikeNone(hmm) ? 0xFFFFFF : hmm ? 1 : 0); + var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true); + var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true); + var v2 = getArrayJsValueFromWasm0(r0, r1).slice(); + wasm.__wbindgen_free(r0, r1 * 4, 4); + return v2; + } finally { + wasm.__wbindgen_add_to_stack_pointer(16); + } +} + +export function __wbindgen_object_drop_ref(arg0) { + takeObject(arg0); +}; + +export function __wbindgen_string_new(arg0, arg1) { + const ret = getStringFromWasm0(arg0, arg1); + return addHeapObject(ret); +}; + +export function __wbindgen_object_clone_ref(arg0) { + const ret = getObject(arg0); + return addHeapObject(ret); +}; + +export function __wbg_new_1e7c00339420672b() { + const ret = new Object(); + return addHeapObject(ret); +}; + +export function __wbindgen_number_new(arg0) { + const ret = arg0; + return addHeapObject(ret); +}; + +export function __wbg_set_1754fb90457a8cce(arg0, arg1, arg2) { + getObject(arg0)[takeObject(arg1)] = takeObject(arg2); +}; + +export function __wbg_new_b44ab9ef6060dd36(arg0, arg1) { + const ret = new Error(getStringFromWasm0(arg0, arg1)); + return addHeapObject(ret); +}; + +export function __wbindgen_debug_string(arg0, arg1) { + const ret = debugString(getObject(arg1)); + const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len1 = WASM_VECTOR_LEN; + getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true); + getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true); +}; + +export function __wbindgen_throw(arg0, arg1) { + throw new 
Error(getStringFromWasm0(arg0, arg1)); +}; + diff --git a/libs/jieba-wasm/jieba_rs_wasm_bg.wasm b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm new file mode 100644 index 0000000..92df1dc Binary files /dev/null and b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm differ diff --git a/libs/jieba-wasm/jieba_rs_wasm_bg.wasm.d.ts b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm.d.ts new file mode 100644 index 0000000..ab7e1cd --- /dev/null +++ b/libs/jieba-wasm/jieba_rs_wasm_bg.wasm.d.ts @@ -0,0 +1,25 @@ +/* tslint:disable */ +/* eslint-disable */ +export const memory: WebAssembly.Memory; +export const cut: (a: number, b: number, c: number) => [number, number]; +export const cut_all: (a: number, b: number) => [number, number]; +export const cut_for_search: (a: number, b: number, c: number) => [number, number]; +export const tokenize: (a: number, b: number, c: number, d: number, e: number) => [number, number, number, number]; +export const add_word: (a: number, b: number, c: number, d: number, e: number) => number; +export const tag: (a: number, b: number, c: number) => [number, number]; +export const with_dict: (a: number, b: number) => [number, number]; +export const rust_zstd_wasm_shim_qsort: (a: number, b: number, c: number, d: number) => void; +export const rust_zstd_wasm_shim_malloc: (a: number) => number; +export const rust_zstd_wasm_shim_memcmp: (a: number, b: number, c: number) => number; +export const rust_zstd_wasm_shim_calloc: (a: number, b: number) => number; +export const rust_zstd_wasm_shim_free: (a: number) => void; +export const rust_zstd_wasm_shim_memcpy: (a: number, b: number, c: number) => number; +export const rust_zstd_wasm_shim_memmove: (a: number, b: number, c: number) => number; +export const rust_zstd_wasm_shim_memset: (a: number, b: number, c: number) => number; +export const __wbindgen_malloc: (a: number, b: number) => number; +export const __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number; +export const __wbindgen_export_2: WebAssembly.Table; 
/**
 * Return the canonical form of a relationship predicate, or null when the
 * predicate is not a relationship predicate.
 *
 * Only two shapes are accepted, each with a non-empty X:
 *   - "对X的看法"
 *   - "与X的关系"
 *
 * The input is coerced to a string and trimmed first, so a predicate padded
 * with whitespace or a trailing newline is still recognised and returned in
 * trimmed form. null / undefined yield null.
 *
 * @param {*} p - candidate predicate
 * @returns {string|null} the trimmed predicate when it matches, otherwise null
 */
function normalizeRelationPredicate(p) {
    const predicate = String(p ?? '').trim();
    const isRelation = /^对.+的看法$/.test(predicate) || /^与.+的关系$/.test(predicate);
    return isRelation ? predicate : null;
}
/**
 * Assemble the payload object sent to the frame with a "SUMMARY_FULL_DATA"
 * message.
 *
 * Every list falls back to an empty array when it is missing or falsy, and
 * `lastSummarizedMesId` falls back to -1 when the store has none.
 * `characters.relationships` is derived from the facts list through
 * `extractRelationshipsFromFacts` rather than read from the stored JSON.
 *
 * @param {object|null|undefined} store - summary store; may be absent
 * @returns {{keywords: Array, events: Array, characters: {main: Array, relationships: *},
 *            arcs: Array, facts: Array, lastSummarizedMesId: *}}
 */
function buildFramePayload(store) {
    const summary = store?.json || {};
    const facts = summary.facts || [];

    const keywords = summary.keywords || [];
    const events = summary.events || [];
    const characters = {
        main: summary.characters?.main || [],
        relationships: extractRelationshipsFromFacts(facts),
    };
    const arcs = summary.arcs || [];
    const lastSummarizedMesId = store?.lastSummarizedMesId ?? -1;

    return { keywords, events, characters, arcs, facts, lastSummarizedMesId };
}
--- a/modules/story-summary/vector/recall.js +++ b/modules/story-summary/vector/recall.js @@ -21,6 +21,10 @@ import { mergeAndSparsify, } from './state-recall.js'; import { ensureEventTextIndex, searchEventsByText } from './text-search.js'; +import { + extractRareTerms, + extractNounsFromFactsO, +} from './tokenizer.js'; const MODULE_ID = 'recall'; @@ -50,6 +54,10 @@ const CONFIG = { RRF_K: 60, TEXT_SEARCH_LIMIT: 80, + + // TEXT-only 质量控制 + TEXT_SOFT_MIN_SIM: 0.50, + TEXT_TOTAL_MAX: 6, }; // ═══════════════════════════════════════════════════════════════════════════ @@ -83,7 +91,7 @@ function fuseEventsByRRF(vectorRanked, textRanked, eventById, k = CONFIG.RRF_K) const upsert = (id) => { if (!map.has(id)) { - map.set(id, { id, rrf: 0, vRank: Infinity, tRank: Infinity, type: 'TEXT' }); + map.set(id, { id, rrf: 0, vRank: Infinity, tRank: Infinity, type: 'TEXT', rawSim: 0, vector: null }); } return map.get(id); }; @@ -96,6 +104,7 @@ function fuseEventsByRRF(vectorRanked, textRanked, eventById, k = CONFIG.RRF_K) o.rrf += 1 / (k + i + 1); o.type = o.tRank !== Infinity ? 
'HYBRID' : 'VECTOR'; o.vector = r.vector; + o.rawSim = r.rawSim || 0; }); textRanked.forEach((r) => { @@ -381,6 +390,70 @@ function normalizeEntityWeights(queryEntityWeights) { return normalized; } +// ═══════════════════════════════════════════════════════════════════════════ +// 文本路 Query 构建(分层高信号词) +// ═══════════════════════════════════════════════════════════════════════════ + +async function buildTextSearchQuery(segments, queryEntities, facts, expandedTerms) { + const breakdown = { + entities: [], + rareTerms: [], + factsO: [], + expanded: [], + }; + + breakdown.entities = [...(queryEntities || [])]; + + const q2Segments = segments.slice(-2); + const q2Text = q2Segments.join(' '); + + try { + breakdown.rareTerms = await extractRareTerms(q2Text, 15); + } catch (e) { + xbLog.warn(MODULE_ID, '稀有词提取失败', e); + breakdown.rareTerms = []; + } + + const entitySet = new Set(breakdown.entities.map(e => e.toLowerCase())); + breakdown.rareTerms = breakdown.rareTerms.filter(t => !entitySet.has(t.toLowerCase())); + + const relevantSubjects = new Set(queryEntities || []); + try { + breakdown.factsO = await extractNounsFromFactsO(facts, relevantSubjects, 5); + } catch (e) { + xbLog.warn(MODULE_ID, 'facts O 提取失败', e); + breakdown.factsO = []; + } + + const existingSet = new Set([ + ...breakdown.entities, + ...breakdown.rareTerms, + ].map(e => e.toLowerCase())); + breakdown.factsO = breakdown.factsO.filter(t => !existingSet.has(t.toLowerCase())); + + const allExistingSet = new Set([ + ...breakdown.entities, + ...breakdown.rareTerms, + ...breakdown.factsO, + ].map(e => e.toLowerCase())); + + breakdown.expanded = (expandedTerms || []) + .filter(t => !allExistingSet.has(t.toLowerCase())) + .slice(0, 3); + + const queryParts = [ + ...breakdown.entities, + ...breakdown.entities, + ...breakdown.rareTerms, + ...breakdown.factsO, + ...breakdown.expanded, + ]; + + const query = queryParts.join(' '); + + return { query, breakdown }; +} + function stripFloorTag(s) { return String(s || 
'').replace(/\s*\(#\d+(?:-\d+)?\)\s*$/, '').trim(); } @@ -605,7 +678,7 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo const scored = (allEvents || []).map((event, idx) => { const v = vectorMap.get(event.id); - const sim = v ? cosineSimilarity(queryVector, v) : 0; + const rawSim = v ? cosineSimilarity(queryVector, v) : 0; let bonus = 0; @@ -635,14 +708,15 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo _id: event.id, _idx: idx, event, - similarity: sim, - finalScore: sim + bonus, + rawSim, + finalScore: rawSim + bonus, vector: v, _entityBonus: entityBonus, _hasPresent: maxEntityWeight > 0, }; }); + const rawSimById = new Map(scored.map(s => [s._id, s.rawSim])); const entityBonusById = new Map(scored.map(s => [s._id, s._entityBonus])); const hasPresentById = new Map(scored.map(s => [s._id, s._hasPresent])); @@ -665,14 +739,34 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo const vectorRanked = candidates.map(s => ({ event: s.event, similarity: s.finalScore, + rawSim: s.rawSim, vector: s.vector, })); const eventById = new Map(allEvents.map(e => [e.id, e])); const fused = fuseEventsByRRF(vectorRanked, textRanked, eventById); - const hasVector = vectorRanked.length > 0; - const filtered = hasVector ? 
fused.filter(x => x.type !== 'TEXT') : fused; + const textOnlyStats = { + total: 0, + passedSoftCheck: 0, + filtered: 0, + finalIncluded: 0, + truncatedByLimit: 0, + }; + + const filtered = fused.filter(x => { + if (x.type !== 'TEXT') return true; + + textOnlyStats.total++; + const sim = x.rawSim || rawSimById.get(x.id) || 0; + if (sim >= CONFIG.TEXT_SOFT_MIN_SIM) { + textOnlyStats.passedSoftCheck++; + return true; + } + + textOnlyStats.filtered++; + return false; + }); const mmrInput = filtered.slice(0, CONFIG.CANDIDATE_EVENTS).map(x => ({ ...x, @@ -686,14 +780,27 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo c => c.vector || null, c => c.rrf ); - // 构造结果 - const results = mmrOutput.map(x => ({ + + let textOnlyCount = 0; + const finalResults = mmrOutput.filter(x => { + if (x.type !== 'TEXT') return true; + if (textOnlyCount < CONFIG.TEXT_TOTAL_MAX) { + textOnlyCount++; + return true; + } + textOnlyStats.truncatedByLimit++; + return false; + }); + textOnlyStats.finalIncluded = textOnlyCount; + + const results = finalResults.map(x => ({ event: x.event, similarity: x.rrf, _recallType: hasPresentById.get(x.event?.id) ? 
'DIRECT' : 'SIMILAR', _recallReason: x.type, _rrfDetail: { vRank: x.vRank, tRank: x.tRank, rrf: x.rrf }, _entityBonus: entityBonusById.get(x.event?.id) || 0, + _rawSim: rawSimById.get(x.event?.id) || 0, })); // 统计信息附加到第一条结果 @@ -704,8 +811,9 @@ async function searchEvents(queryVector, queryTextForSearch, allEvents, vectorCo textCount: textRanked.length, hybridCount: fused.filter(x => x.type === 'HYBRID').length, vectorOnlyCount: fused.filter(x => x.type === 'VECTOR').length, - textOnlyFiltered: fused.filter(x => x.type === 'TEXT').length, + textOnlyTotal: textOnlyStats.total, }; + results[0]._textOnlyStats = textOnlyStats; results[0]._textGapInfo = textGapInfo; } @@ -729,6 +837,7 @@ function formatRecallLog({ l0Results = [], textGapInfo = null, expandedTerms = [], + textQueryBreakdown = null, }) { const lines = [ '\u2554' + '\u2550'.repeat(62) + '\u2557', @@ -775,6 +884,40 @@ function formatRecallLog({ lines.push(` 扩散: ${expandedTerms.join('、')}`); } + lines.push(''); + lines.push('\u250c' + '\u2500'.repeat(61) + '\u2510'); + lines.push('\u2502 【文本路 Query 构成】 \u2502'); + lines.push('\u2514' + '\u2500'.repeat(61) + '\u2518'); + + if (textQueryBreakdown) { + const bd = textQueryBreakdown; + if (bd.entities?.length) { + lines.push(` 强信号-实体 (${bd.entities.length}): ${bd.entities.slice(0, 8).join(' | ')}${bd.entities.length > 8 ? ' ...' : ''}`); + } else { + lines.push(' 强信号-实体: (无)'); + } + + if (bd.rareTerms?.length) { + lines.push(` 强信号-稀有词 (${bd.rareTerms.length}): ${bd.rareTerms.slice(0, 10).join(' | ')}${bd.rareTerms.length > 10 ? ' ...' 
: ''}`); + } else { + lines.push(' 强信号-稀有词: (无)'); + } + + if (bd.factsO?.length) { + lines.push(` 中信号-facts O (${bd.factsO.length}): ${bd.factsO.join(' | ')}`); + } else { + lines.push(' 中信号-facts O: (无)'); + } + + if (bd.expanded?.length) { + lines.push(` 背景扩展 (${bd.expanded.length}): ${bd.expanded.join(' | ')}`); + } else { + lines.push(' 背景扩展: (无)'); + } + } else { + lines.push(' (降级模式,无分层信息)'); + } + lines.push(''); lines.push(' 实体归一化(用于加分):'); if (normalizedEntityWeights?.size) { @@ -816,13 +959,37 @@ function formatRecallLog({ // L2 const rrfStats = eventResults[0]?._rrfStats || {}; + const textOnlyStats = eventResults[0]?._textOnlyStats || {}; lines.push(''); lines.push(' L2 事件记忆 (RRF 混合检索):'); lines.push(` 总事件: ${allEvents.length} 条 | 最终: ${eventResults.length} 条`); lines.push(` 向量路: ${rrfStats.vectorCount || 0} 条 | 文本路: ${rrfStats.textCount || 0} 条`); - lines.push(` HYBRID: ${rrfStats.hybridCount || 0} 条 | 纯 VECTOR: ${rrfStats.vectorOnlyCount || 0} 条 | 纯 TEXT (已过滤): ${rrfStats.textOnlyFiltered || 0} 条`); + lines.push(` HYBRID: ${rrfStats.hybridCount || 0} 条 | 纯 VECTOR: ${rrfStats.vectorOnlyCount || 0} 条`); + + lines.push(''); + lines.push(' TEXT-only 质量控制:'); + lines.push(` 候选: ${textOnlyStats.total || 0} 条`); + lines.push(` 通过软校验 (sim>=${CONFIG.TEXT_SOFT_MIN_SIM}): ${textOnlyStats.passedSoftCheck || 0} 条`); + lines.push(` 语义过滤: ${textOnlyStats.filtered || 0} 条`); + lines.push(` 限额截断 (max=${CONFIG.TEXT_TOTAL_MAX}): ${textOnlyStats.truncatedByLimit || 0} 条`); + lines.push(` 最终入选: ${textOnlyStats.finalIncluded || 0} 条`); + + const textOnlyEvents = eventResults.filter(e => e._recallReason === 'TEXT'); + if (textOnlyEvents.length > 0) { + lines.push(''); + lines.push(' TEXT-only 入选事件:'); + textOnlyEvents.forEach((e, i) => { + const ev = e.event || {}; + const id = ev.id || '?'; + const title = (ev.title || '').slice(0, 25) || '(无标题)'; + const sim = (e._rawSim || 0).toFixed(2); + const tRank = e._rrfDetail?.tRank ?? '?'; + lines.push(` ${i + 1}. 
[${id}] ${title.padEnd(25)} sim=${sim} tRank=${tRank}`); + }); + } const entityBoostedEvents = eventResults.filter(e => e._entityBonus > 0).length; + lines.push(''); lines.push(` 实体加分事件: ${entityBoostedEvents} 条`); if (textGapInfo) { @@ -886,14 +1053,17 @@ export async function recallMemory(queryText, allEvents, vectorConfig, options = const expandedTerms = expandByFacts(queryEntities, facts, 2); const normalizedEntityWeights = normalizeEntityWeights(queryEntityWeights); - // 构建文本查询串:最后一条消息 + 实体 + 关键词 - const lastSeg = segments[segments.length - 1] || ''; - const queryTextForSearch = [ - lastSeg, - ...queryEntities, - ...expandedTerms, - ...(store?.json?.keywords || []).slice(0, 5).map(k => k.text), - ].join(' '); + let queryTextForSearch = ''; + let textQueryBreakdown = null; + try { + const result = await buildTextSearchQuery(segments, queryEntities, facts, expandedTerms); + queryTextForSearch = result.query; + textQueryBreakdown = result.breakdown; + } catch (e) { + xbLog.warn(MODULE_ID, '文本路 Query 构建失败,降级到简单拼接', e); + const lastSeg = segments[segments.length - 1] || ''; + queryTextForSearch = [lastSeg, ...queryEntities, ...expandedTerms].join(' '); + } // L0 召回 let l0Results = []; @@ -954,6 +1124,7 @@ export async function recallMemory(queryText, allEvents, vectorConfig, options = l0Results, textGapInfo, expandedTerms, + textQueryBreakdown, }); console.group('%c[Recall]', 'color: #7c3aed; font-weight: bold'); diff --git a/modules/story-summary/vector/tokenizer.js b/modules/story-summary/vector/tokenizer.js new file mode 100644 index 0000000..c68552b --- /dev/null +++ b/modules/story-summary/vector/tokenizer.js @@ -0,0 +1,287 @@ +import { xbLog } from '../../../core/debug-core.js'; +import { extensionFolderPath } from '../../../core/constants.js'; + +const MODULE_ID = 'tokenizer'; + +// ═══════════════════════════════════════════════════════════════════════════ +// 词性过滤 +// ═══════════════════════════════════════════════════════════════════════════ + +// 
保留的词性(名词类 + 英文) +const KEEP_POS_PREFIXES = ['n', 'eng']; + +function shouldKeepByPos(pos) { + return KEEP_POS_PREFIXES.some(prefix => pos.startsWith(prefix)); +} + +// ═══════════════════════════════════════════════════════════════════════════ +// 语言检测 +// ═══════════════════════════════════════════════════════════════════════════ + +function shouldUseJieba(text) { + const zh = (text.match(/[\u4e00-\u9fff]/g) || []).length; + return zh >= 5; +} + +function detectMainLanguage(text) { + const zh = (text.match(/[\u4e00-\u9fff]/g) || []).length; + const jp = (text.match(/[\u3040-\u309f\u30a0-\u30ff]/g) || []).length; + const en = (text.match(/[a-zA-Z]/g) || []).length; + const total = zh + jp + en || 1; + + if (jp / total > 0.2) return 'jp'; + if (en / total > 0.5) return 'en'; + return 'zh'; +} + +// 替换原有的大停用词表 +const STOP_WORDS = new Set([ + // 系统词 + '用户', '角色', '玩家', '旁白', 'user', 'assistant', 'system', + // 时间泛词 + '时候', '现在', '今天', '明天', '昨天', '早上', '晚上', + // 方位泛词 + '这里', '那里', '上面', '下面', '里面', '外面', + // 泛化名词 + '东西', '事情', '事儿', '地方', '样子', '意思', '感觉', + '一下', '一些', '一点', '一会', '一次', +]); + +// 英文停用词(fallback 用) +const EN_STOP_WORDS = new Set([ + 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', + 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', + 'could', 'should', 'may', 'might', 'must', 'can', + 'to', 'of', 'in', 'on', 'at', 'for', 'with', 'by', 'from', + 'and', 'or', 'but', 'if', 'that', 'this', 'it', 'its', + 'i', 'you', 'he', 'she', 'we', 'they', + 'my', 'your', 'his', 'her', 'our', 'their', + 'what', 'which', 'who', 'whom', 'where', 'when', 'why', 'how', +]); + +let jiebaModule = null; +let jiebaReady = false; +let jiebaLoading = false; + +async function ensureJieba() { + if (jiebaReady) return true; + if (jiebaLoading) { + for (let i = 0; i < 50; i++) { + await new Promise(r => setTimeout(r, 100)); + if (jiebaReady) return true; + } + return false; + } + + jiebaLoading = true; + + try { + const jiebaPath = 
`/${extensionFolderPath}/libs/jieba-wasm/jieba_rs_wasm.js`; + // eslint-disable-next-line no-unsanitized/method + jiebaModule = await import(jiebaPath); + + if (jiebaModule.default) { + await jiebaModule.default(); + } + + jiebaReady = true; + xbLog.info(MODULE_ID, 'jieba-wasm 加载成功'); + const keys = Object.getOwnPropertyNames(jiebaModule || {}); + const dkeys = Object.getOwnPropertyNames(jiebaModule?.default || {}); + xbLog.info(MODULE_ID, `jieba keys: ${keys.join(',')}`); + xbLog.info(MODULE_ID, `jieba default keys: ${dkeys.join(',')}`); + xbLog.info(MODULE_ID, `jieba.tag: ${typeof jiebaModule?.tag}`); + return true; + } catch (e) { + xbLog.error(MODULE_ID, 'jieba-wasm 加载失败', e); + jiebaLoading = false; + return false; + } +} + +function fallbackTokenize(text) { + const tokens = []; + const lang = detectMainLanguage(text); + + // 英文 + const enMatches = text.match(/[a-zA-Z]{2,20}/gi) || []; + tokens.push(...enMatches.filter(w => !EN_STOP_WORDS.has(w.toLowerCase()))); + + // 日语假名 + if (lang === 'jp') { + const kanaMatches = text.match(/[\u3040-\u309f\u30a0-\u30ff]{2,10}/g) || []; + tokens.push(...kanaMatches); + } + + // 中文/日语汉字 + const zhMatches = text.match(/[\u4e00-\u9fff]{2,6}/g) || []; + tokens.push(...zhMatches); + + // 数字+汉字组合 + const numZhMatches = text.match(/\d+[\u4e00-\u9fff]{1,4}/g) || []; + tokens.push(...numZhMatches); + + return tokens; +} + +export async function extractNouns(text, options = {}) { + const { minLen = 2, maxCount = 0 } = options; + if (!text?.trim()) return []; + + // 中文为主 → 用 jieba + if (shouldUseJieba(text)) { + const hasJieba = await ensureJieba(); + + if (hasJieba && jiebaModule?.tag) { + try { + const tagged = jiebaModule.tag(text, true); + + const result = []; + const seen = new Set(); + + const list = Array.isArray(tagged) ? 
tagged : []; + for (const item of list) { + let word = ''; + let pos = ''; + if (Array.isArray(item)) { + [word, pos] = item; + } else if (item && typeof item === 'object') { + word = item.word || item.w || item.text || item.term || ''; + pos = item.tag || item.pos || item.p || ''; + } + if (!word || !pos) continue; + if (word.length < minLen) continue; + if (!shouldKeepByPos(pos)) continue; + if (STOP_WORDS.has(word)) continue; + if (seen.has(word)) continue; + + seen.add(word); + result.push(word); + + if (maxCount > 0 && result.length >= maxCount) break; + } + + return result; + } catch (e) { + xbLog.warn(MODULE_ID, 'jieba tag 失败:' + (e && e.message ? e.message : String(e))); + } + } + } + + // 非中文 / jieba 失败 → fallback + const tokens = fallbackTokenize(text); + + const result = []; + const seen = new Set(); + + for (const t of tokens) { + if (t.length < minLen) continue; + if (STOP_WORDS.has(t)) continue; + if (seen.has(t)) continue; + + seen.add(t); + result.push(t); + + if (maxCount > 0 && result.length >= maxCount) break; + } + + return result; +} + +export async function extractRareTerms(text, maxCount = 15) { + if (!text?.trim()) return []; + + // 中文为主 → 用 jieba + if (shouldUseJieba(text)) { + const hasJieba = await ensureJieba(); + + if (hasJieba && jiebaModule?.tag) { + try { + const tagged = jiebaModule.tag(text, true); + + const candidates = []; + const seen = new Set(); + + const list = Array.isArray(tagged) ? 
tagged : []; + for (const item of list) { + let word = ''; + let pos = ''; + if (Array.isArray(item)) { + [word, pos] = item; + } else if (item && typeof item === 'object') { + word = item.word || item.w || item.text || item.term || ''; + pos = item.tag || item.pos || item.p || ''; + } + if (!word || !pos) continue; + if (word.length < 2) continue; + if (!shouldKeepByPos(pos)) continue; + if (STOP_WORDS.has(word)) continue; + if (seen.has(word)) continue; + + seen.add(word); + + // 稀有度评分 + let score = 0; + if (word.length >= 4) score += 3; + else if (word.length >= 3) score += 1; + if (/[a-zA-Z]/.test(word)) score += 2; + if (/\d/.test(word)) score += 1; + // 专名词性加分 + if (['nr', 'ns', 'nt', 'nz'].some(p => pos.startsWith(p))) score += 2; + + candidates.push({ term: word, score }); + } + + candidates.sort((a, b) => b.score - a.score); + return candidates.slice(0, maxCount).map(x => x.term); + } catch (e) { + xbLog.warn(MODULE_ID, 'jieba tag 失败:' + (e && e.message ? e.message : String(e))); + } + } + } + + // 非中文 / jieba 失败 → fallback + const allNouns = await extractNouns(text, { minLen: 2, maxCount: 0 }); + + const scored = allNouns.map(t => { + let score = 0; + if (t.length >= 4) score += 3; + else if (t.length >= 3) score += 1; + if (/[a-zA-Z]/.test(t)) score += 2; + if (/\d/.test(t)) score += 1; + return { term: t, score }; + }); + + scored.sort((a, b) => b.score - a.score); + return scored.slice(0, maxCount).map(x => x.term); +} + +export async function extractNounsFromFactsO(facts, relevantSubjects, maxCount = 5) { + if (!facts?.length || !relevantSubjects?.size) return []; + + const oTexts = []; + + for (const f of facts) { + if (f.retracted) continue; + + // 只取相关主体的 facts + const s = String(f.s || '').trim(); + if (!relevantSubjects.has(s)) continue; + + const o = String(f.o || '').trim(); + if (!o) continue; + + // 跳过太长的 O(可能是完整句子) + if (o.length > 30) continue; + + oTexts.push(o); + } + + if (!oTexts.length) return []; + + const combined = oTexts.join(' '); 
+ return await extractNouns(combined, { minLen: 2, maxCount }); +} + +export { ensureJieba }; +