Improve rerank failure handling and tokenizer JP support

This commit is contained in:
2026-02-10 17:52:09 +08:00
parent fbf34815bb
commit 062df60570
7 changed files with 655 additions and 285 deletions

View File

@@ -93,3 +93,7 @@ For complete license terms, see LICENSE.md
For attribution requirements, see COPYRIGHT
Last updated: 2025-01-14
TinySegmenter 0.2
Copyright (c) 2008 Taku Kudo
New BSD License
http://www.chasen.org/~taku/software/TinySegmenter/

177
libs/tiny-segmenter.js Normal file
View File

@@ -0,0 +1,177 @@
// TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript
// (c) 2008 Taku Kudo <taku@chasen.org>
// TinySegmenter is freely distributable under the terms of a new BSD licence.
// For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
/**
 * Constructor for the segmenter: builds the character-type matchers and
 * attaches every score table that segment() sums up.
 *
 * Instance members created here:
 *  - chartype_: ordered list of [RegExp, typeCode] pairs; ctype_() returns the
 *    code of the first pair whose pattern matches, or "O" when none does.
 *  - BIAS__: starting score for every boundary decision in segment().
 *  - score tables, named by feature arity and feature kind:
 *      U* / B* / T*  = looked up with one / two / three concatenated features
 *      *W = characters (w1..w6), *C = character types (c1..c6),
 *      *P = previous boundary decisions (p1..p3),
 *      *Q = a previous decision concatenated with character types.
 *    Keys missing from a table contribute 0 (see ts_()).
 */
function TinySegmenter() {
// Character-class source -> one-letter type code. Iteration order matters
// because ctype_() stops at the first match.
// NOTE(review): as written, "[a-zA-Z--]" and "[0-9-]" also match a literal "-".
// This may be residue of a character range lost in transit -- compare with the
// upstream TinySegmenter source before relying on it.
var patterns = {
"[一二三四五六七八九十百千万億兆]":"M",
"[一-龠々〆ヵヶ]":"H",
"[ぁ-ん]":"I",
"[ァ-ヴーア-ン゙ー]":"K",
"[a-zA-Z--]":"A",
"[0-9-]":"N"
}
this.chartype_ = [];
for (var i in patterns) {
// NOTE(review): RegExp.prototype.compile is a deprecated legacy (Annex B)
// method; `new RegExp(i)` would build the same pattern.
var regexp = new RegExp;
regexp.compile(i)
this.chartype_.push([regexp, patterns[i]]);
}
// Score tables. All values are integer weights added to the boundary score.
// NOTE(review): several literals below repeat a key (including the empty
// string, sometimes with different weights). In an object literal the last
// occurrence wins. This looks like characters lost or normalised in transit --
// verify against the upstream tables.
this.BIAS__ = -332
this.BC1__ = {"HH":6,"II":2461,"KH":406,"OH":-1378};
this.BC2__ = {"AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920};
this.BC3__ = {"HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266};
this.BP1__ = {"BB":295,"OB":304,"OO":-125,"UB":352};
this.BP2__ = {"BO":60,"OO":-1762};
this.BQ1__ = {"BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965};
this.BQ2__ = {"BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146};
this.BQ3__ = {"BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699};
this.BQ4__ = {"BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973};
this.BW1__ = {",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505,"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790,"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977,"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127,"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379,"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448,"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860,"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336,"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1あ":1404,"B1同":542,"」と":1682};
this.BW2__ = {"..":-11822,"11":-669,"――":-5730,"":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602,"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193,"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939,"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589,"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666,"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720,"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615,"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059,"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621,"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651,"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728,"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163,"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626,"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355,"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"":-669};
this.BW3__ = {"あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029,"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857,"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816,"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397,"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562,"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426,"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098,"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437,"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796,"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724,"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549,"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850,"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860,"入り":1232,"大会":2217,"始め":1681,"市":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"カ月":990};
this.TC1__ = {"AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832};
this.TC2__ = {"HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649};
this.TC3__ = {"AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393};
this.TC4__ = {"HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841};
this.TQ1__ = {"BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68};
this.TQ2__ = {"BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591};
this.TQ3__ = {"BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685};
this.TQ4__ = {"BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156};
this.TW1__ = {"につい":-4681,"東京都":2026};
this.TW2__ = {"ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430,"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882,"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216};
this.TW3__ = {"いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247,"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287};
this.TW4__ = {"いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538,"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865};
this.UC1__ = {"A":484,"K":93,"M":645,"O":-505};
this.UC2__ = {"A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646};
this.UC3__ = {"A":-1370,"I":2311};
this.UC4__ = {"A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646};
this.UC5__ = {"H":313,"I":-1238,"K":-799,"M":539,"O":-831};
this.UC6__ = {"H":-506,"I":-253,"K":87,"M":247,"O":-387};
this.UP1__ = {"O":-214};
this.UP2__ = {"B":69,"O":935};
this.UP3__ = {"B":189};
this.UQ1__ = {"BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422};
this.UQ2__ = {"BH":216,"BI":113,"OK":1759};
this.UQ3__ = {"BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212};
this.UW1__ = {",":156,"、":156,"「":-463,"あ":-941,"う":-127,"が":-553,"き":121,"こ":505,"で":-201,"と":-547,"ど":-123,"に":-789,"の":-185,"は":-847,"も":-466,"や":-470,"よ":182,"ら":-292,"り":208,"れ":169,"を":-446,"ん":-137,"・":-135,"主":-402,"京":-268,"区":-912,"午":871,"国":-460,"大":561,"委":729,"市":-411,"日":-141,"理":361,"生":-408,"県":-386,"都":-718,"「":-463,"・":-135};
this.UW2__ = {",":-829,"、":-829,"":892,"「":-645,"」":3145,"あ":-538,"い":505,"う":134,"お":-502,"か":1454,"が":-856,"く":-412,"こ":1141,"さ":878,"ざ":540,"し":1529,"す":-675,"せ":300,"そ":-1011,"た":188,"だ":1837,"つ":-949,"て":-291,"で":-268,"と":-981,"ど":1273,"な":1063,"に":-1764,"の":130,"は":-409,"ひ":-1273,"べ":1261,"ま":600,"も":-1263,"や":-402,"よ":1639,"り":-579,"る":-694,"れ":571,"を":-2516,"ん":2095,"ア":-587,"カ":306,"キ":568,"ッ":831,"三":-758,"不":-2150,"世":-302,"中":-968,"主":-861,"事":492,"人":-123,"会":978,"保":362,"入":548,"初":-3025,"副":-1566,"北":-3414,"区":-422,"大":-1769,"天":-865,"太":-483,"子":-1519,"学":760,"実":1023,"小":-2009,"市":-813,"年":-1060,"強":1067,"手":-1519,"揺":-1033,"政":1522,"文":-1355,"新":-1682,"日":-1815,"明":-1462,"最":-630,"朝":-1843,"本":-1650,"東":-931,"果":-665,"次":-2378,"民":-180,"気":-1740,"理":752,"発":529,"目":-1584,"相":-242,"県":-1165,"立":-763,"第":810,"米":509,"自":-1353,"行":838,"西":-744,"見":-3874,"調":1010,"議":1198,"込":3041,"開":1758,"間":-1257,"「":-645,"」":3145,"ッ":831,"ア":-587,"カ":306,"キ":568};
this.UW3__ = {",":4889,"1":-800,"":-1723,"、":4889,"々":-2311,"":5827,"」":2670,"〓":-3573,"あ":-2696,"い":1006,"う":2342,"え":1983,"お":-4864,"か":-1163,"が":3271,"く":1004,"け":388,"げ":401,"こ":-3552,"ご":-3116,"さ":-1058,"し":-395,"す":584,"せ":3685,"そ":-5228,"た":842,"ち":-521,"っ":-1444,"つ":-1081,"て":6167,"で":2318,"と":1691,"ど":-899,"な":-2788,"に":2745,"の":4056,"は":4555,"ひ":-2171,"ふ":-1798,"へ":1199,"ほ":-5516,"ま":-4384,"み":-120,"め":1205,"も":2323,"や":-788,"よ":-202,"ら":727,"り":649,"る":5905,"れ":2773,"わ":-1207,"を":6620,"ん":-518,"ア":551,"グ":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278,"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"中":653,"主":-758,"予":-1193,"二":974,"人":2742,"今":792,"他":1889,"以":-1368,"低":811,"何":4265,"作":-361,"保":-2439,"元":4858,"党":3593,"全":1574,"公":-3030,"六":755,"共":-1880,"円":5807,"再":3095,"分":457,"初":2475,"別":1129,"前":2286,"副":4437,"力":365,"動":-949,"務":-1872,"化":1327,"北":-1038,"区":4646,"千":-2309,"午":-783,"協":-1006,"口":483,"右":1233,"各":3588,"合":-241,"同":3906,"和":-837,"員":4513,"国":642,"型":1389,"場":1219,"外":-241,"妻":2016,"学":-1356,"安":-423,"実":-1008,"家":1078,"小":-513,"少":-3102,"州":1155,"市":3197,"平":-1804,"年":2416,"広":-1030,"府":1605,"度":1452,"建":-2352,"当":-3885,"得":1905,"思":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013,"教":-1479,"数":3222,"文":-1489,"新":1764,"日":2099,"旧":5792,"昨":-661,"時":-1248,"曜":-951,"最":-937,"月":4125,"期":360,"李":3094,"村":364,"東":-805,"核":5156,"森":2438,"業":484,"氏":2613,"民":-1694,"決":-1073,"法":1868,"海":-495,"無":979,"物":461,"特":-3850,"生":-273,"用":914,"町":1215,"的":7313,"直":-1835,"省":792,"県":6293,"知":-1528,"私":4231,"税":401,"立":-960,"第":1201,"米":7767,"系":3066,"約":3663,"級":1384,"統":-4229,"総":1163,"線":1255,"者":6457,"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"費":1777,"車":1835,"軍":1375,"込":-1504,"通":-1136,"選":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"長":421,"開":-1432,"間":1302,"関":-1282,"雨":2009,"電":-1045,"非":2066,"駅":1620,"":-800,"」":2670,"・":-3794,"ッ":-1350,"ア":551,"グ":1319,"ス":874,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278};
this.UW4__ = {",":3930,".":3508,"―":-4841,"、":3930,"。":3508,"":4999,"「":1895,"」":3798,"〓":-5156,"あ":4752,"い":-3435,"う":-640,"え":-2514,"お":2405,"か":530,"が":6006,"き":-4482,"ぎ":-3821,"く":-3788,"け":-4376,"げ":-4734,"こ":2255,"ご":1979,"さ":2864,"し":-843,"じ":-2506,"す":-731,"ず":1251,"せ":181,"そ":4091,"た":5034,"だ":5408,"ち":-3654,"っ":-5882,"つ":-1659,"て":3994,"で":7410,"と":4547,"な":5433,"に":6499,"ぬ":1853,"ね":1413,"の":7396,"は":8578,"ば":1940,"ひ":4249,"び":-4134,"ふ":1345,"へ":6665,"べ":-744,"ほ":1464,"ま":1051,"み":-2082,"む":-882,"め":-5046,"も":4169,"ゃ":-2666,"や":2795,"ょ":-1544,"よ":3351,"ら":-2922,"り":-9726,"る":-14896,"れ":-2613,"ろ":-4570,"わ":-1783,"を":13150,"ん":-2352,"カ":2145,"コ":1789,"セ":1287,"ッ":-724,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"中":2210,"予":782,"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"側":4292,"先":601,"党":-2006,"共":-1212,"内":584,"円":788,"初":1347,"前":1623,"副":3879,"力":-302,"動":-740,"務":-2715,"化":776,"区":4517,"協":1013,"参":1555,"合":-1834,"和":-681,"員":-910,"器":-851,"回":1500,"国":-619,"園":-1200,"地":866,"場":-1410,"塁":-2094,"士":-1413,"多":1067,"大":571,"子":-4802,"学":-1397,"定":-1057,"寺":-809,"小":1910,"屋":-1328,"山":-1500,"島":-2056,"川":-2667,"市":2771,"年":374,"庁":-4556,"後":456,"性":553,"感":916,"所":-1566,"支":856,"改":787,"政":2182,"教":704,"文":522,"方":-856,"日":1798,"時":1829,"最":845,"月":-9066,"木":-485,"来":-442,"校":-360,"業":-1043,"氏":5388,"民":-2716,"気":-910,"沢":-939,"済":-543,"物":-735,"率":672,"球":-1267,"生":-1286,"産":-1101,"田":-2900,"町":1826,"的":2586,"目":922,"省":-3485,"県":2997,"空":-867,"立":-2112,"第":788,"米":2937,"系":786,"約":2171,"経":1146,"統":-1169,"総":940,"線":-994,"署":749,"者":2145,"能":-730,"般":-852,"行":-792,"規":792,"警":-1184,"議":-244,"谷":-1000,"賞":730,"車":-1481,"軍":1158,"輪":-1433,"込":-3370,"近":929,"道":-1291,"選":2596,"郎":-4866,"都":1192,"野":-1100,"銀":-2213,"長":357,"間":-2344,"院":-2297,"際":-2604,"電":-878,"領":-1659,"題":-792,"館":-1984,"首":1749,"高":2120,"「":1895,"」":3798,"・":-4371,"ッ":-724,"ー":-11870,"カ":2145,"コ":1789,"セ":1287,"ト":-403,"メ":
-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637};
this.UW5__ = {",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"、":465,"。":-299,"「":363,"あ":1655,"い":331,"う":-503,"え":1199,"お":527,"か":647,"が":-421,"き":1624,"ぎ":1971,"く":312,"げ":-983,"さ":-1537,"し":-1371,"す":-852,"だ":-1186,"ち":1093,"っ":52,"つ":921,"て":-18,"で":-850,"と":-127,"ど":1682,"な":-787,"に":-1224,"の":-635,"は":-578,"べ":1001,"み":502,"め":865,"ゃ":3350,"ょ":854,"り":-208,"る":429,"れ":504,"わ":419,"を":-1264,"ん":327,"イ":241,"ル":451,"ン":-343,"中":-871,"京":722,"会":-1153,"党":-654,"務":3519,"区":-901,"告":848,"員":2104,"大":-1296,"学":-548,"定":1785,"嵐":-1304,"市":-2991,"席":921,"年":1763,"思":872,"所":-814,"挙":1618,"新":-1682,"日":218,"月":-4353,"査":932,"格":1356,"機":-1508,"氏":-1347,"田":240,"町":-3912,"的":-3149,"相":1319,"省":-1052,"県":-4003,"研":-997,"社":-278,"空":-813,"統":1955,"者":-2233,"表":663,"語":-1073,"議":1219,"選":-1018,"郎":-368,"長":786,"間":1191,"題":2368,"館":-689,"":-514,"":-32768,"「":363,"イ":241,"ル":451,"ン":-343};
this.UW6__ = {",":227,".":808,"1":-270,"E1":306,"、":227,"。":808,"あ":-307,"う":189,"か":241,"が":-73,"く":-121,"こ":-200,"じ":1782,"す":383,"た":-428,"っ":573,"て":-1014,"で":101,"と":-105,"な":-253,"に":-149,"の":-417,"は":-236,"も":-206,"り":187,"る":-135,"を":195,"ル":-673,"ン":-496,"一":-277,"中":201,"件":-800,"会":624,"前":302,"区":1792,"員":-1212,"委":798,"学":-960,"市":887,"広":-695,"後":535,"業":-697,"相":753,"社":-507,"福":974,"空":-822,"者":1811,"連":463,"郎":1082,"":-270,"":306,"ル":-673,"ン":-496};
return this;
}
/**
 * Classifies a single character into its one-letter character-type code.
 *
 * Walks `this.chartype_` in order and returns the code paired with the first
 * pattern that matches.
 *
 * @param {string} str Character (or short string) to classify.
 * @returns {string} The matching type code, or "O" when no pattern matches.
 */
TinySegmenter.prototype.ctype_ = function(str) {
  // for...of visits only the array's own elements. The previous for...in also
  // walked enumerable properties added to Array.prototype (polyfills, other
  // libraries), which made this method return undefined instead of "O".
  for (const [pattern, code] of this.chartype_) {
    if (pattern.test(str)) {
      return code;
    }
  }
  return "O";
};
/**
 * Normalises a score-table lookup: a missing (or otherwise falsy) entry
 * counts as a weight of 0, anything else is passed through unchanged.
 *
 * @param {number|undefined} v Result of a table lookup.
 * @returns {number} `v` when truthy, otherwise 0.
 */
TinySegmenter.prototype.ts_ = function(v) {
  return v || 0;
};
/**
 * Splits text into word segments using the additive feature scores stored on
 * the instance.
 *
 * For every boundary between two adjacent characters a score is accumulated,
 * starting from BIAS__ and adding the weights looked up for the surrounding
 * characters (w1..w6), their character types (c1..c6) and the three previous
 * boundary decisions (p1..p3). A score above zero starts a new segment.
 *
 * @param {string} input Text to segment.
 * @returns {string[]} Segments in input order; concatenated they reproduce the
 *   input. An empty array is returned for null, undefined or empty input.
 */
TinySegmenter.prototype.segment = function(input) {
  if (input == null || input === "") {
    return [];
  }
  // Iterate by code point rather than by UTF-16 code unit so that characters
  // outside the BMP (rare kanji such as "𠮷", emoji) are never cut into two
  // lone surrogates that end up in different segments.
  const chars = Array.from(input);
  if (chars.length === 0) {
    return [];
  }

  // Three sentinel slots on each side let the window below always read
  // i-3 .. i+2 without bounds checks.
  const seg = ["B3", "B2", "B1", ...chars, "E1", "E2", "E3"];
  const ctype = [
    "O", "O", "O",
    ...chars.map((ch) => this.ctype_(ch)),
    "O", "O", "O",
  ];

  const result = [];
  let word = seg[3];
  // Previous three boundary decisions: "U" unknown, "B" boundary, "O" none.
  let p1 = "U";
  let p2 = "U";
  let p3 = "U";

  for (let i = 4; i < seg.length - 3; ++i) {
    let score = this.BIAS__;
    const w1 = seg[i - 3];
    const w2 = seg[i - 2];
    const w3 = seg[i - 1];
    const w4 = seg[i];
    const w5 = seg[i + 1];
    const w6 = seg[i + 2];
    const c1 = ctype[i - 3];
    const c2 = ctype[i - 2];
    const c3 = ctype[i - 1];
    const c4 = ctype[i];
    const c5 = ctype[i + 1];
    const c6 = ctype[i + 2];

    // Previous-decision features.
    score += this.ts_(this.UP1__[p1]);
    score += this.ts_(this.UP2__[p2]);
    score += this.ts_(this.UP3__[p3]);
    score += this.ts_(this.BP1__[p1 + p2]);
    score += this.ts_(this.BP2__[p2 + p3]);
    // Character unigram / bigram / trigram features.
    score += this.ts_(this.UW1__[w1]);
    score += this.ts_(this.UW2__[w2]);
    score += this.ts_(this.UW3__[w3]);
    score += this.ts_(this.UW4__[w4]);
    score += this.ts_(this.UW5__[w5]);
    score += this.ts_(this.UW6__[w6]);
    score += this.ts_(this.BW1__[w2 + w3]);
    score += this.ts_(this.BW2__[w3 + w4]);
    score += this.ts_(this.BW3__[w4 + w5]);
    score += this.ts_(this.TW1__[w1 + w2 + w3]);
    score += this.ts_(this.TW2__[w2 + w3 + w4]);
    score += this.ts_(this.TW3__[w3 + w4 + w5]);
    score += this.ts_(this.TW4__[w4 + w5 + w6]);
    // Character-type unigram / bigram / trigram features.
    score += this.ts_(this.UC1__[c1]);
    score += this.ts_(this.UC2__[c2]);
    score += this.ts_(this.UC3__[c3]);
    score += this.ts_(this.UC4__[c4]);
    score += this.ts_(this.UC5__[c5]);
    score += this.ts_(this.UC6__[c6]);
    score += this.ts_(this.BC1__[c2 + c3]);
    score += this.ts_(this.BC2__[c3 + c4]);
    score += this.ts_(this.BC3__[c4 + c5]);
    score += this.ts_(this.TC1__[c1 + c2 + c3]);
    score += this.ts_(this.TC2__[c2 + c3 + c4]);
    score += this.ts_(this.TC3__[c3 + c4 + c5]);
    score += this.ts_(this.TC4__[c4 + c5 + c6]);
    // Previous decision combined with character types.
    score += this.ts_(this.UQ1__[p1 + c1]);
    score += this.ts_(this.UQ2__[p2 + c2]);
    score += this.ts_(this.UQ3__[p3 + c3]);
    score += this.ts_(this.BQ1__[p2 + c2 + c3]);
    score += this.ts_(this.BQ2__[p2 + c3 + c4]);
    score += this.ts_(this.BQ3__[p3 + c2 + c3]);
    score += this.ts_(this.BQ4__[p3 + c3 + c4]);
    score += this.ts_(this.TQ1__[p2 + c1 + c2 + c3]);
    score += this.ts_(this.TQ2__[p2 + c2 + c3 + c4]);
    score += this.ts_(this.TQ3__[p3 + c1 + c2 + c3]);
    score += this.ts_(this.TQ4__[p3 + c2 + c3 + c4]);

    let p = "O";
    if (score > 0) {
      // Boundary before seg[i]: close the current word and start a new one.
      result.push(word);
      word = "";
      p = "B";
    }
    p1 = p2;
    p2 = p3;
    p3 = p;
    word += seg[i];
  }
  result.push(word);
  return result;
};
export { TinySegmenter };

View File

@@ -1,4 +1,4 @@
// ═══════════════════════════════════════════════════════════════════════════
// ═══════════════════════════════════════════════════════════════════════════
// Story Summary - 主入口
//
// 稳定目标:
@@ -107,16 +107,43 @@ const MESSAGE_EVENT = "message";
// 状态变量
// ═══════════════════════════════════════════════════════════════════════════
let summaryGenerating = false;
let overlayCreated = false;
let frameReady = false;
let currentMesId = null;
let pendingFrameMessages = [];
let eventsRegistered = false;
let vectorGenerating = false;
let vectorCancelled = false;
let vectorAbortController = null;
let anchorGenerating = false;
// ═══════════════════════════════════════════════════════════════════════════
// TaskGuard — 互斥任务管理(summary / vector / anchor)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tracks which named background tasks are currently in flight so that the
 * same task cannot be started twice.
 */
class TaskGuard {
    /** Names of the tasks that are currently held. */
    #running = new Set();

    /**
     * Tries to mark `taskName` as running.
     *
     * @param {string} taskName Task identifier.
     * @returns {(() => void) | null} A release callback when the task was
     *   free, or null when it is already running. The callback only has an
     *   effect the first time it is called.
     */
    acquire(taskName) {
        const active = this.#running;
        if (active.has(taskName)) {
            return null;
        }
        active.add(taskName);

        let held = true;
        return () => {
            if (!held) return;
            held = false;
            active.delete(taskName);
        };
    }

    /**
     * @param {string} taskName Task identifier.
     * @returns {boolean} True while `taskName` is held.
     */
    isRunning(taskName) {
        return this.#running.has(taskName);
    }

    /**
     * @param {...string} taskNames Task identifiers to check.
     * @returns {boolean} True when at least one of the given tasks is held.
     */
    isAnyRunning(...taskNames) {
        for (const name of taskNames) {
            if (this.#running.has(name)) {
                return true;
            }
        }
        return false;
    }
}
const guard = new TaskGuard();
// 用户消息缓存(解决 GENERATION_STARTED 时 chat 尚未包含用户消息的问题)
let lastSentUserMessage = null;
@@ -219,13 +246,12 @@ async function unhideAllMessages() {
// 生成状态管理
// ═══════════════════════════════════════════════════════════════════════════
function setSummaryGenerating(flag) {
summaryGenerating = !!flag;
postToFrame({ type: "GENERATION_STATE", isGenerating: summaryGenerating });
function isSummaryGenerating() {
return guard.isRunning('summary');
}
function isSummaryGenerating() {
return summaryGenerating;
function notifySummaryState() {
postToFrame({ type: "GENERATION_STATE", isGenerating: guard.isRunning('summary') });
}
// ═══════════════════════════════════════════════════════════════════════════
@@ -300,38 +326,35 @@ async function sendAnchorStatsToFrame() {
}
async function handleAnchorGenerate() {
if (anchorGenerating) return;
const vectorCfg = getVectorConfig();
if (!vectorCfg?.enabled) {
await executeSlashCommand("/echo severity=warning 请先启用向量检索");
return;
}
if (!vectorCfg.online?.key) {
postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" });
return;
}
const { chatId, chat } = getContext();
if (!chatId || !chat?.length) return;
anchorGenerating = true;
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "分析中..." });
const release = guard.acquire('anchor');
if (!release) return;
try {
// Phase 1: L0 提取 + Phase 2: L0 向量化(在 incrementalExtractAtoms 内部完成)
const vectorCfg = getVectorConfig();
if (!vectorCfg?.enabled) {
await executeSlashCommand("/echo severity=warning 请先启用向量检索");
return;
}
if (!vectorCfg.online?.key) {
postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" });
return;
}
const { chatId, chat } = getContext();
if (!chatId || !chat?.length) return;
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "分析中..." });
await incrementalExtractAtoms(chatId, chat, (message, current, total) => {
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current, total, message });
});
// Phase 3: 处理 pending L1 Chunks
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "向量化 L1..." });
await buildIncrementalChunks({ vectorConfig: vectorCfg });
invalidateLexicalIndex();
await sendAnchorStatsToFrame();
await sendVectorStatsToFrame();
@@ -340,7 +363,7 @@ async function handleAnchorGenerate() {
xbLog.error(MODULE_ID, "记忆锚点生成失败", e);
await executeSlashCommand(`/echo severity=error 记忆锚点生成失败:${e.message}`);
} finally {
anchorGenerating = false;
release();
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 });
}
}
@@ -359,7 +382,6 @@ async function handleAnchorClear() {
function handleAnchorCancel() {
cancelL0Extraction();
anchorGenerating = false;
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 });
}
@@ -378,142 +400,159 @@ async function handleTestOnlineService(provider, config) {
}
async function handleGenerateVectors(vectorCfg) {
if (vectorGenerating) return;
const release = guard.acquire('vector');
if (!release) return;
if (!vectorCfg?.enabled) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
return;
}
try {
if (!vectorCfg?.enabled) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
return;
}
const { chatId, chat } = getContext();
if (!chatId || !chat?.length) return;
const { chatId, chat } = getContext();
if (!chatId || !chat?.length) return;
if (!vectorCfg.online?.key) {
postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" });
return;
}
if (!vectorCfg.online?.key) {
postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" });
return;
}
vectorGenerating = true;
vectorCancelled = false;
vectorAbortController = new AbortController();
vectorCancelled = false;
vectorAbortController = new AbortController();
const fingerprint = getEngineFingerprint(vectorCfg);
const batchSize = 20;
const fingerprint = getEngineFingerprint(vectorCfg);
const batchSize = 20;
await clearAllChunks(chatId);
await clearEventVectors(chatId);
await clearStateVectors(chatId);
await updateMeta(chatId, { lastChunkFloor: -1, fingerprint });
await clearAllChunks(chatId);
await clearEventVectors(chatId);
await clearStateVectors(chatId);
await updateMeta(chatId, { lastChunkFloor: -1, fingerprint });
const atoms = getStateAtoms();
if (!atoms.length) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: 0, message: "L0 为空,跳过" });
} else {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: atoms.length, message: "L0 向量化..." });
const atoms = getStateAtoms();
if (!atoms.length) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: 0, message: "L0 为空,跳过" });
} else {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: atoms.length, message: "L0 向量化..." });
let l0Completed = 0;
for (let i = 0; i < atoms.length; i += batchSize) {
if (vectorCancelled) break;
let l0Completed = 0;
for (let i = 0; i < atoms.length; i += batchSize) {
if (vectorCancelled) break;
const batch = atoms.slice(i, i + batchSize);
const texts = batch.map(a => a.semantic);
try {
const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
const items = batch.map((a, j) => ({
atomId: a.atomId,
floor: a.floor,
vector: vectors[j],
}));
await saveStateVectors(chatId, items, fingerprint);
l0Completed += batch.length;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: l0Completed, total: atoms.length });
} catch (e) {
if (e?.name === "AbortError") break;
xbLog.error(MODULE_ID, "L0 向量化失败", e);
vectorCancelled = true;
break;
const batch = atoms.slice(i, i + batchSize);
const texts = batch.map(a => a.semantic);
try {
const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
const items = batch.map((a, j) => ({
atomId: a.atomId,
floor: a.floor,
vector: vectors[j],
}));
await saveStateVectors(chatId, items, fingerprint);
l0Completed += batch.length;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: l0Completed, total: atoms.length });
} catch (e) {
if (e?.name === "AbortError") break;
xbLog.error(MODULE_ID, "L0 向量化失败", e);
vectorCancelled = true;
break;
}
}
}
}
if (vectorCancelled) {
vectorGenerating = false;
return;
}
if (vectorCancelled) return;
const allChunks = [];
for (let floor = 0; floor < chat.length; floor++) {
const chunks = chunkMessage(floor, chat[floor]);
allChunks.push(...chunks);
}
const allChunks = [];
for (let floor = 0; floor < chat.length; floor++) {
if (vectorCancelled) break;
if (allChunks.length > 0) {
await saveChunks(chatId, allChunks);
}
const message = chat[floor];
if (!message) continue;
const l1Texts = allChunks.map(c => c.text);
const store = getSummaryStore();
const events = store?.json?.events || [];
const { chunks, status } = chunkMessage(message, floor, vectorCfg, true);
if (status === "skip") continue;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: l1Texts.length });
const l1Vectors = [];
let completed = 0;
for (let i = 0; i < l1Texts.length; i += batchSize) {
if (vectorCancelled) break;
const batch = l1Texts.slice(i, i + batchSize);
try {
const vectors = await embed(batch, vectorCfg, { signal: vectorAbortController.signal });
l1Vectors.push(...vectors);
completed += batch.length;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: completed, total: l1Texts.length });
} catch (e) {
if (e?.name === 'AbortError') break;
xbLog.error(MODULE_ID, 'L1 向量化失败', e);
vectorCancelled = true;
break;
allChunks.push(...chunks);
}
}
if (!vectorCancelled && l1Vectors.length > 0) {
const items = allChunks.map((c, i) => ({ chunkId: c.chunkId, vector: l1Vectors[i] })).filter(x => x.vector);
await saveChunkVectors(chatId, items, fingerprint);
await updateMeta(chatId, { lastChunkFloor: chat.length - 1 });
}
let l1Vectors = [];
if (!allChunks.length) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: 0, message: "L1 为空,跳过" });
} else {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: allChunks.length, message: "L1 向量化..." });
await saveChunks(chatId, allChunks);
const l2Pairs = events
.map(e => ({ id: e.id, text: `${e.title || ''} ${e.summary || ''}`.trim() }))
.filter(p => p.text);
let l1Completed = 0;
for (let i = 0; i < allChunks.length; i += batchSize) {
if (vectorCancelled) break;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: l2Pairs.length });
let l2Completed = 0;
for (let i = 0; i < l2Pairs.length; i += batchSize) {
if (vectorCancelled) break;
const batch = l2Pairs.slice(i, i + batchSize);
try {
const vectors = await embed(batch.map(p => p.text), vectorCfg, { signal: vectorAbortController.signal });
const items = batch.map((p, j) => ({ eventId: p.id, vector: vectors[j] }));
await saveEventVectorsToDb(chatId, items, fingerprint);
l2Completed += batch.length;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: l2Completed, total: l2Pairs.length });
} catch (e) {
if (e?.name === 'AbortError') break;
xbLog.error(MODULE_ID, 'L2 向量化失败', e);
vectorCancelled = true;
break;
const batch = allChunks.slice(i, i + batchSize);
const texts = batch.map(c => c.text);
try {
const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
const items = batch.map((c, j) => ({
chunkId: c.chunkId,
vector: vectors[j],
}));
await saveChunkVectors(chatId, items, fingerprint);
l1Vectors = l1Vectors.concat(items);
l1Completed += batch.length;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: l1Completed, total: allChunks.length });
} catch (e) {
if (e?.name === "AbortError") break;
xbLog.error(MODULE_ID, "L1 向量化失败", e);
vectorCancelled = true;
break;
}
}
}
if (vectorCancelled) return;
const store = getSummaryStore();
const events = store?.json?.events || [];
const l2Pairs = events
.map((e) => ({ id: e.id, text: `${e.title || ""} ${e.summary || ""}`.trim() }))
.filter((p) => p.text);
if (!l2Pairs.length) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: 0, message: "L2 为空,跳过" });
} else {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: l2Pairs.length, message: "L2 向量化..." });
let l2Completed = 0;
for (let i = 0; i < l2Pairs.length; i += batchSize) {
if (vectorCancelled) break;
const batch = l2Pairs.slice(i, i + batchSize);
const texts = batch.map(p => p.text);
try {
const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
const items = batch.map((p, idx) => ({
eventId: p.id,
vector: vectors[idx],
}));
await saveEventVectorsToDb(chatId, items, fingerprint);
l2Completed += batch.length;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: l2Completed, total: l2Pairs.length });
} catch (e) {
if (e?.name === "AbortError") break;
xbLog.error(MODULE_ID, "L2 向量化失败", e);
vectorCancelled = true;
break;
}
}
}
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
await sendVectorStatsToFrame();
xbLog.info(MODULE_ID, `向量生成完成: L0=${atoms.length}, L1=${l1Vectors.length}, L2=${l2Pairs.length}`);
} finally {
release();
vectorCancelled = false;
vectorAbortController = null;
}
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
await sendVectorStatsToFrame();
vectorGenerating = false;
vectorCancelled = false;
vectorAbortController = null;
xbLog.info(MODULE_ID, `向量生成完成: L0=${atoms.length}, L1=${l1Vectors.length}, L2=${l2Pairs.length}`);
}
async function handleClearVectors() {
@@ -529,52 +568,6 @@ async function handleClearVectors() {
xbLog.info(MODULE_ID, "向量数据已清除");
}
// ═══════════════════════════════════════════════════════════════════════════
// L0 自动补提取(每收到新消息后检查并补提取缺失楼层)
// ═══════════════════════════════════════════════════════════════════════════
async function maybeAutoExtractL0() {
const vectorCfg = getVectorConfig();
if (!vectorCfg?.enabled) return;
if (anchorGenerating || vectorGenerating) return;
const { chatId, chat } = getContext();
if (!chatId || !chat?.length) return;
const stats = await getAnchorStats();
if (stats.pending <= 0) return;
anchorGenerating = true;
try {
await incrementalExtractAtoms(chatId, chat, null, { maxFloors: 20 });
// 为新提取的 L0 楼层构建 L1 chunks
await buildIncrementalChunks({ vectorConfig: vectorCfg });
invalidateLexicalIndex();
await sendAnchorStatsToFrame();
await sendVectorStatsToFrame();
xbLog.info(MODULE_ID, "自动 L0 补提取完成");
} catch (e) {
xbLog.error(MODULE_ID, "自动 L0 补提取失败", e);
} finally {
anchorGenerating = false;
}
}
// ═══════════════════════════════════════════════════════════════════════════
// Embedding 连接预热
// ═══════════════════════════════════════════════════════════════════════════
function warmupEmbeddingConnection() {
const vectorCfg = getVectorConfig();
if (!vectorCfg?.enabled) return;
embed(['.'], vectorCfg, { timeout: 5000 }).catch(() => {});
}
// ═══════════════════════════════════════════════════════════════════════════
// 实体词典注入 + 索引预热
// ═══════════════════════════════════════════════════════════════════════════
@@ -597,9 +590,52 @@ function refreshEntityLexiconAndWarmup() {
}
// ═══════════════════════════════════════════════════════════════════════════
// L2 自动增量向量化(总结完成后调用
// L0 自动补提取(每收到新消息后检查并补提取缺失楼层
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Automatically extract L0 anchors for floors that are still pending, then
 * build L1 chunks for them and refresh the stats shown in the frame.
 *
 * Bails out early when the vector config is disabled, when the guard reports
 * an 'anchor' or 'vector' task running, when there is no active chat, when
 * nothing is pending, or when the 'anchor' guard cannot be acquired.
 *
 * Errors are logged and swallowed; the guard is always released.
 *
 * @returns {Promise<void>}
 */
async function maybeAutoExtractL0() {
    const cfg = getVectorConfig();
    const enabled = cfg?.enabled;
    if (!enabled || guard.isAnyRunning('anchor', 'vector')) return;

    const { chatId, chat: messages } = getContext();
    const hasChat = Boolean(chatId) && Boolean(messages?.length);
    if (!hasChat) return;

    const anchorStats = await getAnchorStats();
    if (anchorStats.pending <= 0) return;

    const releaseAnchor = guard.acquire('anchor');
    if (!releaseAnchor) return;

    try {
        await incrementalExtractAtoms(chatId, messages, null, { maxFloors: 20 });

        // Build L1 chunks for the newly extracted L0 floors.
        await buildIncrementalChunks({ vectorConfig: cfg });

        invalidateLexicalIndex();

        await sendAnchorStatsToFrame();
        await sendVectorStatsToFrame();

        xbLog.info(MODULE_ID, "自动 L0 补提取完成");
    } catch (err) {
        xbLog.error(MODULE_ID, "自动 L0 补提取失败", err);
    } finally {
        releaseAnchor();
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Embedding 连接预热
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Pre-warm the embedding connection with a minimal request.
 *
 * Sends a single "." to `embed` using the current vector config and a
 * 5000 ms timeout. The call is not awaited and its rejection is intentionally
 * discarded. Does nothing when the vector config is missing or disabled.
 *
 * @returns {void}
 */
function warmupEmbeddingConnection() {
    const activeConfig = getVectorConfig();
    const isEnabled = Boolean(activeConfig?.enabled);
    if (!isEnabled) return;

    const warmupRequest = embed(['.'], activeConfig, { timeout: 5000 });
    // Best-effort only: ignore any failure of the warmup request.
    warmupRequest.catch(() => {});
}
async function autoVectorizeNewEvents(newEventIds) {
if (!newEventIds?.length) return;
@@ -902,7 +938,7 @@ function openPanelForMessage(mesId) {
sendFrameBaseData(store, totalFloors);
sendFrameFullData(store, totalFloors);
setSummaryGenerating(summaryGenerating);
notifySummaryState();
sendVectorConfigToFrame();
sendVectorStatsToFrame();
@@ -990,36 +1026,40 @@ async function maybeAutoRunSummary(reason) {
}
async function autoRunSummaryWithRetry(targetMesId, configForRun) {
setSummaryGenerating(true);
const release = guard.acquire('summary');
if (!release) return;
notifySummaryState();
for (let attempt = 1; attempt <= 3; attempt++) {
const result = await runSummaryGeneration(targetMesId, configForRun, {
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
onComplete: async ({ merged, endMesId, newEventIds }) => {
const store = getSummaryStore();
postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) });
invalidateLexicalIndex();
applyHideStateDebounced();
updateFrameStatsAfterSummary(endMesId, store.json || {});
try {
for (let attempt = 1; attempt <= 3; attempt++) {
const result = await runSummaryGeneration(targetMesId, configForRun, {
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
onComplete: async ({ merged, endMesId, newEventIds }) => {
const store = getSummaryStore();
postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) });
// L2 自动增量向量化
await autoVectorizeNewEvents(newEventIds);
},
});
invalidateLexicalIndex();
if (result.success) {
setSummaryGenerating(false);
return;
applyHideStateDebounced();
updateFrameStatsAfterSummary(endMesId, store.json || {});
await autoVectorizeNewEvents(newEventIds);
},
});
if (result.success) {
return;
}
if (attempt < 3) await sleep(1000);
}
if (attempt < 3) await sleep(1000);
await executeSlashCommand("/echo severity=error 剧情总结失败(已自动重试 3 次)。请稍后再试。");
} finally {
release();
notifySummaryState();
}
setSummaryGenerating(false);
await executeSlashCommand("/echo severity=error 剧情总结失败(已自动重试 3 次)。请稍后再试。");
}
function updateFrameStatsAfterSummary(endMesId, merged) {
@@ -1055,7 +1095,7 @@ function handleFrameMessage(event) {
case "FRAME_READY": {
frameReady = true;
flushPendingFrameMessages();
setSummaryGenerating(summaryGenerating);
notifySummaryState();
sendSavedConfigToFrame();
sendVectorConfigToFrame();
sendVectorStatsToFrame();
@@ -1084,7 +1124,7 @@ function handleFrameMessage(event) {
case "REQUEST_CANCEL":
window.xiaobaixStreamingGeneration?.cancel?.("xb9");
setSummaryGenerating(false);
postToFrame({ type: "GENERATION_STATE", isGenerating: false });
postToFrame({ type: "SUMMARY_STATUS", statusText: "已停止" });
break;
@@ -1282,26 +1322,30 @@ async function handleManualGenerate(mesId, config) {
return;
}
setSummaryGenerating(true);
const release = guard.acquire('summary');
if (!release) return;
notifySummaryState();
await runSummaryGeneration(mesId, config, {
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
onComplete: async ({ merged, endMesId, newEventIds }) => {
const store = getSummaryStore();
postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) });
invalidateLexicalIndex();
applyHideStateDebounced();
updateFrameStatsAfterSummary(endMesId, store.json || {});
try {
await runSummaryGeneration(mesId, config, {
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
onComplete: async ({ merged, endMesId, newEventIds }) => {
const store = getSummaryStore();
postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) });
// L2 自动增量向量化
await autoVectorizeNewEvents(newEventIds);
},
});
invalidateLexicalIndex();
setSummaryGenerating(false);
applyHideStateDebounced();
updateFrameStatsAfterSummary(endMesId, store.json || {});
await autoVectorizeNewEvents(newEventIds);
},
});
} finally {
release();
notifySummaryState();
}
}
// ═══════════════════════════════════════════════════════════════════════════
@@ -1390,7 +1434,7 @@ async function handleMessageReceived() {
initButtonsForAll();
// 向量全量生成中时跳过 L1 sync避免竞争写入
if (vectorGenerating) return;
if (guard.isRunning('vector')) return;
await syncOnMessageReceived(chatId, lastFloor, message, vectorConfig, () => {
sendAnchorStatsToFrame();
@@ -1529,6 +1573,22 @@ async function handleGenerationStarted(type, _params, isDryRun) {
// 事件注册
// ═══════════════════════════════════════════════════════════════════════════
// Stable references to every event listener used by this module.
// registerEvents() passes these exact function objects to eventSource.on(),
// and unregisterEvents() passes the same objects to eventSource.off(), so the
// listeners can be removed again (inline arrow functions could not be).
const boundHandlers = {
// Handlers below are deferred with setTimeout; the number is the delay in ms.
chatChanged: () => setTimeout(handleChatChanged, 80),
messageDeleted: () => setTimeout(handleMessageDeleted, 50),
messageReceived: () => setTimeout(handleMessageReceived, 150),
messageSent: () => setTimeout(handleMessageSent, 150),
// Second MESSAGE_SENT listener; called directly, without a delay.
messageSentRecall: handleMessageSentForRecall,
messageSwiped: () => setTimeout(handleMessageSwiped, 100),
// MESSAGE_UPDATED and MESSAGE_EDITED both route to handleMessageUpdated.
messageUpdated: () => setTimeout(handleMessageUpdated, 100),
messageEdited: () => setTimeout(handleMessageUpdated, 100),
// Render events forward their event payload to handleMessageRendered.
userRendered: (data) => setTimeout(() => handleMessageRendered(data), 50),
charRendered: (data) => setTimeout(() => handleMessageRendered(data), 50),
// Generation lifecycle handlers are passed through as-is (no delay).
genStarted: handleGenerationStarted,
genStopped: clearExtensionPrompt,
genEnded: clearExtensionPrompt,
};
function registerEvents() {
if (eventsRegistered) return;
eventsRegistered = true;
@@ -1551,31 +1611,45 @@ function registerEvents() {
initButtonsForAll();
eventSource.on(event_types.CHAT_CHANGED, () => setTimeout(handleChatChanged, 80));
eventSource.on(event_types.MESSAGE_DELETED, () => setTimeout(handleMessageDeleted, 50));
eventSource.on(event_types.MESSAGE_RECEIVED, () => setTimeout(handleMessageReceived, 150));
eventSource.on(event_types.MESSAGE_SENT, () => setTimeout(handleMessageSent, 150));
eventSource.on(event_types.MESSAGE_SENT, handleMessageSentForRecall);
eventSource.on(event_types.MESSAGE_SWIPED, () => setTimeout(handleMessageSwiped, 100));
eventSource.on(event_types.MESSAGE_UPDATED, () => setTimeout(handleMessageUpdated, 100));
eventSource.on(event_types.MESSAGE_EDITED, () => setTimeout(handleMessageUpdated, 100));
eventSource.on(event_types.USER_MESSAGE_RENDERED, (data) => setTimeout(() => handleMessageRendered(data), 50));
eventSource.on(event_types.CHARACTER_MESSAGE_RENDERED, (data) => setTimeout(() => handleMessageRendered(data), 50));
eventSource.on(event_types.CHAT_CHANGED, boundHandlers.chatChanged);
eventSource.on(event_types.MESSAGE_DELETED, boundHandlers.messageDeleted);
eventSource.on(event_types.MESSAGE_RECEIVED, boundHandlers.messageReceived);
eventSource.on(event_types.MESSAGE_SENT, boundHandlers.messageSent);
eventSource.on(event_types.MESSAGE_SENT, boundHandlers.messageSentRecall);
eventSource.on(event_types.MESSAGE_SWIPED, boundHandlers.messageSwiped);
eventSource.on(event_types.MESSAGE_UPDATED, boundHandlers.messageUpdated);
eventSource.on(event_types.MESSAGE_EDITED, boundHandlers.messageEdited);
eventSource.on(event_types.USER_MESSAGE_RENDERED, boundHandlers.userRendered);
eventSource.on(event_types.CHARACTER_MESSAGE_RENDERED, boundHandlers.charRendered);
// 用户输入捕获(原生捕获阶段)
document.addEventListener("pointerdown", onSendPointerdown, true);
document.addEventListener("keydown", onSendKeydown, true);
// 注入链路
eventSource.on(event_types.GENERATION_STARTED, handleGenerationStarted);
eventSource.on(event_types.GENERATION_STOPPED, clearExtensionPrompt);
eventSource.on(event_types.GENERATION_ENDED, clearExtensionPrompt);
eventSource.on(event_types.GENERATION_STARTED, boundHandlers.genStarted);
eventSource.on(event_types.GENERATION_STOPPED, boundHandlers.genStopped);
eventSource.on(event_types.GENERATION_ENDED, boundHandlers.genEnded);
}
function unregisterEvents() {
CacheRegistry.unregister(MODULE_ID);
eventsRegistered = false;
eventSource.off(event_types.CHAT_CHANGED, boundHandlers.chatChanged);
eventSource.off(event_types.MESSAGE_DELETED, boundHandlers.messageDeleted);
eventSource.off(event_types.MESSAGE_RECEIVED, boundHandlers.messageReceived);
eventSource.off(event_types.MESSAGE_SENT, boundHandlers.messageSent);
eventSource.off(event_types.MESSAGE_SENT, boundHandlers.messageSentRecall);
eventSource.off(event_types.MESSAGE_SWIPED, boundHandlers.messageSwiped);
eventSource.off(event_types.MESSAGE_UPDATED, boundHandlers.messageUpdated);
eventSource.off(event_types.MESSAGE_EDITED, boundHandlers.messageEdited);
eventSource.off(event_types.USER_MESSAGE_RENDERED, boundHandlers.userRendered);
eventSource.off(event_types.CHARACTER_MESSAGE_RENDERED, boundHandlers.charRendered);
eventSource.off(event_types.GENERATION_STARTED, boundHandlers.genStarted);
eventSource.off(event_types.GENERATION_STOPPED, boundHandlers.genStopped);
eventSource.off(event_types.GENERATION_ENDED, boundHandlers.genEnded);
$(".xiaobaix-story-summary-btn").remove();
hideOverlay();

View File

@@ -28,17 +28,17 @@ export async function rerank(query, documents, options = {}) {
if (!query?.trim()) {
xbLog.warn(MODULE_ID, 'query 为空,跳过 rerank');
return documents.map((_, i) => ({ index: i, relevance_score: 0.5 }));
return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
}
if (!documents?.length) {
return [];
return { results: [], failed: false };
}
const key = getApiKey();
if (!key) {
xbLog.warn(MODULE_ID, '未配置 API Key跳过 rerank');
return documents.map((_, i) => ({ index: i, relevance_score: 0.5 }));
return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
}
// 截断超长文档列表
@@ -61,7 +61,7 @@ export async function rerank(query, documents, options = {}) {
if (!validDocs.length) {
xbLog.warn(MODULE_ID, '无有效文档,跳过 rerank');
return [];
return { results: [], failed: false };
}
const controller = new AbortController();
@@ -106,7 +106,7 @@ export async function rerank(query, documents, options = {}) {
const elapsed = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID, `Rerank 完成: ${validDocs.length} docs → ${results.length} selected (${elapsed}ms)`);
return mapped;
return { results: mapped, failed: false };
} catch (e) {
clearTimeout(timeoutId);
@@ -118,10 +118,13 @@ export async function rerank(query, documents, options = {}) {
}
// 降级:返回原顺序,分数均匀分布
return documents.slice(0, topN).map((_, i) => ({
index: i,
relevance_score: 1 - (i / documents.length) * 0.5,
}));
return {
results: documents.slice(0, topN).map((_, i) => ({
index: i,
relevance_score: 0,
})),
failed: true,
};
}
}
@@ -138,30 +141,38 @@ export async function rerankChunks(query, chunks, options = {}) {
if (!chunks?.length) return [];
if (chunks.length <= topN) {
// 数量不超限,仍然 rerank 以获取分数,但不过滤
const texts = chunks.map(c => c.text || c.semantic || '');
const results = await rerank(query, texts, { topN: chunks.length, ...options });
const { results, failed } = await rerank(query, texts, { topN: chunks.length, ...options });
if (failed) {
return chunks.map(c => ({ ...c, _rerankScore: 0, _rerankFailed: true }));
}
const scoreMap = new Map(results.map(r => [r.index, r.relevance_score]));
return chunks.map((c, i) => ({
...c,
_rerankScore: scoreMap.get(i) ?? 0.5,
_rerankScore: scoreMap.get(i) ?? 0,
})).sort((a, b) => b._rerankScore - a._rerankScore);
}
const texts = chunks.map(c => c.text || c.semantic || '');
const results = await rerank(query, texts, { topN, ...options });
const { results, failed } = await rerank(query, texts, { topN, ...options });
// 过滤低分 + 排序
const selected = results
if (failed) {
return chunks.slice(0, topN).map(c => ({
...c,
_rerankScore: 0,
_rerankFailed: true,
}));
}
return results
.filter(r => r.relevance_score >= minScore)
.sort((a, b) => b.relevance_score - a.relevance_score)
.map(r => ({
...chunks[r.index],
_rerankScore: r.relevance_score,
}));
return selected;
}
/**
@@ -174,7 +185,7 @@ export async function testRerankService() {
}
try {
const results = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 });
const { results } = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 });
return {
success: true,
message: `连接成功,返回 ${results.length} 个结果`,

View File

@@ -86,6 +86,7 @@ export function createMetrics() {
l0Candidates: 0, // W-RRF 融合后的 L0 候选数
l0Selected: 0, // rerank 后选中的 L0 数
rerankApplied: false,
rerankFailed: false,
beforeRerank: 0,
afterRerank: 0,
rerankTime: 0,
@@ -283,6 +284,9 @@ export function formatMetricsLog(metrics) {
if (m.evidence.rerankApplied) {
lines.push(`│ ├─ rerank_applied: true`);
if (m.evidence.rerankFailed) {
lines.push(`│ ├─ rerank_failed: ⚠ YES (using fusion order)`);
}
lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank}`);
lines.push(`│ │ ├─ after: ${m.evidence.afterRerank}`);
lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`);
@@ -489,6 +493,10 @@ export function detectIssues(metrics) {
// L1 挂载问题
// ─────────────────────────────────────────────────────────────────
if (m.evidence.rerankFailed) {
issues.push('Rerank API failed — using fusion rank order as fallback, relevance scores are zero');
}
if (m.evidence.l0Selected > 0 && m.evidence.l1Pulled === 0) {
issues.push('Zero L1 chunks pulled - L1 vectors may not exist or DB read failed');
}

View File

@@ -631,6 +631,7 @@ async function locateAndPullEvidence(anchorHits, anchorFloors, queryVector, rera
metrics.evidence.rerankApplied = true;
metrics.evidence.beforeRerank = rerankCandidates.length;
metrics.evidence.afterRerank = rerankedL0.length;
metrics.evidence.rerankFailed = rerankedL0.some(c => c._rerankFailed);
metrics.evidence.l0Selected = rerankedL0.length;
metrics.evidence.rerankTime = rerankTime;
metrics.timing.evidenceRerank = rerankTime;

View File

@@ -49,6 +49,9 @@ let jiebaCut = null;
/** @type {Function|null} jieba add_word 函数引用 */
let jiebaAddWord = null;
/** @type {object|null} TinySegmenter 实例 */
let tinySegmenter = null;
// ═══════════════════════════════════════════════════════════════════════════
// 实体词典
// ═══════════════════════════════════════════════════════════════════════════
@@ -76,12 +79,13 @@ const STOP_WORDS = new Set([
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
// 日语助词 + 常见虚词
'は', 'が', 'を', '', '', '', '', '', '', '',
'か', 'な', 'よ', '', '', 'だ', 'です', 'ます', 'た', 'て',
'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない',
// 日语常见虚词≥2字匹配 TinySegmenter 产出粒度)
'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある',
'なる', 'れる', 'られ', 'られる',
'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
'これ', 'それ', 'あれ', 'どれ',
'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
// 英文常见停用词
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
@@ -100,6 +104,34 @@ const STOP_WORDS = new Set([
// Unicode 分类
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tell whether a character code is kana (hiragana or katakana).
 *
 * @param {number} code - Numeric character code to classify
 * @returns {boolean} true when the code lies in one of the kana ranges
 */
function isKana(code) {
    // Hiragana
    if (code >= 0x3040 && code <= 0x309F) return true;
    // Katakana
    if (code >= 0x30A0 && code <= 0x30FF) return true;
    // Katakana Extensions
    if (code >= 0x31F0 && code <= 0x31FF) return true;
    // Halfwidth Katakana
    return code >= 0xFF65 && code <= 0xFF9F;
}
/**
 * Tell whether a character code is a CJK ideograph (kana excluded).
 *
 * The last range lies above 0xFFFF, so it can only match when the caller
 * passes a full code point (e.g. from `codePointAt`) rather than a UTF-16
 * code unit.
 *
 * @param {number} code - Numeric character code to classify
 * @returns {boolean} true when the code lies in one of the ideograph ranges
 */
function isCJK(code) {
    const inUnified = code >= 0x4E00 && code <= 0x9FFF;        // CJK Unified Ideographs
    const inExtensionA = code >= 0x3400 && code <= 0x4DBF;     // CJK Extension A
    const inCompatibility = code >= 0xF900 && code <= 0xFAFF;  // CJK Compatibility Ideographs
    const inExtensionB = code >= 0x20000 && code <= 0x2A6DF;   // CJK Extension B
    return inUnified || inExtensionA || inCompatibility || inExtensionB;
}
/**
* 判断字符是否为亚洲文字CJK + 假名)
* @param {number} code - charCode
@@ -107,14 +139,7 @@ const STOP_WORDS = new Set([
*/
function isAsian(code) {
return (
(code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs
(code >= 0x3400 && code <= 0x4DBF) || // CJK Extension A
(code >= 0x3040 && code <= 0x309F) || // Hiragana
(code >= 0x30A0 && code <= 0x30FF) || // Katakana
(code >= 0x31F0 && code <= 0x31FF) || // Katakana Phonetic Extensions
(code >= 0xFF65 && code <= 0xFF9F) || // Halfwidth Katakana
(code >= 0xF900 && code <= 0xFAFF) || // CJK Compatibility Ideographs
(code >= 0x20000 && code <= 0x2A6DF) // CJK Extension B
isCJK(code) || isKana(code)
);
}
@@ -195,6 +220,31 @@ function segmentByScript(text) {
return segments;
}
// ═══════════════════════════════════════════════════════════════════════════
// 亚洲文字语言检测(中文 vs 日语)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Guess whether a run of Asian text is Chinese or Japanese.
 *
 * Counts kana and CJK ideograph code points (everything else is ignored) and
 * classifies the text as Japanese when kana make up more than 30% of the
 * counted characters. The original rationale for the threshold is that kana
 * usually account for 40-60% of Japanese text.
 *
 * @param {string} text - Asian-script text segment
 * @returns {'zh'|'ja'|'other'} 'other' when the text has no kana or ideographs
 */
function detectAsianLanguage(text) {
    const tally = { kana: 0, ideograph: 0 };

    // Array.from walks the string by code point, like for...of does.
    const codePoints = Array.from(text, (ch) => ch.codePointAt(0));
    for (const codePoint of codePoints) {
        if (isKana(codePoint)) {
            tally.kana += 1;
        } else if (isCJK(codePoint)) {
            tally.ideograph += 1;
        }
    }

    const counted = tally.kana + tally.ideograph;
    if (counted === 0) {
        return 'other';
    }

    const kanaRatio = tally.kana / counted;
    return kanaRatio > 0.3 ? 'ja' : 'zh';
}
// ═══════════════════════════════════════════════════════════════════════════
// 实体保护(最长匹配占位符替换)
// ═══════════════════════════════════════════════════════════════════════════
@@ -343,6 +393,26 @@ function tokenizeAsianFallback(text) {
return tokens;
}
/**
 * Tokenize a Japanese text segment with TinySegmenter.
 *
 * Each segment is stringified and trimmed, and only tokens of at least two
 * UTF-16 code units are kept. When TinySegmenter has not been loaded, or when
 * segmentation throws, the text is handed to `tokenizeAsianFallback` instead.
 *
 * @param {string} text - Japanese text segment
 * @returns {string[]} tokens
 */
function tokenizeJapanese(text) {
    if (!tinySegmenter) {
        return tokenizeAsianFallback(text);
    }

    try {
        const segments = tinySegmenter.segment(text);
        return segments.flatMap((segment) => {
            const token = String(segment || '').trim();
            return token.length >= 2 ? [token] : [];
        });
    } catch (err) {
        xbLog.warn(MODULE_ID, 'TinySegmenter 分词异常,降级处理', err);
        return tokenizeAsianFallback(text);
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// 分词:拉丁文字
// ═══════════════════════════════════════════════════════════════════════════
@@ -374,6 +444,9 @@ function tokenizeLatin(text) {
* @returns {Promise<boolean>} 是否加载成功
*/
export async function preload() {
// TinySegmenter 独立于结巴状态(内部有防重入)
loadTinySegmenter();
// 已就绪
if (wasmState === WasmState.READY) return true;
@@ -443,6 +516,25 @@ export async function preload() {
}
}
/** @type {Promise<void>|null} In-flight TinySegmenter load shared by overlapping callers */
let tinySegmenterLoadPromise = null;

/**
 * Lazily load TinySegmenter and store an instance in `tinySegmenter`.
 *
 * The returned promise never rejects: a load failure is logged as a warning
 * and `tinySegmenter` is left unchanged.
 *
 * Re-entrancy: calls made while a load is already in flight share that load
 * instead of importing and constructing again. After a failed load the
 * in-flight marker is cleared, so a later call retries.
 *
 * @returns {Promise<void>}
 */
async function loadTinySegmenter() {
    if (tinySegmenter) return;
    if (tinySegmenterLoadPromise) return tinySegmenterLoadPromise;

    const importAndInstantiate = async () => {
        try {
            // eslint-disable-next-line no-unsanitized/method
            const mod = await import(
                `/${extensionFolderPath}/libs/tiny-segmenter.js`
            );
            const Ctor = mod.TinySegmenter || mod.default;
            if (typeof Ctor !== 'function') {
                // Fail with a descriptive message instead of "Ctor is not a constructor".
                throw new TypeError('tiny-segmenter.js does not expose a TinySegmenter constructor');
            }
            tinySegmenter = new Ctor();
            xbLog.info(MODULE_ID, 'TinySegmenter 加载完成');
        } catch (e) {
            xbLog.warn(MODULE_ID, 'TinySegmenter 加载失败,日语将使用降级分词', e);
        }
    };

    // The marker is cleared in a .finally() attached after assignment, so it is
    // reset even when the load fails before its first await.
    tinySegmenterLoadPromise = importAndInstantiate().finally(() => {
        tinySegmenterLoadPromise = null;
    });
    return tinySegmenterLoadPromise;
}
// ═══════════════════════════════════════════════════════════════════════════
// 公开接口isReady
// ═══════════════════════════════════════════════════════════════════════════
@@ -598,7 +690,10 @@ function tokenizeCore(text) {
const rawTokens = [];
for (const seg of segments) {
if (seg.type === 'asian') {
if (wasmState === WasmState.READY && jiebaCut) {
const lang = detectAsianLanguage(seg.text);
if (lang === 'ja') {
rawTokens.push(...tokenizeJapanese(seg.text));
} else if (wasmState === WasmState.READY && jiebaCut) {
rawTokens.push(...tokenizeAsianJieba(seg.text));
} else {
rawTokens.push(...tokenizeAsianFallback(seg.text));