Improve rerank failure handling and tokenizer JP support

This commit is contained in:
2026-02-10 17:52:09 +08:00
parent fbf34815bb
commit 062df60570
7 changed files with 655 additions and 285 deletions

View File

@@ -93,3 +93,7 @@ For complete license terms, see LICENSE.md
For attribution requirements, see COPYRIGHT For attribution requirements, see COPYRIGHT
Last updated: 2025-01-14 Last updated: 2025-01-14
TinySegmenter 0.2
Copyright (c) 2008 Taku Kudo
New BSD License (as stated in the header of libs/tiny-segmenter.js)
http://www.chasen.org/~taku/software/TinySegmenter/

177
libs/tiny-segmenter.js Normal file
View File

@@ -0,0 +1,177 @@
// TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript
// (c) 2008 Taku Kudo <taku@chasen.org>
// TinySegmenter is freely distributable under the terms of a new BSD licence.
// For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
/**
 * Builds a segmenter instance: a list of character-class matchers plus the
 * weight tables consulted by `segment()`.
 *
 * Instance fields created here:
 * - `chartype_`: array of `[RegExp, label]` pairs, tried in declaration order
 *   by `ctype_()`. Labels are "M", "H", "I", "K", "A" and "N".
 * - `BIAS__`: constant added to every boundary score.
 * - `XXn__` tables: plain objects mapping a lookup key (characters, character
 *   class labels, previous-decision labels, or concatenations of them) to an
 *   integer weight. Missing keys contribute 0 (see `ts_()`).
 *
 * NOTE(review): several tables contain repeated keys and empty-string keys
 * (for example `BW2__` lists `""` twice and `UW3__` several times), and the
 * "A"/"N" patterns end in bare hyphens. With duplicate keys the last entry
 * wins. This looks like width-variant characters lost in an encoding
 * conversion — TODO compare against the upstream distribution before relying
 * on these entries. They are reproduced unchanged here.
 */
function TinySegmenter() {
  const patterns = {
    "[一二三四五六七八九十百千万億兆]":"M",
    "[一-龠々〆ヵヶ]":"H",
    "[ぁ-ん]":"I",
    "[ァ-ヴーア-ン゙ー]":"K",
    "[a-zA-Z--]":"A",
    "[0-9-]":"N"
  };
  this.chartype_ = [];
  // Object.keys() visits own keys only, in insertion order, so the matchers
  // keep their declared priority even if Object.prototype has been extended.
  for (const source of Object.keys(patterns)) {
    // The RegExp constructor replaces the deprecated RegExp.prototype.compile().
    this.chartype_.push([new RegExp(source), patterns[source]]);
  }
  this.BIAS__ = -332;
  this.BC1__ = {"HH":6,"II":2461,"KH":406,"OH":-1378};
  this.BC2__ = {"AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920};
  this.BC3__ = {"HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266};
  this.BP1__ = {"BB":295,"OB":304,"OO":-125,"UB":352};
  this.BP2__ = {"BO":60,"OO":-1762};
  this.BQ1__ = {"BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965};
  this.BQ2__ = {"BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146};
  this.BQ3__ = {"BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699};
  this.BQ4__ = {"BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973};
  this.BW1__ = {",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505,"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790,"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977,"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127,"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379,"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448,"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860,"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336,"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1あ":1404,"B1同":542,"」と":1682};
  this.BW2__ = {"..":-11822,"11":-669,"――":-5730,"":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602,"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193,"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939,"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589,"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666,"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720,"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615,"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059,"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621,"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651,"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728,"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163,"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626,"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355,"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"":-669};
  this.BW3__ = {"あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029,"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857,"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816,"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397,"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562,"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426,"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098,"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437,"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796,"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724,"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549,"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850,"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860,"入り":1232,"大会":2217,"始め":1681,"市":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"カ月":990};
  this.TC1__ = {"AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832};
  this.TC2__ = {"HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649};
  this.TC3__ = {"AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393};
  this.TC4__ = {"HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841};
  this.TQ1__ = {"BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68};
  this.TQ2__ = {"BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591};
  this.TQ3__ = {"BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685};
  this.TQ4__ = {"BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156};
  this.TW1__ = {"につい":-4681,"東京都":2026};
  this.TW2__ = {"ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430,"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882,"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216};
  this.TW3__ = {"いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247,"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287};
  this.TW4__ = {"いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538,"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865};
  this.UC1__ = {"A":484,"K":93,"M":645,"O":-505};
  this.UC2__ = {"A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646};
  this.UC3__ = {"A":-1370,"I":2311};
  this.UC4__ = {"A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646};
  this.UC5__ = {"H":313,"I":-1238,"K":-799,"M":539,"O":-831};
  this.UC6__ = {"H":-506,"I":-253,"K":87,"M":247,"O":-387};
  this.UP1__ = {"O":-214};
  this.UP2__ = {"B":69,"O":935};
  this.UP3__ = {"B":189};
  this.UQ1__ = {"BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422};
  this.UQ2__ = {"BH":216,"BI":113,"OK":1759};
  this.UQ3__ = {"BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212};
  this.UW1__ = {",":156,"、":156,"「":-463,"あ":-941,"う":-127,"が":-553,"き":121,"こ":505,"で":-201,"と":-547,"ど":-123,"に":-789,"の":-185,"は":-847,"も":-466,"や":-470,"よ":182,"ら":-292,"り":208,"れ":169,"を":-446,"ん":-137,"・":-135,"主":-402,"京":-268,"区":-912,"午":871,"国":-460,"大":561,"委":729,"市":-411,"日":-141,"理":361,"生":-408,"県":-386,"都":-718,"「":-463,"・":-135};
  this.UW2__ = {",":-829,"、":-829,"":892,"「":-645,"」":3145,"あ":-538,"い":505,"う":134,"お":-502,"か":1454,"が":-856,"く":-412,"こ":1141,"さ":878,"ざ":540,"し":1529,"す":-675,"せ":300,"そ":-1011,"た":188,"だ":1837,"つ":-949,"て":-291,"で":-268,"と":-981,"ど":1273,"な":1063,"に":-1764,"の":130,"は":-409,"ひ":-1273,"べ":1261,"ま":600,"も":-1263,"や":-402,"よ":1639,"り":-579,"る":-694,"れ":571,"を":-2516,"ん":2095,"ア":-587,"カ":306,"キ":568,"ッ":831,"三":-758,"不":-2150,"世":-302,"中":-968,"主":-861,"事":492,"人":-123,"会":978,"保":362,"入":548,"初":-3025,"副":-1566,"北":-3414,"区":-422,"大":-1769,"天":-865,"太":-483,"子":-1519,"学":760,"実":1023,"小":-2009,"市":-813,"年":-1060,"強":1067,"手":-1519,"揺":-1033,"政":1522,"文":-1355,"新":-1682,"日":-1815,"明":-1462,"最":-630,"朝":-1843,"本":-1650,"東":-931,"果":-665,"次":-2378,"民":-180,"気":-1740,"理":752,"発":529,"目":-1584,"相":-242,"県":-1165,"立":-763,"第":810,"米":509,"自":-1353,"行":838,"西":-744,"見":-3874,"調":1010,"議":1198,"込":3041,"開":1758,"間":-1257,"「":-645,"」":3145,"ッ":831,"ア":-587,"カ":306,"キ":568};
  this.UW3__ = {",":4889,"1":-800,"":-1723,"、":4889,"々":-2311,"":5827,"」":2670,"〓":-3573,"あ":-2696,"い":1006,"う":2342,"え":1983,"お":-4864,"か":-1163,"が":3271,"く":1004,"け":388,"げ":401,"こ":-3552,"ご":-3116,"さ":-1058,"し":-395,"す":584,"せ":3685,"そ":-5228,"た":842,"ち":-521,"っ":-1444,"つ":-1081,"て":6167,"で":2318,"と":1691,"ど":-899,"な":-2788,"に":2745,"の":4056,"は":4555,"ひ":-2171,"ふ":-1798,"へ":1199,"ほ":-5516,"ま":-4384,"み":-120,"め":1205,"も":2323,"や":-788,"よ":-202,"ら":727,"り":649,"る":5905,"れ":2773,"わ":-1207,"を":6620,"ん":-518,"ア":551,"グ":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278,"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"中":653,"主":-758,"予":-1193,"二":974,"人":2742,"今":792,"他":1889,"以":-1368,"低":811,"何":4265,"作":-361,"保":-2439,"元":4858,"党":3593,"全":1574,"公":-3030,"六":755,"共":-1880,"円":5807,"再":3095,"分":457,"初":2475,"別":1129,"前":2286,"副":4437,"力":365,"動":-949,"務":-1872,"化":1327,"北":-1038,"区":4646,"千":-2309,"午":-783,"協":-1006,"口":483,"右":1233,"各":3588,"合":-241,"同":3906,"和":-837,"員":4513,"国":642,"型":1389,"場":1219,"外":-241,"妻":2016,"学":-1356,"安":-423,"実":-1008,"家":1078,"小":-513,"少":-3102,"州":1155,"市":3197,"平":-1804,"年":2416,"広":-1030,"府":1605,"度":1452,"建":-2352,"当":-3885,"得":1905,"思":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013,"教":-1479,"数":3222,"文":-1489,"新":1764,"日":2099,"旧":5792,"昨":-661,"時":-1248,"曜":-951,"最":-937,"月":4125,"期":360,"李":3094,"村":364,"東":-805,"核":5156,"森":2438,"業":484,"氏":2613,"民":-1694,"決":-1073,"法":1868,"海":-495,"無":979,"物":461,"特":-3850,"生":-273,"用":914,"町":1215,"的":7313,"直":-1835,"省":792,"県":6293,"知":-1528,"私":4231,"税":401,"立":-960,"第":1201,"米":7767,"系":3066,"約":3663,"級":1384,"統":-4229,"総":1163,"線":1255,"者":6457,"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"費":1777,"車":1835,"軍":1375,"込":-1504,"通":-1136,"選":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"長":421,"開":-1432,"間":1302,"関":-1282,"雨":2009,"電":-1045,"非":2066,"駅":1620,"":-800,"」":2670,"・":-3794,"ッ":-1350,"ア":551,"グ":1319,"ス":874,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278};
  this.UW4__ = {",":3930,".":3508,"―":-4841,"、":3930,"。":3508,"":4999,"「":1895,"」":3798,"〓":-5156,"あ":4752,"い":-3435,"う":-640,"え":-2514,"お":2405,"か":530,"が":6006,"き":-4482,"ぎ":-3821,"く":-3788,"け":-4376,"げ":-4734,"こ":2255,"ご":1979,"さ":2864,"し":-843,"じ":-2506,"す":-731,"ず":1251,"せ":181,"そ":4091,"た":5034,"だ":5408,"ち":-3654,"っ":-5882,"つ":-1659,"て":3994,"で":7410,"と":4547,"な":5433,"に":6499,"ぬ":1853,"ね":1413,"の":7396,"は":8578,"ば":1940,"ひ":4249,"び":-4134,"ふ":1345,"へ":6665,"べ":-744,"ほ":1464,"ま":1051,"み":-2082,"む":-882,"め":-5046,"も":4169,"ゃ":-2666,"や":2795,"ょ":-1544,"よ":3351,"ら":-2922,"り":-9726,"る":-14896,"れ":-2613,"ろ":-4570,"わ":-1783,"を":13150,"ん":-2352,"カ":2145,"コ":1789,"セ":1287,"ッ":-724,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"中":2210,"予":782,"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"側":4292,"先":601,"党":-2006,"共":-1212,"内":584,"円":788,"初":1347,"前":1623,"副":3879,"力":-302,"動":-740,"務":-2715,"化":776,"区":4517,"協":1013,"参":1555,"合":-1834,"和":-681,"員":-910,"器":-851,"回":1500,"国":-619,"園":-1200,"地":866,"場":-1410,"塁":-2094,"士":-1413,"多":1067,"大":571,"子":-4802,"学":-1397,"定":-1057,"寺":-809,"小":1910,"屋":-1328,"山":-1500,"島":-2056,"川":-2667,"市":2771,"年":374,"庁":-4556,"後":456,"性":553,"感":916,"所":-1566,"支":856,"改":787,"政":2182,"教":704,"文":522,"方":-856,"日":1798,"時":1829,"最":845,"月":-9066,"木":-485,"来":-442,"校":-360,"業":-1043,"氏":5388,"民":-2716,"気":-910,"沢":-939,"済":-543,"物":-735,"率":672,"球":-1267,"生":-1286,"産":-1101,"田":-2900,"町":1826,"的":2586,"目":922,"省":-3485,"県":2997,"空":-867,"立":-2112,"第":788,"米":2937,"系":786,"約":2171,"経":1146,"統":-1169,"総":940,"線":-994,"署":749,"者":2145,"能":-730,"般":-852,"行":-792,"規":792,"警":-1184,"議":-244,"谷":-1000,"賞":730,"車":-1481,"軍":1158,"輪":-1433,"込":-3370,"近":929,"道":-1291,"選":2596,"郎":-4866,"都":1192,"野":-1100,"銀":-2213,"長":357,"間":-2344,"院":-2297,"際":-2604,"電":-878,"領":-1659,"題":-792,"館":-1984,"首":1749,"高":2120,"「":1895,"」":3798,"・":-4371,"ッ":-724,"ー":-11870,"カ":2145,"コ":1789,"セ":1287,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637};
  this.UW5__ = {",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"、":465,"。":-299,"「":363,"あ":1655,"い":331,"う":-503,"え":1199,"お":527,"か":647,"が":-421,"き":1624,"ぎ":1971,"く":312,"げ":-983,"さ":-1537,"し":-1371,"す":-852,"だ":-1186,"ち":1093,"っ":52,"つ":921,"て":-18,"で":-850,"と":-127,"ど":1682,"な":-787,"に":-1224,"の":-635,"は":-578,"べ":1001,"み":502,"め":865,"ゃ":3350,"ょ":854,"り":-208,"る":429,"れ":504,"わ":419,"を":-1264,"ん":327,"イ":241,"ル":451,"ン":-343,"中":-871,"京":722,"会":-1153,"党":-654,"務":3519,"区":-901,"告":848,"員":2104,"大":-1296,"学":-548,"定":1785,"嵐":-1304,"市":-2991,"席":921,"年":1763,"思":872,"所":-814,"挙":1618,"新":-1682,"日":218,"月":-4353,"査":932,"格":1356,"機":-1508,"氏":-1347,"田":240,"町":-3912,"的":-3149,"相":1319,"省":-1052,"県":-4003,"研":-997,"社":-278,"空":-813,"統":1955,"者":-2233,"表":663,"語":-1073,"議":1219,"選":-1018,"郎":-368,"長":786,"間":1191,"題":2368,"館":-689,"":-514,"":-32768,"「":363,"イ":241,"ル":451,"ン":-343};
  this.UW6__ = {",":227,".":808,"1":-270,"E1":306,"、":227,"。":808,"あ":-307,"う":189,"か":241,"が":-73,"く":-121,"こ":-200,"じ":1782,"す":383,"た":-428,"っ":573,"て":-1014,"で":101,"と":-105,"な":-253,"に":-149,"の":-417,"は":-236,"も":-206,"り":187,"る":-135,"を":195,"ル":-673,"ン":-496,"一":-277,"中":201,"件":-800,"会":624,"前":302,"区":1792,"員":-1212,"委":798,"学":-960,"市":887,"広":-695,"後":535,"業":-697,"相":753,"社":-507,"福":974,"空":-822,"者":1811,"連":463,"郎":1082,"":-270,"":306,"ル":-673,"ン":-496};
  return this;
}
/**
 * Returns the character-class label for `str`.
 *
 * The `[RegExp, label]` pairs in `this.chartype_` are tried in order and the
 * label of the first matching pattern is returned; when none matches the
 * fallback label "O" is returned.
 *
 * @param {string} str - Text to classify (`segment()` passes one character).
 * @returns {string} The matching label, or "O".
 */
TinySegmenter.prototype.ctype_ = function(str) {
  // for...of walks the array elements only; the previous for...in also walked
  // any enumerable property added to Array.prototype and could return
  // undefined instead of "O".
  for (const [pattern, label] of this.chartype_) {
    if (pattern.test(str)) {
      return label;
    }
  }
  return "O";
};
/**
 * Normalises a table lookup result to a number usable in a sum.
 *
 * @param {*} v - Value read from a weight table (undefined when the key is absent).
 * @returns {*} `v` itself when it is truthy, otherwise 0.
 */
TinySegmenter.prototype.ts_ = function(v) {
  return v ? v : 0;
};
/**
 * Splits `input` into an array of segments.
 *
 * The text is padded with three start sentinels ("B3", "B2", "B1") and three
 * end sentinels ("E1", "E2", "E3"), all typed "O". For every position after
 * the first character a score is computed as `BIAS__` plus the weights found
 * in the instance tables for:
 *   - the two previous-but-one/previous decisions (`UP*`, `BP*`),
 *   - the surrounding characters (`UW*`, `BW*`, `TW*`),
 *   - the surrounding character classes (`UC*`, `BC*`, `TC*`),
 *   - combinations of previous decisions and classes (`UQ*`, `BQ*`, `TQ*`).
 * A score greater than 0 starts a new segment at that position.
 *
 * @param {string} input - Text to segment.
 * @returns {string[]} Segments in order; their concatenation equals `input`.
 *   Returns an empty array for null, undefined or an empty string.
 */
TinySegmenter.prototype.segment = function(input) {
  // `== null` covers both null and undefined. The loose `== ""` is kept on
  // purpose so that every value the previous implementation mapped to []
  // still does.
  if (input == null || input == "") {
    return [];
  }

  const result = [];
  const seg = ["B3", "B2", "B1"];
  const ctype = ["O", "O", "O"];

  // Iterate by code point rather than by UTF-16 code unit, so a character
  // outside the BMP (surrogate pair) stays in one piece and can never be cut
  // into two lone surrogates by a boundary decision.
  for (const ch of input) {
    seg.push(ch);
    ctype.push(this.ctype_(ch));
  }
  seg.push("E1", "E2", "E3");
  ctype.push("O", "O", "O");

  let word = seg[3];
  // Labels of the three most recent decisions: "U" = none yet,
  // "B" = boundary placed, "O" = no boundary.
  let p1 = "U";
  let p2 = "U";
  let p3 = "U";

  for (let i = 4; i < seg.length - 3; ++i) {
    const w1 = seg[i - 3];
    const w2 = seg[i - 2];
    const w3 = seg[i - 1];
    const w4 = seg[i];
    const w5 = seg[i + 1];
    const w6 = seg[i + 2];
    const c1 = ctype[i - 3];
    const c2 = ctype[i - 2];
    const c3 = ctype[i - 1];
    const c4 = ctype[i];
    const c5 = ctype[i + 1];
    const c6 = ctype[i + 2];

    let score = this.BIAS__;
    score += this.ts_(this.UP1__[p1]);
    score += this.ts_(this.UP2__[p2]);
    score += this.ts_(this.UP3__[p3]);
    score += this.ts_(this.BP1__[p1 + p2]);
    score += this.ts_(this.BP2__[p2 + p3]);
    score += this.ts_(this.UW1__[w1]);
    score += this.ts_(this.UW2__[w2]);
    score += this.ts_(this.UW3__[w3]);
    score += this.ts_(this.UW4__[w4]);
    score += this.ts_(this.UW5__[w5]);
    score += this.ts_(this.UW6__[w6]);
    score += this.ts_(this.BW1__[w2 + w3]);
    score += this.ts_(this.BW2__[w3 + w4]);
    score += this.ts_(this.BW3__[w4 + w5]);
    score += this.ts_(this.TW1__[w1 + w2 + w3]);
    score += this.ts_(this.TW2__[w2 + w3 + w4]);
    score += this.ts_(this.TW3__[w3 + w4 + w5]);
    score += this.ts_(this.TW4__[w4 + w5 + w6]);
    score += this.ts_(this.UC1__[c1]);
    score += this.ts_(this.UC2__[c2]);
    score += this.ts_(this.UC3__[c3]);
    score += this.ts_(this.UC4__[c4]);
    score += this.ts_(this.UC5__[c5]);
    score += this.ts_(this.UC6__[c6]);
    score += this.ts_(this.BC1__[c2 + c3]);
    score += this.ts_(this.BC2__[c3 + c4]);
    score += this.ts_(this.BC3__[c4 + c5]);
    score += this.ts_(this.TC1__[c1 + c2 + c3]);
    score += this.ts_(this.TC2__[c2 + c3 + c4]);
    score += this.ts_(this.TC3__[c3 + c4 + c5]);
    score += this.ts_(this.TC4__[c4 + c5 + c6]);
    score += this.ts_(this.UQ1__[p1 + c1]);
    score += this.ts_(this.UQ2__[p2 + c2]);
    score += this.ts_(this.UQ3__[p3 + c3]);
    score += this.ts_(this.BQ1__[p2 + c2 + c3]);
    score += this.ts_(this.BQ2__[p2 + c3 + c4]);
    score += this.ts_(this.BQ3__[p3 + c2 + c3]);
    score += this.ts_(this.BQ4__[p3 + c3 + c4]);
    score += this.ts_(this.TQ1__[p2 + c1 + c2 + c3]);
    score += this.ts_(this.TQ2__[p2 + c2 + c3 + c4]);
    score += this.ts_(this.TQ3__[p3 + c1 + c2 + c3]);
    score += this.ts_(this.TQ4__[p3 + c2 + c3 + c4]);

    let p = "O";
    if (score > 0) {
      // Positive score: close the current segment before seg[i].
      result.push(word);
      word = "";
      p = "B";
    }
    p1 = p2;
    p2 = p3;
    p3 = p;
    word += seg[i];
  }
  result.push(word);
  return result;
};
export { TinySegmenter };

View File

@@ -1,4 +1,4 @@
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// Story Summary - 主入口 // Story Summary - 主入口
// //
// 稳定目标: // 稳定目标:
@@ -107,16 +107,43 @@ const MESSAGE_EVENT = "message";
// 状态变量 // 状态变量
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
let summaryGenerating = false;
let overlayCreated = false; let overlayCreated = false;
let frameReady = false; let frameReady = false;
let currentMesId = null; let currentMesId = null;
let pendingFrameMessages = []; let pendingFrameMessages = [];
let eventsRegistered = false; let eventsRegistered = false;
let vectorGenerating = false;
let vectorCancelled = false; let vectorCancelled = false;
let vectorAbortController = null; let vectorAbortController = null;
let anchorGenerating = false;
// ═══════════════════════════════════════════════════════════════════════════
// TaskGuard — 互斥任务管理summary / vector / anchor
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Tracks which named tasks are currently running so that a task of a given
 * name cannot be started twice at the same time.
 */
class TaskGuard {
    /** Names of the tasks that currently hold the guard. */
    #active = new Set();

    /**
     * Tries to mark `taskName` as running.
     *
     * @param {string} taskName - Name of the task to start.
     * @returns {(() => void) | null} A release callback, or null when a task
     *   with this name is already running. The callback clears the running
     *   mark the first time it is called; later calls do nothing.
     */
    acquire(taskName) {
        const active = this.#active;
        if (active.has(taskName)) {
            return null;
        }
        active.add(taskName);

        let held = true;
        return () => {
            if (!held) {
                return;
            }
            held = false;
            active.delete(taskName);
        };
    }

    /**
     * @param {string} taskName - Task name to look up.
     * @returns {boolean} True while `taskName` is marked as running.
     */
    isRunning(taskName) {
        return this.#active.has(taskName);
    }

    /**
     * @param {...string} taskNames - Task names to look up.
     * @returns {boolean} True when at least one of the names is marked as
     *   running; false for an empty argument list.
     */
    isAnyRunning(...taskNames) {
        for (const name of taskNames) {
            if (this.#active.has(name)) {
                return true;
            }
        }
        return false;
    }
}
const guard = new TaskGuard();
// 用户消息缓存(解决 GENERATION_STARTED 时 chat 尚未包含用户消息的问题) // 用户消息缓存(解决 GENERATION_STARTED 时 chat 尚未包含用户消息的问题)
let lastSentUserMessage = null; let lastSentUserMessage = null;
@@ -219,13 +246,12 @@ async function unhideAllMessages() {
// 生成状态管理 // 生成状态管理
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
function setSummaryGenerating(flag) { function isSummaryGenerating() {
summaryGenerating = !!flag; return guard.isRunning('summary');
postToFrame({ type: "GENERATION_STATE", isGenerating: summaryGenerating });
} }
function isSummaryGenerating() { function notifySummaryState() {
return summaryGenerating; postToFrame({ type: "GENERATION_STATE", isGenerating: guard.isRunning('summary') });
} }
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -300,8 +326,10 @@ async function sendAnchorStatsToFrame() {
} }
async function handleAnchorGenerate() { async function handleAnchorGenerate() {
if (anchorGenerating) return; const release = guard.acquire('anchor');
if (!release) return;
try {
const vectorCfg = getVectorConfig(); const vectorCfg = getVectorConfig();
if (!vectorCfg?.enabled) { if (!vectorCfg?.enabled) {
await executeSlashCommand("/echo severity=warning 请先启用向量检索"); await executeSlashCommand("/echo severity=warning 请先启用向量检索");
@@ -316,17 +344,12 @@ async function handleAnchorGenerate() {
const { chatId, chat } = getContext(); const { chatId, chat } = getContext();
if (!chatId || !chat?.length) return; if (!chatId || !chat?.length) return;
anchorGenerating = true;
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "分析中..." }); postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "分析中..." });
try {
// Phase 1: L0 提取 + Phase 2: L0 向量化(在 incrementalExtractAtoms 内部完成)
await incrementalExtractAtoms(chatId, chat, (message, current, total) => { await incrementalExtractAtoms(chatId, chat, (message, current, total) => {
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current, total, message }); postToFrame({ type: "ANCHOR_GEN_PROGRESS", current, total, message });
}); });
// Phase 3: 处理 pending L1 Chunks
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "向量化 L1..." }); postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "向量化 L1..." });
await buildIncrementalChunks({ vectorConfig: vectorCfg }); await buildIncrementalChunks({ vectorConfig: vectorCfg });
@@ -340,7 +363,7 @@ async function handleAnchorGenerate() {
xbLog.error(MODULE_ID, "记忆锚点生成失败", e); xbLog.error(MODULE_ID, "记忆锚点生成失败", e);
await executeSlashCommand(`/echo severity=error 记忆锚点生成失败:${e.message}`); await executeSlashCommand(`/echo severity=error 记忆锚点生成失败:${e.message}`);
} finally { } finally {
anchorGenerating = false; release();
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 }); postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 });
} }
} }
@@ -359,7 +382,6 @@ async function handleAnchorClear() {
function handleAnchorCancel() { function handleAnchorCancel() {
cancelL0Extraction(); cancelL0Extraction();
anchorGenerating = false;
postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 }); postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 });
} }
@@ -378,8 +400,10 @@ async function handleTestOnlineService(provider, config) {
} }
async function handleGenerateVectors(vectorCfg) { async function handleGenerateVectors(vectorCfg) {
if (vectorGenerating) return; const release = guard.acquire('vector');
if (!release) return;
try {
if (!vectorCfg?.enabled) { if (!vectorCfg?.enabled) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 }); postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
return; return;
@@ -393,7 +417,6 @@ async function handleGenerateVectors(vectorCfg) {
return; return;
} }
vectorGenerating = true;
vectorCancelled = false; vectorCancelled = false;
vectorAbortController = new AbortController(); vectorAbortController = new AbortController();
@@ -436,84 +459,100 @@ async function handleGenerateVectors(vectorCfg) {
} }
} }
if (vectorCancelled) { if (vectorCancelled) return;
vectorGenerating = false;
return;
}
const allChunks = []; const allChunks = [];
for (let floor = 0; floor < chat.length; floor++) { for (let floor = 0; floor < chat.length; floor++) {
const chunks = chunkMessage(floor, chat[floor]); if (vectorCancelled) break;
const message = chat[floor];
if (!message) continue;
const { chunks, status } = chunkMessage(message, floor, vectorCfg, true);
if (status === "skip") continue;
allChunks.push(...chunks); allChunks.push(...chunks);
} }
if (allChunks.length > 0) { let l1Vectors = [];
if (!allChunks.length) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: 0, message: "L1 为空,跳过" });
} else {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: allChunks.length, message: "L1 向量化..." });
await saveChunks(chatId, allChunks); await saveChunks(chatId, allChunks);
}
const l1Texts = allChunks.map(c => c.text); let l1Completed = 0;
const store = getSummaryStore(); for (let i = 0; i < allChunks.length; i += batchSize) {
const events = store?.json?.events || [];
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: l1Texts.length });
const l1Vectors = [];
let completed = 0;
for (let i = 0; i < l1Texts.length; i += batchSize) {
if (vectorCancelled) break; if (vectorCancelled) break;
const batch = l1Texts.slice(i, i + batchSize); const batch = allChunks.slice(i, i + batchSize);
const texts = batch.map(c => c.text);
try { try {
const vectors = await embed(batch, vectorCfg, { signal: vectorAbortController.signal }); const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
l1Vectors.push(...vectors); const items = batch.map((c, j) => ({
completed += batch.length; chunkId: c.chunkId,
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: completed, total: l1Texts.length }); vector: vectors[j],
}));
await saveChunkVectors(chatId, items, fingerprint);
l1Vectors = l1Vectors.concat(items);
l1Completed += batch.length;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: l1Completed, total: allChunks.length });
} catch (e) { } catch (e) {
if (e?.name === 'AbortError') break; if (e?.name === "AbortError") break;
xbLog.error(MODULE_ID, 'L1 向量化失败', e); xbLog.error(MODULE_ID, "L1 向量化失败", e);
vectorCancelled = true; vectorCancelled = true;
break; break;
} }
} }
if (!vectorCancelled && l1Vectors.length > 0) {
const items = allChunks.map((c, i) => ({ chunkId: c.chunkId, vector: l1Vectors[i] })).filter(x => x.vector);
await saveChunkVectors(chatId, items, fingerprint);
await updateMeta(chatId, { lastChunkFloor: chat.length - 1 });
} }
const l2Pairs = events if (vectorCancelled) return;
.map(e => ({ id: e.id, text: `${e.title || ''} ${e.summary || ''}`.trim() }))
.filter(p => p.text); const store = getSummaryStore();
const events = store?.json?.events || [];
const l2Pairs = events
.map((e) => ({ id: e.id, text: `${e.title || ""} ${e.summary || ""}`.trim() }))
.filter((p) => p.text);
if (!l2Pairs.length) {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: 0, message: "L2 为空,跳过" });
} else {
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: l2Pairs.length, message: "L2 向量化..." });
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: l2Pairs.length });
let l2Completed = 0; let l2Completed = 0;
for (let i = 0; i < l2Pairs.length; i += batchSize) { for (let i = 0; i < l2Pairs.length; i += batchSize) {
if (vectorCancelled) break; if (vectorCancelled) break;
const batch = l2Pairs.slice(i, i + batchSize); const batch = l2Pairs.slice(i, i + batchSize);
const texts = batch.map(p => p.text);
try { try {
const vectors = await embed(batch.map(p => p.text), vectorCfg, { signal: vectorAbortController.signal }); const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal });
const items = batch.map((p, j) => ({ eventId: p.id, vector: vectors[j] })); const items = batch.map((p, idx) => ({
eventId: p.id,
vector: vectors[idx],
}));
await saveEventVectorsToDb(chatId, items, fingerprint); await saveEventVectorsToDb(chatId, items, fingerprint);
l2Completed += batch.length; l2Completed += batch.length;
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: l2Completed, total: l2Pairs.length }); postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: l2Completed, total: l2Pairs.length });
} catch (e) { } catch (e) {
if (e?.name === 'AbortError') break; if (e?.name === "AbortError") break;
xbLog.error(MODULE_ID, 'L2 向量化失败', e); xbLog.error(MODULE_ID, "L2 向量化失败", e);
vectorCancelled = true; vectorCancelled = true;
break; break;
} }
} }
}
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 }); postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 });
await sendVectorStatsToFrame(); await sendVectorStatsToFrame();
vectorGenerating = false; xbLog.info(MODULE_ID, `向量生成完成: L0=${atoms.length}, L1=${l1Vectors.length}, L2=${l2Pairs.length}`);
} finally {
release();
vectorCancelled = false; vectorCancelled = false;
vectorAbortController = null; vectorAbortController = null;
}
xbLog.info(MODULE_ID, `向量生成完成: L0=${atoms.length}, L1=${l1Vectors.length}, L2=${l2Pairs.length}`);
} }
async function handleClearVectors() { async function handleClearVectors() {
@@ -529,52 +568,6 @@ async function handleClearVectors() {
xbLog.info(MODULE_ID, "向量数据已清除"); xbLog.info(MODULE_ID, "向量数据已清除");
} }
// ═══════════════════════════════════════════════════════════════════════════
// L0 自动补提取(每收到新消息后检查并补提取缺失楼层)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Runs an L0 extraction pass when there is pending work and no other
 * anchor/vector job is flagged as running.
 *
 * Returns without doing anything when the vector config is missing or not
 * enabled, when `anchorGenerating` or `vectorGenerating` is set, when there is
 * no chat id or the chat is empty, or when `getAnchorStats()` reports
 * `pending <= 0`. Errors raised inside the try block are logged and not
 * rethrown.
 */
async function maybeAutoExtractL0() {
const vectorCfg = getVectorConfig();
if (!vectorCfg?.enabled) return;
if (anchorGenerating || vectorGenerating) return;
const { chatId, chat } = getContext();
if (!chatId || !chat?.length) return;
const stats = await getAnchorStats();
if (stats.pending <= 0) return;
// NOTE(review): the flags were checked before the await above and are not
// re-checked here, so another job may have started in between — confirm
// whether that matters for callers.
anchorGenerating = true;
try {
// No progress callback is passed (null); the options object sets maxFloors to 20.
await incrementalExtractAtoms(chatId, chat, null, { maxFloors: 20 });
// Build L1 chunks for the newly extracted L0 floors
await buildIncrementalChunks({ vectorConfig: vectorCfg });
invalidateLexicalIndex();
await sendAnchorStatsToFrame();
await sendVectorStatsToFrame();
xbLog.info(MODULE_ID, "自动 L0 补提取完成");
} catch (e) {
xbLog.error(MODULE_ID, "自动 L0 补提取失败", e);
} finally {
// Always clear the flag, whether the pass succeeded or failed.
anchorGenerating = false;
}
}
// ═══════════════════════════════════════════════════════════════════════════
// Embedding 连接预热
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Fires a single throw-away `embed()` call for the text "." when the vector
 * config is enabled. The call is not awaited and any rejection is discarded,
 * so this function never reports a failure to its caller.
 */
function warmupEmbeddingConnection() {
const vectorCfg = getVectorConfig();
if (!vectorCfg?.enabled) return;
// Rejections are ignored on purpose; presumably the request only serves to
// open the connection ahead of real use — confirm against embed().
embed(['.'], vectorCfg, { timeout: 5000 }).catch(() => {});
}
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// 实体词典注入 + 索引预热 // 实体词典注入 + 索引预热
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -597,9 +590,52 @@ function refreshEntityLexiconAndWarmup() {
} }
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// L2 自动增量向量化(总结完成后调用 // L0 自动补提取(每收到新消息后检查并补提取缺失楼层
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
/**
 * Runs an L0 extraction pass when there is pending work and neither an
 * 'anchor' nor a 'vector' task is running.
 *
 * Returns without doing anything when the vector config is missing or not
 * enabled, when an 'anchor' or 'vector' task is running, when there is no
 * chat id or the chat is empty, or when `getAnchorStats()` reports
 * `pending <= 0`. While the pass runs it holds the 'anchor' slot of `guard`;
 * the slot is always released. Errors raised during the pass are logged and
 * not rethrown.
 *
 * @returns {Promise<void>}
 */
async function maybeAutoExtractL0() {
    const vectorCfg = getVectorConfig();
    if (!vectorCfg?.enabled) return;

    if (guard.isAnyRunning('anchor', 'vector')) return;

    const { chatId, chat } = getContext();
    if (!chatId || !chat?.length) return;

    const stats = await getAnchorStats();
    if (stats.pending <= 0) return;

    // The check above happened before the await; a 'vector' task may have
    // started while the stats were loading. acquire() below only covers
    // 'anchor', so 'vector' has to be looked at again here.
    if (guard.isRunning('vector')) return;

    const release = guard.acquire('anchor');
    if (!release) return;

    try {
        // No progress callback is passed (null); the options object sets maxFloors to 20.
        await incrementalExtractAtoms(chatId, chat, null, { maxFloors: 20 });

        // Build L1 chunks for the newly extracted L0 floors
        await buildIncrementalChunks({ vectorConfig: vectorCfg });

        invalidateLexicalIndex();

        await sendAnchorStatsToFrame();
        await sendVectorStatsToFrame();

        xbLog.info(MODULE_ID, "自动 L0 补提取完成");
    } catch (e) {
        xbLog.error(MODULE_ID, "自动 L0 补提取失败", e);
    } finally {
        release();
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Embedding 连接预热
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Fires a throw-away embedding request so the connection to the embedding
 * backend is already established when a real request arrives.
 *
 * Does nothing when the vector feature is disabled. The request is not
 * awaited and any rejection is deliberately discarded: this is a
 * best-effort warm-up whose outcome is never used.
 */
function warmupEmbeddingConnection() {
    const cfg = getVectorConfig();
    const isEnabled = Boolean(cfg?.enabled);
    if (isEnabled) {
        const probe = embed(['.'], cfg, { timeout: 5000 });
        // Swallow failures on purpose; a failed warm-up must not surface anywhere.
        probe.catch(() => {});
    }
}
async function autoVectorizeNewEvents(newEventIds) { async function autoVectorizeNewEvents(newEventIds) {
if (!newEventIds?.length) return; if (!newEventIds?.length) return;
@@ -902,7 +938,7 @@ function openPanelForMessage(mesId) {
sendFrameBaseData(store, totalFloors); sendFrameBaseData(store, totalFloors);
sendFrameFullData(store, totalFloors); sendFrameFullData(store, totalFloors);
setSummaryGenerating(summaryGenerating); notifySummaryState();
sendVectorConfigToFrame(); sendVectorConfigToFrame();
sendVectorStatsToFrame(); sendVectorStatsToFrame();
@@ -990,8 +1026,11 @@ async function maybeAutoRunSummary(reason) {
} }
async function autoRunSummaryWithRetry(targetMesId, configForRun) { async function autoRunSummaryWithRetry(targetMesId, configForRun) {
setSummaryGenerating(true); const release = guard.acquire('summary');
if (!release) return;
notifySummaryState();
try {
for (let attempt = 1; attempt <= 3; attempt++) { for (let attempt = 1; attempt <= 3; attempt++) {
const result = await runSummaryGeneration(targetMesId, configForRun, { const result = await runSummaryGeneration(targetMesId, configForRun, {
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }), onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
@@ -1005,21 +1044,22 @@ async function autoRunSummaryWithRetry(targetMesId, configForRun) {
applyHideStateDebounced(); applyHideStateDebounced();
updateFrameStatsAfterSummary(endMesId, store.json || {}); updateFrameStatsAfterSummary(endMesId, store.json || {});
// L2 自动增量向量化
await autoVectorizeNewEvents(newEventIds); await autoVectorizeNewEvents(newEventIds);
}, },
}); });
if (result.success) { if (result.success) {
setSummaryGenerating(false);
return; return;
} }
if (attempt < 3) await sleep(1000); if (attempt < 3) await sleep(1000);
} }
setSummaryGenerating(false);
await executeSlashCommand("/echo severity=error 剧情总结失败(已自动重试 3 次)。请稍后再试。"); await executeSlashCommand("/echo severity=error 剧情总结失败(已自动重试 3 次)。请稍后再试。");
} finally {
release();
notifySummaryState();
}
} }
function updateFrameStatsAfterSummary(endMesId, merged) { function updateFrameStatsAfterSummary(endMesId, merged) {
@@ -1055,7 +1095,7 @@ function handleFrameMessage(event) {
case "FRAME_READY": { case "FRAME_READY": {
frameReady = true; frameReady = true;
flushPendingFrameMessages(); flushPendingFrameMessages();
setSummaryGenerating(summaryGenerating); notifySummaryState();
sendSavedConfigToFrame(); sendSavedConfigToFrame();
sendVectorConfigToFrame(); sendVectorConfigToFrame();
sendVectorStatsToFrame(); sendVectorStatsToFrame();
@@ -1084,7 +1124,7 @@ function handleFrameMessage(event) {
case "REQUEST_CANCEL": case "REQUEST_CANCEL":
window.xiaobaixStreamingGeneration?.cancel?.("xb9"); window.xiaobaixStreamingGeneration?.cancel?.("xb9");
setSummaryGenerating(false); postToFrame({ type: "GENERATION_STATE", isGenerating: false });
postToFrame({ type: "SUMMARY_STATUS", statusText: "已停止" }); postToFrame({ type: "SUMMARY_STATUS", statusText: "已停止" });
break; break;
@@ -1282,8 +1322,11 @@ async function handleManualGenerate(mesId, config) {
return; return;
} }
setSummaryGenerating(true); const release = guard.acquire('summary');
if (!release) return;
notifySummaryState();
try {
await runSummaryGeneration(mesId, config, { await runSummaryGeneration(mesId, config, {
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }), onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }),
onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }), onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }),
@@ -1296,12 +1339,13 @@ async function handleManualGenerate(mesId, config) {
applyHideStateDebounced(); applyHideStateDebounced();
updateFrameStatsAfterSummary(endMesId, store.json || {}); updateFrameStatsAfterSummary(endMesId, store.json || {});
// L2 自动增量向量化
await autoVectorizeNewEvents(newEventIds); await autoVectorizeNewEvents(newEventIds);
}, },
}); });
} finally {
setSummaryGenerating(false); release();
notifySummaryState();
}
} }
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -1390,7 +1434,7 @@ async function handleMessageReceived() {
initButtonsForAll(); initButtonsForAll();
// 向量全量生成中时跳过 L1 sync避免竞争写入 // 向量全量生成中时跳过 L1 sync避免竞争写入
if (vectorGenerating) return; if (guard.isRunning('vector')) return;
await syncOnMessageReceived(chatId, lastFloor, message, vectorConfig, () => { await syncOnMessageReceived(chatId, lastFloor, message, vectorConfig, () => {
sendAnchorStatsToFrame(); sendAnchorStatsToFrame();
@@ -1529,6 +1573,22 @@ async function handleGenerationStarted(type, _params, isDryRun) {
// 事件注册 // 事件注册
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// Stable listener references, one per event subscription.
// registerEvents() passes these to eventSource.on() and unregisterEvents()
// passes the very same function objects to eventSource.off().
// Most entries defer the real handler with setTimeout (delay in milliseconds);
// the remaining ones reference the handler directly, so it runs synchronously
// with the event.
const boundHandlers = {
chatChanged: () => setTimeout(handleChatChanged, 80),
messageDeleted: () => setTimeout(handleMessageDeleted, 50),
messageReceived: () => setTimeout(handleMessageReceived, 150),
messageSent: () => setTimeout(handleMessageSent, 150),
// Second MESSAGE_SENT listener: not deferred, unlike messageSent above.
messageSentRecall: handleMessageSentForRecall,
messageSwiped: () => setTimeout(handleMessageSwiped, 100),
// MESSAGE_UPDATED and MESSAGE_EDITED share handleMessageUpdated but keep
// separate wrapper functions, one per subscription.
messageUpdated: () => setTimeout(handleMessageUpdated, 100),
messageEdited: () => setTimeout(handleMessageUpdated, 100),
// The render events forward their payload to handleMessageRendered.
userRendered: (data) => setTimeout(() => handleMessageRendered(data), 50),
charRendered: (data) => setTimeout(() => handleMessageRendered(data), 50),
// Injection pipeline: generation start builds the prompt, stop/end clears it.
genStarted: handleGenerationStarted,
genStopped: clearExtensionPrompt,
genEnded: clearExtensionPrompt,
};
function registerEvents() { function registerEvents() {
if (eventsRegistered) return; if (eventsRegistered) return;
eventsRegistered = true; eventsRegistered = true;
@@ -1551,31 +1611,45 @@ function registerEvents() {
initButtonsForAll(); initButtonsForAll();
eventSource.on(event_types.CHAT_CHANGED, () => setTimeout(handleChatChanged, 80)); eventSource.on(event_types.CHAT_CHANGED, boundHandlers.chatChanged);
eventSource.on(event_types.MESSAGE_DELETED, () => setTimeout(handleMessageDeleted, 50)); eventSource.on(event_types.MESSAGE_DELETED, boundHandlers.messageDeleted);
eventSource.on(event_types.MESSAGE_RECEIVED, () => setTimeout(handleMessageReceived, 150)); eventSource.on(event_types.MESSAGE_RECEIVED, boundHandlers.messageReceived);
eventSource.on(event_types.MESSAGE_SENT, () => setTimeout(handleMessageSent, 150)); eventSource.on(event_types.MESSAGE_SENT, boundHandlers.messageSent);
eventSource.on(event_types.MESSAGE_SENT, handleMessageSentForRecall); eventSource.on(event_types.MESSAGE_SENT, boundHandlers.messageSentRecall);
eventSource.on(event_types.MESSAGE_SWIPED, () => setTimeout(handleMessageSwiped, 100)); eventSource.on(event_types.MESSAGE_SWIPED, boundHandlers.messageSwiped);
eventSource.on(event_types.MESSAGE_UPDATED, () => setTimeout(handleMessageUpdated, 100)); eventSource.on(event_types.MESSAGE_UPDATED, boundHandlers.messageUpdated);
eventSource.on(event_types.MESSAGE_EDITED, () => setTimeout(handleMessageUpdated, 100)); eventSource.on(event_types.MESSAGE_EDITED, boundHandlers.messageEdited);
eventSource.on(event_types.USER_MESSAGE_RENDERED, (data) => setTimeout(() => handleMessageRendered(data), 50)); eventSource.on(event_types.USER_MESSAGE_RENDERED, boundHandlers.userRendered);
eventSource.on(event_types.CHARACTER_MESSAGE_RENDERED, (data) => setTimeout(() => handleMessageRendered(data), 50)); eventSource.on(event_types.CHARACTER_MESSAGE_RENDERED, boundHandlers.charRendered);
// 用户输入捕获(原生捕获阶段) // 用户输入捕获(原生捕获阶段)
document.addEventListener("pointerdown", onSendPointerdown, true); document.addEventListener("pointerdown", onSendPointerdown, true);
document.addEventListener("keydown", onSendKeydown, true); document.addEventListener("keydown", onSendKeydown, true);
// 注入链路 // 注入链路
eventSource.on(event_types.GENERATION_STARTED, handleGenerationStarted); eventSource.on(event_types.GENERATION_STARTED, boundHandlers.genStarted);
eventSource.on(event_types.GENERATION_STOPPED, clearExtensionPrompt); eventSource.on(event_types.GENERATION_STOPPED, boundHandlers.genStopped);
eventSource.on(event_types.GENERATION_ENDED, clearExtensionPrompt); eventSource.on(event_types.GENERATION_ENDED, boundHandlers.genEnded);
} }
function unregisterEvents() { function unregisterEvents() {
CacheRegistry.unregister(MODULE_ID); CacheRegistry.unregister(MODULE_ID);
eventsRegistered = false; eventsRegistered = false;
eventSource.off(event_types.CHAT_CHANGED, boundHandlers.chatChanged);
eventSource.off(event_types.MESSAGE_DELETED, boundHandlers.messageDeleted);
eventSource.off(event_types.MESSAGE_RECEIVED, boundHandlers.messageReceived);
eventSource.off(event_types.MESSAGE_SENT, boundHandlers.messageSent);
eventSource.off(event_types.MESSAGE_SENT, boundHandlers.messageSentRecall);
eventSource.off(event_types.MESSAGE_SWIPED, boundHandlers.messageSwiped);
eventSource.off(event_types.MESSAGE_UPDATED, boundHandlers.messageUpdated);
eventSource.off(event_types.MESSAGE_EDITED, boundHandlers.messageEdited);
eventSource.off(event_types.USER_MESSAGE_RENDERED, boundHandlers.userRendered);
eventSource.off(event_types.CHARACTER_MESSAGE_RENDERED, boundHandlers.charRendered);
eventSource.off(event_types.GENERATION_STARTED, boundHandlers.genStarted);
eventSource.off(event_types.GENERATION_STOPPED, boundHandlers.genStopped);
eventSource.off(event_types.GENERATION_ENDED, boundHandlers.genEnded);
$(".xiaobaix-story-summary-btn").remove(); $(".xiaobaix-story-summary-btn").remove();
hideOverlay(); hideOverlay();

View File

@@ -28,17 +28,17 @@ export async function rerank(query, documents, options = {}) {
if (!query?.trim()) { if (!query?.trim()) {
xbLog.warn(MODULE_ID, 'query 为空,跳过 rerank'); xbLog.warn(MODULE_ID, 'query 为空,跳过 rerank');
return documents.map((_, i) => ({ index: i, relevance_score: 0.5 })); return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
} }
if (!documents?.length) { if (!documents?.length) {
return []; return { results: [], failed: false };
} }
const key = getApiKey(); const key = getApiKey();
if (!key) { if (!key) {
xbLog.warn(MODULE_ID, '未配置 API Key跳过 rerank'); xbLog.warn(MODULE_ID, '未配置 API Key跳过 rerank');
return documents.map((_, i) => ({ index: i, relevance_score: 0.5 })); return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true };
} }
// 截断超长文档列表 // 截断超长文档列表
@@ -61,7 +61,7 @@ export async function rerank(query, documents, options = {}) {
if (!validDocs.length) { if (!validDocs.length) {
xbLog.warn(MODULE_ID, '无有效文档,跳过 rerank'); xbLog.warn(MODULE_ID, '无有效文档,跳过 rerank');
return []; return { results: [], failed: false };
} }
const controller = new AbortController(); const controller = new AbortController();
@@ -106,7 +106,7 @@ export async function rerank(query, documents, options = {}) {
const elapsed = Math.round(performance.now() - T0); const elapsed = Math.round(performance.now() - T0);
xbLog.info(MODULE_ID, `Rerank 完成: ${validDocs.length} docs → ${results.length} selected (${elapsed}ms)`); xbLog.info(MODULE_ID, `Rerank 完成: ${validDocs.length} docs → ${results.length} selected (${elapsed}ms)`);
return mapped; return { results: mapped, failed: false };
} catch (e) { } catch (e) {
clearTimeout(timeoutId); clearTimeout(timeoutId);
@@ -118,10 +118,13 @@ export async function rerank(query, documents, options = {}) {
} }
// 降级:返回原顺序,分数均匀分布 // 降级:返回原顺序,分数均匀分布
return documents.slice(0, topN).map((_, i) => ({ return {
results: documents.slice(0, topN).map((_, i) => ({
index: i, index: i,
relevance_score: 1 - (i / documents.length) * 0.5, relevance_score: 0,
})); })),
failed: true,
};
} }
} }
@@ -138,30 +141,38 @@ export async function rerankChunks(query, chunks, options = {}) {
if (!chunks?.length) return []; if (!chunks?.length) return [];
if (chunks.length <= topN) { if (chunks.length <= topN) {
// 数量不超限,仍然 rerank 以获取分数,但不过滤
const texts = chunks.map(c => c.text || c.semantic || ''); const texts = chunks.map(c => c.text || c.semantic || '');
const results = await rerank(query, texts, { topN: chunks.length, ...options }); const { results, failed } = await rerank(query, texts, { topN: chunks.length, ...options });
if (failed) {
return chunks.map(c => ({ ...c, _rerankScore: 0, _rerankFailed: true }));
}
const scoreMap = new Map(results.map(r => [r.index, r.relevance_score])); const scoreMap = new Map(results.map(r => [r.index, r.relevance_score]));
return chunks.map((c, i) => ({ return chunks.map((c, i) => ({
...c, ...c,
_rerankScore: scoreMap.get(i) ?? 0.5, _rerankScore: scoreMap.get(i) ?? 0,
})).sort((a, b) => b._rerankScore - a._rerankScore); })).sort((a, b) => b._rerankScore - a._rerankScore);
} }
const texts = chunks.map(c => c.text || c.semantic || ''); const texts = chunks.map(c => c.text || c.semantic || '');
const results = await rerank(query, texts, { topN, ...options }); const { results, failed } = await rerank(query, texts, { topN, ...options });
// 过滤低分 + 排序 if (failed) {
const selected = results return chunks.slice(0, topN).map(c => ({
...c,
_rerankScore: 0,
_rerankFailed: true,
}));
}
return results
.filter(r => r.relevance_score >= minScore) .filter(r => r.relevance_score >= minScore)
.sort((a, b) => b.relevance_score - a.relevance_score) .sort((a, b) => b.relevance_score - a.relevance_score)
.map(r => ({ .map(r => ({
...chunks[r.index], ...chunks[r.index],
_rerankScore: r.relevance_score, _rerankScore: r.relevance_score,
})); }));
return selected;
} }
/** /**
@@ -174,7 +185,7 @@ export async function testRerankService() {
} }
try { try {
const results = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 }); const { results } = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 });
return { return {
success: true, success: true,
message: `连接成功,返回 ${results.length} 个结果`, message: `连接成功,返回 ${results.length} 个结果`,

View File

@@ -86,6 +86,7 @@ export function createMetrics() {
l0Candidates: 0, // W-RRF 融合后的 L0 候选数 l0Candidates: 0, // W-RRF 融合后的 L0 候选数
l0Selected: 0, // rerank 后选中的 L0 数 l0Selected: 0, // rerank 后选中的 L0 数
rerankApplied: false, rerankApplied: false,
rerankFailed: false,
beforeRerank: 0, beforeRerank: 0,
afterRerank: 0, afterRerank: 0,
rerankTime: 0, rerankTime: 0,
@@ -283,6 +284,9 @@ export function formatMetricsLog(metrics) {
if (m.evidence.rerankApplied) { if (m.evidence.rerankApplied) {
lines.push(`│ ├─ rerank_applied: true`); lines.push(`│ ├─ rerank_applied: true`);
if (m.evidence.rerankFailed) {
lines.push(`│ ├─ rerank_failed: ⚠ YES (using fusion order)`);
}
lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank}`); lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank}`);
lines.push(`│ │ ├─ after: ${m.evidence.afterRerank}`); lines.push(`│ │ ├─ after: ${m.evidence.afterRerank}`);
lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`); lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`);
@@ -489,6 +493,10 @@ export function detectIssues(metrics) {
// L1 挂载问题 // L1 挂载问题
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
if (m.evidence.rerankFailed) {
issues.push('Rerank API failed — using fusion rank order as fallback, relevance scores are zero');
}
if (m.evidence.l0Selected > 0 && m.evidence.l1Pulled === 0) { if (m.evidence.l0Selected > 0 && m.evidence.l1Pulled === 0) {
issues.push('Zero L1 chunks pulled - L1 vectors may not exist or DB read failed'); issues.push('Zero L1 chunks pulled - L1 vectors may not exist or DB read failed');
} }

View File

@@ -631,6 +631,7 @@ async function locateAndPullEvidence(anchorHits, anchorFloors, queryVector, rera
metrics.evidence.rerankApplied = true; metrics.evidence.rerankApplied = true;
metrics.evidence.beforeRerank = rerankCandidates.length; metrics.evidence.beforeRerank = rerankCandidates.length;
metrics.evidence.afterRerank = rerankedL0.length; metrics.evidence.afterRerank = rerankedL0.length;
metrics.evidence.rerankFailed = rerankedL0.some(c => c._rerankFailed);
metrics.evidence.l0Selected = rerankedL0.length; metrics.evidence.l0Selected = rerankedL0.length;
metrics.evidence.rerankTime = rerankTime; metrics.evidence.rerankTime = rerankTime;
metrics.timing.evidenceRerank = rerankTime; metrics.timing.evidenceRerank = rerankTime;

View File

@@ -49,6 +49,9 @@ let jiebaCut = null;
/** @type {Function|null} jieba add_word 函数引用 */ /** @type {Function|null} jieba add_word 函数引用 */
let jiebaAddWord = null; let jiebaAddWord = null;
/** @type {object|null} TinySegmenter instance; stays null until loadTinySegmenter() succeeds */
let tinySegmenter = null;
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// 实体词典 // 实体词典
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -76,12 +79,13 @@ const STOP_WORDS = new Set([
'时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道', '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道',
'觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们', '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们',
'你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去', '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去',
// 日语助词 + 常见虚词 // 日语常见虚词≥2字匹配 TinySegmenter 产出粒度)
'は', 'が', 'を', '', '', '', '', '', '', '', 'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある',
'か', 'な', 'よ', '', '', 'だ', 'です', 'ます', 'た', 'て', 'なる', 'れる', 'られ', 'られる',
'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない',
'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ', 'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ',
'これ', 'それ', 'あれ', 'どれ', 'これ', 'それ', 'あれ', 'どれ',
'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ',
'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり',
// 英文常见停用词 // 英文常见停用词
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
@@ -100,6 +104,34 @@ const STOP_WORDS = new Set([
// Unicode 分类 // Unicode 分类
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
/**
 * Tells whether a character code is a kana (hiragana or katakana).
 *
 * Covered ranges:
 *  - 0x3040-0x309F  Hiragana
 *  - 0x30A0-0x30FF  Katakana
 *  - 0x31F0-0x31FF  Katakana Extensions
 *  - 0xFF65-0xFF9F  Halfwidth Katakana
 *
 * @param {number} code - character code / code point to classify
 * @returns {boolean} true when `code` lies in one of the kana ranges
 */
function isKana(code) {
    if (code < 0x3040) return false;
    // Hiragana (0x3040-0x309F) and Katakana (0x30A0-0x30FF) are adjacent,
    // so together they form one contiguous span.
    if (code <= 0x30FF) return true;
    if (code >= 0x31F0 && code <= 0x31FF) return true;
    return code >= 0xFF65 && code <= 0xFF9F;
}
/**
 * Tells whether a character code is a CJK ideograph (kana excluded).
 *
 * Covered ranges:
 *  - 0x3400-0x4DBF    CJK Extension A
 *  - 0x4E00-0x9FFF    CJK Unified Ideographs
 *  - 0xF900-0xFAFF    CJK Compatibility Ideographs
 *  - 0x20000-0x2A6DF  CJK Extension B
 *
 * The Extension B range lies above 0xFFFF, so it can only match when the
 * caller passes a full code point (e.g. from codePointAt) rather than a
 * UTF-16 code unit.
 *
 * @param {number} code - character code / code point to classify
 * @returns {boolean} true when `code` lies in one of the ideograph ranges
 */
function isCJK(code) {
    if (code >= 0x3400 && code <= 0x4DBF) return true;
    if (code >= 0x4E00 && code <= 0x9FFF) return true;
    if (code >= 0xF900 && code <= 0xFAFF) return true;
    return code >= 0x20000 && code <= 0x2A6DF;
}
/** /**
* 判断字符是否为亚洲文字CJK + 假名) * 判断字符是否为亚洲文字CJK + 假名)
* @param {number} code - charCode * @param {number} code - charCode
@@ -107,14 +139,7 @@ const STOP_WORDS = new Set([
*/ */
function isAsian(code) { function isAsian(code) {
return ( return (
(code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs isCJK(code) || isKana(code)
(code >= 0x3400 && code <= 0x4DBF) || // CJK Extension A
(code >= 0x3040 && code <= 0x309F) || // Hiragana
(code >= 0x30A0 && code <= 0x30FF) || // Katakana
(code >= 0x31F0 && code <= 0x31FF) || // Katakana Phonetic Extensions
(code >= 0xFF65 && code <= 0xFF9F) || // Halfwidth Katakana
(code >= 0xF900 && code <= 0xFAFF) || // CJK Compatibility Ideographs
(code >= 0x20000 && code <= 0x2A6DF) // CJK Extension B
); );
} }
@@ -195,6 +220,31 @@ function segmentByScript(text) {
return segments; return segments;
} }
// ═══════════════════════════════════════════════════════════════════════════
// 亚洲文字语言检测(中文 vs 日语)
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Guesses the language of an Asian-script segment.
 *
 * The text is walked code point by code point; kana and CJK ideographs are
 * tallied separately and every other character is ignored. When kana make
 * up strictly more than 30% of the tallied characters the segment is
 * treated as Japanese, otherwise as Chinese. A segment with neither kana
 * nor ideographs yields 'other'.
 *
 * @param {string} text - Asian-script segment
 * @returns {'zh'|'ja'|'other'}
 */
function detectAsianLanguage(text) {
    const tally = { kana: 0, ideograph: 0 };

    for (const glyph of text) {
        const codePoint = glyph.codePointAt(0);
        if (isKana(codePoint)) {
            tally.kana += 1;
            continue;
        }
        if (isCJK(codePoint)) {
            tally.ideograph += 1;
        }
    }

    const counted = tally.kana + tally.ideograph;
    if (counted === 0) {
        return 'other';
    }

    const kanaShare = tally.kana / counted;
    if (kanaShare > 0.3) {
        return 'ja';
    }
    return 'zh';
}
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// 实体保护(最长匹配占位符替换) // 实体保护(最长匹配占位符替换)
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -343,6 +393,26 @@ function tokenizeAsianFallback(text) {
return tokens; return tokens;
} }
/**
 * Tokenizes a Japanese segment with TinySegmenter.
 *
 * Each produced word is stringified and trimmed; words shorter than two
 * characters are dropped. If no TinySegmenter instance is available, or
 * segmentation throws, the generic Asian fallback tokenizer is used instead
 * (a warning is logged in the throwing case).
 *
 * @param {string} text - Japanese text segment
 * @returns {string[]} tokens
 */
function tokenizeJapanese(text) {
    // Segmenter not loaded (yet): go straight to the fallback.
    if (!tinySegmenter) return tokenizeAsianFallback(text);

    try {
        const pieces = tinySegmenter.segment(text);
        const cleaned = pieces.map((piece) => String(piece || '').trim());
        return cleaned.filter((piece) => piece.length >= 2);
    } catch (e) {
        xbLog.warn(MODULE_ID, 'TinySegmenter 分词异常,降级处理', e);
    }

    // Only reached after a segmentation error.
    return tokenizeAsianFallback(text);
}
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// 分词:拉丁文字 // 分词:拉丁文字
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -374,6 +444,9 @@ function tokenizeLatin(text) {
* @returns {Promise<boolean>} 是否加载成功 * @returns {Promise<boolean>} 是否加载成功
*/ */
export async function preload() { export async function preload() {
// TinySegmenter 独立于结巴状态(内部有防重入)
loadTinySegmenter();
// 已就绪 // 已就绪
if (wasmState === WasmState.READY) return true; if (wasmState === WasmState.READY) return true;
@@ -443,6 +516,25 @@ export async function preload() {
} }
} }
/**
 * Lazily loads TinySegmenter and stores a single instance in `tinySegmenter`.
 *
 * Callers need not await it. All failures are caught and logged as a
 * warning, leaving `tinySegmenter` unset so that Japanese text keeps going
 * through the fallback tokenizer.
 *
 * Safe to call repeatedly and concurrently: the instance is created at most
 * once, including when several calls are waiting on the import at the same
 * time.
 *
 * @returns {Promise<void>}
 */
async function loadTinySegmenter() {
    if (tinySegmenter) return;

    try {
        // eslint-disable-next-line no-unsanitized/method
        const mod = await import(
            `/${extensionFolderPath}/libs/tiny-segmenter.js`
        );

        // Another call may have finished loading while this one was awaiting
        // the import; keep the first instance instead of building a duplicate.
        if (tinySegmenter) return;

        const Ctor = mod.TinySegmenter || mod.default;
        if (typeof Ctor !== 'function') {
            // Surface a precise reason instead of "Ctor is not a constructor".
            throw new TypeError('tiny-segmenter.js does not export a TinySegmenter constructor');
        }

        tinySegmenter = new Ctor();
        xbLog.info(MODULE_ID, 'TinySegmenter 加载完成');
    } catch (e) {
        xbLog.warn(MODULE_ID, 'TinySegmenter 加载失败,日语将使用降级分词', e);
    }
}
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
// 公开接口isReady // 公开接口isReady
// ═══════════════════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════════════════
@@ -598,7 +690,10 @@ function tokenizeCore(text) {
const rawTokens = []; const rawTokens = [];
for (const seg of segments) { for (const seg of segments) {
if (seg.type === 'asian') { if (seg.type === 'asian') {
if (wasmState === WasmState.READY && jiebaCut) { const lang = detectAsianLanguage(seg.text);
if (lang === 'ja') {
rawTokens.push(...tokenizeJapanese(seg.text));
} else if (wasmState === WasmState.READY && jiebaCut) {
rawTokens.push(...tokenizeAsianJieba(seg.text)); rawTokens.push(...tokenizeAsianJieba(seg.text));
} else { } else {
rawTokens.push(...tokenizeAsianFallback(seg.text)); rawTokens.push(...tokenizeAsianFallback(seg.text));