diff --git a/docs/NOTICE b/docs/NOTICE index 1d189ae..f10911b 100644 --- a/docs/NOTICE +++ b/docs/NOTICE @@ -93,3 +93,7 @@ For complete license terms, see LICENSE.md For attribution requirements, see COPYRIGHT Last updated: 2025-01-14 +TinySegmenter 0.2 +Copyright (c) 2008 Taku Kudo +New BSD License +http://www.chasen.org/~taku/software/TinySegmenter/ diff --git a/libs/tiny-segmenter.js b/libs/tiny-segmenter.js new file mode 100644 index 0000000..121c1ea --- /dev/null +++ b/libs/tiny-segmenter.js @@ -0,0 +1,177 @@ +// TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript +// (c) 2008 Taku Kudo +// TinySegmenter is freely distributable under the terms of a new BSD licence. +// For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt + +function TinySegmenter() { + var patterns = { + "[一二三四五六七八九十百千万億兆]":"M", + "[一-龠々〆ヵヶ]":"H", + "[ぁ-ん]":"I", + "[ァ-ヴーア-ン゙ー]":"K", + "[a-zA-Za-zA-Z]":"A", + "[0-90-9]":"N" + } + this.chartype_ = []; + for (var i in patterns) { + var regexp = new RegExp; + regexp.compile(i) + this.chartype_.push([regexp, patterns[i]]); + } + + this.BIAS__ = -332 + this.BC1__ = {"HH":6,"II":2461,"KH":406,"OH":-1378}; + this.BC2__ = {"AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920}; + this.BC3__ = {"HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266}; + this.BP1__ = {"BB":295,"OB":304,"OO":-125,"UB":352}; + this.BP2__ = {"BO":60,"OO":-1762}; + this.BQ1__ = {"BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965}; + this.BQ2__ = {"BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146}; + this.BQ3__ = 
{"BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699}; + this.BQ4__ = {"BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973}; + this.BW1__ = {",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505,"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790,"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977,"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127,"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379,"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448,"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860,"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336,"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1あ":1404,"B1同":542,"」と":1682}; + this.BW2__ = 
{"..":-11822,"11":-669,"――":-5730,"−−":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602,"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193,"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939,"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589,"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666,"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720,"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615,"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059,"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621,"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651,"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728,"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163,"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626,"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355,"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"11":-669}; + this.BW3__ = 
{"あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029,"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857,"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816,"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397,"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562,"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426,"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098,"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437,"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796,"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724,"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549,"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850,"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860,"入り":1232,"大会":2217,"始め":1681,"市":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"カ月":990}; + this.TC1__ = {"AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832}; + this.TC2__ = {"HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649}; + this.TC3__ = {"AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393}; + this.TC4__ = {"HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841}; + this.TQ1__ = 
{"BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68}; + this.TQ2__ = {"BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591}; + this.TQ3__ = {"BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685}; + this.TQ4__ = {"BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156}; + this.TW1__ = {"につい":-4681,"東京都":2026}; + this.TW2__ = {"ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430,"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882,"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216}; + this.TW3__ = {"いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247,"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287}; + this.TW4__ = {"いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538,"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865}; + this.UC1__ = {"A":484,"K":93,"M":645,"O":-505}; + this.UC2__ = {"A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646}; + this.UC3__ = {"A":-1370,"I":2311}; + this.UC4__ = {"A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646}; + this.UC5__ = {"H":313,"I":-1238,"K":-799,"M":539,"O":-831}; + this.UC6__ = {"H":-506,"I":-253,"K":87,"M":247,"O":-387}; + this.UP1__ = {"O":-214}; + this.UP2__ = {"B":69,"O":935}; + this.UP3__ = {"B":189}; + this.UQ1__ = {"BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422}; + this.UQ2__ = {"BH":216,"BI":113,"OK":1759}; + this.UQ3__ = {"BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212}; + this.UW1__ = 
{",":156,"、":156,"「":-463,"あ":-941,"う":-127,"が":-553,"き":121,"こ":505,"で":-201,"と":-547,"ど":-123,"に":-789,"の":-185,"は":-847,"も":-466,"や":-470,"よ":182,"ら":-292,"り":208,"れ":169,"を":-446,"ん":-137,"・":-135,"主":-402,"京":-268,"区":-912,"午":871,"国":-460,"大":561,"委":729,"市":-411,"日":-141,"理":361,"生":-408,"県":-386,"都":-718,"「":-463,"・":-135}; + this.UW2__ = {",":-829,"、":-829,"〇":892,"「":-645,"」":3145,"あ":-538,"い":505,"う":134,"お":-502,"か":1454,"が":-856,"く":-412,"こ":1141,"さ":878,"ざ":540,"し":1529,"す":-675,"せ":300,"そ":-1011,"た":188,"だ":1837,"つ":-949,"て":-291,"で":-268,"と":-981,"ど":1273,"な":1063,"に":-1764,"の":130,"は":-409,"ひ":-1273,"べ":1261,"ま":600,"も":-1263,"や":-402,"よ":1639,"り":-579,"る":-694,"れ":571,"を":-2516,"ん":2095,"ア":-587,"カ":306,"キ":568,"ッ":831,"三":-758,"不":-2150,"世":-302,"中":-968,"主":-861,"事":492,"人":-123,"会":978,"保":362,"入":548,"初":-3025,"副":-1566,"北":-3414,"区":-422,"大":-1769,"天":-865,"太":-483,"子":-1519,"学":760,"実":1023,"小":-2009,"市":-813,"年":-1060,"強":1067,"手":-1519,"揺":-1033,"政":1522,"文":-1355,"新":-1682,"日":-1815,"明":-1462,"最":-630,"朝":-1843,"本":-1650,"東":-931,"果":-665,"次":-2378,"民":-180,"気":-1740,"理":752,"発":529,"目":-1584,"相":-242,"県":-1165,"立":-763,"第":810,"米":509,"自":-1353,"行":838,"西":-744,"見":-3874,"調":1010,"議":1198,"込":3041,"開":1758,"間":-1257,"「":-645,"」":3145,"ッ":831,"ア":-587,"カ":306,"キ":568}; + this.UW3__ = 
{",":4889,"1":-800,"−":-1723,"、":4889,"々":-2311,"〇":5827,"」":2670,"〓":-3573,"あ":-2696,"い":1006,"う":2342,"え":1983,"お":-4864,"か":-1163,"が":3271,"く":1004,"け":388,"げ":401,"こ":-3552,"ご":-3116,"さ":-1058,"し":-395,"す":584,"せ":3685,"そ":-5228,"た":842,"ち":-521,"っ":-1444,"つ":-1081,"て":6167,"で":2318,"と":1691,"ど":-899,"な":-2788,"に":2745,"の":4056,"は":4555,"ひ":-2171,"ふ":-1798,"へ":1199,"ほ":-5516,"ま":-4384,"み":-120,"め":1205,"も":2323,"や":-788,"よ":-202,"ら":727,"り":649,"る":5905,"れ":2773,"わ":-1207,"を":6620,"ん":-518,"ア":551,"グ":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278,"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"中":653,"主":-758,"予":-1193,"二":974,"人":2742,"今":792,"他":1889,"以":-1368,"低":811,"何":4265,"作":-361,"保":-2439,"元":4858,"党":3593,"全":1574,"公":-3030,"六":755,"共":-1880,"円":5807,"再":3095,"分":457,"初":2475,"別":1129,"前":2286,"副":4437,"力":365,"動":-949,"務":-1872,"化":1327,"北":-1038,"区":4646,"千":-2309,"午":-783,"協":-1006,"口":483,"右":1233,"各":3588,"合":-241,"同":3906,"和":-837,"員":4513,"国":642,"型":1389,"場":1219,"外":-241,"妻":2016,"学":-1356,"安":-423,"実":-1008,"家":1078,"小":-513,"少":-3102,"州":1155,"市":3197,"平":-1804,"年":2416,"広":-1030,"府":1605,"度":1452,"建":-2352,"当":-3885,"得":1905,"思":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013,"教":-1479,"数":3222,"文":-1489,"新":1764,"日":2099,"旧":5792,"昨":-661,"時":-1248,"曜":-951,"最":-937,"月":4125,"期":360,"李":3094,"村":364,"東":-805,"核":5156,"森":2438,"業":484,"氏":2613,"民":-1694,"決":-1073,"法":1868,"海":-495,"無":979,"物":461,"特":-3850,"生":-273,"用":914,"町":1215,"的":7313,"直":-1835,"省":792,"県":6293,"知":-1528,"私":4231,"税":401,"立":-960,"第":1201,"米":7767,"系":3066,"約":3663,"級":1384,"統":-4229,"総":1163,"線":1255,"者":6457,"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"費":1777,"車":1835,"軍":1375,"込":-1504,"通":-1136,"選":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"長":421,"開":-1432,"間":1302,"関":-1282,"雨":2009,"電":-1045,"非":2066,"駅":1620,"1":-800,"」":2670,"・":-3794,"ッ":-1350,"ア":551,"グ":1319,"ス":874,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278}; + this.UW4__ = 
{",":3930,".":3508,"―":-4841,"、":3930,"。":3508,"〇":4999,"「":1895,"」":3798,"〓":-5156,"あ":4752,"い":-3435,"う":-640,"え":-2514,"お":2405,"か":530,"が":6006,"き":-4482,"ぎ":-3821,"く":-3788,"け":-4376,"げ":-4734,"こ":2255,"ご":1979,"さ":2864,"し":-843,"じ":-2506,"す":-731,"ず":1251,"せ":181,"そ":4091,"た":5034,"だ":5408,"ち":-3654,"っ":-5882,"つ":-1659,"て":3994,"で":7410,"と":4547,"な":5433,"に":6499,"ぬ":1853,"ね":1413,"の":7396,"は":8578,"ば":1940,"ひ":4249,"び":-4134,"ふ":1345,"へ":6665,"べ":-744,"ほ":1464,"ま":1051,"み":-2082,"む":-882,"め":-5046,"も":4169,"ゃ":-2666,"や":2795,"ょ":-1544,"よ":3351,"ら":-2922,"り":-9726,"る":-14896,"れ":-2613,"ろ":-4570,"わ":-1783,"を":13150,"ん":-2352,"カ":2145,"コ":1789,"セ":1287,"ッ":-724,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"中":2210,"予":782,"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"側":4292,"先":601,"党":-2006,"共":-1212,"内":584,"円":788,"初":1347,"前":1623,"副":3879,"力":-302,"動":-740,"務":-2715,"化":776,"区":4517,"協":1013,"参":1555,"合":-1834,"和":-681,"員":-910,"器":-851,"回":1500,"国":-619,"園":-1200,"地":866,"場":-1410,"塁":-2094,"士":-1413,"多":1067,"大":571,"子":-4802,"学":-1397,"定":-1057,"寺":-809,"小":1910,"屋":-1328,"山":-1500,"島":-2056,"川":-2667,"市":2771,"年":374,"庁":-4556,"後":456,"性":553,"感":916,"所":-1566,"支":856,"改":787,"政":2182,"教":704,"文":522,"方":-856,"日":1798,"時":1829,"最":845,"月":-9066,"木":-485,"来":-442,"校":-360,"業":-1043,"氏":5388,"民":-2716,"気":-910,"沢":-939,"済":-543,"物":-735,"率":672,"球":-1267,"生":-1286,"産":-1101,"田":-2900,"町":1826,"的":2586,"目":922,"省":-3485,"県":2997,"空":-867,"立":-2112,"第":788,"米":2937,"系":786,"約":2171,"経":1146,"統":-1169,"総":940,"線":-994,"署":749,"者":2145,"能":-730,"般":-852,"行":-792,"規":792,"警":-1184,"議":-244,"谷":-1000,"賞":730,"車":-1481,"軍":1158,"輪":-1433,"込":-3370,"近":929,"道":-1291,"選":2596,"郎":-4866,"都":1192,"野":-1100,"銀":-2213,"長":357,"間":-2344,"院":-2297,"際":-2604,"電":-878,"領":-1659,"題":-792,"館":-1984,"首":1749,"高":2120,"「":1895,"」":3798,"・":-4371,"ッ":-724,"ー":-11870,"カ":2145,"コ":1789,"セ":1287,"ト":-403,"メ":-1635,"ラ":-8
81,"リ":-541,"ル":-856,"ン":-3637}; + this.UW5__ = {",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"、":465,"。":-299,"「":363,"あ":1655,"い":331,"う":-503,"え":1199,"お":527,"か":647,"が":-421,"き":1624,"ぎ":1971,"く":312,"げ":-983,"さ":-1537,"し":-1371,"す":-852,"だ":-1186,"ち":1093,"っ":52,"つ":921,"て":-18,"で":-850,"と":-127,"ど":1682,"な":-787,"に":-1224,"の":-635,"は":-578,"べ":1001,"み":502,"め":865,"ゃ":3350,"ょ":854,"り":-208,"る":429,"れ":504,"わ":419,"を":-1264,"ん":327,"イ":241,"ル":451,"ン":-343,"中":-871,"京":722,"会":-1153,"党":-654,"務":3519,"区":-901,"告":848,"員":2104,"大":-1296,"学":-548,"定":1785,"嵐":-1304,"市":-2991,"席":921,"年":1763,"思":872,"所":-814,"挙":1618,"新":-1682,"日":218,"月":-4353,"査":932,"格":1356,"機":-1508,"氏":-1347,"田":240,"町":-3912,"的":-3149,"相":1319,"省":-1052,"県":-4003,"研":-997,"社":-278,"空":-813,"統":1955,"者":-2233,"表":663,"語":-1073,"議":1219,"選":-1018,"郎":-368,"長":786,"間":1191,"題":2368,"館":-689,"1":-514,"E2":-32768,"「":363,"イ":241,"ル":451,"ン":-343}; + this.UW6__ = {",":227,".":808,"1":-270,"E1":306,"、":227,"。":808,"あ":-307,"う":189,"か":241,"が":-73,"く":-121,"こ":-200,"じ":1782,"す":383,"た":-428,"っ":573,"て":-1014,"で":101,"と":-105,"な":-253,"に":-149,"の":-417,"は":-236,"も":-206,"り":187,"る":-135,"を":195,"ル":-673,"ン":-496,"一":-277,"中":201,"件":-800,"会":624,"前":302,"区":1792,"員":-1212,"委":798,"学":-960,"市":887,"広":-695,"後":535,"業":-697,"相":753,"社":-507,"福":974,"空":-822,"者":1811,"連":463,"郎":1082,"1":-270,"E1":306,"ル":-673,"ン":-496}; + + return this; +} + +TinySegmenter.prototype.ctype_ = function(str) { + for (var i in this.chartype_) { + if (str.match(this.chartype_[i][0])) { + return this.chartype_[i][1]; + } + } + return "O"; +} + +TinySegmenter.prototype.ts_ = function(v) { + if (v) { return v; } + return 0; +} + +TinySegmenter.prototype.segment = function(input) { + if (input == null || input == undefined || input == "") { + return []; + } + var result = []; + var seg = ["B3","B2","B1"]; + var ctype = ["O","O","O"]; + var o = input.split(""); + for (i = 0; i < o.length; ++i) { + seg.push(o[i]); + 
ctype.push(this.ctype_(o[i])) + } + seg.push("E1"); + seg.push("E2"); + seg.push("E3"); + ctype.push("O"); + ctype.push("O"); + ctype.push("O"); + var word = seg[3]; + var p1 = "U"; + var p2 = "U"; + var p3 = "U"; + for (var i = 4; i < seg.length - 3; ++i) { + var score = this.BIAS__; + var w1 = seg[i-3]; + var w2 = seg[i-2]; + var w3 = seg[i-1]; + var w4 = seg[i]; + var w5 = seg[i+1]; + var w6 = seg[i+2]; + var c1 = ctype[i-3]; + var c2 = ctype[i-2]; + var c3 = ctype[i-1]; + var c4 = ctype[i]; + var c5 = ctype[i+1]; + var c6 = ctype[i+2]; + score += this.ts_(this.UP1__[p1]); + score += this.ts_(this.UP2__[p2]); + score += this.ts_(this.UP3__[p3]); + score += this.ts_(this.BP1__[p1 + p2]); + score += this.ts_(this.BP2__[p2 + p3]); + score += this.ts_(this.UW1__[w1]); + score += this.ts_(this.UW2__[w2]); + score += this.ts_(this.UW3__[w3]); + score += this.ts_(this.UW4__[w4]); + score += this.ts_(this.UW5__[w5]); + score += this.ts_(this.UW6__[w6]); + score += this.ts_(this.BW1__[w2 + w3]); + score += this.ts_(this.BW2__[w3 + w4]); + score += this.ts_(this.BW3__[w4 + w5]); + score += this.ts_(this.TW1__[w1 + w2 + w3]); + score += this.ts_(this.TW2__[w2 + w3 + w4]); + score += this.ts_(this.TW3__[w3 + w4 + w5]); + score += this.ts_(this.TW4__[w4 + w5 + w6]); + score += this.ts_(this.UC1__[c1]); + score += this.ts_(this.UC2__[c2]); + score += this.ts_(this.UC3__[c3]); + score += this.ts_(this.UC4__[c4]); + score += this.ts_(this.UC5__[c5]); + score += this.ts_(this.UC6__[c6]); + score += this.ts_(this.BC1__[c2 + c3]); + score += this.ts_(this.BC2__[c3 + c4]); + score += this.ts_(this.BC3__[c4 + c5]); + score += this.ts_(this.TC1__[c1 + c2 + c3]); + score += this.ts_(this.TC2__[c2 + c3 + c4]); + score += this.ts_(this.TC3__[c3 + c4 + c5]); + score += this.ts_(this.TC4__[c4 + c5 + c6]); +// score += this.ts_(this.TC5__[c4 + c5 + c6]); + score += this.ts_(this.UQ1__[p1 + c1]); + score += this.ts_(this.UQ2__[p2 + c2]); + score += this.ts_(this.UQ3__[p3 + c3]); + score += 
this.ts_(this.BQ1__[p2 + c2 + c3]); + score += this.ts_(this.BQ2__[p2 + c3 + c4]); + score += this.ts_(this.BQ3__[p3 + c2 + c3]); + score += this.ts_(this.BQ4__[p3 + c3 + c4]); + score += this.ts_(this.TQ1__[p2 + c1 + c2 + c3]); + score += this.ts_(this.TQ2__[p2 + c2 + c3 + c4]); + score += this.ts_(this.TQ3__[p3 + c1 + c2 + c3]); + score += this.ts_(this.TQ4__[p3 + c2 + c3 + c4]); + var p = "O"; + if (score > 0) { + result.push(word); + word = ""; + p = "B"; + } + p1 = p2; + p2 = p3; + p3 = p; + word += seg[i]; + } + result.push(word); + + return result; +} +export { TinySegmenter }; diff --git a/modules/story-summary/story-summary.js b/modules/story-summary/story-summary.js index fd67322..4d3fa92 100644 --- a/modules/story-summary/story-summary.js +++ b/modules/story-summary/story-summary.js @@ -1,4 +1,4 @@ -// ═══════════════════════════════════════════════════════════════════════════ +// ═══════════════════════════════════════════════════════════════════════════ // Story Summary - 主入口 // // 稳定目标: @@ -107,16 +107,43 @@ const MESSAGE_EVENT = "message"; // 状态变量 // ═══════════════════════════════════════════════════════════════════════════ -let summaryGenerating = false; let overlayCreated = false; let frameReady = false; let currentMesId = null; let pendingFrameMessages = []; let eventsRegistered = false; -let vectorGenerating = false; let vectorCancelled = false; let vectorAbortController = null; -let anchorGenerating = false; + +// ═══════════════════════════════════════════════════════════════════════════ +// TaskGuard — 互斥任务管理(summary / vector / anchor) +// ═══════════════════════════════════════════════════════════════════════════ + +class TaskGuard { + #running = new Set(); + + acquire(taskName) { + if (this.#running.has(taskName)) return null; + this.#running.add(taskName); + let released = false; + return () => { + if (!released) { + released = true; + this.#running.delete(taskName); + } + }; + } + + isRunning(taskName) { + return 
this.#running.has(taskName); + } + + isAnyRunning(...taskNames) { + return taskNames.some(t => this.#running.has(t)); + } +} + +const guard = new TaskGuard(); // 用户消息缓存(解决 GENERATION_STARTED 时 chat 尚未包含用户消息的问题) let lastSentUserMessage = null; @@ -219,13 +246,12 @@ async function unhideAllMessages() { // 生成状态管理 // ═══════════════════════════════════════════════════════════════════════════ -function setSummaryGenerating(flag) { - summaryGenerating = !!flag; - postToFrame({ type: "GENERATION_STATE", isGenerating: summaryGenerating }); +function isSummaryGenerating() { + return guard.isRunning('summary'); } -function isSummaryGenerating() { - return summaryGenerating; +function notifySummaryState() { + postToFrame({ type: "GENERATION_STATE", isGenerating: guard.isRunning('summary') }); } // ═══════════════════════════════════════════════════════════════════════════ @@ -300,38 +326,35 @@ async function sendAnchorStatsToFrame() { } async function handleAnchorGenerate() { - if (anchorGenerating) return; - - const vectorCfg = getVectorConfig(); - if (!vectorCfg?.enabled) { - await executeSlashCommand("/echo severity=warning 请先启用向量检索"); - return; - } - - if (!vectorCfg.online?.key) { - postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" }); - return; - } - - const { chatId, chat } = getContext(); - if (!chatId || !chat?.length) return; - - anchorGenerating = true; - - postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "分析中..." 
}); + const release = guard.acquire('anchor'); + if (!release) return; try { - // Phase 1: L0 提取 + Phase 2: L0 向量化(在 incrementalExtractAtoms 内部完成) + const vectorCfg = getVectorConfig(); + if (!vectorCfg?.enabled) { + await executeSlashCommand("/echo severity=warning 请先启用向量检索"); + return; + } + + if (!vectorCfg.online?.key) { + postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" }); + return; + } + + const { chatId, chat } = getContext(); + if (!chatId || !chat?.length) return; + + postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "分析中..." }); + await incrementalExtractAtoms(chatId, chat, (message, current, total) => { postToFrame({ type: "ANCHOR_GEN_PROGRESS", current, total, message }); }); - // Phase 3: 处理 pending L1 Chunks postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: 0, total: 1, message: "向量化 L1..." }); await buildIncrementalChunks({ vectorConfig: vectorCfg }); - + invalidateLexicalIndex(); - + await sendAnchorStatsToFrame(); await sendVectorStatsToFrame(); @@ -340,7 +363,7 @@ async function handleAnchorGenerate() { xbLog.error(MODULE_ID, "记忆锚点生成失败", e); await executeSlashCommand(`/echo severity=error 记忆锚点生成失败:${e.message}`); } finally { - anchorGenerating = false; + release(); postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 }); } } @@ -359,7 +382,6 @@ async function handleAnchorClear() { function handleAnchorCancel() { cancelL0Extraction(); - anchorGenerating = false; postToFrame({ type: "ANCHOR_GEN_PROGRESS", current: -1, total: 0 }); } @@ -378,142 +400,159 @@ async function handleTestOnlineService(provider, config) { } async function handleGenerateVectors(vectorCfg) { - if (vectorGenerating) return; + const release = guard.acquire('vector'); + if (!release) return; - if (!vectorCfg?.enabled) { - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 }); - return; - } + try { + if (!vectorCfg?.enabled) { + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: 
"ALL", current: -1, total: 0 }); + return; + } - const { chatId, chat } = getContext(); - if (!chatId || !chat?.length) return; + const { chatId, chat } = getContext(); + if (!chatId || !chat?.length) return; - if (!vectorCfg.online?.key) { - postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" }); - return; - } + if (!vectorCfg.online?.key) { + postToFrame({ type: "VECTOR_ONLINE_STATUS", status: "error", message: "请配置 API Key" }); + return; + } - vectorGenerating = true; - vectorCancelled = false; - vectorAbortController = new AbortController(); + vectorCancelled = false; + vectorAbortController = new AbortController(); - const fingerprint = getEngineFingerprint(vectorCfg); - const batchSize = 20; + const fingerprint = getEngineFingerprint(vectorCfg); + const batchSize = 20; - await clearAllChunks(chatId); - await clearEventVectors(chatId); - await clearStateVectors(chatId); - await updateMeta(chatId, { lastChunkFloor: -1, fingerprint }); + await clearAllChunks(chatId); + await clearEventVectors(chatId); + await clearStateVectors(chatId); + await updateMeta(chatId, { lastChunkFloor: -1, fingerprint }); - const atoms = getStateAtoms(); - if (!atoms.length) { - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: 0, message: "L0 为空,跳过" }); - } else { - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: atoms.length, message: "L0 向量化..." }); + const atoms = getStateAtoms(); + if (!atoms.length) { + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: 0, message: "L0 为空,跳过" }); + } else { + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: 0, total: atoms.length, message: "L0 向量化..." 
}); - let l0Completed = 0; - for (let i = 0; i < atoms.length; i += batchSize) { - if (vectorCancelled) break; + let l0Completed = 0; + for (let i = 0; i < atoms.length; i += batchSize) { + if (vectorCancelled) break; - const batch = atoms.slice(i, i + batchSize); - const texts = batch.map(a => a.semantic); - try { - const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal }); - const items = batch.map((a, j) => ({ - atomId: a.atomId, - floor: a.floor, - vector: vectors[j], - })); - await saveStateVectors(chatId, items, fingerprint); - l0Completed += batch.length; - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: l0Completed, total: atoms.length }); - } catch (e) { - if (e?.name === "AbortError") break; - xbLog.error(MODULE_ID, "L0 向量化失败", e); - vectorCancelled = true; - break; + const batch = atoms.slice(i, i + batchSize); + const texts = batch.map(a => a.semantic); + try { + const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal }); + const items = batch.map((a, j) => ({ + atomId: a.atomId, + floor: a.floor, + vector: vectors[j], + })); + await saveStateVectors(chatId, items, fingerprint); + l0Completed += batch.length; + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L0", current: l0Completed, total: atoms.length }); + } catch (e) { + if (e?.name === "AbortError") break; + xbLog.error(MODULE_ID, "L0 向量化失败", e); + vectorCancelled = true; + break; + } } } - } - if (vectorCancelled) { - vectorGenerating = false; - return; - } + if (vectorCancelled) return; - const allChunks = []; - for (let floor = 0; floor < chat.length; floor++) { - const chunks = chunkMessage(floor, chat[floor]); - allChunks.push(...chunks); - } + const allChunks = []; + for (let floor = 0; floor < chat.length; floor++) { + if (vectorCancelled) break; - if (allChunks.length > 0) { - await saveChunks(chatId, allChunks); - } + const message = chat[floor]; + if (!message) continue; - const l1Texts = allChunks.map(c => 
c.text); - const store = getSummaryStore(); - const events = store?.json?.events || []; + const { chunks, status } = chunkMessage(message, floor, vectorCfg, true); + if (status === "skip") continue; - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: l1Texts.length }); - - const l1Vectors = []; - let completed = 0; - for (let i = 0; i < l1Texts.length; i += batchSize) { - if (vectorCancelled) break; - - const batch = l1Texts.slice(i, i + batchSize); - try { - const vectors = await embed(batch, vectorCfg, { signal: vectorAbortController.signal }); - l1Vectors.push(...vectors); - completed += batch.length; - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: completed, total: l1Texts.length }); - } catch (e) { - if (e?.name === 'AbortError') break; - xbLog.error(MODULE_ID, 'L1 向量化失败', e); - vectorCancelled = true; - break; + allChunks.push(...chunks); } - } - if (!vectorCancelled && l1Vectors.length > 0) { - const items = allChunks.map((c, i) => ({ chunkId: c.chunkId, vector: l1Vectors[i] })).filter(x => x.vector); - await saveChunkVectors(chatId, items, fingerprint); - await updateMeta(chatId, { lastChunkFloor: chat.length - 1 }); - } + let l1Vectors = []; + if (!allChunks.length) { + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: 0, message: "L1 为空,跳过" }); + } else { + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: 0, total: allChunks.length, message: "L1 向量化..." 
}); + await saveChunks(chatId, allChunks); - const l2Pairs = events - .map(e => ({ id: e.id, text: `${e.title || ''} ${e.summary || ''}`.trim() })) - .filter(p => p.text); + let l1Completed = 0; + for (let i = 0; i < allChunks.length; i += batchSize) { + if (vectorCancelled) break; - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: l2Pairs.length }); - let l2Completed = 0; - for (let i = 0; i < l2Pairs.length; i += batchSize) { - if (vectorCancelled) break; - - const batch = l2Pairs.slice(i, i + batchSize); - try { - const vectors = await embed(batch.map(p => p.text), vectorCfg, { signal: vectorAbortController.signal }); - const items = batch.map((p, j) => ({ eventId: p.id, vector: vectors[j] })); - await saveEventVectorsToDb(chatId, items, fingerprint); - l2Completed += batch.length; - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: l2Completed, total: l2Pairs.length }); - } catch (e) { - if (e?.name === 'AbortError') break; - xbLog.error(MODULE_ID, 'L2 向量化失败', e); - vectorCancelled = true; - break; + const batch = allChunks.slice(i, i + batchSize); + const texts = batch.map(c => c.text); + try { + const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal }); + const items = batch.map((c, j) => ({ + chunkId: c.chunkId, + vector: vectors[j], + })); + await saveChunkVectors(chatId, items, fingerprint); + l1Vectors = l1Vectors.concat(items); + l1Completed += batch.length; + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L1", current: l1Completed, total: allChunks.length }); + } catch (e) { + if (e?.name === "AbortError") break; + xbLog.error(MODULE_ID, "L1 向量化失败", e); + vectorCancelled = true; + break; + } + } } + + if (vectorCancelled) return; + + const store = getSummaryStore(); + const events = store?.json?.events || []; + + const l2Pairs = events + .map((e) => ({ id: e.id, text: `${e.title || ""} ${e.summary || ""}`.trim() })) + .filter((p) => p.text); + + if (!l2Pairs.length) { + 
postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: 0, message: "L2 为空,跳过" }); + } else { + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: 0, total: l2Pairs.length, message: "L2 向量化..." }); + + let l2Completed = 0; + for (let i = 0; i < l2Pairs.length; i += batchSize) { + if (vectorCancelled) break; + + const batch = l2Pairs.slice(i, i + batchSize); + const texts = batch.map(p => p.text); + try { + const vectors = await embed(texts, vectorCfg, { signal: vectorAbortController.signal }); + const items = batch.map((p, idx) => ({ + eventId: p.id, + vector: vectors[idx], + })); + await saveEventVectorsToDb(chatId, items, fingerprint); + l2Completed += batch.length; + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "L2", current: l2Completed, total: l2Pairs.length }); + } catch (e) { + if (e?.name === "AbortError") break; + xbLog.error(MODULE_ID, "L2 向量化失败", e); + vectorCancelled = true; + break; + } + } + } + + postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 }); + await sendVectorStatsToFrame(); + + xbLog.info(MODULE_ID, `向量生成完成: L0=${atoms.length}, L1=${l1Vectors.length}, L2=${l2Pairs.length}`); + } finally { + release(); + vectorCancelled = false; + vectorAbortController = null; } - - postToFrame({ type: "VECTOR_GEN_PROGRESS", phase: "ALL", current: -1, total: 0 }); - await sendVectorStatsToFrame(); - - vectorGenerating = false; - vectorCancelled = false; - vectorAbortController = null; - - xbLog.info(MODULE_ID, `向量生成完成: L0=${atoms.length}, L1=${l1Vectors.length}, L2=${l2Pairs.length}`); } async function handleClearVectors() { @@ -529,52 +568,6 @@ async function handleClearVectors() { xbLog.info(MODULE_ID, "向量数据已清除"); } -// ═══════════════════════════════════════════════════════════════════════════ -// L0 自动补提取(每收到新消息后检查并补提取缺失楼层) -// ═══════════════════════════════════════════════════════════════════════════ - -async function maybeAutoExtractL0() { - const vectorCfg = getVectorConfig(); - if 
(!vectorCfg?.enabled) return; - if (anchorGenerating || vectorGenerating) return; - - const { chatId, chat } = getContext(); - if (!chatId || !chat?.length) return; - - const stats = await getAnchorStats(); - if (stats.pending <= 0) return; - - anchorGenerating = true; - - try { - await incrementalExtractAtoms(chatId, chat, null, { maxFloors: 20 }); - - // 为新提取的 L0 楼层构建 L1 chunks - await buildIncrementalChunks({ vectorConfig: vectorCfg }); - - invalidateLexicalIndex(); - - await sendAnchorStatsToFrame(); - await sendVectorStatsToFrame(); - - xbLog.info(MODULE_ID, "自动 L0 补提取完成"); - } catch (e) { - xbLog.error(MODULE_ID, "自动 L0 补提取失败", e); - } finally { - anchorGenerating = false; - } -} - -// ═══════════════════════════════════════════════════════════════════════════ -// Embedding 连接预热 -// ═══════════════════════════════════════════════════════════════════════════ - -function warmupEmbeddingConnection() { - const vectorCfg = getVectorConfig(); - if (!vectorCfg?.enabled) return; - embed(['.'], vectorCfg, { timeout: 5000 }).catch(() => {}); -} - // ═══════════════════════════════════════════════════════════════════════════ // 实体词典注入 + 索引预热 // ═══════════════════════════════════════════════════════════════════════════ @@ -597,9 +590,52 @@ function refreshEntityLexiconAndWarmup() { } // ═══════════════════════════════════════════════════════════════════════════ -// L2 自动增量向量化(总结完成后调用) +// L0 自动补提取(每收到新消息后检查并补提取缺失楼层) // ═══════════════════════════════════════════════════════════════════════════ +async function maybeAutoExtractL0() { + const vectorCfg = getVectorConfig(); + if (!vectorCfg?.enabled) return; + if (guard.isAnyRunning('anchor', 'vector')) return; + + const { chatId, chat } = getContext(); + if (!chatId || !chat?.length) return; + + const stats = await getAnchorStats(); + if (stats.pending <= 0) return; + + const release = guard.acquire('anchor'); + if (!release) return; + + try { + await incrementalExtractAtoms(chatId, chat, null, { maxFloors: 20 }); + + // 
为新提取的 L0 楼层构建 L1 chunks + await buildIncrementalChunks({ vectorConfig: vectorCfg }); + + invalidateLexicalIndex(); + + await sendAnchorStatsToFrame(); + await sendVectorStatsToFrame(); + + xbLog.info(MODULE_ID, "自动 L0 补提取完成"); + } catch (e) { + xbLog.error(MODULE_ID, "自动 L0 补提取失败", e); + } finally { + release(); + } +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Embedding 连接预热 +// ═══════════════════════════════════════════════════════════════════════════ + +function warmupEmbeddingConnection() { + const vectorCfg = getVectorConfig(); + if (!vectorCfg?.enabled) return; + embed(['.'], vectorCfg, { timeout: 5000 }).catch(() => {}); +} + async function autoVectorizeNewEvents(newEventIds) { if (!newEventIds?.length) return; @@ -902,7 +938,7 @@ function openPanelForMessage(mesId) { sendFrameBaseData(store, totalFloors); sendFrameFullData(store, totalFloors); - setSummaryGenerating(summaryGenerating); + notifySummaryState(); sendVectorConfigToFrame(); sendVectorStatsToFrame(); @@ -990,36 +1026,40 @@ async function maybeAutoRunSummary(reason) { } async function autoRunSummaryWithRetry(targetMesId, configForRun) { - setSummaryGenerating(true); + const release = guard.acquire('summary'); + if (!release) return; + notifySummaryState(); - for (let attempt = 1; attempt <= 3; attempt++) { - const result = await runSummaryGeneration(targetMesId, configForRun, { - onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }), - onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }), - onComplete: async ({ merged, endMesId, newEventIds }) => { - const store = getSummaryStore(); - postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) }); - - invalidateLexicalIndex(); - - applyHideStateDebounced(); - updateFrameStatsAfterSummary(endMesId, store.json || {}); + try { + for (let attempt = 1; attempt <= 3; attempt++) { + const result = await runSummaryGeneration(targetMesId, configForRun, { + 
onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }), + onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }), + onComplete: async ({ merged, endMesId, newEventIds }) => { + const store = getSummaryStore(); + postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) }); - // L2 自动增量向量化 - await autoVectorizeNewEvents(newEventIds); - }, - }); + invalidateLexicalIndex(); - if (result.success) { - setSummaryGenerating(false); - return; + applyHideStateDebounced(); + updateFrameStatsAfterSummary(endMesId, store.json || {}); + + await autoVectorizeNewEvents(newEventIds); + }, + }); + + if (result.success) { + return; + } + + if (attempt < 3) await sleep(1000); } - if (attempt < 3) await sleep(1000); + await executeSlashCommand("/echo severity=error 剧情总结失败(已自动重试 3 次)。请稍后再试。"); + } finally { + release(); + notifySummaryState(); } - - setSummaryGenerating(false); - await executeSlashCommand("/echo severity=error 剧情总结失败(已自动重试 3 次)。请稍后再试。"); } function updateFrameStatsAfterSummary(endMesId, merged) { @@ -1055,7 +1095,7 @@ function handleFrameMessage(event) { case "FRAME_READY": { frameReady = true; flushPendingFrameMessages(); - setSummaryGenerating(summaryGenerating); + notifySummaryState(); sendSavedConfigToFrame(); sendVectorConfigToFrame(); sendVectorStatsToFrame(); @@ -1084,7 +1124,7 @@ function handleFrameMessage(event) { case "REQUEST_CANCEL": window.xiaobaixStreamingGeneration?.cancel?.("xb9"); - setSummaryGenerating(false); + postToFrame({ type: "GENERATION_STATE", isGenerating: false }); postToFrame({ type: "SUMMARY_STATUS", statusText: "已停止" }); break; @@ -1282,26 +1322,30 @@ async function handleManualGenerate(mesId, config) { return; } - setSummaryGenerating(true); + const release = guard.acquire('summary'); + if (!release) return; + notifySummaryState(); - await runSummaryGeneration(mesId, config, { - onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }), - onError: (msg) => 
postToFrame({ type: "SUMMARY_ERROR", message: msg }), - onComplete: async ({ merged, endMesId, newEventIds }) => { - const store = getSummaryStore(); - postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) }); - - invalidateLexicalIndex(); - - applyHideStateDebounced(); - updateFrameStatsAfterSummary(endMesId, store.json || {}); + try { + await runSummaryGeneration(mesId, config, { + onStatus: (text) => postToFrame({ type: "SUMMARY_STATUS", statusText: text }), + onError: (msg) => postToFrame({ type: "SUMMARY_ERROR", message: msg }), + onComplete: async ({ merged, endMesId, newEventIds }) => { + const store = getSummaryStore(); + postToFrame({ type: "SUMMARY_FULL_DATA", payload: buildFramePayload(store) }); - // L2 自动增量向量化 - await autoVectorizeNewEvents(newEventIds); - }, - }); + invalidateLexicalIndex(); - setSummaryGenerating(false); + applyHideStateDebounced(); + updateFrameStatsAfterSummary(endMesId, store.json || {}); + + await autoVectorizeNewEvents(newEventIds); + }, + }); + } finally { + release(); + notifySummaryState(); + } } // ═══════════════════════════════════════════════════════════════════════════ @@ -1390,7 +1434,7 @@ async function handleMessageReceived() { initButtonsForAll(); // 向量全量生成中时跳过 L1 sync(避免竞争写入) - if (vectorGenerating) return; + if (guard.isRunning('vector')) return; await syncOnMessageReceived(chatId, lastFloor, message, vectorConfig, () => { sendAnchorStatsToFrame(); @@ -1529,6 +1573,22 @@ async function handleGenerationStarted(type, _params, isDryRun) { // 事件注册 // ═══════════════════════════════════════════════════════════════════════════ +const boundHandlers = { + chatChanged: () => setTimeout(handleChatChanged, 80), + messageDeleted: () => setTimeout(handleMessageDeleted, 50), + messageReceived: () => setTimeout(handleMessageReceived, 150), + messageSent: () => setTimeout(handleMessageSent, 150), + messageSentRecall: handleMessageSentForRecall, + messageSwiped: () => setTimeout(handleMessageSwiped, 100), + 
messageUpdated: () => setTimeout(handleMessageUpdated, 100), + messageEdited: () => setTimeout(handleMessageUpdated, 100), + userRendered: (data) => setTimeout(() => handleMessageRendered(data), 50), + charRendered: (data) => setTimeout(() => handleMessageRendered(data), 50), + genStarted: handleGenerationStarted, + genStopped: clearExtensionPrompt, + genEnded: clearExtensionPrompt, +}; + function registerEvents() { if (eventsRegistered) return; eventsRegistered = true; @@ -1551,31 +1611,45 @@ function registerEvents() { initButtonsForAll(); - eventSource.on(event_types.CHAT_CHANGED, () => setTimeout(handleChatChanged, 80)); - eventSource.on(event_types.MESSAGE_DELETED, () => setTimeout(handleMessageDeleted, 50)); - eventSource.on(event_types.MESSAGE_RECEIVED, () => setTimeout(handleMessageReceived, 150)); - eventSource.on(event_types.MESSAGE_SENT, () => setTimeout(handleMessageSent, 150)); - eventSource.on(event_types.MESSAGE_SENT, handleMessageSentForRecall); - eventSource.on(event_types.MESSAGE_SWIPED, () => setTimeout(handleMessageSwiped, 100)); - eventSource.on(event_types.MESSAGE_UPDATED, () => setTimeout(handleMessageUpdated, 100)); - eventSource.on(event_types.MESSAGE_EDITED, () => setTimeout(handleMessageUpdated, 100)); - eventSource.on(event_types.USER_MESSAGE_RENDERED, (data) => setTimeout(() => handleMessageRendered(data), 50)); - eventSource.on(event_types.CHARACTER_MESSAGE_RENDERED, (data) => setTimeout(() => handleMessageRendered(data), 50)); + eventSource.on(event_types.CHAT_CHANGED, boundHandlers.chatChanged); + eventSource.on(event_types.MESSAGE_DELETED, boundHandlers.messageDeleted); + eventSource.on(event_types.MESSAGE_RECEIVED, boundHandlers.messageReceived); + eventSource.on(event_types.MESSAGE_SENT, boundHandlers.messageSent); + eventSource.on(event_types.MESSAGE_SENT, boundHandlers.messageSentRecall); + eventSource.on(event_types.MESSAGE_SWIPED, boundHandlers.messageSwiped); + eventSource.on(event_types.MESSAGE_UPDATED, 
boundHandlers.messageUpdated); + eventSource.on(event_types.MESSAGE_EDITED, boundHandlers.messageEdited); + eventSource.on(event_types.USER_MESSAGE_RENDERED, boundHandlers.userRendered); + eventSource.on(event_types.CHARACTER_MESSAGE_RENDERED, boundHandlers.charRendered); // 用户输入捕获(原生捕获阶段) document.addEventListener("pointerdown", onSendPointerdown, true); document.addEventListener("keydown", onSendKeydown, true); // 注入链路 - eventSource.on(event_types.GENERATION_STARTED, handleGenerationStarted); - eventSource.on(event_types.GENERATION_STOPPED, clearExtensionPrompt); - eventSource.on(event_types.GENERATION_ENDED, clearExtensionPrompt); + eventSource.on(event_types.GENERATION_STARTED, boundHandlers.genStarted); + eventSource.on(event_types.GENERATION_STOPPED, boundHandlers.genStopped); + eventSource.on(event_types.GENERATION_ENDED, boundHandlers.genEnded); } function unregisterEvents() { CacheRegistry.unregister(MODULE_ID); eventsRegistered = false; + eventSource.off(event_types.CHAT_CHANGED, boundHandlers.chatChanged); + eventSource.off(event_types.MESSAGE_DELETED, boundHandlers.messageDeleted); + eventSource.off(event_types.MESSAGE_RECEIVED, boundHandlers.messageReceived); + eventSource.off(event_types.MESSAGE_SENT, boundHandlers.messageSent); + eventSource.off(event_types.MESSAGE_SENT, boundHandlers.messageSentRecall); + eventSource.off(event_types.MESSAGE_SWIPED, boundHandlers.messageSwiped); + eventSource.off(event_types.MESSAGE_UPDATED, boundHandlers.messageUpdated); + eventSource.off(event_types.MESSAGE_EDITED, boundHandlers.messageEdited); + eventSource.off(event_types.USER_MESSAGE_RENDERED, boundHandlers.userRendered); + eventSource.off(event_types.CHARACTER_MESSAGE_RENDERED, boundHandlers.charRendered); + eventSource.off(event_types.GENERATION_STARTED, boundHandlers.genStarted); + eventSource.off(event_types.GENERATION_STOPPED, boundHandlers.genStopped); + eventSource.off(event_types.GENERATION_ENDED, boundHandlers.genEnded); + 
$(".xiaobaix-story-summary-btn").remove(); hideOverlay(); diff --git a/modules/story-summary/vector/llm/reranker.js b/modules/story-summary/vector/llm/reranker.js index e7e68b9..191ad54 100644 --- a/modules/story-summary/vector/llm/reranker.js +++ b/modules/story-summary/vector/llm/reranker.js @@ -28,17 +28,17 @@ export async function rerank(query, documents, options = {}) { if (!query?.trim()) { xbLog.warn(MODULE_ID, 'query 为空,跳过 rerank'); - return documents.map((_, i) => ({ index: i, relevance_score: 0.5 })); + return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true }; } if (!documents?.length) { - return []; + return { results: [], failed: false }; } const key = getApiKey(); if (!key) { xbLog.warn(MODULE_ID, '未配置 API Key,跳过 rerank'); - return documents.map((_, i) => ({ index: i, relevance_score: 0.5 })); + return { results: documents.map((_, i) => ({ index: i, relevance_score: 0 })), failed: true }; } // 截断超长文档列表 @@ -61,7 +61,7 @@ export async function rerank(query, documents, options = {}) { if (!validDocs.length) { xbLog.warn(MODULE_ID, '无有效文档,跳过 rerank'); - return []; + return { results: [], failed: false }; } const controller = new AbortController(); @@ -106,7 +106,7 @@ export async function rerank(query, documents, options = {}) { const elapsed = Math.round(performance.now() - T0); xbLog.info(MODULE_ID, `Rerank 完成: ${validDocs.length} docs → ${results.length} selected (${elapsed}ms)`); - return mapped; + return { results: mapped, failed: false }; } catch (e) { clearTimeout(timeoutId); @@ -118,10 +118,13 @@ export async function rerank(query, documents, options = {}) { } // 降级:返回原顺序,分数均匀分布 - return documents.slice(0, topN).map((_, i) => ({ - index: i, - relevance_score: 1 - (i / documents.length) * 0.5, - })); + return { + results: documents.slice(0, topN).map((_, i) => ({ + index: i, + relevance_score: 0, + })), + failed: true, + }; } } @@ -138,30 +141,38 @@ export async function rerankChunks(query, chunks, options = {}) 
{ if (!chunks?.length) return []; if (chunks.length <= topN) { - // 数量不超限,仍然 rerank 以获取分数,但不过滤 const texts = chunks.map(c => c.text || c.semantic || ''); - const results = await rerank(query, texts, { topN: chunks.length, ...options }); - + const { results, failed } = await rerank(query, texts, { topN: chunks.length, ...options }); + + if (failed) { + return chunks.map(c => ({ ...c, _rerankScore: 0, _rerankFailed: true })); + } + const scoreMap = new Map(results.map(r => [r.index, r.relevance_score])); return chunks.map((c, i) => ({ ...c, - _rerankScore: scoreMap.get(i) ?? 0.5, + _rerankScore: scoreMap.get(i) ?? 0, })).sort((a, b) => b._rerankScore - a._rerankScore); } const texts = chunks.map(c => c.text || c.semantic || ''); - const results = await rerank(query, texts, { topN, ...options }); + const { results, failed } = await rerank(query, texts, { topN, ...options }); - // 过滤低分 + 排序 - const selected = results + if (failed) { + return chunks.slice(0, topN).map(c => ({ + ...c, + _rerankScore: 0, + _rerankFailed: true, + })); + } + + return results .filter(r => r.relevance_score >= minScore) .sort((a, b) => b.relevance_score - a.relevance_score) .map(r => ({ ...chunks[r.index], _rerankScore: r.relevance_score, })); - - return selected; } /** @@ -174,7 +185,7 @@ export async function testRerankService() { } try { - const results = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 }); + const { results } = await rerank('测试查询', ['测试文档1', '测试文档2'], { topN: 2 }); return { success: true, message: `连接成功,返回 ${results.length} 个结果`, diff --git a/modules/story-summary/vector/retrieval/metrics.js b/modules/story-summary/vector/retrieval/metrics.js index f0221fd..fde9c3f 100644 --- a/modules/story-summary/vector/retrieval/metrics.js +++ b/modules/story-summary/vector/retrieval/metrics.js @@ -86,6 +86,7 @@ export function createMetrics() { l0Candidates: 0, // W-RRF 融合后的 L0 候选数 l0Selected: 0, // rerank 后选中的 L0 数 rerankApplied: false, + rerankFailed: false, beforeRerank: 0, 
afterRerank: 0, rerankTime: 0, @@ -283,6 +284,9 @@ export function formatMetricsLog(metrics) { if (m.evidence.rerankApplied) { lines.push(`│ ├─ rerank_applied: true`); + if (m.evidence.rerankFailed) { + lines.push(`│ ├─ rerank_failed: ⚠ YES (using fusion order)`); + } lines.push(`│ │ ├─ before: ${m.evidence.beforeRerank}`); lines.push(`│ │ ├─ after: ${m.evidence.afterRerank}`); lines.push(`│ │ └─ time: ${m.evidence.rerankTime}ms`); @@ -489,6 +493,10 @@ export function detectIssues(metrics) { // L1 挂载问题 // ───────────────────────────────────────────────────────────────── + if (m.evidence.rerankFailed) { + issues.push('Rerank API failed — using fusion rank order as fallback, relevance scores are zero'); + } + if (m.evidence.l0Selected > 0 && m.evidence.l1Pulled === 0) { issues.push('Zero L1 chunks pulled - L1 vectors may not exist or DB read failed'); } diff --git a/modules/story-summary/vector/retrieval/recall.js b/modules/story-summary/vector/retrieval/recall.js index 853cc2d..818aec8 100644 --- a/modules/story-summary/vector/retrieval/recall.js +++ b/modules/story-summary/vector/retrieval/recall.js @@ -631,6 +631,7 @@ async function locateAndPullEvidence(anchorHits, anchorFloors, queryVector, rera metrics.evidence.rerankApplied = true; metrics.evidence.beforeRerank = rerankCandidates.length; metrics.evidence.afterRerank = rerankedL0.length; + metrics.evidence.rerankFailed = rerankedL0.some(c => c._rerankFailed); metrics.evidence.l0Selected = rerankedL0.length; metrics.evidence.rerankTime = rerankTime; metrics.timing.evidenceRerank = rerankTime; diff --git a/modules/story-summary/vector/utils/tokenizer.js b/modules/story-summary/vector/utils/tokenizer.js index 6c150d0..a39e4e9 100644 --- a/modules/story-summary/vector/utils/tokenizer.js +++ b/modules/story-summary/vector/utils/tokenizer.js @@ -49,6 +49,9 @@ let jiebaCut = null; /** @type {Function|null} jieba add_word 函数引用 */ let jiebaAddWord = null; +/** @type {object|null} TinySegmenter 实例 */ +let tinySegmenter = 
null; + // ═══════════════════════════════════════════════════════════════════════════ // 实体词典 // ═══════════════════════════════════════════════════════════════════════════ @@ -76,12 +79,13 @@ const STOP_WORDS = new Set([ '时候', '现在', '已经', '还是', '只是', '可能', '应该', '知道', '觉得', '开始', '一下', '一些', '这个', '那个', '他们', '我们', '你们', '自己', '起来', '出来', '进去', '回来', '过来', '下去', - // 日语助词 + 常见虚词 - 'は', 'が', 'を', 'に', 'で', 'と', 'の', 'も', 'へ', 'や', - 'か', 'な', 'よ', 'ね', 'わ', 'だ', 'です', 'ます', 'た', 'て', - 'する', 'いる', 'ある', 'なる', 'れる', 'られる', 'ない', + // 日语常见虚词(≥2字,匹配 TinySegmenter 产出粒度) + 'です', 'ます', 'した', 'して', 'する', 'ない', 'いる', 'ある', + 'なる', 'れる', 'られ', 'られる', 'この', 'その', 'あの', 'どの', 'ここ', 'そこ', 'あそこ', 'これ', 'それ', 'あれ', 'どれ', + 'ても', 'から', 'まで', 'ので', 'のに', 'けど', 'だけ', + 'もう', 'まだ', 'とても', 'ちょっと', 'やっぱり', // 英文常见停用词 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', @@ -100,6 +104,34 @@ const STOP_WORDS = new Set([ // Unicode 分类 // ═══════════════════════════════════════════════════════════════════════════ +/** + * 判断字符是否为假名(平假名 + 片假名) + * @param {number} code - charCode + * @returns {boolean} + */ +function isKana(code) { + return ( + (code >= 0x3040 && code <= 0x309F) || // Hiragana + (code >= 0x30A0 && code <= 0x30FF) || // Katakana + (code >= 0x31F0 && code <= 0x31FF) || // Katakana Extensions + (code >= 0xFF65 && code <= 0xFF9F) // Halfwidth Katakana + ); +} + +/** + * 判断字符是否为 CJK 汉字(不含假名) + * @param {number} code - charCode + * @returns {boolean} + */ +function isCJK(code) { + return ( + (code >= 0x4E00 && code <= 0x9FFF) || + (code >= 0x3400 && code <= 0x4DBF) || + (code >= 0xF900 && code <= 0xFAFF) || + (code >= 0x20000 && code <= 0x2A6DF) + ); +} + /** * 判断字符是否为亚洲文字(CJK + 假名) * @param {number} code - charCode @@ -107,14 +139,7 @@ const STOP_WORDS = new Set([ */ function isAsian(code) { return ( - (code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs - (code >= 0x3400 && code <= 0x4DBF) || // CJK 
Extension A - (code >= 0x3040 && code <= 0x309F) || // Hiragana - (code >= 0x30A0 && code <= 0x30FF) || // Katakana - (code >= 0x31F0 && code <= 0x31FF) || // Katakana Phonetic Extensions - (code >= 0xFF65 && code <= 0xFF9F) || // Halfwidth Katakana - (code >= 0xF900 && code <= 0xFAFF) || // CJK Compatibility Ideographs - (code >= 0x20000 && code <= 0x2A6DF) // CJK Extension B + isCJK(code) || isKana(code) ); } @@ -195,6 +220,31 @@ function segmentByScript(text) { return segments; } +// ═══════════════════════════════════════════════════════════════════════════ +// 亚洲文字语言检测(中文 vs 日语) +// ═══════════════════════════════════════════════════════════════════════════ + +/** + * 检测亚洲文字段的语言 + * + * 假名占比 > 30% 判定为日语(日语文本中假名通常占 40-60%) + * + * @param {string} text - 亚洲文字段 + * @returns {'zh'|'ja'|'other'} + */ +function detectAsianLanguage(text) { + let kanaCount = 0; + let cjkCount = 0; + for (const ch of text) { + const code = ch.codePointAt(0); + if (isKana(code)) kanaCount++; + else if (isCJK(code)) cjkCount++; + } + const total = kanaCount + cjkCount; + if (total === 0) return 'other'; + return (kanaCount / total) > 0.3 ? 
'ja' : 'zh'; +} + // ═══════════════════════════════════════════════════════════════════════════ // 实体保护(最长匹配占位符替换) // ═══════════════════════════════════════════════════════════════════════════ @@ -343,6 +393,26 @@ function tokenizeAsianFallback(text) { return tokens; } +/** + * 用 TinySegmenter 处理日语文字段 + * @param {string} text + * @returns {string[]} + */ +function tokenizeJapanese(text) { + if (tinySegmenter) { + try { + const words = tinySegmenter.segment(text); + return words + .map(w => String(w || '').trim()) + .filter(w => w.length >= 2); + } catch (e) { + xbLog.warn(MODULE_ID, 'TinySegmenter 分词异常,降级处理', e); + return tokenizeAsianFallback(text); + } + } + return tokenizeAsianFallback(text); +} + // ═══════════════════════════════════════════════════════════════════════════ // 分词:拉丁文字 // ═══════════════════════════════════════════════════════════════════════════ @@ -374,6 +444,9 @@ function tokenizeLatin(text) { * @returns {Promise} 是否加载成功 */ export async function preload() { + // TinySegmenter 独立于结巴状态(内部有防重入) + loadTinySegmenter(); + // 已就绪 if (wasmState === WasmState.READY) return true; @@ -443,6 +516,25 @@ export async function preload() { } } +/** + * 加载 TinySegmenter(懒加载,不阻塞) + */ +async function loadTinySegmenter() { + if (tinySegmenter) return; + + try { + // eslint-disable-next-line no-unsanitized/method + const mod = await import( + `/${extensionFolderPath}/libs/tiny-segmenter.js` + ); + const Ctor = mod.TinySegmenter || mod.default; + tinySegmenter = new Ctor(); + xbLog.info(MODULE_ID, 'TinySegmenter 加载完成'); + } catch (e) { + xbLog.warn(MODULE_ID, 'TinySegmenter 加载失败,日语将使用降级分词', e); + } +} + // ═══════════════════════════════════════════════════════════════════════════ // 公开接口:isReady // ═══════════════════════════════════════════════════════════════════════════ @@ -598,7 +690,10 @@ function tokenizeCore(text) { const rawTokens = []; for (const seg of segments) { if (seg.type === 'asian') { - if (wasmState === WasmState.READY && jiebaCut) { + const 
lang = detectAsianLanguage(seg.text); + if (lang === 'ja') { + rawTokens.push(...tokenizeJapanese(seg.text)); + } else if (wasmState === WasmState.READY && jiebaCut) { rawTokens.push(...tokenizeAsianJieba(seg.text)); } else { rawTokens.push(...tokenizeAsianFallback(seg.text));