refine recall fusion guard and floor-based L0 collection

This commit is contained in:
2026-02-17 17:08:37 +08:00
parent 94eceaed96
commit 26dd7cb053
3 changed files with 231 additions and 27 deletions

View File

@@ -233,6 +233,9 @@ async function buildIndexAsync(docs) {
* @property {boolean} idfEnabled - Whether IDF stats are available for weighting.
* @property {number} idfDocCount - Number of lexical docs used to compute IDF.
* @property {Array<{term:string,idf:number}>} topIdfTerms - Top query terms by IDF.
* @property {string[]} queryTerms - Normalized, de-duplicated, non-empty query terms actually searched.
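* @property {Record<string, Array<{floor:number, weightedScore:number, chunkId:string}>>} termFloorHits - Per-term hits on chunk-type docs whose floor is a number >= 0; each entry holds the floor, the hit's weighted score, and the chunk doc id.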
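* @property {Array<{floor:number, score:number, hitTermsCount:number}>} floorLexScores - Per-floor sum of weighted chunk-hit scores (rounded to 6 decimals) plus the count of distinct terms that hit the floor, sorted by score descending (debug).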
* @property {number} termSearches - Number of per-term MiniSearch queries executed.
* @property {number} searchTime - Total lexical search time in milliseconds.
*/
@@ -258,6 +261,9 @@ export function searchLexicalIndex(index, terms) {
idfEnabled: lexicalDocCount > 0,
idfDocCount: lexicalDocCount,
topIdfTerms: [],
queryTerms: [],
termFloorHits: {},
floorLexScores: [],
termSearches: 0,
searchTime: 0,
};
@@ -268,9 +274,12 @@ export function searchLexicalIndex(index, terms) {
}
const queryTerms = Array.from(new Set((terms || []).map(normalizeTerm).filter(Boolean)));
result.queryTerms = [...queryTerms];
const weightedScores = new Map(); // docId -> score
const hitMeta = new Map(); // docId -> { type, floor }
const idfPairs = [];
const termFloorHits = new Map(); // term -> [{ floor, weightedScore, chunkId }]
const floorLexAgg = new Map(); // floor -> { score, terms:Set<string> }
for (const term of queryTerms) {
const idf = computeIdf(term);
@@ -305,11 +314,35 @@ export function searchLexicalIndex(index, terms) {
floor: hit.floor,
});
}
if (hit.type === 'chunk' && typeof hit.floor === 'number' && hit.floor >= 0) {
if (!termFloorHits.has(term)) termFloorHits.set(term, []);
termFloorHits.get(term).push({
floor: hit.floor,
weightedScore: weighted,
chunkId: id,
});
const floorAgg = floorLexAgg.get(hit.floor) || { score: 0, terms: new Set() };
floorAgg.score += weighted;
floorAgg.terms.add(term);
floorLexAgg.set(hit.floor, floorAgg);
}
}
}
idfPairs.sort((a, b) => b.idf - a.idf);
result.topIdfTerms = idfPairs.slice(0, 5);
result.termFloorHits = Object.fromEntries(
[...termFloorHits.entries()].map(([term, hits]) => [term, hits]),
);
result.floorLexScores = [...floorLexAgg.entries()]
.map(([floor, info]) => ({
floor,
score: Number(info.score.toFixed(6)),
hitTermsCount: info.terms.size,
}))
.sort((a, b) => b.score - a.score);
const sortedHits = Array.from(weightedScores.entries())
.sort((a, b) => b[1] - a[1]);