Skip to content

Commit 13d95a9

Browse files
perf(TextAnalytics): lower-allocation tokenize + single-pass termFreq
tokenize(): replaced the replace().split().filter() chain (which allocated an intermediate string and two intermediate arrays per call) with a regex exec loop that pushes tokens directly into the result array. This is the hot path for SessionLinker (which tokenizes every saved session), DriftDetector, SmartTitle, and WordCloud, so the savings add up when processing dozens of sessions with thousands of messages.

termFreq(): merged the counting pass and the max-finding pass into a single loop, eliminating one full iteration over the TF map keys. The old code iterated the keys twice (once to find the max, once to normalise); the new code iterates them once, so for large vocabularies (hundreds of unique terms per session) the object-key iteration overhead is halved.
1 parent 82f15f4 commit 13d95a9

1 file changed

Lines changed: 32 additions & 8 deletions

File tree

app.js

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33112,21 +33112,45 @@ const TextAnalytics = (() => {
3311233112
'could','should','would','something','anything','everything','nothing'
3311333113
]);
3311433114

33115-
/** Tokenise text: lowercase, strip punctuation, remove stopwords & short words. */
33115+
/**
33116+
* Tokenise text: lowercase, extract alphanumeric words, remove stopwords & short words.
33117+
*
33118+
* Optimised: uses regex exec loop instead of replace().split().filter()
33119+
* which allocated an intermediate string and two arrays per call. This is the hot path
33120+
* for SessionLinker (tokenizes every session), DriftDetector, SmartTitle,
33121+
* and WordCloud — the allocation savings are significant when processing
33122+
* dozens of sessions with thousands of messages.
33123+
*/
33124+
var _TOKEN_RE = /[a-z0-9]{3,}/g;
3311633125
function tokenize(text) {
3311733126
if (!text) return [];
33118-
return String(text).toLowerCase().replace(/[^a-z0-9\s]/g, ' ').split(/\s+/).filter(function(w) {
33119-
return w.length > 2 && !STOPWORDS.has(w);
33120-
});
33127+
var lower = String(text).toLowerCase();
33128+
_TOKEN_RE.lastIndex = 0;
33129+
var result = [];
33130+
var m;
33131+
while ((m = _TOKEN_RE.exec(lower)) !== null) {
33132+
if (!STOPWORDS.has(m[0])) result.push(m[0]);
33133+
}
33134+
return result;
3312133135
}
3312233136

33123-
/** Build max-normalised term-frequency vector from a token array. */
33137+
/**
33138+
* Build max-normalised term-frequency vector from a token array.
33139+
*
33140+
* Optimised: merged the counting and max-finding passes into a single
33141+
* loop — eliminates one full iteration over all TF keys. For large
33142+
* vocabularies (hundreds of unique terms per session) this halves the
33143+
* object-key iteration overhead.
33144+
*/
3312433145
function termFreq(tokens) {
3312533146
var tf = {};
33126-
for (var i = 0; i < tokens.length; i++) { tf[tokens[i]] = (tf[tokens[i]] || 0) + 1; }
3312733147
var max = 0;
33128-
for (var k in tf) { if (tf[k] > max) max = tf[k]; }
33129-
if (max > 0) { for (var k2 in tf) { tf[k2] /= max; } }
33148+
for (var i = 0; i < tokens.length; i++) {
33149+
var c = (tf[tokens[i]] || 0) + 1;
33150+
tf[tokens[i]] = c;
33151+
if (c > max) max = c;
33152+
}
33153+
if (max > 0) { for (var k in tf) { tf[k] /= max; } }
3313033154
return tf;
3313133155
}
3313233156

0 commit comments

Comments
 (0)