Skip to content

Commit d74db82

Browse files
committed
Add programming wordlists
1 parent 63d62ec commit d74db82

File tree

12 files changed

+6868
-0
lines changed

12 files changed

+6868
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@ target
22
examples/big.html
33
.cache
44
*.wasm
5+
6+
node_modules
File renamed without changes.

scripts/get_wordlists.ts

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
// A script that fetches all the wordlists for codebook and writes each one to its own file
2+
// data in https://github.com/blopker/common-words/tree/master/web/static/data
3+
// Use `fetch` to get the data
4+
// folder format: [programming_language]/index.json
5+
// json input format:
6+
// ---
7+
// [
8+
// {
9+
// "fontSize": 80,
10+
// "fontFamily": "Abel, sans-serif",
11+
// "x": 429,
12+
// "y": 864,
13+
// "dx": 0,
14+
// "text": "summary"
15+
// },
16+
// ]
17+
// ---
18+
//
19+
// output list format:
20+
// ---
21+
// word1
22+
// word2
23+
// word3
24+
// ---
25+
// Put everything in lower case
26+
//
27+
// The script also keeps a list of the most common words across all languages, ranked by their fontSize, which is written to common.txt
28+
29+
import fs from "node:fs";
30+
import path from "node:path";
31+
32+
/**
 * One entry of a language's `index.json` data file.
 *
 * This script only reads `fontSize` and `text`; the remaining fields are
 * carried along unchanged from the input.
 */
interface WordSummary {
  /** Size of the word in the source data; used here to rank words. */
  fontSize: number;
  fontFamily: string;
  x: number;
  y: number;
  dx: number;
  /** The word itself. */
  text: string;
}
40+
41+
// Output folder for every generated file: the `wordlists` directory that
// sits next to this script's own directory.
const wordlistsPath = path.join(path.dirname(__dirname), "wordlists");
// Create the folder (and any missing parents) before anything is written to it.
fs.mkdirSync(wordlistsPath, { recursive: true });
44+
/**
 * Languages whose wordlists are downloaded.
 *
 * Key: the language's folder name in the data repo.
 * Value: the corresponding name in queries.rs.
 *
 * Entry order is the order in which the languages are processed.
 */
const languages = {
  rs: "rust",
  py: "python",
  java: "java",
  html: "html",
  css: "css",
  go: "go",
};
54+
55+
/** Upper bound on how many entries the common-words list may hold. */
const MAX_COMMON_WORDS = 1000;

/**
 * Running selection of the highest-fontSize words offered so far.
 * Always kept sorted by ascending fontSize, so index 0 is the next entry to evict.
 */
const commonWords: WordSummary[] = [];

/**
 * Offers a word to the common-words list.
 *
 * - A word whose `text` is already in the list is ignored (first occurrence wins).
 * - While the list holds fewer than `MAX_COMMON_WORDS` entries, the word is added.
 * - Once the list is full, the word replaces the current lowest-fontSize entry,
 *   but only if its own fontSize is strictly larger.
 *
 * @param data - Word entry to consider; `text` is compared exactly as given.
 */
function addToCommonWords(data: WordSummary): void {
  if (commonWords.some((word) => word.text === data.text)) {
    return;
  }
  if (commonWords.length >= MAX_COMMON_WORDS) {
    // The list is sorted ascending, so the first entry is the weakest one.
    const lowest = commonWords[0];
    if (lowest !== undefined && data.fontSize <= lowest.fontSize) {
      return;
    }
    commonWords.shift();
  }
  commonWords.push(data);
  // Array.prototype.sort is stable, so equal-fontSize entries keep insertion order.
  commonWords.sort((a, b) => a.fontSize - b.fontSize);
}
75+
76+
/**
 * Requests `url` and returns the response body parsed as JSON.
 *
 * This module-level `fetch` shadows the global one; the request itself is
 * delegated to `globalThis.fetch`.
 *
 * @param url - Address to request.
 * @returns The parsed JSON body.
 * @throws Error when the response status is not in the 2xx range, so an error
 *   page is reported as such instead of being parsed as wordlist data.
 */
const fetch = async (url: string) => {
  const response = await globalThis.fetch(url);
  if (!response.ok) {
    throw new Error(
      `Failed to fetch ${url}: ${response.status} ${response.statusText}`,
    );
  }
  return response.json();
};
80+
81+
/**
 * Downloads the word data of one language and returns its words in lower case.
 *
 * Side effect: every lower-cased entry is also offered to the common-words
 * list through `addToCommonWords`.
 *
 * @param language - Folder name of the language in the data repo.
 * @returns The lower-cased words, in the order of the downloaded data.
 */
const getWordlist = async (language: string) => {
  const url = `https://raw.githubusercontent.com/blopker/common-words/master/web/static/data/${language}/index.json`;
  const raw = (await fetch(url)) as WordSummary[];
  // Lower-case everything first, then feed the common-words list.
  const lowered = raw.map((entry) => ({
    ...entry,
    text: entry.text.toLowerCase(),
  }));
  lowered.forEach((entry) => {
    addToCommonWords(entry);
  });
  return lowered.map(({ text }) => text);
};
96+
97+
/**
 * Fetches the wordlist for `language` and saves it as `<language>.txt` inside
 * the wordlists folder, one word per line.
 *
 * @param language - Folder name of the language in the data repo; also used
 *   as the output file's base name.
 */
const writeWordlist = async (language: string) => {
  const contents = (await getWordlist(language)).join("\n");
  const target = path.join(wordlistsPath, `${language}.txt`);
  fs.writeFileSync(target, contents);
};
102+
103+
/**
 * Entry point: writes one wordlist file per configured language, then writes
 * `common.txt` containing the collected common words, sorted, one per line.
 *
 * Languages are handled one after another; duplicates in the common-words
 * list are resolved first-come, so sequential processing keeps the result
 * independent of network timing.
 *
 * NOTE(review): only the keys of `languages` are used here, so the per-language
 * files are named after the data-repo key (e.g. `rs.txt`); the mapped names are
 * never read — confirm this is intended.
 */
const main = async () => {
  for (const language of Object.keys(languages)) {
    await writeWordlist(language);
  }
  const commonWordsPath = path.join(wordlistsPath, "common.txt");
  fs.writeFileSync(
    commonWordsPath,
    commonWords
      .map((item) => item.text)
      .toSorted()
      .join("\n"),
  );
};

// Handle a failed run explicitly rather than leaving the promise floating:
// log the error and make the process exit with a non-zero status.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});

0 commit comments

Comments
 (0)