-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.js
315 lines (261 loc) · 9.18 KB
/
app.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
import { Database } from "bun:sqlite";
import { readdirSync, statSync } from "fs";
import { join, basename } from "path";
import Tesseract from "tesseract.js";
import { serve } from "bun";
const uFuzzy = require("@leeoniya/ufuzzy");
// Open the SQLite database file, creating it on first run.
const db = new Database("images.sqlite", { create: true });

// Create the schema. Every statement uses IF NOT EXISTS, so running this on
// each start-up is harmless.
//
// - `images` holds one row per processed image file.
// - `idx_images_path` backs the `WHERE path = ?` lookup that the folder scan
//   issues for every file; without it each lookup is a full table scan.
// - `image_fts` is an FTS5 index over `images.ocr_text` (external-content
//   table keyed by `images.id`); rows are added to it explicitly after each
//   insert into `images`.
db.exec(`
CREATE TABLE IF NOT EXISTS images (
id INTEGER PRIMARY KEY AUTOINCREMENT,
path TEXT NOT NULL,
filename TEXT NOT NULL,
ocr_text TEXT,
is_document BOOLEAN,
thumbnail BLOB,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Speeds up the "is this path already indexed?" lookup
CREATE INDEX IF NOT EXISTS idx_images_path ON images(path);
-- For full-text search
CREATE VIRTUAL TABLE IF NOT EXISTS image_fts USING fts5(
ocr_text,
content='images',
content_rowid='id'
);
`);
/**
 * Run OCR on a single image and store the recognized text in the database.
 *
 * The row in `images` and its entry in the `image_fts` index are written in
 * one transaction, so a failure between the two inserts cannot leave an image
 * that is recorded as processed but missing from the full-text index.
 *
 * @param {string} imagePath - Path of the image file to process.
 * @returns {Promise<{id: number|bigint, path: string, filename: string, is_document: boolean}|null>}
 *   The stored record, or `null` when OCR or the database write failed
 *   (the error is logged, not rethrown).
 */
async function processImage(imagePath) {
  console.log(`Processing ${imagePath}...`);
  try {
    // Perform OCR with Tesseract.js; progress messages are echoed to the console.
    const result = await Tesseract.recognize(imagePath, "eng", {
      logger: (m) => console.log(m),
    });
    const ocrText = result.data.text;
    // An image counts as a "document" when the OCR confidence exceeds 60.
    const isDocument = result.data.confidence > 60;
    // Store the normalized text so that searches compare like with like.
    const normalizedText = normalizeText(ocrText);
    const filename = basename(imagePath);

    // Both inserts succeed or fail together.
    const insertImage = db.transaction(() => {
      const info = db
        .prepare("INSERT INTO images (path, filename, ocr_text, is_document) VALUES (?, ?, ?, ?)")
        .run(imagePath, filename, normalizedText, isDocument);
      db.prepare("INSERT INTO image_fts(rowid, ocr_text) VALUES (?, ?)").run(info.lastInsertRowid, normalizedText);
      return info.lastInsertRowid;
    });
    const id = insertImage();

    return {
      id,
      path: imagePath,
      filename,
      is_document: isDocument,
    };
  } catch (error) {
    console.error(`Error processing ${imagePath}:`, error);
    return null;
  }
}
/**
 * Recursively scan a folder and OCR every image that is not in the database yet.
 *
 * Files are matched by extension (jpg, jpeg, png, gif, bmp; case-insensitive).
 * Images whose path is already stored are reported with `exists: true` and are
 * not processed again. Images that fail to process are omitted from the result.
 *
 * @param {string} folderPath - Folder to scan.
 * @returns {Promise<Array<object>>} One entry per image found: either the
 *   record returned by `processImage`, or `{ id, path, filename, exists: true }`
 *   for images that were already indexed.
 * @throws If `folderPath` (or a nested folder) cannot be listed.
 */
async function processFolder(folderPath) {
  console.log(`Scanning folder: ${folderPath}`);
  const imagePattern = /\.(jpg|jpeg|png|gif|bmp)$/i;
  const files = readdirSync(folderPath);
  const results = [];

  for (const file of files) {
    const fullPath = join(folderPath, file);

    // A single entry that cannot be stat'ed (broken symlink, file removed
    // since the listing was read, permission error) must not abort the scan.
    let isDirectory;
    try {
      isDirectory = statSync(fullPath).isDirectory();
    } catch (error) {
      console.error(`Skipping ${fullPath}:`, error);
      continue;
    }

    if (isDirectory) {
      const subResults = await processFolder(fullPath);
      // Append one by one: spreading a very large array into push() can
      // exceed the engine's argument limit.
      for (const subResult of subResults) {
        results.push(subResult);
      }
      continue;
    }

    // Skip anything that is not an image.
    if (!imagePattern.test(file)) continue;

    // Skip images that are already in the database so nothing is duplicated.
    const existing = db.query("SELECT id FROM images WHERE path = ?").get(fullPath);
    if (existing) {
      results.push({ id: existing.id, path: fullPath, filename: file, exists: true });
      continue;
    }

    const result = await processImage(fullPath);
    if (result) results.push(result);
  }

  return results;
}
/**
 * Quote text as an FTS5 string literal (embedded double quotes are doubled).
 * Quoting keeps characters such as `-`, `:`, `(` or `"` in user input from
 * being parsed as FTS5 query syntax.
 *
 * @param {string} text
 * @returns {string}
 */
function quoteFtsString(text) {
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Run one FTS5 MATCH expression and return the matching `images` rows,
 * best match first, capped at 100 rows.
 *
 * @param {string} matchExpression - A complete FTS5 query expression.
 * @returns {Array<object>}
 */
function runFtsQuery(matchExpression) {
  return db
    .query(
      `
      SELECT images.*
      FROM image_fts
      JOIN images ON image_fts.rowid = images.id
      WHERE image_fts MATCH ?
      ORDER BY rank
      LIMIT 100
      `
    )
    .all(matchExpression);
}

/**
 * Search indexed images by their OCR text.
 *
 * Strategy:
 *  - one word: prefix match on that word;
 *  - several words: exact phrase first, then "any word as a prefix" (OR);
 *  - fewer than 3 hits, or a failing SQLite query: fuzzy fallback
 *    (`fuzzySearchFallback`), which is slower.
 *
 * @param {string} query - Free-text query as typed by the user.
 * @returns {Array<object>} Matching rows from `images`; empty when the query
 *   contains no words.
 */
function searchImages(query) {
  const normalizedQuery = normalizeText(query);
  // Tokenize the query.
  const words = normalizedQuery.split(/\s+/).filter((w) => w.length > 0);
  if (words.length === 0) {
    return [];
  }

  try {
    let searchResults;
    if (words.length === 1) {
      // Trailing * turns the term into a prefix query for partial matching.
      searchResults = runFtsQuery(`${quoteFtsString(words[0])}*`);
    } else {
      // 1. Exact phrase.
      searchResults = runFtsQuery(quoteFtsString(normalizedQuery));
      if (searchResults.length === 0) {
        // 2. Any of the words, each as a prefix: "w1"* OR "w2"* OR ...
        const anyWord = words.map((word) => `${quoteFtsString(word)}*`).join(" OR ");
        searchResults = runFtsQuery(anyWord);
      }
    }

    // Too few FTS hits: let the fuzzy matcher have a go instead.
    if (searchResults.length < 3) {
      console.log("Using uFuzzy search fallback for:", query);
      return fuzzySearchFallback(normalizedQuery);
    }
    return searchResults;
  } catch (error) {
    console.error("Search error:", error);
    // The SQLite query failed; the fuzzy fallback does not depend on FTS.
    return fuzzySearchFallback(normalizedQuery);
  }
}
/**
 * Fuzzy search over stored OCR text using uFuzzy, used when the FTS query
 * returns too few rows or fails.
 *
 * Only the first 1000 rows of `images` are considered, to bound the cost of
 * the in-memory search.
 *
 * @param {string} query - Search text (callers pass the normalized query).
 * @returns {Array<object>} Up to 100 matching rows from `images`, ranked by
 *   uFuzzy when ranking information is available, otherwise in table order.
 */
function fuzzySearchFallback(query) {
  const maxResults = 100;

  // Ideally this list would be pre-filtered further before fuzzy matching.
  const allImages = db.query("SELECT * FROM images LIMIT 1000").all();
  if (allImages.length === 0) {
    return [];
  }

  // One haystack entry per image, index-aligned with allImages.
  const haystack = allImages.map((img) => img.ocr_text || "");

  // NOTE(review): uFuzzy documents intraMode 0 (MultiInsert) and 1
  // (SingleError); 2 is not a documented value, and intraSub/intraTrn/intraDel
  // are documented as applying to SingleError mode only. Confirm which mode
  // is intended. Values are left exactly as they were.
  const uf = new uFuzzy({
    intraIns: 0, // max extra characters allowed inside a term
    intraSub: 1, // substitutions allowed per term
    intraTrn: 1, // transpositions allowed per term
    intraDel: 1, // deletions allowed per term
    intraChars: "[a-z\\d']", // characters that may appear inside a term
    intraMode: 2, // see review note above
    interLft: 0, // left term boundary: any
    interRgt: 0, // right term boundary: any
    interChars: ".", // characters allowed between terms
    infoThresh: 1000,
    sortPreset: "search",
  });

  // Try the terms in the order given first; if nothing matches, retry
  // allowing the terms to appear out of order.
  let [idxs, info, order] = uf.search(haystack, query);
  if (!idxs || idxs.length === 0) {
    [idxs, info, order] = uf.search(haystack, query, 1);
  }
  if (!idxs || idxs.length === 0) {
    return [];
  }

  // uFuzzy returns null for `info` and `order` when it skips ranking (its
  // docs describe this for result sets above infoThresh). In that case use
  // the unranked match indices instead of dereferencing null.
  const matchedIdxs = info && order ? order.map((pos) => info.idx[pos]) : idxs;

  // Map haystack indices back to image rows, keeping at most maxResults.
  const results = [];
  for (const idx of matchedIdxs) {
    if (idx >= 0 && idx < allImages.length) {
      results.push(allImages[idx]);
    }
    if (results.length >= maxResults) {
      break;
    }
  }
  return results;
}
/**
 * Normalize text for matching: lower-case it, collapse every run of
 * whitespace into a single space and strip leading/trailing whitespace.
 *
 * @param {string} text - Raw text; any falsy value yields "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  if (text) {
    const tokens = text
      .toLowerCase()
      .split(/\s+/)
      .filter((token) => token !== "");
    return tokens.join(" ");
  }
  return "";
}
/**
 * Build a JSON HTTP response.
 *
 * @param {unknown} payload - Value to serialize as the response body.
 * @param {number} [status=200] - HTTP status code.
 * @returns {Response}
 */
function jsonResponse(payload, status = 200) {
  return new Response(JSON.stringify(payload), {
    status,
    headers: { "Content-Type": "application/json" },
  });
}

/**
 * Read the request body as a JSON object.
 *
 * @param {Request} req
 * @returns {Promise<object|null>} The parsed object, or `null` when the body
 *   is missing, is not valid JSON, or is not a JSON object.
 */
async function readJsonBody(req) {
  try {
    const body = await req.json();
    return body !== null && typeof body === "object" ? body : null;
  } catch {
    return null;
  }
}

// HTTP server on port 3000.
//   /api/scan    JSON body { folder }  -> scan results
//   /api/search  JSON body { query }   -> matching image rows
//   /api/image?id=N                    -> the image file itself
//   anything else                      -> static file from ./public
const server = serve({
  port: 3000,
  async fetch(req) {
    const url = new URL(req.url);

    if (url.pathname === "/api/scan") {
      const body = await readJsonBody(req);
      if (typeof body?.folder !== "string" || body.folder === "") {
        return jsonResponse({ error: 'Expected a JSON body with a non-empty string "folder"' }, 400);
      }
      try {
        return jsonResponse(await processFolder(body.folder));
      } catch (error) {
        // E.g. the folder does not exist or cannot be listed.
        console.error(`Scan of ${body.folder} failed:`, error);
        return jsonResponse({ error: `Could not scan folder: ${error.message}` }, 500);
      }
    }

    if (url.pathname === "/api/search") {
      const body = await readJsonBody(req);
      if (body === null) {
        return jsonResponse({ error: "Expected a JSON body" }, 400);
      }
      // A missing query still yields an empty result list; only a query of
      // the wrong type is rejected.
      if (body.query != null && typeof body.query !== "string") {
        return jsonResponse({ error: '"query" must be a string' }, 400);
      }
      return jsonResponse(searchImages(body.query));
    }

    if (url.pathname === "/api/image") {
      const id = url.searchParams.get("id");
      const image = db.query("SELECT * FROM images WHERE id = ?").get(id);
      if (!image) {
        return new Response("Image not found", { status: 404 });
      }
      const imageFile = Bun.file(image.path);
      // The row may outlive the file on disk.
      if (!(await imageFile.exists())) {
        return new Response("Image not found", { status: 404 });
      }
      return new Response(imageFile);
    }

    // Serve static files from the public directory; "/" maps to index.html.
    const filePath = join(import.meta.dir, "public", url.pathname === "/" ? "index.html" : url.pathname);
    const file = Bun.file(filePath);
    if (await file.exists()) {
      return new Response(file);
    }
    return new Response("Not found", { status: 404 });
  },
});
console.log(`Server running at http://localhost:3000`);
// CLI interface for testing:
//   <script> scan <folder>    process a folder, print the count, exit
//   <script> search <query>   print matching images, exit
// With no (or an unrecognized) command the process keeps running as a server.
if (process.argv.length > 2) {
  const [, , command, argument] = process.argv;

  if (command === "scan" && argument !== undefined) {
    processFolder(argument)
      .then((results) => {
        console.log(`Processed ${results.length} images`);
        process.exit(0);
      })
      .catch((error) => {
        // Without this handler a failed scan is an unhandled rejection.
        console.error("Scan failed:", error);
        process.exit(1);
      });
  } else if (command === "search" && argument !== undefined) {
    const results = searchImages(argument);
    console.log(`Found ${results.length} results for "${argument}":`);
    for (const img of results) {
      console.log(`- ${img.filename} (${img.path})`);
    }
    process.exit(0);
  }
}
export { processFolder, searchImages };