-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlib.py
60 lines (49 loc) · 1.94 KB
/
lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from typing import Iterable
from pypdf import PdfReader
import json
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser, OrGroup
from whoosh.analysis import StemmingAnalyzer
from glob import glob
# Index layout: `path` is stored so a hit can report which file it came from;
# `content` is analysed with a stemming analyzer for searching and is not
# marked as stored.
schema = Schema(
    path=ID(stored=True),
    content=TEXT(analyzer=StemmingAnalyzer()),
)
# Name of the on-disk index directory.
INDEX_DIR = "whoosh_index"
def read_text_from_pdf(path):
    """Extract the text of every page of a PDF.

    Opens `path` with `PdfReader`, calls `extract_text()` on each page and
    joins the per-page results with a single space.

    Returns:
        A `(text, path)` tuple: the joined page text and the `path`
        argument unchanged.
    """
    pdf = PdfReader(path)
    page_texts = (page.extract_text() for page in pdf.pages)
    return ' '.join(page_texts), path
def extract_conversations(file):
    """Render the user messages of a JSON message file as plain text.

    The file must contain a JSON array of message objects (it appears to be
    a Slack channel export -- TODO confirm). A message is kept only when it
    has a 'text' key and no 'subtype' key; each kept message becomes one
    line of the form "<real name>: <text>".

    Args:
        file: Path of the JSON file to read.

    Returns:
        A `(text, file)` tuple: the kept messages joined with newlines, and
        the `file` argument unchanged.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(file, encoding="utf-8") as f:
        messages = json.load(f)

    lines = []
    for message in messages:
        if 'subtype' in message or 'text' not in message:
            continue
        # A missing or null profile, or a profile without 'real_name', falls
        # back to the placeholder instead of raising.
        profile = message.get('user_profile') or {}
        name = profile.get('real_name', "unknown user")
        lines.append(f"{name}: {message['text']}")
    return '\n'.join(lines), file
def build_whoosh_index(files: Iterable[str], indexdir) -> None:
    """Create a Whoosh index in `indexdir` with one document per file.

    Each file is passed through `extract_conversations`; the returned text
    is indexed as `content` and the returned path as `path`.

    Args:
        files: Paths of the JSON message files to index.
        indexdir: Directory that will hold the index. It is created if it
            does not exist yet.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # Make sure the target directory exists before creating the index in it.
    os.makedirs(indexdir, exist_ok=True)
    ix = create_in(indexdir, schema)

    # As a context manager the writer commits on success and cancels on
    # error, so a failing file does not leave the index write lock held.
    with ix.writer() as writer:
        for file in files:
            content, path = extract_conversations(file)
            writer.add_document(path=path, content=content)
def get_whoosh_ix(folder: str):
    """Open the Whoosh index located in `folder` and return it."""
    index = open_dir(folder)
    return index
# files = glob("./data/hackduke_slack/**/*.json", recursive=True)
# build_whoosh_index(files, INDEX_DIR)
class SearchResult:
    """A single search hit: the source file path and its extracted content."""

    def __init__(self, path, content) -> None:
        # path: value of the hit's stored `path` field.
        self.path = path
        # content: whatever the caller extracted from that file.
        self.content = content

    def __repr__(self) -> str:
        """Return a constructor-style representation for debugging."""
        return (
            f"{type(self).__name__}"
            f"(path={self.path!r}, content={self.content!r})"
        )
def search_by_term(INDEX_DIR, term):
    """Search the index in `INDEX_DIR` for `term` and return the hits.

    The query is parsed against the `content` field with `OrGroup` and the
    search is limited to 10 hits. For every hit the source file named by
    its stored `path` is read again with `extract_conversations`.

    Args:
        INDEX_DIR: Directory of the index to open. Note that this parameter
            shadows the module-level constant of the same name.
        term: Query string handed to the query parser.

    Returns:
        A list of `SearchResult` objects, one per hit.
    """
    index = get_whoosh_ix(INDEX_DIR)
    parser = QueryParser("content", schema=index.schema, group=OrGroup)
    query = parser.parse(term)

    with index.searcher() as searcher:
        # Stored fields must be read while the searcher is still open.
        hit_paths = [hit["path"] for hit in searcher.search(query, limit=10)]
        # NOTE(review): extract_conversations returns a (text, path) tuple,
        # so `content` receives that tuple rather than the text alone --
        # confirm this is what consumers of SearchResult expect.
        return [
            SearchResult(hit_path, extract_conversations(hit_path))
            for hit_path in hit_paths
        ]