-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathus_constitution.py
82 lines (66 loc) · 3.13 KB
/
us_constitution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import sys
import os.path
import regex
import pawpaw
from pawpaw import arborform
"""
DEMO: US CONSTITUTION
This demo shows an example of how to parse, visualize, and query the US Constitution using Pawpaw.
Note: The text for the constitution was taken from https://www.archives.gov/founding-docs/constitution-transcript
"""
def get_parser() -> arborform.Itorator:
    """Assemble the itorator pipeline used to parse the constitution text.

    The stages are created first and then wired together through their
    ``connections`` lists:

    * The article splitter splits at every position that is preceded by
      newline(s) and followed by ``Article.``.
    * The article desc stage labels each piece ``'article'`` when it starts
      with ``Article.`` and ``'preamble'`` otherwise.
    * Pieces labelled ``'preamble'`` are handed to a ``SimpleNlp`` itorator.
    * Pieces labelled ``'article'`` are handed to the article extractor,
      whose pattern captures the named groups ``key`` and ``value``.
    * A ``value`` that starts with ``Section.`` goes to the section splitter
      and then the section extractor (again ``key`` / ``value``); that
      section ``value`` is handed to a ``SimpleNlp`` itorator.
    * A ``value`` that does not start with ``Section.`` is handed directly
      to a ``SimpleNlp`` itorator.

    Returns:
        The article splitter, i.e. the first stage of the pipeline.
    """
    add_children = arborform.Connectors.Children.Add

    # Predicates and the desc function are deliberately left unannotated,
    # matching the plain single-argument callables used originally.
    def label_article(ito):
        if ito.str_startswith('Article.'):
            return 'article'
        return 'preamble'

    def is_preamble(ito):
        return ito.desc == 'preamble'

    def is_article(ito):
        return ito.desc == 'article'

    def is_value(ito):
        return ito.desc == 'value'

    def is_sectioned_value(ito):
        return is_value(ito) and ito.str_startswith('Section.')

    def is_plain_value(ito):
        return is_value(ito) and not ito.str_startswith('Section.')

    # --- Stages -----------------------------------------------------------

    # Article: could be preamble
    article_splitter = arborform.Split(
        regex.compile(r'(?<=\n+)(?=Article\.)', regex.DOTALL),
        boundary_retention=arborform.Split.BoundaryRetention.NONE,
        tag='article splitter',
    )
    article_desc = arborform.Desc(desc=label_article, tag='article desc')
    article_extractor = arborform.Extract(
        regex.compile(r'Article\. (?<key>[A-Z]+)\.\n(?<value>.+)', regex.DOTALL),
        tag='article extractor',
    )

    # Section: only some articles have sections
    section_splitter = arborform.Split(
        regex.compile(r'(?<=\n+)(?=Section\.)', regex.DOTALL),
        boundary_retention=arborform.Split.BoundaryRetention.LEADING,
        desc='section',
        tag='section splitter',
    )
    section_extractor = arborform.Extract(
        regex.compile(r'Section\. (?<key>\d+)\.\n(?<value>.+)', regex.DOTALL)
    )

    # --- Wiring (append order per stage is significant and preserved) ------

    article_splitter.connections.append(arborform.Connectors.Delegate(article_desc))

    article_desc.connections.append(
        add_children(pawpaw.nlp.SimpleNlp().itor, is_preamble)
    )
    article_desc.connections.append(add_children(article_extractor, is_article))

    article_extractor.connections.append(
        add_children(section_splitter, is_sectioned_value)
    )
    article_extractor.connections.append(
        add_children(pawpaw.nlp.SimpleNlp().itor, is_plain_value)
    )

    section_splitter.connections.append(add_children(section_extractor))

    section_extractor.connections.append(
        add_children(pawpaw.nlp.SimpleNlp().itor, is_value)
    )

    return article_splitter
def get_text() -> pawpaw.Ito:
    """Load the constitution transcript and wrap it in a root Ito.

    The file ``us_constitution.txt`` is looked up in ``sys.path[0]`` (the
    script's own directory when this file is run directly).

    Returns:
        A ``pawpaw.Ito`` built from the full file contents, with desc
        ``'constitution'``.

    Raises:
        OSError: If the text file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    text_path = os.path.join(sys.path[0], 'us_constitution.txt')
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (e.g. cp1252 on Windows).
    # NOTE(review): assumes the bundled transcript is UTF-8/ASCII - confirm.
    with open(text_path, encoding='utf-8') as f:
        return pawpaw.Ito(f.read(), desc='constitution')
# Visualize: parse the text, attach the parser output as children of the
# root Ito, and print the resulting tree.
print('\nVISUALIZE:\n')
root = get_text()
tree_vis = pawpaw.visualization.pepo.Tree()
parser = get_parser()
root.children.add(*parser(root))
print(tree_vis.dumps(root))

# Query: list every section matched by the query below.
print('\nQUERY:\n')
print('\tGoal: Find sections containing words \'power\' or \'right\'\n')
query = '**[d:section]{**[d:word] & [lcs:power,right]}'
print(f'\tPlumule Query: {query}\n')
print('\tResults:\n')
# Use a dedicated counter name: the original reused ``i`` for both the root
# Ito and the loop index, which left the root rebound to an int.
for match_no, section in enumerate(root.find_all(query)):
    article_key = section.find('..[d:article]/*[d:key]')
    section_key = section.find('*[d:key]')
    section_value = section.find('*[d:value]')
    print(f'\t\tMatch {match_no}: Article {article_key}, Section {section_key}')
    print(f'\t\t\t{section_value:%substr:45…}')