"""
In this lab, we once again have a mandatory 'python' challenge.
Then we have a more open-ended Machine Learning 'see why' challenge.
This data is the "Is Wikipedia Literary?" dataset that I pitched.
You can contribute to science or get a sense of the data here: https://label.jjfoley.me/wiki
"""
import gzip, json
from shared import dataset_local_path, TODO
from dataclasses import dataclass
from typing import Dict, List
"""
Problem 1: We have a copy of Wikipedia (I spared you the other 6 million pages).
It is separate from our labels we collected.
"""
@dataclass
class JustWikiPage:
    title: str
    wiki_id: str
    body: str

# Load our pages into this pages list.
pages: List[JustWikiPage] = []
with gzip.open(dataset_local_path("tiny-wiki.jsonl.gz"), "rt") as fp:
    for line in fp:
        entry = json.loads(line)
        pages.append(JustWikiPage(**entry))

@dataclass
class JustWikiLabel:
    wiki_id: str
    is_literary: bool

# Load our judgments/labels/truths/ys into this labels list:
labels: List[JustWikiLabel] = []
with open(dataset_local_path("tiny-wiki-labels.jsonl")) as fp:
    for line in fp:
        entry = json.loads(line)
        labels.append(
            JustWikiLabel(wiki_id=entry["wiki_id"], is_literary=entry["truth_value"])
        )

@dataclass
class JoinedWikiData:
    wiki_id: str
    is_literary: bool
    title: str
    body: str

print(len(pages), len(labels))
print(pages[0])
print(labels[0])
joined_data: Dict[str, JoinedWikiData] = {}
# TODO("1. create a list of JoinedWikiData from the ``pages`` and ``labels`` lists.")
# This challenge has some very short solutions, so it's more conceptual. If you're stuck after ~10-20 minutes of thinking, ask!
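# A minimal sketch of one possible join, assuming every label has exactly one
# matching page (the asserts below check that assumption holds). Treat this as
# a starting point, not the only solution.
page_by_id = {page.wiki_id: page for page in pages}  # index pages by wiki_id
for label in labels:
    page = page_by_id[label.wiki_id]  # look up the page this label refers to
    joined_data[label.wiki_id] = JoinedWikiData(
        wiki_id=label.wiki_id,
        is_literary=label.is_literary,
        title=page.title,
        body=page.body,
    )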
############### Problem 1 ends here ###############
# Make sure it is solved correctly!
assert len(joined_data) == len(pages)
assert len(joined_data) == len(labels)
# Make sure it has *some* positive labels!
assert sum([1 for d in joined_data.values() if d.is_literary]) > 0
# Make sure it has *some* negative labels!
assert sum([1 for d in joined_data.values() if not d.is_literary]) > 0
# Construct our ML problem:
ys = []
examples = []
for wiki_data in joined_data.values():
    ys.append(wiki_data.is_literary)
    examples.append(wiki_data.body)

## We're actually going to split before converting to features now...
from sklearn.model_selection import train_test_split
import numpy as np
RANDOM_SEED = 1234
## split off train/validate (tv) pieces.
ex_tv, ex_test, y_tv, y_test = train_test_split(
    examples,
    ys,
    train_size=0.75,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# split off train, validate from (tv) pieces.
ex_train, ex_vali, y_train, y_vali = train_test_split(
    ex_tv, y_tv, train_size=0.66, shuffle=True, random_state=RANDOM_SEED
)

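# Net effect of the two splits (a quick sanity check, not from the original):
# 0.75 * 0.66 ~= 49.5% train, 0.75 * 0.34 ~= 25.5% validate, 25% test.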
## Convert to features, train simple model (TFIDF will be explained eventually.)
from sklearn.feature_extraction.text import TfidfVectorizer
# Only learn columns for words in the training data, to be fair.
word_to_column = TfidfVectorizer(
    strip_accents="unicode", lowercase=True, stop_words="english", max_df=0.5
)
word_to_column.fit(ex_train)
# Test words should surprise us, actually!
X_train = word_to_column.transform(ex_train)
X_vali = word_to_column.transform(ex_vali)
X_test = word_to_column.transform(ex_test)
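# Because the vocabulary was fit on ex_train only, all three matrices share the
# same columns; words seen only in vali/test are simply dropped. For example:
print("Learned {} word-columns.".format(X_train.shape[1]))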
print("Ready to Learn!")
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
models = {
    "SGDClassifier": SGDClassifier(),
    "Perceptron": Perceptron(),
    "LogisticRegression": LogisticRegression(),
    "DTree": DecisionTreeClassifier(),
}

for name, m in models.items():
    m.fit(X_train, y_train)
    print("{}:".format(name))
    print("\tVali-Acc: {:.3}".format(m.score(X_vali, y_vali)))
    if hasattr(m, "decision_function"):
        scores = m.decision_function(X_vali)
    else:
        scores = m.predict_proba(X_vali)[:, 1]
    print("\tVali-AUC: {:.3}".format(roc_auc_score(y_score=scores, y_true=y_vali)))

"""
Results should be something like:
SGDClassifier:
    Vali-Acc: 0.84
    Vali-AUC: 0.879
Perceptron:
    Vali-Acc: 0.815
    Vali-AUC: 0.844
LogisticRegression:
    Vali-Acc: 0.788
    Vali-AUC: 0.88
DTree:
    Vali-Acc: 0.739
    Vali-AUC: 0.71
"""
TODO("2. Explore why DecisionTrees are not beating linear models. Answer one of:")
TODO("2.A. Is it a bad depth?")
TODO("2.B. Do Random Forests do better?")
TODO(
    "2.C. Is it randomness? Use simple_boxplot and bootstrap_auc/bootstrap_acc to see if the differences are meaningful!"
)
TODO("2.D. Is it randomness? Control for random_state parameters!")