# p14-ensembles.py — bagging / weighted-ensemble exercise (95 lines, 2.73 KB)
import random
import numpy as np
from dataclasses import dataclass, field
from sklearn.base import ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.utils import resample
import typing as T
from shared import TODO
# start off by seeding random number generators:
RANDOM_SEED = 12345
random.seed(RANDOM_SEED)  # seed Python's stdlib RNG
np.random.seed(RANDOM_SEED)  # seed NumPy's global RNG (used by sklearn's resample below)
# import data; choose feature space
from dataset_poetry import y_train, Xd_train, y_vali, Xd_vali
# Select the "numeric" feature representation for both splits.
X_train = Xd_train["numeric"]
X_vali = Xd_vali["numeric"]
def y_to_sign(x: bool) -> int:
    """Map a boolean label onto the {-1, +1} voting convention."""
    return 1 if x else -1
@dataclass
class WeightedEnsemble(ClassifierMixin):
    """A weighted ensemble: a list of (weight, classifier) tuples.

    Prediction is a weighted vote: each member contributes +weight when it
    predicts True and -weight when it predicts False; the ensemble predicts
    True exactly when the vote sum is positive.
    """

    # (weight, fitted-classifier) pairs; classifiers must implement .predict
    members: T.List[T.Tuple[float, T.Any]] = field(default_factory=list)

    def insert(self, weight: float, model: T.Any):
        """Add a (weight, model) pair to the ensemble."""
        self.members.append((weight, model))

    def predict_one(self, x: np.ndarray) -> bool:
        """Predict a single example by summing weighted +1/-1 votes."""
        vote_sum = 0.0
        for weight, clf in self.members:
            # clf.predict expects a 2-D batch; wrap x and take the lone result.
            vote_sum += y_to_sign(clf.predict([x])[0]) * weight
        return vote_sum > 0

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict every row of X.

        Returns a boolean array of shape (n_samples,).

        Fix: the original accumulated into ``np.zeros((N, 1))`` and therefore
        returned a 2-D (N, 1) boolean array, violating sklearn's ``predict``
        contract of shape ``(n_samples,)`` (which ``ClassifierMixin.score``
        relies on). The per-example Python inner loop is also replaced with a
        vectorized ``np.where`` update.
        """
        (N, _) = X.shape
        class_votes = np.zeros(N)
        for weight, clf in self.members:
            ys = np.asarray(clf.predict(X), dtype=bool)
            # True -> +weight, False -> -weight, applied to all rows at once.
            class_votes += np.where(ys, weight, -weight)
        return class_votes > 0
#%%
# Grow a bagged forest one tree at a time, tracking validation accuracy of
# each individual tree and of the whole ensemble as it grows.
tree_num = []
tree_vali = []
forest_vali = []
forest = WeightedEnsemble()
(N, D) = X_train.shape
for i in range(100):
    # bootstrap sample the training data (N rows drawn with replacement)
    X_sample, y_sample = resample(X_train, y_train)  # type:ignore
    # Fix: the TODO(...) placeholder would raise at runtime; fit a decision
    # tree on the bootstrap sample — the standard bagging base learner.
    # (A per-tree seed keeps the run reproducible while trees still differ.)
    tree = DecisionTreeClassifier(random_state=RANDOM_SEED + i)
    tree.fit(X_sample, y_sample)
    # Experiment ideas: instead of every tree getting the same 1.0 weight, try
    # - weight = the tree's accuracy on the whole training set,
    # - weight = the tree's accuracy on the validation set,
    # - weight = random.random(), or
    # - weight = 0.1
    weight = 1.0
    # hold onto it for voting
    forest.insert(weight, tree)
    # record accuracies so we can plot tree-vs-forest performance later
    tree_num.append(i)
    tree_vali.append(tree.score(X_vali, y_vali))
    forest_vali.append(forest.score(X_vali, y_vali))
    if i % 5 == 0:
        print("Tree[{}] = {:.3}".format(i, tree_vali[-1]))
        print("Forest[{}] = {:.3}".format(i, forest_vali[-1]))
import matplotlib.pyplot as plt

# Individual trees are noisy; the ensemble curve should sit above them.
figure, axes = plt.subplots()
axes.plot(tree_num, tree_vali, label="Individual Trees", alpha=0.5)
axes.plot(tree_num, forest_vali, label="Random Forest")
axes.legend()
plt.show()