-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtree.py
116 lines (107 loc) · 3.47 KB
/
tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import math
from collections import Counter, defaultdict
class entropyDecisionTreeClassifier:
    """ID3-style decision tree classifier using entropy / information gain.

    Expects pandas DataFrames of categorical features: ``fit`` takes an
    ``x_train`` DataFrame of feature columns and a ``y_train`` DataFrame
    whose first column is the target. Splitting stops when a node is pure,
    when no features remain, or when the majority class covers at least
    ``PURITY_THRESHOLD`` of the node's samples.
    """

    # Majority-class fraction at which a node is considered pure enough
    # to become a leaf (early stopping).
    PURITY_THRESHOLD = 0.95

    def __init__(self):
        self.dt = None  # root tree node; populated by fit()

    class tree:
        """A single node of the decision tree."""

        def __init__(self):
            self.colname = None        # feature column this node splits on
            self.thisclass = None      # majority class here (fallback prediction)
            self.isdecisive = False    # True -> leaf node
            self.availablecols = []    # feature columns still unused below this node
            self.trees = {}            # child nodes keyed by feature value

    def entropy(self, column, findmax=False):
        """Return the base-2 Shannon entropy of the labels in *column*.

        When *findmax* is True, return ``(ent, maxclass, fin)`` where
        *maxclass* is the majority label (ties broken by first occurrence)
        and *fin* is True when the majority covers >= PURITY_THRESHOLD of
        the samples — the early-stopping signal used while building.
        """
        total = len(column)
        if total == 0:
            # Empty node: zero entropy, no majority, nothing left to split.
            return (0.0, None, True) if findmax else 0.0
        counts = Counter(column)
        ent = -sum((c / total) * math.log2(c / total) for c in counts.values())
        if findmax:
            maxclass, maxcount = counts.most_common(1)[0]
            fin = maxcount / total >= self.PURITY_THRESHOLD
            return ent, maxclass, fin
        return ent

    def entropypercategory(self, column, y):
        """Weighted average entropy of labels *y* after partitioning by *column*.

        This is the conditional entropy H(y | column); subtracting it from
        the parent entropy gives the information gain of the split.
        """
        total = len(column)
        groups = defaultdict(list)
        for value, label in zip(column, y):
            groups[value].append(label)
        return sum(
            (len(labels) / total) * self.entropy(labels)
            for labels in groups.values()
        )

    def retrievenewrows(self, x, y, col, cols):
        """Partition (x, y) by the values of feature *col*.

        Returns ``{category: [sub_x, sub_y, sub_cols]}`` where *sub_x* is
        restricted to the remaining feature columns *cols* (an empty list
        when no feature columns remain).
        """
        partitions = {}
        for cat in set(x[col]):
            mask = x[col] == cat
            if cols:
                sub_x = x[cols].loc[mask]
                sub_cols = list(sub_x)
            else:
                sub_x, sub_cols = [], []
            partitions[cat] = [sub_x, y.loc[mask], sub_cols]
        return partitions

    def builddecisiontree(self, x, y, xcols, ycol):
        """Recursively build an ID3 tree over features *xcols* targeting *ycol*.

        Every node records its majority class so prediction can fall back
        to it when it meets an unseen feature value.
        """
        node = self.tree()
        labels = list(y[ycol].values)
        node_entropy, majority, nearly_pure = self.entropy(labels, True)
        node.thisclass = majority
        # Leaf conditions: pure node, exhausted features, or near-purity.
        if node_entropy == 0 or not xcols or nearly_pure:
            node.isdecisive = True
            return node
        # Choose the feature with the highest information gain.
        best, best_gain, best_index = None, -1, 0
        for i, col in enumerate(xcols):
            gain = node_entropy - self.entropypercategory(list(x[col].values), labels)
            if gain > best_gain:
                best, best_gain, best_index = col, gain, i
        node.colname = best
        node.availablecols = xcols[:best_index] + xcols[best_index + 1:]
        subsets = self.retrievenewrows(x, y, best, node.availablecols)
        for cat, (sub_x, sub_y, sub_cols) in subsets.items():
            node.trees[cat] = self.builddecisiontree(sub_x, sub_y, sub_cols, ycol)
        return node

    def predict_tree(self, dt, xtest):
        """Predict a label for each row of DataFrame *xtest* using tree *dt*.

        A feature value never seen during training stops the descent and
        yields the majority class of the deepest node reached.
        """
        predictions = []
        for _, row in xtest.iterrows():
            node = dt
            while not node.isdecisive:
                child = node.trees.get(row[node.colname])
                if child is None:
                    break  # unseen category: fall back to this node's majority
                node = child
            predictions.append(node.thisclass)
        return predictions

    def fit(self, x_train, y_train):
        """Build the tree from DataFrames *x_train* / *y_train*.

        The first column of *y_train* is taken as the target column.
        """
        self.dt = self.builddecisiontree(
            x_train, y_train, list(x_train), list(y_train)[0]
        )

    def predict(self, x_test):
        """Return a list of predicted labels for the rows of *x_test*."""
        return self.predict_tree(self.dt, x_test)

    def score(self, x_test, y_test):
        """Return mean accuracy of the fitted tree on (*x_test*, *y_test*)."""
        from .metrics import accuracy_score  # project-local metric
        pred = self.predict(x_test)
        return accuracy_score(y_test, pred)