-
Notifications
You must be signed in to change notification settings - Fork 0
/
KNN.py
79 lines (57 loc) · 2.31 KB
/
KNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
# Column names passed to read_csv via `names=`; the last one is the label.
cols = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']
df = pd.read_csv("magic04.data", names=cols)

# Preview the first rows of the raw data.
print(df.head())

# Encode the label as an integer: "g" becomes 1, every other value becomes 0.
df['class'] = (df['class'] == "g").astype(int)

# Per-feature histograms comparing the two classes.
# Each feature is drawn on its own figure; without a fresh figure every
# histogram would pile up on the same axes (mixed x-ranges, duplicated
# legend entries) because plt.show() is not called between iterations.
for label in cols[:-1]:
    plt.figure()
    plt.hist(df[df["class"] == 1][label], color='blue', label='gamma', alpha=0.7, density=True)
    plt.hist(df[df["class"] == 0][label], color='red', label='hadron', alpha=0.7, density=True)
    plt.title(label)
    plt.ylabel("Probability")
    plt.xlabel(label)
    plt.legend()
    # NOTE: displaying is intentionally disabled; enable the next line to
    # view each figure interactively.
    # plt.show()

# Train / validation / test split: shuffle all rows, then cut at 60% and 80%.
# Plain positional slicing is used instead of np.split(DataFrame, ...), which
# relies on the deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1)
trainEnd = int(0.6 * len(df))
validEnd = int(0.8 * len(df))
train = shuffled.iloc[:trainEnd]
valid = shuffled.iloc[trainEnd:validEnd]
test = shuffled.iloc[validEnd:]
def scaleDataset(dataframe, oversample=False, scaler=None):
    """Standardize the features of *dataframe* and optionally balance its classes.

    The last column of *dataframe* is taken as the label; every other column
    is treated as a feature.

    Args:
        dataframe: pandas DataFrame whose last column holds the class label.
        oversample: when True, the scaled data is resampled with
            ``RandomOverSampler`` so the under-represented class is
            replicated until the class counts match.
        scaler: an already-fitted scaler (e.g. ``StandardScaler``) whose
            ``transform`` is applied to the features. When None (the
            default), a new ``StandardScaler`` is fitted on *dataframe*
            itself, which is the original behaviour.

    Returns:
        A tuple ``(data, x, y)`` where ``x`` is the scaled feature matrix,
        ``y`` is the label vector and ``data`` is ``x`` with ``y`` appended
        as the last column.
    """
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    if scaler is None:
        x = StandardScaler().fit_transform(x)
    else:
        # Reuse statistics learned elsewhere (normally on the training set)
        # so every split is mapped into the same feature space.
        x = scaler.transform(x)

    if oversample:
        x, y = RandomOverSampler().fit_resample(x, y)

    data = np.hstack((x, np.reshape(y, (-1, 1))))
    return data, x, y


# Comparing len(train[train["class"] == 1]) with len(train[train["class"] == 0])
# before resampling shows that the two classes are far from equally
# represented, so the training split is oversampled inside scaleDataset.
#
# The scaler is fitted on the training features only and then reused for the
# validation and test splits; fitting a separate scaler per split would
# standardize each split with different statistics.
featureScaler = StandardScaler().fit(train[train.columns[:-1]].values)

train, xTrain, yTrain = scaleDataset(train, oversample=True, scaler=featureScaler)
valid, xvalid, yvalid = scaleDataset(valid, oversample=False, scaler=featureScaler)
test, xtest, yTest = scaleDataset(test, oversample=False, scaler=featureScaler)
# Debugging aid: to check the effect of oversampling, compare len(yTrain)
# with sum(yTrain == 1) and sum(yTrain == 0) after the call above, and with
# the per-class row counts of the training split before it.

# KNN
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier with a single neighbour, trained on the
# (oversampled) training split.
knnModel = KNeighborsClassifier(n_neighbors=1).fit(xTrain, yTrain)

# Predict the held-out test split and print the per-class metrics report.
# Printing yPredict and yTest side by side is a quick way to eyeball errors.
yPredict = knnModel.predict(xtest)
report = classification_report(yTest, yPredict)
print(report)