LinearRegression.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

def cost_function(X, y, theta, b):
    """Mean squared error of the linear model X @ theta + b; also returns the residuals."""
    m = X.shape[0]
    error = (np.dot(X, theta) + b) - y       # residuals: predictions minus targets
    cost = (1 / m) * np.dot(error.T, error)  # MSE = (1/m) * sum(error_i^2)
    return cost, error
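
# A minimal sanity check for cost_function (illustrative values, not from the
# original script): a perfect fit should give zero cost and zero residuals.
#   X = np.array([[1.0], [2.0]]); y = np.array([2.0, 4.0])
#   cost, err = cost_function(X, y, theta=np.array([2.0]), b=0.0)
#   -> cost == 0.0, err == array([0., 0.])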

def gradient_descent(X, y, theta, b, alpha, iters, batch_size):
    """Mini-batch gradient descent: `iters` passes over fixed, contiguous batches."""
    cost_array = np.zeros(iters)
    batches = [(X[i:i + batch_size], y[i:i + batch_size])
               for i in range(0, len(X), batch_size)]
    cost, error = None, None
    for i in range(iters):
        for x_, y_ in batches:
            cost, error = cost_function(x_, y_, theta, b)
            # Gradients of (1/m) * sum(error^2): (2/m) * X^T error for theta
            # and (2/m) * sum(error) for the bias b.
            theta = theta - alpha * (2 / x_.shape[0]) * np.dot(x_.T, error)
            b = b - alpha * (2 / x_.shape[0]) * np.sum(error)
        if (i + 1) % 100 == 0:
            print("Iteration No: {}\t Cost : {}".format(i + 1, cost))
        cost_array[i] = cost  # cost of the last mini-batch in this pass
    return theta, b, cost_array
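
# Example of the batch slicing above (illustrative sizes, not from the original):
# with len(X) == 2500 and batch_size == 1000, `batches` holds three chunks of
# 1000, 1000 and 500 rows, and every iteration sweeps all three in order.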

def predict(X, theta, b):
    """Linear model predictions: X @ theta + b."""
    return np.dot(X, theta) + b

def main():
    data = pd.read_csv('4.csv')
    # dropna returns a new frame, so the result must be assigned back
    # (the original call discarded it and kept rows with missing values).
    data = data.dropna(axis=0, how='any')
    # Encode the date as an integer YYYYMMDD so it can serve as a numeric feature.
    data['date'] = pd.to_datetime(data['date']).dt.strftime("%Y%m%d").astype(int)
    X = np.array(data.drop(['id', 'price'], axis=1).copy())
    y = np.array(data['price'].copy())
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% test
    print("X_train shape: {}".format(X_train.shape))
    print("y_train shape: {}".format(y_train.shape))
    print("X_test shape: {}".format(X_test.shape))
    print("y_test shape: {}\n".format(y_test.shape))
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)  # fit scaling statistics on the training split only
    X_test = sc.transform(X_test)        # reuse the training mean/std on the test split
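    # StandardScaler maps each feature column to z = (x - mean) / std using the
    # training statistics; e.g. a column [1, 2, 3] becomes roughly
    # [-1.22, 0.0, 1.22] (illustrative values, not from the dataset).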
    # Prepend a column of ones as an intercept feature. Note this is redundant
    # with the separate bias b below, but harmless: the two terms jointly learn
    # the intercept.
    X_train = np.c_[np.ones(X_train.shape[0]), X_train]
    X_test = np.c_[np.ones(X_test.shape[0]), X_test]
    alpha = 0.01      # learning rate
    b = 0             # initial bias
    iterations = 1000
    batch_size = 1000
    theta = np.zeros(X_train.shape[1])  # one weight per column, including the ones column
    initial_cost, _ = cost_function(X_train, y_train, theta, b)
    print('With initial theta values of {0}, cost error on train set is {1}'.format(theta, initial_cost))
    theta, b, cost_num = gradient_descent(X_train, y_train, theta, b, alpha, iterations, batch_size)
    final_cost, _ = cost_function(X_train, y_train, theta, b)
    print('Weights:\n{0}\nCost error on train set is: {1}'.format(theta, final_cost))
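    # matplotlib is imported above but otherwise unused; a minimal, assumed
    # sketch for inspecting convergence from the returned per-iteration costs:
    #   plt.plot(cost_num)
    #   plt.xlabel('Iteration'); plt.ylabel('Cost (last mini-batch)')
    #   plt.show()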
    y_pred = predict(X_test, theta, b)
    print('R2 score on test set:', r2_score(y_test, y_pred))
    print('Mean squared error on test set:', mean_squared_error(y_test, y_pred))
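
# As an assumed sanity check (not part of the original script), the learned
# weights could be compared against scikit-learn's closed-form solution on the
# same standardized data, e.g. inside main():
#   from sklearn.linear_model import LinearRegression
#   lr = LinearRegression().fit(X_train, y_train)
#   print(lr.coef_, lr.intercept_)  # coef_[1:] should roughly match theta[1:];
#   # the intercept is split here between theta[0] and b.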
if __name__ == "__main__":
main()