-
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmodel.py
123 lines (70 loc) · 3.98 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
"""model_2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jiO4CC4qSMyQbHlxl6-01jewT6afcLc1
## Install and import the required packages
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import pickle
"""## Mount Google Drive and import dataset"""
from google.colab import drive

# Colab-only step: make the user's Google Drive visible under /content/drive
# so the dataset can be read like a local file.
drive.mount("/content/drive")

# Location of the dataset inside the mounted Drive.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/deploy-ml-web-workshop/mbpt_dataset_2.csv"
data = pd.read_csv(dataset_path)

# Exploratory expressions carried over from the notebook (first rows,
# non-null counts per column, and number of rows per label). When this file
# runs as a plain script their results are computed and discarded.
data.head()
data.count()
data["type"].value_counts()
"""## Split train and test (to ensure balanced distribution of data)"""
# Hold out 20% of the rows for testing. Stratifying on the `type` column keeps
# the label proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data["type"],
    random_state=42,
)

# ANSI escape template used only to colour the headings below:
# "\033[93m" switches the terminal foreground to bright yellow and
# "\033[00m" resets it, so just the heading text is highlighted.
heading_template = "\033[93m {}\033[00m"

print(heading_template.format('TRAIN DATA \n'), train_data)
print(train_data["type"].value_counts())
print(heading_template.format('TEST DATA \n'), test_data)
print(test_data["type"].value_counts())
"""## Tokenize and transform the data"""
# TF-IDF features over the post text, capped at 5000 vocabulary terms and with
# English stop words removed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")

# Learn the vocabulary and IDF weights from the training posts only, so that
# nothing about the test posts leaks into the features.
vectorizer.fit(train_data.posts)

# Dense arrays are what the models below are trained on; both partitions are
# projected with the same fitted vectorizer.
train_post = vectorizer.transform(train_data.posts).toarray()
test_post = vectorizer.transform(test_data.posts).toarray()
train_post.shape  # notebook inspection; no effect when run as a script

# Map the string labels in `type` to integer class ids.
target_encoder = LabelEncoder()
train_target = target_encoder.fit_transform(train_data.type)
# Encode the test labels with the mapping learned from the training labels.
# Re-fitting here (fit_transform) would rebuild the mapping from the test
# labels alone and could silently assign different ids if the label sets
# differ; transform() raises instead if an unseen label appears.
test_target = target_encoder.transform(test_data.type)
"""## Models testing and selection"""
# Store the test accuracy of each model, keyed by a human-readable model name.
models_accuracy = {}
"""#### Logistic Regression"""
model_log = LogisticRegression(max_iter=3000, C=0.5, n_jobs=-1)
model_log.fit(train_post, train_target)

# Class names in encoded-id order, taken from the fitted encoder instead of
# assuming there are exactly 16 classes.
log_class_names = target_encoder.classes_

# Predict once per partition and reuse the result for both the report and the
# accuracy score.
log_train_pred = model_log.predict(train_post)
log_test_pred = model_log.predict(test_post)

print('Train Classification Report \n ', classification_report(train_target, log_train_pred, zero_division=0, target_names=log_class_names))
print('Test Classification Report \n', classification_report(test_target, log_test_pred, zero_division=0, target_names=log_class_names))
models_accuracy['Logistic Regression'] = accuracy_score(test_target, log_test_pred)
"""#### Linear Support Vector Classifier"""
model_linear_svc = LinearSVC(C=0.1)
model_linear_svc.fit(train_post, train_target)

# Class names in encoded-id order, taken from the fitted encoder instead of
# assuming there are exactly 16 classes.
svc_class_names = target_encoder.classes_

# Predict once per partition and reuse the result for both the report and the
# accuracy score.
svc_train_pred = model_linear_svc.predict(train_post)
svc_test_pred = model_linear_svc.predict(test_post)

print('Train Classification Report \n ', classification_report(train_target, svc_train_pred, zero_division=0, target_names=svc_class_names))
print('Test Classification Report \n', classification_report(test_target, svc_test_pred, zero_division=0, target_names=svc_class_names))
models_accuracy['Linear Support Vector Classifier'] = accuracy_score(test_target, svc_test_pred)
"""## Models accuracy summary"""
models_accuracy  # notebook inspection; no effect when run as a script

# One row per model: its name and its accuracy on the test partition.
accuracy_table = pd.DataFrame(models_accuracy.items(), columns=['Models', 'Test accuracy'])

# Best model first, with a fresh 0..n-1 index, shaded by score for display in
# the notebook.
ranked_accuracy = accuracy_table.sort_values(by='Test accuracy', ascending=False, ignore_index=True)
ranked_accuracy.style.background_gradient(cmap='Blues')
"""## Save (pickle) the final model"""
# Persist the fitted objects needed at prediction time. Each file is opened in
# a `with` block so the handle is flushed and closed even if pickling fails.
# NOTE(review): the label encoder is not saved here; a consumer that needs to
# turn predicted ids back into label names would need it too - confirm.

# Vectorizer
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
# Model
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_linear_svc, model_file)
"""### Load and test the saved model"""
# Round-trip check: reload the pickled vectorizer and model and run them on a
# sample message. Files are opened in `with` blocks so the handles are closed.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source (here, the files written just above).
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

message = ["This is pretty much the worse movie I have ever watched. It's completely thrash!"]
# Turn the raw text into the same TF-IDF feature space the model was trained on.
message = loaded_vectorizer.transform(message)
result = loaded_model.predict(message)
# Show both the predicted class id and its decoded label name.
print(result, target_encoder.inverse_transform(result))