'''
cnn_toxed.py
@author: Prafull SHARMA
@Data Set: Kaggle Toxic Comment dataset
'''
from __future__ import print_function, division
from builtins import range
# the builtins import relies on the 'future' package for Python 2 compatibility
# (pip install -U future)
# ----------- Utils ---------------------------------
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ------------ NN libs ------------------------------
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPool1D
from keras.models import Model
from keras.layers import Conv1D, MaxPool1D, Embedding, Dropout
# ----------------------------------------------------
from configurations import *
from sklearn.metrics import roc_auc_score
# ----------------------------------------------------
# Embedding path...
emb_path = os.path.expanduser("~") + f'/EMBEDDINGS/glove.6B.{EMBEDDIN_DIM}d.txt'
# word -> vector dict built from the GloVe file
word2vec = {}
with open(emb_path, 'r', encoding='utf-8') as fil:
    for line in fil:
        values = line.split()
        # first token is the word, the rest are its embedding values
        word2vec[values[0]] = np.asarray(values[1:], dtype='float32')
print(f"FOUND: {len(word2vec)} word vectors")
# prepare Text samples and Labels
print('Loading comments ....')
train = pd.read_csv('data/kaggle_toxic_comment_challenge/train.csv')
sentences = train['comment_text'].fillna("DUMMY_VALUE").values
possible_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
targets = train[possible_labels].values
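# targets is an N x 6 binary matrix; a comment can carry several labels at once (multi-label)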
# analysing min, max of sequence length
seq_length = np.array([len(s.split()) for s in sentences])
# print(np.min(seq_length))
# print(np.max(seq_length))
# print(int(np.mean(seq_length)))
# Tokenization and index creation
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
# word2index : word ---> integer mapping
word2idx = tokenizer.word_index
print(f"FOUND: {len(word2idx)} unique tokens in data...")
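# note: word_index keeps every token seen; the num_words cap is only applied
# when texts are converted to sequences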
# Pad/truncate so that every sequence
# has the same length: shape [N x T]
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
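# pad_sequences pads and truncates at the start of each sequence by default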
print(f"Shape of data tensor: {data.shape}")
# Prepare the EMBEDDING MATRIX
print('Filling pre-trained embedding matrix....')
num_words = min(MAX_VOCAB_SIZE, len(word2idx)+1)
# shape = V x D
embedding_matrix = np.zeros((num_words, EMBEDDIN_DIM))
for word, i in word2idx.items():
if i < MAX_VOCAB_SIZE:
embedding_vector = word2vec.get(word, None)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
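# words without a GloVe vector (and index 0, reserved for padding) keep all-zero rows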
print(embedding_matrix.shape)
# ----------------- MODEL DEVELOPMENT -------------------------
# 1) EMBEDDING LAYER .......
embedding_layer = Embedding(
num_words,
EMBEDDIN_DIM,
weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=False
)
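# trainable=False keeps the pre-trained GloVe vectors frozen during training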
# 2) MODEL LAYERS............................................
# Creating a 1-D ConvNet with Global Max Pooling.
# The input batch has shape N x T, so the Input layer
# takes T = MAX_SEQUENCE_LENGTH
input_ = Input(shape=(MAX_SEQUENCE_LENGTH, ))
layer = embedding_layer(input_)
layer = Conv1D(128, 3, activation='relu')(layer)
layer = Dropout(0.3)(layer)
layer = MaxPool1D(3)(layer)
layer = Conv1D(128, 3, activation='relu')(layer)
layer = Dropout(0.4)(layer)
layer = MaxPool1D(3)(layer)
layer = Conv1D(128, 3, activation='relu')(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dense(128, activation='relu')(layer)
output = Dense(len(possible_labels), activation='sigmoid')(layer)
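# a sigmoid per label (instead of one softmax) lets the 6 toxicity labels fire independently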
# ----------------- MODEL COMPILE -------------------------..
model = Model(input_, output)
model.compile(
loss='binary_crossentropy',
optimizer='rmsprop',
metrics=['accuracy']
)
# ----------------- MODEL TRAINING --------------------------
history = model.fit(
data,
targets,
batch_size=BATCH_SIZE,
epochs=EPOCH,
validation_split=VALIDATION_SPLIT
)
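# history.history holds the per-epoch loss/accuracy for the training and validation splits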
# ---------------- METRICS CHECK ----------------------------
# losses
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.show()
# accuracies
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.legend()
plt.show()
# --------- AUC check -------------------------------------
predictions = model.predict(data)
# Note: validation_split in model.fit only holds data out during training;
# these predictions run over the full data set, so the AUC below is optimistic.
# Compute the ROC AUC for each label and report the mean.
aucs = []
for i in range(6):
auc = roc_auc_score(targets[:, i], predictions[:, i])
aucs.append(auc)
print()
print(f"AREA UNDER CURVE : {np.mean(aucs)}")