Skip to content

Commit e295ed3

Browse files
committedNov 3, 2017
score: 0.42926
1 parent 9f50325 commit e295ed3

File tree

1 file changed

+35
-3
lines changed

1 file changed

+35
-3
lines changed
 

‎spooky_author/spooky_explore.py

+35-3
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,39 @@
77

88
import pandas as pd
99

10+
# load dataset
11+
# egitim_seti = train_set
12+
13+
# test seti = test_set
1014
# Veri setlerini yukleyelim.
11-
egitim_seti = pd.read_csv('spooky_train.csv', index_col=False)
12-
test_seti = pd.read_csv('spooky_test.csv', index_col=False)
15+
egitim_seti = pd.read_csv('../input/train.csv', index_col=False)
16+
test_seti = pd.read_csv('../input/test.csv', index_col=False)
17+
18+
# Explore Data
1319

1420
# Veriyi arastiralim.
1521
# print egitim_seti.head()
1622

23+
# Concatenate all texts from the train and test sets
24+
# butun_cumleler = all_texts
25+
1726
# Test ve Egitim Setindeki cumleleri birlestirelim.
1827
butun_cumleler = pd.concat([egitim_seti['text'], test_seti['text']])
1928

29+
# size of train_set
30+
2031
# egitim setinin buyuklugu
2132
m = len(egitim_seti) # 19579
2233

34+
# Set the target and predictor variables for Model Training.
2335
# Modelin Egitimi icin gerekli hedef ve ongorucu degiskenleri ayarlayalim.
2436

37+
# Target Variable = Authors
2538
# Hedef Degiskenimiz = Yazarlar
39+
40+
# Encode authors as binary.
41+
# We encode as binary 'cause we must submit a csv file with the id,
42+
# and a probability for each of the three classes.
2643
# yazarlari binary olarak kodlayalim.
2744
# bunun amaci, submission dosyasi olustururken,
2845
# tahminlerimizi her bir yazara gore olasilik dagilimi istenmesi.
@@ -33,6 +50,11 @@
3350
labelbinarizer.fit(egitim_seti['author'])
3451
y = labelbinarizer.fit_transform(egitim_seti['author'])
3552

53+
# Predictor Variable: Sentences
54+
# These are raw text, so we cannot use them directly.
55+
# Let's extract some features the machine can understand.
56+
# Transforms each text in texts in a sequence of integers.
57+
3658
# Ongorucu Degiskenimiz: Yazarlarin Kurdugu Cumleler
3759
# Bu cumleler, text oldugu icin direkt kullanamayiz.
3860
# Cumlelerden makinenin anlayabilecegi ozellikler cikartalim.
@@ -48,22 +70,30 @@
4870
X = tokenizer.texts_to_sequences(egitim_seti['text'])
4971
X = pad_sequences(X)
5072

73+
# X_egitim = X_train
74+
# y_egitim = y_train
5175
X_egitim = X
5276
y_egitim = y
5377

78+
# sozluk_boyutu = size of dictionary
79+
5480
# hangi kelimeden kac tane gectigini hesapladigimizda toplam map'in boyutu
5581
# modelimizi olustururken kullanacagiz.
5682
sozluk_boyutu = len(tokenizer.word_index) # 29451
5783

84+
# X_test
85+
5886
# submission dosyasini olusturmak icin kullanacagimiz test seti
5987
# ayni sekilde test setindeki cumleleri kullanarak her biri icin
6088
# ozellik dizilerini olusturalim.
6189
X_test = tokenizer.texts_to_sequences(test_seti['text'])
6290
X_test = pad_sequences(X_test)
6391

92+
# Create our model
93+
# our model has four layers
94+
6495
# modelimizi olusturalim
6596
# modelimiz dort katmandan olusuyor
66-
# Sequence classification with LSTM:
6797

6898
model = Sequential()
6999
model.add(Embedding(input_dim=sozluk_boyutu + 1, output_dim=30))
@@ -77,6 +107,8 @@
77107

78108
model.summary()
79109

110+
# tahminler = predictions
111+
80112
tahminler = model.predict(X_test, batch_size=16)
81113
test_seti['EAP'] = tahminler[:, 0]
82114
test_seti['HPL'] = tahminler[:, 1]

0 commit comments

Comments
 (0)
Please sign in to comment.