import pandas as pd

+# egitim_seti = train_set
+# test_seti = test_set
# Load the datasets.
-egitim_seti = pd.read_csv('spooky_train.csv', index_col=False)
-test_seti = pd.read_csv('spooky_test.csv', index_col=False)
+egitim_seti = pd.read_csv('../input/train.csv', index_col=False)
+test_seti = pd.read_csv('../input/test.csv', index_col=False)

+# Explore the data.
# print(egitim_seti.head())

+# butun_cumleler = all_texts
# Concatenate the texts from the training and test sets.
butun_cumleler = pd.concat([egitim_seti['text'], test_seti['text']])
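# Combining both sets presumably lets the tokenizer below build its word
# index over every text it will later have to encode.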

+# size of train_set
m = len(egitim_seti)  # 19579

+# Set the target and predictor variables for training the model.

+# Target variable: the authors.

+# Encode the authors as one-hot binary vectors:
+# the submission must be a CSV file with an id and
+# a probability for each of the three classes.
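# For illustration, LabelBinarizer turns each label into a one-hot row,
# e.g. 'EAP' -> [1, 0, 0], 'HPL' -> [0, 1, 0] (assuming three author classes).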

from sklearn.preprocessing import LabelBinarizer
labelbinarizer = LabelBinarizer()
labelbinarizer.fit(egitim_seti['author'])
y = labelbinarizer.transform(egitim_seti['author'])

+# Predictor variable: the sentences written by the authors.
+# These are raw text, which we cannot use directly;
+# let's extract features the machine can understand.
+# The tokenizer turns each text into a sequence of integers.

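# A minimal sketch of the tokenizer setup used below, assuming the standard
# Keras Tokenizer fitted on the combined texts:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(butun_cumleler)
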
X = tokenizer.texts_to_sequences(egitim_seti['text'])
X = pad_sequences(X)
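# pad_sequences zero-pads every sequence (from the left, by default) to one
# common length so the set can be handled as a single matrix.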

+# X_egitim = X_train
+# y_egitim = y_train
X_egitim = X
y_egitim = y

+# sozluk_boyutu = vocabulary_size
# The number of distinct words found when counting every word's occurrences;
# we will use it when building the model.
sozluk_boyutu = len(tokenizer.word_index)  # 29451
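# tokenizer.word_index maps each distinct word to an integer index, with the
# most frequent word getting index 1.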

+# X_test
# The test set will be used to build the submission file.
# In the same way, build a feature sequence for each
# sentence in the test set.
X_test = tokenizer.texts_to_sequences(test_seti['text'])
X_test = pad_sequences(X_test)
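# Note: pad_sequences pads each call's batch to its own longest sequence, so
# X_test can end up a different width than X_egitim. The Embedding layer below
# fixes no input_length, so this still runs; pad_sequences(X_test,
# maxlen=X_egitim.shape[1]) would make the widths match.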

+# Build our model.
+# Our model consists of four layers.
-# Sequence classification with LSTM:

from keras.models import Sequential
from keras.layers import Embedding

model = Sequential()
model.add(Embedding(input_dim=sozluk_boyutu + 1, output_dim=30))
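# input_dim is sozluk_boyutu + 1 because the Tokenizer's indices start at 1,
# leaving 0 for the padding value.
# A minimal sketch of the rest of the four-layer model and its training, since
# they do not appear in this excerpt; the layer sizes, dropout rate, optimizer
# and epoch count are assumptions:
from keras.layers import LSTM, Dropout, Dense

model.add(LSTM(30))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(X_egitim, y_egitim, epochs=3, batch_size=16)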

model.summary()

+# tahminler = predictions
tahminler = model.predict(X_test, batch_size=16)
test_seti['EAP'] = tahminler[:, 0]
test_seti['HPL'] = tahminler[:, 1]
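# A minimal sketch of the remaining submission steps, assuming the third class
# is MWS (LabelBinarizer orders classes alphabetically) and an output file
# named submission.csv:
test_seti['MWS'] = tahminler[:, 2]
test_seti[['id', 'EAP', 'HPL', 'MWS']].to_csv('submission.csv', index=False)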