-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvinothxgboost.Rmd
134 lines (115 loc) · 2.93 KB
/
vinothxgboost.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
---
title : "SANTANDER BANK CUSTOMER SATISFACTION"
author: "VINOTHKUMAR A"
date : "02 NOV 2018"
output: html_document
---
```{r setup, include=FALSE}
# Set the default chunk option `echo = TRUE` for the chunks that follow.
knitr::opts_chunk$set(echo = TRUE)
```
# Load libraries
```{r}
# xgboost supplies xgb.DMatrix() / xgb.train(); Matrix supplies
# sparse.model.matrix(), both used further down.
library(xgboost)
library(Matrix)
# Fix the random number generator seed before any further work.
set.seed(123)
```
# Read the data
```{r}
# Load the training and test tables from their CSV files.
train <- read.csv(file = "D:santander/train.csv")
test  <- read.csv(file = "D:santander/test.csv")
```
##### Removing IDs
```{r}
# Keep the test IDs aside for the submission file, then drop the ID column
# from both tables so it is not treated as a feature.
test.id <- test[["ID"]]
test[["ID"]] <- NULL
train[["ID"]] <- NULL
```
##### Extracting TARGET
```{r}
# Pull the label column out of the training table, then remove it so that
# only feature columns remain.
train.y <- train[["TARGET"]]
train[["TARGET"]] <- NULL
```
##### Count of zeros per row
```{r}
# Count the entries of `x` that are equal to zero.
#
# x: a vector that can be compared against 0.
# Returns the number of elements for which `x == 0` is TRUE (NA if the
# comparison produces any NA).
count0 <- function(x) {
  is_zero <- x == 0
  sum(is_zero)
}
# Add the per-row zero count as an extra column `n0` on both tables.
train[["n0"]] <- apply(X = train, MARGIN = 1, FUN = count0)
test[["n0"]] <- apply(X = test, MARGIN = 1, FUN = count0)
```
##### Removing constant features
```{r}
cat("\n## Removing the constants features.\n")
# Flag every training column that holds a single distinct value, then drop
# those columns from both tables, reporting each one as it is removed.
is_constant <- vapply(
  train,
  function(column) length(unique(column)) == 1,
  logical(1)
)
for (f in names(train)[is_constant]) {
  cat(f, "is constant in train. We delete it.\n")
  train[[f]] <- NULL
  test[[f]] <- NULL
}
```
##### Removing identical features
```{r}
# Find columns of `train` that duplicate an earlier column. Every unordered
# pair of column names is checked once; when both columns of a pair hold the
# same values, the second one is queued for removal. `feature.names` ends up
# holding the names of the columns to keep.
features_pair <- combn(names(train), 2, simplify = FALSE)
toRemove <- c()
for (pair in features_pair) {
  f1 <- pair[1]
  f2 <- pair[2]
  # Skip pairs where either column is already marked as a duplicate.
  # `&&` is the scalar, short-circuiting operator meant for `if` conditions.
  if (!(f1 %in% toRemove) && !(f2 %in% toRemove)) {
    # isTRUE() makes an NA comparison result (columns containing NA) count as
    # "not identical" instead of aborting the `if` with an error.
    if (isTRUE(all(train[[f1]] == train[[f2]]))) {
      cat(f1, "and", f2, "are equals.\n")
      toRemove <- c(toRemove, f2)
    }
  }
}
feature.names <- setdiff(names(train), toRemove)
```
# Keep only the non-duplicated features in train and test
```{r}
# Restrict both tables to the retained feature columns.
# drop = FALSE keeps the result a data frame even when only one feature is
# left; without it a single-column selection collapses to a bare vector.
train <- train[, feature.names, drop = FALSE]
test <- test[, feature.names, drop = FALSE]
```
### Limit variables in test to the min and max values seen in train
```{r}
print("Setting min-max lims on test data")
# For every training column, clamp the matching test column into the
# [min, max] interval observed in train: first raise values below the
# minimum, then lower values above the maximum.
for (col in colnames(train)) {
  lower <- min(train[[col]])
  below <- test[[col]] < lower
  test[below, col] <- lower

  upper <- max(train[[col]])
  above <- test[[col]] > upper
  test[above, col] <- upper
}
```
```{r}
# Re-attach the label so the formula below can refer to it, then replace the
# data frame with a sparse design matrix built from all other columns.
train[["TARGET"]] <- train.y
train <- sparse.model.matrix(TARGET ~ ., data = train)
```
```{r}
# Wrap the sparse training matrix and its labels in an xgb.DMatrix, and
# register that same set as the one to evaluate during training.
dtrain <- xgb.DMatrix(train, label = train.y)
watchlist <- list(train = dtrain)
```
```{r}
# Booster configuration: tree booster with a binary logistic objective,
# evaluated with AUC.
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eval_metric = "auc",
  eta = 0.0202048,
  max_depth = 5,
  subsample = 0.6815,
  colsample_bytree = 0.701
)

# Train for a fixed 560 rounds, reporting the metric on the watchlist.
# AUC is a higher-is-better metric, so `maximize` is TRUE. Per the xgboost
# documentation the flag only takes effect together with early stopping, which
# is not enabled here, so this does not change the fitted model; it only
# prevents picking the worst iteration if early stopping is added later.
clf <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = 560,
  verbose = 1,
  watchlist = watchlist,
  maximize = TRUE
)
```
```{r}
# The formula names TARGET as the response, so give the test table a
# placeholder TARGET column before building its sparse design matrix.
test$TARGET <- -1
test <- sparse.model.matrix(TARGET ~ ., data = test)
# Score the test rows with the trained model and pair each prediction with
# the ID saved earlier.
preds <- predict(clf, test)
submission <- data.frame(ID = test.id, TARGET = preds)
cat("saving the submission file\n")
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
write.csv(submission, "D:santander/submit2.csv", row.names = FALSE)
```
# Kaggle score is 0.840380 using XGBoost