vinothxgboost.Rmd

---
title : "SANTANDER BANK CUSTOMER SATISFACTION"
author: "VINOTHKUMAR A"
date  : "02 NOV 2018"
output: html_document
---

```{r setup, include=FALSE}
# Set the default chunk option `echo` to TRUE for the chunks that follow.
knitr::opts_chunk$set(echo = TRUE)
```
# Load the libraries
```{r}
# xgboost supplies xgb.DMatrix() / xgb.train(); Matrix supplies
# sparse.model.matrix(), both used further down.
library(xgboost)
library(Matrix)
# Seed R's random number generator.
set.seed(123)
```
# Read the data
```{r}
# Load the raw training and test tables.
# NOTE(review): "D:santander/..." has no slash after the drive letter, so on
# Windows it is presumably resolved relative to the current directory of
# drive D: -- confirm this is intended.
train_path <- "D:santander/train.csv"
test_path <- "D:santander/test.csv"
train <- read.csv(file = train_path)
test <- read.csv(file = test_path)
```


##### Removing IDs
```{r}
# Keep the test IDs for the submission file, then drop the ID column from
# both tables so that it is not used as a feature.
test.id <- test[["ID"]]
train[["ID"]] <- NULL
test[["ID"]] <- NULL
```
##### Extracting TARGET
```{r} 
# Pull the label vector out of the training table and remove it from the
# feature columns.
train.y <- train[["TARGET"]]
train[["TARGET"]] <- NULL
```

##### Count of zeros per row
```{r}
# Count how many entries of `x` are equal to zero.
#
# x: an atomic vector (one row of the feature table when called via apply()).
# Returns a single integer count. Missing values are ignored, so one NA in
# `x` does not turn the whole count into NA.
count0 <- function(x) {
  sum(x == 0, na.rm = TRUE)
}
# Add the per-row zero count as the extra feature `n0`. Each table is counted
# before its own `n0` column is attached.
train_zero_counts <- apply(X = train, MARGIN = 1, FUN = count0)
test_zero_counts <- apply(X = test, MARGIN = 1, FUN = count0)
train[["n0"]] <- train_zero_counts
test[["n0"]] <- test_zero_counts
```
##### Removing constant features
```{r}
cat("\n## Removing the constants features.\n")
# A feature is constant when it takes exactly one distinct value in train.
is_constant <- vapply(
  train,
  function(column) length(unique(column)) == 1,
  logical(1)
)
constant_features <- names(train)[is_constant]
for (feature in constant_features) {
  cat(feature, "is constant in train. We delete it.\n")
}
# Drop those columns from both tables; names that are absent from a table
# are simply ignored.
train <- train[setdiff(names(train), constant_features)]
test <- test[setdiff(names(test), constant_features)]
```
##### Removing identical features
```{r}
# Find features that are element-wise equal in train and keep only the first
# one of each such group. Pairs are visited in column order, so the earlier
# column of an equal pair is the one that is kept.
# combn() errors when asked for pairs from fewer than two names, so fall back
# to an empty list of pairs in that case.
features_pair <- if (ncol(train) >= 2) {
  combn(names(train), 2, simplify = FALSE)
} else {
  list()
}
toRemove <- character(0)
for (pair in features_pair) {
  f1 <- pair[1]
  f2 <- pair[2]

  # Skip pairs involving a feature that is already marked for removal.
  if (!(f1 %in% toRemove) && !(f2 %in% toRemove)) {
    # isTRUE() makes a comparison that evaluates to NA (missing values in
    # either column) count as "not identical" instead of aborting the `if`.
    if (isTRUE(all(train[[f1]] == train[[f2]]))) {
      cat(f1, "and", f2, "are equals.\n")
      toRemove <- c(toRemove, f2)
    }
  }
}
# Names of the features that survive the duplicate check.
feature.names <- setdiff(names(train), toRemove)

```

# Keep only the retained features in train and test
```{r}
# Restrict both tables to the retained feature columns. drop = FALSE keeps
# the result a data frame even when only one feature name is selected.
train <- train[, feature.names, drop = FALSE]
test <- test[, feature.names, drop = FALSE]

```
### Limit variables in test to the min and max values seen in train
```{r}
print("Setting min-max lims on test data")
# Clamp every test feature to the [min, max] range observed in train.
for (f in colnames(train)) {
  # range() returns c(min, max); na.rm = TRUE keeps a missing value in train
  # from turning both limits into NA.
  train_range <- range(train[[f]], na.rm = TRUE)
  # pmax()/pmin() clamp the whole column at once and leave NA entries as NA,
  # whereas a logical row index containing NA cannot be used for assignment
  # into a data frame.
  test[[f]] <- pmin(pmax(test[[f]], train_range[1]), train_range[2])
}
```

```{r}
# Re-attach the label so the formula can refer to it, then replace `train`
# with a sparse design matrix built from the remaining columns.
train[["TARGET"]] <- train.y
train <- sparse.model.matrix(object = TARGET ~ ., data = train)

```

```{r}
# Wrap the sparse features and the labels in an xgb.DMatrix. The training set
# is the only entry in the watchlist passed to xgb.train().
dtrain <- xgb.DMatrix(data = train, label = train.y)
watchlist <- setNames(list(dtrain), "train")
```

```{r}
# Booster settings: binary logistic objective, tree booster, AUC as the
# evaluation metric.
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eval_metric = "auc",
  eta = 0.0202048,
  max_depth = 5,
  subsample = 0.6815,
  colsample_bytree = 0.701
)

# Train for a fixed 560 rounds, evaluating on the watchlist.
# NOTE(review): AUC is a higher-is-better metric, so `maximize = FALSE` looks
# inconsistent. Presumably it has no effect because no early stopping is
# configured in this call -- confirm before adding early stopping.
clf <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = 560,
  verbose = 1,
  watchlist = watchlist,
  maximize = FALSE
)

```

```{r}
# The formula needs a TARGET column to exist in `test`; -1 is only a
# placeholder value for the left-hand side of the formula.
test$TARGET <- -1
test <- sparse.model.matrix(TARGET ~ ., data = test)
preds <- predict(clf, test)
# Guard against a silent row mismatch: data.frame() would recycle a shorter
# vector whose length happens to divide the longer one.
stopifnot(length(preds) == length(test.id))
submission <- data.frame(ID = test.id, TARGET = preds)
cat("saving the submission file\n")
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
write.csv(submission, "D:santander/submit2.csv", row.names = FALSE)
```

# Kaggle score is 0.840380 using XGBoost