Loading libraries

library(tidyverse)   # data manipulation and the %>% pipe
library(xgboost)     # gradient boosting
library(data.table)  # fast tables and fwrite
library(caret)       # general ML utilities
library(MLmetrics)   # NormalizedGini and other metrics

Loading data

I’m loading the data prepared in my previous job, “Porto Seguro statistical analysis & data cleansing”, together with the sample submission file.

dataset <- read.csv("C:/Users/user/Desktop/Kaggle/Porto Seguro/dataset.csv")
submission <- read.csv("C:/Users/user/Desktop/Kaggle/Porto Seguro/sample_submission.csv")
str(dataset)
## 'data.frame':    1488028 obs. of  49 variables:
##  $ target        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ ps_ind_01     : int  2 1 5 0 0 5 2 5 5 1 ...
##  $ ps_ind_02_cat : int  2 1 4 1 2 1 1 1 1 1 ...
##  $ ps_ind_03     : int  5 7 9 2 0 4 3 4 3 2 ...
##  $ ps_ind_04_cat : int  1 0 1 0 1 0 1 0 1 0 ...
##  $ ps_ind_06_bin : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ ps_ind_07_bin : int  1 0 0 0 0 0 1 0 0 1 ...
##  $ ps_ind_08_bin : int  0 1 1 0 0 0 0 0 1 0 ...
##  $ ps_ind_09_bin : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ ps_ind_15     : int  11 3 12 8 9 6 8 13 6 4 ...
##  $ ps_ind_16_bin : int  0 0 1 1 1 1 1 1 1 0 ...
##  $ ps_ind_17_bin : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_18_bin : int  0 1 0 0 0 0 0 0 0 1 ...
##  $ ps_reg_01     : num  0.7 0.8 0 0.9 0.7 0.9 0.6 0.7 0.9 0.9 ...
##  $ ps_reg_02     : num  0.2 0.4 0 0.2 0.6 1.8 0.1 0.4 0.7 1.4 ...
##  $ ps_reg_03     : num  0.718 0.766 0.775 0.581 0.841 ...
##  $ ps_car_01_cat : int  10 11 7 7 11 10 6 11 10 11 ...
##  $ ps_car_02_cat : int  1 1 1 1 1 0 1 1 1 0 ...
##  $ ps_car_04_cat : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ ps_car_06_cat : int  4 11 14 11 14 14 11 11 14 14 ...
##  $ ps_car_07_cat : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ps_car_08_cat : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ ps_car_09_cat : int  0 2 2 3 2 0 0 2 0 2 ...
##  $ ps_car_11_cat : int  12 19 60 104 82 104 99 30 68 104 ...
##  $ ps_car_11     : int  2 3 1 1 3 2 2 3 3 2 ...
##  $ ps_car_12     : num  0.4 0.316 0.316 0.374 0.316 ...
##  $ ps_car_13     : num  0.884 0.619 0.642 0.543 0.566 ...
##  $ ps_car_14     : num  0.371 0.389 0.347 0.295 0.365 ...
##  $ ps_car_15     : num  3.61 2.45 3.32 2 2 ...
##  $ ps_calc_01    : num  0.6 0.3 0.5 0.6 0.4 0.7 0.2 0.1 0.9 0.7 ...
##  $ ps_calc_02    : num  0.5 0.1 0.7 0.9 0.6 0.8 0.6 0.5 0.8 0.8 ...
##  $ ps_calc_03    : num  0.2 0.3 0.1 0.1 0 0.4 0.5 0.1 0.6 0.8 ...
##  $ ps_calc_04    : int  3 2 2 2 2 3 2 1 3 2 ...
##  $ ps_calc_05    : int  1 1 2 4 2 1 2 2 1 2 ...
##  $ ps_calc_06    : int  10 9 9 7 6 8 8 7 7 8 ...
##  $ ps_calc_07    : int  1 5 1 1 3 2 1 1 3 2 ...
##  $ ps_calc_08    : int  10 8 8 8 10 11 8 6 9 9 ...
##  $ ps_calc_09    : int  1 1 2 4 2 3 3 1 4 1 ...
##  $ ps_calc_10    : int  5 7 7 2 12 8 10 13 11 11 ...
##  $ ps_calc_11    : int  9 3 4 2 3 4 3 7 4 3 ...
##  $ ps_calc_12    : int  1 1 2 2 1 2 0 1 2 5 ...
##  $ ps_calc_13    : int  5 1 7 4 1 0 0 3 1 0 ...
##  $ ps_calc_14    : int  8 9 7 9 3 9 10 6 5 6 ...
##  $ ps_calc_15_bin: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ ps_calc_16_bin: int  1 1 1 0 0 1 1 0 1 1 ...
##  $ ps_calc_17_bin: int  1 1 1 0 0 0 0 1 0 0 ...
##  $ ps_calc_18_bin: int  0 0 0 0 1 1 0 0 0 0 ...
##  $ ps_calc_19_bin: int  0 1 1 0 1 1 1 1 0 1 ...
##  $ ps_calc_20_bin: int  1 0 0 0 0 1 0 0 1 0 ...

Managing outliers

I’ve decided to cap outliers, and only on the main features affected by them, purely to improve the Kaggle score:

from 0.264 without outlier handling to 0.265 with it.

# Cap ps_reg_02: values beyond the 1.5*IQR whiskers are replaced
# by the 5th/95th percentiles.
qnt <- quantile(dataset$ps_reg_02, probs=c(.25, .75), na.rm = T)
caps <- quantile(dataset$ps_reg_02, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(dataset$ps_reg_02, na.rm = T)
dataset$ps_reg_02[dataset$ps_reg_02 < (qnt[1] - H)] <- caps[1]
dataset$ps_reg_02[dataset$ps_reg_02 > (qnt[2] + H)] <- caps[2]
summary(dataset$ps_reg_02)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2000  0.3000  0.4282  0.6000  1.3000
# Same capping for ps_car_13.
qnt <- quantile(dataset$ps_car_13, probs=c(.25, .75), na.rm = T)
caps <- quantile(dataset$ps_car_13, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(dataset$ps_car_13, na.rm = T)
dataset$ps_car_13[dataset$ps_car_13 < (qnt[1] - H)] <- caps[1]
dataset$ps_car_13[dataset$ps_car_13 > (qnt[2] + H)] <- caps[2]
summary(dataset$ps_car_13)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3190  0.6710  0.7660  0.8020  0.9061  1.2588
# Same capping for ps_car_15.
qnt <- quantile(dataset$ps_car_15, probs=c(.25, .75), na.rm = T)
caps <- quantile(dataset$ps_car_15, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(dataset$ps_car_15, na.rm = T)
dataset$ps_car_15[dataset$ps_car_15 < (qnt[1] - H)] <- caps[1]
dataset$ps_car_15[dataset$ps_car_15 > (qnt[2] + H)] <- caps[2]
summary(dataset$ps_car_15)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.732   2.828   3.317   3.120   3.606   3.742
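
The same capping pattern is repeated three times above, so it can be factored into a small helper. This is just a sketch; cap_outliers is a hypothetical name, not something used later in this analysis:

cap_outliers <- function(x) {
  # replace values beyond the 1.5*IQR whiskers with the 5th/95th percentiles
  qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
  caps <- quantile(x, probs = c(.05, .95), na.rm = TRUE)
  H <- 1.5 * IQR(x, na.rm = TRUE)
  x[x < (qnt[1] - H)] <- caps[1]
  x[x > (qnt[2] + H)] <- caps[2]
  x
}

# equivalent to the three blocks above
for (col in c("ps_reg_02", "ps_car_13", "ps_car_15")) {
  dataset[[col]] <- cap_outliers(dataset[[col]])
}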

Split the data back into a train set, a test set and a proof set

I haven’t split the training data into separate train and validation sets, because the test set is larger than the training set and I preferred to use the whole training set for fitting.

So I’ve used a slice of the training set as a “proof” set to get an idea of the Gini score. Note that these proof rows also belong to the fitted training set, so the Gini estimate below will be somewhat optimistic.

train <- dataset[1:595212,]           # labelled rows used for fitting
test <- dataset[595213:1488028,2:49]  # rows to predict, target column dropped
proof <- dataset[195211:295212,]      # 100,002 rows from inside the training range
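
A quick sanity check of the split sizes; the counts follow directly from the row indices above:

nrow(train)   # 595212: labelled rows used for fitting
nrow(test)    # 892816: rows to predict (1488028 - 595212)
nrow(proof)   # 100002: rows taken from inside the training range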

XGBoost Modeling

XGBoost works only with numeric features; for this reason I’ve kept both the integer and the numeric features as they are.

The first step is to convert the data into xgb.DMatrix objects.

train1 <- train %>% select(-target)   # features only
proof1 <- proof %>% select(-target)
dtrain <- xgb.DMatrix(as.matrix(train1), label = train$target)
dproof <- xgb.DMatrix(as.matrix(proof1), label = proof$target)
dtest <- xgb.DMatrix(as.matrix(test))  # no labels available for the test set

The second step is to set up the parameters of the XGBoost model.

set.seed(12345)
xgb_params <- list(booster = "gbtree",
                   objectve = "binary:logistic", # NB: "objective" is misspelled, so xgboost silently
                                                 # falls back to its default squared-error regression;
                                                 # this is why the log below reports train-rmse and why
                                                 # some predictions come out negative
                   eta = 0.02,                   # low learning rate
                   gamma = 1,                    # minimum loss reduction required to split
                   max_depth = 7,
                   subsample = 0.63,             # row sampling per tree
                   colsample_bytree = 0.8,       # column sampling per tree
                   min_child_weight = 5,
                   base_score = median(train$target)) # = 0 here, which only works because the
                                                      # objective is effectively regression
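
Because of that misspelling, the model above is effectively a squared-error regressor rather than a logistic classifier. For reference, here is a sketch of the parameter list as intended; xgb_params_binary and eval_metric are my additions, not part of the run that produced the results below. Note that binary:logistic requires base_score strictly inside (0, 1), so the median target of 0 would be rejected and the positive rate is a natural choice:

xgb_params_binary <- list(booster = "gbtree",
                          objective = "binary:logistic",   # spelled correctly
                          eval_metric = "auc",
                          eta = 0.02,
                          gamma = 1,
                          max_depth = 7,
                          subsample = 0.63,
                          colsample_bytree = 0.8,
                          min_child_weight = 5,
                          base_score = mean(train$target)) # positive rate, strictly inside (0, 1)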

Training

Let’s find the best iteration for this model. I’ve increased nrounds not only to search for the best result, but also because I’ve used a low learning rate (eta): with a low eta each round contributes only a small update, so it has to be compensated by a larger nrounds.

set.seed(12345)
xgb_dt <- xgb.train(params = xgb_params,
                    data = dtrain,
                    nrounds = 1200,
                    print_every_n = 50,
                    early_stopping_rounds = 10,
                    maximize = F,
                    verbose = 1,
                    watchlist = list(train = dtrain))
## [1]  train-rmse:0.190745 
## Will train until train_rmse hasn't improved in 10 rounds.
## 
## [51] train-rmse:0.187075 
## [101]    train-rmse:0.186366 
## [151]    train-rmse:0.186096 
## [201]    train-rmse:0.185906 
## [251]    train-rmse:0.185767 
## [301]    train-rmse:0.185654 
## [351]    train-rmse:0.185548 
## [401]    train-rmse:0.185447 
## [451]    train-rmse:0.185358 
## [501]    train-rmse:0.185268 
## [551]    train-rmse:0.185202 
## [601]    train-rmse:0.185119 
## [651]    train-rmse:0.185046 
## [701]    train-rmse:0.184967 
## [751]    train-rmse:0.184899 
## [801]    train-rmse:0.184825 
## [851]    train-rmse:0.184765 
## [901]    train-rmse:0.184699 
## [951]    train-rmse:0.184634 
## [1001]   train-rmse:0.184562 
## [1051]   train-rmse:0.184492 
## [1101]   train-rmse:0.184421 
## [1151]   train-rmse:0.184373 
## [1200]   train-rmse:0.184318
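
Because the watchlist contains only the training set, early stopping here monitors training RMSE, which keeps improving, so all 1200 rounds run. Below is a sketch of letting early stopping react to proof-set error instead; xgboost stops on the last dataset in the watchlist, so adding the proof set makes the stopping criterion meaningful:

xgb_dt2 <- xgb.train(params = xgb_params,
                     data = dtrain,
                     nrounds = 1200,
                     print_every_n = 50,
                     early_stopping_rounds = 10,   # stop once proof RMSE stalls for 10 rounds
                     maximize = F,
                     verbose = 1,
                     watchlist = list(train = dtrain, proof = dproof))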

Prediction

Proof model

predv <- predict(xgb_dt, dproof)  # "type" is not an argument of predict.xgb.Booster, so it is dropped here
NormalizedGini(predv, proof$target)
## [1] 0.4082765
summary(predv)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.005625  0.023926  0.032757  0.036437  0.044176  0.475848
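
For a binary target, the normalized Gini is related to AUC by Gini = 2 * AUC - 1 (up to tie handling), so the score above can be cross-checked with MLmetrics, which is already loaded:

2 * AUC(predv, proof$target) - 1   # should agree with NormalizedGini above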

Feature importance

mat <- xgb.importance(feature_names = colnames(dtrain), model = xgb_dt)
xgb.plot.importance(importance_matrix = mat[1:30])
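
xgb.importance returns a data.table sorted by Gain, so the top features can also be inspected numerically instead of graphically:

head(mat, 10)   # top 10 features by gain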

Test model

pred <- predict(xgb_dt, dtest)   # raw model output for the test set
sub <- data.table(id = submission$id, target = pred)
summary(sub)
##        id              target        
##  Min.   :      0   Min.   :-0.02055  
##  1st Qu.: 372022   1st Qu.: 0.02401  
##  Median : 744307   Median : 0.03286  
##  Mean   : 744154   Mean   : 0.03651  
##  3rd Qu.:1116309   3rd Qu.: 0.04443  
##  Max.   :1488026   Max.   : 0.52547
sub$target <- ifelse(sub$target < 0, 0, sub$target)  # clip negative predictions, which the fallback regression objective can produce
head(sub)
##    id     target
## 1:  0 0.03207741
## 2:  1 0.02986537
## 3:  2 0.02598237
## 4:  3 0.01678664
## 5:  4 0.03653975
## 6:  5 0.04358792
tail(sub)
##         id     target
## 1: 1488020 0.02206263
## 2: 1488022 0.06918631
## 3: 1488023 0.03817647
## 4: 1488024 0.04183940
## 5: 1488025 0.02386898
## 6: 1488026 0.03472856
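
Writing out the submission file isn’t shown above; a minimal sketch with data.table’s fwrite (the file name is just an example):

fwrite(sub, "C:/Users/user/Desktop/Kaggle/Porto Seguro/xgb_submission.csv")   # id,target CSV as Kaggle expects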

Kaggle Gini score of this model = 0.265

Kaggle Gini score on the leaderboard = 0.290