library(tidyverse)
library(xgboost)
library(data.table)
library(caret)
library(MLmetrics)
dataset <- read.csv("C:/Users/user/Desktop/Kaggle/Porto Seguro/dataset.csv")
submission <- read.csv("C:/Users/user/Desktop/Kaggle/Porto Seguro/sample_submission.csv")
str(dataset)
## 'data.frame': 1488028 obs. of 49 variables:
## $ target : int 0 0 0 0 0 0 0 0 0 1 ...
## $ ps_ind_01 : int 2 1 5 0 0 5 2 5 5 1 ...
## $ ps_ind_02_cat : int 2 1 4 1 2 1 1 1 1 1 ...
## $ ps_ind_03 : int 5 7 9 2 0 4 3 4 3 2 ...
## $ ps_ind_04_cat : int 1 0 1 0 1 0 1 0 1 0 ...
## $ ps_ind_06_bin : int 0 0 0 1 1 0 0 1 0 0 ...
## $ ps_ind_07_bin : int 1 0 0 0 0 0 1 0 0 1 ...
## $ ps_ind_08_bin : int 0 1 1 0 0 0 0 0 1 0 ...
## $ ps_ind_09_bin : int 0 0 0 0 0 1 0 0 0 0 ...
## $ ps_ind_15 : int 11 3 12 8 9 6 8 13 6 4 ...
## $ ps_ind_16_bin : int 0 0 1 1 1 1 1 1 1 0 ...
## $ ps_ind_17_bin : int 1 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_18_bin : int 0 1 0 0 0 0 0 0 0 1 ...
## $ ps_reg_01 : num 0.7 0.8 0 0.9 0.7 0.9 0.6 0.7 0.9 0.9 ...
## $ ps_reg_02 : num 0.2 0.4 0 0.2 0.6 1.8 0.1 0.4 0.7 1.4 ...
## $ ps_reg_03 : num 0.718 0.766 0.775 0.581 0.841 ...
## $ ps_car_01_cat : int 10 11 7 7 11 10 6 11 10 11 ...
## $ ps_car_02_cat : int 1 1 1 1 1 0 1 1 1 0 ...
## $ ps_car_04_cat : int 0 0 0 0 0 0 0 0 0 1 ...
## $ ps_car_06_cat : int 4 11 14 11 14 14 11 11 14 14 ...
## $ ps_car_07_cat : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ps_car_08_cat : int 0 1 1 1 1 1 1 1 1 1 ...
## $ ps_car_09_cat : int 0 2 2 3 2 0 0 2 0 2 ...
## $ ps_car_11_cat : int 12 19 60 104 82 104 99 30 68 104 ...
## $ ps_car_11 : int 2 3 1 1 3 2 2 3 3 2 ...
## $ ps_car_12 : num 0.4 0.316 0.316 0.374 0.316 ...
## $ ps_car_13 : num 0.884 0.619 0.642 0.543 0.566 ...
## $ ps_car_14 : num 0.371 0.389 0.347 0.295 0.365 ...
## $ ps_car_15 : num 3.61 2.45 3.32 2 2 ...
## $ ps_calc_01 : num 0.6 0.3 0.5 0.6 0.4 0.7 0.2 0.1 0.9 0.7 ...
## $ ps_calc_02 : num 0.5 0.1 0.7 0.9 0.6 0.8 0.6 0.5 0.8 0.8 ...
## $ ps_calc_03 : num 0.2 0.3 0.1 0.1 0 0.4 0.5 0.1 0.6 0.8 ...
## $ ps_calc_04 : int 3 2 2 2 2 3 2 1 3 2 ...
## $ ps_calc_05 : int 1 1 2 4 2 1 2 2 1 2 ...
## $ ps_calc_06 : int 10 9 9 7 6 8 8 7 7 8 ...
## $ ps_calc_07 : int 1 5 1 1 3 2 1 1 3 2 ...
## $ ps_calc_08 : int 10 8 8 8 10 11 8 6 9 9 ...
## $ ps_calc_09 : int 1 1 2 4 2 3 3 1 4 1 ...
## $ ps_calc_10 : int 5 7 7 2 12 8 10 13 11 11 ...
## $ ps_calc_11 : int 9 3 4 2 3 4 3 7 4 3 ...
## $ ps_calc_12 : int 1 1 2 2 1 2 0 1 2 5 ...
## $ ps_calc_13 : int 5 1 7 4 1 0 0 3 1 0 ...
## $ ps_calc_14 : int 8 9 7 9 3 9 10 6 5 6 ...
## $ ps_calc_15_bin: int 0 0 0 0 0 0 0 1 0 0 ...
## $ ps_calc_16_bin: int 1 1 1 0 0 1 1 0 1 1 ...
## $ ps_calc_17_bin: int 1 1 1 0 0 0 0 1 0 0 ...
## $ ps_calc_18_bin: int 0 0 0 0 1 1 0 0 0 0 ...
## $ ps_calc_19_bin: int 0 1 1 0 1 1 1 1 0 1 ...
## $ ps_calc_20_bin: int 1 0 0 0 0 1 0 0 1 0 ...
qnt <- quantile(dataset$ps_reg_02, probs=c(.25, .75), na.rm = T)
caps <- quantile(dataset$ps_reg_02, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(dataset$ps_reg_02, na.rm = T)
dataset$ps_reg_02[dataset$ps_reg_02 < (qnt[1] - H)] <- caps[1]
dataset$ps_reg_02[dataset$ps_reg_02 >(qnt[2] + H)] <- caps[2]
summary(dataset$ps_reg_02)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2000 0.3000 0.4282 0.6000 1.3000
qnt <- quantile(dataset$ps_car_13, probs=c(.25, .75), na.rm = T)
caps <- quantile(dataset$ps_car_13, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(dataset$ps_car_13, na.rm = T)
dataset$ps_car_13[dataset$ps_car_13 < (qnt[1] - H)] <- caps[1]
dataset$ps_car_13[dataset$ps_car_13 >(qnt[2] + H)] <- caps[2]
summary(dataset$ps_car_13)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3190 0.6710 0.7660 0.8020 0.9061 1.2588
qnt <- quantile(dataset$ps_car_15, probs=c(.25, .75), na.rm = T)
caps <- quantile(dataset$ps_car_15, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(dataset$ps_car_15, na.rm = T)
dataset$ps_car_15[dataset$ps_car_15 < (qnt[1] - H)] <- caps[1]
dataset$ps_car_15[dataset$ps_car_15 >(qnt[2] + H)] <- caps[2]
summary(dataset$ps_car_15)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.732 2.828 3.317 3.120 3.606 3.742
train <- dataset[1:595212,]
test <- dataset[595213:1488028,2:49]
proof <- dataset[195211:295212,]
train1 <- train %>% select(-target)
proof1 <- proof %>% select(-target)
dtrain <- xgb.DMatrix(as.matrix(train1),label = train$target)
dproof <- xgb.DMatrix(as.matrix(proof1),label = proof$target)
dtest <- xgb.DMatrix(as.matrix(test))
set.seed(12345)
xgb_params <- list(booster = "gbtree",
objectve = "binary:logistic",
eta=0.02,
gamma=1,
max_depth=7,
subsample=0.63,
colsample_bytree = 0.8,
min_child_weight = 5,
base_score=median(train$target))
set.seed(12345)
xgb_dt <- xgb.train(params = xgb_params,
data = dtrain,
nrounds = 1200,
print_every_n = 50,
early_stopping_rounds = 10,
maximize = F,
verbose = 1,
watchlist = list(train=dtrain))
## [1] train-rmse:0.190745
## Will train until train_rmse hasn't improved in 10 rounds.
##
## [51] train-rmse:0.187075
## [101] train-rmse:0.186366
## [151] train-rmse:0.186096
## [201] train-rmse:0.185906
## [251] train-rmse:0.185767
## [301] train-rmse:0.185654
## [351] train-rmse:0.185548
## [401] train-rmse:0.185447
## [451] train-rmse:0.185358
## [501] train-rmse:0.185268
## [551] train-rmse:0.185202
## [601] train-rmse:0.185119
## [651] train-rmse:0.185046
## [701] train-rmse:0.184967
## [751] train-rmse:0.184899
## [801] train-rmse:0.184825
## [851] train-rmse:0.184765
## [901] train-rmse:0.184699
## [951] train-rmse:0.184634
## [1001] train-rmse:0.184562
## [1051] train-rmse:0.184492
## [1101] train-rmse:0.184421
## [1151] train-rmse:0.184373
## [1200] train-rmse:0.184318
predv <- predict(xgb_dt, dproof, type = "prob")
NormalizedGini(predv,proof$target)
## [1] 0.4082765
summary(predv)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.005625 0.023926 0.032757 0.036437 0.044176 0.475848
mat <- xgb.importance (feature_names = colnames(dtrain),model = xgb_dt)
xgb.plot.importance (importance_matrix = mat[1:30])
pred <- predict(xgb_dt, dtest, type = "prob")
sub <- data.table(submission$id,pred=pred)
colnames(sub) <- c("id","target")
summary(sub)
## id target
## Min. : 0 Min. :-0.02055
## 1st Qu.: 372022 1st Qu.: 0.02401
## Median : 744307 Median : 0.03286
## Mean : 744154 Mean : 0.03651
## 3rd Qu.:1116309 3rd Qu.: 0.04443
## Max. :1488026 Max. : 0.52547
sub$target <- ifelse(sub$target<0,0,sub$target)
head(sub)
## id target
## 1: 0 0.03207741
## 2: 1 0.02986537
## 3: 2 0.02598237
## 4: 3 0.01678664
## 5: 4 0.03653975
## 6: 5 0.04358792
tail(sub)
## id target
## 1: 1488020 0.02206263
## 2: 1488022 0.06918631
## 3: 1488023 0.03817647
## 4: 1488024 0.04183940
## 5: 1488025 0.02386898
## 6: 1488026 0.03472856
write.csv(sub, "C:/Users/user/Desktop/Kaggle/Porto Seguro/submission.csv", quote=F, na="", row.names=F)