# Author: Lekshman Ramesh
# Packages used throughout the script.
library(xgboost)
library(Matrix)

# Load the prepared training data; the .Rda file is expected to define `train`.
load("D:/Lekshman/Kaggle/train.csv/Santander-Data Wrangling/Santander-Data Viz/Santander-Prediction/train.Rda")
dim(train) ## [1] 76020 157

# Baseline booster settings, passed as `params` to the xgb.cv() calls that
# follow.
param <- list(
  objective = "binary:logistic",
  eval_metric = "auc",
  booster = "gbtree",
  eta = 0.05,
  subsample = 0.7,
  colsample_bytree = 0.7,
  max_depth = 5
)

# Response vector.
label <- train$TARGET

# Predictors are taken positionally from columns 2:156. With the 157 columns
# recorded above, this leaves out column 1 and column 157 (presumably an ID
# and TARGET -- TODO confirm against the data-wrangling step).
xgmat <- xgb.DMatrix(as.matrix(train[, 2:156]), label = label)
## Alternatively, build the design matrix with sparse.model.matrix():
# xgmat <- sparse.model.matrix(TARGET ~ . - 1, data = train)

# Baseline 3-fold cross-validation with the settings in `param`.
# `label` is deliberately not passed: `xgmat` is an xgb.DMatrix that already
# carries its labels, and passing it again only produced
#   "Warning in xgb.get.DMatrix(data, label, missing): xgboost: label will be
#    ignored."
# If `xgmat` is switched to the sparse-matrix alternative above, add
# `label = label` to this call.
cv.res_red <- xgb.cv(
  params = param,
  data = xgmat,
  nrounds = 200,
  nfold = 3,
  print_every_n = 100
)
## Recorded output:
## [1] train-auc:0.779339+0.041311 test-auc:0.770796+0.037849
## [101] train-auc:0.872975+0.000825 test-auc:0.838322+0.001451
## [200] train-auc:0.891807+0.000446 test-auc:0.838562+0.001643
# Test AUC ~0.84
# Add a factor-typed copy (suffix "_factor") of four existing columns so the
# model matrix below encodes them as indicator columns in addition to their
# original values.
factor_source_cols <- c(
  "num_var4",
  "num_var35",
  "var36",
  "num_meses_var39_vig_ult3"
)
for (src_col in factor_source_cols) {
  train[[paste0(src_col, "_factor")]] <- as.factor(train[[src_col]])
}

# Rebuild the design matrix as a sparse model matrix (no intercept column).
# NOTE(review): `TARGET ~ .` uses every column of `train` except TARGET, so
# column 1 -- which the baseline `train[, 2:156]` left out -- is included
# here. Confirm that this is intended.
xgmat <- sparse.model.matrix(TARGET ~ . - 1, data = train)

# Same 3-fold CV as the baseline; `label` is required here because `xgmat` is
# a plain sparse matrix rather than an xgb.DMatrix.
cv.res_red <- xgb.cv(
  params = param,
  data = xgmat,
  label = label,
  nrounds = 200,
  nfold = 3,
  print_every_n = 100
)
## Recorded output:
## [1] train-auc:0.804939+0.009476 test-auc:0.792259+0.010621
## [101] train-auc:0.877267+0.001319 test-auc:0.837756+0.000989
## [200] train-auc:0.898721+0.001388 test-auc:0.839191+0.001112
## Improves AUC by 0.0011 on average

# Data frame of the values tested.
# Grid search over max_depth, subsample and colsample_bytree.
# For every combination a 3-fold CV is run and the evaluation-log row of the
# final boosting round (train/test AUC mean and sd) is collected.
# nrounds and eta are held fixed here; they can be tuned separately to reduce
# computation.
xgb_grid_1 <- expand.grid(
  nrounds = 200,
  eta = 0.05,
  max_depth = c(5, 7, 9),
  subsample = c(0.6, 0.7, 0.8),
  colsample_bytree = c(0.6, 0.7, 0.8)
)

# Pre-allocate one slot per grid row instead of growing a table with rbind()
# inside the loop.
cv_rows <- vector("list", nrow(xgb_grid_1))

for (i in seq_len(nrow(xgb_grid_1))) {
  param <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    booster = "gbtree",
    eta = xgb_grid_1$eta[i],
    subsample = xgb_grid_1$subsample[i],
    colsample_bytree = xgb_grid_1$colsample_bytree[i],
    max_depth = xgb_grid_1$max_depth[i]
  )

  # `label` must be supplied because `xgmat` is a sparse matrix at this point;
  # without it xgb.cv() stops with a "labels must be provided" error. (If
  # `xgmat` is an xgb.DMatrix instead, the argument is ignored with a warning.)
  cv.res_red <- xgb.cv(
    params = param,
    data = xgmat,
    label = label,
    nrounds = xgb_grid_1$nrounds[i],
    nfold = 3,
    verbose = 0
  )

  # Keep the last boosting round rather than a hard-coded row 200, so the
  # lookup stays correct if `nrounds` in the grid is changed.
  cv_op <- cv.res_red$evaluation_log
  cv_rows[[i]] <- cv_op[nrow(cv_op), ]
}

# Rows are stacked in reverse grid order (last combination first): row k of
# `cv_final` corresponds to row nrow(xgb_grid_1) - k + 1 of `xgb_grid_1`.
# This matches the ordering produced by the earlier prepend-style rbind().
cv_final <- do.call(rbind, rev(cv_rows))