rm(list=ls()); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 370914 19.9 592000 31.7 460000 24.6
## Vcells 567306 4.4 1308461 10.0 807518 6.2
library(caret)
library(corrplot)
library(xgboost)
library(stats)
library(knitr)
library(ggplot2)
library(Matrix)
library(plotly)
library(htmlwidgets)
library(readr)
library(randomForest)
library(data.table)
library(h2o)
library(dplyr)
library(tidyr)
setwd("~/MassiveAnalytics/")
train <- read_csv("~/MassiveAnalytics/datasets/train.csv")
test <- read_csv("~/MassiveAnalytics/datasets/test.csv")
##### Remove id
train$Id<-NULL
test$Id<-NULL
# identify number of classes
num.class = length(levels(factor(unlist(train[,"Response"]))))
y = as.matrix(as.integer(unlist(train[,"Response"]))-1)
##### Remove columns that contain NA values (train and test are filtered separately)
cols.without.na = colSums(is.na(train)) == 0
train = train[, cols.without.na]
cols.without.na = colSums(is.na(test)) == 0
test = test[, cols.without.na]
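# A hedged alternative (sketch, not run): drop a column only if it has NAs in either
# data set, so train and test keep the same feature columns after filtering.
# ok.train <- names(train)[colSums(is.na(train)) == 0]
# ok.test  <- names(test)[colSums(is.na(test)) == 0]
# keep     <- intersect(ok.train, ok.test)
# train    <- train[, c(keep, "Response")]
# test     <- test[, keep]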
##### Check for zero variance
zero.var = nearZeroVar(train, saveMetrics=F)
zero.var
## [1] 1 5 7 16 19 21 23 33 35 38 41 43 45 46 48 52 55
## [18] 56 59 62 64 66 67 68 69 70 71 72 73 74 75 77 78 79
## [35] 81 82 83 84 85 86 87 89 91 92 93 94 95 96 97 98 99
## [52] 100 101 103 104 106 107 108 109 110 111 112
train<-train[,-zero.var]
test<-test[, -zero.var]
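# Note (sketch, not run): nearZeroVar(..., saveMetrics = TRUE) reports freqRatio and
# percentUnique per column, which is handy for reviewing the flagged variables before
# dropping them.
# zero.var.metrics <- nearZeroVar(train, saveMetrics = TRUE)
# head(zero.var.metrics[zero.var.metrics$nzv, ])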
##### Simple visualization
#x<-as.data.frame(head(train[,c("BMI","Ht","Wt","Ins_Age","Product_Info_3")],100))
x<-as.data.frame(head(train[,c("BMI","Ht","Wt")],100))
y1<-factor(unlist(head(train[,"Response"],100)))
trellis.par.set(theme = col.whitebg(), warn = FALSE)
featurePlot(x, y1, "box",auto.key = list(columns = 3))
featurePlot(x, y1, "density",
# scales = list(x = list(relation="free"),
# y = list(relation="free")),
# adjust = 1.5,
# pch = "|",
# layout = c(4, 2),
auto.key = list(columns = 3))
corrplot.mixed(cor(train[,c(2:20)]), lower="circle", upper="color",
tl.pos="lt", tl.cex=0.6, diag="n", order="hclust", hclust.method="complete")
##### convert data to matrix
train$Response = NULL
train.matrix = as.matrix(train)
mode(train.matrix) = "numeric"
test.matrix = as.matrix(test)
mode(test.matrix) = "numeric"
param <- list("objective" = "multi:softprob", # multiclass classification
"num_class" = num.class, # number of classes
"eval_metric" = "merror",
"nthread" = 8, # number of threads to be used
"max_depth" = 5, # maximum depth of tree
"eta" = 0.1, # step size shrinkage
"gamma" = 0, # minimum loss reduction
"subsample" = 0.7,
"colsample_bytree" = 0.7,
"min_child_weight" = 3
)
set.seed(789)
nround.cv = 10
system.time( bst.cv <- xgb.cv(param=param, data=train.matrix, label=y,
nfold=4, nrounds=nround.cv, prediction=TRUE, verbose=T,
# callbacks = list(cb.cv.predict(save_models = FALSE))
))
## [1] train-merror:0.501861+0.007049 test-merror:0.508614+0.007918
## [2] train-merror:0.489163+0.007403 test-merror:0.495680+0.010162
## [3] train-merror:0.483954+0.003142 test-merror:0.490611+0.003303
## [4] train-merror:0.479828+0.001473 test-merror:0.487395+0.002143
## [5] train-merror:0.477583+0.000526 test-merror:0.485980+0.000954
## [6] train-merror:0.476668+0.001158 test-merror:0.484532+0.002228
## [7] train-merror:0.476005+0.000491 test-merror:0.483522+0.002003
## [8] train-merror:0.475034+0.000422 test-merror:0.483437+0.002106
## [9] train-merror:0.474040+0.000272 test-merror:0.482831+0.002194
## [10] train-merror:0.473429+0.000675 test-merror:0.482275+0.002708
## user system elapsed
## 43.97 1.64 14.40
bst.cv$evaluation_log %>%
select(-contains("std")) %>%
gather(TestOrTrain, merror,-iter) %>%
ggplot(aes(x = iter, y = merror, group = TestOrTrain, color = TestOrTrain)) +
geom_line() +
theme_bw()
##### Rename column names
col.names<-colnames(bst.cv$evaluation_log)
setnames(bst.cv$evaluation_log, old = col.names, new = c("iter","train.merror.mean","train.merror.std","test.merror.mean","test.merror.std" ))
min.merror.idx = which.min(bst.cv$evaluation_log[, test.merror.mean])
min.merror.idx
## [1] 10
bst.cv$dt=bst.cv$evaluation_log
bst.cv$dt[min.merror.idx,]
## iter train.merror.mean train.merror.std test.merror.mean
## 1: 10 0.4734288 0.0006747119 0.4822755
## test.merror.std
## 1: 0.002707574
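# Sketch (not run): fit a final booster with the round count selected above and score the
# test matrix. For multi:softprob, predict() returns num.class probabilities per row,
# laid out row by row.
# bst <- xgboost(params = param, data = train.matrix, label = y,
#                nrounds = min.merror.idx, verbose = 0)
# test.prob <- matrix(predict(bst, test.matrix), ncol = num.class, byrow = TRUE)
# test.pred <- max.col(test.prob, "last")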
#model_dump <- xgb.dump(bst.cv, with.stats = T)
#importance.matrix <- xgb.importance(names(data.roughfix), model_xgboost)
#xgb.plot.importance(importance.matrix[1:30])
pred.cv = matrix(bst.cv$pred, nrow=length(bst.cv$pred)/num.class, ncol=num.class)
pred.cv = max.col(pred.cv, "last")
#pred.cv[floor(runif(1428, min=0, max=51000))]<-4
confusionMatrix(factor(y+1), factor(pred.cv))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8
## 1 833 652 6 0 587 1758 604 1767
## 2 374 1284 5 0 824 1766 570 1729
## 3 31 42 8 0 309 470 24 129
## 4 13 2 0 1 2 698 48 664
## 5 135 447 6 0 2780 1255 213 596
## 6 276 315 0 0 473 6021 1053 3095
## 7 134 52 0 1 28 2187 2446 3179
## 8 52 20 0 0 23 1554 470 17370
##
## Overall Statistics
##
## Accuracy : 0.5177
## 95% CI : (0.5137, 0.5218)
## No Information Rate : 0.4804
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3685
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.45076 0.45629 0.3200000 5.000e-01 0.55312
## Specificity 0.90659 0.90687 0.9830683 9.760e-01 0.95121
## Pos Pred Value 0.13420 0.19597 0.0078973 7.003e-04 0.51178
## Neg Pred Value 0.98091 0.97104 0.9997087 1.000e+00 0.95837
## Prevalence 0.03112 0.04739 0.0004210 3.368e-05 0.08464
## Detection Rate 0.01403 0.02162 0.0001347 1.684e-05 0.04682
## Detection Prevalence 0.10453 0.11034 0.0170593 2.405e-02 0.09148
## Balanced Accuracy 0.67868 0.68158 0.6515341 7.380e-01 0.75217
## Class: 6 Class: 7 Class: 8
## Sensitivity 0.3833 0.45063 0.6089
## Specificity 0.8807 0.89656 0.9313
## Pos Pred Value 0.5360 0.30472 0.8913
## Neg Pred Value 0.7988 0.94193 0.7203
## Prevalence 0.2645 0.09141 0.4804
## Detection Rate 0.1014 0.04119 0.2925
## Detection Prevalence 0.1892 0.13518 0.3282
## Balanced Accuracy 0.6320 0.67359 0.7701
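# Side note (sketch, not run): with objective = "multi:softmax" xgboost returns the
# 0-based class index directly, so the matrix/max.col reshaping above is unnecessary.
# param.softmax <- modifyList(param, list(objective = "multi:softmax"))
# bst.softmax <- xgboost(params = param.softmax, data = train.matrix, label = y,
#                        nrounds = nround.cv, verbose = 0)
# pred.class <- predict(bst.softmax, train.matrix) + 1   # back to the 1..8 Response scale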
train <- read_csv("~/MassiveAnalytics/datasets/train.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## Product_Info_2 = col_character(),
## Product_Info_4 = col_double(),
## Ins_Age = col_double(),
## Ht = col_double(),
## Wt = col_double(),
## BMI = col_double(),
## Employment_Info_1 = col_double(),
## Employment_Info_4 = col_double(),
## Employment_Info_6 = col_double(),
## Insurance_History_5 = col_double(),
## Family_Hist_2 = col_double(),
## Family_Hist_3 = col_double(),
## Family_Hist_4 = col_double(),
## Family_Hist_5 = col_double()
## )
## See spec(...) for full column specifications.
test <- read_csv("~/MassiveAnalytics/datasets/test.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## Product_Info_2 = col_character(),
## Product_Info_4 = col_double(),
## Ins_Age = col_double(),
## Ht = col_double(),
## Wt = col_double(),
## BMI = col_double(),
## Employment_Info_1 = col_double(),
## Employment_Info_4 = col_double(),
## Employment_Info_6 = col_double(),
## Insurance_History_5 = col_double(),
## Family_Hist_2 = col_double(),
## Family_Hist_3 = col_double(),
## Family_Hist_4 = col_double(),
## Family_Hist_5 = col_double()
## )
## See spec(...) for full column specifications.
# Train and test share all features, so apply the feature transformations to both at once.
response <- train$Response
train$training <- 1
test$training <- 0
data <- rbind(train[-c(1,128)], test[-1])
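# Quick sanity check (sketch): the two halves should have matching column names for
# rbind() to combine them correctly (assumes Id is column 1 and Response column 128 in train).
# stopifnot(identical(colnames(train[-c(1, 128)]), colnames(test[-1])))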
colnames(data)
## [1] "Product_Info_1" "Product_Info_2" "Product_Info_3"
## [4] "Product_Info_4" "Product_Info_5" "Product_Info_6"
## [7] "Product_Info_7" "Ins_Age" "Ht"
## [10] "Wt" "BMI" "Employment_Info_1"
## [13] "Employment_Info_2" "Employment_Info_3" "Employment_Info_4"
## [16] "Employment_Info_5" "Employment_Info_6" "InsuredInfo_1"
## [19] "InsuredInfo_2" "InsuredInfo_3" "InsuredInfo_4"
## [22] "InsuredInfo_5" "InsuredInfo_6" "InsuredInfo_7"
## [25] "Insurance_History_1" "Insurance_History_2" "Insurance_History_3"
## [28] "Insurance_History_4" "Insurance_History_5" "Insurance_History_7"
## [31] "Insurance_History_8" "Insurance_History_9" "Family_Hist_1"
## [34] "Family_Hist_2" "Family_Hist_3" "Family_Hist_4"
## [37] "Family_Hist_5" "Medical_History_1" "Medical_History_2"
## [40] "Medical_History_3" "Medical_History_4" "Medical_History_5"
## [43] "Medical_History_6" "Medical_History_7" "Medical_History_8"
## [46] "Medical_History_9" "Medical_History_10" "Medical_History_11"
## [49] "Medical_History_12" "Medical_History_13" "Medical_History_14"
## [52] "Medical_History_15" "Medical_History_16" "Medical_History_17"
## [55] "Medical_History_18" "Medical_History_19" "Medical_History_20"
## [58] "Medical_History_21" "Medical_History_22" "Medical_History_23"
## [61] "Medical_History_24" "Medical_History_25" "Medical_History_26"
## [64] "Medical_History_27" "Medical_History_28" "Medical_History_29"
## [67] "Medical_History_30" "Medical_History_31" "Medical_History_32"
## [70] "Medical_History_33" "Medical_History_34" "Medical_History_35"
## [73] "Medical_History_36" "Medical_History_37" "Medical_History_38"
## [76] "Medical_History_39" "Medical_History_40" "Medical_History_41"
## [79] "Medical_Keyword_1" "Medical_Keyword_2" "Medical_Keyword_3"
## [82] "Medical_Keyword_4" "Medical_Keyword_5" "Medical_Keyword_6"
## [85] "Medical_Keyword_7" "Medical_Keyword_8" "Medical_Keyword_9"
## [88] "Medical_Keyword_10" "Medical_Keyword_11" "Medical_Keyword_12"
## [91] "Medical_Keyword_13" "Medical_Keyword_14" "Medical_Keyword_15"
## [94] "Medical_Keyword_16" "Medical_Keyword_17" "Medical_Keyword_18"
## [97] "Medical_Keyword_19" "Medical_Keyword_20" "Medical_Keyword_21"
## [100] "Medical_Keyword_22" "Medical_Keyword_23" "Medical_Keyword_24"
## [103] "Medical_Keyword_25" "Medical_Keyword_26" "Medical_Keyword_27"
## [106] "Medical_Keyword_28" "Medical_Keyword_29" "Medical_Keyword_30"
## [109] "Medical_Keyword_31" "Medical_Keyword_32" "Medical_Keyword_33"
## [112] "Medical_Keyword_34" "Medical_Keyword_35" "Medical_Keyword_36"
## [115] "Medical_Keyword_37" "Medical_Keyword_38" "Medical_Keyword_39"
## [118] "Medical_Keyword_40" "Medical_Keyword_41" "Medical_Keyword_42"
## [121] "Medical_Keyword_43" "Medical_Keyword_44" "Medical_Keyword_45"
## [124] "Medical_Keyword_46" "Medical_Keyword_47" "Medical_Keyword_48"
## [127] "training"
prop.table(table(response))
## response
## 1 2 3 4 5 6
## 0.10452838 0.11033832 0.01705933 0.02404810 0.09147707 0.18916825
## 7 8
## 0.13517792 0.32820262
#plot(prop.table(table(response)))
feature.names <- names(data[-127])
for (f in feature.names) {
  if (class(data[[f]]) == "character") {
    print(class(data[[f]]))
    # Encode character columns as integers; the level order follows first appearance
    # across train and test.
    levels <- unique(c(train[[f]], test[[f]]))
    train[[f]] <- as.integer(factor(train[[f]], levels = levels))
    test[[f]]  <- as.integer(factor(test[[f]],  levels = levels))
    data[[f]]  <- as.integer(factor(data[[f]],  levels = levels))
  }
}
## [1] "character"
data.roughfix <- na.roughfix(data)
y = as.matrix(as.integer(unlist(response))-1)
# Using training data to identify most important features with xgboost.
system.time(model_xgboost <- xgboost(data = data.matrix(data.roughfix[data.roughfix$training==1,]),
label = y,
nround = 10,
objective = "multi:softprob",
eval_metric = "merror",
num_class=8,
eta = 0.01, # learning rate
max.depth = 3,
missing = NaN,
verbose = TRUE,
print_every_n = 1,
early_stopping_rounds = 10 ))
## [1] train-merror:0.472912
## Will train until train_merror hasn't improved in 10 rounds.
##
## [2] train-merror:0.472912
## [3] train-merror:0.472895
## [4] train-merror:0.472575
## [5] train-merror:0.472777
## [6] train-merror:0.465974
## [7] train-merror:0.465974
## [8] train-merror:0.465907
## [9] train-merror:0.466159
## [10] train-merror:0.464980
## user system elapsed
## 23.85 0.33 7.17
model_dump <- xgb.dump(model_xgboost, with_stats = T)
importance.matrix <- xgb.importance(names(data.roughfix), model_xgboost)
xgb.plot.importance(importance.matrix[1:30])
##### Create a feature counting the medical keywords for each instance (Medical_Keyword_1-48 are columns 79:126 of data.roughfix)
medkeywords <- apply(data.roughfix[,79:126], 1, sum)
data.roughfix$medkeywords <- as.integer(medkeywords)
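# Equivalent vectorised form plus a quick distribution check (sketch, not run):
# stopifnot(all(rowSums(data.roughfix[, 79:126]) == data.roughfix$medkeywords))
# table(data.roughfix$medkeywords)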
partition <- createDataPartition(response, times = 1, p = 0.75)
training <- data.roughfix[data.roughfix$training==1,]
y_train <- y[partition$Resample1,]
y_test <- y[-partition$Resample1,]
training_train <- training[partition$Resample1,-127]
training_test <- training[-partition$Resample1,-127]
system.time(model_xgboost <- xgboost(data = data.matrix(training_train),
label = y_train,
nround = 10,
objective = "multi:softprob",
eval_metric = "merror",
num_class=8,
eta = 0.01,
max.depth = 3,
missing = NaN,
verbose = TRUE,
print_every_n = 1,
early_stopping_rounds = 10))
## [1] train-merror:0.470092
## Will train until train_merror hasn't improved in 10 rounds.
##
## [2] train-merror:0.470159
## [3] train-merror:0.469149
## [4] train-merror:0.471753
## [5] train-merror:0.471708
## [6] train-merror:0.471865
## [7] train-merror:0.465354
## [8] train-merror:0.465152
## [9] train-merror:0.464905
## [10] train-merror:0.465017
## user system elapsed
## 17.73 0.38 5.15
pred <- predict(model_xgboost, data.matrix(training_test), missing=NaN)
pred_m<- matrix(pred, nrow=length(pred)/num.class, ncol=num.class)
pred_m = max.col(pred_m, "last")
confusionMatrix(factor(y_test+1), factor(pred_m))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8
## 1 196 187 178 181 179 183 185 192
## 2 227 206 202 198 209 204 220 206
## 3 32 28 31 39 39 33 40 35
## 4 45 52 34 40 57 51 45 46
## 5 157 161 182 167 169 153 175 177
## 6 356 335 341 381 320 352 363 377
## 7 260 248 226 255 236 253 253 263
## 8 633 589 625 601 612 611 603 611
##
## Overall Statistics
##
## Accuracy : 0.1252
## 95% CI : (0.1199, 0.1306)
## No Information Rate : 0.1285
## P-Value [Acc > NIR] : 0.8879
##
## Kappa : -9e-04
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.10283 0.11406 0.017042 0.021482 0.09281 0.19130
## Specificity 0.90069 0.88757 0.981115 0.974582 0.91001 0.80984
## Pos Pred Value 0.13234 0.12321 0.111913 0.108108 0.12603 0.12460
## Neg Pred Value 0.87204 0.87854 0.877265 0.874128 0.87767 0.87621
## Prevalence 0.12839 0.12166 0.122533 0.125429 0.12267 0.12395
## Detection Rate 0.01320 0.01388 0.002088 0.002695 0.01138 0.02371
## Detection Prevalence 0.09976 0.11263 0.018659 0.024924 0.09033 0.19030
## Balanced Accuracy 0.50176 0.50082 0.499079 0.498032 0.50141 0.50057
## Class: 7 Class: 8
## Sensitivity 0.13429 0.32040
## Specificity 0.86567 0.66966
## Pos Pred Value 0.12688 0.12508
## Neg Pred Value 0.87308 0.86988
## Prevalence 0.12691 0.12846
## Detection Rate 0.01704 0.04116
## Detection Prevalence 0.13432 0.32907
## Balanced Accuracy 0.49998 0.49503
model_dump <- xgb.dump(model_xgboost, with_stats = T)
importance.matrix <- xgb.importance(names(data.roughfix), model_xgboost)
xgb.plot.importance(importance.matrix[1:30])
##### Categorical variables. One-hot encoding the most important.
categorical_string <- as.character("Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41")
categorical_names <- unlist(strsplit(categorical_string, split = ", "))
top30features <- importance.matrix$Feature[1:30]
which(top30features %in% categorical_names)
## [1] 3 4 10 11 12 13 14 15 18
top30categorical_names <- top30features[which(top30features %in% categorical_names)]
# One-hot encode the categorical variables found among the top 30 features
top30categorical_factor <- as.data.frame(apply(data.roughfix[,top30categorical_names],2,as.factor))
categorical_one_hot <- as.data.frame(model.matrix(~.-1, top30categorical_factor[-8])) # Except Medical_History_2 which has too many levels.
categorical_one_hot2 <- as.data.frame(sapply(categorical_one_hot,as.factor))
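# Alternative sketch using caret::dummyVars on the same columns (keeps every level of every
# factor, whereas model.matrix(~ . - 1) above drops a reference level for all but the first factor).
# dv  <- dummyVars(~ ., data = top30categorical_factor[-8])
# ohe <- as.data.frame(predict(dv, newdata = top30categorical_factor[-8]))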
str(categorical_one_hot2)
## 'data.frame': 79146 obs. of 66 variables:
## $ Medical_History_231: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ Medical_History_232: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_233: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 1 ...
## $ Medical_History_42 : Factor w/ 2 levels "0","1": 1 1 2 2 2 2 2 2 2 2 ...
## $ Medical_History_282: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_283: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_210: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_211: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_212: Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 2 1 1 ...
## $ Employment_Info_213: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_214: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_215: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
## $ Employment_Info_216: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_217: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_218: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_219: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_22 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_220: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_221: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_222: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_223: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_224: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_225: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_226: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_227: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_228: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_229: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_23 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_230: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_231: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_232: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_233: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_234: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_235: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_236: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_237: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_238: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_24 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_25 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_26 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_27 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_28 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_29 : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 1 ...
## $ Medical_History_302: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_303: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_402: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_403: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ InsuredInfo_53 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_210 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_211 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_212 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_213 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_214 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_215 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_216 : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 2 1 1 ...
## $ Product_Info_217 : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 1 ...
## $ Product_Info_218 : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
## $ Product_Info_219 : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 2 ...
## $ Product_Info_22 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_23 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_24 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_25 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_26 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_27 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_28 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ Product_Info_29 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
data.roughfix2 <- cbind(data.roughfix, categorical_one_hot2)
system.time(model2 <- xgboost(data = data.matrix(data.roughfix2[data.roughfix2$training==1,]),
label = y,
nround = 10,
objective = "multi:softprob",
eval_metric = "merror",
num_class=8,
eta = 0.01,
max.depth = 3,
missing = NaN,
verbose = TRUE,
print_every_n = 1,
early_stopping_rounds = 10 ))
## [1] train-merror:0.471615
## Will train until train_merror hasn't improved in 10 rounds.
##
## [2] train-merror:0.471615
## [3] train-merror:0.471599
## [4] train-merror:0.471599
## [5] train-merror:0.471531
## [6] train-merror:0.464610
## [7] train-merror:0.464728
## [8] train-merror:0.464677
## [9] train-merror:0.464543
## [10] train-merror:0.474007
## user system elapsed
## 34.16 0.45 9.80
model_dump <- xgb.dump(model2, with_stats = T)
importance.matrix <- xgb.importance(names(data.roughfix2), model2)
xgb.plot.importance(importance.matrix[1:30])
##### Cross-validating with ScoreQuadraticWeightedKappa
folds <- createFolds(response, 2)
training <- data.roughfix[data.roughfix$training == 1,]
cv_results <- lapply(folds, function(x){
train <- data.matrix(training[-x,])
test <- data.matrix(training[x,])
model <- xgboost(data = train,
label = y[-x],
nround = 10,
objective = "multi:softprob",
eval_metric = "merror",
num_class=8,
eta = 0.01,
max.depth = 3,
missing = NaN,
verbose = TRUE,
print_every_n = 1,
early_stopping_rounds = 10
)
model_pred <- predict(model, test, missing=NaN)
pred_m<- matrix(model_pred, nrow=length(model_pred)/num.class, ncol=num.class)
pred_m = max.col(pred_m, "last")
actual <- response[x]
qwkappa <- Metrics::ScoreQuadraticWeightedKappa(actual, pred_m)
print(qwkappa)
return(qwkappa)
})
## [1] train-merror:0.471153
## Will train until train_merror hasn't improved in 10 rounds.
##
## [2] train-merror:0.471153
## [3] train-merror:0.464417
## [4] train-merror:0.465562
## [5] train-merror:0.475464
## [6] train-merror:0.475531
## [7] train-merror:0.475565
## [8] train-merror:0.475262
## [9] train-merror:0.474487
## [10] train-merror:0.473948
## [1] -0.004025722
## [1] train-merror:0.472853
## Will train until train_merror hasn't improved in 10 rounds.
##
## [2] train-merror:0.472853
## [3] train-merror:0.472819
## [4] train-merror:0.472954
## [5] train-merror:0.472954
## [6] train-merror:0.472954
## [7] train-merror:0.472853
## [8] train-merror:0.472449
## [9] train-merror:0.465443
## [10] train-merror:0.465477
## [1] 0.0006519459
cv_results
## $Fold1
## [1] -0.004025722
##
## $Fold2
## [1] 0.0006519459
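# Quick illustration of the metric on the 1..8 rating scale (sketch):
# Metrics::ScoreQuadraticWeightedKappa(c(1, 2, 8), c(1, 2, 8), 1, 8)   # perfect agreement -> 1
# Metrics::ScoreQuadraticWeightedKappa(c(1, 2, 8), c(8, 8, 1), 1, 8)   # strong disagreement -> negative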
#options(scipen=999);set.seed(19890624)
#localH2O <- h2o.init(ip = 'localhost', port = 54321, max_mem_size = '2g')
# 1. Read Data #####
#load('datasets/xgb_meta.RData')
#mthd <- 'GLM' # GBM, DL, RF, GLM
# 2. Eval Func ######
#evalerror_2 = function(x = seq(1.5, 7.5, by = 1), preds, labels) {
# cuts = c(min(preds), x[1], x[2], x[3], x[4], x[5], x[6], x[7], max(preds))
# preds = as.numeric(Hmisc::cut2(preds, cuts))
# err = Metrics::ScoreQuadraticWeightedKappa(as.numeric(labels), preds, 1, 8)
# return(-err)
#}
# 3. Model strategies #####
# cv <- 10
# folds <- createFolds(as.factor(train$Response), k = cv, list = FALSE)
# dropitems <- c('Id','Response')
# independent <- names(train)[!names(train) %in% dropitems]
# dependent <- "Response"
# colnames(train) <- c('Id', paste0('var_', 1:length(independent)), dependent)
# colnames(test) <- c('Id', paste0('var_', 1:length(independent)))
# independent <- paste0('var_', 1:length(independent))
#
# ### Setup Results Table ###
# results <- as.data.frame(matrix(rep(0,11*cv), cv))
# names(results) <- c('cv_num', 'kappa', 'optim_kappa', 'fixed_kappa', '1st_cut', '2nd_cut',
# '3rd_cut', '4th_cut', '5th_cut', '6th_cut', '7th_cut')
# for(i in 1:cv){
# f <- folds==i
#
# train_df_dl <- as.h2o(train[!f,],key="train_df")
# validation_df_dl <- as.h2o(train[f,],key="validation_df")
# validation_dl <- train[f,]
#
#
# print('Start training Deep Learning...')
# fit <-
# h2o.deeplearning(
# y = dependent, x = independent, training_frame = train_df_dl, overwrite_with_best_model = T, #autoencoder
# use_all_factor_levels = T, activation = "RectifierWithDropout",#TanhWithDropout "RectifierWithDropout"
# hidden = c(256,128), epochs = 18, train_samples_per_iteration = -2, adaptive_rate = T, rho = 0.99, #c(300,150,75)
# epsilon = 1e-6, rate = 0.035, rate_decay = 0.9, momentum_start = 0.9, momentum_stable = 0.99,
# nesterov_accelerated_gradient = T, input_dropout_ratio = 0.5, hidden_dropout_ratios = c(0.5,0.5),
# l1 = 1e-5, l2 = 3e-5, loss = 'Quadratic', classification_stop = 0.01,
# diagnostics = T, variable_importances = F, fast_mode = F, ignore_const_cols = T,
# force_load_balance = T, replicate_training_data = T, shuffle_training_data = T
# )
# print(fit)
# #print(fit)
# ### Predict against validation dataset
# validPreds <- as.data.frame(h2o.predict(object = fit, newdata = validation_df_dl))
# kappa <- evalerror_2(preds = validPreds$predict, labels = train[f,'Response'])
# ### Find optimal cutoff
# optCuts = optim(seq(1.5, 7.5, by = 1), evalerror_2, preds = validPreds$predict, labels = train[f,'Response'],
# method = 'Nelder-Mead', control = list(maxit = 30000, trace = TRUE, REPORT = 500))
#
# validPredsOptim = as.numeric(Hmisc::cut2(validPreds$predict, c(-Inf, optCuts$par, Inf))); table(validPredsOptim)
# optimal_kappa <- evalerror_2(preds = validPredsOptim, labels = validation_dl$Response)
# fix_cut <- c(2.6121, 3.3566, 4.1097, 5.0359, 5.5267, 6.4481, 6.7450)
# # c(2.951759, 3.653780, 4.402781, 4.911808, 5.543988, 6.135754, 6.716891)
# validPredsFix = as.numeric(Hmisc::cut2(validPreds[,1], c(-Inf, fix_cut, Inf)));
# fix_kappa = evalerror_2(preds = validPredsFix, labels = train[f,'Response'])
#
# results[i,1:11] <- c(paste0('CV_', i), -kappa, -optimal_kappa, -fix_kappa, optCuts$par)
# View(results)
#}
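# Standalone sketch of the cutoff search in the commented block above (synthetic data; assumes
# the Hmisc and Metrics packages are installed; neg_kappa is an illustrative rewrite of evalerror_2).
# set.seed(1)
# labels <- sample(1:8, 1000, replace = TRUE)
# preds  <- labels + rnorm(1000)                  # stand-in for continuous model output
# neg_kappa <- function(cuts, preds, labels) {
#   binned <- as.numeric(Hmisc::cut2(preds, c(-Inf, cuts, Inf)))
#   -Metrics::ScoreQuadraticWeightedKappa(labels, binned, 1, 8)
# }
# optCuts <- optim(seq(1.5, 7.5, by = 1), neg_kappa, preds = preds, labels = labels,
#                  method = 'Nelder-Mead', control = list(maxit = 30000))
# -optCuts$value                                  # quadratic weighted kappa at the optimised cutoffs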