We continue from the previous section. We first load the data and construct the new data set
library(lightgbm)
## Loading required package: R6
library(stats)
library(Matrix)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
setwd("~/Desktop/FALL_2019/Machine Learning--ParisLab")
concrete <- read.csv("data_cleaned.csv")
concrete <- na.omit(concrete)
## we impute the missing values with negative for the lightgbm tree to distinguish
concrete[is.na(concrete)] = -1
## we remove the samples with no designed strength, and we reduce our sample size to 9557
concrete <- concrete[!concrete$Design_Strength == 0,]
concrete$prec_Fly_Ash <- as.numeric(sub("%","",concrete$prec_Fly_Ash))/100
concrete$diff <- concrete$X28_Day/concrete$Design_Strength
concrete$over <- ifelse(concrete$diff > 1, 1,0)
concrete$over <- as.numeric(as.character(concrete$over))
var_index_keep <- c(3,6:8,17:19,24,29:30,32) ## the index
res_index <- 37
vars <- concrete[var_index_keep]
names(vars)
## [1] "Coarse_Agg_Weight" "Fine_Agg_Weight" "Cement_Weight"
## [4] "Fly_Ash_Weight" "Load_Size" "AEA_Dose"
## [7] "TYPE_AWRA_Dose" "Description_D" "Con_Temp"
## [10] "Ambient.Temp" "Water"
vars_int <- model.matrix(~(Coarse_Agg_Weight+Fine_Agg_Weight+Cement_Weight+Fly_Ash_Weight+Load_Size+AEA_Dose+TYPE_AWRA_Dose+Con_Temp+Ambient.Temp+Water)^2,vars)
## 2 and 3 order
num_vars <- vars[-8]
num_vars_2 <- num_vars^2
colnames(num_vars_2) <- c("Coarse_Agg_Weight_2","Fine_Agg_Weight_2","Cement_Weight_2","Fly_Ash_Weight_2","Load_Size_2","AEA_Dose_2","TYPE_AWRA_Dose_2","Con_Temp_2" ,"Ambient.Temp_2","Water_2")
num_vars_3 <- num_vars^3
colnames(num_vars_3) <- c("Coarse_Agg_Weight_3","Fine_Agg_Weight_3","Cement_Weight_3","Fly_Ash_Weight_3","Load_Size_3","AEA_Dose_3","TYPE_AWRA_Dose_3", "Con_Temp_3" ,"Ambient.Temp_3","Water_3")
## concatenating to create the full variable list
vars_complete <- cbind(vars, num_vars_2, num_vars_3, vars_int)
## creating the full dataset
res <- concrete[res_index]
concrete_up <- cbind(vars_complete, res)
data_samp <- data.frame(concrete_up)
set.seed(1234567)
samp_perc <-sample(1:nrow(data_samp),nrow(data_samp)*0.8,replace = F)
train <-data_samp[samp_perc,]
test <- data_samp[-samp_perc,]
train_mod <- sparse.model.matrix(over ~., data = train)
train_label = train[, "over"]
test_mod <- sparse.model.matrix(over ~., data = test)
test_label = test[, "over"]
dtrain = lgb.Dataset(data = data.matrix(train_mod), label = train_label, free_raw_data = FALSE)
dtest = lgb.Dataset(data = data.matrix(train_mod), label = test_label, free_raw_data = FALSE)
lgb.Dataset.construct(dtrain)
dtest1 = lgb.Dataset.create.valid(dtrain, data = as.matrix(train_mod), label = train_label)
dtest2 = lgb.Dataset.create.valid(dtrain, data = data.matrix(test_mod), label = test_label)
valids <- list(test1 = dtest1, test2 = dtest2) ## creating validation set
scale_pos_weight = length(train$over)/sum(train$over == 1) - 1
params = list(learning_rate = 0.01,
objective = "binary",
metric = 'binary_logloss')
m5<- lgb.train(
data = dtrain,
valids = valids,
nrounds = 10000,
num_leaves = 30,
params = params,
scale_pos_weight = scale_pos_weight,
eval_freq = 100,
early_stopping_rounds = 50
)
## [1]: test1's binary_logloss:0.125326 test2's binary_logloss:0.135431
var_imp <- lgb.importance(m5)
new_var <- rep(NA, length(var_imp$Feature))
for (i in 1: length(var_imp$Feature)){
new_var[i] <- sub("\\.", ":", var_imp$Feature[i]) ## we use regular expression here because lightgbm change the interaction term : to .
}
concrete_upp <- cbind(concrete_up[new_var[1:20]],res)
names(concrete_upp)
## [1] "AEA_Dose:TYPE_AWRA_Dose"
## [2] "Cement_Weight"
## [3] "Fine_Agg_Weight:TYPE_AWRA_Dose"
## [4] "Coarse_Agg_Weight:Fine_Agg_Weight"
## [5] "Con_Temp:Ambient.Temp"
## [6] "Fine_Agg_Weight:Fly_Ash_Weight"
## [7] "TYPE_AWRA_Dose:Con_Temp"
## [8] "AEA_Dose:Con_Temp"
## [9] "Coarse_Agg_Weight:Cement_Weight"
## [10] "Coarse_Agg_Weight:TYPE_AWRA_Dose"
## [11] "Fly_Ash_Weight:Con_Temp"
## [12] "AEA_Dose:Water"
## [13] "Load_Size:AEA_Dose"
## [14] "Fly_Ash_Weight:AEA_Dose"
## [15] "Cement_Weight:Con_Temp"
## [16] "Fine_Agg_Weight:Con_Temp"
## [17] "Coarse_Agg_Weight:AEA_Dose"
## [18] "Coarse_Agg_Weight:Con_Temp"
## [19] "AEA_Dose:Ambient.Temp"
## [20] "Coarse_Agg_Weight"
## [21] "over"
We construct new lightgbm model based on the selected 20 variables
data_samp <- data.frame(concrete_upp)
set.seed(1234567)
samp_perc <-sample(1:nrow(data_samp),nrow(data_samp)*0.8,replace = F)
train <-data_samp[samp_perc,]
test <- data_samp[-samp_perc,]
train_mod <- sparse.model.matrix(over ~., data = train)
train_label = train[, "over"]
test_mod <- sparse.model.matrix(over ~., data = test)
test_label = test[, "over"]
dtrain = lgb.Dataset(data = data.matrix(train_mod), label = train_label, free_raw_data = FALSE)
dtest = lgb.Dataset(data = data.matrix(train_mod), label = test_label, free_raw_data = FALSE)
lgb.Dataset.construct(dtrain)
dtest1 = lgb.Dataset.create.valid(dtrain, data = as.matrix(train_mod), label = train_label)
dtest2 = lgb.Dataset.create.valid(dtrain, data = data.matrix(test_mod), label = test_label)
valids <- list(test1 = dtest1, test2 = dtest2)
scale_pos_weight = length(train$over)/sum(train$over == 1) - 1
m6<- lgb.train(
data = dtrain,
valids = valids,
nrounds = 10000,
num_leaves = 30,
params = params,
scale_pos_weight = scale_pos_weight,
eval_freq = 100,
early_stopping_rounds = 50
)
## [1]: test1's binary_logloss:0.125526 test2's binary_logloss:0.135176
pred_prob <- predict(m6, test_mod)
predicted = ifelse(pred_prob > 0.95,1,0)
confusionMatrix(factor(predicted), factor(test$over))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 28 685
## 1 30 1169
##
## Accuracy : 0.626
## 95% CI : (0.6039, 0.6478)
## No Information Rate : 0.9697
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0175
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.48276
## Specificity : 0.63053
## Pos Pred Value : 0.03927
## Neg Pred Value : 0.97498
## Prevalence : 0.03033
## Detection Rate : 0.01464
## Detection Prevalence : 0.37291
## Balanced Accuracy : 0.55664
##
## 'Positive' Class : 0
##
From the confusion matrix: the overall accuracy is 0.5638, and the detection of truly underdesigned samples is 0.4827586 (28/(30+28)) in the testing set
Now we attempt to use our new lightgbm model to predict the entire dataset
complete_mod <- sparse.model.matrix(over ~., data = data_samp)
# complete_label = data_samp[, "over"]
#
# dtrain = lgb.Dataset(data = data.matrix(complete_mod), label = complete_label, free_raw_data = FALSE)
# dtest = lgb.Dataset(data = data.matrix(complete_mod), label = complete_label, free_raw_data = FALSE)
# lgb.Dataset.construct(dtrain)
# scale_pos_weight = length(data_samp$over)/sum(data_samp$over == 1) - 1
pred_prob <- predict(m6, complete_mod)
predicted = ifelse(pred_prob > 0.95,1,0)
confusionMatrix(factor(predicted), factor(data_samp$over))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 243 3397
## 1 32 5885
##
## Accuracy : 0.6412
## 95% CI : (0.6315, 0.6508)
## No Information Rate : 0.9712
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0746
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.88364
## Specificity : 0.63402
## Pos Pred Value : 0.06676
## Neg Pred Value : 0.99459
## Prevalence : 0.02877
## Detection Rate : 0.02543
## Detection Prevalence : 0.38087
## Balanced Accuracy : 0.75883
##
## 'Positive' Class : 0
##
From our the model we built using light gbm, we can see that the accuracy rate is 0.6412 and the detection rate is approximately 88% (ie: approximately 88% of the samples that are underdesign can be identifies)