We continue from the previous section. We first load the data and construct the new data set

library(lightgbm)
## Loading required package: R6
library(stats)
library(Matrix)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
setwd("~/Desktop/FALL_2019/Machine Learning--ParisLab")
concrete <- read.csv("data_cleaned.csv")
concrete <- na.omit(concrete)
## we impute the missing values with negative for the lightgbm tree to distinguish
concrete[is.na(concrete)] = -1
## we remove the samples with no designed strength, and we reduce our sample size to 9557
concrete <- concrete[!concrete$Design_Strength == 0,]
concrete$prec_Fly_Ash <- as.numeric(sub("%","",concrete$prec_Fly_Ash))/100
concrete$diff <- concrete$X28_Day/concrete$Design_Strength
concrete$over <- ifelse(concrete$diff > 1, 1,0)
concrete$over <- as.numeric(as.character(concrete$over))
var_index_keep <- c(3,6:8,17:19,24,29:30,32) ## the index
res_index <- 37
vars <- concrete[var_index_keep]
names(vars)
##  [1] "Coarse_Agg_Weight" "Fine_Agg_Weight"   "Cement_Weight"    
##  [4] "Fly_Ash_Weight"    "Load_Size"         "AEA_Dose"         
##  [7] "TYPE_AWRA_Dose"    "Description_D"     "Con_Temp"         
## [10] "Ambient.Temp"      "Water"
vars_int <- model.matrix(~(Coarse_Agg_Weight+Fine_Agg_Weight+Cement_Weight+Fly_Ash_Weight+Load_Size+AEA_Dose+TYPE_AWRA_Dose+Con_Temp+Ambient.Temp+Water)^2,vars)
## 2 and 3 order
num_vars <- vars[-8]
num_vars_2 <- num_vars^2
colnames(num_vars_2) <- c("Coarse_Agg_Weight_2","Fine_Agg_Weight_2","Cement_Weight_2","Fly_Ash_Weight_2","Load_Size_2","AEA_Dose_2","TYPE_AWRA_Dose_2","Con_Temp_2" ,"Ambient.Temp_2","Water_2")
num_vars_3 <- num_vars^3
colnames(num_vars_3) <- c("Coarse_Agg_Weight_3","Fine_Agg_Weight_3","Cement_Weight_3","Fly_Ash_Weight_3","Load_Size_3","AEA_Dose_3","TYPE_AWRA_Dose_3", "Con_Temp_3" ,"Ambient.Temp_3","Water_3")
## concatenating to create the full variable list
vars_complete <- cbind(vars, num_vars_2, num_vars_3, vars_int)
## creating the full dataset
res <- concrete[res_index]
concrete_up <- cbind(vars_complete, res)
data_samp <- data.frame(concrete_up)
set.seed(1234567)
samp_perc <-sample(1:nrow(data_samp),nrow(data_samp)*0.8,replace = F) 
train <-data_samp[samp_perc,]
test <- data_samp[-samp_perc,]
train_mod <- sparse.model.matrix(over ~., data = train)
train_label = train[, "over"]
test_mod <- sparse.model.matrix(over ~., data = test)
test_label = test[, "over"]

dtrain = lgb.Dataset(data =  data.matrix(train_mod), label = train_label, free_raw_data = FALSE)
dtest = lgb.Dataset(data =  data.matrix(train_mod), label = test_label, free_raw_data = FALSE)

lgb.Dataset.construct(dtrain)

dtest1 = lgb.Dataset.create.valid(dtrain, data =  as.matrix(train_mod), label = train_label)
dtest2 = lgb.Dataset.create.valid(dtrain, data =  data.matrix(test_mod), label = test_label)
valids <- list(test1 = dtest1, test2 = dtest2) ## creating validation set
scale_pos_weight = length(train$over)/sum(train$over == 1) - 1
params = list(learning_rate = 0.01,
              objective = "binary",
              metric = 'binary_logloss')


m5<- lgb.train(
  data = dtrain,
  valids = valids,
  nrounds = 10000,
  num_leaves = 30,
  params = params,
  scale_pos_weight = scale_pos_weight,
  eval_freq = 100,
  early_stopping_rounds = 50
)
## [1]: test1's binary_logloss:0.125326 test2's binary_logloss:0.135431
var_imp <- lgb.importance(m5)
new_var <- rep(NA, length(var_imp$Feature))

for (i in 1: length(var_imp$Feature)){
  new_var[i] <- sub("\\.", ":", var_imp$Feature[i]) ## we use regular expression here because lightgbm change the interaction term : to .
}

concrete_upp <- cbind(concrete_up[new_var[1:20]],res)
names(concrete_upp)
##  [1] "AEA_Dose:TYPE_AWRA_Dose"          
##  [2] "Cement_Weight"                    
##  [3] "Fine_Agg_Weight:TYPE_AWRA_Dose"   
##  [4] "Coarse_Agg_Weight:Fine_Agg_Weight"
##  [5] "Con_Temp:Ambient.Temp"            
##  [6] "Fine_Agg_Weight:Fly_Ash_Weight"   
##  [7] "TYPE_AWRA_Dose:Con_Temp"          
##  [8] "AEA_Dose:Con_Temp"                
##  [9] "Coarse_Agg_Weight:Cement_Weight"  
## [10] "Coarse_Agg_Weight:TYPE_AWRA_Dose" 
## [11] "Fly_Ash_Weight:Con_Temp"          
## [12] "AEA_Dose:Water"                   
## [13] "Load_Size:AEA_Dose"               
## [14] "Fly_Ash_Weight:AEA_Dose"          
## [15] "Cement_Weight:Con_Temp"           
## [16] "Fine_Agg_Weight:Con_Temp"         
## [17] "Coarse_Agg_Weight:AEA_Dose"       
## [18] "Coarse_Agg_Weight:Con_Temp"       
## [19] "AEA_Dose:Ambient.Temp"            
## [20] "Coarse_Agg_Weight"                
## [21] "over"

We construct new lightgbm model based on the selected 20 variables

data_samp <- data.frame(concrete_upp)
set.seed(1234567)
samp_perc <-sample(1:nrow(data_samp),nrow(data_samp)*0.8,replace = F) 
train <-data_samp[samp_perc,]
test <- data_samp[-samp_perc,]
train_mod <- sparse.model.matrix(over ~., data = train)
train_label = train[, "over"]
test_mod <- sparse.model.matrix(over ~., data = test)
test_label = test[, "over"]

dtrain = lgb.Dataset(data =  data.matrix(train_mod), label = train_label, free_raw_data = FALSE)
dtest = lgb.Dataset(data =  data.matrix(train_mod), label = test_label, free_raw_data = FALSE)

lgb.Dataset.construct(dtrain)

dtest1 = lgb.Dataset.create.valid(dtrain, data =  as.matrix(train_mod), label = train_label)
dtest2 = lgb.Dataset.create.valid(dtrain, data =  data.matrix(test_mod), label = test_label)
valids <- list(test1 = dtest1, test2 = dtest2)

scale_pos_weight = length(train$over)/sum(train$over == 1) - 1

m6<- lgb.train(
  data = dtrain,
  valids = valids,
  nrounds = 10000,
  num_leaves = 30,
  params = params,
  scale_pos_weight = scale_pos_weight,
  eval_freq = 100,
  early_stopping_rounds = 50
)
## [1]: test1's binary_logloss:0.125526 test2's binary_logloss:0.135176
pred_prob <- predict(m6, test_mod)
predicted = ifelse(pred_prob > 0.95,1,0)
confusionMatrix(factor(predicted), factor(test$over))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0   28  685
##          1   30 1169
##                                           
##                Accuracy : 0.626           
##                  95% CI : (0.6039, 0.6478)
##     No Information Rate : 0.9697          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0175          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.48276         
##             Specificity : 0.63053         
##          Pos Pred Value : 0.03927         
##          Neg Pred Value : 0.97498         
##              Prevalence : 0.03033         
##          Detection Rate : 0.01464         
##    Detection Prevalence : 0.37291         
##       Balanced Accuracy : 0.55664         
##                                           
##        'Positive' Class : 0               
## 

From the confusion matrix: the overall accuracy is 0.5638, and the detection of truly underdesigned samples is 0.4827586 (28/(30+28)) in the testing set

Now we attempt to use our new lightgbm model to predict the entire dataset

complete_mod <- sparse.model.matrix(over ~., data = data_samp)
# complete_label = data_samp[, "over"]
# 
# dtrain = lgb.Dataset(data =  data.matrix(complete_mod), label = complete_label, free_raw_data = FALSE)
# dtest = lgb.Dataset(data =  data.matrix(complete_mod), label = complete_label, free_raw_data = FALSE)
# lgb.Dataset.construct(dtrain)
# scale_pos_weight = length(data_samp$over)/sum(data_samp$over == 1) - 1

pred_prob <- predict(m6, complete_mod)
predicted = ifelse(pred_prob > 0.95,1,0)
confusionMatrix(factor(predicted), factor(data_samp$over))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  243 3397
##          1   32 5885
##                                           
##                Accuracy : 0.6412          
##                  95% CI : (0.6315, 0.6508)
##     No Information Rate : 0.9712          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0746          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.88364         
##             Specificity : 0.63402         
##          Pos Pred Value : 0.06676         
##          Neg Pred Value : 0.99459         
##              Prevalence : 0.02877         
##          Detection Rate : 0.02543         
##    Detection Prevalence : 0.38087         
##       Balanced Accuracy : 0.75883         
##                                           
##        'Positive' Class : 0               
## 

From our the model we built using light gbm, we can see that the accuracy rate is 0.6412 and the detection rate is approximately 88% (ie: approximately 88% of the samples that are underdesign can be identifies)