Previously, we have completed the process of variable selection, model selection, hyperparameter tuning. However, after careful examination and discussion with our other group members, we concluded that there is still room for improvement (the initial dataset we obtained contained no categorical variables, no variable interactions and no variables to the second or third degree) In order to further improve our model performance to achieve higher accuracy, we start the whole process from variable selection.
Because our aim is to correctly identify the underdesigned concrete samples, we need to balance overall prediction accuracy and detection rate. We have previously found that the concrete dataset is extremely unbalanced, meaning that number of samples that are above target greatly exceed the samples that are below target. To achieve an overall high prediction accuracy, the model need only to estimate all observations to be above target; however, this method overlooks samples that are truly underdesign and thus defeats the purpose of correctly identifying underdesigned concrete samples. For phase two of our model building process, we attempt to balance the overall prediction accuracy and detection rate.
In conclusion the second phase of our model building:
In this section, we use lightgbm for classification on above and below target, and identify the top 20 most important predictors
library(lightgbm)
## Loading required package: R6
library(stats)
library(Matrix)
setwd("~/Desktop/FALL_2019/Machine Learning--ParisLab")
concrete <- read.csv("data_cleaned.csv")
concrete <- na.omit(concrete)
## we impute the missing values with negative for the lightgbm tree to distinguish
concrete[is.na(concrete)] = -1
## we remove the samples with no designed strength, and we reduce our sample size to 9557
concrete <- concrete[!concrete$Design_Strength == 0,]
Now we check if all the factors in our original data set has more than one level
## checking factor with one level
l <- sapply(concrete, function(x) is.factor(x))
m <- concrete[, l]
ifelse(n <- sapply(m, function(x) length(levels(x))) == 1, "DROP", "NODROP")
## Coarse_Agg_Source Coarse_Agg_Description
## "NODROP" "NODROP"
## Fine_Agg_Source Fine_Agg_Description
## "NODROP" "NODROP"
## prec_Fly_Ash Cement_Type
## "NODROP" "DROP"
## Fly_Ash_or_Straight_Cement Mix_ID
## "NODROP" "NODROP"
## Date_of_Batch Description_A
## "NODROP" "NODROP"
## Description_B Description_C
## "NODROP" "NODROP"
## Description_D Description_E
## "NODROP" "NODROP"
## Mix_ID.1
## "NODROP"
From the analysis, we can drop “Cement_Type” because it only have one level.
Next we change the storage type of some of the variables
concrete$prec_Fly_Ash <- as.numeric(sub("%","",concrete$prec_Fly_Ash))/100
concrete$diff <- concrete$X28_Day/concrete$Design_Strength
concrete$over <- ifelse(concrete$diff > 1, 1,0)
concrete$over <- as.numeric(as.character(concrete$over))
After discussing, we decide to keep the following variables
var_index_keep <- c(3,6:8,17:19,24,29:30,32) ## the index
res_index <- 37
vars <- concrete[var_index_keep]
names(vars)
## [1] "Coarse_Agg_Weight" "Fine_Agg_Weight" "Cement_Weight"
## [4] "Fly_Ash_Weight" "Load_Size" "AEA_Dose"
## [7] "TYPE_AWRA_Dose" "Description_D" "Con_Temp"
## [10] "Ambient.Temp" "Water"
We create the interaction variables and the variables to the order 2 and 3
## interaction
vars_int <- model.matrix(~(Coarse_Agg_Weight+Fine_Agg_Weight+Cement_Weight+Fly_Ash_Weight+Load_Size+AEA_Dose+TYPE_AWRA_Dose+Con_Temp+Ambient.Temp+Water)^2,vars)
## 2 and 3 order
num_vars <- vars[-8]
num_vars_2 <- num_vars^2
colnames(num_vars_2) <- c("Coarse_Agg_Weight_2","Fine_Agg_Weight_2","Cement_Weight_2","Fly_Ash_Weight_2","Load_Size_2","AEA_Dose_2","TYPE_AWRA_Dose_2","Con_Temp_2" ,"Ambient.Temp_2","Water_2")
num_vars_3 <- num_vars^3
colnames(num_vars_3) <- c("Coarse_Agg_Weight_3","Fine_Agg_Weight_3","Cement_Weight_3","Fly_Ash_Weight_3","Load_Size_3","AEA_Dose_3","TYPE_AWRA_Dose_3", "Con_Temp_3" ,"Ambient.Temp_3","Water_3")
## concatenating to create the full variable list
vars_complete <- cbind(vars, num_vars_2, num_vars_3, vars_int)
## creating the full dataset
res <- concrete[res_index]
concrete_up <- cbind(vars_complete, res)
data_samp <- data.frame(concrete_up)
Next we split the data into training and testing and construct
set.seed(1234567)
samp_perc <-sample(1:nrow(data_samp),nrow(data_samp)*0.8,replace = F)
train <-data_samp[samp_perc,]
test <- data_samp[-samp_perc,]
train_mod <- sparse.model.matrix(over ~., data = train)
train_label = train[, "over"]
test_mod <- sparse.model.matrix(over ~., data = test)
test_label = test[, "over"]
dtrain = lgb.Dataset(data = data.matrix(train_mod), label = train_label, free_raw_data = FALSE)
dtest = lgb.Dataset(data = data.matrix(train_mod), label = test_label, free_raw_data = FALSE)
lgb.Dataset.construct(dtrain)
dtest1 = lgb.Dataset.create.valid(dtrain, data = as.matrix(train_mod), label = train_label)
dtest2 = lgb.Dataset.create.valid(dtrain, data = data.matrix(test_mod), label = test_label)
valids <- list(test1 = dtest1, test2 = dtest2) ## creating validation set
Because our data is highly unbalanced, we use the scale functionality in our dataset
scale_pos_weight = length(train$over)/sum(train$over == 1) - 1
Setting the parameters for lightgbm and building the lightgbm model
params = list(learning_rate = 0.01,
objective = "binary",
metric = 'binary_logloss')
m5<- lgb.train(
data = dtrain,
valids = valids,
nrounds = 10000,
num_leaves = 30,
params = params,
scale_pos_weight = scale_pos_weight,
eval_freq = 100,
early_stopping_rounds = 50
)
## [1]: test1's binary_logloss:0.125326 test2's binary_logloss:0.135431
Select the top 20 variables from the lightgbm model
## New variable Selection
var_imp <- lgb.importance(m5)
lgb.plot.importance(var_imp, top_n = 20)
new_var <- rep(NA, length(var_imp$Feature))
for (i in 1: length(var_imp$Feature)){
new_var[i] <- sub("\\.", ":", var_imp$Feature[i]) ## we use regular expression here because lightgbm change the interaction term : to .
}
concrete_upp <- cbind(concrete_up[new_var[1:20]],res)
names(concrete_upp)
## [1] "AEA_Dose:TYPE_AWRA_Dose"
## [2] "Cement_Weight"
## [3] "Fine_Agg_Weight:TYPE_AWRA_Dose"
## [4] "Coarse_Agg_Weight:Fine_Agg_Weight"
## [5] "Con_Temp:Ambient.Temp"
## [6] "Fine_Agg_Weight:Fly_Ash_Weight"
## [7] "TYPE_AWRA_Dose:Con_Temp"
## [8] "AEA_Dose:Con_Temp"
## [9] "Coarse_Agg_Weight:Cement_Weight"
## [10] "Coarse_Agg_Weight:TYPE_AWRA_Dose"
## [11] "Fly_Ash_Weight:Con_Temp"
## [12] "AEA_Dose:Water"
## [13] "Load_Size:AEA_Dose"
## [14] "Fly_Ash_Weight:AEA_Dose"
## [15] "Cement_Weight:Con_Temp"
## [16] "Fine_Agg_Weight:Con_Temp"
## [17] "Coarse_Agg_Weight:AEA_Dose"
## [18] "Coarse_Agg_Weight:Con_Temp"
## [19] "AEA_Dose:Ambient.Temp"
## [20] "Coarse_Agg_Weight"
## [21] "over"