PARIS variable selection update

Previously, we have completed the process of variable selection, model selection, hyperparameter tuning. However, after careful examination and discussion with our other group members, we concluded that there is still room for improvement (the initial dataset we obtained contained no categorical variables, no variable interactions and no variables to the second or third degree) In order to further improve our model performance to achieve higher accuracy, we start the whole process from variable selection.

Because our aim is to correctly identify the underdesigned concrete samples, we need to balance overall prediction accuracy and detection rate. We have previously found that the concrete dataset is extremely unbalanced, meaning that number of samples that are above target greatly exceed the samples that are below target. To achieve an overall high prediction accuracy, the model need only to estimate all observations to be above target; however, this method overlooks samples that are truly underdesign and thus defeats the purpose of correctly identifying underdesigned concrete samples. For phase two of our model building process, we attempt to balance the overall prediction accuracy and detection rate.

In conclusion the second phase of our model building:

set new criteria for model performance: maximize both overall prediction accuracy and detection rate
include new variables
include interaction terms, variables to degree 2 and 3
utilize new model: Lightgbm

In this section, we use lightgbm for classification on above and below target, and identify the top 20 most important predictors

library(lightgbm)

## Loading required package: R6

library(stats)
library(Matrix)
setwd("~/Desktop/FALL_2019/Machine Learning--ParisLab")
concrete <- read.csv("data_cleaned.csv")
concrete <- na.omit(concrete)
## we impute the missing values with negative for the lightgbm tree to distinguish
concrete[is.na(concrete)] = -1
## we remove the samples with no designed strength, and we reduce our sample size to 9557
concrete <- concrete[!concrete$Design_Strength == 0,]

Now we check if all the factors in our original data set has more than one level

## checking factor with one level
l <- sapply(concrete, function(x) is.factor(x))
m <- concrete[, l]
ifelse(n <- sapply(m, function(x) length(levels(x))) == 1, "DROP", "NODROP")

##          Coarse_Agg_Source     Coarse_Agg_Description 
##                   "NODROP"                   "NODROP" 
##            Fine_Agg_Source       Fine_Agg_Description 
##                   "NODROP"                   "NODROP" 
##               prec_Fly_Ash                Cement_Type 
##                   "NODROP"                     "DROP" 
## Fly_Ash_or_Straight_Cement                     Mix_ID 
##                   "NODROP"                   "NODROP" 
##              Date_of_Batch              Description_A 
##                   "NODROP"                   "NODROP" 
##              Description_B              Description_C 
##                   "NODROP"                   "NODROP" 
##              Description_D              Description_E 
##                   "NODROP"                   "NODROP" 
##                   Mix_ID.1 
##                   "NODROP"

From the analysis, we can drop “Cement_Type” because it only have one level.

Next we change the storage type of some of the variables

concrete$prec_Fly_Ash <- as.numeric(sub("%","",concrete$prec_Fly_Ash))/100
concrete$diff <- concrete$X28_Day/concrete$Design_Strength
concrete$over <- ifelse(concrete$diff > 1, 1,0)
concrete$over <- as.numeric(as.character(concrete$over))

After discussing, we decide to keep the following variables

var_index_keep <- c(3,6:8,17:19,24,29:30,32) ## the index
res_index <- 37
vars <- concrete[var_index_keep]
names(vars)

##  [1] "Coarse_Agg_Weight" "Fine_Agg_Weight"   "Cement_Weight"    
##  [4] "Fly_Ash_Weight"    "Load_Size"         "AEA_Dose"         
##  [7] "TYPE_AWRA_Dose"    "Description_D"     "Con_Temp"         
## [10] "Ambient.Temp"      "Water"

We create the interaction variables and the variables to the order 2 and 3

## interaction
vars_int <- model.matrix(~(Coarse_Agg_Weight+Fine_Agg_Weight+Cement_Weight+Fly_Ash_Weight+Load_Size+AEA_Dose+TYPE_AWRA_Dose+Con_Temp+Ambient.Temp+Water)^2,vars)
## 2 and 3 order
num_vars <- vars[-8]
num_vars_2 <- num_vars^2
colnames(num_vars_2) <- c("Coarse_Agg_Weight_2","Fine_Agg_Weight_2","Cement_Weight_2","Fly_Ash_Weight_2","Load_Size_2","AEA_Dose_2","TYPE_AWRA_Dose_2","Con_Temp_2" ,"Ambient.Temp_2","Water_2")
num_vars_3 <- num_vars^3
colnames(num_vars_3) <- c("Coarse_Agg_Weight_3","Fine_Agg_Weight_3","Cement_Weight_3","Fly_Ash_Weight_3","Load_Size_3","AEA_Dose_3","TYPE_AWRA_Dose_3", "Con_Temp_3" ,"Ambient.Temp_3","Water_3")
## concatenating to create the full variable list
vars_complete <- cbind(vars, num_vars_2, num_vars_3, vars_int)
## creating the full dataset
res <- concrete[res_index]
concrete_up <- cbind(vars_complete, res)
data_samp <- data.frame(concrete_up)

Next we split the data into training and testing and construct

set.seed(1234567)
samp_perc <-sample(1:nrow(data_samp),nrow(data_samp)*0.8,replace = F) 
train <-data_samp[samp_perc,]
test <- data_samp[-samp_perc,]
train_mod <- sparse.model.matrix(over ~., data = train)
train_label = train[, "over"]
test_mod <- sparse.model.matrix(over ~., data = test)
test_label = test[, "over"]

dtrain = lgb.Dataset(data =  data.matrix(train_mod), label = train_label, free_raw_data = FALSE)
dtest = lgb.Dataset(data =  data.matrix(train_mod), label = test_label, free_raw_data = FALSE)

lgb.Dataset.construct(dtrain)

dtest1 = lgb.Dataset.create.valid(dtrain, data =  as.matrix(train_mod), label = train_label)
dtest2 = lgb.Dataset.create.valid(dtrain, data =  data.matrix(test_mod), label = test_label)
valids <- list(test1 = dtest1, test2 = dtest2) ## creating validation set

Because our data is highly unbalanced, we use the scale functionality in our dataset

scale_pos_weight = length(train$over)/sum(train$over == 1) - 1

Setting the parameters for lightgbm and building the lightgbm model

params = list(learning_rate = 0.01,
              objective = "binary",
              metric = 'binary_logloss')


m5<- lgb.train(
  data = dtrain,
  valids = valids,
  nrounds = 10000,
  num_leaves = 30,
  params = params,
  scale_pos_weight = scale_pos_weight,
  eval_freq = 100,
  early_stopping_rounds = 50
)

## [1]: test1's binary_logloss:0.125326 test2's binary_logloss:0.135431

Select the top 20 variables from the lightgbm model

## New variable Selection
var_imp <- lgb.importance(m5)
lgb.plot.importance(var_imp, top_n = 20)

new_var <- rep(NA, length(var_imp$Feature))

for (i in 1: length(var_imp$Feature)){
  new_var[i] <- sub("\\.", ":", var_imp$Feature[i]) ## we use regular expression here because lightgbm change the interaction term : to .
}

concrete_upp <- cbind(concrete_up[new_var[1:20]],res)
names(concrete_upp)

##  [1] "AEA_Dose:TYPE_AWRA_Dose"          
##  [2] "Cement_Weight"                    
##  [3] "Fine_Agg_Weight:TYPE_AWRA_Dose"   
##  [4] "Coarse_Agg_Weight:Fine_Agg_Weight"
##  [5] "Con_Temp:Ambient.Temp"            
##  [6] "Fine_Agg_Weight:Fly_Ash_Weight"   
##  [7] "TYPE_AWRA_Dose:Con_Temp"          
##  [8] "AEA_Dose:Con_Temp"                
##  [9] "Coarse_Agg_Weight:Cement_Weight"  
## [10] "Coarse_Agg_Weight:TYPE_AWRA_Dose" 
## [11] "Fly_Ash_Weight:Con_Temp"          
## [12] "AEA_Dose:Water"                   
## [13] "Load_Size:AEA_Dose"               
## [14] "Fly_Ash_Weight:AEA_Dose"          
## [15] "Cement_Weight:Con_Temp"           
## [16] "Fine_Agg_Weight:Con_Temp"         
## [17] "Coarse_Agg_Weight:AEA_Dose"       
## [18] "Coarse_Agg_Weight:Con_Temp"       
## [19] "AEA_Dose:Ambient.Temp"            
## [20] "Coarse_Agg_Weight"                
## [21] "over"

PARIS variable selection update

Huizi Yu

11/13/2019