Concrete is one of the major materials in construction, so accurate estimates of concrete strength are crucial for developing construction safety guidelines. Concrete performance varies greatly because a wide variety of ingredients interact in complex ways, which makes the strength of the final product difficult to predict. In this post, I build a concrete strength prediction model using random forests.
library(dplyr)
library(caret)
library(ggplot2)
library(readr)
library(tidyverse)
library(tidymodels)
library(ranger)
library(plotly)
library(GGally)
library(randomForest)

concrete <- read_csv("data/data-train.csv")
## Parsed with column specification:
## cols(
## cement = col_double(),
## slag = col_double(),
## flyash = col_double(),
## water = col_double(),
## super_plast = col_double(),
## coarse_agg = col_double(),
## fine_agg = col_double(),
## age = col_double(),
## strength = col_double()
## )
concrete

Data descriptions:

- cement: the amount of cement (kg) in a m3 mixture.
- slag: the amount of blast furnace slag (kg) in a m3 mixture.
- flyash: the amount of fly ash (kg) in a m3 mixture.
- water: the amount of water (kg) in a m3 mixture.
- super_plast: the amount of superplasticizer (kg) in a m3 mixture.
- coarse_agg: the amount of coarse aggregate (kg) in a m3 mixture.
- fine_agg: the amount of fine aggregate (kg) in a m3 mixture.
- age: the number of resting days before the compressive strength measurement.
- strength: concrete compressive strength, measured in MPa.
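A quick numeric overview helps sanity-check the units and ranges described above; a minimal sketch (output omitted here):

summary(concrete)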
str(concrete)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 825 obs. of 9 variables:
## $ cement : num 540 540 332 332 199 ...
## $ slag : num 0 0 142 142 132 ...
## $ flyash : num 0 0 0 0 0 0 0 0 0 0 ...
## $ water : num 162 162 228 228 192 228 228 228 192 192 ...
## $ super_plast: num 2.5 2.5 0 0 0 0 0 0 0 0 ...
## $ coarse_agg : num 1040 1055 932 932 978 ...
## $ fine_agg : num 676 676 594 594 826 ...
## $ age : num 28 28 270 365 360 365 28 28 90 28 ...
## $ strength : num 80 61.9 40.3 41 44.3 ...
## - attr(*, "spec")=
## .. cols(
## .. cement = col_double(),
## .. slag = col_double(),
## .. flyash = col_double(),
## .. water = col_double(),
## .. super_plast = col_double(),
## .. coarse_agg = col_double(),
## .. fine_agg = col_double(),
## .. age = col_double(),
## .. strength = col_double()
## .. )
colSums(is.na(concrete))
## cement slag flyash water super_plast coarse_agg
## 0 0 0 0 0 0
## fine_agg age strength
## 0 0 0
ggcorr(concrete, label = T)

set.seed(100)
ct_intrain <- sample(nrow(concrete), nrow(concrete)*0.75)
ct_train <- concrete[ct_intrain, ]
ct_test <- concrete[-ct_intrain, ]

# ctrl <- trainControl(method="repeatedcv", number=100, repeats=7)
# concrete_RF100 <- train(strength ~ ., data=ct_train, method="rf", trControl = ctrl, importance=T)
# concrete_RF100

I saved the result beforehand, so the modeling doesn't take a long time to rerun:

# saveRDS(concrete_RF100, "ct_RF100.RDS")
ct_RF100 <- readRDS("ct_RF100.RDS")
ct_RF100
## Random Forest
##
## 618 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (100 fold, repeated 7 times)
## Summary of sample sizes: 612, 611, 611, 612, 612, 610, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 5.706603 0.9072602 4.552258
## 5 5.020551 0.9176976 3.962189
## 8 5.054900 0.9141815 3.938514
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 5.
From the summary above, we can see the optimal model uses mtry = 5, with cross-validated R-squared 0.9177 and MAE 3.96 MPa (MAE is in strength units, not a percentage).
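Here caret searched only its default mtry grid (2, 5, 8). To search a custom grid instead, a tuneGrid can be passed to train(); a minimal sketch under the same trainControl as above (the names rf_grid and concrete_RF_grid are illustrative, and it is commented out since it takes long to run):

# rf_grid <- expand.grid(mtry = 2:8)
# concrete_RF_grid <- train(strength ~ ., data = ct_train, method = "rf",
#                           trControl = ctrl, tuneGrid = rf_grid, importance = TRUE)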
ct_RF100$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry, importance = ..1)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 29.57649
## % Var explained: 89.21
pred <- predict(ct_RF100, ct_test)
head(pred)
## 1 2 3 4 5 6
## 42.67593 43.50573 39.13263 42.18321 41.35476 42.66009
caret::MAE(pred, ct_test$strength)
## [1] 3.67997
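Beyond MAE, caret also ships RMSE() and R2() helpers, so the same holdout predictions can be scored on the other metrics reported during resampling (a sketch; output not shown):

caret::RMSE(pred, ct_test$strength)
caret::R2(pred, ct_test$strength)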
plot(ct_RF100$finalModel)

varImp(ct_RF100)
## rf variable importance
##
## Overall
## age 100.000
## cement 47.971
## water 20.595
## slag 16.567
## fine_agg 12.979
## super_plast 8.785
## coarse_agg 6.439
## flyash 0.000
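The importance ranking is easier to read as a chart; the object returned by varImp() has a plot method in caret (a minimal sketch):

plot(varImp(ct_RF100))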
ct_sub <- read_csv("data/data-submission.csv")
## Parsed with column specification:
## cols(
## cement = col_double(),
## slag = col_double(),
## flyash = col_double(),
## water = col_double(),
## super_plast = col_double(),
## coarse_agg = col_double(),
## fine_agg = col_double(),
## age = col_double(),
## strength = col_logical()
## )
ct_strength <- predict(ct_RF100, ct_sub)
concrete_sub <- ct_sub %>%
  cbind(ct_strength) %>%
  select(-strength)
names(concrete_sub)[9] <- "strength"
concrete_sub

# write.csv(concrete_sub, "concrete_sub.csv")

# ctrl <- trainControl(method="repeatedcv", number=200, repeats=5)
# concrete_RF <- train(strength ~ ., data=ct_train, method="rf", trControl = ctrl, importance=T)
# concrete_RF

As before, I saved the result so it won't take a long time to rerun:

# saveRDS(concrete_RF, "ct_RF200.RDS")
ct_RF200 <- readRDS("ct_RF200.RDS")
ct_RF200
## Random Forest
##
## 618 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (200 fold, repeated 5 times)
## Summary of sample sizes: 615, 614, 615, 615, 614, 616, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 5.373947 0.9286058 4.551409
## 5 4.737652 0.9338431 3.960690
## 8 4.763043 0.9298682 3.955941
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 5.
From the summary above, the optimal model again uses mtry = 5, now with cross-validated R-squared 0.9338 and MAE 3.96 MPa.
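Since the two runs use different fold counts, their resamples can't be pooled directly with caret's resamples(), but the cross-validated tuning tables stored on each train object can be placed side by side (a sketch; output not shown):

ct_RF100$results
ct_RF200$results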
ct_RF200$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry, importance = ..1)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 29.57542
## % Var explained: 89.21
pred200 <- predict(ct_RF200, ct_test)
head(pred200)
## 1 2 3 4 5 6
## 43.07430 43.91824 39.25442 42.31380 41.52721 42.67081
caret::MAE(pred200, ct_test$strength)
plot(ct_RF200$finalModel)

varImp(ct_RF200)
## rf variable importance
##
## Overall
## age 100.000
## cement 48.955
## slag 16.513
## water 15.614
## fine_agg 12.727
## coarse_agg 7.560
## super_plast 6.286
## flyash 0.000
ct_sub <- read_csv("data/data-submission.csv")
## Parsed with column specification:
## cols(
## cement = col_double(),
## slag = col_double(),
## flyash = col_double(),
## water = col_double(),
## super_plast = col_double(),
## coarse_agg = col_double(),
## fine_agg = col_double(),
## age = col_double(),
## strength = col_logical()
## )
ct_strength2 <- predict(ct_RF200, ct_sub)
conc_sub <- ct_sub %>%
  cbind(ct_strength2) %>%
  select(-strength)
names(conc_sub)[9] <- "strength"
conc_sub

set.seed(100)
# create initial train/test split
splitted2 <- initial_split(concrete, prop = 0.85)

The observations reserved for the training set are accessed with `training()` and the test set with `testing()`:
splitted2 %>%
  training() %>%
  glimpse()
## Observations: 702
## Variables: 9
## $ cement <dbl> 540.0, 540.0, 332.5, 332.5, 198.6, 380.0, 380.0, 475…
## $ slag <dbl> 0.0, 0.0, 142.5, 142.5, 132.4, 95.0, 95.0, 0.0, 132.…
## $ flyash <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ water <dbl> 162, 162, 228, 228, 192, 228, 228, 228, 192, 228, 22…
## $ super_plast <dbl> 2.5, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.…
## $ coarse_agg <dbl> 1040.0, 1055.0, 932.0, 932.0, 978.4, 932.0, 932.0, 9…
## $ fine_agg <dbl> 676.0, 676.0, 594.0, 594.0, 825.5, 594.0, 594.0, 594…
## $ age <dbl> 28, 28, 270, 365, 360, 365, 28, 28, 28, 28, 90, 90, …
## $ strength <dbl> 79.99, 61.89, 40.27, 41.05, 44.30, 43.70, 36.45, 39.…
splitted2 %>%
  testing() %>%
  glimpse()
## Observations: 123
## Variables: 9
## $ cement <dbl> 198.6, 380.0, 475.0, 237.5, 237.5, 304.0, 425.0, 323…
## $ slag <dbl> 132.4, 95.0, 0.0, 237.5, 237.5, 76.0, 106.3, 282.8, …
## $ flyash <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.…
## $ water <dbl> 192.0, 228.0, 228.0, 228.0, 228.0, 228.0, 153.5, 183…
## $ super_plast <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16.5, 10.3, 22.0, 10.1…
## $ coarse_agg <dbl> 978.4, 932.0, 932.0, 932.0, 932.0, 932.0, 852.1, 942…
## $ fine_agg <dbl> 825.5, 594.0, 594.0, 594.0, 594.0, 670.0, 887.1, 659…
## $ age <dbl> 90, 270, 270, 270, 28, 270, 3, 3, 7, 28, 28, 91, 91,…
## $ strength <dbl> 38.07, 41.15, 42.13, 38.41, 30.08, 54.38, 33.40, 28.…
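As an aside, initial_split() can also stratify on the outcome so the training and test sets see similar strength distributions; a minimal sketch using rsample's strata argument (the splitted_strat name is just illustrative):

splitted_strat <- initial_split(concrete, prop = 0.85, strata = strength)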
concrete_recipe2 <- training(splitted2) %>%
  recipe(strength ~ .) %>%
  step_corr(all_predictors()) %>%
  step_center(all_predictors(), -all_outcomes()) %>%
  step_scale(all_predictors(), -all_outcomes()) %>%
  prep()
concrete_recipe2
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 8
##
## Training data contained 702 data points and no missing data.
##
## Operations:
##
## Correlation filter removed no terms [trained]
## Centering for cement, slag, flyash, water, ... [trained]
## Scaling for cement, slag, flyash, water, ... [trained]
concrete_testing2 <- concrete_recipe2 %>%
  bake(testing(splitted2))

concrete_training2 <- juice(concrete_recipe2)

concrete_ranger2 <- rand_forest(trees = 650,
                                mtry = 4,
                                mode = "regression") %>%
  set_engine("ranger") %>%
  fit(strength ~ ., data = concrete_training2)
concrete_ranger2
## parsnip model object
##
## Ranger result
##
## Call:
## ranger::ranger(formula = formula, data = data, mtry = ~4, num.trees = ~650, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Regression
## Number of trees: 650
## Sample size: 702
## Number of independent variables: 8
## Mtry: 4
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 28.12362
## R squared (OOB): 0.8995779
concrete_rf2 <- rand_forest(trees = 650,
                            mtry = 4,
                            mode = "regression") %>%
  set_engine("randomForest") %>%
  fit(strength ~ ., data = concrete_training2)
concrete_rf2
## parsnip model object
##
##
## Call:
## randomForest(x = as.data.frame(x), y = y, ntree = ~650, mtry = ~4)
## Type of random forest: regression
## Number of trees: 650
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 28.2172
## % Var explained: 89.91
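parsnip stores the engine-level fit in the $fit slot, so the underlying randomForest object, including its default node-purity variable importance, remains reachable (a sketch; output not shown):

concrete_rf2$fit$importance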
pred_ranger2 <- predict(concrete_ranger2, concrete_testing2)
head(pred_ranger2)

pred_rf3 <- predict(concrete_rf2, concrete_testing2)
head(pred_rf3)

concrete_ranger2 %>%
  predict(concrete_testing2) %>%
  bind_cols(concrete_testing2)

concrete_rf2 %>%
  predict(concrete_testing2) %>%
  bind_cols(concrete_testing2)

concrete_ranger2 %>%
  predict(concrete_testing2) %>%
  bind_cols(concrete_testing2) %>%
  metrics(truth = strength, estimate = .pred)

concrete_rf2 %>%
  predict(concrete_testing2) %>%
  bind_cols(concrete_testing2) %>%
  metrics(truth = strength, estimate = .pred)

ct_sub <- read_csv("data/data-submission.csv")
## Parsed with column specification:
## cols(
## cement = col_double(),
## slag = col_double(),
## flyash = col_double(),
## water = col_double(),
## super_plast = col_double(),
## coarse_agg = col_double(),
## fine_agg = col_double(),
## age = col_double(),
## strength = col_logical()
## )
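As an aside, the metrics() calls above report RMSE, R-squared, and MAE by default for numeric outcomes; a custom set can be assembled with yardstick's metric_set() (a sketch, where the reg_metrics name is just illustrative):

reg_metrics <- metric_set(rmse, rsq, mae)
concrete_rf2 %>%
  predict(concrete_testing2) %>%
  bind_cols(concrete_testing2) %>%
  reg_metrics(truth = strength, estimate = .pred)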
pred_rf4 <- predict(concrete_rf2, bake(concrete_recipe2, ct_sub))
concrete_sub2 <- ct_sub %>%
  cbind(pred_rf4) %>%
  select(-strength)
names(concrete_sub2)[9] <- "strength"
concrete_sub2

write.csv(concrete_sub2, "regita_concrete-rm-predict.csv")

The model that produces the best predictions among the approaches above is the tidymodels workflow with `set_engine("randomForest")`. After submitting its predictions we get an MAE of 3.71 with an R-squared of 91%, while the predictions from the other models score slightly below it.