This is a model built for predicting concrete compressive strength - an important material in civil engineering.



The concrete compressive strength is a highly nonlinear function of age and ingredients.


These ingredients includes:

NB: The actual concrete compressive strength (MPa) for a given mixture under a specific age (days) was determined from laboratory. The data can be downloaded here: UCI


Load the requied packages for building the model:

library(tidyverse)
library(readxl)
library(caret)
library(GGally)
library(rattle)
library(modelr)
library(partykit)
library(e1071)

Get the data into R:

concrete <- read_excel("Concrete_Data.xls")

Rename the variables:

names(concrete) <- c("cement", "blast_furnace",
                 "fly_ash", "water",
                 "superplasticizer",
                 "coarse_aggregate",
                 "fine_aggregate",
                 "age",
                 "concrete_strength")

Compute the skewness across the columns - If the predictor distribution is roughly symmetric, the skewness values will be close to zero

skewValues <- map_dbl(concrete, skewness)
skewValues
##            cement     blast_furnace           fly_ash             water 
##        0.50803436        0.79840662        0.53588075        0.07410764 
##  superplasticizer  coarse_aggregate    fine_aggregate               age 
##        0.90546945       -0.04008937       -0.25224294        3.25966169 
## concrete_strength 
##        0.41570873

The skewed variables have some values containing zero which makes it impossible to apply the BoxCox transformation


Get some exploratory graphs:

ggplot(concrete, aes(concrete_strength)) +
        geom_histogram()

ggplot(concrete, aes(x = concrete_strength, y = cement)) +
        geom_point()

ggplot(concrete, aes(x = concrete_strength, y = coarse_aggregate)) +
        geom_point()


Seperate out the outcome from the predictors:

concrete_strength <- concrete$concrete_strength
concrete <- select(concrete, -concrete_strength)

Partition the data into a training and a test set

idx <- createDataPartition(concrete_strength, p = 2/3, list = FALSE)

train_set <- concrete[idx, ]
train_outcome <- concrete_strength[idx]

test_set <- concrete[-idx, ]
test_outcome <- concrete_strength[-idx]

Fit a single linear model and conduct 10-fold cross-validation to estimate the error:

ctrl <- trainControl(method = "cv")

set.seed(3)
lm_tune <- train(x = train_set, y = train_outcome,
                 method = "lm",
                 trControl = ctrl)

lm_tune
## Linear Regression 
## 
## 688 samples
##   8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 620, 618, 620, 619, 619, 620, ... 
## Resampling results:
## 
##   RMSE      Rsquared 
##   10.38347  0.6162678
## 
## 

predict the test set data and save the testset result in a data frame:

test_result <- data.frame(observations = test_outcome, lm_predictions = predict(lm_tune, test_set))

Tune the regression tree using “rpart”

set.seed(3)
tree_tune <- train(x = train_set, y = train_outcome,
                   method = "rpart",
                   trControl = ctrl,
                   tuneLength = 5)
plot(tree_tune)

test_result$tree_predictions <- predict(tree_tune, test_set)

Plot the regression tree:

fancyRpartPlot(tree_tune$finalModel)


Fit the multi-addaptive regression model:

set.seed(3)
mars_tune <- train(x = train_set, y = train_outcome,
                   method = "earth",
                   trControl = ctrl,
                   tuneLength = 20)

plot(mars_tune)

test_result$mars_predictions <- predict(mars_tune, test_set)

Compute and plot the variable importance

marsImp <- varImp(mars_tune)
plot(marsImp)


Support vector machines:

set.seed(3)
svm_tune <- train(x = train_set, y = train_outcome,
                   method = "svmRadial",
                   trControl = ctrl,
                  preProcess = c("center", "scale"),
                  tuneLength = 10)



test_result$svm_predictions <- predict(svm_tune, test_set)

head(test_result)
##   observations lm_predictions tree_predictions        y svm_predictions
## 1     44.29608       57.69736         39.89158 40.95794        47.94661
## 2     47.02985       26.91493         39.89158 46.64152        48.96341
## 3     36.44777       30.63779         56.95580 34.75287        38.69849
## 4     45.85429       20.40793         39.89158 36.29651        41.28479
## 5     39.28979       32.52414         56.95580 35.01859        40.70545
## 6     43.01296       56.97924         56.95580 48.38309        44.27967

Get test set performance values for each of the models

postResample(pred = test_result$lm_predictions, obs = test_result$observations)
##       RMSE   Rsquared 
## 10.6799255  0.6116895

postResample(pred = test_result$tree_predictions, obs = test_result$observations)
##       RMSE   Rsquared 
## 11.3403607  0.5553381

postResample(pred = test_result$mars_predictions, obs = test_result$observations)
##      RMSE  Rsquared 
## 6.8015400 0.8402243

postResample(pred = test_result$svm_predictions, obs = test_result$observations)
##      RMSE  Rsquared 
## 6.1048931 0.8714146