This is a model built for predicting concrete compressive strength - an important material in civil engineering.
The concrete compressive strength is a highly nonlinear function of age and ingredients.
These ingredients includes:
NB: The actual concrete compressive strength (MPa) for a given mixture under a specific age (days) was determined from laboratory. The data can be downloaded here: UCI
Load the requied packages for building the model:
library(tidyverse)
library(readxl)
library(caret)
library(GGally)
library(rattle)
library(modelr)
library(partykit)
library(e1071)
Get the data into R:
concrete <- read_excel("Concrete_Data.xls")
Rename the variables:
names(concrete) <- c("cement", "blast_furnace",
"fly_ash", "water",
"superplasticizer",
"coarse_aggregate",
"fine_aggregate",
"age",
"concrete_strength")
Compute the skewness across the columns - If the predictor distribution is roughly symmetric, the skewness values will be close to zero
skewValues <- map_dbl(concrete, skewness)
skewValues
## cement blast_furnace fly_ash water
## 0.50803436 0.79840662 0.53588075 0.07410764
## superplasticizer coarse_aggregate fine_aggregate age
## 0.90546945 -0.04008937 -0.25224294 3.25966169
## concrete_strength
## 0.41570873
The skewed variables have some values containing zero which makes it impossible to apply the BoxCox transformation
ggplot(concrete, aes(concrete_strength)) +
geom_histogram()
ggplot(concrete, aes(x = concrete_strength, y = cement)) +
geom_point()
ggplot(concrete, aes(x = concrete_strength, y = coarse_aggregate)) +
geom_point()
Seperate out the outcome from the predictors:
concrete_strength <- concrete$concrete_strength
concrete <- select(concrete, -concrete_strength)
Partition the data into a training and a test set
idx <- createDataPartition(concrete_strength, p = 2/3, list = FALSE)
train_set <- concrete[idx, ]
train_outcome <- concrete_strength[idx]
test_set <- concrete[-idx, ]
test_outcome <- concrete_strength[-idx]
Fit a single linear model and conduct 10-fold cross-validation to estimate the error:
ctrl <- trainControl(method = "cv")
set.seed(3)
lm_tune <- train(x = train_set, y = train_outcome,
method = "lm",
trControl = ctrl)
lm_tune
## Linear Regression
##
## 688 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 620, 618, 620, 619, 619, 620, ...
## Resampling results:
##
## RMSE Rsquared
## 10.38347 0.6162678
##
##
predict the test set data and save the testset result in a data frame:
test_result <- data.frame(observations = test_outcome, lm_predictions = predict(lm_tune, test_set))
Tune the regression tree using “rpart”
set.seed(3)
tree_tune <- train(x = train_set, y = train_outcome,
method = "rpart",
trControl = ctrl,
tuneLength = 5)
plot(tree_tune)
test_result$tree_predictions <- predict(tree_tune, test_set)
Plot the regression tree:
fancyRpartPlot(tree_tune$finalModel)
Fit the multi-addaptive regression model:
set.seed(3)
mars_tune <- train(x = train_set, y = train_outcome,
method = "earth",
trControl = ctrl,
tuneLength = 20)
plot(mars_tune)
test_result$mars_predictions <- predict(mars_tune, test_set)
Compute and plot the variable importance
marsImp <- varImp(mars_tune)
plot(marsImp)
Support vector machines:
set.seed(3)
svm_tune <- train(x = train_set, y = train_outcome,
method = "svmRadial",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
test_result$svm_predictions <- predict(svm_tune, test_set)
head(test_result)
## observations lm_predictions tree_predictions y svm_predictions
## 1 44.29608 57.69736 39.89158 40.95794 47.94661
## 2 47.02985 26.91493 39.89158 46.64152 48.96341
## 3 36.44777 30.63779 56.95580 34.75287 38.69849
## 4 45.85429 20.40793 39.89158 36.29651 41.28479
## 5 39.28979 32.52414 56.95580 35.01859 40.70545
## 6 43.01296 56.97924 56.95580 48.38309 44.27967
postResample(pred = test_result$lm_predictions, obs = test_result$observations)
## RMSE Rsquared
## 10.6799255 0.6116895
postResample(pred = test_result$tree_predictions, obs = test_result$observations)
## RMSE Rsquared
## 11.3403607 0.5553381
postResample(pred = test_result$mars_predictions, obs = test_result$observations)
## RMSE Rsquared
## 6.8015400 0.8402243
postResample(pred = test_result$svm_predictions, obs = test_result$observations)
## RMSE Rsquared
## 6.1048931 0.8714146