The dataset includes the following columns:
Cement (component 1) – quantitative – kg in a m3 mixture
Blast Furnace Slag (component 2) – quantitative – kg in a m3 mixture
Fly Ash (component 3) – quantitative – kg in a m3 mixture
Water (component 4) – quantitative – kg in a m3 mixture
Superplasticizer (component 5) – quantitative – kg in a m3 mixture
Coarse Aggregate (component 6) – quantitative – kg in a m3 mixture
Fine Aggregate (component 7) – quantitative – kg in a m3 mixture
Age – quantitative – Day (1~365)
Concrete compressive strength – quantitative – MPa
MSE and R2 as the regression metric for dataset.
library(Hmisc)
library(caret)
library(tidyverse)
library(caret)
library(pROC)
library(corrplot)
library(PerformanceAnalytics)
library(ggplot2)
library(wesanderson)
library(tree)
library(randomForest)
library(gbm)
library(readr)
Concrete_Data_Yeh <- read_csv("Concrete_Data_Yeh.csv")
Data preparation (also referred to as data preprocessing) is the process of transforming raw data so that data scientists and analysts can run it through machine learning algorithms to uncover insights or make predictions. It is an important prerequisite for machine learning algorithms because most machine learning algorithms require data to be formatted in a very specific way, so datasets generally require some amount of preparation before they can yield useful insights. The data preparation process can be complicated by issues such as:
Missing or incomplete records
Outliers or anomalies
Improperly formatted / structured data
Inconsistent values and non-standardized categorical variables
Limited or sparse features / attributes
glimpse(Concrete_Data_Yeh)
head(Concrete_Data_Yeh)
summary(Concrete_Data_Yeh)
#number of columns and rows
nrow(Concrete_Data_Yeh)
ncol(Concrete_Data_Yeh)
which(is.na(Concrete_Data_Yeh))
## integer(0)
M<-cor(Concrete_Data_Yeh)
head(round(M,2))
## cement slag flyash water superplasticizer coarseaggregate
## cement 1.00 -0.28 -0.40 -0.08 0.09 -0.11
## slag -0.28 1.00 -0.32 0.11 0.04 -0.28
## flyash -0.40 -0.32 1.00 -0.26 0.38 -0.01
## water -0.08 0.11 -0.26 1.00 -0.66 -0.18
## superplasticizer 0.09 0.04 0.38 -0.66 1.00 -0.27
## coarseaggregate -0.11 -0.28 -0.01 -0.18 -0.27 1.00
## fineaggregate age csMPa
## cement -0.22 0.08 0.50
## slag -0.28 -0.04 0.13
## flyash 0.08 -0.15 -0.11
## water -0.45 0.28 -0.29
## superplasticizer 0.22 -0.19 0.37
## coarseaggregate -0.18 0.00 -0.16
corrplot.mixed(M, upper = "square",
lower = "number",
tl.col = "black",
tl.pos = "lt",
tl.cex = 0.7,
number.cex = 0.7,
lower.col = "black")
ggplot(Concrete_Data_Yeh,aes(x = cement, y = csMPa)) + geom_point() +geom_smooth(method = "lm")+
ggtitle("cement vs csMPa")
## `geom_smooth()` using formula = 'y ~ x'
# Visualization of the variables
par(mfrow=c(2,2))
hist(Concrete_Data_Yeh$cement, main = 'Cement')
hist(Concrete_Data_Yeh$slag, main = 'slag')
hist(Concrete_Data_Yeh$flyash, main = 'Flyash')
hist(Concrete_Data_Yeh$water, main = 'Water')
par(mfrow=c(2,2))
hist(Concrete_Data_Yeh$superplasticizer, main = 'superplasticizer')
hist(Concrete_Data_Yeh$coarseaggregate, main = 'coarseaggregate')
hist(Concrete_Data_Yeh$fineaggregate, main = 'fineaggregate')
hist(Concrete_Data_Yeh$age, main = 'Age')
#dev.off()
hist(Concrete_Data_Yeh$csMPa, main = 'csMPa')
par(mfrow=c(2,3))
par(cex.main=2)
# Box plot for slag
out <- boxplot.stats(Concrete_Data_Yeh$slag)$out
boxplot(Concrete_Data_Yeh$slag,
ylab = "Frequency", main = "Slag"
)
mtext(paste("Outliers: ", paste(out, collapse = ", ")))
length(out)
## [1] 2
#Box plot for Water
out <- boxplot.stats(Concrete_Data_Yeh$water)$out
boxplot(Concrete_Data_Yeh$water,
ylab = "Frequency", main = "Water"
)
#mtext(paste(paste(out, collapse = ", ")))
length(out)
## [1] 9
# Box plot for superplasticizer
out <- boxplot.stats(Concrete_Data_Yeh$superplasticizer)$out
boxplot(Concrete_Data_Yeh$superplasticizer,
ylab = "Frequency", main = "Superplasticizer"
)
length(out)
## [1] 10
# Box plot for csMPa
out <- boxplot.stats(Concrete_Data_Yeh$csMPa)$out
boxplot(Concrete_Data_Yeh$csMPa,
ylab = "Frequency", main = "csMPa"
)
length(out)
## [1] 4
# Box plot for coarseaggregate
out <- boxplot.stats(Concrete_Data_Yeh$coarseaggregate)$out
boxplot(Concrete_Data_Yeh$coarseaggregate,
ylab = "Frequency", main = "Coarse-aggregate"
)
length(out)
## [1] 0
# Box plot for Age
out <- boxplot.stats(Concrete_Data_Yeh$age)$out
boxplot(Concrete_Data_Yeh$age,
ylab = "Frequency", main = 'Age'
)
length(out)
## [1] 59
lower_bound <- quantile(Concrete_Data_Yeh$age, 0.01)
lower_bound
## 1%
## 3
upper_bound <- quantile(Concrete_Data_Yeh$age, 0.99)
upper_bound
## 99%
## 365
outlier_ind <- which(Concrete_Data_Yeh$age < lower_bound | Concrete_Data_Yeh$age > upper_bound)
outlier_ind
## [1] 747 764
Concrete_Data_Yeh[outlier_ind, ]
## # A tibble: 2 × 9
## cement slag flyash water superplasticizer coarseaggregate finea…¹ age csMPa
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 500 0 0 200 0 1125 613 1 12.6
## 2 385 0 0 186 0 966 763 1 6.27
## # … with abbreviated variable name ¹​fineaggregate
set.seed(1234)
Concrete_which_train <- createDataPartition(Concrete_Data_Yeh$csMPa,
p = 0.7,
list = FALSE)
# 70% of the observations will be Concrete_train
Concrete_train <- Concrete_Data_Yeh[Concrete_which_train,]
Concrete_test <- Concrete_Data_Yeh[-Concrete_which_train,]
# checking the distribution of the target variable in both samples
summary(Concrete_train)
summary(Concrete_test)
tree.model <- tree(csMPa~., data = Concrete_train)
summary(tree.model)
##
## Regression tree:
## tree(formula = csMPa ~ ., data = Concrete_train)
## Variables actually used in tree construction:
## [1] "age" "superplasticizer" "cement" "flyash"
## [5] "slag" "water"
## Number of terminal nodes: 15
## Residual mean deviance: 65.64 = 46400 / 707
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -22.9500 -4.7670 -0.3692 0.0000 5.4130 32.5900
# Ploting the Decision tree model
plot(tree.model, main = 'Decision tree model')
# we go ahead and use pretty = 0 since the dataset has categorical varibales
text(tree.model, pretty = 0)
# Getting predictions based on the testing data
Tree.pred <- predict(tree.model, newdata = Concrete_test)
#Computing the error term
set.seed(12345)
Tree.MSE <- mean((Tree.pred - Concrete_test$csMPa)^2)
Tree.MSE
## [1] 85.96495
# R-square value
R2_tree <- R2(Tree.pred, Concrete_test$csMPa)
R2_tree
## [1] 0.7126051
set.seed(12345)
cv_tree <- cv.tree(tree.model)
names(cv_tree)
## [1] "size" "dev" "k" "method"
# Geting the minimum size of cv of 15
plot(cv_tree$size, cv_tree$dev, type = "b", col = 'red', main = 'Cross validation on Decision trees')
# pruning our model
pruned_md <- prune.tree(tree.model, best = 15)
plot(pruned_md)
text(pruned_md, pretty = 0)
# #geting error for the prunned tree
prun.tree.pred <- predict(pruned_md, Concrete_test)
prune.Tree.MSE <- mean((prun.tree.pred - Concrete_test$csMPa)^2)
prune.Tree.MSE
## [1] 85.96495
## The pruned MSE is 87.0994 which is a bit higher
# R-square value
R2_tree1 <- R2(prun.tree.pred, Concrete_test$csMPa)
R2_tree1
## [1] 0.7126051
# Estimating the model
set.seed(12345)
Concrete.rf <- randomForest(csMPa~.,
data = Concrete_test)
#Getting predictions based on the testing data
rf.pred <- predict(Concrete.rf, newdata = Concrete_test)
#Computing the error term
set.seed(12345)
rf.MSE <- mean((rf.pred - Concrete_test$csMPa)^2)
rf.MSE
## [1] 13.36312
# R-square value
R2_rf <- R2(rf.pred, Concrete_test$csMPa)
R2_rf
## [1] 0.9702419
mtry <- c(1,2,3,4,5,6,7)
counter <- 1
r.mtry <- rep(0,7)
#using the loop
for (d in mtry) {
r.model <- randomForest(csMPa~., data = Concrete_test, trControl = trainControl(method = "cv", number = 7))
predict.r <- predict(r.model, newdata = Concrete_test, mtry = d)
r.mtry[counter] <- mean((predict.r - Concrete_test$csMPa)^2)
counter < counter + 1
}
plot(mtry, r.mtry,type = "b",main = "Finding Optimal Mtry",
xlab = "Number of mtry's",col="blue")
# Estimating the model
set.seed(12345)
Concrete.rf1 <- randomForest(csMPa~.,
data = Concrete_test, mtry = 7)
#Getting predictions based on the testing data
rf.pred1 <- predict(Concrete.rf1, newdata = Concrete_test)
#Computing the error term
set.seed(12345)
rf.MSE1 <- mean((rf.pred1 - Concrete_test$csMPa)^2)
rf.MSE1
## [1] 8.431719
# R-square value
R2_rf1 <- R2(rf.pred1, Concrete_test$csMPa)
R2_rf1
## [1] 0.9753512
set.seed(12345)
Concrete.boost <- gbm(csMPa~., data = Concrete_train, distribution = "gaussian", n.trees = 500)
summary(Concrete.boost)
## var rel.inf
## age age 33.5292320
## cement cement 30.9723990
## water water 16.5070291
## slag slag 9.5332755
## fineaggregate fineaggregate 3.2537197
## superplasticizer superplasticizer 2.8826155
## coarseaggregate coarseaggregate 2.3860049
## flyash flyash 0.9357244
# #getting prediction from Concrete.boost model
predict.boost <- predict(Concrete.boost, newdata = Concrete_test, n.trees = 500)
#calculating the MSE for the model
set.seed(12345)
boost.MSE <- mean((predict.boost - Concrete_test$csMPa)^2)
boost.MSE
## [1] 29.74035
# R-square value
R2_boost <- R2(predict.boost, Concrete_test$csMPa)
R2_boost
## [1] 0.9013621
# Changing the value for the ntrees to get the tress that minimize our MSE
b.tree <- c(100, 200, 300, 400, 500, 600)
counter <- 1
b.tree.boost <- rep(0,6)
#using the loop
for (d in b.tree) {
Concrete.boost <- gbm(csMPa~., data = Concrete_train, distribution = "gaussian", n.trees = d)
predict.boost <- predict(Concrete.boost, newdata = Concrete_test, n.trees = d)
b.tree.boost[counter] <- mean((predict.boost - Concrete_test$csMPa)^2)
counter < counter + 1
}
plot(b.tree, b.tree.boost,type = "b",main = "Finding Optimal n.tree",
xlab = "Number of trees",col="blue")
set.seed(12345)
Concrete.boost1 <- gbm(csMPa~., data = Concrete_train, distribution = "gaussian", n.trees = 600)
Concrete.boost1
## gbm(formula = csMPa ~ ., distribution = "gaussian", data = Concrete_train,
## n.trees = 600)
## A gradient boosted model with gaussian loss function.
## 600 iterations were performed.
## There were 8 predictors of which 8 had non-zero influence.
#calculating the model with ntree adjustments
predict.boost1 <- predict(Concrete.boost, newdata = Concrete_test, n.trees = 600)
#Calculating MSE for the adjusted model
set.seed(12345)
boost.MSE1 <- mean((predict.boost1 - Concrete_test$csMPa)^2)
boost.MSE1
## [1] 30.0953
# R-square value
R2_boost1 <- R2(predict.boost1, Concrete_test$csMPa)
R2_boost1
## [1] 0.9000489
# Evaluation of all the models
Model <- c("Regression Decision Trees","Regression Decision Trees(Pruned tree)", "Random Forest","Random Forest(mtry)", "Gradient Boosting", "Gradient Boosting(adjusted ntree)")
MSE_value <- c(Tree.MSE, prune.Tree.MSE,rf.MSE,rf.MSE1, boost.MSE, boost.MSE1)
R2_Value <- c(R2_tree, R2_tree1,R2_rf,R2_rf1, R2_boost, R2_boost1)
MODEL_OUTPUT_TABLE <- data.frame(Model, MSE_value, R2_Value)
MODEL_OUTPUT_TABLE
## Model MSE_value R2_Value
## 1 Regression Decision Trees 85.964946 0.7126051
## 2 Regression Decision Trees(Pruned tree) 85.964946 0.7126051
## 3 Random Forest 13.363116 0.9702419
## 4 Random Forest(mtry) 8.431719 0.9753512
## 5 Gradient Boosting 29.740353 0.9013621
## 6 Gradient Boosting(adjusted ntree) 30.095305 0.9000489