# file path of the data file
filepath <- "ConcreteStrengthData.csv"
# Read the CSV file into a data frame
df <- read.csv(filepath, header = T)
# Examine dataset structure
head(df, 5)
summary(df)
## cement slag ash water
## Min. :102.0 Min. : 0.0 Min. : 0.00 Min. :121.8
## 1st Qu.:192.4 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.:164.9
## Median :272.9 Median : 22.0 Median : 0.00 Median :185.0
## Mean :281.2 Mean : 73.9 Mean : 54.19 Mean :181.6
## 3rd Qu.:350.0 3rd Qu.:142.9 3rd Qu.:118.30 3rd Qu.:192.0
## Max. :540.0 Max. :359.4 Max. :200.10 Max. :247.0
##
## plasticizer coarse_agg fine_agg age
## Min. : 0.000 Min. : 801.0 Min. :594.0 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.: 932.0 1st Qu.:731.0 1st Qu.: 7.00
## Median : 6.400 Median : 968.0 Median :779.5 Median : 28.00
## Mean : 6.212 Mean : 972.9 Mean :773.6 Mean : 45.66
## 3rd Qu.:10.200 3rd Qu.:1029.4 3rd Qu.:824.0 3rd Qu.: 56.00
## Max. :32.200 Max. :1145.0 Max. :992.6 Max. :365.00
## NA's :3
## strength
## Min. : 2.33
## 1st Qu.:23.71
## Median :34.45
## Mean :35.82
## 3rd Qu.:46.13
## Max. :82.60
##
str(df)
## 'data.frame': 1030 obs. of 9 variables:
## $ cement : num 540 540 332 332 199 ...
## $ slag : num 0 0 142 142 132 ...
## $ ash : num 0 0 0 0 0 0 0 0 0 0 ...
## $ water : num 162 162 228 228 192 228 228 228 228 228 ...
## $ plasticizer: num 2.5 2.5 0 0 0 0 0 0 0 0 ...
## $ coarse_agg : num 1040 1055 932 932 978 ...
## $ fine_agg : num 676 676 594 594 826 ...
## $ age : int 28 28 270 365 360 90 365 28 28 28 ...
## $ strength : num 80 61.9 40.3 41 44.3 ...
# Identify missing values in each column
missing_values <- sapply(df, function(x) sum(is.na(x)))
# Display columns with missing values
missing_values[missing_values > 0]
## plasticizer
## 3
Given that there are only 3 missing values in the same column (plasticizer), I will use the mean value of that column and impute the missing values. Based on the summary data, the mean and median are close together which indicates that outliers are not skewing the data.
# Calculate the mean of the plasticizer column
mean_plasticizer <- mean(df$plasticizer, na.rm = TRUE)
# Impute NA values in the plasticizer column with the mean
df$plasticizer[is.na(df$plasticizer)] <- mean_plasticizer
# Function to identify and remove outliers using z-scores
remove_outliers_z <- function(data) {
z_scores <- scale(data, center = TRUE, scale = TRUE)
data[abs(z_scores) > 3] <- NA # Replace outliers with NA
return(data)
}
# Clean the data
df_cleaned <- remove_outliers_z(df)
# Optionally, remove rows with NA values resulting from outlier removal
df_cleaned <- na.omit(df_cleaned)
# Check the structure and summary of the cleaned data
summary(df_cleaned)
## cement slag ash water
## Min. :102.0 Min. : 0.00 Min. : 0.00 Min. :121.8
## 1st Qu.:190.3 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:164.9
## Median :266.0 Median : 22.00 Median : 0.00 Median :184.0
## Mean :277.5 Mean : 73.11 Mean : 56.75 Mean :180.8
## 3rd Qu.:349.0 3rd Qu.:144.20 3rd Qu.:118.30 3rd Qu.:192.0
## Max. :540.0 Max. :316.10 Max. :200.10 Max. :237.0
## plasticizer coarse_agg fine_agg age
## Min. : 0.000 Min. : 801.0 Min. :594.0 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.: 932.0 1st Qu.:739.0 1st Qu.: 7.00
## Median : 6.700 Median : 968.0 Median :780.0 Median : 28.00
## Mean : 6.202 Mean : 974.7 Mean :776.0 Mean : 36.45
## 3rd Qu.:10.200 3rd Qu.:1038.0 3rd Qu.:822.2 3rd Qu.: 28.00
## Max. :23.400 Max. :1145.0 Max. :992.6 Max. :180.00
## strength
## Min. : 2.33
## 1st Qu.:23.22
## Median :33.69
## Mean :35.29
## 3rd Qu.:45.30
## Max. :82.60
str(df_cleaned)
## 'data.frame': 981 obs. of 9 variables:
## $ cement : num 540 540 266 380 266 ...
## $ slag : num 0 0 114 95 114 ...
## $ ash : num 0 0 0 0 0 0 0 0 0 0 ...
## $ water : num 162 162 228 228 228 228 192 192 228 228 ...
## $ plasticizer: num 2.5 2.5 0 0 0 0 0 0 0 0 ...
## $ coarse_agg : num 1040 1055 932 932 932 ...
## $ fine_agg : num 676 676 670 594 670 ...
## $ age : int 28 28 90 28 28 28 90 28 90 28 ...
## $ strength : num 80 61.9 47 36.5 45.9 ...
## - attr(*, "na.action")= 'omit' Named int [1:49] 3 4 5 7 13 18 25 26 27 31 ...
## ..- attr(*, "names")= chr [1:49] "3" "4" "5" "7" ...
print(nrow(df))
## [1] 1030
print(nrow(df_cleaned))
## [1] 981
# omit target variable strength
pairs.panels(df[c("cement", "slag", "ash", "water", "plasticizer", "coarse_agg", "fine_agg", "age")])
hist(log(df$slag))
hist(log(df$ash))
hist(log(df$plasticizer))
hist(log(df$age))
df_transformed <- df %>%
mutate(
log_slag = log(slag + 1),
log_ash = log(ash + 1),
log_plasticizer = log(plasticizer + 1),
log_age = log(age + 1)
) %>%
select(-slag, -ash, -plasticizer, -age)
head(df_transformed)
based on the histograms above, it seems that the features: cement, water, coarse_agg, and fine_agg are relatively normally distributed, while slag, ash, plasticizer, and age will need to be transformed to fit the Gaussian expectation for statistical models. After some analysis of the non-gaussian distributed features, a log transformation seems to work best for all four features to normalize their distributions. I tried using log(), sqrt(), 1/() and ()^2 on the features.
# Omit target variable strength from correlation plots
cor(df_transformed[c("cement", "log_slag", "log_ash", "water", "log_plasticizer", "coarse_agg", "fine_agg", "log_age")])
## cement log_slag log_ash water
## cement 1.000000000 -0.205788055 -0.36142750 -0.08158675
## log_slag -0.205788055 1.000000000 -0.20978818 0.03218551
## log_ash -0.361427498 -0.209788184 1.00000000 -0.26271241
## water -0.081586748 0.032185505 -0.26271241 1.00000000
## log_plasticizer -0.037294168 0.138376310 0.58969740 -0.61156906
## coarse_agg -0.109348994 -0.356712782 -0.01191924 -0.18229360
## fine_agg -0.222717849 -0.245950531 0.11323799 -0.45066117
## log_age 0.003339031 -0.009521493 -0.02410884 0.17021254
## log_plasticizer coarse_agg fine_agg log_age
## cement -0.03729417 -0.10934899 -0.2227178 0.003339031
## log_slag 0.13837631 -0.35671278 -0.2459505 -0.009521493
## log_ash 0.58969740 -0.01191924 0.1132380 -0.024108840
## water -0.61156906 -0.18229360 -0.4506612 0.170212538
## log_plasticizer 1.00000000 -0.23538840 0.1944212 -0.052316619
## coarse_agg -0.23538840 1.00000000 -0.1784810 -0.038134306
## fine_agg 0.19442123 -0.17848096 1.0000000 -0.114853296
## log_age -0.05231662 -0.03813431 -0.1148533 1.000000000
pairs(df_transformed[c("cement", "log_slag", "log_ash", "water", "log_plasticizer", "coarse_agg", "fine_agg", "log_age")], cex = 0.1,)
The correlation matrix and scatterplots indicate that there is no
significant multicolinearity between any of the predictor features. The
highest correlation seen in the matrix above is -0.61, between the water
and plasticizer features. The presence of multicolinearity would
indicate that multiple predictive variables directly predict the target
variable, which could lead to high p-values and a more inaccurate
model.
# Partition data on target variable strength
train_index <- createDataPartition(df_transformed$strength, p = 0.85, list = FALSE)
# Split the data into training and validation sets
train_set <- df_transformed[train_index, ]
validation_set <- df_transformed[-train_index, ]
# build regression model with all features
model <- lm(strength ~ ., data = df_transformed)
# examine model summary statistics
summary(model)
##
## Call:
## lm(formula = strength ~ ., data = df_transformed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.9714 -4.5144 0.0033 4.4393 27.7889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.203277 13.269840 1.146 0.2522
## cement 0.088355 0.003770 23.439 < 2e-16 ***
## water -0.208242 0.022521 -9.247 < 2e-16 ***
## coarse_agg 0.002854 0.005184 0.551 0.5821
## fine_agg -0.010651 0.005305 -2.008 0.0449 *
## log_slag 2.210081 0.166947 13.238 < 2e-16 ***
## log_ash 0.327646 0.169923 1.928 0.0541 .
## log_plasticizer 2.348875 0.378139 6.212 7.62e-10 ***
## log_age 9.029453 0.211338 42.725 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.384 on 1021 degrees of freedom
## Multiple R-squared: 0.8061, Adjusted R-squared: 0.8046
## F-statistic: 530.7 on 8 and 1021 DF, p-value: < 2.2e-16
AIC(model)
## [1] 7052.641
# Perform stepwise selection based on AIC
stepwise_model <- stepAIC(model, direction = "both", trace = TRUE)
## Start: AIC=4127.63
## strength ~ cement + water + coarse_agg + fine_agg + log_slag +
## log_ash + log_plasticizer + log_age
##
## Df Sum of Sq RSS AIC
## - coarse_agg 1 17 55689 4125.9
## <none> 55673 4127.6
## - log_ash 1 203 55875 4129.4
## - fine_agg 1 220 55892 4129.7
## - log_plasticizer 1 2104 57777 4163.8
## - water 1 4662 60335 4208.5
## - log_slag 1 9556 65229 4288.8
## - cement 1 29957 85629 4569.1
## - log_age 1 99537 155210 5181.7
##
## Step: AIC=4125.93
## strength ~ cement + water + fine_agg + log_slag + log_ash + log_plasticizer +
## log_age
##
## Df Sum of Sq RSS AIC
## <none> 55689 4125.9
## - log_ash 1 186 55875 4127.4
## + coarse_agg 1 17 55673 4127.6
## - fine_agg 1 624 56313 4135.4
## - log_plasticizer 1 2261 57951 4164.9
## - water 1 10307 65996 4298.8
## - log_slag 1 15242 70931 4373.1
## - cement 1 45565 101254 4739.7
## - log_age 1 99522 155211 5179.7
# Display the final model summary
summary(stepwise_model)
##
## Call:
## lm(formula = strength ~ cement + water + fine_agg + log_slag +
## log_ash + log_plasticizer + log_age, data = df_transformed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.9527 -4.5431 0.0202 4.3824 28.0179
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.847999 5.513603 3.963 7.93e-05 ***
## cement 0.087108 0.003012 28.917 < 2e-16 ***
## water -0.217083 0.015784 -13.753 < 2e-16 ***
## fine_agg -0.012712 0.003758 -3.383 0.000744 ***
## log_slag 2.151530 0.128645 16.725 < 2e-16 ***
## log_ash 0.302501 0.163614 1.849 0.064765 .
## log_plasticizer 2.274492 0.353059 6.442 1.81e-10 ***
## log_age 9.027523 0.211237 42.737 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.382 on 1022 degrees of freedom
## Multiple R-squared: 0.8061, Adjusted R-squared: 0.8048
## F-statistic: 606.9 on 7 and 1022 DF, p-value: < 2.2e-16
# Make predictions on the validation dataset
predictions <- predict(stepwise_model, newdata = validation_set)
# Calculate Mean Squared Error (MSE)
mse <- mean((validation_set$strength - predictions)^2)
print(mse)
## [1] 58.42121
# Create a new data frame for the new observation
new_data <- data.frame(
cement = 535,
log_slag = log(0+1),
log_ash = log(0+1),
water = 160,
log_plasticizer = log(2.5+1),
coarse_agg = 1000,
fine_agg = 695,
log_age = log(30+1)
)
# Make the prediction
predicted_strength <- predict(stepwise_model, newdata = new_data)
print(predicted_strength)
## 1
## 58.73232
# Extract Residual Standard Error
rse <- summary(stepwise_model)$sigma
# Calculate 95% CI
upper_bound <- predicted_strength + 1.96*rse
lower_bound <- predicted_strength - 1.96*rse
# Display 95% CI
print(lower_bound)
## 1
## 44.26408
print(upper_bound)
## 1
## 73.20057