DA5030 Regression

Problem 2:

# file path of the data file
filepath <- "ConcreteStrengthData.csv"

# Read the CSV file into a data frame
df <- read.csv(filepath, header = T)

# Examine dataset structure
head(df, 5)

summary(df)

##      cement           slag            ash             water      
##  Min.   :102.0   Min.   :  0.0   Min.   :  0.00   Min.   :121.8  
##  1st Qu.:192.4   1st Qu.:  0.0   1st Qu.:  0.00   1st Qu.:164.9  
##  Median :272.9   Median : 22.0   Median :  0.00   Median :185.0  
##  Mean   :281.2   Mean   : 73.9   Mean   : 54.19   Mean   :181.6  
##  3rd Qu.:350.0   3rd Qu.:142.9   3rd Qu.:118.30   3rd Qu.:192.0  
##  Max.   :540.0   Max.   :359.4   Max.   :200.10   Max.   :247.0  
##                                                                  
##   plasticizer       coarse_agg        fine_agg          age        
##  Min.   : 0.000   Min.   : 801.0   Min.   :594.0   Min.   :  1.00  
##  1st Qu.: 0.000   1st Qu.: 932.0   1st Qu.:731.0   1st Qu.:  7.00  
##  Median : 6.400   Median : 968.0   Median :779.5   Median : 28.00  
##  Mean   : 6.212   Mean   : 972.9   Mean   :773.6   Mean   : 45.66  
##  3rd Qu.:10.200   3rd Qu.:1029.4   3rd Qu.:824.0   3rd Qu.: 56.00  
##  Max.   :32.200   Max.   :1145.0   Max.   :992.6   Max.   :365.00  
##  NA's   :3                                                         
##     strength    
##  Min.   : 2.33  
##  1st Qu.:23.71  
##  Median :34.45  
##  Mean   :35.82  
##  3rd Qu.:46.13  
##  Max.   :82.60  
##

str(df)

## 'data.frame':    1030 obs. of  9 variables:
##  $ cement     : num  540 540 332 332 199 ...
##  $ slag       : num  0 0 142 142 132 ...
##  $ ash        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ water      : num  162 162 228 228 192 228 228 228 228 228 ...
##  $ plasticizer: num  2.5 2.5 0 0 0 0 0 0 0 0 ...
##  $ coarse_agg : num  1040 1055 932 932 978 ...
##  $ fine_agg   : num  676 676 594 594 826 ...
##  $ age        : int  28 28 270 365 360 90 365 28 28 28 ...
##  $ strength   : num  80 61.9 40.3 41 44.3 ...

Problem 3:

# Identify missing values in each column
missing_values <- sapply(df, function(x) sum(is.na(x)))

# Display columns with missing values
missing_values[missing_values > 0]

## plasticizer 
##           3

Given that there are only 3 missing values in the same column (plasticizer), I will use the mean value of that column and impute the missing values. Based on the summary data, the mean and median are close together which indicates that outliers are not skewing the data.

Problem 4:

# Calculate the mean of the plasticizer column
mean_plasticizer <- mean(df$plasticizer, na.rm = TRUE)

# Impute NA values in the plasticizer column with the mean
df$plasticizer[is.na(df$plasticizer)] <- mean_plasticizer

Problem 5:

# Function to identify and remove outliers using z-scores
remove_outliers_z <- function(data) {
  z_scores <- scale(data, center = TRUE, scale = TRUE)
  data[abs(z_scores) > 3] <- NA  # Replace outliers with NA
  return(data)
}

# Clean the data
df_cleaned <- remove_outliers_z(df)

# Optionally, remove rows with NA values resulting from outlier removal
df_cleaned <- na.omit(df_cleaned)

# Check the structure and summary of the cleaned data
summary(df_cleaned)

##      cement           slag             ash             water      
##  Min.   :102.0   Min.   :  0.00   Min.   :  0.00   Min.   :121.8  
##  1st Qu.:190.3   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:164.9  
##  Median :266.0   Median : 22.00   Median :  0.00   Median :184.0  
##  Mean   :277.5   Mean   : 73.11   Mean   : 56.75   Mean   :180.8  
##  3rd Qu.:349.0   3rd Qu.:144.20   3rd Qu.:118.30   3rd Qu.:192.0  
##  Max.   :540.0   Max.   :316.10   Max.   :200.10   Max.   :237.0  
##   plasticizer       coarse_agg        fine_agg          age        
##  Min.   : 0.000   Min.   : 801.0   Min.   :594.0   Min.   :  1.00  
##  1st Qu.: 0.000   1st Qu.: 932.0   1st Qu.:739.0   1st Qu.:  7.00  
##  Median : 6.700   Median : 968.0   Median :780.0   Median : 28.00  
##  Mean   : 6.202   Mean   : 974.7   Mean   :776.0   Mean   : 36.45  
##  3rd Qu.:10.200   3rd Qu.:1038.0   3rd Qu.:822.2   3rd Qu.: 28.00  
##  Max.   :23.400   Max.   :1145.0   Max.   :992.6   Max.   :180.00  
##     strength    
##  Min.   : 2.33  
##  1st Qu.:23.22  
##  Median :33.69  
##  Mean   :35.29  
##  3rd Qu.:45.30  
##  Max.   :82.60

str(df_cleaned)

## 'data.frame':    981 obs. of  9 variables:
##  $ cement     : num  540 540 266 380 266 ...
##  $ slag       : num  0 0 114 95 114 ...
##  $ ash        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ water      : num  162 162 228 228 228 228 192 192 228 228 ...
##  $ plasticizer: num  2.5 2.5 0 0 0 0 0 0 0 0 ...
##  $ coarse_agg : num  1040 1055 932 932 932 ...
##  $ fine_agg   : num  676 676 670 594 670 ...
##  $ age        : int  28 28 90 28 28 28 90 28 90 28 ...
##  $ strength   : num  80 61.9 47 36.5 45.9 ...
##  - attr(*, "na.action")= 'omit' Named int [1:49] 3 4 5 7 13 18 25 26 27 31 ...
##   ..- attr(*, "names")= chr [1:49] "3" "4" "5" "7" ...

print(nrow(df))

## [1] 1030

print(nrow(df_cleaned))

## [1] 981

Problem 6:

# omit target variable strength
pairs.panels(df[c("cement", "slag", "ash", "water", "plasticizer", "coarse_agg", "fine_agg", "age")])

hist(log(df$slag))

hist(log(df$ash))

hist(log(df$plasticizer))

hist(log(df$age))

df_transformed <- df %>%
  mutate(
    log_slag = log(slag + 1),
    log_ash = log(ash + 1),
    log_plasticizer = log(plasticizer + 1),
    log_age = log(age + 1)
  ) %>%
  select(-slag, -ash, -plasticizer, -age) 

head(df_transformed)

based on the histograms above, it seems that the features: cement, water, coarse_agg, and fine_agg are relatively normally distributed, while slag, ash, plasticizer, and age will need to be transformed to fit the Gaussian expectation for statistical models. After some analysis of the non-gaussian distributed features, a log transformation seems to work best for all four features to normalize their distributions. I tried using log(), sqrt(), 1/() and ()^2 on the features.

Problem 7:

# Omit target variable strength from correlation plots
cor(df_transformed[c("cement", "log_slag", "log_ash", "water", "log_plasticizer", "coarse_agg", "fine_agg", "log_age")])

##                       cement     log_slag     log_ash       water
## cement           1.000000000 -0.205788055 -0.36142750 -0.08158675
## log_slag        -0.205788055  1.000000000 -0.20978818  0.03218551
## log_ash         -0.361427498 -0.209788184  1.00000000 -0.26271241
## water           -0.081586748  0.032185505 -0.26271241  1.00000000
## log_plasticizer -0.037294168  0.138376310  0.58969740 -0.61156906
## coarse_agg      -0.109348994 -0.356712782 -0.01191924 -0.18229360
## fine_agg        -0.222717849 -0.245950531  0.11323799 -0.45066117
## log_age          0.003339031 -0.009521493 -0.02410884  0.17021254
##                 log_plasticizer  coarse_agg   fine_agg      log_age
## cement              -0.03729417 -0.10934899 -0.2227178  0.003339031
## log_slag             0.13837631 -0.35671278 -0.2459505 -0.009521493
## log_ash              0.58969740 -0.01191924  0.1132380 -0.024108840
## water               -0.61156906 -0.18229360 -0.4506612  0.170212538
## log_plasticizer      1.00000000 -0.23538840  0.1944212 -0.052316619
## coarse_agg          -0.23538840  1.00000000 -0.1784810 -0.038134306
## fine_agg             0.19442123 -0.17848096  1.0000000 -0.114853296
## log_age             -0.05231662 -0.03813431 -0.1148533  1.000000000

pairs(df_transformed[c("cement", "log_slag", "log_ash", "water", "log_plasticizer", "coarse_agg", "fine_agg", "log_age")], cex = 0.1,)

The correlation matrix and scatterplots indicate that there is no significant multicolinearity between any of the predictor features. The highest correlation seen in the matrix above is -0.61, between the water and plasticizer features. The presence of multicolinearity would indicate that multiple predictive variables directly predict the target variable, which could lead to high p-values and a more inaccurate model.

Problem 8:

# Partition data on target variable strength
train_index <- createDataPartition(df_transformed$strength, p = 0.85, list = FALSE)

# Split the data into training and validation sets
train_set <- df_transformed[train_index, ]
validation_set <- df_transformed[-train_index, ]

Problem 9:

# build regression model with all features
model <- lm(strength ~ ., data = df_transformed)

# examine model summary statistics
summary(model)

## 
## Call:
## lm(formula = strength ~ ., data = df_transformed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.9714  -4.5144   0.0033   4.4393  27.7889 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     15.203277  13.269840   1.146   0.2522    
## cement           0.088355   0.003770  23.439  < 2e-16 ***
## water           -0.208242   0.022521  -9.247  < 2e-16 ***
## coarse_agg       0.002854   0.005184   0.551   0.5821    
## fine_agg        -0.010651   0.005305  -2.008   0.0449 *  
## log_slag         2.210081   0.166947  13.238  < 2e-16 ***
## log_ash          0.327646   0.169923   1.928   0.0541 .  
## log_plasticizer  2.348875   0.378139   6.212 7.62e-10 ***
## log_age          9.029453   0.211338  42.725  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.384 on 1021 degrees of freedom
## Multiple R-squared:  0.8061, Adjusted R-squared:  0.8046 
## F-statistic: 530.7 on 8 and 1021 DF,  p-value: < 2.2e-16

AIC(model)

## [1] 7052.641

# Perform stepwise selection based on AIC
stepwise_model <- stepAIC(model, direction = "both", trace = TRUE)

## Start:  AIC=4127.63
## strength ~ cement + water + coarse_agg + fine_agg + log_slag + 
##     log_ash + log_plasticizer + log_age
## 
##                   Df Sum of Sq    RSS    AIC
## - coarse_agg       1        17  55689 4125.9
## <none>                          55673 4127.6
## - log_ash          1       203  55875 4129.4
## - fine_agg         1       220  55892 4129.7
## - log_plasticizer  1      2104  57777 4163.8
## - water            1      4662  60335 4208.5
## - log_slag         1      9556  65229 4288.8
## - cement           1     29957  85629 4569.1
## - log_age          1     99537 155210 5181.7
## 
## Step:  AIC=4125.93
## strength ~ cement + water + fine_agg + log_slag + log_ash + log_plasticizer + 
##     log_age
## 
##                   Df Sum of Sq    RSS    AIC
## <none>                          55689 4125.9
## - log_ash          1       186  55875 4127.4
## + coarse_agg       1        17  55673 4127.6
## - fine_agg         1       624  56313 4135.4
## - log_plasticizer  1      2261  57951 4164.9
## - water            1     10307  65996 4298.8
## - log_slag         1     15242  70931 4373.1
## - cement           1     45565 101254 4739.7
## - log_age          1     99522 155211 5179.7

# Display the final model summary
summary(stepwise_model)

## 
## Call:
## lm(formula = strength ~ cement + water + fine_agg + log_slag + 
##     log_ash + log_plasticizer + log_age, data = df_transformed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.9527  -4.5431   0.0202   4.3824  28.0179 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     21.847999   5.513603   3.963 7.93e-05 ***
## cement           0.087108   0.003012  28.917  < 2e-16 ***
## water           -0.217083   0.015784 -13.753  < 2e-16 ***
## fine_agg        -0.012712   0.003758  -3.383 0.000744 ***
## log_slag         2.151530   0.128645  16.725  < 2e-16 ***
## log_ash          0.302501   0.163614   1.849 0.064765 .  
## log_plasticizer  2.274492   0.353059   6.442 1.81e-10 ***
## log_age          9.027523   0.211237  42.737  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.382 on 1022 degrees of freedom
## Multiple R-squared:  0.8061, Adjusted R-squared:  0.8048 
## F-statistic: 606.9 on 7 and 1022 DF,  p-value: < 2.2e-16

Problem 10:

# Make predictions on the validation dataset
predictions <- predict(stepwise_model, newdata = validation_set)

# Calculate Mean Squared Error (MSE)
mse <- mean((validation_set$strength - predictions)^2)
print(mse)

## [1] 58.42121

Problem 11:

# Create a new data frame for the new observation
new_data <- data.frame(
  cement = 535,
  log_slag = log(0+1),
  log_ash = log(0+1),
  water = 160,
  log_plasticizer = log(2.5+1),
  coarse_agg = 1000,
  fine_agg = 695,
  log_age = log(30+1)
)

# Make the prediction
predicted_strength <- predict(stepwise_model, newdata = new_data)
print(predicted_strength)

##        1 
## 58.73232

Problem 12:

# Extract Residual Standard Error
rse <- summary(stepwise_model)$sigma

# Calculate 95% CI
upper_bound <- predicted_strength + 1.96*rse
lower_bound <- predicted_strength - 1.96*rse

# Display 95% CI
print(lower_bound)

##        1 
## 44.26408

print(upper_bound)

##        1 
## 73.20057