Here we import our train and test data, student_train and student_eval, check for missing data, and perform some additional exploratory steps. First, we preview the data structure:
student_train = read.csv('https://raw.githubusercontent.com/deepasharma06/Data-624/refs/heads/main/StudentData_training.csv')
student_eval = read.csv('https://raw.githubusercontent.com/deepasharma06/Data-624/refs/heads/main/StudentEvaluation_test.csv')
head(student_train) %>% kable()
Brand.Code | Carb.Volume | Fill.Ounces | PC.Volume | Carb.Pressure | Carb.Temp | PSC | PSC.Fill | PSC.CO2 | Mnf.Flow | Carb.Pressure1 | Fill.Pressure | Hyd.Pressure1 | Hyd.Pressure2 | Hyd.Pressure3 | Hyd.Pressure4 | Filler.Level | Filler.Speed | Temperature | Usage.cont | Carb.Flow | Density | MFR | Balling | Pressure.Vacuum | PH | Oxygen.Filler | Bowl.Setpoint | Pressure.Setpoint | Air.Pressurer | Alch.Rel | Carb.Rel | Balling.Lvl |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
B | 5.340000 | 23.96667 | 0.2633333 | 68.2 | 141.2 | 0.104 | 0.26 | 0.04 | -100 | 118.8 | 46.0 | 0 | NA | NA | 118 | 121.2 | 4002 | 66.0 | 16.18 | 2932 | 0.88 | 725.0 | 1.398 | -4.0 | 8.36 | 0.022 | 120 | 46.4 | 142.6 | 6.58 | 5.32 | 1.48 |
A | 5.426667 | 24.00667 | 0.2386667 | 68.4 | 139.6 | 0.124 | 0.22 | 0.04 | -100 | 121.6 | 46.0 | 0 | NA | NA | 106 | 118.6 | 3986 | 67.6 | 19.90 | 3144 | 0.92 | 726.8 | 1.498 | -4.0 | 8.26 | 0.026 | 120 | 46.8 | 143.0 | 6.56 | 5.30 | 1.56 |
B | 5.286667 | 24.06000 | 0.2633333 | 70.8 | 144.8 | 0.090 | 0.34 | 0.16 | -100 | 120.2 | 46.0 | 0 | NA | NA | 82 | 120.0 | 4020 | 67.0 | 17.76 | 2914 | 1.58 | 735.0 | 3.142 | -3.8 | 8.94 | 0.024 | 120 | 46.6 | 142.0 | 7.66 | 5.84 | 3.28 |
A | 5.440000 | 24.00667 | 0.2933333 | 63.0 | 132.6 | NA | 0.42 | 0.04 | -100 | 115.2 | 46.4 | 0 | 0 | 0 | 92 | 117.8 | 4012 | 65.6 | 17.42 | 3062 | 1.54 | 730.6 | 3.042 | -4.4 | 8.24 | 0.030 | 120 | 46.0 | 146.2 | 7.14 | 5.42 | 3.04 |
A | 5.486667 | 24.31333 | 0.1113333 | 67.2 | 136.8 | 0.026 | 0.16 | 0.12 | -100 | 118.4 | 45.8 | 0 | 0 | 0 | 92 | 118.6 | 4010 | 65.6 | 17.68 | 3054 | 1.54 | 722.8 | 3.042 | -4.4 | 8.26 | 0.030 | 120 | 46.0 | 146.2 | 7.14 | 5.44 | 3.04 |
A | 5.380000 | 23.92667 | 0.2693333 | 66.6 | 138.4 | 0.090 | 0.24 | 0.04 | -100 | 119.6 | 45.6 | 0 | 0 | 0 | 116 | 120.2 | 4014 | 66.2 | 23.82 | 2948 | 1.52 | 738.8 | 2.992 | -4.4 | 8.32 | 0.024 | 120 | 46.0 | 146.6 | 7.16 | 5.44 | 3.02 |
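Since the next step is imputation, it is worth quantifying the missingness first. A quick check we can run on the raw data before imputing (output omitted):
# Count NA values per column and show the ten worst offenders
colSums(is.na(student_train)) |> sort(decreasing = TRUE) |> head(10)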
# Build an imputation model with caret's kNN method; note that knnImpute
# also centers and scales the numeric columns as part of its distance calculation
imputeModel <- preProcess(student_train, method = c("knnImpute"))
# Impute the missing values for the training and test data
student_train <- predict(imputeModel, student_train)
student_eval <- predict(imputeModel, student_eval)
# There are now zero NA values in our train and test data
sum(is.na(student_train))
## [1] 0
sum(is.na(student_eval))
## [1] 0
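Because knnImpute standardizes the numeric columns, the imputed data are now on a z-score scale. A quick sanity check on the training set (output omitted):
# Numeric columns should now have means near 0 and standard deviations near 1
num_cols <- sapply(student_train, is.numeric)
summary(colMeans(student_train[, num_cols]))
summary(apply(student_train[, num_cols], 2, sd))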
# Remove problematic predictors from train and test data
student_train_x <- subset(student_train, select = -c(Hyd.Pressure1))
student_eval_y <- subset(student_eval, select = -c(Hyd.Pressure1))
# Replace Brand.Code with a single dummy variable BCB (1 if brand B, else 0)
student_train_x1 <- student_train_x |> mutate(BCB = as.numeric(Brand.Code =='B')) |> select(-Brand.Code)
student_eval_y1 <- student_eval_y |> mutate(BCB = as.numeric(Brand.Code =='B')) |> select(-Brand.Code)
# Fit a linear model on the full predictor set (Brand.Code still included)
# so we can assess multicollinearity via VIF
model <- lm(PH ~ ., data = student_train_x)
#summary(model)
# Calculate generalized VIF (vif() from the car package)
library(car)
vif_values <- vif(model)
vif_values
## GVIF Df GVIF^(1/(2*Df))
## Brand.Code 53.942578 4 1.646234
## Carb.Volume 7.872617 1 2.805818
## Fill.Ounces 1.151544 1 1.073100
## PC.Volume 1.500187 1 1.224821
## Carb.Pressure 15.485772 1 3.935197
## Carb.Temp 12.827035 1 3.581485
## PSC 1.152323 1 1.073463
## PSC.Fill 1.102569 1 1.050033
## PSC.CO2 1.068812 1 1.033834
## Mnf.Flow 4.382860 1 2.093528
## Carb.Pressure1 1.579213 1 1.256668
## Fill.Pressure 2.180153 1 1.476534
## Hyd.Pressure2 8.964442 1 2.994068
## Hyd.Pressure3 12.433205 1 3.526075
## Hyd.Pressure4 2.368986 1 1.539151
## Filler.Level 10.684198 1 3.268669
## Filler.Speed 4.897880 1 2.213116
## Temperature 1.455792 1 1.206562
## Usage.cont 1.684034 1 1.297703
## Carb.Flow 2.168168 1 1.472470
## Density 16.274585 1 4.034177
## MFR 4.048062 1 2.011980
## Balling 65.945084 1 8.120658
## Pressure.Vacuum 2.582057 1 1.606878
## Oxygen.Filler 1.494535 1 1.222512
## Bowl.Setpoint 11.085276 1 3.329456
## Pressure.Setpoint 2.281597 1 1.510495
## Air.Pressurer 1.195910 1 1.093577
## Alch.Rel 16.848866 1 4.104737
## Carb.Rel 5.209240 1 2.282376
## Balling.Lvl 50.180230 1 7.083801
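As a reminder of what these values measure, the VIF of a predictor is 1/(1 - R²) from regressing that predictor on all of the others. A hand check for Balling, the worst offender (illustrative):
# Regress Balling on every other predictor (excluding the response PH)
r2_balling <- summary(lm(Balling ~ . - PH, data = student_train_x))$r.squared
1 / (1 - r2_balling)  # should land near the GVIF of ~65.9 reported above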
# Combine the two most collinear predictors, Balling and Balling.Lvl,
# into a single ratio predictor PT
student_train_x2 <- student_train_x1 |> mutate(PT = Balling.Lvl/Balling) |> select(-c(Balling, Balling.Lvl))
student_eval_y2 <- student_eval_y1 |> mutate(PT = Balling.Lvl/Balling) |> select(-c(Balling, Balling.Lvl))
Next we use PCR/PLS for further feature reduction.
# Separate predictors and response
X_train <- student_train_x2 %>% select(-PH)
y_train <- student_train_x2$PH
X_test <- student_eval_y2 %>% select(-PH)
y_test <- student_eval_y2$PH
# Scale the training and test sets
# We'll use caret's preProcess for scaling
preProcValues <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preProcValues, X_train)
X_test_scaled <- predict(preProcValues, X_test)
# Perform PCA on the scaled training predictors
pca_result <- prcomp(X_train_scaled, center = FALSE, scale. = FALSE)
# Summary of PCA variance
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.2811 2.1735 1.68172 1.32321 1.2562 1.19853 1.16009
## Proportion of Variance 0.1734 0.1575 0.09427 0.05836 0.0526 0.04788 0.04486
## Cumulative Proportion 0.1734 0.3309 0.42520 0.48356 0.5362 0.58404 0.62890
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.04011 1.01262 0.99666 0.94693 0.91894 0.87965 0.86579
## Proportion of Variance 0.03606 0.03418 0.03311 0.02989 0.02815 0.02579 0.02499
## Cumulative Proportion 0.66496 0.69914 0.73225 0.76214 0.79029 0.81608 0.84107
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.84955 0.77508 0.71546 0.69902 0.68471 0.64256 0.62167
## Proportion of Variance 0.02406 0.02002 0.01706 0.01629 0.01563 0.01376 0.01288
## Cumulative Proportion 0.86513 0.88515 0.90222 0.91850 0.93413 0.94789 0.96078
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.52727 0.47220 0.4414 0.37281 0.35244 0.29545 0.22780
## Proportion of Variance 0.00927 0.00743 0.0065 0.00463 0.00414 0.00291 0.00173
## Cumulative Proportion 0.97004 0.97748 0.9840 0.98861 0.99275 0.99566 0.99738
## PC29 PC30
## Standard deviation 0.21788 0.17601
## Proportion of Variance 0.00158 0.00103
## Cumulative Proportion 0.99897 1.00000
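The variance is spread across many components; a quick way to count how many are needed for a given threshold (illustrative, consistent with the cumulative proportions above):
# Components required to reach 90% cumulative variance
cum_var <- cumsum(pca_result$sdev^2) / sum(pca_result$sdev^2)
which(cum_var >= 0.90)[1]  # 17, per the summary above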
The principal components are ordered by the variance they explain. We can crack open PC1 and PC2 to see which variables account for the most variability and get a sense of how PCR would reduce multicollinearity:
# Contributions to PC1 (fviz_contrib from the factoextra package)
library(factoextra)
fviz_contrib(pca_result, choice = "var", axes = 1, top = 10) +
labs(title = "Contributions of Variables to PC1")
# Contributions to PC2
fviz_contrib(pca_result, choice = "var", axes = 2, top = 10) +
labs(title = "Contributions of Variables to PC2")
On to PCR:
library(pls)
# Set a seed for reproducibility
set.seed(123)
# Using the already processed data 'student_train_x2'
# Ensure that PH is the response and the rest are predictors
pcr_model <- pcr(PH ~ ., data = student_train_x2, scale = TRUE, validation = "CV")
# Summary of the PCR model
summary(pcr_model)
## Data: X dimension: 2571 30
## Y dimension: 2571 1
## Fit method: svdpc
## Number of components considered: 30
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.9998 0.9577 0.9589 0.9506 0.9499 0.9461 0.9611
## adjCV 0.9998 0.9542 0.9546 0.9467 0.9461 0.9427 0.9564
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 0.9508 1.013 1.13 1.849 1.840 1.143 1.280
## adjCV 0.9459 1.003 1.11 1.778 1.767 1.120 1.244
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 1.253 1.158 1.279 1.183 1.025 1.029 0.9195
## adjCV 1.219 1.131 1.242 1.152 1.008 1.011 0.9108
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 0.9870 0.9840 1.0152 1.021 1.035 1.016 0.9434
## adjCV 0.9703 0.9676 0.9958 1.001 1.013 0.996 0.9284
## 28 comps 29 comps 30 comps
## CV 1.0018 1.0000 0.9993
## adjCV 0.9815 0.9797 0.9791
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 17.35 33.09 42.52 48.36 53.62 58.40 62.89 66.50
## PH 15.05 16.53 17.27 17.29 17.32 17.33 19.34 19.34
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 69.91 73.23 76.21 79.03 81.61 84.11 86.51
## PH 19.62 19.76 23.33 23.41 26.33 27.28 28.56
## 16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps
## X 88.52 90.22 91.85 93.41 94.79 96.08 97.0
## PH 29.86 31.45 31.57 32.82 32.87 35.79 35.8
## 23 comps 24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
## X 97.75 98.40 98.86 99.27 99.57 99.74 99.90
## PH 36.74 37.65 37.80 38.05 39.93 40.80 40.89
## 30 comps
## X 100.00
## PH 40.91
# Plot the cross-validation results using validationplot()
# val.type options: "MSEP", "RMSEP", "R2"
validationplot(pcr_model, val.type = "RMSEP", main = "PCR Cross-Validation (RMSEP)")
# Print RMSEP for each number of components
rmsep_values <- RMSEP(pcr_model, estimate = "CV")
rmsep_values
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps
## 0.9998 0.9577 0.9589 0.9506 0.9499 0.9461
## 6 comps 7 comps 8 comps 9 comps 10 comps 11 comps
## 0.9611 0.9508 1.0134 1.1304 1.8490 1.8405
## 12 comps 13 comps 14 comps 15 comps 16 comps 17 comps
## 1.1432 1.2803 1.2528 1.1584 1.2790 1.1828
## 18 comps 19 comps 20 comps 21 comps 22 comps 23 comps
## 1.0253 1.0294 0.9195 0.9870 0.9840 1.0152
## 24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
## 1.0210 1.0348 1.0162 0.9434 1.0018 1.0000
## 30 comps
## 0.9993
# Also check R² to see how well the model explains the variance
validationplot(pcr_model, val.type = "R2", main = "PCR Cross-Validation (R2)")
# Print out R2 for each number of components
r2_values <- R2(pcr_model, estimate = "CV")
r2_values
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps
## -0.0007784 0.0817204 0.0795193 0.0953255 0.0966418 0.1037929
## 6 comps 7 comps 8 comps 9 comps 10 comps 11 comps
## 0.0753052 0.0948758 -0.0280725 -0.2791882 -2.4227091 -2.3911300
## 12 comps 13 comps 14 comps 15 comps 16 comps 17 comps
## -0.3082899 -0.6410022 -0.5713197 -0.3432995 -0.6376337 -0.4005158
## 18 comps 19 comps 20 comps 21 comps 22 comps 23 comps
## -0.0524146 -0.0608081 0.1535729 0.0247478 0.0307351 -0.0317707
## 24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
## -0.0436857 -0.0720937 -0.0337501 0.1089018 -0.0047271 -0.0010823
## 30 comps
## 0.0002551
It looks like 5 components gives us the lowest cross-validated RMSEP (0.9461) and the highest R² (0.104) before the model's performance drops sharply. RMSEP does improve again at 20 components (0.9195), but for the sake of a simpler model we will stick with 5.
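As a cross-check on that manual choice, the pls package offers selectNcomp, whose "onesigma" rule picks the smallest model within one standard error of the CV minimum (a sketch; its suggestion may differ from our pick of 5):
# Automated component selection via the one-sigma heuristic
selectNcomp(pcr_model, method = "onesigma", plot = TRUE)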
library(pls)
# Set a seed for reproducibility
set.seed(123)
# Fit the PCR model with exactly 5 components
pcr_model_5 <- pcr(PH ~ .,
data = student_train_x2,
scale = TRUE,
ncomp = 5,
validation = "CV")
# Print summary of the model
summary(pcr_model_5)
## Data: X dimension: 2571 30
## Y dimension: 2571 1
## Fit method: svdpc
## Number of components considered: 5
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps
## CV 0.9998 0.9577 0.9589 0.9506 0.9499 0.9461
## adjCV 0.9998 0.9542 0.9546 0.9467 0.9461 0.9427
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps
## X 17.35 33.09 42.52 48.36 53.62
## PH 15.05 16.53 17.27 17.29 17.32
# Extract R2 and RMSEP for cross-validation
r2_5 <- R2(pcr_model_5, estimate = "CV")
rmsep_5 <- RMSEP(pcr_model_5, estimate = "CV")
# Predict PH on the evaluation set (student_eval_y2) with 5 components
predictions_5 <- predict(pcr_model_5, newdata = student_eval_y2, ncomp = 5)
# Calculate prediction metrics on the test set
test_res <- postResample(predictions_5, student_eval_y2$PH)
cat("\nTest set performance (using 5 components):\n")
##
## Test set performance (using 5 components):
print(test_res)
## RMSE Rsquared MAE
## 0.6155150 0.2832964 0.5087567
The results with PCR are lackluster; on to PLS:
library(pls)
set.seed(123) # For reproducibility
# Fit the PLS model using cross-validation
pls_model <- plsr(PH ~ .,
data = student_train_x2,
scale = TRUE, # Scale predictors
validation = "CV") # Use cross-validation
# Print a summary of the model to see initial info
summary(pls_model)
## Data: X dimension: 2571 30
## Y dimension: 2571 1
## Fit method: kernelpls
## Number of components considered: 30
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.9998 0.9918 1.201 1.191 1.084 0.9236 0.9112
## adjCV 0.9998 0.9813 1.168 1.158 1.059 0.9108 0.8990
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1.094 0.9879 0.9589 0.9967 1.0059 0.9694 0.9848
## adjCV 1.066 0.9688 0.9421 0.9767 0.9852 0.9517 0.9658
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 1.0151 1.0006 0.9967 0.9999 0.9993 1.0001 1.0004
## adjCV 0.9936 0.9803 0.9767 0.9797 0.9791 0.9798 0.9801
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 0.9988 0.9991 0.9994 0.9992 0.9993 0.9992 0.9993
## adjCV 0.9787 0.9789 0.9792 0.9790 0.9791 0.9790 0.9791
## 28 comps 29 comps 30 comps
## CV 0.9993 0.9993 0.9993
## adjCV 0.9791 0.9791 0.9791
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 16.76 21.44 31.84 41.84 47.84 50.65 52.86 56.85
## PH 22.63 34.56 36.74 37.89 38.94 39.80 40.40 40.60
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 59.57 62.18 65.80 68.10 70.52 72.8 75.89
## PH 40.75 40.83 40.86 40.88 40.89 40.9 40.90
## 16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps
## X 77.86 79.26 81.43 83.71 85.10 87.22 89.29
## PH 40.90 40.90 40.90 40.91 40.91 40.91 40.91
## 23 comps 24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
## X 90.37 92.79 93.90 95.58 96.89 97.89 98.51
## PH 40.91 40.91 40.91 40.91 40.91 40.91 40.91
## 30 comps
## X 100.00
## PH 40.91
# Using validationplot to check RMSEP across components
validationplot(pls_model, val.type = "RMSEP", main = "PLS: CV RMSEP by Number of Components")
# This plot helps visualize how RMSEP changes as we increase the number of components.
validationplot(pls_model, val.type = "R2", main = "PLS: CV R2 by Number of Components")
# Check R² via R2() function and RMSEP via RMSEP() function
r2_values_pls <- R2(pls_model, estimate = "CV")
rmsep_values_pls <- RMSEP(pls_model, estimate = "CV")
r2_values <- r2_values_pls$val["CV", "PH", ]
r2_comps <- r2_values[-1]
rmse_values <- rmsep_values_pls$val["CV", "PH", ]
rmsep_comps <- rmse_values[-1]
# Identify the best number of components based on minimum RMSEP
opt_comp_rmsep <- which.min(rmsep_comps)
min_rmsep <- rmsep_comps[opt_comp_rmsep]
# Identify the best number of components based on max R²
opt_comp_r2 <- which.max(r2_comps)
max_r2 <- r2_comps[opt_comp_r2]
cat("\nOptimal number of components based on RMSEP:", opt_comp_rmsep, "with RMSEP =", min_rmsep, "\n")
##
## Optimal number of components based on RMSEP: 6 with RMSEP = 0.9111578
cat("Optimal number of components based on R²:", opt_comp_r2, "with R² =", max_r2, "\n")
## Optimal number of components based on R²: 6 with R² = 0.1688543
# Choose the number of components; in practice this balances minimal RMSEP
# against model complexity. Here we use the RMSEP-optimal value.
final_ncomp <- opt_comp_rmsep
cat("\nFinal chosen number of components:", final_ncomp, "\n")
##
## Final chosen number of components: 6
# Fit a final PLS model with the chosen number of components and evaluate it
pls_model_final <- plsr(PH ~ .,
data = student_train_x2,
scale = TRUE,
ncomp = final_ncomp,
validation = "none") # no need for CV here since we fixed ncomp
# Summary of the final model
summary(pls_model_final)
## Data: X dimension: 2571 30
## Y dimension: 2571 1
## Fit method: kernelpls
## Number of components considered: 6
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 16.76 21.44 31.84 41.84 47.84 50.65
## PH 22.63 34.56 36.74 37.89 38.94 39.80
preds_pls <- predict(pls_model_final, newdata = student_eval_y2, ncomp = final_ncomp)
results_test <- postResample(preds_pls, student_eval_y2$PH)
cat("\nTest set performance with", final_ncomp, "components:\n")
##
## Test set performance with 6 components:
print(results_test)
## RMSE Rsquared MAE
## 0.5593341 0.4342728 0.4397250
X-Loadings: Indicate how the original predictors are combined linearly to form the PLS components. A higher absolute loading value means the predictor strongly influences that component.
# Extract the X-loadings and the X-loading weights; the full printouts are
# suppressed for brevity
x_loadings <- pls_model_final$loadings
# print(x_loadings)
x_loading_weights <- pls_model_final$loading.weights
# print(x_loading_weights)
# Convert to data frames for easier handling
x_loadings_matrix <- unclass(pls_model_final$loadings)
x_loadings_df <- as.data.frame(x_loadings_matrix)
x_loadings_weights_matrix <- unclass(pls_model_final$loading.weights)
x_loading_weights_df <- as.data.frame(x_loadings_weights_matrix)
# Rename columns for clarity: each column corresponds to a component
colnames(x_loadings_df) <- paste0("Comp", 1:ncol(x_loadings_df))
colnames(x_loading_weights_df) <- paste0("Comp", 1:ncol(x_loading_weights_df))
# Let's also see the coefficients for each number of components
coefficients_array <- pls_model_final$coefficients
# `coefficients_array` is a multidimensional array: [predictor, response, component]
# For a single response model, we can simplify it:
coefficients_matrix <- coefficients_array[,1,] # Extract for the single response variable
colnames(coefficients_matrix) <- paste0("Comp", 1:ncol(coefficients_matrix))
cat("\nCoefficients for each component:\n")
##
## Coefficients for each component:
#print(coefficients_matrix)
# Visualizing Loadings
# As with PCA, you can plot the loadings to see which predictors have strong influence on each component.
library(reshape2)
library(ggplot2)
# Attach the variable names, then melt the loadings data frame for plotting
x_loadings_df$Variable <- rownames(x_loadings_df)
x_loadings_long <- melt(x_loadings_df, id.vars = "Variable",
                        variable.name = "Component", value.name = "Loading")
ggplot(x_loadings_long, aes(x = reorder(Variable, Loading), y = Loading, fill = Component)) +
geom_bar(stat="identity", position="dodge") +
coord_flip() +
facet_wrap(~ Component, scales = "free_y") +
theme_minimal() +
labs(title = "PLS X-Loadings by Component", x = "Predictors", y = "Loading")
Coefficients: Show how each predictor contributes to predicting the response variable when using a given number of components. Higher absolute coefficients indicate greater influence of a predictor on the predicted outcome.
# Extract the coefficients matrix from the PLS model
coefficients_array <- pls_model_final$coefficients
coefficients_matrix <- coefficients_array[, 1, ] # For a single-response model
colnames(coefficients_matrix) <- paste0("Comp", 1:ncol(coefficients_matrix))
# Convert to a data frame
coefficients_df <- as.data.frame(coefficients_matrix)
coefficients_df$Variable <- rownames(coefficients_df)
# Reshape to long format for plotting
library(reshape2)
coefficients_long <- melt(coefficients_df, id.vars = "Variable",
variable.name = "Component", value.name = "Coefficient")
# Plot using ggplot2, faceting by Component
library(ggplot2)
ggplot(coefficients_long, aes(x = reorder(Variable, Coefficient), y = Coefficient, fill = Component)) +
geom_bar(stat="identity", position="dodge") +
coord_flip() +
facet_wrap(~ Component, scales = "free_y") +
theme_minimal() +
labs(title = "PLS Coefficients by Component",
x = "Predictor Variables",
y = "Coefficient") +
theme(legend.position = "none")
The good thing about using PLS is that multicollinearity is addressed alongside predictive power. The components, or latent variables, are chosen to maximize the covariance between the predictor scores and the response, and each component is orthogonal to the ones before it, so correlation between components is eliminated.
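That orthogonality is easy to verify from the fitted model's score matrix (a quick check, illustrative):
# Cross-products of the X-scores: off-diagonal entries should be ~0
S <- scores(pls_model_final)
round(crossprod(S)[1:3, 1:3], 4)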
Each predictor is also weighted differently in each component based on its effect on the target variable. A natural next step is to identify predictors with consistently low coefficients and loadings and remove them; this could improve the performance of the model.
PT: coefficients stay around ±0.001 to ±0.007 across all components, never exceeding about 0.0075 in absolute value; these are very small relative to other variables, whose coefficients fall in the 0.05–0.4 range. Its loadings are near zero on every component.
Air.Pressurer: coefficients run from about -0.003 to -0.013 across all components, also quite small compared with the more influential variables. Its only notable loading is about -0.114, which is modest; the most important variables load above ±0.2–0.3 on at least one component.
Carb.Temp: coefficients range from roughly 0.0059 to 0.0158, slightly larger than PT or Air.Pressurer but still small next to variables with coefficients above 0.05 or even 0.1. It shows no noticeable loading on any component, suggesting it does not significantly influence the latent structure.
library(pls)
# Remove the chosen predictors from the dataset
predictors_to_remove <- c("PT", "Air.Pressurer", "Carb.Temp")
student_train_reduced <- student_train_x2[ , !(names(student_train_x2) %in% predictors_to_remove)]
# Fit a PLS model again with cross-validation
set.seed(123)
pls_model_reduced <- plsr(PH ~ .,
data = student_train_reduced,
scale = TRUE,
validation = "CV")
# Check summary
summary(pls_model_reduced)
## Data: X dimension: 2571 27
## Y dimension: 2571 1
## Fit method: kernelpls
## Number of components considered: 27
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.9998 0.8815 0.8157 0.8027 0.7964 0.7906 0.7858
## adjCV 0.9998 0.8815 0.8153 0.8024 0.7960 0.7901 0.7852
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 0.7819 0.7806 0.7799 0.7793 0.7791 0.7790 0.7791
## adjCV 0.7813 0.7801 0.7794 0.7788 0.7786 0.7785 0.7786
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 0.7790 0.7790 0.7790 0.7789 0.7790 0.7790 0.7790
## adjCV 0.7784 0.7784 0.7784 0.7784 0.7784 0.7784 0.7784
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 0.7790 0.7790 0.7790 0.7790 0.7790 0.7790 0.7790
## adjCV 0.7784 0.7784 0.7784 0.7784 0.7784 0.7784 0.7784
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 18.62 23.82 35.37 46.40 53.02 56.15 58.19 62.01
## PH 22.61 34.54 36.73 37.88 38.93 39.78 40.40 40.60
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 65.07 66.83 69.00 72.50 74.75 76.60 79.41
## PH 40.72 40.80 40.83 40.84 40.85 40.85 40.85
## 16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps
## X 80.73 82.77 84.62 86.89 89.27 90.91 92.52
## PH 40.85 40.85 40.85 40.85 40.85 40.85 40.85
## 23 comps 24 comps 25 comps 26 comps 27 comps
## X 93.75 95.61 97.40 98.30 100.00
## PH 40.85 40.85 40.85 40.85 40.85
# Extract cross-validated R² and RMSEP after removal; the optimal values
# are identified below
r2_values_reduced <- R2(pls_model_reduced, estimate = "CV")
rmsep_values_reduced <- RMSEP(pls_model_reduced, estimate = "CV")
# Visualize MSEP to see if there's an improvement
validationplot(pls_model_reduced, val.type = "MSEP", main = "PLS with Reduced Predictors: CV MSEP by #Components")
# Identify optimal number of components in the reduced model based on RMSEP
r2_values_reduced_mod <- r2_values_reduced$val["CV", "PH", ]
r2_comps_reduced <- r2_values_reduced_mod[-1]
rmse_values_reduced_mod <- rmsep_values_reduced$val["CV", "PH", ]
rmsep_comps_reduced <- rmse_values_reduced_mod[-1]
opt_comp_rmsep_reduced <- which.min(rmsep_comps_reduced)
min_rmsep_reduced <- rmsep_comps_reduced[opt_comp_rmsep_reduced]
opt_comp_r2_reduced <- which.max(r2_comps_reduced)
max_r2_reduced <- r2_comps_reduced[opt_comp_r2_reduced]
cat("\nAfter predictor removal:\n")
##
## After predictor removal:
cat("Optimal #Components by RMSEP:", opt_comp_rmsep_reduced, "with RMSEP =", min_rmsep_reduced, "\n")
## Optimal #Components by RMSEP: 17 with RMSEP = 0.7789427
cat("Optimal #Components by R²:", opt_comp_r2_reduced, "with R² =", max_r2_reduced, "\n")
## Optimal #Components by R²: 17 with R² = 0.3925633
# Fit the final model with the chosen number of components
final_pls_model <- plsr(PH ~ .,
data = student_train_reduced,
scale = TRUE,
ncomp = opt_comp_rmsep_reduced,
validation = "none")
# Check summary
summary(final_pls_model)
## Data: X dimension: 2571 27
## Y dimension: 2571 1
## Fit method: kernelpls
## Number of components considered: 17
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 18.62 23.82 35.37 46.40 53.02 56.15 58.19 62.01
## PH 22.61 34.54 36.73 37.88 38.93 39.78 40.40 40.60
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 65.07 66.83 69.00 72.50 74.75 76.60 79.41
## PH 40.72 40.80 40.83 40.84 40.85 40.85 40.85
## 16 comps 17 comps
## X 80.73 82.77
## PH 40.85 40.85
# Predict on the test set
preds_final <- predict(final_pls_model, newdata = student_eval_y2, ncomp = opt_comp_rmsep_reduced)
test_performance <- postResample(preds_final, student_eval_y2$PH)
cat("\nTest set performance (RMSE, R2, MAE):\n")
##
## Test set performance (RMSE, R2, MAE):
print(test_performance)
## RMSE Rsquared MAE
## 0.5562690 0.4452060 0.4302488
There was a small lift in test-set R² after removing some of the less relevant predictors:

Model | Test R² |
---|---|
PCR (5 components) | 0.2832964 |
PLS (6 components) | 0.4342728 |
PLS with predictors removed (17 components) | 0.4452060 |
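For completeness, the three sets of test metrics computed above can be collected into a single table (a sketch using the objects test_res, results_test, and test_performance defined earlier):
# Combine the postResample outputs into one comparison table
comparison <- rbind("PCR (5 comps)" = test_res,
                    "PLS (6 comps)" = results_test,
                    "PLS reduced (17 comps)" = test_performance)
kable(comparison)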