Data

Here we import our training and test data, student_train and student_eval, check for missing values, and take additional exploratory steps.


Data Acquisition

Here we can preview the data structure:

student_train = read.csv('https://raw.githubusercontent.com/deepasharma06/Data-624/refs/heads/main/StudentData_training.csv')

student_eval = read.csv('https://raw.githubusercontent.com/deepasharma06/Data-624/refs/heads/main/StudentEvaluation_test.csv')

head(student_train) %>% kable()
Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow Density MFR Balling Pressure.Vacuum PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
B 5.340000 23.96667 0.2633333 68.2 141.2 0.104 0.26 0.04 -100 118.8 46.0 0 NA NA 118 121.2 4002 66.0 16.18 2932 0.88 725.0 1.398 -4.0 8.36 0.022 120 46.4 142.6 6.58 5.32 1.48
A 5.426667 24.00667 0.2386667 68.4 139.6 0.124 0.22 0.04 -100 121.6 46.0 0 NA NA 106 118.6 3986 67.6 19.90 3144 0.92 726.8 1.498 -4.0 8.26 0.026 120 46.8 143.0 6.56 5.30 1.56
B 5.286667 24.06000 0.2633333 70.8 144.8 0.090 0.34 0.16 -100 120.2 46.0 0 NA NA 82 120.0 4020 67.0 17.76 2914 1.58 735.0 3.142 -3.8 8.94 0.024 120 46.6 142.0 7.66 5.84 3.28
A 5.440000 24.00667 0.2933333 63.0 132.6 NA 0.42 0.04 -100 115.2 46.4 0 0 0 92 117.8 4012 65.6 17.42 3062 1.54 730.6 3.042 -4.4 8.24 0.030 120 46.0 146.2 7.14 5.42 3.04
A 5.486667 24.31333 0.1113333 67.2 136.8 0.026 0.16 0.12 -100 118.4 45.8 0 0 0 92 118.6 4010 65.6 17.68 3054 1.54 722.8 3.042 -4.4 8.26 0.030 120 46.0 146.2 7.14 5.44 3.04
A 5.380000 23.92667 0.2693333 66.6 138.4 0.090 0.24 0.04 -100 119.6 45.6 0 0 0 116 120.2 4014 66.2 23.82 2948 1.52 738.8 2.992 -4.4 8.32 0.024 120 46.0 146.6 7.16 5.44 3.02
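Before imputing, it is worth quantifying the missingness. A minimal sketch using base R (output omitted here):

# Count missing values per column and show only the columns that have any
na_counts <- colSums(is.na(student_train))
sort(na_counts[na_counts > 0], decreasing = TRUE)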
# Build an imputation model using kNN. Note that caret's knnImpute also
# centers and scales every numeric column (including PH), so the imputed
# data come out standardized.
imputeModel <- preProcess(student_train, method = c("knnImpute"))

# Impute the missing values for the training and test data
student_train <- predict(imputeModel, student_train)
student_eval <- predict(imputeModel, student_eval)

# There are now zero NA values in our train and test data
sum(is.na(student_train))
## [1] 0
sum(is.na(student_eval))
## [1] 0
# Remove problematic predictors from train and test data
student_train_x <- subset(student_train, select = -c(Hyd.Pressure1))
student_eval_y <-  subset(student_eval, select =  -c(Hyd.Pressure1))
# Replacing Brand.Code with BCB
student_train_x1 <- student_train_x |> mutate(BCB = as.numeric(Brand.Code =='B')) |> select(-Brand.Code)
student_eval_y1 <- student_eval_y |> mutate(BCB = as.numeric(Brand.Code =='B')) |> select(-Brand.Code)
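Collapsing the four-level Brand.Code factor to a single indicator for brand B is a deliberate simplification; a quick frequency check (a sketch, output not shown) confirms how the brands are distributed before committing to it:

# Tabulate brand frequencies to sanity-check the B-vs-rest encoding
table(student_train_x$Brand.Code)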

# Fit a linear model on the predictor set that still includes Brand.Code,
# so we can assess multicollinearity via VIF
model <- lm(PH ~ ., data = student_train_x)
#summary(model)

# Calculate variance inflation factors (vif() comes from the car package)
vif_values <- vif(model)
vif_values
##                        GVIF Df GVIF^(1/(2*Df))
## Brand.Code        53.942578  4        1.646234
## Carb.Volume        7.872617  1        2.805818
## Fill.Ounces        1.151544  1        1.073100
## PC.Volume          1.500187  1        1.224821
## Carb.Pressure     15.485772  1        3.935197
## Carb.Temp         12.827035  1        3.581485
## PSC                1.152323  1        1.073463
## PSC.Fill           1.102569  1        1.050033
## PSC.CO2            1.068812  1        1.033834
## Mnf.Flow           4.382860  1        2.093528
## Carb.Pressure1     1.579213  1        1.256668
## Fill.Pressure      2.180153  1        1.476534
## Hyd.Pressure2      8.964442  1        2.994068
## Hyd.Pressure3     12.433205  1        3.526075
## Hyd.Pressure4      2.368986  1        1.539151
## Filler.Level      10.684198  1        3.268669
## Filler.Speed       4.897880  1        2.213116
## Temperature        1.455792  1        1.206562
## Usage.cont         1.684034  1        1.297703
## Carb.Flow          2.168168  1        1.472470
## Density           16.274585  1        4.034177
## MFR                4.048062  1        2.011980
## Balling           65.945084  1        8.120658
## Pressure.Vacuum    2.582057  1        1.606878
## Oxygen.Filler      1.494535  1        1.222512
## Bowl.Setpoint     11.085276  1        3.329456
## Pressure.Setpoint  2.281597  1        1.510495
## Air.Pressurer      1.195910  1        1.093577
## Alch.Rel          16.848866  1        4.104737
## Carb.Rel           5.209240  1        2.282376
## Balling.Lvl       50.180230  1        7.083801
# Combine the highly collinear Balling and Balling.Lvl (VIFs of roughly 66
# and 50 above) into a single ratio predictor PT
student_train_x2 <- student_train_x1 |> mutate(PT = Balling.Lvl/Balling) |> select(-c(Balling, Balling.Lvl))
student_eval_y2 <- student_eval_y1 |> mutate(PT = Balling.Lvl/Balling) |> select(-c(Balling, Balling.Lvl))
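To confirm the transformation helped, we can refit the linear model on the transformed predictors and recheck the VIFs. A hedged sketch (assumes car::vif as above; output not shown):

# Refit on the BCB/PT data and list the largest remaining VIFs
model_pt <- lm(PH ~ ., data = student_train_x2)
head(sort(vif(model_pt), decreasing = TRUE), 5)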

Now I will use PCR/PLS for further feature reduction.

  1. Fit PCR and analyze (R²)
  2. Fit PLS, find the optimal number of components using RMSEP, refit with that number, and analyze (R²)
# Separate predictors and response
X_train <- student_train_x2 %>% select(-PH)
y_train <- student_train_x2$PH

X_test <- student_eval_y2 %>% select(-PH)
y_test <- student_eval_y2$PH

# Scale the training and test sets
# We'll use caret's preProcess for scaling
preProcValues <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preProcValues, X_train)
X_test_scaled  <- predict(preProcValues, X_test)

# Perform PCA on the scaled training predictors
pca_result <- prcomp(X_train_scaled, center = FALSE, scale. = FALSE)

# Summary of PCA variance
summary(pca_result)
## Importance of components:
##                           PC1    PC2     PC3     PC4    PC5     PC6     PC7
## Standard deviation     2.2811 2.1735 1.68172 1.32321 1.2562 1.19853 1.16009
## Proportion of Variance 0.1734 0.1575 0.09427 0.05836 0.0526 0.04788 0.04486
## Cumulative Proportion  0.1734 0.3309 0.42520 0.48356 0.5362 0.58404 0.62890
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     1.04011 1.01262 0.99666 0.94693 0.91894 0.87965 0.86579
## Proportion of Variance 0.03606 0.03418 0.03311 0.02989 0.02815 0.02579 0.02499
## Cumulative Proportion  0.66496 0.69914 0.73225 0.76214 0.79029 0.81608 0.84107
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.84955 0.77508 0.71546 0.69902 0.68471 0.64256 0.62167
## Proportion of Variance 0.02406 0.02002 0.01706 0.01629 0.01563 0.01376 0.01288
## Cumulative Proportion  0.86513 0.88515 0.90222 0.91850 0.93413 0.94789 0.96078
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.52727 0.47220 0.4414 0.37281 0.35244 0.29545 0.22780
## Proportion of Variance 0.00927 0.00743 0.0065 0.00463 0.00414 0.00291 0.00173
## Cumulative Proportion  0.97004 0.97748 0.9840 0.98861 0.99275 0.99566 0.99738
##                           PC29    PC30
## Standard deviation     0.21788 0.17601
## Proportion of Variance 0.00158 0.00103
## Cumulative Proportion  0.99897 1.00000

The principal components are chosen to capture maximum variability; we can open up PC1 and PC2 to see which variables contribute the most and get a sense of how PCR would reduce multicollinearity:

# Contributions to PC1 (fviz_contrib() comes from the factoextra package)
fviz_contrib(pca_result, choice = "var", axes = 1, top = 10) +
  labs(title = "Contributions of Variables to PC1")

# Contributions to PC2
fviz_contrib(pca_result, choice = "var", axes = 2, top = 10) +
  labs(title = "Contributions of Variables to PC2")

On to PCR:

library(pls)

# Set a seed for reproducibility
set.seed(123)

# Using the already processed data 'student_train_x2'
# Ensure that PH is the response and the rest are predictors
pcr_model <- pcr(PH ~ ., data = student_train_x2, scale = TRUE,  validation = "CV")   

# Summary of the PCR model
summary(pcr_model)
## Data:    X dimension: 2571 30 
##  Y dimension: 2571 1
## Fit method: svdpc
## Number of components considered: 30
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV          0.9998   0.9577   0.9589   0.9506   0.9499   0.9461   0.9611
## adjCV       0.9998   0.9542   0.9546   0.9467   0.9461   0.9427   0.9564
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV      0.9508    1.013     1.13     1.849     1.840     1.143     1.280
## adjCV   0.9459    1.003     1.11     1.778     1.767     1.120     1.244
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## CV        1.253     1.158     1.279     1.183     1.025     1.029    0.9195
## adjCV     1.219     1.131     1.242     1.152     1.008     1.011    0.9108
##        21 comps  22 comps  23 comps  24 comps  25 comps  26 comps  27 comps
## CV       0.9870    0.9840    1.0152     1.021     1.035     1.016    0.9434
## adjCV    0.9703    0.9676    0.9958     1.001     1.013     0.996    0.9284
##        28 comps  29 comps  30 comps
## CV       1.0018    1.0000    0.9993
## adjCV    0.9815    0.9797    0.9791
## 
## TRAINING: % variance explained
##     1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     17.35    33.09    42.52    48.36    53.62    58.40    62.89    66.50
## PH    15.05    16.53    17.27    17.29    17.32    17.33    19.34    19.34
##     9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X     69.91     73.23     76.21     79.03     81.61     84.11     86.51
## PH    19.62     19.76     23.33     23.41     26.33     27.28     28.56
##     16 comps  17 comps  18 comps  19 comps  20 comps  21 comps  22 comps
## X      88.52     90.22     91.85     93.41     94.79     96.08      97.0
## PH     29.86     31.45     31.57     32.82     32.87     35.79      35.8
##     23 comps  24 comps  25 comps  26 comps  27 comps  28 comps  29 comps
## X      97.75     98.40     98.86     99.27     99.57     99.74     99.90
## PH     36.74     37.65     37.80     38.05     39.93     40.80     40.89
##     30 comps
## X     100.00
## PH     40.91
# Plot the cross-validation results using validationplot()

# val.type options: "MSEP", "RMSEP", "R2"
validationplot(pcr_model, val.type = "RMSEP", main = "PCR Cross-Validation (RMSEP)")

# Print RMSEP for each number of components
rmsep_values <- RMSEP(pcr_model, estimate = "CV")
rmsep_values
## (Intercept)      1 comps      2 comps      3 comps      4 comps      5 comps  
##      0.9998       0.9577       0.9589       0.9506       0.9499       0.9461  
##     6 comps      7 comps      8 comps      9 comps     10 comps     11 comps  
##      0.9611       0.9508       1.0134       1.1304       1.8490       1.8405  
##    12 comps     13 comps     14 comps     15 comps     16 comps     17 comps  
##      1.1432       1.2803       1.2528       1.1584       1.2790       1.1828  
##    18 comps     19 comps     20 comps     21 comps     22 comps     23 comps  
##      1.0253       1.0294       0.9195       0.9870       0.9840       1.0152  
##    24 comps     25 comps     26 comps     27 comps     28 comps     29 comps  
##      1.0210       1.0348       1.0162       0.9434       1.0018       1.0000  
##    30 comps  
##      0.9993
# Also check R² to see how well the model explains the variance
validationplot(pcr_model, val.type = "R2", main = "PCR Cross-Validation (R2)")

# Print out R2 for each number of components
r2_values <- R2(pcr_model, estimate = "CV")
r2_values
## (Intercept)      1 comps      2 comps      3 comps      4 comps      5 comps  
##  -0.0007784    0.0817204    0.0795193    0.0953255    0.0966418    0.1037929  
##     6 comps      7 comps      8 comps      9 comps     10 comps     11 comps  
##   0.0753052    0.0948758   -0.0280725   -0.2791882   -2.4227091   -2.3911300  
##    12 comps     13 comps     14 comps     15 comps     16 comps     17 comps  
##  -0.3082899   -0.6410022   -0.5713197   -0.3432995   -0.6376337   -0.4005158  
##    18 comps     19 comps     20 comps     21 comps     22 comps     23 comps  
##  -0.0524146   -0.0608081    0.1535729    0.0247478    0.0307351   -0.0317707  
##    24 comps     25 comps     26 comps     27 comps     28 comps     29 comps  
##  -0.0436857   -0.0720937   -0.0337501    0.1089018   -0.0047271   -0.0010823  
##    30 comps  
##   0.0002551

It looks like 5 components gives us both the highest R² and lowest RMSEP before the model's performance drops sharply. Performance does improve again at 20 components (RMSEP 0.9195, R² 0.154), but for the sake of a simpler model we will stick with 5.

library(pls)

# Set a seed for reproducibility
set.seed(123)

# Fit the PCR model with exactly 5 components
pcr_model_5 <- pcr(PH ~ ., 
                   data = student_train_x2, 
                   scale = TRUE, 
                   ncomp = 5, 
                   validation = "CV")

# Print summary of the model
summary(pcr_model_5)
## Data:    X dimension: 2571 30 
##  Y dimension: 2571 1
## Fit method: svdpc
## Number of components considered: 5
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps
## CV          0.9998   0.9577   0.9589   0.9506   0.9499   0.9461
## adjCV       0.9998   0.9542   0.9546   0.9467   0.9461   0.9427
## 
## TRAINING: % variance explained
##     1 comps  2 comps  3 comps  4 comps  5 comps
## X     17.35    33.09    42.52    48.36    53.62
## PH    15.05    16.53    17.27    17.29    17.32
# Extract R2 and RMSEP for cross-validation
r2_5 <- R2(pcr_model_5, estimate = "CV")
rmsep_5 <- RMSEP(pcr_model_5, estimate = "CV")


# Predict on the test set (student_eval_y2) using 5 components

predictions_5 <- predict(pcr_model_5, newdata = student_eval_y2, ncomp = 5)
  
# Calculate prediction metrics on the test set
test_res <- postResample(predictions_5, student_eval_y2$PH)
cat("\nTest set performance (using 5 components):\n")
## 
## Test set performance (using 5 components):
print(test_res)
##      RMSE  Rsquared       MAE 
## 0.6155150 0.2832964 0.5087567

The results with PCR are lackluster; on to PLS:

library(pls)

set.seed(123) # For reproducibility

# 1. Make the PLS model using cross-validation
pls_model <- plsr(PH ~ ., 
                  data = student_train_x2,
                  scale = TRUE,          # Scale predictors
                  validation = "CV")     # Use cross-validation

# Print a summary of the model to see initial info
summary(pls_model)
## Data:    X dimension: 2571 30 
##  Y dimension: 2571 1
## Fit method: kernelpls
## Number of components considered: 30
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV          0.9998   0.9918    1.201    1.191    1.084   0.9236   0.9112
## adjCV       0.9998   0.9813    1.168    1.158    1.059   0.9108   0.8990
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       1.094   0.9879   0.9589    0.9967    1.0059    0.9694    0.9848
## adjCV    1.066   0.9688   0.9421    0.9767    0.9852    0.9517    0.9658
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## CV       1.0151    1.0006    0.9967    0.9999    0.9993    1.0001    1.0004
## adjCV    0.9936    0.9803    0.9767    0.9797    0.9791    0.9798    0.9801
##        21 comps  22 comps  23 comps  24 comps  25 comps  26 comps  27 comps
## CV       0.9988    0.9991    0.9994    0.9992    0.9993    0.9992    0.9993
## adjCV    0.9787    0.9789    0.9792    0.9790    0.9791    0.9790    0.9791
##        28 comps  29 comps  30 comps
## CV       0.9993    0.9993    0.9993
## adjCV    0.9791    0.9791    0.9791
## 
## TRAINING: % variance explained
##     1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     16.76    21.44    31.84    41.84    47.84    50.65    52.86    56.85
## PH    22.63    34.56    36.74    37.89    38.94    39.80    40.40    40.60
##     9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X     59.57     62.18     65.80     68.10     70.52      72.8     75.89
## PH    40.75     40.83     40.86     40.88     40.89      40.9     40.90
##     16 comps  17 comps  18 comps  19 comps  20 comps  21 comps  22 comps
## X      77.86     79.26     81.43     83.71     85.10     87.22     89.29
## PH     40.90     40.90     40.90     40.91     40.91     40.91     40.91
##     23 comps  24 comps  25 comps  26 comps  27 comps  28 comps  29 comps
## X      90.37     92.79     93.90     95.58     96.89     97.89     98.51
## PH     40.91     40.91     40.91     40.91     40.91     40.91     40.91
##     30 comps
## X     100.00
## PH     40.91
# Using validationplot to check RMSEP across components
validationplot(pls_model, val.type = "RMSEP", main = "PLS: CV RMSEP by Number of Components")

# The RMSEP plot above shows how error changes as components are added;
# now the corresponding R² view
validationplot(pls_model, val.type = "R2", main = "PLS: CV R2 by Number of Components")

# Check R² via R2() function and RMSEP via RMSEP() function
r2_values_pls <- R2(pls_model, estimate = "CV")
rmsep_values_pls <- RMSEP(pls_model, estimate = "CV")
r2_values <- r2_values_pls$val["CV", "PH", ]
r2_comps <- r2_values[-1]

rmse_values <- rmsep_values_pls$val["CV", "PH", ]
rmsep_comps <- rmse_values[-1]
# Identify the best number of components based on minimum RMSEP
opt_comp_rmsep <- which.min(rmsep_comps)
min_rmsep <- rmsep_comps[opt_comp_rmsep]

# Identify the best number of components based on max R²
opt_comp_r2 <- which.max(r2_comps)
max_r2 <- r2_comps[opt_comp_r2]

cat("\nOptimal number of components based on RMSEP:", opt_comp_rmsep, "with RMSEP =", min_rmsep, "\n")
## 
## Optimal number of components based on RMSEP: 6 with RMSEP = 0.9111578
cat("Optimal number of components based on R²:", opt_comp_r2, "with R² =", max_r2, "\n")
## Optimal number of components based on R²: 6 with R² = 0.1688543
# Let's choose the optimal number of components. In practice, you might consider a balance 
# between minimal RMSEP and complexity. Here, let's use the one chosen by RMSEP for demonstration.
final_ncomp <- opt_comp_rmsep
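
# An alternative, more conservative choice: the pls package's selectNcomp()
# implements a one-standard-error rule and tends to pick fewer components.
# Left commented out so it does not alter the results below:
# final_ncomp <- selectNcomp(pls_model, method = "onesigma", plot = FALSE)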

cat("\nFinal chosen number of components:", final_ncomp, "\n")
## 
## Final chosen number of components: 6
# 4. Make a final PLS model with the chosen number of components and evaluate it
pls_model_final <- plsr(PH ~ ., 
                        data = student_train_x2,
                        scale = TRUE,
                        ncomp = final_ncomp,
                        validation = "none") # no need for CV here since we fixed ncomp

# Summary of the final model
summary(pls_model_final)
## Data:    X dimension: 2571 30 
##  Y dimension: 2571 1
## Fit method: kernelpls
## Number of components considered: 6
## TRAINING: % variance explained
##     1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## X     16.76    21.44    31.84    41.84    47.84    50.65
## PH    22.63    34.56    36.74    37.89    38.94    39.80
preds_pls <- predict(pls_model_final, newdata = student_eval_y2, ncomp = final_ncomp)
  
results_test <- postResample(preds_pls, student_eval_y2$PH)
cat("\nTest set performance with", final_ncomp, "components:\n")
## 
## Test set performance with 6 components:
print(results_test)
##      RMSE  Rsquared       MAE 
## 0.5593341 0.4342728 0.4397250

X-Loadings: Indicate how the original predictors are combined linearly to form the PLS components. A higher absolute loading value means the predictor strongly influences that component.
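
For example, to pull the strongest contributors to the first component (a minimal sketch using the loadings stored on the model; output not shown):

# The five predictors with the largest absolute loading on component 1
loadings_mat <- unclass(pls_model_final$loadings)
head(sort(abs(loadings_mat[, 1]), decreasing = TRUE), 5)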

# View the X-loadings
x_loadings <- pls_model_final$loadings
cat("X-loadings:\n")
## X-loadings:
# print(x_loadings)

# View the X-loading weights
x_loading_weights <- pls_model_final$loading.weights
cat("\nX-loading weights:\n")
## 
## X-loading weights:
# print(x_loading_weights)

# Convert to data frames for easier handling
x_loadings_matrix <- unclass(pls_model_final$loadings)
x_loadings_df <- as.data.frame(x_loadings_matrix)

x_loadings_weights_matrix <- unclass(pls_model_final$loading.weights)
x_loading_weights_df <- as.data.frame(x_loadings_weights_matrix)

# Rename columns for clarity: each column corresponds to a component
colnames(x_loadings_df) <- paste0("Comp", 1:ncol(x_loadings_df))
colnames(x_loading_weights_df) <- paste0("Comp", 1:ncol(x_loading_weights_df))

# Let's also see the coefficients for each number of components
coefficients_array <- pls_model_final$coefficients
# `coefficients_array` is a multidimensional array: [predictor, response, component]
# For a single response model, we can simplify it:
coefficients_matrix <- coefficients_array[,1,] # Extract for the single response variable
colnames(coefficients_matrix) <- paste0("Comp", 1:ncol(coefficients_matrix))

cat("\nCoefficients for each component:\n")
## 
## Coefficients for each component:
#print(coefficients_matrix)

# Visualizing Loadings
# As with PCA, you can plot the loadings to see which predictors have strong influence on each component.
library(reshape2)
library(ggplot2)

# Add the variable names, then melt the loadings data frame for plotting
x_loadings_df$Variable <- rownames(x_loadings_df)
x_loadings_long <- melt(x_loadings_df, id.vars = "Variable",
                        variable.name = "Component", value.name = "Loading")

ggplot(x_loadings_long, aes(x = reorder(Variable, Loading), y = Loading, fill = Component)) +
  geom_bar(stat="identity", position="dodge") +
  coord_flip() +
  facet_wrap(~ Component, scales = "free_y") +
  theme_minimal() +
  labs(title = "PLS X-Loadings by Component", x = "Predictors", y = "Loading")

Coefficients: Show how each predictor contributes to predicting the response variable when using a given number of components. Higher absolute coefficients indicate greater influence of a predictor on the predicted outcome.
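
As a quick numeric complement to the plot, we can rank predictors by absolute coefficient at the chosen number of components (a sketch using coef(), the pls extractor; output not shown):

# Largest absolute coefficients in the 6-component fit
b6 <- drop(coef(pls_model_final, ncomp = 6))
head(sort(abs(b6), decreasing = TRUE), 5)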

# Extract the coefficients matrix from the PLS model
coefficients_array <- pls_model_final$coefficients
coefficients_matrix <- coefficients_array[, 1, ] # For a single-response model
colnames(coefficients_matrix) <- paste0("Comp", 1:ncol(coefficients_matrix))

# Convert to a data frame
coefficients_df <- as.data.frame(coefficients_matrix)
coefficients_df$Variable <- rownames(coefficients_df)

# Reshape to long format for plotting
library(reshape2)
coefficients_long <- melt(coefficients_df, id.vars = "Variable", 
                          variable.name = "Component", value.name = "Coefficient")

# Plot using ggplot2, faceting by Component
library(ggplot2)
ggplot(coefficients_long, aes(x = reorder(Variable, Coefficient), y = Coefficient, fill = Component)) +
  geom_bar(stat="identity", position="dodge") +
  coord_flip() +
  facet_wrap(~ Component, scales = "free_y") +
  theme_minimal() +
  labs(title = "PLS Coefficients by Component",
       x = "Predictor Variables",
       y = "Coefficient") +
  theme(legend.position = "none")

The good thing about using PLS is that multicollinearity is addressed along with predictive power. The components, or latent variables, are chosen to maximize the covariance between a linear combination of the predictors and the response, and each component is orthogonal to the ones prior, so the correlation between components is eliminated.
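
We can check the orthogonality claim directly on the fitted model. A minimal sketch using scores(), the pls extractor (output not shown): the cross-product of the score matrix should be diagonal, with near-zero off-diagonal entries.

# Near-zero off-diagonal entries => mutually orthogonal components
T_scores <- unclass(scores(pls_model_final))
round(crossprod(T_scores), 4)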

Each predictor is also weighted differently in each component it appears in, based on its effect on the target variable. A natural next step is to look at which predictors have consistently low coefficients/loadings and remove them; this could improve the performance of the model (a ranking sketch follows the breakdown below).

PT: Coefficients: Around ±0.001 to ±0.007 across all components, never exceeding about 0.0075 in absolute value. These are very small relative to other variables that have coefficients in the 0.05–0.4 range. Loadings: PT does not appear to strongly load on any component (not listed or near zero in the given snippet).

Air.Pressurer: Coefficients: About -0.003 to -0.013 across all components. These values are also quite small compared to other more influential variables. Loadings: Only appears once at about -0.114, which is not large. Most important variables have loadings at least above ±0.2–0.3 somewhere.

Carb.Temp: Coefficients: Ranging around 0.0059 to 0.0158, slightly larger than PT or Air.Pressurer but still relatively small. Some variables show coefficients well above 0.05 or even 0.1. Loadings: Carb.Temp does not appear with a noticeable loading in the snippet (it’s blank), suggesting it may not be significantly influencing the latent structure.
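
Rather than eyeballing the printed snippets, we can rank predictors programmatically by their peak absolute coefficient across the component fits (a sketch built on the objects above; output not shown):

# Each predictor's largest absolute coefficient across the 1..6-component fits;
# the smallest peaks are candidates for removal
coef_mat <- pls_model_final$coefficients[, 1, ]
peak_influence <- apply(abs(coef_mat), 1, max)
head(sort(peak_influence), 5)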

library(pls)

# Remove the chosen predictors from the dataset
predictors_to_remove <- c("PT", "Air.Pressurer", "Carb.Temp")

student_train_reduced <- student_train_x2[ , !(names(student_train_x2) %in% predictors_to_remove)]

# Fit a PLS model again with cross-validation
set.seed(123)
pls_model_reduced <- plsr(PH ~ ., 
                          data = student_train_reduced,
                          scale = TRUE,
                          validation = "CV")

# Check summary
summary(pls_model_reduced)
## Data:    X dimension: 2571 27 
##  Y dimension: 2571 1
## Fit method: kernelpls
## Number of components considered: 27
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV          0.9998   0.8815   0.8157   0.8027   0.7964   0.7906   0.7858
## adjCV       0.9998   0.8815   0.8153   0.8024   0.7960   0.7901   0.7852
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV      0.7819   0.7806   0.7799    0.7793    0.7791    0.7790    0.7791
## adjCV   0.7813   0.7801   0.7794    0.7788    0.7786    0.7785    0.7786
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## CV       0.7790    0.7790    0.7790    0.7789    0.7790    0.7790    0.7790
## adjCV    0.7784    0.7784    0.7784    0.7784    0.7784    0.7784    0.7784
##        21 comps  22 comps  23 comps  24 comps  25 comps  26 comps  27 comps
## CV       0.7790    0.7790    0.7790    0.7790    0.7790    0.7790    0.7790
## adjCV    0.7784    0.7784    0.7784    0.7784    0.7784    0.7784    0.7784
## 
## TRAINING: % variance explained
##     1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     18.62    23.82    35.37    46.40    53.02    56.15    58.19    62.01
## PH    22.61    34.54    36.73    37.88    38.93    39.78    40.40    40.60
##     9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X     65.07     66.83     69.00     72.50     74.75     76.60     79.41
## PH    40.72     40.80     40.83     40.84     40.85     40.85     40.85
##     16 comps  17 comps  18 comps  19 comps  20 comps  21 comps  22 comps
## X      80.73     82.77     84.62     86.89     89.27     90.91     92.52
## PH     40.85     40.85     40.85     40.85     40.85     40.85     40.85
##     23 comps  24 comps  25 comps  26 comps  27 comps
## X      93.75     95.61     97.40     98.30    100.00
## PH     40.85     40.85     40.85     40.85     40.85
r2_values_reduced <- R2(pls_model_reduced, estimate = "CV")
rmsep_values_reduced <- RMSEP(pls_model_reduced, estimate = "CV")



cat("\nCross-validated R² values after removal:\n")
## 
## Cross-validated R² values after removal:
cat("\nCross-validated RMSEP values after removal:\n")
## 
## Cross-validated RMSEP values after removal:
# Visualize MSEP to see if there's an improvement
validationplot(pls_model_reduced, val.type = "MSEP", main = "PLS with Reduced Predictors: CV MSEP by #Components")

# Identify optimal number of components in the reduced model based on RMSEP

r2_values_reduced_mod <- r2_values_reduced$val["CV", "PH", ]
r2_comps_reduced <- r2_values_reduced_mod[-1]

rmse_values_reduced_mod <- rmsep_values_reduced$val["CV", "PH", ]
rmsep_comps_reduced <- rmse_values_reduced_mod[-1]

opt_comp_rmsep_reduced <- which.min(rmsep_comps_reduced)
min_rmsep_reduced <- rmsep_comps_reduced[opt_comp_rmsep_reduced]
opt_comp_r2_reduced <- which.max(r2_comps_reduced)
max_r2_reduced <- r2_comps_reduced[opt_comp_r2_reduced]

cat("\nAfter predictor removal:\n")
## 
## After predictor removal:
cat("Optimal #Components by RMSEP:", opt_comp_rmsep_reduced, "with RMSEP =", min_rmsep_reduced, "\n")
## Optimal #Components by RMSEP: 17 with RMSEP = 0.7789427
cat("Optimal #Components by R²:", opt_comp_r2_reduced, "with R² =", max_r2_reduced, "\n")
## Optimal #Components by R²: 17 with R² = 0.3925633
# Fit the final model with the chosen number of components
final_pls_model <- plsr(PH ~ ., 
                        data = student_train_reduced,
                        scale = TRUE,
                        ncomp = opt_comp_rmsep_reduced,
                        validation = "none")

# Check summary
summary(final_pls_model)
## Data:    X dimension: 2571 27 
##  Y dimension: 2571 1
## Fit method: kernelpls
## Number of components considered: 17
## TRAINING: % variance explained
##     1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     18.62    23.82    35.37    46.40    53.02    56.15    58.19    62.01
## PH    22.61    34.54    36.73    37.88    38.93    39.78    40.40    40.60
##     9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X     65.07     66.83     69.00     72.50     74.75     76.60     79.41
## PH    40.72     40.80     40.83     40.84     40.85     40.85     40.85
##     16 comps  17 comps
## X      80.73     82.77
## PH     40.85     40.85
# Predict on the test set
preds_final <- predict(final_pls_model, newdata = student_eval_y2, ncomp = opt_comp_rmsep_reduced)
  
test_performance <- postResample(preds_final, student_eval_y2$PH)
cat("\nTest set performance (RMSE, R2, MAE):\n")
## 
## Test set performance (RMSE, R2, MAE):
print(test_performance)
##      RMSE  Rsquared       MAE 
## 0.5562690 0.4452060 0.4302488
# RMSE      0.5562690
# Rsquared  0.4452060
# MAE        0.4302488 

There was a small lift in test-set R² after removing some of the less relevant predictors.

R² on the test set:

  PCR (5 components): 0.2832964
  PLS (6 components): 0.4342728
  PLS with predictors removed (17 components): 0.4452060