6.2
Libraries:
library(AppliedPredictiveModeling) # permeability and ChemicalManufacturingProcess data sets
library(caret)                     # train(), preProcess(), nearZeroVar(), R2(), RMSE()
library(tidyverse)                 # data wrangling (dplyr, tibble, tidyr, ...)
library(corrplot)                  # correlation plot in 6.3f
library(Amelia)                    # missmap() for visualizing missingness
## a.
data(permeability)
dim(permeability)
## [1] 165 1
dim(fingerprints)
## [1] 165 1107
## b.
fingerprints_filtered <- fingerprints[,-nearZeroVar(fingerprints)]
dim(fingerprints_filtered)
## [1] 165 388
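As a quick sanity check, we can count how many near-zero-variance fingerprints the filter dropped; a minimal sketch reusing the same nearZeroVar() call:
# number of sparse fingerprint columns flagged and removed (1107 - 388 = 719)
length(nearZeroVar(fingerprints))
ncol(fingerprints) - ncol(fingerprints_filtered) # equivalent check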
## c.
set.seed(1234)
train <- sample(nrow(fingerprints_filtered), nrow(fingerprints_filtered) * 0.8, replace = FALSE) # 80/20 train/test split
X_train <- fingerprints_filtered[train,]
X_test <- fingerprints_filtered[-train,]
y_train <- permeability[train,]
y_test <- permeability[-train,]
any(is.na(X_train)) | any(is.na(X_test)) # check whether any values are missing
## [1] FALSE
ctrl <- trainControl(method="CV",
number=10)
pls_mod <- train(X_train,
y_train,
method = "pls",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength=20)
plot(pls_mod)
We use 10-fold cross-validation as the resampling technique to select the optimal number of components, and we center and scale the predictors within the train() call.
pls_mod
## Partial Least Squares
##
## 132 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 120, 117, 120, 119, 118, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.96096 0.3822261 10.043384
## 2 11.92929 0.5458639 8.316843
## 3 12.02574 0.5074062 8.833744
## 4 11.98724 0.4997168 8.800333
## 5 11.68597 0.5300868 8.547348
## 6 11.34780 0.5589800 8.307307
## 7 11.48813 0.5472951 8.708420
## 8 11.69585 0.5493433 8.776256
## 9 11.85949 0.5376061 8.928409
## 10 12.20686 0.5175105 9.178848
## 11 12.46616 0.5110085 9.290571
## 12 12.66490 0.5040105 9.409175
## 13 12.72550 0.4967728 9.352964
## 14 12.84813 0.4870512 9.383638
## 15 12.97335 0.4756392 9.473380
## 16 13.35639 0.4575402 9.750261
## 17 13.43329 0.4583120 9.716293
## 18 13.55068 0.4570093 9.857390
## 19 13.80212 0.4489555 10.086780
## 20 14.02903 0.4361979 10.357474
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.
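To pull the selected tuning value and its resampled metrics programmatically, a small sketch using fields of caret's train object:
pls_mod$bestTune # the CV-selected number of components (ncomp = 6)
subset(pls_mod$results, ncomp == pls_mod$bestTune$ncomp) # RMSE/Rsquared/MAE at the optimum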
## d.
pred <- predict(pls_mod, X_test)
# r2 value
R2(pred, y_test)
## [1] 0.4028607
# RMSE
RMSE(pred, y_test)
## [1] 12.06105
## e.
pca_mod <- train(X_train,
                 y_train,
                 method = "pcr",
                 preProcess = c("center", "scale"),
                 trControl = ctrl,
                 tuneLength = 20)
plot(pca_mod)
pca_mod
## Principal Component Analysis
##
## 132 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 120, 119, 119, 118, 117, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 14.74006 0.1489287 11.369367
## 2 14.63358 0.1801134 11.463617
## 3 13.07586 0.3551410 10.077880
## 4 11.90177 0.4599715 8.924979
## 5 11.79113 0.4847720 8.531223
## 6 11.79783 0.4810229 8.657492
## 7 11.79315 0.4803741 8.654395
## 8 11.72098 0.4870030 8.483058
## 9 11.53054 0.4958056 8.503358
## 10 11.62187 0.4898977 8.614341
## 11 11.38917 0.5108846 8.496665
## 12 11.43631 0.5049972 8.509126
## 13 11.56143 0.4923461 8.705262
## 14 11.71926 0.4855099 8.876823
## 15 11.74719 0.4825926 8.913693
## 16 11.86704 0.4714033 8.993744
## 17 11.78860 0.4852519 8.731502
## 18 11.68345 0.5020917 8.833788
## 19 11.78921 0.4924325 8.898431
## 20 11.41907 0.5191135 8.689291
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 11.
pca_pred <- predict(pca_mod, X_test)
# r2 value
R2(pca_pred, y_test)
## [1] 0.3403256
# RMSE
RMSE(pca_pred, y_test)
## [1] 12.64507
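To put the two models side by side on the held-out data, a minimal sketch reusing the predictions computed above:
# test-set comparison of the tuned PLS and PCR models
data.frame(model = c("PLS", "PCR"),
           R2 = c(R2(pred, y_test), R2(pca_pred, y_test)),
           RMSE = c(RMSE(pred, y_test), RMSE(pca_pred, y_test)))
PLS is better on both metrics (R2 0.40 vs. 0.34, RMSE 12.06 vs. 12.65), so it is the preferable model here.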
6.3
## a.
data(ChemicalManufacturingProcess)
## b.
missmap(ChemicalManufacturingProcess)
imputations <- preProcess(ChemicalManufacturingProcess,
                          method = c("knnImpute"),
                          k = 5)
chem_man_imputed <- predict(imputations, ChemicalManufacturingProcess)
# check
any(is.na(chem_man_imputed)) # no more missing values.
## [1] FALSE
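Before trusting the imputation, it helps to see where the gaps were; a quick sketch counting missing values per column in the raw data:
# predictors with at least one NA, most-missing first
na_counts <- colSums(is.na(ChemicalManufacturingProcess))
sort(na_counts[na_counts > 0], decreasing = TRUE)
Note that knnImpute in preProcess() also centers and scales the data (nearest-neighbor distances require a common scale), which is why the imputed data set is standardized.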
## c.
chem_man_filtered <- chem_man_imputed[,-nearZeroVar(chem_man_imputed)]
set.seed(532)
# split into training and testing
train_indices <- sample(nrow(chem_man_filtered), nrow(chem_man_filtered) * 0.8, replace = FALSE) # 80/20 split
train <- chem_man_filtered[train_indices,]
test <- chem_man_filtered[-train_indices,]
lasso_mod <- train(Yield ~ .,
                   data = train,
                   method = "glmnet",
                   preProcess = c("center", "scale"),
                   trControl = ctrl,
                   tuneGrid = expand.grid(.alpha = 1, .lambda = seq(0, 1, 0.05)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
plot(lasso_mod)
lasso_mod
## glmnet
##
## 140 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 127, 126, 126, 126, 125, 126, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.00 2.4852668 0.3662810 1.1368255
## 0.05 0.6150924 0.6243606 0.4959297
## 0.10 0.6490388 0.6027606 0.5362728
## 0.15 0.6686156 0.6013160 0.5561954
## 0.20 0.6916044 0.6017251 0.5722321
## 0.25 0.7220113 0.5996926 0.5923676
## 0.30 0.7574999 0.5965839 0.6211488
## 0.35 0.7981168 0.5905034 0.6540803
## 0.40 0.8428674 0.5772446 0.6900248
## 0.45 0.8911673 0.5339642 0.7284218
## 0.50 0.9377941 0.4493852 0.7664760
## 0.55 0.9707667 0.4069614 0.7940046
## 0.60 0.9939838 0.3679470 0.8135680
## 0.65 0.9978349 NaN 0.8161743
## 0.70 0.9978349 NaN 0.8161743
## 0.75 0.9978349 NaN 0.8161743
## 0.80 0.9978349 NaN 0.8161743
## 0.85 0.9978349 NaN 0.8161743
## 0.90 0.9978349 NaN 0.8161743
## 0.95 0.9978349 NaN 0.8161743
## 1.00 0.9978349 NaN 0.8161743
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.05.
Note: the NaN R-squared values at lambda >= 0.65 arise because the penalty shrinks every coefficient to zero, so the model predicts a constant and R-squared is undefined; this is also what the resampling warning above refers to.
## d.
lasso_pred <- predict(lasso_mod, test)
R2(lasso_pred, test$Yield)
## [1] 0.5230744
RMSE(lasso_pred, test$Yield)
## [1] 0.6585246
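To see which predictors the lasso actually retained, a sketch applying glmnet's coef() to the final model at the selected penalty:
# nonzero rows are the predictors kept at lambda = 0.05; the rest were shrunk to exactly zero
coef(lasso_mod$finalModel, s = lasso_mod$bestTune$lambda)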
## e.
varImp(lasso_mod)
## glmnet variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09 73.107
## ManufacturingProcess37 19.700
## ManufacturingProcess17 18.809
## BiologicalMaterial05 15.454
## ManufacturingProcess01 13.079
## ManufacturingProcess34 11.677
## ManufacturingProcess06 10.857
## ManufacturingProcess36 7.191
## ManufacturingProcess07 5.141
## ManufacturingProcess13 5.051
## BiologicalMaterial03 2.864
## ManufacturingProcess26 0.000
## ManufacturingProcess15 0.000
## BiologicalMaterial04 0.000
## ManufacturingProcess29 0.000
## ManufacturingProcess30 0.000
## BiologicalMaterial09 0.000
## ManufacturingProcess43 0.000
## ManufacturingProcess16 0.000
plot(varImp(lasso_mod))
## f.
top_10 <- varImp(lasso_mod)$importance |>
arrange(desc(Overall)) |>
head(10) |>
rownames_to_column()
names(top_10) <- c("predictor", "importance")
chem_man_imputed[,c("Yield", top_10$predictor)] |>
cor() |>
corrplot(method="color",
diag=FALSE,
type="lower",
addCoef.col = "black",
number.cex=0.5)
Based on the correlation plot, the top predictors are all meaningfully correlated with Yield, and manufacturing-process variables dominate the importance ranking. Unlike the biological starting materials, process variables can be adjusted, so the processes with strong positive correlations to Yield are the natural levers for increasing overall yield in future runs.
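For a numeric view of the same relationships, a short sketch pulling the signed correlations of the top predictors with Yield:
# signed correlation of each top-10 predictor with Yield, strongest positive first
yield_cor <- cor(chem_man_imputed[, top_10$predictor], chem_man_imputed$Yield)
sort(yield_cor[, 1], decreasing = TRUE)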