library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RANN)

Ask

In Kuhn and Johnson do problems 6.2 and 6.3. There are only two but they consist of many parts. Please submit a link to your Rpubs and submit the .rmd file as well.

6.2

library(AppliedPredictiveModeling)
data(permeability)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

B

dim(fingerprints)
## [1]  165 1107
sparse <- nearZeroVar(fingerprints)
not_sparse <- fingerprints[, -sparse]
dim(not_sparse)
## [1] 165 388

388 predictors are left after applying the nearZeroVar function on fingerprints.

C

df <- as.data.frame(fingerprints)

df <- df |>
  mutate(target = permeability)

sparse <- nearZeroVar(df)
df <- df[, -sparse]
dim(df)
## [1] 165 389
splitdf <- createDataPartition(df$target, times = 1, p = 0.8, list = FALSE)
train <- df[splitdf, ]
test <- df[-splitdf, ]

set.seed(123)

pls_model <- train(target ~ ., data = train, method = "pls", metric="Rsquared",
  center = TRUE, trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
pls_model
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 119, 118, 120, 121, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.89629  0.2595202  10.523834
##    2     12.08958  0.3930697   8.606173
##    3     11.92805  0.4412262   8.773964
##    4     11.69603  0.4647318   8.833409
##    5     11.73702  0.4685687   8.914923
##    6     11.74632  0.4891905   8.825990
##    7     11.92629  0.4782318   9.088580
##    8     11.87671  0.4891900   9.168992
##    9     11.80357  0.4871362   9.016852
##   10     11.71772  0.5031537   8.983574
##   11     11.74937  0.4967466   8.901958
##   12     11.86636  0.4876363   8.963664
##   13     11.63549  0.4972131   8.726712
##   14     11.62791  0.5001112   8.799882
##   15     11.67511  0.5050307   8.788139
##   16     11.59268  0.5224644   8.597767
##   17     11.19736  0.5503581   8.324308
##   18     11.18188  0.5514033   8.170512
##   19     11.33144  0.5381611   8.250898
##   20     11.31008  0.5447829   8.291521
##   21     11.49961  0.5436926   8.421616
##   22     11.54810  0.5456161   8.453671
##   23     11.63709  0.5369965   8.515249
##   24     11.68096  0.5406349   8.563958
##   25     11.94825  0.5326035   8.739179
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 18.
summary(pls_model)
## Data:    X dimension: 133 388 
##  Y dimension: 133 1
## Fit method: oscorespls
## Number of components considered: 18
## TRAINING: % variance explained
##           1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X           31.14    42.51    49.42    54.46    63.50    67.29    69.87
## .outcome    23.37    45.32    53.69    61.87    65.45    70.96    75.02
##           8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X           73.14    75.04     77.65     80.01     81.14     82.75     84.80
## .outcome    76.81    79.08     80.85     82.28     84.12     85.27     85.97
##           15 comps  16 comps  17 comps  18 comps
## X            86.16     87.81     88.75     89.88
## .outcome     86.76     87.39     88.34     88.96

The highest \(R^2\) occurs at ncomp = 6, the corresponding R^2 is 0.5678856.

D

predict_test <- predict(pls_model, test)
obs_values <- data.frame(obs = test$target, pred = predict_test)
colnames(obs_values) <- c("obs","pred")

defaultSummary(obs_values)
##       RMSE   Rsquared        MAE 
## 13.2695528  0.4305899 10.1390421

The \(R^2\) is 67.62% which is higher than part C.

E

set.seed(123)
pcr_model <- train(target ~ ., data = train, method = "pcr",
  center = TRUE,
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)

predict_pcr <- predict(pcr_model, test)

obs_values <- data.frame(obs = test$target, pred = predict_pcr)
colnames(obs_values) <- c("obs","pred")
defaultSummary(obs_values)
##      RMSE  Rsquared       MAE 
## 10.132705  0.592234  8.058536

The R^2 is 0.33 , which did not perform better than the PLS model.

Now to try with the ridge method.

set.seed(123)
ridgeGrid <- data.frame(.lambda = seq(0, .1, length = 15))

ridgeRegFit <- train(target ~ ., data = train,
method = "ridge",
metric = "Rsquared",
tuneGrid = ridgeGrid,
trControl = trainControl(method = "cv", number = 10),
preProc = c("center", "scale"))

predict_ridge <- predict(ridgeRegFit, test)

obs_values <- data.frame(obs = test$target, pred = predict_ridge)
colnames(obs_values) <- c("obs","pred")
defaultSummary(obs_values)
##       RMSE   Rsquared        MAE 
## 13.5934925  0.4031599 10.3669077

The R^2 was 0.40 which performed better than the PCR model but not the PLS.

F

Of the above models, I would not use PCR. The best performing model is the PLS model.

6.3

A

data("ChemicalManufacturingProcess")
dim(ChemicalManufacturingProcess)
## [1] 176  58

B

Imputing the missing values using KNN.

set.seed(123)
chem_impute <- preProcess(ChemicalManufacturingProcess, method=c('knnImpute'))
df <- predict(chem_impute, ChemicalManufacturingProcess)

C

dfx <- df |> select(-Yield)
dfy <- df |> select(Yield)
set.seed(123)
chem_train <- createDataPartition(dfy$Yield, p = .80, list= FALSE)
x_train <- dfx[chem_train,]
x_test <- dfx[-chem_train,]
y_train <- dfy[chem_train,]
y_test <- dfy[-chem_train,]
set.seed(123)
ridgeGrid <- data.frame(.lambda = seq(0,0.1,length=15))

ridgeFit <- train(x=x_train , y=y_train,
                       method='ridge',
                       tuneGrid=ridgeGrid,
                       trControl=trainControl(method = "cv", number = 10),
                       preProc = c('center','scale'))
              

ridgeFit
## Ridge Regression 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ... 
## Resampling results across tuning parameters:
## 
##   lambda       RMSE      Rsquared   MAE      
##   0.000000000  6.559444  0.3705740  2.1257894
##   0.007142857  2.905917  0.4076010  1.1706542
##   0.014285714  2.356944  0.4321264  1.0217333
##   0.021428571  2.061485  0.4500623  0.9420376
##   0.028571429  1.868746  0.4640859  0.8889647
##   0.035714286  1.730620  0.4755309  0.8500597
##   0.042857143  1.625723  0.4851112  0.8199353
##   0.050000000  1.542823  0.4932645  0.7957412
##   0.057142857  1.475365  0.5002888  0.7761056
##   0.064285714  1.419231  0.5064010  0.7598050
##   0.071428571  1.371687  0.5117652  0.7460655
##   0.078571429  1.330839  0.5165093  0.7341978
##   0.085714286  1.295329  0.5207341  0.7238684
##   0.092857143  1.264153  0.5245201  0.7148742
##   0.100000000  1.236554  0.5279324  0.7068879
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.

Using the ridge method the optimal value is 0.1.

D

predict_ridge <- predict(ridgeFit, x_test)

defaultSummary(data.frame(pred=predict_ridge,obs=y_test))
##      RMSE  Rsquared       MAE 
## 0.7428885 0.4953262 0.6433366

The \(R^2\) is 15% and the prediction shows a worse \(R^2\) than the training dataset.

E

The most important Predictors are Manifacturing Process 32, 13, and the Biological Material 06.

plot(varImp(ridgeFit, scale = FALSE), top=20, scales = list(y = list(cex = 0.8)))

F

correlation <- cor(select(df, 'ManufacturingProcess32','ManufacturingProcess13','BiologicalMaterial06',  'Yield'))
corrplot::corrplot(correlation, method='square', type="upper")

There are obvious correlations between the top predictors and the response. These predictors can lead to a better fitting of model creation and would likely be more statistically relevant. This also shows the direct relationship between which processes will have a negative or positive Yield.