6.2

Libraries:

library(AppliedPredictiveModeling)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.95 loaded
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
##  a.

data(permeability)
dim(permeability)
## [1] 165   1
dim(fingerprints)
## [1]  165 1107
##  b.

fingerprints_filtered <- fingerprints[, -nearZeroVar(fingerprints)] # drop near-zero-variance fingerprints
dim(fingerprints_filtered)
## [1] 165 388
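As a quick sanity check, the number of sparse fingerprints removed by nearZeroVar() can be counted directly; a minimal sketch:

# how many near-zero-variance fingerprint columns were dropped
ncol(fingerprints) - ncol(fingerprints_filtered)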
##  c.

set.seed(1234)
train <- sample(nrow(fingerprints_filtered), nrow(fingerprints_filtered)*.8, replace = FALSE) # 80/20 train/test split
X_train <- fingerprints_filtered[train,]
X_test <- fingerprints_filtered[-train,]
y_train <- permeability[train,]
y_test <- permeability[-train,]
any(is.na(X_train)) | any(is.na(X_test)) # check whether any values are missing
## [1] FALSE
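caret also offers createDataPartition() for a split that preserves the distribution of the outcome; a sketch of an equivalent split, shown for reference only and not used in the models below:

# alternative outcome-stratified 80/20 split (for illustration only)
set.seed(1234)
idx <- createDataPartition(permeability[, 1], p = 0.8, list = FALSE)
X_train_alt <- fingerprints_filtered[idx, ]
y_train_alt <- permeability[idx, ]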
ctrl <- trainControl(method="CV",
                     number=10)
pls_mod <- train(X_train,
                 y_train,
                 method = "pls",
                 preProcess = c("center", "scale"),
                 trControl = ctrl,
                 tuneLength=20)

plot(pls_mod)

We use 10-fold cross-validation to choose the optimal number of latent components (the resampling technique), and the predictors are centered and scaled within the PLS training call.

pls_mod
## Partial Least Squares 
## 
## 132 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 119, 120, 117, 120, 119, 118, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     12.96096  0.3822261  10.043384
##    2     11.92929  0.5458639   8.316843
##    3     12.02574  0.5074062   8.833744
##    4     11.98724  0.4997168   8.800333
##    5     11.68597  0.5300868   8.547348
##    6     11.34780  0.5589800   8.307307
##    7     11.48813  0.5472951   8.708420
##    8     11.69585  0.5493433   8.776256
##    9     11.85949  0.5376061   8.928409
##   10     12.20686  0.5175105   9.178848
##   11     12.46616  0.5110085   9.290571
##   12     12.66490  0.5040105   9.409175
##   13     12.72550  0.4967728   9.352964
##   14     12.84813  0.4870512   9.383638
##   15     12.97335  0.4756392   9.473380
##   16     13.35639  0.4575402   9.750261
##   17     13.43329  0.4583120   9.716293
##   18     13.55068  0.4570093   9.857390
##   19     13.80212  0.4489555  10.086780
##   20     14.02903  0.4361979  10.357474
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.

According to the resampling results, the optimal number of latent components is 6, and the corresponding cross-validated R2 is approximately 0.56.
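Rather than reading these numbers off the printed table, they can be pulled directly from the train object; a minimal sketch, assuming the pls_mod object fitted above:

# resampled performance at the selected number of components
pls_mod$results |>
  dplyr::filter(ncomp == pls_mod$bestTune$ncomp) |>
  dplyr::select(ncomp, RMSE, Rsquared, MAE)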

##  d.

pred <- predict(pls_mod, X_test)

# test-set R2

R2(pred, y_test)
## [1] 0.4028607
# test-set RMSE

RMSE(pred, y_test)
## [1] 12.06105
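caret's postResample() returns RMSE, R2, and MAE in a single call, which is a convenient cross-check of the two values above; a minimal sketch using the same predictions:

# test-set RMSE, Rsquared, and MAE for the PLS predictions
postResample(pred = pred, obs = y_test)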
## e.

# principal component regression (PCR) with the same resampling setup
pca_mod <- train(X_train,
                 y_train,
                 method = "pcr",
                 preProcess = c("center", "scale"),
                 trControl = ctrl,
                 tuneLength=20)

plot(pca_mod)

pca_mod
## Principal Component Analysis 
## 
## 132 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 119, 120, 119, 119, 118, 117, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     14.74006  0.1489287  11.369367
##    2     14.63358  0.1801134  11.463617
##    3     13.07586  0.3551410  10.077880
##    4     11.90177  0.4599715   8.924979
##    5     11.79113  0.4847720   8.531223
##    6     11.79783  0.4810229   8.657492
##    7     11.79315  0.4803741   8.654395
##    8     11.72098  0.4870030   8.483058
##    9     11.53054  0.4958056   8.503358
##   10     11.62187  0.4898977   8.614341
##   11     11.38917  0.5108846   8.496665
##   12     11.43631  0.5049972   8.509126
##   13     11.56143  0.4923461   8.705262
##   14     11.71926  0.4855099   8.876823
##   15     11.74719  0.4825926   8.913693
##   16     11.86704  0.4714033   8.993744
##   17     11.78860  0.4852519   8.731502
##   18     11.68345  0.5020917   8.833788
##   19     11.78921  0.4924325   8.898431
##   20     11.41907  0.5191135   8.689291
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 11.
pca_pred <- predict(pca_mod, X_test)

# r2 value
R2(pca_pred, y_test)
## [1] 0.3403256
# RMSE 
RMSE(pca_pred, y_test)
## [1] 12.64507

The PLS model performed better than the PCR model on the test set, with a higher R2 and a lower RMSE.
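A compact way to present this comparison is a small table of the test-set metrics; a minimal sketch using the predictions computed above:

# side-by-side test-set performance of the two models
tibble(model = c("PLS", "PCR"),
       RMSE  = c(RMSE(pred, y_test), RMSE(pca_pred, y_test)),
       R2    = c(R2(pred, y_test), R2(pca_pred, y_test)))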

6.3

##  a.

data(ChemicalManufacturingProcess)
##  b.

missmap(ChemicalManufacturingProcess)

The missingness map shows that relatively few values are missing, and they appear to be missing at random.
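To quantify this rather than judge it from the map alone, one could count the missing values in each column; a minimal sketch:

# number of missing values per column, largest first
colSums(is.na(ChemicalManufacturingProcess)) |>
  sort(decreasing = TRUE) |>
  head(10)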

# k-nearest-neighbor imputation; caret's knnImpute also centers and scales the data
imputations <- preProcess(ChemicalManufacturingProcess, 
               method = c("knnImpute"), 
               k=5)
chem_man_imputed <- predict(imputations, ChemicalManufacturingProcess)

# check

any(is.na(chem_man_imputed)) # no more missing values.
## [1] FALSE
##  c.

chem_man_filtered <- chem_man_imputed[, -nearZeroVar(chem_man_imputed)] # drop near-zero-variance predictors

set.seed(532) 

# split into training and testing

train_indices <- sample(nrow(chem_man_filtered), nrow(chem_man_filtered)*.8, replace=F)

train <- chem_man_filtered[train_indices,]
test <- chem_man_filtered[-train_indices,]
# lasso regression: alpha fixed at 1, lambda tuned over a grid from 0 to 1
lasso_mod <- train(Yield ~.,
                   data=train,
                   method = "glmnet",
                   preProcess = c("center", "scale"),
                   trControl = ctrl,
                   tuneGrid = expand.grid(.alpha = 1, .lambda = seq(0, 1, 0.05)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
plot(lasso_mod)

lasso_mod
## glmnet 
## 
## 140 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 127, 126, 126, 126, 125, 126, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE       Rsquared   MAE      
##   0.00    2.4852668  0.3662810  1.1368255
##   0.05    0.6150924  0.6243606  0.4959297
##   0.10    0.6490388  0.6027606  0.5362728
##   0.15    0.6686156  0.6013160  0.5561954
##   0.20    0.6916044  0.6017251  0.5722321
##   0.25    0.7220113  0.5996926  0.5923676
##   0.30    0.7574999  0.5965839  0.6211488
##   0.35    0.7981168  0.5905034  0.6540803
##   0.40    0.8428674  0.5772446  0.6900248
##   0.45    0.8911673  0.5339642  0.7284218
##   0.50    0.9377941  0.4493852  0.7664760
##   0.55    0.9707667  0.4069614  0.7940046
##   0.60    0.9939838  0.3679470  0.8135680
##   0.65    0.9978349        NaN  0.8161743
##   0.70    0.9978349        NaN  0.8161743
##   0.75    0.9978349        NaN  0.8161743
##   0.80    0.9978349        NaN  0.8161743
##   0.85    0.9978349        NaN  0.8161743
##   0.90    0.9978349        NaN  0.8161743
##   0.95    0.9978349        NaN  0.8161743
##   1.00    0.9978349        NaN  0.8161743
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.05.
The NaN R2 values (and the warning about missing resampled performance measures) appear at larger lambda values because the lasso penalty shrinks every coefficient to zero there, so the predictions are constant and R2 is undefined.

## d.

lasso_pred <- predict(lasso_mod, test)

R2(lasso_pred, test$Yield)
## [1] 0.5230744
RMSE(lasso_pred, test$Yield)
## [1] 0.6585246
## e.

varImp(lasso_mod) 
## glmnet variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09  73.107
## ManufacturingProcess37  19.700
## ManufacturingProcess17  18.809
## BiologicalMaterial05    15.454
## ManufacturingProcess01  13.079
## ManufacturingProcess34  11.677
## ManufacturingProcess06  10.857
## ManufacturingProcess36   7.191
## ManufacturingProcess07   5.141
## ManufacturingProcess13   5.051
## BiologicalMaterial03     2.864
## ManufacturingProcess26   0.000
## ManufacturingProcess15   0.000
## BiologicalMaterial04     0.000
## ManufacturingProcess29   0.000
## ManufacturingProcess30   0.000
## BiologicalMaterial09     0.000
## ManufacturingProcess43   0.000
## ManufacturingProcess16   0.000

The most important predictor is ManufacturingProcess32, and manufacturing process variables dominate the top of the list.

plot(varImp(lasso_mod))
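Because the final model is a lasso fit, the importance ranking can be cross-checked by looking at which coefficients remain nonzero at the selected lambda; a minimal sketch, assuming the lasso_mod object above:

# nonzero glmnet coefficients at the cross-validated lambda
coefs <- as.matrix(coef(lasso_mod$finalModel, s = lasso_mod$bestTune$lambda))
coefs[coefs[, 1] != 0, , drop = FALSE]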

##  f.

# ten most important predictors according to the lasso variable importance
top_10 <- varImp(lasso_mod)$importance |>
  arrange(desc(Overall)) |>
  head(10) |>
  rownames_to_column()

names(top_10) <- c("predictor", "importance")

chem_man_imputed[,c("Yield", top_10$predictor)] |>
  cor() |>
  corrplot(method="color",
           diag=FALSE,
           type="lower",
           addCoef.col = "black",
           number.cex=0.5)

Based on the correlation plot, the top predictors are clearly correlated with Yield, some positively and some negatively. Because most of them are manufacturing process variables, which, unlike the biological measurements, can be adjusted, these relationships indicate which process settings could be changed in future runs to improve yield.
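To read the same relationships numerically rather than from the color scale, one could pull only the correlations with Yield; a minimal sketch:

# correlation of each top predictor with Yield, sorted from most positive to most negative
cor(chem_man_imputed[, top_10$predictor], chem_man_imputed$Yield)[, 1] |>
  sort(decreasing = TRUE)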