library(AppliedPredictiveModeling)
library(caret)
library(dplyr)
library(DataExplorer)
library(RANN)
library(corrplot)
data(permeability)
# Drop the near-zero-variance fingerprint columns before modeling.
clean_fingerprints = fingerprints[, -caret::nearZeroVar(fingerprints)]

dim(clean_fingerprints)
## [1] 165 388
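As a sanity check on the filtering step, nearZeroVar can also return its diagnostics instead of just the column indices (saveMetrics is standard caret; this sketch only inspects why columns were flagged):

# Inspect near-zero-variance diagnostics: freqRatio and percentUnique explain
# why each fingerprint was flagged.
nzv_metrics = nearZeroVar(fingerprints, saveMetrics = TRUE)
head(nzv_metrics[nzv_metrics$nzv, ])
sum(nzv_metrics$nzv)   # number of fingerprint columns dropped above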
set.seed(234)
partition = createDataPartition(permeability, p = .8, list = FALSE)

trainx = clean_fingerprints[partition, ]
trainy = permeability[partition, ]
testx  = clean_fingerprints[-partition, ]
testy  = permeability[-partition, ]
pls_model = train(trainx, trainy, method = 'pls',
                  tuneLength = 20,
                  trControl = trainControl(method = "cv", number = 10),
                  preProc = c("center", "scale"))
pls_model
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 119, 119, 120, 119, 120, 120, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.18611  0.3434393  10.202531
##    2     11.85658  0.4797263   8.279253
##    3     11.83417  0.4840390   8.870427
##    4     11.71698  0.4869811   9.002545
##    5     11.85543  0.4718882   9.147745
##    6     11.40338  0.5044622   8.693787
##    7     11.16218  0.5276655   8.716494
##    8     11.10036  0.5459316   8.698217
##    9     11.27816  0.5426732   8.654846
##   10     11.42424  0.5277375   8.802697
##   11     11.76379  0.5137673   8.979652
##   12     12.17605  0.4957517   9.239574
##   13     12.31047  0.4906547   9.471656
##   14     12.54160  0.4841179   9.719038
##   15     12.68314  0.4878009   9.781537
##   16     13.09624  0.4621940  10.105568
##   17     13.15496  0.4606289  10.155725
##   18     13.48162  0.4528951  10.339999
##   19     13.62623  0.4558228  10.457063
##   20     13.91732  0.4415159  10.495558
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.
pls_model$results[pls_model$results$ncomp == pls_model$bestTune$ncomp, ]
##   ncomp     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 8     8 11.10036 0.5459316 8.698217 2.099516  0.1562487 1.727553
plot(pls_model)

pred_data = predict(pls_model,testx)
postResample(pred = pred_data,obs=testy)
##       RMSE   Rsquared        MAE 
## 10.9890489  0.5320801  8.1958014
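The hold-out RMSE and R-squared are close to the cross-validated values, which suggests the tuning did not overfit. A quick observed-versus-predicted plot makes the same point visually (a minimal sketch using base graphics):

# Observed vs. predicted permeability on the hold-out set; points near the
# dashed 45-degree line indicate good agreement.
plot(testy, pred_data, xlab = "Observed permeability", ylab = "Predicted permeability")
abline(0, 1, lty = 2)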
# 15 candidate lambda values from 0 to 0.1.
ridge_data_frame = data.frame(.lambda = seq(0, .1, length.out = 15))
set.seed(245)
ridgefit = train(trainx,trainy,method = 'ridge',
                 tuneGrid=ridge_data_frame,
                 trControl=trainControl(method='cv',number=10),
                 preProc=c('center','scale'))
## Warning: model fit failed for Fold08: lambda=0.000000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
ridgefit
## Ridge Regression 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 120, 121, 118, 120, 121, ... 
## Resampling results across tuning parameters:
## 
##   lambda       RMSE        Rsquared   MAE        
##   0.000000000   708.32150  0.3569901   330.042701
##   0.007142857    14.79006  0.3503073    10.913636
##   0.014285714  2530.40746  0.3828814  1214.999665
##   0.021428571    13.65438  0.3974206    10.316406
##   0.028571429    13.78486  0.3956952    10.275089
##   0.035714286    13.12587  0.4327413     9.971987
##   0.042857143    12.90641  0.4460507     9.810001
##   0.050000000    12.84436  0.4531486     9.742447
##   0.057142857    12.71828  0.4596228     9.654935
##   0.064285714    12.63492  0.4655818     9.581454
##   0.071428571    12.64149  0.4679259     9.573756
##   0.078571429    12.50998  0.4751516     9.485307
##   0.085714286    12.63624  0.4718566     9.560340
##   0.092857143    12.50965  0.4785303     9.478886
##   0.100000000    12.48249  0.4802439     9.454193
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
ridgefit$results[ridgefit$results$lambda==ridgefit$bestTune$lambda,]
##    lambda     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 15    0.1 12.48249 0.4802439 9.454193 3.054923    0.22661 2.006753
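The enormous RMSE values at the two smallest lambdas likely reflect numerically unstable, essentially unregularized fits (388 predictors against roughly 120 training rows per fold), which is also what the fold warning above is flagging. Since both models used 10-fold CV, caret's resamples() gives a side-by-side comparison; note the two train calls used different seeds, so the folds are not identical and this is indicative rather than a paired test:

# Compare the cross-validated performance distributions of PLS and ridge.
model_comparison = resamples(list(PLS = pls_model, Ridge = ridgefit))
summary(model_comparison)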

6.2f

Neither model is accurate enough to replace the permeability laboratory experiment: the PLS model explains only about 53% of the hold-out variance (test R-squared = 0.532), and the ridge model performs worse still in cross-validation (R-squared = 0.48 at the best lambda). I would not recommend substituting either model for the lab assay.

data(ChemicalManufacturingProcess)
plot_missing(ChemicalManufacturingProcess)

# kNN imputation in caret also centers and scales every column (including Yield).
imputationdata = preProcess(ChemicalManufacturingProcess, method = 'knnImpute')

chem_new_data = predict(imputationdata,ChemicalManufacturingProcess)

plot_missing(chem_new_data)
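A numeric check that the kNN imputation filled every gap (plot_missing above shows the same thing graphically):

# Count remaining missing values after imputation; this should be zero.
sum(is.na(chem_new_data))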

set.seed(234)
chempartition = createDataPartition(chem_new_data$Yield, p = .8, list = FALSE)

trainx_chem = chem_new_data[chempartition,]
testx_chem = chem_new_data[-chempartition,]


x_data_train = trainx_chem %>% select(-Yield)
x_data_test = testx_chem %>% select(-Yield)
ridge_model = train(x_data_train, trainx_chem$Yield, method = 'ridge',
                    tuneGrid = ridge_data_frame,
                    trControl = trainControl(method = "cv", number = 10),
                    preProc = c("center", "scale"))
ridge_model
## Ridge Regression 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 130, 131, 129, 130, 130, ... 
## Resampling results across tuning parameters:
## 
##   lambda       RMSE      Rsquared   MAE      
##   0.000000000  5.094269  0.3480776  1.7642172
##   0.007142857  2.599591  0.3857913  1.0860379
##   0.014285714  2.164726  0.4041163  0.9679730
##   0.021428571  1.942303  0.4148506  0.9077559
##   0.028571429  1.798228  0.4222012  0.8687096
##   0.035714286  1.694378  0.4277393  0.8406936
##   0.042857143  1.614765  0.4321782  0.8191833
##   0.050000000  1.551214  0.4358905  0.8018943
##   0.057142857  1.499002  0.4390916  0.7878561
##   0.064285714  1.455168  0.4419151  0.7762709
##   0.071428571  1.417739  0.4444491  0.7663120
##   0.078571429  1.385340  0.4467542  0.7577322
##   0.085714286  1.356980  0.4488736  0.7504527
##   0.092857143  1.331920  0.4508392  0.7440236
##   0.100000000  1.309600  0.4526751  0.7382854
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
ridge_model$results[ridge_model$results$lambda==ridge_model$bestTune$lambda,]
##    lambda   RMSE  Rsquared       MAE   RMSESD RsquaredSD     MAESD
## 15    0.1 1.3096 0.4526751 0.7382854 1.238652  0.2540459 0.3853613
plot(ridge_model)

pred_data_chem = predict(ridge_model,x_data_test)
postResample(pred = pred_data_chem,obs=testx_chem$Yield)
##      RMSE  Rsquared       MAE 
## 0.6089531 0.6867799 0.4803808
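One caveat on these numbers: because knnImpute centered and scaled every column, Yield included, the RMSE of about 0.61 is in standard-deviation units rather than raw Yield units. A sketch for reporting on the original scale (yield_mean and yield_sd are computed here; they are not stored anywhere by the earlier code):

# Back-transform standardized predictions and observations to raw Yield units.
yield_mean = mean(ChemicalManufacturingProcess$Yield, na.rm = TRUE)
yield_sd = sd(ChemicalManufacturingProcess$Yield, na.rm = TRUE)
pred_raw = pred_data_chem * yield_sd + yield_mean
obs_raw = testx_chem$Yield * yield_sd + yield_mean
postResample(pred = pred_raw, obs = obs_raw)   # RMSE now in Yield units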
key_vars = varImp(ridge_model)
key_vars
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   94.14
## BiologicalMaterial06     78.83
## ManufacturingProcess36   71.21
## ManufacturingProcess17   70.75
## ManufacturingProcess09   69.02
## BiologicalMaterial03     63.32
## BiologicalMaterial02     63.30
## BiologicalMaterial12     60.48
## ManufacturingProcess06   57.46
## ManufacturingProcess31   56.95
## ManufacturingProcess33   52.25
## ManufacturingProcess29   49.54
## ManufacturingProcess02   44.92
## BiologicalMaterial04     42.89
## ManufacturingProcess11   42.73
## BiologicalMaterial11     36.61
## BiologicalMaterial09     35.43
## ManufacturingProcess30   34.11
## BiologicalMaterial01     34.06
print(paste("Manufacturing:",length(key_vars$importance[grepl('Manufacturing', rownames(key_vars$importance), fixed = TRUE),])))
## [1] "Manufacturing: 45"
print(paste("Biology:",length(key_vars$importance[grepl('Biological', rownames(key_vars$importance), fixed = TRUE),])))
## [1] "Biology: 12"
# Rank predictors by importance, then plot correlations with Yield in three
# non-overlapping chunks so the matrices stay readable. (The original indexing
# overlapped at the chunk boundaries and only included Yield in the first plot.)
ranked_vars = rownames(key_vars$importance)[order(key_vars$importance$Overall, decreasing = TRUE)]
chunks = split(ranked_vars, cut(seq_along(ranked_vars), 3, labels = FALSE))

corrplot(cor(chem_new_data[c('Yield', chunks[[1]])]), method = "circle", type = 'full')

corrplot(cor(chem_new_data[c('Yield', chunks[[2]])]), method = "circle", type = 'full')

corrplot(cor(chem_new_data[c('Yield', chunks[[3]])]), method = "circle", type = 'full')
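The circle plots get hard to read at about 20 variables per panel; the same relationships condense into a single ranked list of correlations with Yield (a sketch reusing ranked_vars from above):

# Correlation of every predictor with Yield, strongest first.
yield_cors = cor(chem_new_data[, ranked_vars], chem_new_data$Yield)
head(sort(yield_cors[, 1], decreasing = TRUE), 10)   # strongest positive
head(sort(yield_cors[, 1]), 10)                      # strongest negative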