library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.1.3
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DataExplorer)
library(RANN)
## Warning: package 'RANN' was built under R version 4.1.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
data(permeability)
clean_fingerprints = fingerprints[,-caret::nearZeroVar(fingerprints)]
dim(clean_fingerprints)
## [1] 165 388
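As a sanity check (a minimal sketch, not part of the original output), the number of near-zero-variance fingerprints dropped can be counted directly:
ncol(fingerprints)                  # 1107 raw binary fingerprint predictors
length(nearZeroVar(fingerprints))   # 719 near-zero-variance columns removed, leaving 388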
set.seed(234)
partition = createDataPartition(permeability,p = .8,list=FALSE)
trainx = clean_fingerprints[partition,]
trainy = permeability[partition,]
testx = clean_fingerprints[-partition,]
testy = permeability[-partition,]
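A quick check (sketch) that the 80/20 split behaved as expected:
c(train = nrow(trainx), test = nrow(testx))   # should sum to 165; about 133 / 32 here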
pls_model = train(trainx, trainy, method = 'pls',
                  tuneLength = 20,
                  trControl = trainControl(method = "cv", number = 10),
                  preProc = c("center", "scale"))
pls_model
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 119, 120, 119, 120, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.18611 0.3434393 10.202531
## 2 11.85658 0.4797263 8.279253
## 3 11.83417 0.4840390 8.870427
## 4 11.71698 0.4869811 9.002545
## 5 11.85543 0.4718882 9.147745
## 6 11.40338 0.5044622 8.693787
## 7 11.16218 0.5276655 8.716494
## 8 11.10036 0.5459316 8.698217
## 9 11.27816 0.5426732 8.654846
## 10 11.42424 0.5277375 8.802697
## 11 11.76379 0.5137673 8.979652
## 12 12.17605 0.4957517 9.239574
## 13 12.31047 0.4906547 9.471656
## 14 12.54160 0.4841179 9.719038
## 15 12.68314 0.4878009 9.781537
## 16 13.09624 0.4621940 10.105568
## 17 13.15496 0.4606289 10.155725
## 18 13.48162 0.4528951 10.339999
## 19 13.62623 0.4558228 10.457063
## 20 13.91732 0.4415159 10.495558
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.
pls_model$results[pls_model$bestTune$ncomp,]
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 8 8 11.10036 0.5459316 8.698217 2.099516 0.1562487 1.727553
plot(pls_model)
pred_data = predict(pls_model,testx)
postResample(pred = pred_data,obs=testy)
## RMSE Rsquared MAE
## 10.9890489 0.5320801 8.1958014
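One way to eyeball the test-set fit, a minimal sketch using the objects above:
plot(testy, pred_data,
     xlab = 'Observed permeability', ylab = 'Predicted permeability',
     main = 'PLS test-set predictions')
abline(0, 1, lty = 2)   # reference line for perfect prediction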
ridge_data_frame = data.frame(.lambda=seq(0,.1,length=15))
set.seed(245)
ridgefit = train(trainx,trainy,method = 'ridge',
tuneGrid=ridge_data_frame,
trControl=trainControl(method='cv',number=10),
preProc=c('center','scale'))
## Warning: model fit failed for Fold08: lambda=0.000000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
ridgefit
## Ridge Regression
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 120, 121, 118, 120, 121, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 708.32150 0.3569901 330.042701
## 0.007142857 14.79006 0.3503073 10.913636
## 0.014285714 2530.40746 0.3828814 1214.999665
## 0.021428571 13.65438 0.3974206 10.316406
## 0.028571429 13.78486 0.3956952 10.275089
## 0.035714286 13.12587 0.4327413 9.971987
## 0.042857143 12.90641 0.4460507 9.810001
## 0.050000000 12.84436 0.4531486 9.742447
## 0.057142857 12.71828 0.4596228 9.654935
## 0.064285714 12.63492 0.4655818 9.581454
## 0.071428571 12.64149 0.4679259 9.573756
## 0.078571429 12.50998 0.4751516 9.485307
## 0.085714286 12.63624 0.4718566 9.560340
## 0.092857143 12.50965 0.4785303 9.478886
## 0.100000000 12.48249 0.4802439 9.454193
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
ridgefit$results[ridgefit$results$lambda==ridgefit$bestTune$lambda,]
## lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 15 0.1 12.48249 0.4802439 9.454193 3.054923 0.22661 2.006753
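The ridge fit above was tuned by cross-validation but never scored on the held-out compounds. For parity with the PLS evaluation, a minimal sketch of that check (ridge_pred is an illustrative name; output not shown):
ridge_pred = predict(ridgefit, testx)
postResample(pred = ridge_pred, obs = testy)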
With a held-out R-squared of only about 0.53 for PLS (RMSE of about 11) and weaker cross-validated performance from ridge, neither alternative model is accurate enough to replace the permeability laboratory experiment.
data(ChemicalManufacturingProcess)
plot_missing(ChemicalManufacturingProcess)
imputationdata = preProcess(ChemicalManufacturingProcess,method = 'knnImpute')
chem_new_data = predict(imputationdata,ChemicalManufacturingProcess)
plot_missing(chem_new_data)
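Note that preProcess() with method = 'knnImpute' also centers and scales every column, since k-nearest-neighbor imputation needs a common distance scale; Yield and the predictors are therefore in standardized units from here on. A quick numeric confirmation that no missing values remain (sketch):
sum(is.na(chem_new_data))   # expected: 0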
set.seed(234)
chempartition = createDataPartition(chem_new_data$Yield,p = .8,list=FALSE)
trainx_chem = chem_new_data[chempartition,]
testx_chem = chem_new_data[-chempartition,]
x_data_train = trainx_chem %>% select(-Yield)
x_data_test = testx_chem %>% select(-Yield)
ridge_model = train(x_data_train, trainx_chem$Yield, method = 'ridge',
                    tuneGrid = ridge_data_frame,
                    trControl = trainControl(method = "cv", number = 10),
                    preProc = c("center", "scale"))
ridge_model
## Ridge Regression
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 130, 131, 129, 130, 130, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 5.094269 0.3480776 1.7642172
## 0.007142857 2.599591 0.3857913 1.0860379
## 0.014285714 2.164726 0.4041163 0.9679730
## 0.021428571 1.942303 0.4148506 0.9077559
## 0.028571429 1.798228 0.4222012 0.8687096
## 0.035714286 1.694378 0.4277393 0.8406936
## 0.042857143 1.614765 0.4321782 0.8191833
## 0.050000000 1.551214 0.4358905 0.8018943
## 0.057142857 1.499002 0.4390916 0.7878561
## 0.064285714 1.455168 0.4419151 0.7762709
## 0.071428571 1.417739 0.4444491 0.7663120
## 0.078571429 1.385340 0.4467542 0.7577322
## 0.085714286 1.356980 0.4488736 0.7504527
## 0.092857143 1.331920 0.4508392 0.7440236
## 0.100000000 1.309600 0.4526751 0.7382854
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
ridge_model$results[ridge_model$results$lambda==ridge_model$bestTune$lambda,]
## lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 15 0.1 1.3096 0.4526751 0.7382854 1.238652 0.2540459 0.3853613
plot(ridge_model)
pred_data_chem = predict(ridge_model,x_data_test)
postResample(pred = pred_data_chem,obs=testx_chem$Yield)
## RMSE Rsquared MAE
## 0.6089531 0.6867799 0.4803808
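Because Yield was standardized during imputation, the test RMSE of about 0.61 is in standard-deviation units. As a rough sense of scale, a mean-only baseline (sketch; baseline_pred is an illustrative name):
baseline_pred = rep(mean(trainx_chem$Yield), nrow(testx_chem))
postResample(pred = baseline_pred, obs = testx_chem$Yield)   # RMSE near 1 for a standardized response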
key_vars = varImp(ridge_model)
key_vars
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 94.14
## BiologicalMaterial06 78.83
## ManufacturingProcess36 71.21
## ManufacturingProcess17 70.75
## ManufacturingProcess09 69.02
## BiologicalMaterial03 63.32
## BiologicalMaterial02 63.30
## BiologicalMaterial12 60.48
## ManufacturingProcess06 57.46
## ManufacturingProcess31 56.95
## ManufacturingProcess33 52.25
## ManufacturingProcess29 49.54
## ManufacturingProcess02 44.92
## BiologicalMaterial04 42.89
## ManufacturingProcess11 42.73
## BiologicalMaterial11 36.61
## BiologicalMaterial09 35.43
## ManufacturingProcess30 34.11
## BiologicalMaterial01 34.06
print(paste("Manufacturing:",length(key_vars$importance[grepl('Manufacturing', rownames(key_vars$importance), fixed = TRUE),])))
## [1] "Manufacturing: 45"
print(paste("Biology:",length(key_vars$importance[grepl('Biological', rownames(key_vars$importance), fixed = TRUE),])))
## [1] "Biology: 12"
# Correlate Yield with each third of the predictors, ordered by importance,
# keeping Yield in every plot.
imp_order = rownames(key_vars$importance)[order(key_vars$importance$Overall, decreasing = TRUE)]
thirds = split(imp_order, ceiling(seq_along(imp_order) / ceiling(length(imp_order) / 3)))
for (vars in thirds) {
  corrplot(cor(chem_new_data[c('Yield', vars)]), method = "circle", type = 'full')
}
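The circle plots convey sign and strength visually; for the headline relationships, the direct correlations with Yield can also be printed (sketch, reusing imp_order from above; top5 is an illustrative name):
top5 = imp_order[1:5]
sapply(chem_new_data[top5], cor, y = chem_new_data$Yield)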