library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(AppliedPredictiveModeling)
data("permeability")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Total number of fingerprint columns available as predictors.
dim(fingerprints)[2]
## [1] 1107
# Number of those columns that caret's nearZeroVar() flags.
fingerprints %>%
  nearZeroVar() %>%
  length()
## [1] 719
The nearZeroVar function returned 719 columns, so of the 1107 columns, 388 are left for modeling.

### c.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# Drop the fingerprint columns that nearZeroVar() flags before modeling.
zero <- fingerprints %>% nearZeroVar()
finger_df <- fingerprints[, -zero]

# Seeded 80/20 split on the response; the same seed also governs the
# cross-validation folds drawn by train() below.
set.seed(329)
part <- createDataPartition(permeability, p = 0.8, list = FALSE, times = 1)

# NOTE(review): despite their names, dep_var_* hold the predictors
# (fingerprints) and indep_var_* hold the response (permeability).
dep_var_train <- finger_df[part, ]
dep_var_test <- finger_df[-part, ]
indep_var_train <- permeability[part, ]
indep_var_test <- permeability[-part, ]

# PLS fit on centered/scaled predictors, tuned over 20 candidate values of
# ncomp with 10-fold cross-validation and selected on Rsquared.
pls_model <- train(
  x = dep_var_train,
  y = indep_var_train,
  method = "pls",
  preProcess = c("center", "scale"),
  metric = "Rsquared",
  tuneLength = 20,
  trControl = trainControl(method = "cv", number = 10)
)
pls_model
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 119, 120, 118, 120, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.62598 0.3365374 10.309205
## 2 12.18732 0.4872366 8.817615
## 3 12.02501 0.4891116 9.163251
## 4 12.10455 0.4813254 9.343701
## 5 11.80763 0.5162117 9.044054
## 6 11.85446 0.5096861 8.937348
## 7 11.64945 0.5157421 8.896158
## 8 11.82122 0.5127728 9.136694
## 9 11.91974 0.5067269 9.047863
## 10 12.07800 0.5020342 9.074149
## 11 12.24071 0.4952065 9.336880
## 12 12.24351 0.4906759 9.340567
## 13 12.24273 0.4835190 9.331712
## 14 12.58929 0.4623902 9.461092
## 15 12.89396 0.4455262 9.655297
## 16 13.05410 0.4481954 9.905222
## 17 13.17905 0.4410183 10.041873
## 18 13.53412 0.4220510 10.391394
## 19 13.72958 0.4083498 10.594558
## 20 13.96633 0.4022988 10.780009
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 5.
plot(pls_model)
Five latent variables are optimal, and the corresponding resampled estimate of R-squared is 0.5162117.
# Score the held-out samples with the tuned PLS model, then summarise the
# agreement between predictions and observed permeability.
pred_pls <- pls_model %>% predict(newdata = dep_var_test)
postResample(obs = indep_var_test, pred = pred_pls)
## RMSE Rsquared MAE
## 9.886461 0.505552 7.380955
The test-set R-squared for the PLS model is 0.505552.
# Ridge regression: 15 evenly spaced penalties from 0 to 0.1, tuned with
# 10-fold cross-validation on centered/scaled predictors, selected on Rsquared.
ridgeGrid <- data.frame(.lambda = seq(from = 0, to = 0.1, length.out = 15))
ridge_model <- train(
  x = dep_var_train,
  y = indep_var_train,
  method = "ridge",
  preProcess = c("center", "scale"),
  metric = "Rsquared",
  tuneGrid = ridgeGrid,
  trControl = trainControl(method = "cv", number = 10)
)

# Elastic net: every combination of three penalties and 20 fractions.
# NOTE(review): no `metric` is supplied here, so train() falls back to its
# default selection metric instead of the Rsquared used by the PLS and ridge
# fits — confirm that this difference is intended.
enetGrid <- expand.grid(
  .lambda = c(0, 0.01, 0.1),
  .fraction = seq(from = 0.05, to = 1, length.out = 20)
)
enet_model <- train(
  x = dep_var_train,
  y = indep_var_train,
  method = "enet",
  preProcess = c("center", "scale"),
  tuneGrid = enetGrid,
  trControl = trainControl(method = "cv", number = 10)
)
ridge_model
## Ridge Regression
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 120, 121, 119, 119, 118, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 6.041888e+15 0.1566410 2.263504e+15
## 0.007142857 2.154538e+03 0.2959339 1.590920e+03
## 0.014285714 1.396359e+01 0.3961027 1.040388e+01
## 0.021428571 1.353324e+01 0.4195692 1.008089e+01
## 0.028571429 1.327603e+01 0.4417345 9.812119e+00
## 0.035714286 1.301607e+01 0.4588161 9.635619e+00
## 0.042857143 1.291253e+01 0.4689554 9.535743e+00
## 0.050000000 1.271933e+01 0.4813903 9.401978e+00
## 0.057142857 1.264687e+01 0.4868521 9.362782e+00
## 0.064285714 1.254063e+01 0.4959862 9.299290e+00
## 0.071428571 1.247959e+01 0.5009572 9.265791e+00
## 0.078571429 1.241794e+01 0.5059755 9.233517e+00
## 0.085714286 1.236657e+01 0.5102200 9.207772e+00
## 0.092857143 1.232135e+01 0.5142373 9.181933e+00
## 0.100000000 1.229678e+01 0.5169221 9.172932e+00
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was lambda = 0.1.
plot(ridge_model)
enet_model
## Elasticnet
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 120, 121, 121, 117, 120, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00 0.05 2.050362e+16 0.3953541 1.004741e+16
## 0.00 0.10 4.092599e+16 0.3583007 2.037379e+16
## 0.00 0.15 6.136141e+16 0.3534069 3.070017e+16
## 0.00 0.20 8.180011e+16 0.3462468 4.102655e+16
## 0.00 0.25 1.022401e+17 0.3434621 5.135293e+16
## 0.00 0.30 1.226808e+17 0.3435596 6.167931e+16
## 0.00 0.35 1.431218e+17 0.3394221 7.200570e+16
## 0.00 0.40 1.635631e+17 0.3341926 8.233208e+16
## 0.00 0.45 1.840045e+17 0.3362654 9.265846e+16
## 0.00 0.50 2.044461e+17 0.3324543 1.029848e+17
## 0.00 0.55 2.248877e+17 0.3238881 1.133112e+17
## 0.00 0.60 2.453294e+17 0.3230661 1.236376e+17
## 0.00 0.65 2.657711e+17 0.3184394 1.339640e+17
## 0.00 0.70 2.862128e+17 0.3169251 1.442904e+17
## 0.00 0.75 3.066546e+17 0.3117650 1.546167e+17
## 0.00 0.80 3.270964e+17 0.3088340 1.649431e+17
## 0.00 0.85 3.475383e+17 0.3089970 1.752695e+17
## 0.00 0.90 3.679801e+17 0.3089014 1.855959e+17
## 0.00 0.95 3.884220e+17 0.3073362 1.959223e+17
## 0.00 1.00 4.088638e+17 0.3149600 2.062487e+17
## 0.01 0.05 2.609184e+01 0.3880351 1.881405e+01
## 0.01 0.10 4.162396e+01 0.3610719 2.989143e+01
## 0.01 0.15 5.651777e+01 0.3693145 4.045312e+01
## 0.01 0.20 7.126880e+01 0.3874966 5.079519e+01
## 0.01 0.25 8.586673e+01 0.3970273 6.119685e+01
## 0.01 0.30 1.009275e+02 0.3917432 7.195044e+01
## 0.01 0.35 1.158119e+02 0.3863885 8.260432e+01
## 0.01 0.40 1.309276e+02 0.3784225 9.345762e+01
## 0.01 0.45 1.463839e+02 0.3700128 1.043391e+02
## 0.01 0.50 1.616195e+02 0.3646698 1.150244e+02
## 0.01 0.55 1.768281e+02 0.3567480 1.257176e+02
## 0.01 0.60 1.920786e+02 0.3454999 1.365168e+02
## 0.01 0.65 2.071960e+02 0.3325282 1.472707e+02
## 0.01 0.70 2.223368e+02 0.3194204 1.580493e+02
## 0.01 0.75 2.374404e+02 0.3094080 1.687871e+02
## 0.01 0.80 2.524171e+02 0.3027810 1.794435e+02
## 0.01 0.85 2.672948e+02 0.2965771 1.900494e+02
## 0.01 0.90 2.821750e+02 0.2922559 2.007147e+02
## 0.01 0.95 2.969906e+02 0.2897853 2.113596e+02
## 0.01 1.00 3.118617e+02 0.2874091 2.220673e+02
## 0.10 0.05 1.244936e+01 0.5316761 9.539829e+00
## 0.10 0.10 1.198920e+01 0.4884311 8.813599e+00
## 0.10 0.15 1.212936e+01 0.4725444 8.965440e+00
## 0.10 0.20 1.228602e+01 0.4664706 9.132690e+00
## 0.10 0.25 1.231336e+01 0.4735508 9.214734e+00
## 0.10 0.30 1.238419e+01 0.4766569 9.302568e+00
## 0.10 0.35 1.239108e+01 0.4840354 9.325327e+00
## 0.10 0.40 1.239906e+01 0.4888709 9.352925e+00
## 0.10 0.45 1.244732e+01 0.4888835 9.412340e+00
## 0.10 0.50 1.252430e+01 0.4877303 9.511774e+00
## 0.10 0.55 1.261288e+01 0.4849697 9.603833e+00
## 0.10 0.60 1.269909e+01 0.4814964 9.707767e+00
## 0.10 0.65 1.278202e+01 0.4768213 9.793030e+00
## 0.10 0.70 1.282367e+01 0.4745373 9.836045e+00
## 0.10 0.75 1.284464e+01 0.4739590 9.872424e+00
## 0.10 0.80 1.288013e+01 0.4726541 9.928584e+00
## 0.10 0.85 1.294160e+01 0.4707654 1.000283e+01
## 0.10 0.90 1.300912e+01 0.4686746 1.007368e+01
## 0.10 0.95 1.306908e+01 0.4667916 1.013625e+01
## 0.10 1.00 1.313379e+01 0.4647849 1.019146e+01
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.1 and lambda = 0.1.
plot(enet_model)
# Held-out predictions from the tuned ridge and elastic net models.
pred_ridge <- ridge_model %>% predict(newdata = dep_var_test)
pred_enet <- enet_model %>% predict(newdata = dep_var_test)
# Test-set accuracy summaries, ridge first and elastic net second.
postResample(obs = indep_var_test, pred = pred_ridge)
## RMSE Rsquared MAE
## 10.5695413 0.4963474 7.3257501
postResample(obs = indep_var_test, pred = pred_enet)
## RMSE Rsquared MAE
## 8.628018 0.587423 6.392301
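Judged by the resampled (cross-validated) R-squared of the selected models, the ridge model is best (0.517), though it is very similar to the PLS model (0.516). On the test set, however, the elastic net model has the highest R-squared (0.587, versus 0.506 for PLS and 0.496 for ridge). I will note that the processing speed of the PLS model is by far the fastest, while the ridge and elastic net models are similar.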
data("ChemicalManufacturingProcess")
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
md.pattern(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 152 1 1 1 1
## 6 1 1 1 1
## 1 1 1 1 1
## 7 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess09 ManufacturingProcess13 ManufacturingProcess15
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess32 ManufacturingProcess37 ManufacturingProcess38
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess39 ManufacturingProcess42 ManufacturingProcess43
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess44 ManufacturingProcess45 ManufacturingProcess01
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 0
## 0 0 1
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess07
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess08 ManufacturingProcess12 ManufacturingProcess14
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 0
## 1 0 0 1
## 1 1 1
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 0
## 1 1 1 1
## 1 0 0 0
## 1 1 2
## ManufacturingProcess02 ManufacturingProcess25 ManufacturingProcess26
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 0 0
## 2 0 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 1 1
## 3 5 5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess33
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess03
## 152 1 1 1 0
## 6 1 1 0 1
## 1 1 0 1 1
## 7 0 0 0 3
## 5 1 1 1 11
## 2 1 1 1 1
## 1 1 1 1 1
## 1 0 0 0 4
## 1 0 0 0 16
## 9 10 15 106
Above is a visual representation of the missing-value patterns. We will impute the missing values.
# Seed once up front so the train/test split and the cross-validation folds
# are reproducible (and any random tie-breaking inside kNN(), if it occurs).
set.seed(329)

# Impute missing values with k-nearest neighbours; imp_var = FALSE keeps the
# result free of the extra imputation-indicator columns.
cmp_df <- kNN(ChemicalManufacturingProcess, imp_var = FALSE)

# Build the predictor set: drop near-zero-variance columns AND the response.
# Leaving Yield among the predictors lets the model predict Yield from itself,
# which produces a meaninglessly perfect fit. Selecting columns by name also
# stays correct when nearZeroVar() flags nothing (a negative index built from
# an empty vector would select zero columns).
# NOTE: previously recorded results that list Yield as a predictor predate
# this fix and must be regenerated.
zeroVar <- nearZeroVar(cmp_df)
predictor_cols <- setdiff(names(cmp_df), c("Yield", names(cmp_df)[zeroVar]))
cmp_df_final <- cmp_df[, predictor_cols, drop = FALSE]

# 80/20 split, stratified on the response by createDataPartition().
part <- ChemicalManufacturingProcess$Yield %>%
  createDataPartition(p = 0.8, list = FALSE, times = 1)
x_train <- cmp_df_final[part, ]
x_test <- cmp_df_final[-part, ]
y_train <- ChemicalManufacturingProcess$Yield[part]
y_test <- ChemicalManufacturingProcess$Yield[-part]

# PLS on centered/scaled predictors, tuned over 20 candidate values of ncomp
# with cross-validation.
cmp_pls_model <- train(
  x = x_train,
  y = y_train,
  method = "pls",
  tuneLength = 20,
  trControl = trainControl(method = "cv"),
  preProcess = c("center", "scale")
)
cmp_pls_model
## Partial Least Squares
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 129, 130, 130, 131, 129, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.22489130 0.5787539 0.97030866
## 2 1.39840738 0.6534780 0.94420992
## 3 0.80956405 0.8240471 0.65453786
## 4 0.81327223 0.8212696 0.60136421
## 5 0.69915117 0.8497912 0.46409424
## 6 0.64738161 0.8636482 0.40605417
## 7 0.42498123 0.9296910 0.26112045
## 8 0.35802498 0.9402109 0.19914394
## 9 0.30887961 0.9494743 0.15871036
## 10 0.25653498 0.9600011 0.13163416
## 11 0.21584654 0.9682484 0.11061272
## 12 0.17627678 0.9734948 0.08986207
## 13 0.16951010 0.9758586 0.08249007
## 14 0.15912031 0.9784818 0.07473519
## 15 0.13777973 0.9828630 0.06554454
## 16 0.11728843 0.9882352 0.05657115
## 17 0.11624088 0.9893568 0.05046004
## 18 0.10862454 0.9886350 0.04404457
## 19 0.09248278 0.9903899 0.03686739
## 20 0.07740403 0.9927286 0.03067158
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 20.
# Predict the held-out rows with the tuned PLS model and summarise how well
# the predictions match the observed yield.
pred_cmp <- predict(cmp_pls_model, newdata = x_test)
postResample(pred = pred_cmp, obs = y_test)
## RMSE Rsquared MAE
## 0.01273013 0.99995231 0.01001609
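The output above reports a test-set RMSE of 0.0127 and an R-squared of 0.99995. This looks like an excellent model, but the fit is suspiciously perfect: the variable importance output below ranks Yield itself as the top predictor, which indicates that the response was included among the predictors when this output was produced, so these figures overstate the model's real predictive performance.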
# Rank the predictors by their importance in the fitted PLS model.
# NOTE(review): the explicit caret:: qualifier presumably avoids picking up a
# varImp() from another attached package — confirm before removing it.
caret::varImp(cmp_pls_model)
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## Yield 100.00
## ManufacturingProcess32 41.94
## ManufacturingProcess36 38.61
## ManufacturingProcess13 35.39
## ManufacturingProcess09 34.59
## BiologicalMaterial02 29.24
## ManufacturingProcess17 28.82
## BiologicalMaterial08 28.62
## BiologicalMaterial06 28.34
## ManufacturingProcess33 26.13
## BiologicalMaterial12 25.95
## ManufacturingProcess06 25.55
## BiologicalMaterial03 25.36
## BiologicalMaterial11 24.79
## ManufacturingProcess12 24.73
## ManufacturingProcess11 24.60
## BiologicalMaterial04 24.35
## BiologicalMaterial01 23.88
## ManufacturingProcess28 22.48
## ManufacturingProcess04 21.55
Setting aside Yield itself, ManufacturingProcess32, 36, 13, and 09 are the most important variables.
# Correlation of Yield with selected manufacturing-process predictors, one
# call per predictor so each coefficient prints on its own.
with(ChemicalManufacturingProcess, cor(Yield, ManufacturingProcess32))
## [1] 0.6083321
with(ChemicalManufacturingProcess, cor(Yield, ManufacturingProcess09))
## [1] 0.5034705
with(ChemicalManufacturingProcess, cor(Yield, ManufacturingProcess13))
## [1] -0.5036797
with(ChemicalManufacturingProcess, cor(Yield, ManufacturingProcess17))
## [1] -0.4258069
Those predictors that are negatively correlated with yield should likely be lowered, and those that are positively correlated should be increased.