if (!require("fpp2")) install.packages("fpp2")
if (!require("caret")) install.packages("caret")
if (!require("AppliedPredictiveModeling")) install.packages("AppliedPredictiveModeling")
if (!require("naniar")) install.packages("naniar")
if (!require("missForest")) install.packages("missForest")## permeability
## 1 12.520
## 2 1.120
## 3 19.405
## 4 1.730
## 5 1.680
## 6 0.510
Filter out the low-frequency predictors using the nearZeroVar function from the caret package. How many predictors are left for modeling?
## [1]  165 1107
## [1] 165 388
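These dimensions show that 388 of the original 1,107 fingerprint predictors survive the filter. The filtering chunk itself is not echoed above; a minimal sketch that reproduces it, using the fingerprints_nZ name referenced in the modeling code below:
dim(fingerprints)                        # 165 compounds x 1107 fingerprint predictors
nzv <- nearZeroVar(fingerprints)         # indices of near-zero-variance columns
fingerprints_nZ <- fingerprints[, -nzv]  # drop the low-frequency substructures
dim(fingerprints_nZ)                     # 165 x 388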
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:fpp2':
##
## gasoline
## The following object is masked from 'package:stats':
##
## loadings
set.seed(17)
ctrl <- trainControl(method = "cv", number = 10)
trainingRows <- createDataPartition(permeability,
p = 0.75,
list = FALSE)
trainFingerprints <- fingerprints_nZ[trainingRows,]
trainPermeability <- permeability[trainingRows,]
testFingerprints <- fingerprints_nZ[-trainingRows,]
testPermeability <- permeability[-trainingRows,]
plsTune <- train(x = trainFingerprints, y = log10(trainPermeability),
method = "pls",
tuneGrid = expand.grid(ncomp = 1:15),
trControl = ctrl)
plsTune
## Partial Least Squares
##
## 125 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 113, 112, 113, 113, 113, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.5922041 0.3181665 0.4853361
## 2 0.5337828 0.4266420 0.4200577
## 3 0.5008638 0.4620277 0.3917844
## 4 0.4997614 0.4869776 0.3989571
## 5 0.4872194 0.4983582 0.3888616
## 6 0.4734407 0.5420731 0.3730143
## 7 0.4579397 0.5739015 0.3711766
## 8 0.4607445 0.5564830 0.3685672
## 9 0.4677172 0.5537102 0.3736950
## 10 0.4748490 0.5507459 0.3845518
## 11 0.4674174 0.5656159 0.3754585
## 12 0.4743214 0.5543262 0.3848375
## 13 0.4762040 0.5513360 0.3867613
## 14 0.4757131 0.5561693 0.3850442
## 15 0.4910557 0.5331300 0.3897072
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 7.
## Data: X dimension: 125 388
## Y dimension: 125 1
## Fit method: oscorespls
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 26.26 40.03 49.00 56.32 63.06 65.79 68.17
## .outcome 26.11 50.77 58.87 65.46 70.39 75.89 79.33
The optimal PLS model uses ncomp = 7. It captures 68.17% of the variation in the predictors and 79.33% of the variation in the (log-transformed) response.
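The resampling profile can also be inspected visually with caret's plot method for train objects; a minimal sketch:
plot(plsTune)  # cross-validated RMSE against the number of PLS components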
pls.pred <- predict(plsTune, newdata = testFingerprints)
postResample(pred = pls.pred, obs = testPermeability)
##       RMSE   Rsquared        MAE
## 15.7580242 0.4324274 10.0108477
The test-set RMSE is 15.7580242 and the \(R^2\) is 0.4324274.
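Note that plsTune was fit on log10(trainPermeability), so pls.pred is presumably on the log10 scale while testPermeability is not, which distorts these metrics. A sketch of an original-scale evaluation, assuming the predictions are indeed on the log10 scale:
postResample(pred = 10^pls.pred, obs = testPermeability)  # back-transform before scoring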
ridgeGrid <- data.frame(lambda = seq(0.02, .35, length = 9))
ridgeTune <- train(trainFingerprints, trainPermeability, method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl,
preProc = c("center", "scale"))
ridgeTune
## Ridge Regression
##
## 125 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 113, 112, 113, 112, 113, 113, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.02000 12.31918 0.5352243 9.267566
## 0.06125 11.80614 0.5536890 8.866912
## 0.10250 11.76777 0.5563432 8.801663
## 0.14375 11.83743 0.5572497 8.883637
## 0.18500 11.94462 0.5585418 9.002302
## 0.22625 12.09303 0.5593004 9.133174
## 0.26750 12.26041 0.5603573 9.275228
## 0.30875 12.45165 0.5612793 9.456168
## 0.35000 12.66074 0.5620652 9.653745
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1025.
ridge.pred <- predict(ridgeTune, newdata = testFingerprints)
postResample(pred = ridge.pred, obs = testPermeability)
##       RMSE   Rsquared        MAE
## 13.5313372 0.3687824 9.8385404
The test-set RMSE is 13.5313372 and the \(R^2\) is 0.3687824.
The PLS model worked better on this data, posting a higher test-set \(R^2\) (0.432 vs. 0.369), although the ridge model achieved the lower test-set RMSE.
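Since both models were tuned with 10-fold cross-validation, caret's resamples() offers a more direct comparison than single test-set numbers; a minimal sketch (note that plsTune was fit on the log10 response and ridgeTune on the raw response, so the RMSE and MAE columns are not on the same scale):
resamps <- resamples(list(PLS = plsTune, Ridge = ridgeTune))  # pool the CV results
summary(resamps)  # distributions of RMSE, R-squared, and MAE across folds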
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176  58
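The imputation chunk that produced the log below is not echoed; a minimal sketch of it, assuming missForest defaults and the cmp_impute name used later:
cmp_impute <- missForest(ChemicalManufacturingProcess)  # random-forest imputation of the missing cells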
## missForest iteration 1 in progress...done!
## missForest iteration 2 in progress...done!
## missForest iteration 3 in progress...done!
## missForest iteration 4 in progress...done!
## missForest iteration 5 in progress...done!
## missForest iteration 6 in progress...done!
## missForest iteration 7 in progress...done!
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry, :
##   The response has five or fewer unique values. Are you sure you want to do
##   regression?
## (this warning repeats six times in each of the seven iterations)
cmp_impute <- as.data.frame(cmp_impute$ximp)
gg_miss_var(cmp_impute, show_pct = TRUE) + labs(y = "Look at all the missing ones")
library(pls)
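A quick numeric check to confirm the imputation worked (the plot above should show 0% missing for every column):
sum(is.na(cmp_impute))  # expected to be 0 after imputation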
set.seed(17)
smp1 <- floor(0.75 * nrow(cmp_impute))
trainingRows <- sample(seq_len(nrow(cmp_impute)), size = smp1)
# Index the response (column 1, Yield) with the same rows as the predictors
# so that X and y stay aligned
cmpTrain <- cmp_impute[trainingRows, -1]
cmpTest <- cmp_impute[-trainingRows, -1]
yTrain <- cmp_impute[trainingRows, 1]
yTest <- cmp_impute[-trainingRows, 1]
cmpTune <- train(cmpTrain, yTrain,
method = "pls",
tuneLength = 20, trControl = ctrl,
preProc = c("center", "scale"))
cmpTune
## Partial Least Squares
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 118, 120, 117, 120, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 2.086601 0.1231083 1.662202
## 2 2.592784 0.1391767 1.720826
## 3 2.658867 0.1392382 1.755226
## 4 2.449071 0.1594926 1.697227
## 5 2.639974 0.1670721 1.760308
## 6 2.902953 0.1949944 1.826592
## 7 2.842816 0.1697681 1.860282
## 8 2.703167 0.1464872 1.865939
## 9 2.713845 0.1587348 1.856070
## 10 2.908729 0.1406090 1.959426
## 11 3.165194 0.1328351 2.050657
## 12 3.252465 0.1250539 2.071121
## 13 3.433214 0.1234438 2.130068
## 14 3.668955 0.1260363 2.207788
## 15 3.971703 0.1257412 2.302059
## 16 4.184109 0.1344587 2.361407
## 17 4.299645 0.1292648 2.403458
## 18 4.335854 0.1316881 2.412877
## 19 4.525885 0.1368628 2.471902
## 20 4.784033 0.1293965 2.564042
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
cmp.pred <- predict(cmpTune, newdata = cmpTest)
postResample(pred = cmp.pred, obs = yTest)
##         RMSE     Rsquared          MAE
## 2.130686e+00 1.583659e-05 1.664855e+00
The test-set \(R^2\) is essentially zero (1.583659e-05).
ridgeGrid <- data.frame(lambda = seq(0, .1, length = 15))
set.seed(17)
ridgeTune2 <- train(cmpTrain, yTrain, method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl)
ridgeTune2
## Ridge Regression
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 120, 120, 119, 119, 118, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 18.930687 0.1506448 6.371037
## 0.007142857 5.565949 0.1503616 2.705400
## 0.014285714 4.476050 0.1573590 2.391740
## 0.021428571 3.958375 0.1625925 2.239005
## 0.028571429 3.646169 0.1665722 2.144118
## 0.035714286 3.435074 0.1696674 2.077669
## 0.042857143 3.282145 0.1721305 2.027880
## 0.050000000 3.166053 0.1741347 1.988835
## 0.057142857 3.074884 0.1758014 1.957554
## 0.064285714 3.001410 0.1772180 1.932097
## 0.071428571 2.940979 0.1784481 1.911618
## 0.078571429 2.890446 0.1795388 1.894298
## 0.085714286 2.847608 0.1805252 1.879272
## 0.092857143 2.810873 0.1814336 1.866090
## 0.100000000 2.779060 0.1822835 1.854418
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
ridge2.pred <- predict(ridgeTune2, newdata = cmpTest)
postResample(pred = ridge2.pred, obs = yTest)
##       RMSE   Rsquared        MAE
## 2.18188056 0.01650807 1.67904155
varImp(ridgeTune2)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess05 100.00
## ManufacturingProcess35 47.34
## ManufacturingProcess02 43.16
## ManufacturingProcess31 37.91
## ManufacturingProcess21 37.91
## ManufacturingProcess06 37.83
## ManufacturingProcess14 33.86
## ManufacturingProcess09 32.66
## ManufacturingProcess10 32.20
## BiologicalMaterial09 29.68
## ManufacturingProcess27 29.66
## BiologicalMaterial11 27.88
## ManufacturingProcess26 27.25
## BiologicalMaterial05 26.43
## BiologicalMaterial03 24.31
## BiologicalMaterial01 23.20
## ManufacturingProcess29 22.43
## ManufacturingProcess39 21.35
## BiologicalMaterial10 21.08
## ManufacturingProcess17 19.04
The ManufacturingProcess predictors appear to be more important than the BiologicalMaterial predictors. Knowing the most important manufacturing process steps lets the company pinpoint where to start fine-tuning the procedure.
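To see this split at a glance, caret's plot method for variable-importance objects can chart the top predictors; a minimal sketch, assuming the importance ranking above came from ridgeTune2:
plot(varImp(ridgeTune2), top = 20)  # bar chart of the 20 most important predictors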