library(caret)
library(ggplot2)
library(fpp3)
library(ggcorrplot)
library(AppliedPredictiveModeling)
set.seed(1443)
data(permeability)
print(dim(permeability))
## [1] 165 1
print(dim(fingerprints))
## [1] 165 1107
## Remove the near-zero-variance fingerprints, keeping only informative predictors
var_pred <- fingerprints[, -nearZeroVar(fingerprints)]
dim(var_pred)
## [1] 165 388
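As a sanity check, the filter should account for the drop from 1107 to 388 columns; a minimal sketch reusing the same nearZeroVar() call:

## Count of near-zero-variance columns flagged (expect 1107 - 388 = 719)
length(nearZeroVar(fingerprints))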
split <- createDataPartition(permeability, p = 0.8, list = FALSE)
train_x <- var_pred[split,]
train_y <- permeability[split,]
test_X <- var_pred[-split,]
test_y <- permeability[-split,]
set.seed(1443)
pls_train <- train(train_x, train_y,
                   method = "pls", tuneLength = 25,
                   trControl = trainControl(method = "cv", number = 10),
                   preProc = c("center", "scale"))
plot(pls_train)
pls_train
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 121, 120, 120, 119, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.21323 0.3301230 9.872105
## 2 11.83099 0.4680823 8.467334
## 3 11.75278 0.4886548 8.780985
## 4 11.99988 0.4785925 8.989065
## 5 12.06330 0.4616812 9.001322
## 6 12.16404 0.4547237 9.064125
## 7 12.04014 0.4537933 9.163298
## 8 12.12776 0.4553275 9.261217
## 9 12.20789 0.4525521 9.210646
## 10 12.53937 0.4351989 9.390303
## 11 13.07614 0.4093738 9.728870
## 12 13.12346 0.4142870 9.871322
## 13 13.35677 0.4022612 10.104557
## 14 13.66186 0.3880024 10.299425
## 15 13.89203 0.3777918 10.522542
## 16 14.16532 0.3736698 10.856697
## 17 14.36446 0.3707620 10.946462
## 18 14.47205 0.3659479 11.067558
## 19 14.64323 0.3614900 11.010311
## 20 14.69715 0.3593224 10.971995
## 21 14.98362 0.3475234 11.038768
## 22 15.32376 0.3334012 11.175969
## 23 15.35426 0.3328376 11.051959
## 24 15.59607 0.3255702 11.184672
## 25 16.00694 0.3113101 11.486284
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
predict_pls <- predict(pls_train, test_X)
R2(predict_pls, test_y)
## [1] 0.4313704
RMSE(predict_pls, test_y)
## [1] 15.86217
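For a single-call summary, caret's postResample() reports test-set RMSE, R-squared, and MAE together; a minimal sketch using only objects already defined above:

## Equivalent one-call test-set summary
postResample(pred = predict_pls, obs = test_y)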
set.seed(1443)
pcr_train <- train(train_x, train_y,
                   method = "pcr", tuneLength = 25,
                   trControl = trainControl(method = "cv", number = 10),
                   preProc = c("center", "scale"))
plot(pcr_train)
pcr_train
## Principal Component Analysis
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 121, 120, 120, 119, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 14.88717 0.1410354 11.348058
## 2 14.95585 0.1324485 11.378366
## 3 13.73106 0.3365519 10.354973
## 4 13.36107 0.3277952 10.300683
## 5 12.64280 0.3908609 9.205762
## 6 12.70794 0.3864897 9.234212
## 7 12.63019 0.3961566 9.057372
## 8 12.35925 0.4233419 8.898799
## 9 12.44389 0.4114303 8.987111
## 10 11.63303 0.4930499 8.555707
## 11 11.73377 0.4867210 8.576545
## 12 11.78192 0.4821678 8.665948
## 13 11.98691 0.4542380 8.754383
## 14 12.04689 0.4472255 8.859189
## 15 11.97690 0.4531954 8.697334
## 16 11.90845 0.4653214 8.617335
## 17 12.16827 0.4398928 8.760784
## 18 12.08281 0.4436485 8.750053
## 19 12.31086 0.4344520 8.892193
## 20 12.29946 0.4333404 8.951419
## 21 12.49868 0.4146492 9.094315
## 22 12.44630 0.4236480 9.145401
## 23 12.53863 0.4142497 9.293348
## 24 12.42796 0.4264953 9.293154
## 25 12.22914 0.4402080 9.239369
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 10.
predict_pcr <- predict(pcr_train, test_X)
R2(predict_pcr, test_y)
## [1] 0.4332031
RMSE(predict_pcr, test_y)
## [1] 15.757
ridgeGrid <- data.frame(.lambda = seq(0, .1, length = 15))
set.seed(1443)
ridge_train <- train(train_x, train_y,
                     method = "ridge", tuneGrid = ridgeGrid,
                     trControl = trainControl(method = "cv", number = 10),
                     preProc = c("center", "scale"))
## Warning: model fit failed for Fold01: lambda=0.000000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold02: lambda=0.000000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
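The failed fits are expected at lambda = 0: with 388 predictors and only about 120 training rows per fold, the unpenalized least-squares solution is ill-defined. A sketch of an alternative grid (hypothetical, not run here) that starts just above zero and sidesteps the warnings:

## Hypothetical grid that skips the degenerate lambda = 0 endpoint
ridgeGrid_pos <- data.frame(.lambda = seq(0.001, 0.1, length = 15))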
plot(ridge_train)
ridge_train
## Ridge Regression
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 121, 120, 120, 119, 120, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 22.50605 0.2978755 15.904101
## 0.007142857 128.47388 0.1691512 83.696576
## 0.014285714 16.29640 0.2900475 11.730548
## 0.021428571 15.33957 0.3284795 11.152521
## 0.028571429 14.73754 0.3530934 10.755540
## 0.035714286 29.01421 0.3428775 20.120233
## 0.042857143 14.06749 0.3800531 10.339861
## 0.050000000 13.86963 0.3890570 10.219722
## 0.057142857 13.75425 0.3949576 10.167233
## 0.064285714 13.62162 0.4011882 10.101068
## 0.071428571 13.57279 0.4044552 10.088458
## 0.078571429 13.46615 0.4100259 10.034688
## 0.085714286 13.39811 0.4139794 9.998634
## 0.092857143 13.38930 0.4151784 10.004624
## 0.100000000 13.27311 0.4214283 9.937255
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
predict_ridge <- predict(ridge_train, test_X)
R2(predict_ridge, test_y)
## [1] 0.6227042
RMSE(predict_ridge, test_y)
## [1] 16.49997
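Because each model was tuned with the same 10-fold CV specification after resetting the seed to 1443, their resampling profiles should be directly comparable; a sketch with caret's resamples(), assuming the folds did line up across the three fits:

## Collate per-fold RMSE, Rsquared, and MAE for the three tuned models
model_comp <- resamples(list(PLS = pls_train, PCR = pcr_train, Ridge = ridge_train))
summary(model_comp)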
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
set.seed(1443)
process <- preProcess(ChemicalManufacturingProcess, method = "bagImpute")
imputed_chemical <- predict(process, ChemicalManufacturingProcess)
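A quick check that the bagged-tree imputation left no missing values behind (a minimal sketch):

## Expect 0 after bagImpute
sum(is.na(imputed_chemical))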
split <- createDataPartition(imputed_chemical$Yield, p = 0.8, list = FALSE)
y <- imputed_chemical[,1]
x <- imputed_chemical[,-1]
train_x <- x[split,]
train_y <- y[split]
test_x <- x[-split,]
test_y <- y[-split]
set.seed(1443)
pls_train <- train(train_x, train_y,
                   method = "pls", tuneLength = 25,
                   trControl = trainControl(method = "cv", number = 10),
                   preProc = c("center", "scale"))
plot(pls_train)
pls_train
## Partial Least Squares
##
## 144 samples
## 57 predictors
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 130, 128, 131, 130, 131, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.532986 0.4248748 1.214934
## 2 2.079642 0.4405548 1.267836
## 3 1.532556 0.5419626 1.099851
## 4 1.814502 0.5850576 1.184010
## 5 2.452461 0.5793607 1.333425
## 6 2.505524 0.5725612 1.344234
## 7 2.602309 0.5676628 1.385044
## 8 2.863044 0.5523034 1.472757
## 9 3.155712 0.5621558 1.553665
## 10 3.322721 0.5637690 1.607689
## 11 3.508670 0.5471983 1.662974
## 12 3.613347 0.5266699 1.694394
## 13 3.724715 0.5032166 1.729366
## 14 3.742217 0.5038309 1.728465
## 15 3.785565 0.5029822 1.739391
## 16 3.837287 0.5071728 1.758467
## 17 3.923864 0.5083730 1.778735
## 18 3.950756 0.5155231 1.785996
## 19 3.988199 0.5178042 1.791827
## 20 4.069812 0.5167287 1.816241
## 21 4.117213 0.5090540 1.839434
## 22 4.256853 0.5046730 1.881262
## 23 4.414629 0.4964198 1.927851
## 24 4.500891 0.4897974 1.951304
## 25 4.662419 0.4844464 2.002332
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
pls_pred <- predict(pls_train, test_x)
R2(pls_pred, test_y)
## [1] 0.5399706
RMSE(pls_pred, test_y)
## [1] 40.266
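Beyond the summary statistics, an observed-versus-predicted plot is a quick visual check of the test-set fit; a minimal sketch with ggplot2:

## Observed vs. predicted Yield on the held-out test set
ggplot(data.frame(obs = test_y, pred = pls_pred), aes(x = obs, y = pred)) +
    geom_point() +
    geom_abline(linetype = "dashed") +
    labs(x = "Observed Yield", y = "Predicted Yield")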
varImp(pls_train)
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 85.33
## ManufacturingProcess36 83.91
## ManufacturingProcess09 83.52
## ManufacturingProcess17 80.64
## ManufacturingProcess33 61.26
## BiologicalMaterial03 60.70
## BiologicalMaterial02 60.14
## BiologicalMaterial06 59.39
## ManufacturingProcess12 58.12
## BiologicalMaterial08 55.71
## ManufacturingProcess11 54.51
## ManufacturingProcess06 54.38
## BiologicalMaterial12 53.12
## BiologicalMaterial11 51.76
## BiologicalMaterial01 50.74
## BiologicalMaterial04 49.94
## ManufacturingProcess28 44.28
## ManufacturingProcess34 42.21
## ManufacturingProcess04 39.58
importance <- varImp(pls_train)
top_10 <- importance$importance %>%
    arrange(desc(Overall)) %>%
    head(10) %>%
    row.names()
imputed_chemical %>%
    select(all_of(c("Yield", top_10))) %>%
    cor() %>%
    ggcorrplot(method = "square", lab = TRUE, lab_size = 3)
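The heatmap can also be read off numerically; a sketch that ranks the top-10 predictors by the absolute value of their correlation with Yield:

## Correlation of each top predictor with Yield, strongest first
cor_yield <- cor(imputed_chemical[, top_10], imputed_chemical$Yield)
cor_yield[order(-abs(cor_yield)), , drop = FALSE]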