library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RANN)
In Kuhn and Johnson do problems 6.2 and 6.3. There are only two but they consist of many parts. Please submit a link to your Rpubs and submit the .rmd file as well.
library(AppliedPredictiveModeling)
data(permeability)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
dim(fingerprints)
## [1] 165 1107
sparse <- nearZeroVar(fingerprints)
not_sparse <- fingerprints[, -sparse]
dim(not_sparse)
## [1] 165 388
388 predictors are left after applying the nearZeroVar
function on fingerprints.
df <- as.data.frame(fingerprints)
df <- df |>
mutate(target = permeability)
sparse <- nearZeroVar(df)
df <- df[, -sparse]
dim(df)
## [1] 165 389
splitdf <- createDataPartition(df$target, times = 1, p = 0.8, list = FALSE)
train <- df[splitdf, ]
test <- df[-splitdf, ]
set.seed(123)
pls_model <- train(target ~ ., data = train, method = "pls", metric="Rsquared",
center = TRUE, trControl = trainControl("cv", number = 10),
tuneLength = 25
)
pls_model
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 119, 118, 120, 121, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.89629 0.2595202 10.523834
## 2 12.08958 0.3930697 8.606173
## 3 11.92805 0.4412262 8.773964
## 4 11.69603 0.4647318 8.833409
## 5 11.73702 0.4685687 8.914923
## 6 11.74632 0.4891905 8.825990
## 7 11.92629 0.4782318 9.088580
## 8 11.87671 0.4891900 9.168992
## 9 11.80357 0.4871362 9.016852
## 10 11.71772 0.5031537 8.983574
## 11 11.74937 0.4967466 8.901958
## 12 11.86636 0.4876363 8.963664
## 13 11.63549 0.4972131 8.726712
## 14 11.62791 0.5001112 8.799882
## 15 11.67511 0.5050307 8.788139
## 16 11.59268 0.5224644 8.597767
## 17 11.19736 0.5503581 8.324308
## 18 11.18188 0.5514033 8.170512
## 19 11.33144 0.5381611 8.250898
## 20 11.31008 0.5447829 8.291521
## 21 11.49961 0.5436926 8.421616
## 22 11.54810 0.5456161 8.453671
## 23 11.63709 0.5369965 8.515249
## 24 11.68096 0.5406349 8.563958
## 25 11.94825 0.5326035 8.739179
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 18.
summary(pls_model)
## Data: X dimension: 133 388
## Y dimension: 133 1
## Fit method: oscorespls
## Number of components considered: 18
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.14 42.51 49.42 54.46 63.50 67.29 69.87
## .outcome 23.37 45.32 53.69 61.87 65.45 70.96 75.02
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 73.14 75.04 77.65 80.01 81.14 82.75 84.80
## .outcome 76.81 79.08 80.85 82.28 84.12 85.27 85.97
## 15 comps 16 comps 17 comps 18 comps
## X 86.16 87.81 88.75 89.88
## .outcome 86.76 87.39 88.34 88.96
The highest \(R^2\) occurs at ncomp = 6, the corresponding R^2 is 0.5678856.
predict_test <- predict(pls_model, test)
obs_values <- data.frame(obs = test$target, pred = predict_test)
colnames(obs_values) <- c("obs","pred")
defaultSummary(obs_values)
## RMSE Rsquared MAE
## 13.2695528 0.4305899 10.1390421
The \(R^2\) is 67.62% which is higher than part C.
set.seed(123)
pcr_model <- train(target ~ ., data = train, method = "pcr",
center = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 25
)
predict_pcr <- predict(pcr_model, test)
obs_values <- data.frame(obs = test$target, pred = predict_pcr)
colnames(obs_values) <- c("obs","pred")
defaultSummary(obs_values)
## RMSE Rsquared MAE
## 10.132705 0.592234 8.058536
The R^2 is 0.33 , which did not perform better than the PLS model.
Now to try with the ridge method.
set.seed(123)
ridgeGrid <- data.frame(.lambda = seq(0, .1, length = 15))
ridgeRegFit <- train(target ~ ., data = train,
method = "ridge",
metric = "Rsquared",
tuneGrid = ridgeGrid,
trControl = trainControl(method = "cv", number = 10),
preProc = c("center", "scale"))
predict_ridge <- predict(ridgeRegFit, test)
obs_values <- data.frame(obs = test$target, pred = predict_ridge)
colnames(obs_values) <- c("obs","pred")
defaultSummary(obs_values)
## RMSE Rsquared MAE
## 13.5934925 0.4031599 10.3669077
The R^2 was 0.40 which performed better than the PCR model but not the PLS.
Of the above models, I would not use PCR. The best performing model is the PLS model.
data("ChemicalManufacturingProcess")
dim(ChemicalManufacturingProcess)
## [1] 176 58
Imputing the missing values using KNN.
set.seed(123)
chem_impute <- preProcess(ChemicalManufacturingProcess, method=c('knnImpute'))
df <- predict(chem_impute, ChemicalManufacturingProcess)
dfx <- df |> select(-Yield)
dfy <- df |> select(Yield)
set.seed(123)
chem_train <- createDataPartition(dfy$Yield, p = .80, list= FALSE)
x_train <- dfx[chem_train,]
x_test <- dfx[-chem_train,]
y_train <- dfy[chem_train,]
y_test <- dfy[-chem_train,]
set.seed(123)
ridgeGrid <- data.frame(.lambda = seq(0,0.1,length=15))
ridgeFit <- train(x=x_train , y=y_train,
method='ridge',
tuneGrid=ridgeGrid,
trControl=trainControl(method = "cv", number = 10),
preProc = c('center','scale'))
ridgeFit
## Ridge Regression
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 6.559444 0.3705740 2.1257894
## 0.007142857 2.905917 0.4076010 1.1706542
## 0.014285714 2.356944 0.4321264 1.0217333
## 0.021428571 2.061485 0.4500623 0.9420376
## 0.028571429 1.868746 0.4640859 0.8889647
## 0.035714286 1.730620 0.4755309 0.8500597
## 0.042857143 1.625723 0.4851112 0.8199353
## 0.050000000 1.542823 0.4932645 0.7957412
## 0.057142857 1.475365 0.5002888 0.7761056
## 0.064285714 1.419231 0.5064010 0.7598050
## 0.071428571 1.371687 0.5117652 0.7460655
## 0.078571429 1.330839 0.5165093 0.7341978
## 0.085714286 1.295329 0.5207341 0.7238684
## 0.092857143 1.264153 0.5245201 0.7148742
## 0.100000000 1.236554 0.5279324 0.7068879
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
Using the ridge method the optimal value is 0.1.
predict_ridge <- predict(ridgeFit, x_test)
defaultSummary(data.frame(pred=predict_ridge,obs=y_test))
## RMSE Rsquared MAE
## 0.7428885 0.4953262 0.6433366
The \(R^2\) is 15% and the prediction shows a worse \(R^2\) than the training dataset.
The most important Predictors are Manifacturing Process 32, 13, and the Biological Material 06.
plot(varImp(ridgeFit, scale = FALSE), top=20, scales = list(y = list(cex = 0.8)))
correlation <- cor(select(df, 'ManufacturingProcess32','ManufacturingProcess13','BiologicalMaterial06', 'Yield'))
corrplot::corrplot(correlation, method='square', type="upper")
There are obvious correlations between the top predictors and the response. These predictors can lead to a better fitting of model creation and would likely be more statistically relevant. This also shows the direct relationship between which processes will have a negative or positive Yield.