library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(AppliedPredictiveModeling)
library(e1071)
#library(MASS)
data(permeability)
head(permeability)
## permeability
## 1 12.520
## 2 1.120
## 3 19.405
## 4 1.730
## 5 1.680
## 6 0.510
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
fingerprints_filter <- fingerprints[, -nearZeroVar(fingerprints)]
ncol(fingerprints_filter)
## [1] 388
Nearzerovar diagnoses predictors that have one unique value (i.e. are zero variance predictors) or predictors that are have both of the following characteristics: they have very few unique values relative to the number of samples and the ratio of the frequency of the most common value to the frequency of the second most common value is large. After applyiing nearzerovar, there are 388 predictors left for modeling
fingerprintsdf <- as.data.frame(fingerprints_filter)
df <- as.data.frame(fingerprintsdf) %>% mutate(permeability = permeability)
#Set random seed
set.seed(10)
#createdatapartition creates a series of test/training partitions with 80% train and 20% test
in_train <- createDataPartition(df$permeability, p = 0.8, times =1, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]
pls_model <- train(
permeability ~ ., data = train_df, method = "pls",
center = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 10
)
# Plot model RMSE vs different values of components
ggplot(pls_model)+xlab("Number of Variables")+ggtitle("PLS Model")
#best tuning parameter ncomp that minimizes the cross-validation error, RMSE
pls_model$bestTune$ncomp
## [1] 6
Training set is minimized RMSE at 6 components
summary(pls_model$finalModel)
## Data: X dimension: 133 388
## Y dimension: 133 1
## Fit method: oscorespls
## Number of components considered: 6
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 27.14 43.27 50.06 52.74 56.36 65.55
## .outcome 31.03 51.39 59.75 69.77 75.26 77.04
pls_model$results
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 13.09201 0.3906819 10.321638 3.908703 0.2666489 2.876551
## 2 2 11.72473 0.5206390 8.445616 3.400597 0.1985467 2.163855
## 3 3 11.43216 0.5331825 8.482856 2.581718 0.1581227 1.931290
## 4 4 11.46963 0.5377636 8.951449 1.753067 0.1380532 1.541152
## 5 5 11.17025 0.5475971 8.604173 1.782655 0.1617694 1.294497
## 6 6 10.78514 0.5696499 8.346736 1.787969 0.1587404 1.319356
## 7 7 10.81335 0.5702843 8.402793 1.972811 0.1430146 1.230677
## 8 8 10.88491 0.5658647 8.377881 1.810761 0.1495770 1.357485
## 9 9 10.98097 0.5651603 8.386326 2.151344 0.1519615 1.618178
## 10 10 11.01367 0.5787205 8.371964 2.198121 0.1270145 1.692152
65.55% of the variation (or information) contained in the predictors are captured by 6 principal components (ncomp = 6). Setting ncomp = 6, captures 77% of the information in the outcome variable. R2 is 0.55
# Make predictions
predictions <- pls_model%>% predict(test_df)
#RMSE <- RMSE(predictions, test_df$permeability)
#R2 <- R2(predictions, test_df$permeability)
#R2
Test set estimate of R2 is 0.2414261
plot(predictions)
fit <-lm(permeability ~ ., train_df)
#library(leaps)
#step.model <- train(permeability ~., data = train_df,
# method = "leapSeq",tuneGrid = data.frame(nvmax = 1:5)
# )
#step.model$results
I explored using StepWise Regression for the permeability model.
A stepwise regression model starts with no predictors, then sequentially add the most contributive predictors (like forward selection). After adding each new variable, remove any variables that no longer provide an improvement in the model fit (like backward selection).
library(RANN)
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
#preProcess function applies imputation methods based on K-nearest neighbors or bagged trees.
knn_model <- preProcess(ChemicalManufacturingProcess, "knnImpute")
df <- predict(knn_model, ChemicalManufacturingProcess)
##(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
CMPdf <- as.data.frame(ChemicalManufacturingProcess)
#Set random seed
set.seed(12)
#createdatapartition creates a series of test/training partitions with 80% train and 20% test
in_train <- createDataPartition(CMPdf$Yield, p = 0.8, times =1, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]
pls_model <- train(
Yield ~ ., data = train_df, method = "pls",
center = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 10
)
# Plot model RMSE vs different values of components
ggplot(pls_model)+xlab("Number of Variables")+ggtitle("PLS Model")
#best tuning parameter ncomp that minimizes the cross-validation error, RMSE
pls_model$bestTune$ncomp
## [1] 3
Training set is minimized RMSE at 3 components
summary(pls_model$finalModel)
## Data: X dimension: 144 57
## Y dimension: 144 1
## Fit method: oscorespls
## Number of components considered: 3
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps
## X 16.94 24.20 30.91
## .outcome 46.06 62.41 66.56
30.91% of the variation (or information) contained in the predictors are captured by 3 principal components (ncomp = 3). Setting ncomp = 3, captures 67% of the information in the outcome variable.
pls_model$results
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 0.7154980 0.4716383 0.5987667 0.17286072 0.2382826 0.15390159
## 2 2 0.6479572 0.5498812 0.5308600 0.10769795 0.1386762 0.08735811
## 3 3 0.6346496 0.5788219 0.5241662 0.08976446 0.1124878 0.08169289
## 4 4 0.6441229 0.5747939 0.5387781 0.08311165 0.1116881 0.07596516
## 5 5 0.6594725 0.5606652 0.5509134 0.08812827 0.1039268 0.08407498
## 6 6 0.6665982 0.5564884 0.5544757 0.10630413 0.1102803 0.09654060
## 7 7 0.6786587 0.5459157 0.5598025 0.11199890 0.1113569 0.09574859
## 8 8 0.6901993 0.5401769 0.5711970 0.12888688 0.1157777 0.09998550
## 9 9 0.7130042 0.5266754 0.5872945 0.13825953 0.1213003 0.10826496
## 10 10 0.7395996 0.5064334 0.6055324 0.16093844 0.1397360 0.12131821
R2 is 0.57
##(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
# Make predictions
predictions <- pls_model%>% predict(test_df)
#RMSE = RMSE(predictions, test_df$Yield)
#R2 = R2(predictions, test_df$Yield)
#R2
R2 is 0.65
plot(predictions)
pls_importance <- varImp(pls_model)$importance %>%
as.data.frame() %>%
filter(Overall >= 50) %>%
arrange(desc(Overall)) %>%
mutate(importance = row_number())
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
varImp(pls_model) %>%
plot(, top = max(pls_importance$importance))
The process predictors are most important to the yield, especially Mfg Process 32, 13,36,17, and 09
##(f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
library(corrplot)
## corrplot 0.84 loaded
##
## Attaching package: 'corrplot'
## The following object is masked from 'package:pls':
##
## corrplot
df %>% select(c('ManufacturingProcess32','ManufacturingProcess13','ManufacturingProcess36','ManufacturingProcess17','ManufacturingProcess09','Yield')) %>% cor() %>% corrplot(method = 'circle')
Mfg process 32 and 09 are highly positively correlated to the yield, while Mfg process 13,36,and 17 are negatively correlated to the yield. Mfg process 17 and 13 are positively correlated Mfg process 36 and 32 are negatively correlated Mfg process 09 and 13 are negatively correlated Mfg process 09 and 17 are negatively correlated
Understanding the relationships between the different mfg processes can help to fine tune the process and yield.