6.2
A and B
data(permeability)
fingerprints <- fingerprints
fingerprints_trim <- fingerprints[, -nearZeroVar(fingerprints)]
fingerprints_df <- data.frame(fingerprints)
fingerprints_trim_df <- data.frame(fingerprints_trim)The trimmed data leaves 388 variables, as opposed to the original 1107.
C
set.seed(34)
train_idx <- sample(c(TRUE,FALSE), nrow(fingerprints_trim),
replace=TRUE, prob=c(0.7,0.3))
train_set <- fingerprints_trim[train_idx,]
test_set <- fingerprints_trim[!train_idx,]
train_set_permeability <- permeability[train_idx,]
test_set_permeability <- permeability[!train_idx,]
fit <- train(train_set,train_set_permeability, method = "pls",tunelength = 15, trControl = trainControl(method = "cv"))
plot(fit)fit2 <- train(train_set,train_set_permeability, method = "pls",metric = "Rsquared",tunelength = 15, trControl = trainControl(method = "cv"),preProcess = c("center","scale"))
plot(fit2)fit2## Partial Least Squares
##
## 106 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 95, 96, 95, 95, 96, 95, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.29527 0.4125748 9.601379
## 2 10.72521 0.5975011 7.585166
## 3 11.27872 0.5444427 8.274176
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 2.
It appears that the RMSE is minimized at a latent variable count of 3. The R-Squared value is maximized at a component count of three, and the optimal component count is 3, with an RMSE of 11.62, an R-Squared of 0.485, and an MAE of 8.59.
D
fit2_predict <- predict(fit2,test_set)
postResample(fit2_predict,test_set_permeability)## RMSE Rsquared MAE
## 14.2869967 0.3011959 9.3219372
The R-Squared value for the predictions versus the test set is 0.35, which is not particularly accurate.
plot(fit2_predict,test_set_permeability,main="Actual Permeability vs. Predicted")
abline(0,1,col='green',lwd=3)E
set.seed(34)
ridgeModel <- enet(x = train_set, y = train_set_permeability,lambda = 0.001)
ridgePred <- predict(ridgeModel, newx = test_set,s=1,mode="fraction",type="fit")
#ridgeModel
ridgeGrid <- data.frame(.lambda = seq(0,0.1,length = 15))
ridgeFit <- train(train_set,train_set_permeability,method="ridge",tuneGrid=ridgeGrid,trControl = trainControl(method = "cv"),preProc = c("center","scale"))
#ridgeFit
ridgeFitPred <- predict(ridgeFit,test_set)
enetFit <- enet(train_set,train_set_permeability,lambda = 0.01,normalize = TRUE)
enetPredict <- predict(enetFit,newx = test_set,s=0.1,mode="fraction",type="fit")
enetCoef <- predict(enetFit,newx = test_set,s=0.1,mode="fraction",type="coefficients")
enetGrid <- expand.grid(.lambda = c(0,0.01,.1),.fraction = seq(0.05,1,length = 20))
set.seed(34)
enetTune <- train(train_set,train_set_permeability,method="enet",tuneGrid = enetGrid,trControl = trainControl(method = "cv"),preProc = c("center","scale"))
enetTunePred <- predict(enetTune,test_set)
enetTune## Elasticnet
##
## 106 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 95, 96, 95, 97, 96, 96, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00 0.05 9.056620e+13 0.3621577 5.188847e+13
## 0.00 0.10 1.713543e+14 0.3322881 9.534385e+13
## 0.00 0.15 2.365535e+14 0.2706256 1.313106e+14
## 0.00 0.20 2.992768e+14 0.2327538 1.647291e+14
## 0.00 0.25 3.587014e+14 0.2119950 1.967287e+14
## 0.00 0.30 4.178840e+14 0.2081764 2.285876e+14
## 0.00 0.35 4.781395e+14 0.2118402 2.605348e+14
## 0.00 0.40 5.381015e+14 0.2115654 2.923785e+14
## 0.00 0.45 5.913883e+14 0.2093031 3.207279e+14
## 0.00 0.50 6.444731e+14 0.2007574 3.489566e+14
## 0.00 0.55 6.975674e+14 0.1858921 3.771849e+14
## 0.00 0.60 7.506665e+14 0.1787935 4.054116e+14
## 0.00 0.65 8.037655e+14 0.1752424 4.336354e+14
## 0.00 0.70 8.568698e+14 0.1759662 4.618591e+14
## 0.00 0.75 9.099785e+14 0.1760993 4.900829e+14
## 0.00 0.80 9.630908e+14 0.1773988 5.183066e+14
## 0.00 0.85 1.016206e+15 0.1768748 5.465304e+14
## 0.00 0.90 1.069324e+15 0.1774035 5.747541e+14
## 0.00 0.95 1.122445e+15 0.1802759 6.029779e+14
## 0.00 1.00 1.175567e+15 0.1822871 6.312016e+14
## 0.01 0.05 1.179430e+01 0.4912372 8.674437e+00
## 0.01 0.10 1.217692e+01 0.4159076 8.966008e+00
## 0.01 0.15 1.254164e+01 0.3713976 9.529837e+00
## 0.01 0.20 1.234775e+01 0.3740702 9.371762e+00
## 0.01 0.25 1.235898e+01 0.3772326 9.366483e+00
## 0.01 0.30 1.247293e+01 0.3831246 9.363444e+00
## 0.01 0.35 1.269398e+01 0.3896171 9.650732e+00
## 0.01 0.40 1.313598e+01 0.3806053 1.006789e+01
## 0.01 0.45 1.343670e+01 0.3811773 1.033216e+01
## 0.01 0.50 1.370869e+01 0.3856325 1.056208e+01
## 0.01 0.55 1.388165e+01 0.3931708 1.070514e+01
## 0.01 0.60 1.410055e+01 0.3983391 1.085938e+01
## 0.01 0.65 1.439299e+01 0.3901870 1.107602e+01
## 0.01 0.70 1.474899e+01 0.3697299 1.140296e+01
## 0.01 0.75 1.508900e+01 0.3536673 1.172481e+01
## 0.01 0.80 1.542673e+01 0.3429953 1.200177e+01
## 0.01 0.85 1.571090e+01 0.3364431 1.220613e+01
## 0.01 0.90 1.593133e+01 0.3294242 1.234311e+01
## 0.01 0.95 1.617689e+01 0.3197790 1.251011e+01
## 0.01 1.00 1.643975e+01 0.3124922 1.270207e+01
## 0.10 0.05 1.216666e+01 0.4928557 9.299628e+00
## 0.10 0.10 1.155801e+01 0.4928101 8.448121e+00
## 0.10 0.15 1.161601e+01 0.4747397 8.444421e+00
## 0.10 0.20 1.203268e+01 0.4343678 8.870756e+00
## 0.10 0.25 1.235444e+01 0.4045202 9.279089e+00
## 0.10 0.30 1.247271e+01 0.3936483 9.451679e+00
## 0.10 0.35 1.255844e+01 0.3917786 9.505062e+00
## 0.10 0.40 1.262808e+01 0.3926941 9.583938e+00
## 0.10 0.45 1.267503e+01 0.3948013 9.655057e+00
## 0.10 0.50 1.275742e+01 0.3974248 9.744174e+00
## 0.10 0.55 1.288060e+01 0.4003520 9.882386e+00
## 0.10 0.60 1.304758e+01 0.4002051 1.006542e+01
## 0.10 0.65 1.322454e+01 0.3974483 1.024794e+01
## 0.10 0.70 1.338071e+01 0.3949131 1.040379e+01
## 0.10 0.75 1.348294e+01 0.3940636 1.050251e+01
## 0.10 0.80 1.354638e+01 0.3922357 1.055728e+01
## 0.10 0.85 1.358146e+01 0.3907525 1.058106e+01
## 0.10 0.90 1.361097e+01 0.3892241 1.059270e+01
## 0.10 0.95 1.364295e+01 0.3876222 1.059820e+01
## 0.10 1.00 1.367401e+01 0.3854058 1.060001e+01
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.1 and lambda = 0.1.
plot(enetTune)postResample(ridgeFitPred,test_set_permeability)## RMSE Rsquared MAE
## 12.9702704 0.4401115 8.4143759
postResample(ridgePred$fit,test_set_permeability)## RMSE Rsquared MAE
## 22.0446443 0.1290779 13.5856572
postResample(enetTunePred,test_set_permeability)## RMSE Rsquared MAE
## 11.9205610 0.4323006 8.1705706
The enet based ridge model with a lambda of 0.1 provided the best fit with an R-squared of 0.440. The PLS model’s actual output yielded an R-squared of 0.353. The basic ridge model that used train( ) produced an R-squared of 0.129.
The enetTune model, with a lambda of 0.1, and fraction of 0.1, yielded an R-squared of 0.432. This falls in the middle, where it is slightly better than the PLS model, but not as strong as the enet based ridge model.
r2_eval <- data.frame(0.440,0.353,0.129,0.432)
colnames(r2_eval) <- c("Enet Based Ridge Model","PLS","Train Based Ridge Model","enetTune Model")
rownames(r2_eval) <- c("R-Squared")
kbl(r2_eval, longtable = T, booktabs = T, caption = "Model Performance Summary") %>%
kable_styling(latex_options = c("repeat_header"))| Enet Based Ridge Model | PLS | Train Based Ridge Model | enetTune Model | |
|---|---|---|---|---|
| R-Squared | 0.44 | 0.353 | 0.129 | 0.432 |
F
From the performances of the models, I would consider replacing the PLS model with the Enet based Ridge model or the enetTune model, due to their superior performance on the test set.
6.3
A
data(ChemicalManufacturingProcess)B
Very few NA values are present in the dataset. Using preProcess, we can impute the missing values with a more systematic approach than simply imputing median values when dealing with highly complex sets.
#summary(ChemicalManufacturingProcess)
gaps <- preProcess(ChemicalManufacturingProcess,method = "bagImpute")
ChemicalManufacturingProcess_cleaned <- predict(gaps,ChemicalManufacturingProcess)
#w3summary(ChemicalManufacturingProcess_cleaned)
cmp_c <- ChemicalManufacturingProcess_cleanedC
set.seed(34)
train_idx <- sample(c(TRUE,FALSE), nrow(cmp_c),
replace=TRUE, prob=c(0.7,0.3))
train_set <- cmp_c[train_idx,]
test_set <- cmp_c[!train_idx,]
train_yield <- data.frame(cmp_c[train_idx,1])
test_yield <- data.frame(cmp_c[!train_idx,1])
colnames(train_yield) <- c('Yield')
colnames(test_yield) <- c('Yield')
fit <- train(Yield ~.,train_set, method = "pls",tunelength = 25, trControl = trainControl(method = "cv"))
plot(fit)fit## Partial Least Squares
##
## 115 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 104, 103, 103, 103, 104, 104, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.650668 0.2067468 1.351937
## 2 1.622864 0.2547191 1.315346
## 3 1.648419 0.2285300 1.338214
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
Without pre-processing, a PLS model with 3 components yields an RMSE of 1.53 with an R-squared of 0.292. This leaves much to be desired.
fit2 <- train(Yield ~.,train_set, method = "pls",metric = "Rsquared",tunelength = 25, trControl = trainControl(method = "cv"),preProcess = c("center","scale"))## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
plot(fit2)fit2## Partial Least Squares
##
## 115 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 103, 105, 104, 103, 104, 104, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.364407 0.4294793 1.1219643
## 2 1.270096 0.5274166 1.0249677
## 3 1.210129 0.5646608 0.9930523
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 3.
With basic Pre-Processing, a much better result is obtained. With 3 components, the RMSE is 1.186 and the R-squared is 0.573.
The PLS model may be a suitable model to proceed with.
D
fit2_predict <- predict(fit2,test_set[,-1])
postResample(fit2_predict,test_set[,1])## RMSE Rsquared MAE
## 2.0710733 0.3053201 1.2606323
The actual performance of the PLS model was not as accurate as the training suggested.
Next, Enet and LARS models will be tested.
LARS:
set.seed(34)
fit3 <- train(Yield ~.,train_set, method = "lars",metric = "Rsquared",tuneLength = 20, trControl = trainControl(method = "cv"),preProc = c("center","scale"))## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
#fit3
plot(fit3)fit3_predict <- predict(fit3,test_set[,-1])
postResample(fit3_predict,test_set[,1])## RMSE Rsquared MAE
## 1.2004979 0.6312233 0.9101644
Training performance:
0.10 1.206940 0.5617840 0.9657577
Enet:
set.seed(34)
fit4 <- train(Yield ~.,train_set,method="enet",tuneGrid = enetGrid,trControl = trainControl(method = "cv"),preProc = c("center","scale"))## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning: model fit failed for Fold03: lambda=0.10, fraction=1 Error in elasticnet::enet(as.matrix(x), y, lambda = param$lambda) :
## Some of the columns of x have zero variance
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning: model fit failed for Fold03: lambda=0.01, fraction=1 Error in elasticnet::enet(as.matrix(x), y, lambda = param$lambda) :
## Some of the columns of x have zero variance
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning: model fit failed for Fold03: lambda=0.00, fraction=1 Error in elasticnet::enet(as.matrix(x), y, lambda = param$lambda) :
## Some of the columns of x have zero variance
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
#fit4
plot(fit4)fit4_predict <- predict(fit4,test_set[,-1])
postResample(fit4_predict,test_set[,1])## RMSE Rsquared MAE
## 1.2558419 0.6031449 0.9381976
#fit4Both the LARS and Enet models were very successful, in fact, their results were identical. From the elasticnet documentation, it seems like LARS and Enet are from the same algorithm.
r2_eval <- data.frame(0.305,0.6699,0.6699)
colnames(r2_eval) <- c("PLS","LARS Model","enetTune Model")
r2_train <- data.frame(0.573,0.561,0.578)
colnames(r2_train) <- c("PLS","LARS Model","enetTune Model")
r2_eval <- rbind(r2_eval,r2_train)
colnames(r2_eval) <- c("PLS","LARS Model","enetTune Model")
rownames(r2_eval) <- c("R-Squared: Actual","R-Squared: Trained")
kbl(r2_eval, longtable = T, booktabs = T, caption = "Model Performance Summary") %>%
kable_styling(latex_options = c("repeat_header"))| PLS | LARS Model | enetTune Model | |
|---|---|---|---|
| R-Squared: Actual | 0.305 | 0.6699 | 0.6699 |
| R-Squared: Trained | 0.573 | 0.5610 | 0.5780 |
E
In each model ManufacturingProcess 32 was the most important variable. Only the LARS model sees Biological variables crack the top 5 in importance. Otherwise, the importance lists are dominated by manufacturing process variables.
varImp(fit2)##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 81.48
## ManufacturingProcess13 76.11
## ManufacturingProcess09 74.98
## ManufacturingProcess17 71.88
## ManufacturingProcess06 57.58
## ManufacturingProcess33 53.79
## ManufacturingProcess12 53.42
## BiologicalMaterial02 51.26
## BiologicalMaterial06 51.00
## BiologicalMaterial03 47.59
## ManufacturingProcess28 44.21
## ManufacturingProcess29 41.29
## ManufacturingProcess37 40.48
## BiologicalMaterial01 40.47
## BiologicalMaterial04 40.37
## ManufacturingProcess04 39.33
## BiologicalMaterial08 38.87
## ManufacturingProcess31 34.70
## BiologicalMaterial12 34.48
varImp(fit3)## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 69.87
## BiologicalMaterial06 58.29
## ManufacturingProcess17 53.74
## ManufacturingProcess36 53.51
## ManufacturingProcess09 51.94
## BiologicalMaterial03 50.78
## BiologicalMaterial02 43.62
## ManufacturingProcess31 42.19
## ManufacturingProcess27 40.52
## ManufacturingProcess20 39.46
## BiologicalMaterial01 36.25
## ManufacturingProcess02 34.95
## BiologicalMaterial12 34.52
## ManufacturingProcess29 33.73
## ManufacturingProcess06 32.30
## ManufacturingProcess33 31.55
## ManufacturingProcess12 28.63
## BiologicalMaterial04 26.79
## BiologicalMaterial09 22.41
varImp(fit4)## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 69.87
## BiologicalMaterial06 58.29
## ManufacturingProcess17 53.74
## ManufacturingProcess36 53.51
## ManufacturingProcess09 51.94
## BiologicalMaterial03 50.78
## BiologicalMaterial02 43.62
## ManufacturingProcess31 42.19
## ManufacturingProcess27 40.52
## ManufacturingProcess20 39.46
## BiologicalMaterial01 36.25
## ManufacturingProcess02 34.95
## BiologicalMaterial12 34.52
## ManufacturingProcess29 33.73
## ManufacturingProcess06 32.30
## ManufacturingProcess33 31.55
## ManufacturingProcess12 28.63
## BiologicalMaterial04 26.79
## BiologicalMaterial09 22.41
F
The correlation plot cannot fit all of the variables, so it will be filtered to show the top 15 variables only.
The biological material variables are heavily correlated with one another.
corr <- round(cor(cmp_c), 1)
corr15 <- corr[1:15,1:15]
ggcorrplot(corr15,method="circle")The top predictors’ correlation plot follows. The Enet model will be used to display the predictors’ correlation.
fit4_imp <- varImp(fit4)$importance %>%
arrange(-Overall) %>%
head(15)
corr <- round(cor(cmp_c), 1)
corr15 <- cmp_c %>%
select(c("Yield",row.names(fit4_imp)))
corr15 <- cor(corr15)
ggcorrplot(corr15,method="circle")Of the variables with the highest importance, ManufacturingProcess13 and ManufacturingProcess36 are negatively correlated with Yield. To improve future yield, those two processes should be refined, as they have a poor impact on yield.