6.2

A and B

data(permeability)
fingerprints <- fingerprints
fingerprints_trim <- fingerprints[, -nearZeroVar(fingerprints)]

fingerprints_df <- data.frame(fingerprints)
fingerprints_trim_df <- data.frame(fingerprints_trim)

The trimmed data leaves 388 variables, as opposed to the original 1107.

C

set.seed(34)

train_idx <- sample(c(TRUE,FALSE), nrow(fingerprints_trim), 
                 replace=TRUE, prob=c(0.7,0.3))

train_set <- fingerprints_trim[train_idx,]
test_set <- fingerprints_trim[!train_idx,]

train_set_permeability <- permeability[train_idx,]
test_set_permeability <- permeability[!train_idx,]

fit <- train(train_set,train_set_permeability, method = "pls",tunelength = 15, trControl = trainControl(method = "cv"))

plot(fit)

fit2 <- train(train_set,train_set_permeability, method = "pls",metric = "Rsquared",tunelength = 15, trControl = trainControl(method = "cv"),preProcess = c("center","scale"))

plot(fit2)

fit2
## Partial Least Squares 
## 
## 106 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 95, 96, 95, 95, 96, 95, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##   1      12.29527  0.4125748  9.601379
##   2      10.72521  0.5975011  7.585166
##   3      11.27872  0.5444427  8.274176
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 2.

It appears that the RMSE is minimized at a latent variable count of 3. The R-Squared value is maximized at a component count of three, and the optimal component count is 3, with an RMSE of 11.62, an R-Squared of 0.485, and an MAE of 8.59.

D

fit2_predict <- predict(fit2,test_set)

postResample(fit2_predict,test_set_permeability)
##       RMSE   Rsquared        MAE 
## 14.2869967  0.3011959  9.3219372

The R-Squared value for the predictions versus the test set is 0.35, which is not particularly accurate.

plot(fit2_predict,test_set_permeability,main="Actual Permeability vs. Predicted")
abline(0,1,col='green',lwd=3)

E

set.seed(34)

ridgeModel <- enet(x = train_set, y = train_set_permeability,lambda = 0.001)
ridgePred <- predict(ridgeModel, newx = test_set,s=1,mode="fraction",type="fit")

#ridgeModel

ridgeGrid <- data.frame(.lambda = seq(0,0.1,length = 15))
ridgeFit <- train(train_set,train_set_permeability,method="ridge",tuneGrid=ridgeGrid,trControl = trainControl(method = "cv"),preProc = c("center","scale"))

#ridgeFit

ridgeFitPred <- predict(ridgeFit,test_set)


enetFit <- enet(train_set,train_set_permeability,lambda = 0.01,normalize = TRUE)

enetPredict <- predict(enetFit,newx = test_set,s=0.1,mode="fraction",type="fit")

enetCoef <- predict(enetFit,newx = test_set,s=0.1,mode="fraction",type="coefficients")

enetGrid <- expand.grid(.lambda = c(0,0.01,.1),.fraction = seq(0.05,1,length = 20))

set.seed(34)

enetTune <- train(train_set,train_set_permeability,method="enet",tuneGrid = enetGrid,trControl = trainControl(method = "cv"),preProc = c("center","scale"))

enetTunePred <- predict(enetTune,test_set)
enetTune
## Elasticnet 
## 
## 106 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 95, 96, 95, 97, 96, 96, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE          Rsquared   MAE         
##   0.00    0.05      9.056620e+13  0.3621577  5.188847e+13
##   0.00    0.10      1.713543e+14  0.3322881  9.534385e+13
##   0.00    0.15      2.365535e+14  0.2706256  1.313106e+14
##   0.00    0.20      2.992768e+14  0.2327538  1.647291e+14
##   0.00    0.25      3.587014e+14  0.2119950  1.967287e+14
##   0.00    0.30      4.178840e+14  0.2081764  2.285876e+14
##   0.00    0.35      4.781395e+14  0.2118402  2.605348e+14
##   0.00    0.40      5.381015e+14  0.2115654  2.923785e+14
##   0.00    0.45      5.913883e+14  0.2093031  3.207279e+14
##   0.00    0.50      6.444731e+14  0.2007574  3.489566e+14
##   0.00    0.55      6.975674e+14  0.1858921  3.771849e+14
##   0.00    0.60      7.506665e+14  0.1787935  4.054116e+14
##   0.00    0.65      8.037655e+14  0.1752424  4.336354e+14
##   0.00    0.70      8.568698e+14  0.1759662  4.618591e+14
##   0.00    0.75      9.099785e+14  0.1760993  4.900829e+14
##   0.00    0.80      9.630908e+14  0.1773988  5.183066e+14
##   0.00    0.85      1.016206e+15  0.1768748  5.465304e+14
##   0.00    0.90      1.069324e+15  0.1774035  5.747541e+14
##   0.00    0.95      1.122445e+15  0.1802759  6.029779e+14
##   0.00    1.00      1.175567e+15  0.1822871  6.312016e+14
##   0.01    0.05      1.179430e+01  0.4912372  8.674437e+00
##   0.01    0.10      1.217692e+01  0.4159076  8.966008e+00
##   0.01    0.15      1.254164e+01  0.3713976  9.529837e+00
##   0.01    0.20      1.234775e+01  0.3740702  9.371762e+00
##   0.01    0.25      1.235898e+01  0.3772326  9.366483e+00
##   0.01    0.30      1.247293e+01  0.3831246  9.363444e+00
##   0.01    0.35      1.269398e+01  0.3896171  9.650732e+00
##   0.01    0.40      1.313598e+01  0.3806053  1.006789e+01
##   0.01    0.45      1.343670e+01  0.3811773  1.033216e+01
##   0.01    0.50      1.370869e+01  0.3856325  1.056208e+01
##   0.01    0.55      1.388165e+01  0.3931708  1.070514e+01
##   0.01    0.60      1.410055e+01  0.3983391  1.085938e+01
##   0.01    0.65      1.439299e+01  0.3901870  1.107602e+01
##   0.01    0.70      1.474899e+01  0.3697299  1.140296e+01
##   0.01    0.75      1.508900e+01  0.3536673  1.172481e+01
##   0.01    0.80      1.542673e+01  0.3429953  1.200177e+01
##   0.01    0.85      1.571090e+01  0.3364431  1.220613e+01
##   0.01    0.90      1.593133e+01  0.3294242  1.234311e+01
##   0.01    0.95      1.617689e+01  0.3197790  1.251011e+01
##   0.01    1.00      1.643975e+01  0.3124922  1.270207e+01
##   0.10    0.05      1.216666e+01  0.4928557  9.299628e+00
##   0.10    0.10      1.155801e+01  0.4928101  8.448121e+00
##   0.10    0.15      1.161601e+01  0.4747397  8.444421e+00
##   0.10    0.20      1.203268e+01  0.4343678  8.870756e+00
##   0.10    0.25      1.235444e+01  0.4045202  9.279089e+00
##   0.10    0.30      1.247271e+01  0.3936483  9.451679e+00
##   0.10    0.35      1.255844e+01  0.3917786  9.505062e+00
##   0.10    0.40      1.262808e+01  0.3926941  9.583938e+00
##   0.10    0.45      1.267503e+01  0.3948013  9.655057e+00
##   0.10    0.50      1.275742e+01  0.3974248  9.744174e+00
##   0.10    0.55      1.288060e+01  0.4003520  9.882386e+00
##   0.10    0.60      1.304758e+01  0.4002051  1.006542e+01
##   0.10    0.65      1.322454e+01  0.3974483  1.024794e+01
##   0.10    0.70      1.338071e+01  0.3949131  1.040379e+01
##   0.10    0.75      1.348294e+01  0.3940636  1.050251e+01
##   0.10    0.80      1.354638e+01  0.3922357  1.055728e+01
##   0.10    0.85      1.358146e+01  0.3907525  1.058106e+01
##   0.10    0.90      1.361097e+01  0.3892241  1.059270e+01
##   0.10    0.95      1.364295e+01  0.3876222  1.059820e+01
##   0.10    1.00      1.367401e+01  0.3854058  1.060001e+01
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.1 and lambda = 0.1.
plot(enetTune)

postResample(ridgeFitPred,test_set_permeability)
##       RMSE   Rsquared        MAE 
## 12.9702704  0.4401115  8.4143759
postResample(ridgePred$fit,test_set_permeability)
##       RMSE   Rsquared        MAE 
## 22.0446443  0.1290779 13.5856572
postResample(enetTunePred,test_set_permeability)
##       RMSE   Rsquared        MAE 
## 11.9205610  0.4323006  8.1705706

The enet based ridge model with a lambda of 0.1 provided the best fit with an R-squared of 0.440. The PLS model’s actual output yielded an R-squared of 0.353. The basic ridge model that used train( ) produced an R-squared of 0.129.

The enetTune model, with a lambda of 0.1, and fraction of 0.1, yielded an R-squared of 0.432. This falls in the middle, where it is slightly better than the PLS model, but not as strong as the enet based ridge model.

r2_eval <- data.frame(0.440,0.353,0.129,0.432)

colnames(r2_eval) <- c("Enet Based Ridge Model","PLS","Train Based Ridge Model","enetTune Model")
rownames(r2_eval) <- c("R-Squared")

kbl(r2_eval, longtable = T, booktabs = T, caption = "Model Performance Summary") %>%
  kable_styling(latex_options = c("repeat_header"))
Model Performance Summary
Enet Based Ridge Model PLS Train Based Ridge Model enetTune Model
R-Squared 0.44 0.353 0.129 0.432

F

From the performances of the models, I would consider replacing the PLS model with the Enet based Ridge model or the enetTune model, due to their superior performance on the test set.

6.3

A

data(ChemicalManufacturingProcess)

B

Very few NA values are present in the dataset. Using preProcess, we can impute the missing values with a more systematic approach than simply imputing median values when dealing with highly complex sets.

#summary(ChemicalManufacturingProcess)

gaps <- preProcess(ChemicalManufacturingProcess,method = "bagImpute")

ChemicalManufacturingProcess_cleaned <- predict(gaps,ChemicalManufacturingProcess)

#w3summary(ChemicalManufacturingProcess_cleaned)

cmp_c <- ChemicalManufacturingProcess_cleaned

C

set.seed(34)

train_idx <- sample(c(TRUE,FALSE), nrow(cmp_c), 
                 replace=TRUE, prob=c(0.7,0.3))

train_set <- cmp_c[train_idx,]
test_set <- cmp_c[!train_idx,]

train_yield <- data.frame(cmp_c[train_idx,1])
test_yield <- data.frame(cmp_c[!train_idx,1])

colnames(train_yield) <- c('Yield')
colnames(test_yield) <- c('Yield')


fit <- train(Yield ~.,train_set, method = "pls",tunelength = 25, trControl = trainControl(method = "cv"))

plot(fit)

fit
## Partial Least Squares 
## 
## 115 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 104, 103, 103, 103, 104, 104, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##   1      1.650668  0.2067468  1.351937
##   2      1.622864  0.2547191  1.315346
##   3      1.648419  0.2285300  1.338214
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.

Without pre-processing, a PLS model with 3 components yields an RMSE of 1.53 with an R-squared of 0.292. This leaves much to be desired.

fit2 <- train(Yield ~.,train_set, method = "pls",metric = "Rsquared",tunelength = 25, trControl = trainControl(method = "cv"),preProcess = c("center","scale"))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
plot(fit2)

fit2
## Partial Least Squares 
## 
## 115 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 103, 105, 104, 103, 104, 104, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##   1      1.364407  0.4294793  1.1219643
##   2      1.270096  0.5274166  1.0249677
##   3      1.210129  0.5646608  0.9930523
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 3.

With basic Pre-Processing, a much better result is obtained. With 3 components, the RMSE is 1.186 and the R-squared is 0.573.

The PLS model may be a suitable model to proceed with.

D

fit2_predict <- predict(fit2,test_set[,-1])

postResample(fit2_predict,test_set[,1])
##      RMSE  Rsquared       MAE 
## 2.0710733 0.3053201 1.2606323

The actual performance of the PLS model was not as accurate as the training suggested.

Next, Enet and LARS models will be tested.

LARS:

set.seed(34)

fit3 <- train(Yield ~.,train_set, method = "lars",metric = "Rsquared",tuneLength = 20, trControl = trainControl(method = "cv"),preProc = c("center","scale"))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
#fit3

plot(fit3)

fit3_predict <- predict(fit3,test_set[,-1])

postResample(fit3_predict,test_set[,1])
##      RMSE  Rsquared       MAE 
## 1.2004979 0.6312233 0.9101644

Training performance:

0.10 1.206940 0.5617840 0.9657577

Enet:

set.seed(34)

fit4 <- train(Yield ~.,train_set,method="enet",tuneGrid = enetGrid,trControl = trainControl(method = "cv"),preProc = c("center","scale"))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning: model fit failed for Fold03: lambda=0.10, fraction=1 Error in elasticnet::enet(as.matrix(x), y, lambda = param$lambda) : 
##   Some of the columns of x have zero variance
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning: model fit failed for Fold03: lambda=0.01, fraction=1 Error in elasticnet::enet(as.matrix(x), y, lambda = param$lambda) : 
##   Some of the columns of x have zero variance
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning: model fit failed for Fold03: lambda=0.00, fraction=1 Error in elasticnet::enet(as.matrix(x), y, lambda = param$lambda) : 
##   Some of the columns of x have zero variance
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
#fit4
plot(fit4)

fit4_predict <- predict(fit4,test_set[,-1])

postResample(fit4_predict,test_set[,1])
##      RMSE  Rsquared       MAE 
## 1.2558419 0.6031449 0.9381976
#fit4

Both the LARS and Enet models were very successful, in fact, their results were identical. From the elasticnet documentation, it seems like LARS and Enet are from the same algorithm.

r2_eval <- data.frame(0.305,0.6699,0.6699)
colnames(r2_eval) <- c("PLS","LARS Model","enetTune Model")
r2_train <- data.frame(0.573,0.561,0.578)
colnames(r2_train) <- c("PLS","LARS Model","enetTune Model")
r2_eval <- rbind(r2_eval,r2_train)

colnames(r2_eval) <- c("PLS","LARS Model","enetTune Model")
rownames(r2_eval) <- c("R-Squared: Actual","R-Squared: Trained")

kbl(r2_eval, longtable = T, booktabs = T, caption = "Model Performance Summary") %>%
  kable_styling(latex_options = c("repeat_header"))
Model Performance Summary
PLS LARS Model enetTune Model
R-Squared: Actual 0.305 0.6699 0.6699
R-Squared: Trained 0.573 0.5610 0.5780

E

In each model ManufacturingProcess 32 was the most important variable. Only the LARS model sees Biological variables crack the top 5 in importance. Otherwise, the importance lists are dominated by manufacturing process variables.

varImp(fit2)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
## pls variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess36   81.48
## ManufacturingProcess13   76.11
## ManufacturingProcess09   74.98
## ManufacturingProcess17   71.88
## ManufacturingProcess06   57.58
## ManufacturingProcess33   53.79
## ManufacturingProcess12   53.42
## BiologicalMaterial02     51.26
## BiologicalMaterial06     51.00
## BiologicalMaterial03     47.59
## ManufacturingProcess28   44.21
## ManufacturingProcess29   41.29
## ManufacturingProcess37   40.48
## BiologicalMaterial01     40.47
## BiologicalMaterial04     40.37
## ManufacturingProcess04   39.33
## BiologicalMaterial08     38.87
## ManufacturingProcess31   34.70
## BiologicalMaterial12     34.48
varImp(fit3)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   69.87
## BiologicalMaterial06     58.29
## ManufacturingProcess17   53.74
## ManufacturingProcess36   53.51
## ManufacturingProcess09   51.94
## BiologicalMaterial03     50.78
## BiologicalMaterial02     43.62
## ManufacturingProcess31   42.19
## ManufacturingProcess27   40.52
## ManufacturingProcess20   39.46
## BiologicalMaterial01     36.25
## ManufacturingProcess02   34.95
## BiologicalMaterial12     34.52
## ManufacturingProcess29   33.73
## ManufacturingProcess06   32.30
## ManufacturingProcess33   31.55
## ManufacturingProcess12   28.63
## BiologicalMaterial04     26.79
## BiologicalMaterial09     22.41
varImp(fit4)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   69.87
## BiologicalMaterial06     58.29
## ManufacturingProcess17   53.74
## ManufacturingProcess36   53.51
## ManufacturingProcess09   51.94
## BiologicalMaterial03     50.78
## BiologicalMaterial02     43.62
## ManufacturingProcess31   42.19
## ManufacturingProcess27   40.52
## ManufacturingProcess20   39.46
## BiologicalMaterial01     36.25
## ManufacturingProcess02   34.95
## BiologicalMaterial12     34.52
## ManufacturingProcess29   33.73
## ManufacturingProcess06   32.30
## ManufacturingProcess33   31.55
## ManufacturingProcess12   28.63
## BiologicalMaterial04     26.79
## BiologicalMaterial09     22.41

F

The correlation plot cannot fit all of the variables, so it will be filtered to show the top 15 variables only.

The biological material variables are heavily correlated with one another.

corr <- round(cor(cmp_c), 1)

corr15 <- corr[1:15,1:15]

ggcorrplot(corr15,method="circle")

The top predictors’ correlation plot follows. The Enet model will be used to display the predictors’ correlation.

fit4_imp <- varImp(fit4)$importance %>%
  arrange(-Overall) %>%
  head(15)

corr <- round(cor(cmp_c), 1)

corr15 <- cmp_c %>%
  select(c("Yield",row.names(fit4_imp)))

corr15 <- cor(corr15)

ggcorrplot(corr15,method="circle")

Of the variables with the highest importance, ManufacturingProcess13 and ManufacturingProcess36 are negatively correlated with Yield. To improve future yield, those two processes should be refined, as they have a poor impact on yield.