6.2

a.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(AppliedPredictiveModeling)
# Load the permeability example data set. The `fingerprints` and
# `permeability` objects used below are presumably both provided by this
# call (neither is created anywhere else in this script).
data("permeability")

b.

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Total number of fingerprint predictors before any filtering.
dim(fingerprints)[2]
## [1] 1107
# Number of predictors that caret flags as near-zero variance.
fingerprints %>%
  nearZeroVar() %>%
  length()
## [1] 719

The nearZeroVar function returned 719 columns, so of the 1107 columns, 388 are left for modeling.

c.

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
# Identify the near-zero-variance fingerprint columns and drop them.
zero <- nearZeroVar(fingerprints)
# Guard the negative index: if `zero` were empty, `fingerprints[, -zero]`
# would select *no* columns instead of keeping all of them.
finger_df <- if (length(zero) > 0) {
  fingerprints[, -zero, drop = FALSE]
} else {
  fingerprints
}

set.seed(329)

# Keep 80% of the samples for training; the remainder is the test set.
part <- permeability %>%
  createDataPartition(p = 0.8, list = FALSE, times = 1)

# NOTE(review): the naming below is inverted relative to convention --
# `dep_var_*` hold the predictors (filtered fingerprints) and `indep_var_*`
# hold the response (permeability). The names are kept because later chunks
# refer to them.
dep_var_train <- finger_df[part, ]
dep_var_test <- finger_df[-part, ]

indep_var_train <- permeability[part, ]
indep_var_test <- permeability[-part, ]

# Tune a PLS model over 1..20 components with 10-fold cross-validation,
# choosing the number of components by resampled R-squared.
pls_model <- train(
  x = dep_var_train,
  y = indep_var_train,
  method = "pls",
  metric = "Rsquared",
  tuneLength = 20,
  trControl = trainControl(method = "cv", number = 10),
  preProc = c("center", "scale")
)

# Print the resampling summary for every candidate number of components.
pls_model
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 121, 119, 120, 118, 120, 120, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.62598  0.3365374  10.309205
##    2     12.18732  0.4872366   8.817615
##    3     12.02501  0.4891116   9.163251
##    4     12.10455  0.4813254   9.343701
##    5     11.80763  0.5162117   9.044054
##    6     11.85446  0.5096861   8.937348
##    7     11.64945  0.5157421   8.896158
##    8     11.82122  0.5127728   9.136694
##    9     11.91974  0.5067269   9.047863
##   10     12.07800  0.5020342   9.074149
##   11     12.24071  0.4952065   9.336880
##   12     12.24351  0.4906759   9.340567
##   13     12.24273  0.4835190   9.331712
##   14     12.58929  0.4623902   9.461092
##   15     12.89396  0.4455262   9.655297
##   16     13.05410  0.4481954   9.905222
##   17     13.17905  0.4410183  10.041873
##   18     13.53412  0.4220510  10.391394
##   19     13.72958  0.4083498  10.594558
##   20     13.96633  0.4022988  10.780009
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 5.
# Plot the tuned PLS model (caret's plot method for the fitted `train`
# object printed above).
plot(pls_model)

Five latent variables are optimal, and the corresponding resampled estimate of R-squared is 0.5162117.

d.

# Score the held-out samples with the tuned PLS model and summarise
# RMSE / R-squared / MAE against the observed permeability values.
pred_pls <- pls_model %>%
  predict(newdata = dep_var_test)
postResample(
  pred = pred_pls,
  obs = indep_var_test
)
##     RMSE Rsquared      MAE 
## 9.886461 0.505552 7.380955

R-squared is 0.505552

e.

# Ridge regression: 15 evenly spaced penalties between 0 and 0.1,
# tuned by 10-fold cross-validation on resampled R-squared.
ridgeGrid <- data.frame(.lambda = seq(0, 0.1, length.out = 15))
ridge_model <- train(
  x = dep_var_train,
  y = indep_var_train,
  method = "ridge",
  metric = "Rsquared",
  tuneGrid = ridgeGrid,
  trControl = trainControl(method = "cv", number = 10),
  preProc = c("center", "scale")
)

# Elastic net: three ridge penalties crossed with 20 lasso fractions.
# No `metric` is given here, so caret falls back to its default
# selection criterion for this fit.
enetGrid <- expand.grid(
  .lambda = c(0, 0.01, 0.1),
  .fraction = seq(0.05, 1, length.out = 20)
)
enet_model <- train(
  x = dep_var_train,
  y = indep_var_train,
  method = "enet",
  tuneGrid = enetGrid,
  trControl = trainControl(method = "cv", number = 10),
  preProc = c("center", "scale")
)

# Print the cross-validation results for every lambda in `ridgeGrid`.
ridge_model
## Ridge Regression 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 120, 121, 119, 119, 118, ... 
## Resampling results across tuning parameters:
## 
##   lambda       RMSE          Rsquared   MAE         
##   0.000000000  6.041888e+15  0.1566410  2.263504e+15
##   0.007142857  2.154538e+03  0.2959339  1.590920e+03
##   0.014285714  1.396359e+01  0.3961027  1.040388e+01
##   0.021428571  1.353324e+01  0.4195692  1.008089e+01
##   0.028571429  1.327603e+01  0.4417345  9.812119e+00
##   0.035714286  1.301607e+01  0.4588161  9.635619e+00
##   0.042857143  1.291253e+01  0.4689554  9.535743e+00
##   0.050000000  1.271933e+01  0.4813903  9.401978e+00
##   0.057142857  1.264687e+01  0.4868521  9.362782e+00
##   0.064285714  1.254063e+01  0.4959862  9.299290e+00
##   0.071428571  1.247959e+01  0.5009572  9.265791e+00
##   0.078571429  1.241794e+01  0.5059755  9.233517e+00
##   0.085714286  1.236657e+01  0.5102200  9.207772e+00
##   0.092857143  1.232135e+01  0.5142373  9.181933e+00
##   0.100000000  1.229678e+01  0.5169221  9.172932e+00
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was lambda = 0.1.
# Plot the tuned ridge model (caret's plot method for the fitted `train`
# object printed above).
plot(ridge_model)

# Print the cross-validation results for every (lambda, fraction) pair in
# `enetGrid`.
enet_model
## Elasticnet 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 120, 121, 121, 117, 120, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE          Rsquared   MAE         
##   0.00    0.05      2.050362e+16  0.3953541  1.004741e+16
##   0.00    0.10      4.092599e+16  0.3583007  2.037379e+16
##   0.00    0.15      6.136141e+16  0.3534069  3.070017e+16
##   0.00    0.20      8.180011e+16  0.3462468  4.102655e+16
##   0.00    0.25      1.022401e+17  0.3434621  5.135293e+16
##   0.00    0.30      1.226808e+17  0.3435596  6.167931e+16
##   0.00    0.35      1.431218e+17  0.3394221  7.200570e+16
##   0.00    0.40      1.635631e+17  0.3341926  8.233208e+16
##   0.00    0.45      1.840045e+17  0.3362654  9.265846e+16
##   0.00    0.50      2.044461e+17  0.3324543  1.029848e+17
##   0.00    0.55      2.248877e+17  0.3238881  1.133112e+17
##   0.00    0.60      2.453294e+17  0.3230661  1.236376e+17
##   0.00    0.65      2.657711e+17  0.3184394  1.339640e+17
##   0.00    0.70      2.862128e+17  0.3169251  1.442904e+17
##   0.00    0.75      3.066546e+17  0.3117650  1.546167e+17
##   0.00    0.80      3.270964e+17  0.3088340  1.649431e+17
##   0.00    0.85      3.475383e+17  0.3089970  1.752695e+17
##   0.00    0.90      3.679801e+17  0.3089014  1.855959e+17
##   0.00    0.95      3.884220e+17  0.3073362  1.959223e+17
##   0.00    1.00      4.088638e+17  0.3149600  2.062487e+17
##   0.01    0.05      2.609184e+01  0.3880351  1.881405e+01
##   0.01    0.10      4.162396e+01  0.3610719  2.989143e+01
##   0.01    0.15      5.651777e+01  0.3693145  4.045312e+01
##   0.01    0.20      7.126880e+01  0.3874966  5.079519e+01
##   0.01    0.25      8.586673e+01  0.3970273  6.119685e+01
##   0.01    0.30      1.009275e+02  0.3917432  7.195044e+01
##   0.01    0.35      1.158119e+02  0.3863885  8.260432e+01
##   0.01    0.40      1.309276e+02  0.3784225  9.345762e+01
##   0.01    0.45      1.463839e+02  0.3700128  1.043391e+02
##   0.01    0.50      1.616195e+02  0.3646698  1.150244e+02
##   0.01    0.55      1.768281e+02  0.3567480  1.257176e+02
##   0.01    0.60      1.920786e+02  0.3454999  1.365168e+02
##   0.01    0.65      2.071960e+02  0.3325282  1.472707e+02
##   0.01    0.70      2.223368e+02  0.3194204  1.580493e+02
##   0.01    0.75      2.374404e+02  0.3094080  1.687871e+02
##   0.01    0.80      2.524171e+02  0.3027810  1.794435e+02
##   0.01    0.85      2.672948e+02  0.2965771  1.900494e+02
##   0.01    0.90      2.821750e+02  0.2922559  2.007147e+02
##   0.01    0.95      2.969906e+02  0.2897853  2.113596e+02
##   0.01    1.00      3.118617e+02  0.2874091  2.220673e+02
##   0.10    0.05      1.244936e+01  0.5316761  9.539829e+00
##   0.10    0.10      1.198920e+01  0.4884311  8.813599e+00
##   0.10    0.15      1.212936e+01  0.4725444  8.965440e+00
##   0.10    0.20      1.228602e+01  0.4664706  9.132690e+00
##   0.10    0.25      1.231336e+01  0.4735508  9.214734e+00
##   0.10    0.30      1.238419e+01  0.4766569  9.302568e+00
##   0.10    0.35      1.239108e+01  0.4840354  9.325327e+00
##   0.10    0.40      1.239906e+01  0.4888709  9.352925e+00
##   0.10    0.45      1.244732e+01  0.4888835  9.412340e+00
##   0.10    0.50      1.252430e+01  0.4877303  9.511774e+00
##   0.10    0.55      1.261288e+01  0.4849697  9.603833e+00
##   0.10    0.60      1.269909e+01  0.4814964  9.707767e+00
##   0.10    0.65      1.278202e+01  0.4768213  9.793030e+00
##   0.10    0.70      1.282367e+01  0.4745373  9.836045e+00
##   0.10    0.75      1.284464e+01  0.4739590  9.872424e+00
##   0.10    0.80      1.288013e+01  0.4726541  9.928584e+00
##   0.10    0.85      1.294160e+01  0.4707654  1.000283e+01
##   0.10    0.90      1.300912e+01  0.4686746  1.007368e+01
##   0.10    0.95      1.306908e+01  0.4667916  1.013625e+01
##   0.10    1.00      1.313379e+01  0.4647849  1.019146e+01
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.1 and lambda = 0.1.
# Plot the tuned elastic net model (caret's plot method for the fitted
# `train` object printed above).
plot(enet_model)

# Test-set predictions from the tuned ridge and elastic net models.
pred_ridge <- ridge_model %>%
  predict(newdata = dep_var_test)
pred_enet <- enet_model %>%
  predict(newdata = dep_var_test)

# Test-set RMSE / R-squared / MAE: ridge first, then elastic net.
postResample(pred = pred_ridge, obs = indep_var_test)
##       RMSE   Rsquared        MAE 
## 10.5695413  0.4963474  7.3257501
postResample(pred = pred_enet, obs = indep_var_test)
##     RMSE Rsquared      MAE 
## 8.628018 0.587423 6.392301

f.

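On cross-validated R-squared the ridge model (0.517) is very similar to the PLS model (0.516). On the test set, however, the elastic net model has the best R-squared (0.587, versus 0.506 for PLS and 0.496 for ridge) as well as the lowest RMSE. I will note that the processing speed of the PLS model is by far the fastest, while the ridge and elastic net models are similar.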

6.3

a.

# Load the ChemicalManufacturingProcess data frame used throughout this
# section (it holds the Yield response plus the BiologicalMaterial* and
# ManufacturingProcess* columns shown in the output below).
data("ChemicalManufacturingProcess")

b.

library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Tabulate the missing-data patterns. In the printed table a 1 marks an
# observed value and a 0 a missing one; the last row gives the number of
# missing values per column.
md.pattern(ChemicalManufacturingProcess)

##     Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 152     1                    1                    1                    1
## 6       1                    1                    1                    1
## 1       1                    1                    1                    1
## 7       1                    1                    1                    1
## 5       1                    1                    1                    1
## 2       1                    1                    1                    1
## 1       1                    1                    1                    1
## 1       1                    1                    1                    1
## 1       1                    1                    1                    1
##         0                    0                    0                    0
##     BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 152                    1                    1                    1
## 6                      1                    1                    1
## 1                      1                    1                    1
## 7                      1                    1                    1
## 5                      1                    1                    1
## 2                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
##                        0                    0                    0
##     BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 152                    1                    1                    1
## 6                      1                    1                    1
## 1                      1                    1                    1
## 7                      1                    1                    1
## 5                      1                    1                    1
## 2                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
##                        0                    0                    0
##     BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 152                    1                    1                    1
## 6                      1                    1                    1
## 1                      1                    1                    1
## 7                      1                    1                    1
## 5                      1                    1                    1
## 2                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
##                        0                    0                    0
##     ManufacturingProcess09 ManufacturingProcess13 ManufacturingProcess15
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess32 ManufacturingProcess37 ManufacturingProcess38
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess39 ManufacturingProcess42 ManufacturingProcess43
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess44 ManufacturingProcess45 ManufacturingProcess01
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      0
##                          0                      0                      1
##     ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess07
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        0                      0                      0
##                          1                      1                      1
##     ManufacturingProcess08 ManufacturingProcess12 ManufacturingProcess14
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      0
## 1                        0                      0                      1
##                          1                      1                      1
##     ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        0                      0                      0
##                          1                      1                      1
##     ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess06
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      0
## 1                        1                      1                      1
## 1                        0                      0                      0
##                          1                      1                      2
##     ManufacturingProcess02 ManufacturingProcess25 ManufacturingProcess26
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      0                      0
## 2                        0                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        0                      1                      1
##                          3                      5                      5
##     ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        0                      0                      0
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          5                      5                      5
##     ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess33
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        0                      0                      0
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          5                      5                      5
##     ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        0                      0                      0
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          5                      5                      5
##     ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess03    
## 152                      1                      1                      1   0
## 6                        1                      1                      0   1
## 1                        1                      0                      1   1
## 7                        0                      0                      0   3
## 5                        1                      1                      1  11
## 2                        1                      1                      1   1
## 1                        1                      1                      1   1
## 1                        0                      0                      0   4
## 1                        0                      0                      0  16
##                          9                     10                     15 106

Above is a tabular representation of the missing-value patterns. We will impute the missing values.

# Fill in the missing values by k-nearest-neighbour imputation;
# `imp_var = FALSE` suppresses the extra indicator columns.
cmp_df <- VIM::kNN(
  data = ChemicalManufacturingProcess,
  imp_var = FALSE
)

c.

# The response must not appear among the predictors. `cmp_df` still
# contains the Yield column, and leaving it in lets the model "predict"
# Yield from itself, which produces a near-perfect but meaningless fit.
cmp_predictors <- cmp_df[, setdiff(names(cmp_df), "Yield"), drop = FALSE]

# Remove near-zero-variance predictors. The negative index is guarded
# because `x[, -integer(0)]` would select no columns at all.
zeroVar <- nearZeroVar(cmp_predictors)
cmp_df_final <- if (length(zeroVar) > 0) {
  cmp_predictors[, -zeroVar, drop = FALSE]
} else {
  cmp_predictors
}

# Re-seed so that the split and the cross-validation folds of this section
# are reproducible independently of the code that ran before it.
set.seed(329)

# Keep 80% of the samples for training; the remainder is the test set.
part <- ChemicalManufacturingProcess$Yield %>%
  createDataPartition(p = 0.8, list = FALSE, times = 1)

x_train <- cmp_df_final[part, ]
x_test <- cmp_df_final[-part, ]

y_train <- ChemicalManufacturingProcess$Yield[part]
y_test <- ChemicalManufacturingProcess$Yield[-part]

# Tune a PLS model over 1..20 components with cross-validation; with no
# `metric` supplied, caret uses its default selection criterion.
cmp_pls_model <- train(
  x = x_train,
  y = y_train,
  method = "pls",
  tuneLength = 20,
  trControl = trainControl(method = "cv"),
  preProc = c("center", "scale")
)

# NOTE(review): the console output recorded below this chunk was produced
# while Yield was still among the predictors (57 predictors, "Yield" at the
# top of the variable importance). Re-knit the document to refresh it.
cmp_pls_model
## Partial Least Squares 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 129, 130, 130, 131, 129, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE        Rsquared   MAE       
##    1     1.22489130  0.5787539  0.97030866
##    2     1.39840738  0.6534780  0.94420992
##    3     0.80956405  0.8240471  0.65453786
##    4     0.81327223  0.8212696  0.60136421
##    5     0.69915117  0.8497912  0.46409424
##    6     0.64738161  0.8636482  0.40605417
##    7     0.42498123  0.9296910  0.26112045
##    8     0.35802498  0.9402109  0.19914394
##    9     0.30887961  0.9494743  0.15871036
##   10     0.25653498  0.9600011  0.13163416
##   11     0.21584654  0.9682484  0.11061272
##   12     0.17627678  0.9734948  0.08986207
##   13     0.16951010  0.9758586  0.08249007
##   14     0.15912031  0.9784818  0.07473519
##   15     0.13777973  0.9828630  0.06554454
##   16     0.11728843  0.9882352  0.05657115
##   17     0.11624088  0.9893568  0.05046004
##   18     0.10862454  0.9886350  0.04404457
##   19     0.09248278  0.9903899  0.03686739
##   20     0.07740403  0.9927286  0.03067158
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 20.

d.

# Predict the held-out samples and summarise RMSE / R-squared / MAE
# against the observed yield.
pred_cmp <- predict(cmp_pls_model, newdata = x_test)
postResample(pred = pred_cmp, obs = y_test)
##       RMSE   Rsquared        MAE 
## 0.01273013 0.99995231 0.01001609

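This model has a low test-set RMSE of 0.013 and an R-squared above 0.999. At first glance this seems like a good model, but the figures are too good to be true: the variable-importance output in part (e) lists Yield itself as the most important predictor, so the response has leaked into the predictors and these results should not be trusted as they stand.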

e.

# Rank the predictors of the tuned PLS model by variable importance
# (scaled so that the top variable scores 100 in the printed table).
caret::varImp(cmp_pls_model)
## pls variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## Yield                   100.00
## ManufacturingProcess32   41.94
## ManufacturingProcess36   38.61
## ManufacturingProcess13   35.39
## ManufacturingProcess09   34.59
## BiologicalMaterial02     29.24
## ManufacturingProcess17   28.82
## BiologicalMaterial08     28.62
## BiologicalMaterial06     28.34
## ManufacturingProcess33   26.13
## BiologicalMaterial12     25.95
## ManufacturingProcess06   25.55
## BiologicalMaterial03     25.36
## BiologicalMaterial11     24.79
## ManufacturingProcess12   24.73
## ManufacturingProcess11   24.60
## BiologicalMaterial04     24.35
## BiologicalMaterial01     23.88
## ManufacturingProcess28   22.48
## ManufacturingProcess04   21.55

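Setting aside Yield itself (the response, which should not appear among the predictors), ManufacturingProcess32, 36, 13, and 09 are the most important variables, so process predictors dominate the top of the list.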

f.

# Pairwise correlation between Yield and each of four highly ranked
# manufacturing-process predictors.
with(ChemicalManufacturingProcess, cor(Yield, ManufacturingProcess32))
## [1] 0.6083321
with(ChemicalManufacturingProcess, cor(Yield, ManufacturingProcess09))
## [1] 0.5034705
with(ChemicalManufacturingProcess, cor(Yield, ManufacturingProcess13))
## [1] -0.5036797
with(ChemicalManufacturingProcess, cor(Yield, ManufacturingProcess17))
## [1] -0.4258069

Those predictors that are negatively correlated with yield should likely be lowered, and those that are positively correlated should be increased.