library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(AppliedPredictiveModeling)
library(mice)
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2

6.2

A

data("permeability")

dat <- permeability
dat2 <- fingerprints

B

df <- dim(fingerprints[,-nearZeroVar(fingerprints)])[2]
df
## [1] 388

After removing the near-zero-variance fingerprints, 388 predictors remain for modeling.
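
If we wanted to fit on the reduced set, the filter could be applied directly; a minimal sketch (the models below still use the full fingerprint matrix):

nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]   # 388 predictor columns remain
dim(fingerprints_filtered)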

C

set.seed(1400)

new.data <- as.data.frame(cbind(fingerprints,permeability))
permeability_data <- createDataPartition(new.data$permeability, p=0.8, list=F)


train_data <- new.data[permeability_data, ]
test_data <- new.data[-permeability_data, ]



mod <- train(
  permeability~., data = train_data, method = "pls",
  trControl = trainControl("cv", number = 10),
  center = T,
  tuneLength = 20,
  metrics = "Rsquared"  # note: caret's argument is 'metric'; this one is ignored, so tuning selects by the default RMSE
  )

plot(mod)

The number of components with the lowest cross-validation error is:

mod$bestTune
##   ncomp
## 3     3
summary(mod$finalModel)
## Data:    X dimension: 133 1107 
##  Y dimension: 133 1
## Fit method: oscorespls
## Number of components considered: 3
## TRAINING: % variance explained
##           1 comps  2 comps  3 comps
## X           23.19    36.66    42.88
## .outcome    31.75    55.39    63.84
mod$results
##    ncomp     RMSE  Rsquared       MAE   RMSESD RsquaredSD    MAESD
## 1      1 12.93172 0.3351261  9.841722 3.039620  0.2447308 2.053158
## 2      2 11.15716 0.4831454  7.896399 3.353268  0.3150186 2.001888
## 3      3 11.07482 0.5066626  8.122682 2.477757  0.2509652 1.485253
## 4      4 11.42386 0.4935893  8.396569 2.213181  0.2360651 1.342995
## 5      5 11.67331 0.4945688  8.239465 2.795997  0.2499660 1.682423
## 6      6 11.48762 0.5150028  8.006757 2.701481  0.2510276 1.478286
## 7      7 11.77365 0.5037757  8.277425 2.881039  0.2548350 1.539398
## 8      8 12.00479 0.5092530  8.438426 2.924365  0.2554937 1.827299
## 9      9 12.40315 0.5025325  8.663590 3.541779  0.2711937 2.146337
## 10    10 12.80205 0.4795636  8.950261 3.828658  0.2788369 2.287315
## 11    11 13.02164 0.4616623  9.161543 3.576577  0.2600280 2.263643
## 12    12 12.98284 0.4582188  9.109361 3.344780  0.2650879 2.047402
## 13    13 13.03531 0.4619478  9.175161 3.346709  0.2585533 1.914774
## 14    14 13.70301 0.4258028  9.646819 3.593048  0.2770046 1.923263
## 15    15 14.02607 0.4147695  9.943917 3.701597  0.2761658 2.123089
## 16    16 14.39303 0.4011090 10.224226 3.770252  0.2780173 2.063147
## 17    17 14.93109 0.3803281 10.604764 3.889356  0.2875709 1.974486
## 18    18 15.21721 0.3724576 10.758723 4.020845  0.2856575 2.016884
## 19    19 15.64776 0.3630096 11.024948 4.284849  0.2801553 2.186162
## 20    20 15.88980 0.3577192 11.183060 4.462945  0.2749085 2.361377

With three components, the model explains 42.88% of the variance in the predictors and 63.84% of the variance in the outcome, with a cross-validated R2 of roughly 0.51.
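
These figures can also be pulled straight from the tuning objects; a small sketch using the fitted mod from above:

subset(mod$results, ncomp == mod$bestTune$ncomp)  # CV RMSE, Rsquared, and MAE at the chosen ncomp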

D

paste("RMSE",RMSE(predict(mod,test_data), test_data$permeability))
## [1] "RMSE 13.6316247965792"
paste("R2",caret::R2(predict(mod,test_data), test_data$permeability))
## [1] "R2 0.347871675030573"

On the held-out test set the PLS model reaches an RMSE of about 13.63 and an R2 of about 0.348, noticeably lower than the cross-validated R2.
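
Equivalently, caret's postResample() reports RMSE, R2, and MAE for the test set in one call; a small sketch:

postResample(pred = predict(mod, test_data), obs = test_data$permeability)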

E

Model 1
mod1 <- train(
  permeability~., data = train_data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  center = T,
  tuneLength = 20,
  metrics = "Rsquared"
  )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
plot(mod1)

mod1$bestTune
##     alpha   lambda
## 390     1 1.799462
summary(mod1$finalModel)
##             Length Class      Mode     
## a0             100 -none-     numeric  
## beta        110700 dgCMatrix  S4       
## df             100 -none-     numeric  
## dim              2 -none-     numeric  
## lambda         100 -none-     numeric  
## dev.ratio      100 -none-     numeric  
## nulldev          1 -none-     numeric  
## npasses          1 -none-     numeric  
## jerr             1 -none-     numeric  
## offset           1 -none-     logical  
## call             7 -none-     call     
## nobs             1 -none-     numeric  
## lambdaOpt        1 -none-     numeric  
## xNames        1107 -none-     character
## problemType      1 -none-     character
## tuneValue        2 data.frame list     
## obsLevels        1 -none-     logical  
## param            2 -none-     list
paste("RMSE",RMSE(predict(mod1,test_data), test_data$permeability))
## [1] "RMSE 12.3050874155395"
paste("R2",caret::R2(predict(mod1,test_data), test_data$permeability))
## [1] "R2 0.46291299709214"
Model 2
mod2 <- train(
  permeability~., data = train_data, method = "pcr",
  trControl = trainControl("cv", number = 10),
  center = T,
  tuneLength = 20,
  metrics = "Rsquared"
  )

plot(mod2)

mod2$bestTune
##    ncomp
## 17    17
summary(mod2$finalModel)
## Data:    X dimension: 133 1107 
##  Y dimension: 133 1
## Fit method: svdpc
## Number of components considered: 17
## TRAINING: % variance explained
##           1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X          29.800   40.189    49.05    54.28    58.82    62.91    66.35
## .outcome    5.167    5.792    19.50    42.87    45.39    46.01    46.67
##           8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X           69.20    71.70     73.88     75.88     77.64     79.18     80.49
## .outcome    46.69    52.72     53.57     55.58     55.61     55.66     55.97
##           15 comps  16 comps  17 comps
## X            81.73     82.73     83.65
## .outcome     56.39     56.40     57.36
mod2$results
##    ncomp     RMSE  Rsquared       MAE   RMSESD RsquaredSD    MAESD
## 1      1 14.79009 0.1519534 11.480522 2.490965 0.10361619 2.205150
## 2      2 14.77858 0.1539522 11.477633 2.487834 0.09716527 2.239800
## 3      3 13.66013 0.2526413 10.258908 2.655047 0.18563342 1.724640
## 4      4 11.77870 0.4306635  8.552210 2.550695 0.22533899 1.223889
## 5      5 11.64759 0.4511448  8.413204 2.299047 0.22467373 1.051394
## 6      6 11.48353 0.4594357  8.135658 2.429382 0.23880718 1.293749
## 7      7 11.53880 0.4557292  8.184401 2.256420 0.21854654 1.140828
## 8      8 11.67572 0.4518755  8.301738 2.270811 0.21981810 1.122076
## 9      9 11.36532 0.4616893  8.054911 2.182108 0.21275487 1.124947
## 10    10 11.15384 0.4853072  8.001134 1.904545 0.18692994 1.060373
## 11    11 10.89251 0.5057594  8.012929 1.930829 0.19563908 1.230808
## 12    12 10.84685 0.5152638  7.918239 1.989881 0.17516668 1.163658
## 13    13 10.89556 0.5136264  7.992289 1.868681 0.17353345 1.152282
## 14    14 10.96545 0.5031068  8.043833 1.898426 0.19111961 1.207874
## 15    15 10.89504 0.5120011  7.975537 1.922607 0.19352537 1.270080
## 16    16 10.90560 0.5115201  7.975967 1.928843 0.19285497 1.265844
## 17    17 10.79799 0.5212194  7.870075 1.844177 0.18671358 1.132338
## 18    18 10.80584 0.5137015  7.992305 1.954020 0.18838451 1.310510
## 19    19 10.88644 0.5041611  8.068908 2.067165 0.18705354 1.339628
## 20    20 10.99311 0.4957307  8.061443 2.020147 0.18803072 1.303968
paste("RMSE",RMSE(predict(mod2,test_data), test_data$permeability))
## [1] "RMSE 14.7707835875675"
paste("R2",caret::R2(predict(mod2,test_data), test_data$permeability))
## [1] "R2 0.252549830854557"

F

After observing these results, I would recommend the lasso / elastic-net linear model: the glmnet fit obtained the best test-set R2 (about 0.46) and the lowest test RMSE (about 12.3).
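
A compact way to back up this recommendation is to tabulate the test-set metrics for the three models side by side; a sketch using the fitted objects above:

rbind(
  pls    = postResample(predict(mod,  test_data), test_data$permeability),
  glmnet = postResample(predict(mod1, test_data), test_data$permeability),
  pcr    = postResample(predict(mod2, test_data), test_data$permeability)
)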

6.3

A

data("ChemicalManufacturingProcess")

B

data <- ChemicalManufacturingProcess

Amelia::missmap(data)

temp <- mice(data,m=5, maxit = 50, method = 'pmm', seed = 500,printFlag = F)
## Warning: Number of logged events: 6750
imputed.data <- complete(temp)

Amelia::missmap(imputed.data)
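
Besides the missingness maps, a quick numeric check can confirm that the pmm imputation filled in every cell; a small sketch:

sum(is.na(data))          # missing cells before imputation
sum(is.na(imputed.data))  # should be 0 after imputation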

C

set.seed(1400)
yield_data <- createDataPartition(imputed.data$Yield, p=0.8, list=F)

train_data <- imputed.data[yield_data, ]
test_data <- imputed.data[-yield_data, ]




model <- train(
  Yield~., data = train_data, method = "pls",
  trControl = trainControl("cv", number = 10),
  scale = T,
  tuneLength = 20
  )

plot(model)

The number of components with the lowest cross-validation error is:

model$bestTune
##   ncomp
## 3     3
summary(model$finalModel)
## Data:    X dimension: 144 57 
##  Y dimension: 144 1
## Fit method: oscorespls
## Number of components considered: 3
## TRAINING: % variance explained
##           1 comps  2 comps  3 comps
## X           16.29    23.97    29.59
## .outcome    47.28    63.43    69.47

D

paste("RMSE",RMSE(predict(model,test_data), test_data$Yield))
## [1] "RMSE 1.02010343947618"
paste("R2",caret::R2(predict(model,test_data), test_data$Yield))
## [1] "R2 0.618384551291956"

E

varImp(model)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
## pls variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   84.74
## ManufacturingProcess17   80.36
## ManufacturingProcess09   78.69
## ManufacturingProcess36   71.79
## BiologicalMaterial02     59.60
## BiologicalMaterial06     57.33
## ManufacturingProcess33   55.76
## BiologicalMaterial03     55.42
## ManufacturingProcess06   53.15
## ManufacturingProcess12   52.62
## BiologicalMaterial08     51.95
## BiologicalMaterial12     51.28
## BiologicalMaterial11     51.01
## BiologicalMaterial04     46.90
## BiologicalMaterial01     45.88
## ManufacturingProcess11   45.74
## ManufacturingProcess04   37.63
## ManufacturingProcess28   37.23
## BiologicalMaterial10     33.83

We can see that the manufacturing process variables dominate the importance ranking: they hold the top positions, so the process predictors appear to be more influential than the biological materials.
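
One way to make this concrete is to count the top-20 variables by type; a small sketch working from the varImp object:

imp <- varImp(model)$importance
top20 <- rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:20]
table(ifelse(grepl("^ManufacturingProcess", top20), "Process", "Biological"))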

F

cor(imputed.data$Yield,imputed.data$ManufacturingProcess32)
## [1] 0.6083321
cor(imputed.data$Yield,imputed.data$BiologicalMaterial02)
## [1] 0.4815158

This information is useful because it tells us that ManufacturingProcess32 has a fairly strong positive correlation (about 0.61) with the response, so focusing on controlling that process step should help improve Yield. BiologicalMaterial02 also shows a moderate positive correlation (about 0.48), which suggests that the quality of this biological material, used together with the key manufacturing steps, also has a positive impact on the Yield result.
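
Extending the same idea to several of the top-ranked predictors from the importance table; a small sketch:

top_vars <- c("ManufacturingProcess32", "ManufacturingProcess13",
              "ManufacturingProcess09", "BiologicalMaterial02",
              "BiologicalMaterial06")
cor(imputed.data[, top_vars], imputed.data$Yield)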

Reference:

http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/152-principal-component-and-partial-least-squares-regression-essentials/