library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(AppliedPredictiveModeling)
library(mice)
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
data("permeability")
dat <- permeability
dat2 <- fingerprints
df <- dim(fingerprints[, -nearZeroVar(fingerprints)])[2]
df
## [1] 388
After removing the near-zero-variance columns, we have 388 predictors for our modeling.
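The models below are fit on the full fingerprint matrix; a minimal sketch of how the near-zero-variance columns could be dropped first (the filtered object, fingerprints_filtered, is hypothetical and not used later) would be:
# Sketch: remove the near-zero-variance fingerprints before modeling (not used below)
nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]
dim(fingerprints_filtered)   # should report 388 columns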
set.seed(1400)
new.data <- as.data.frame(cbind(fingerprints,permeability))
permeability_data <- createDataPartition(new.data$permeability, p=0.8, list=F)
train_data <- new.data[permeability_data, ]
test_data <- new.data[-permeability_data, ]
mod <- train(
permeability~., data = train_data, method = "pls",
trControl = trainControl("cv", number = 10),
center = T,
tuneLength = 20,
metrics = "Rsquared"
)
plot(mod)
The minimum cross-validation error is achieved at:
mod$bestTune
## ncomp
## 3 3
summary(mod$finalModel)
## Data: X dimension: 133 1107
## Y dimension: 133 1
## Fit method: oscorespls
## Number of components considered: 3
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps
## X 23.19 36.66 42.88
## .outcome 31.75 55.39 63.84
mod$results
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 12.93172 0.3351261 9.841722 3.039620 0.2447308 2.053158
## 2 2 11.15716 0.4831454 7.896399 3.353268 0.3150186 2.001888
## 3 3 11.07482 0.5066626 8.122682 2.477757 0.2509652 1.485253
## 4 4 11.42386 0.4935893 8.396569 2.213181 0.2360651 1.342995
## 5 5 11.67331 0.4945688 8.239465 2.795997 0.2499660 1.682423
## 6 6 11.48762 0.5150028 8.006757 2.701481 0.2510276 1.478286
## 7 7 11.77365 0.5037757 8.277425 2.881039 0.2548350 1.539398
## 8 8 12.00479 0.5092530 8.438426 2.924365 0.2554937 1.827299
## 9 9 12.40315 0.5025325 8.663590 3.541779 0.2711937 2.146337
## 10 10 12.80205 0.4795636 8.950261 3.828658 0.2788369 2.287315
## 11 11 13.02164 0.4616623 9.161543 3.576577 0.2600280 2.263643
## 12 12 12.98284 0.4582188 9.109361 3.344780 0.2650879 2.047402
## 13 13 13.03531 0.4619478 9.175161 3.346709 0.2585533 1.914774
## 14 14 13.70301 0.4258028 9.646819 3.593048 0.2770046 1.923263
## 15 15 14.02607 0.4147695 9.943917 3.701597 0.2761658 2.123089
## 16 16 14.39303 0.4011090 10.224226 3.770252 0.2780173 2.063147
## 17 17 14.93109 0.3803281 10.604764 3.889356 0.2875709 1.974486
## 18 18 15.21721 0.3724576 10.758723 4.020845 0.2856575 2.016884
## 19 19 15.64776 0.3630096 11.024948 4.284849 0.2801553 2.186162
## 20 20 15.88980 0.3577192 11.183060 4.462945 0.2749085 2.361377
With 3 components, the model explains 42.88% of the variance in the predictors and 63.84% of the variance in the outcome on the training data, with a cross-validated R2 of about 0.51.
paste("RMSE",RMSE(predict(mod,test_data), test_data$permeability))## [1] "RMSE 13.6316247965792"
paste("R2",caret::R2(predict(mod,test_data), test_data$permeability))## [1] "R2 0.347871675030573"
R2 is 0.3478
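As a quick visual check of the PLS model (a sketch, assuming mod and test_data from above), we can plot observed against predicted permeability on the test set:
# Observed vs. predicted permeability on the held-out test set
pls_pred <- predict(mod, test_data)
plot(test_data$permeability, pls_pred,
     xlab = "Observed permeability", ylab = "Predicted permeability",
     main = "PLS test-set predictions")
abline(0, 1, lty = 2)   # reference line for perfect predictions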
mod1 <- train(
permeability~., data = train_data, method = "glmnet",
trControl = trainControl("cv", number = 10),
center = T,
tuneLength = 20,
metrics = "Rsquared"
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
plot(mod1)
mod1$bestTune
## alpha lambda
## 390 1 1.799462
summary(mod1$finalModel)
## Length Class Mode
## a0 100 -none- numeric
## beta 110700 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 7 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 1107 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 1 -none- logical
## param 2 -none- list
paste("RMSE",RMSE(predict(mod1,test_data), test_data$permeability))## [1] "RMSE 12.3050874155395"
paste("R2",caret::R2(predict(mod1,test_data), test_data$permeability))## [1] "R2 0.46291299709214"
mod2 <- train(
permeability~., data = train_data, method = "pcr",
trControl = trainControl("cv", number = 10),
center = T,
tuneLength = 20,
metrics = "Rsquared"
)
plot(mod2)
mod2$bestTune
## ncomp
## 17 17
summary(mod2$finalModel)
## Data: X dimension: 133 1107
## Y dimension: 133 1
## Fit method: svdpc
## Number of components considered: 17
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 29.800 40.189 49.05 54.28 58.82 62.91 66.35
## .outcome 5.167 5.792 19.50 42.87 45.39 46.01 46.67
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 69.20 71.70 73.88 75.88 77.64 79.18 80.49
## .outcome 46.69 52.72 53.57 55.58 55.61 55.66 55.97
## 15 comps 16 comps 17 comps
## X 81.73 82.73 83.65
## .outcome 56.39 56.40 57.36
mod2$results
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 14.79009 0.1519534 11.480522 2.490965 0.10361619 2.205150
## 2 2 14.77858 0.1539522 11.477633 2.487834 0.09716527 2.239800
## 3 3 13.66013 0.2526413 10.258908 2.655047 0.18563342 1.724640
## 4 4 11.77870 0.4306635 8.552210 2.550695 0.22533899 1.223889
## 5 5 11.64759 0.4511448 8.413204 2.299047 0.22467373 1.051394
## 6 6 11.48353 0.4594357 8.135658 2.429382 0.23880718 1.293749
## 7 7 11.53880 0.4557292 8.184401 2.256420 0.21854654 1.140828
## 8 8 11.67572 0.4518755 8.301738 2.270811 0.21981810 1.122076
## 9 9 11.36532 0.4616893 8.054911 2.182108 0.21275487 1.124947
## 10 10 11.15384 0.4853072 8.001134 1.904545 0.18692994 1.060373
## 11 11 10.89251 0.5057594 8.012929 1.930829 0.19563908 1.230808
## 12 12 10.84685 0.5152638 7.918239 1.989881 0.17516668 1.163658
## 13 13 10.89556 0.5136264 7.992289 1.868681 0.17353345 1.152282
## 14 14 10.96545 0.5031068 8.043833 1.898426 0.19111961 1.207874
## 15 15 10.89504 0.5120011 7.975537 1.922607 0.19352537 1.270080
## 16 16 10.90560 0.5115201 7.975967 1.928843 0.19285497 1.265844
## 17 17 10.79799 0.5212194 7.870075 1.844177 0.18671358 1.132338
## 18 18 10.80584 0.5137015 7.992305 1.954020 0.18838451 1.310510
## 19 19 10.88644 0.5041611 8.068908 2.067165 0.18705354 1.339628
## 20 20 10.99311 0.4957307 8.061443 2.020147 0.18803072 1.303968
paste("RMSE",RMSE(predict(mod2,test_data), test_data$permeability))## [1] "RMSE 14.7707835875675"
paste("R2",caret::R2(predict(mod2,test_data), test_data$permeability))## [1] "R2 0.252549830854557"
Based on these results, I would recommend the lasso / elastic-net linear model: glmnet obtained the best test-set R2 (0.46) and the lowest test-set RMSE (12.31).
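For a side-by-side view, the test-set metrics of the three models can be collected in one small table (a sketch, assuming mod, mod1, mod2 and test_data are still in the workspace):
# Test-set performance of PLS, glmnet and PCR in one data frame
test_perf <- data.frame(
  model = c("PLS", "glmnet", "PCR"),
  RMSE  = c(RMSE(predict(mod,  test_data), test_data$permeability),
            RMSE(predict(mod1, test_data), test_data$permeability),
            RMSE(predict(mod2, test_data), test_data$permeability)),
  R2    = c(caret::R2(predict(mod,  test_data), test_data$permeability),
            caret::R2(predict(mod1, test_data), test_data$permeability),
            caret::R2(predict(mod2, test_data), test_data$permeability))
)
test_perf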
data("ChemicalManufacturingProcess")data <- ChemicalManufacturingProcess
Amelia::missmap(data)
temp <- mice(data, m = 5, maxit = 50, method = 'pmm', seed = 500, printFlag = F)
## Warning: Number of logged events: 6750
imputed.data <- complete(temp)
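A quick numeric check (a sketch) that the imputation left no missing values, before repeating the visual check with missmap:
# Count remaining NAs after imputation; should be 0
sum(is.na(imputed.data))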
Amelia::missmap(imputed.data)
set.seed(1400)
yield_data <- createDataPartition(imputed.data$Yield, p=0.8, list=F)
train_data <- imputed.data[yield_data, ]
test_data <- imputed.data[-yield_data, ]
model <- train(
Yield~., data = train_data, method = "pls",
trControl = trainControl("cv", number = 10),
scale = T,
tuneLength = 20
)
plot(model)
The minimum cross-validation error is achieved at:
model$bestTune
## ncomp
## 3 3
summary(model$finalModel)
## Data: X dimension: 144 57
## Y dimension: 144 1
## Fit method: oscorespls
## Number of components considered: 3
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps
## X 16.29 23.97 29.59
## .outcome 47.28 63.43 69.47
paste("RMSE",RMSE(predict(model,test_data), test_data$Yield))## [1] "RMSE 1.02010343947618"
paste("R2",caret::R2(predict(model,test_data), test_data$Yield))## [1] "R2 0.618384551291956"
varImp(model)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 84.74
## ManufacturingProcess17 80.36
## ManufacturingProcess09 78.69
## ManufacturingProcess36 71.79
## BiologicalMaterial02 59.60
## BiologicalMaterial06 57.33
## ManufacturingProcess33 55.76
## BiologicalMaterial03 55.42
## ManufacturingProcess06 53.15
## ManufacturingProcess12 52.62
## BiologicalMaterial08 51.95
## BiologicalMaterial12 51.28
## BiologicalMaterial11 51.01
## BiologicalMaterial04 46.90
## BiologicalMaterial01 45.88
## ManufacturingProcess11 45.74
## ManufacturingProcess04 37.63
## ManufacturingProcess28 37.23
## BiologicalMaterial10 33.83
We can see that the manufacturing process variables dominate the importance ranking: they occupy most of the top positions, led by ManufacturingProcess32.
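To make that claim concrete, a short sketch counting how many of the 20 top-ranked predictors are process versus biological variables:
# Split the 20 most important predictors into process vs. biological variables
imp <- varImp(model)$importance
top20 <- rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:20]
table(ifelse(grepl("^Manufacturing", top20), "Process", "Biological"))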
cor(imputed.data$Yield, imputed.data$ManufacturingProcess32)
## [1] 0.6083321
cor(imputed.data$Yield, imputed.data$BiologicalMaterial02)
## [1] 0.4815158
This information is useful because it shows that ManufacturingProcess32 has a fairly strong positive correlation with the response (about 0.61), so focusing on that process step is a promising way to improve yield. BiologicalMaterial02 also shows a moderate correlation with yield (about 0.48), suggesting that the quality of this biological material, used together with the key process variables, can have a meaningful impact on the final yield.
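The two relationships can also be checked visually (a sketch using base graphics):
# Scatter plots of Yield against the two predictors discussed above
par(mfrow = c(1, 2))
plot(imputed.data$ManufacturingProcess32, imputed.data$Yield,
     xlab = "ManufacturingProcess32", ylab = "Yield")
plot(imputed.data$BiologicalMaterial02, imputed.data$Yield,
     xlab = "BiologicalMaterial02", ylab = "Yield")
par(mfrow = c(1, 1))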