#install.packages('elasticnet')
library(corrplot)
## corrplot 0.84 loaded
library(mlbench)
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(missMDA)
library(DMwR)
## Loading required package: grid
##
## Attaching package: 'DMwR'
## The following object is masked from 'package:plyr':
##
## join
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:stats':
##
## loadings
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.2
Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:
# Attach AppliedPredictiveModeling, load its permeability data set, and print
# summary statistics of the permeability response.
library(AppliedPredictiveModeling)
data(permeability)
summary(permeability)
## permeability
## Min. : 0.06
## 1st Qu.: 1.55
## Median : 4.91
## Mean :12.24
## 3rd Qu.:15.47
## Max. :55.60
# Show the first rows of the permeability response.
head(permeability)
## permeability
## 1 12.520
## 2 1.120
## 3 19.405
## 4 1.730
## 5 1.680
## 6 0.510
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
# Identify the fingerprint columns flagged as near-zero variance.
nz <- nearZeroVar(fingerprints)
length(nz)
## [1] 719
# Filter predictors.
# Select the columns to keep by position instead of using `-nz`: a negative
# index built from an empty `nz` would drop every column. `drop = FALSE`
# keeps the result a matrix even if only one column survives.
keep_cols <- setdiff(seq_len(ncol(fingerprints)), nz)
fp_df <- fingerprints[, keep_cols, drop = FALSE]
After removing the 719 near-zero-variance predictors, 388 predictors are left for modeling. (c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2? Split data into train and test set
# Use three quarters of the compounds for training; the rest are held out.
n_compounds <- nrow(fp_df)
smp_size <- floor(n_compounds * 0.75)
# Fix the RNG seed so the partition is reproducible.
set.seed(123)
train_ind <- sample(seq_len(n_compounds), size = smp_size)
# Predictors: sampled rows for training, the remaining rows for testing.
train <- fp_df[train_ind, ]
test <- fp_df[-train_ind, ]
test_fp <- test
# Response values split with the same row indices.
permeability_train <- permeability[train_ind, ]
permeability_test <- permeability[-train_ind, ]
# Tune the number of PLS components (tuneLength = 10) with cross-validation.
cv_control <- trainControl(method = "cv")
pls_model <- train(
  x = train,
  y = permeability_train,
  method = "pls",
  tuneLength = 10,
  trControl = cv_control
)
pls_model
## Partial Least Squares
##
## 123 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 110, 111, 111, 111, 111, 111, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.62475 0.2900612 9.443021
## 2 11.27070 0.4976929 7.873413
## 3 10.85735 0.5183636 8.077933
## 4 10.78911 0.5252378 8.375595
## 5 10.57609 0.5543138 8.029613
## 6 10.35604 0.5628768 7.731702
## 7 10.19970 0.5711496 7.485803
## 8 10.24760 0.5704241 7.586381
## 9 10.27141 0.5612497 7.611205
## 10 10.52644 0.5380197 7.665872
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 7.
We are picking the model with ncomp = 7, which has an RMSE of 10.19970, an Rsquared of 0.5711496, and an MAE of 7.485803.
# Score the held-out compounds with the tuned PLS model and summarise the
# test-set performance.
lmPred1_fp <- predict(pls_model, newdata = test)
lmValues1 <- data.frame(
  obs = permeability_test,
  pred = lmPred1_fp
)
defaultSummary(lmValues1)
## RMSE Rsquared MAE
## 12.878365 0.496656 9.308729
On the test set, the RMSE is 12.88, the Rsquared is 0.50, and the MAE is 9.31.
Let’s try to build the Ridge regression model
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# 10-fold cross-validation, used by the ridge tuning below.
ctrl <- trainControl(method = "cv", number = 10)
# Direct elastic-net fit with a fixed penalty (lambda = 0.001).
ridgeModel <- enet(x = as.matrix(train), y = permeability_train, lambda = 0.001)
# Candidate ridge penalties: 15 evenly spaced values from 0 to 0.1.
# `length.out` and `preProcess` are spelled out in full; the abbreviated
# `length =` / `preProc =` forms only worked through partial argument matching.
ridgeGrid <- data.frame(.lambda = seq(0, 0.1, length.out = 15))
# Tune lambda by cross-validation on centered and scaled predictors.
ridgeRegFit <- train(
  x = train,
  y = permeability_train,
  method = "ridge",
  tuneGrid = ridgeGrid,
  trControl = ctrl,
  preProcess = c("center", "scale")
)
ridgeRegFit
## Ridge Regression
##
## 123 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 111, 111, 111, 111, 110, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 10.70911 0.5389154 7.677982
## 0.007142857 4481.47248 0.3111252 3007.946093
## 0.014285714 224.54490 0.4327764 148.895417
## 0.021428571 11.31681 0.4434589 8.190686
## 0.028571429 11.07845 0.4576801 8.074427
## 0.035714286 10.90095 0.4706866 7.997531
## 0.042857143 10.79583 0.4795058 7.967427
## 0.050000000 10.72215 0.4869756 7.947439
## 0.057142857 10.65453 0.4935822 7.927341
## 0.064285714 10.63009 0.4980133 7.937547
## 0.071428571 10.59211 0.5024700 7.939073
## 0.078571429 10.56938 0.5061914 7.941613
## 0.085714286 10.56359 0.5085101 7.959751
## 0.092857143 10.53339 0.5128997 7.950083
## 0.100000000 10.52636 0.5151716 7.963822
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
# Evaluate the tuned ridge model on the held-out compounds.
rgPred1_fp <- predict(ridgeRegFit, newdata = test_fp)
rgValues1 <- data.frame(
  obs = permeability_test,
  pred = rgPred1_fp
)
defaultSummary(rgValues1)
## RMSE Rsquared MAE
## 13.3108143 0.4749396 9.6809970
For the ridge model, the test-set results are 13.3108143 as RMSE, 0.4749396 as Rsquared, and 9.6809970 as MAE.
6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
# Attach AppliedPredictiveModeling, load the ChemicalManufacturingProcess
# data set, and preview its first rows.
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
head(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 38.00 6.25 49.58 56.97
## 2 42.44 8.01 60.97 67.48
## 3 42.03 8.01 60.97 67.48
## 4 41.42 8.01 60.97 67.48
## 5 42.49 7.47 63.33 72.25
## 6 43.57 6.12 58.36 65.31
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1 12.74 19.51 43.73
## 2 14.65 19.36 53.14
## 3 14.65 19.36 53.14
## 4 14.65 19.36 53.14
## 5 14.02 17.91 54.66
## 6 15.17 21.79 51.23
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1 100 16.66 11.44
## 2 100 19.04 12.55
## 3 100 19.04 12.55
## 4 100 19.04 12.55
## 5 100 18.22 12.80
## 6 100 18.30 12.13
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1 3.46 138.09 18.83
## 2 3.46 153.67 21.05
## 3 3.46 153.67 21.05
## 4 3.46 153.67 21.05
## 5 3.05 147.61 21.05
## 6 3.78 151.88 20.76
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1 NA NA NA
## 2 0.0 0 NA
## 3 0.0 0 NA
## 4 0.0 0 NA
## 5 10.7 0 NA
## 6 12.0 0 NA
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1 NA NA NA
## 2 917 1032.2 210.0
## 3 912 1003.6 207.1
## 4 911 1014.6 213.3
## 5 918 1027.5 205.7
## 6 924 1016.8 208.9
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1 NA NA 43.00
## 2 177 178 46.57
## 3 178 178 45.07
## 4 177 177 44.92
## 5 178 178 44.96
## 6 178 178 45.32
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1 NA NA NA
## 2 NA NA 0
## 3 NA NA 0
## 4 NA NA 0
## 5 NA NA 0
## 6 NA NA 0
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1 35.5 4898 6108
## 2 34.0 4869 6095
## 3 34.8 4878 6087
## 4 34.8 4897 6102
## 5 34.6 4992 6233
## 6 34.0 4985 6222
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1 4682 35.5 4865
## 2 4617 34.0 4867
## 3 4617 34.8 4877
## 4 4635 34.8 4872
## 5 4733 33.9 4886
## 6 4786 33.4 4862
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1 6049 4665 0.0
## 2 6097 4621 0.0
## 3 6078 4621 0.0
## 4 6073 4611 0.0
## 5 6102 4659 -0.7
## 6 6115 4696 -0.6
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1 NA NA NA
## 2 3 0 3
## 3 4 1 4
## 4 5 2 5
## 5 8 4 18
## 6 9 1 1
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1 4873 6074 4685
## 2 4869 6107 4630
## 3 4897 6116 4637
## 4 4892 6111 4630
## 5 4930 6151 4684
## 6 4871 6128 4687
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1 10.7 21.0 9.9
## 2 11.2 21.4 9.9
## 3 11.1 21.3 9.4
## 4 11.1 21.3 9.4
## 5 11.3 21.6 9.0
## 6 11.4 21.7 10.1
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1 69.1 156 66
## 2 68.7 169 66
## 3 69.3 173 66
## 4 69.3 171 68
## 5 69.4 171 70
## 6 68.2 173 70
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1 2.4 486 0.019
## 2 2.6 508 0.019
## 3 2.6 509 0.018
## 4 2.5 496 0.018
## 5 2.5 468 0.017
## 6 2.5 490 0.018
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1 0.5 3 7.2
## 2 2.0 2 7.2
## 3 0.7 2 7.2
## 4 1.2 2 7.2
## 5 0.2 2 7.3
## 6 0.4 2 7.2
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1 NA NA 11.6
## 2 0.1 0.15 11.1
## 3 0.0 0.00 12.0
## 4 0.0 0.00 10.6
## 5 0.0 0.00 11.0
## 6 0.0 0.00 11.5
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1 3.0 1.8 2.4
## 2 0.9 1.9 2.2
## 3 1.0 1.8 2.3
## 4 1.1 1.8 2.1
## 5 1.1 1.7 2.1
## 6 2.2 1.8 2.0
# Per-column summary statistics; columns with missing values also report
# their NA counts.
summary(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02
## Min. :35.25 Min. :4.580 Min. :46.87
## 1st Qu.:38.75 1st Qu.:5.978 1st Qu.:52.68
## Median :39.97 Median :6.305 Median :55.09
## Mean :40.18 Mean :6.411 Mean :55.69
## 3rd Qu.:41.48 3rd Qu.:6.870 3rd Qu.:58.74
## Max. :46.34 Max. :8.810 Max. :64.75
##
## BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
## Min. :56.97 Min. : 9.38 Min. :13.24
## 1st Qu.:64.98 1st Qu.:11.24 1st Qu.:17.23
## Median :67.22 Median :12.10 Median :18.49
## Mean :67.70 Mean :12.35 Mean :18.60
## 3rd Qu.:70.43 3rd Qu.:13.22 3rd Qu.:19.90
## Max. :78.25 Max. :23.09 Max. :24.85
##
## BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
## Min. :40.60 Min. :100.0 Min. :15.88
## 1st Qu.:46.05 1st Qu.:100.0 1st Qu.:17.06
## Median :48.46 Median :100.0 Median :17.51
## Mean :48.91 Mean :100.0 Mean :17.49
## 3rd Qu.:51.34 3rd Qu.:100.0 3rd Qu.:17.88
## Max. :59.38 Max. :100.8 Max. :19.14
##
## BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
## Min. :11.44 Min. :1.770 Min. :135.8
## 1st Qu.:12.60 1st Qu.:2.460 1st Qu.:143.8
## Median :12.84 Median :2.710 Median :146.1
## Mean :12.85 Mean :2.801 Mean :147.0
## 3rd Qu.:13.13 3rd Qu.:2.990 3rd Qu.:149.6
## Max. :14.08 Max. :6.870 Max. :158.7
##
## BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
## Min. :18.35 Min. : 0.00 Min. : 0.00
## 1st Qu.:19.73 1st Qu.:10.80 1st Qu.:19.30
## Median :20.12 Median :11.40 Median :21.00
## Mean :20.20 Mean :11.21 Mean :16.68
## 3rd Qu.:20.75 3rd Qu.:12.15 3rd Qu.:21.50
## Max. :22.21 Max. :14.10 Max. :22.50
## NA's :1 NA's :3
## ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
## Min. :1.47 Min. :911.0 Min. : 923.0
## 1st Qu.:1.53 1st Qu.:928.0 1st Qu.: 986.8
## Median :1.54 Median :934.0 Median : 999.2
## Mean :1.54 Mean :931.9 Mean :1001.7
## 3rd Qu.:1.55 3rd Qu.:936.0 3rd Qu.:1008.9
## Max. :1.60 Max. :946.0 Max. :1175.3
## NA's :15 NA's :1 NA's :1
## ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
## Min. :203.0 Min. :177.0 Min. :177.0
## 1st Qu.:205.7 1st Qu.:177.0 1st Qu.:177.0
## Median :206.8 Median :177.0 Median :178.0
## Mean :207.4 Mean :177.5 Mean :177.6
## 3rd Qu.:208.7 3rd Qu.:178.0 3rd Qu.:178.0
## Max. :227.4 Max. :178.0 Max. :178.0
## NA's :2 NA's :1 NA's :1
## ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
## Min. :38.89 Min. : 7.500 Min. : 7.500
## 1st Qu.:44.89 1st Qu.: 8.700 1st Qu.: 9.000
## Median :45.73 Median : 9.100 Median : 9.400
## Mean :45.66 Mean : 9.179 Mean : 9.386
## 3rd Qu.:46.52 3rd Qu.: 9.550 3rd Qu.: 9.900
## Max. :49.36 Max. :11.600 Max. :11.500
## NA's :9 NA's :10
## ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
## Min. : 0.0 Min. :32.10 Min. :4701
## 1st Qu.: 0.0 1st Qu.:33.90 1st Qu.:4828
## Median : 0.0 Median :34.60 Median :4856
## Mean : 857.8 Mean :34.51 Mean :4854
## 3rd Qu.: 0.0 3rd Qu.:35.20 3rd Qu.:4882
## Max. :4549.0 Max. :38.60 Max. :5055
## NA's :1 NA's :1
## ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
## Min. :5904 Min. : 0 Min. :31.30
## 1st Qu.:6010 1st Qu.:4561 1st Qu.:33.50
## Median :6032 Median :4588 Median :34.40
## Mean :6039 Mean :4566 Mean :34.34
## 3rd Qu.:6061 3rd Qu.:4619 3rd Qu.:35.10
## Max. :6233 Max. :4852 Max. :40.00
##
## ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
## Min. : 0 Min. :5890 Min. : 0
## 1st Qu.:4813 1st Qu.:6001 1st Qu.:4553
## Median :4835 Median :6022 Median :4582
## Mean :4810 Mean :6028 Mean :4556
## 3rd Qu.:4862 3rd Qu.:6050 3rd Qu.:4610
## Max. :4971 Max. :6146 Max. :4759
##
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## Min. :-1.8000 Min. : 0.000 Min. :0.000
## 1st Qu.:-0.6000 1st Qu.: 3.000 1st Qu.:2.000
## Median :-0.3000 Median : 5.000 Median :3.000
## Mean :-0.1642 Mean : 5.406 Mean :3.017
## 3rd Qu.: 0.0000 3rd Qu.: 8.000 3rd Qu.:4.000
## Max. : 3.6000 Max. :12.000 Max. :6.000
## NA's :1 NA's :1
## ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
## Min. : 0.000 Min. : 0 Min. : 0
## 1st Qu.: 4.000 1st Qu.:4832 1st Qu.:6020
## Median : 8.000 Median :4855 Median :6047
## Mean : 8.834 Mean :4828 Mean :6016
## 3rd Qu.:14.000 3rd Qu.:4877 3rd Qu.:6070
## Max. :23.000 Max. :4990 Max. :6161
## NA's :1 NA's :5 NA's :5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## Min. : 0 Min. : 0.000 Min. : 0.00
## 1st Qu.:4560 1st Qu.: 0.000 1st Qu.:19.70
## Median :4587 Median :10.400 Median :19.90
## Mean :4563 Mean : 6.592 Mean :20.01
## 3rd Qu.:4609 3rd Qu.:10.750 3rd Qu.:20.40
## Max. :4710 Max. :11.500 Max. :22.00
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
## Min. : 0.000 Min. : 0.00 Min. :143.0
## 1st Qu.: 8.800 1st Qu.:70.10 1st Qu.:155.0
## Median : 9.100 Median :70.80 Median :158.0
## Mean : 9.161 Mean :70.18 Mean :158.5
## 3rd Qu.: 9.700 3rd Qu.:71.40 3rd Qu.:162.0
## Max. :11.200 Max. :72.50 Max. :173.0
## NA's :5 NA's :5
## ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
## Min. :56.00 Min. :2.300 Min. :463.0
## 1st Qu.:62.00 1st Qu.:2.500 1st Qu.:490.0
## Median :64.00 Median :2.500 Median :495.0
## Mean :63.54 Mean :2.494 Mean :495.6
## 3rd Qu.:65.00 3rd Qu.:2.500 3rd Qu.:501.5
## Max. :70.00 Max. :2.600 Max. :522.0
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
## Min. :0.01700 Min. :0.000 Min. :0.000
## 1st Qu.:0.01900 1st Qu.:0.700 1st Qu.:2.000
## Median :0.02000 Median :1.000 Median :3.000
## Mean :0.01957 Mean :1.014 Mean :2.534
## 3rd Qu.:0.02000 3rd Qu.:1.300 3rd Qu.:3.000
## Max. :0.02200 Max. :2.300 Max. :3.000
## NA's :5
## ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
## Min. :0.000 Min. :0.00000 Min. :0.00000
## 1st Qu.:7.100 1st Qu.:0.00000 1st Qu.:0.00000
## Median :7.200 Median :0.00000 Median :0.00000
## Mean :6.851 Mean :0.01771 Mean :0.02371
## 3rd Qu.:7.300 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :7.500 Max. :0.10000 Max. :0.20000
## NA's :1 NA's :1
## ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
## Min. : 0.00 Min. : 0.0000 Min. :0.000
## 1st Qu.:11.40 1st Qu.: 0.6000 1st Qu.:1.800
## Median :11.60 Median : 0.8000 Median :1.900
## Mean :11.21 Mean : 0.9119 Mean :1.805
## 3rd Qu.:11.70 3rd Qu.: 1.0250 3rd Qu.:1.900
## Max. :12.10 Max. :11.0000 Max. :2.100
##
## ManufacturingProcess45
## Min. :0.000
## 1st Qu.:2.100
## Median :2.200
## Mean :2.138
## 3rd Qu.:2.300
## Max. :2.600
##
We clearly see that some predictors have missing values; let’s impute them with kNN.
# Impute missing predictor values with knnImputation(); the Yield response is
# left out of the imputation input and re-attached afterwards.
predictor_cols <- setdiff(names(ChemicalManufacturingProcess), "Yield")
ChemicalManuf_df <- knnImputation(ChemicalManufacturingProcess[, predictor_cols])
# Confirm that no missing values remain.
anyNA(ChemicalManuf_df)
## [1] FALSE
ChemicalManuf_df[["Yield"]] <- ChemicalManufacturingProcess[["Yield"]]
Split data into train and test set
# Use three quarters of the rows for training.
n_rows <- nrow(ChemicalManuf_df)
smp_size <- floor(n_rows * 0.75)
# Seed the RNG so the same partition is drawn on every run.
set.seed(123)
train_ind <- sample(seq_len(n_rows), size = smp_size)
# Sampled rows form the training set; the remaining rows form the test set.
train <- ChemicalManuf_df[train_ind, ]
test <- ChemicalManuf_df[-train_ind, ]
Since we already imputed missing values we don’t have much left to pre-process
Let’s try using ordinary linear regression
# Ordinary least squares of Yield on every other column of the training set,
# followed by the coefficient table and fit statistics.
lmFitAllPredictors <- lm(Yield ~ ., data = train)
summary(lmFitAllPredictors)
##
## Call:
## lm(formula = Yield ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.77835 -0.49030 -0.03638 0.44151 2.16912
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.770e+02 1.576e+02 1.758 0.08286 .
## BiologicalMaterial01 3.168e-01 4.241e-01 0.747 0.45740
## BiologicalMaterial02 -7.270e-02 1.544e-01 -0.471 0.63907
## BiologicalMaterial03 7.080e-01 3.306e-01 2.141 0.03549 *
## BiologicalMaterial04 -1.195e-01 6.703e-01 -0.178 0.85895
## BiologicalMaterial05 2.656e-01 1.352e-01 1.964 0.05318 .
## BiologicalMaterial06 -7.302e-01 4.366e-01 -1.673 0.09857 .
## BiologicalMaterial07 -1.394e+00 1.141e+00 -1.222 0.22572
## BiologicalMaterial08 1.490e+00 8.851e-01 1.683 0.09645 .
## BiologicalMaterial09 -3.322e+00 1.922e+00 -1.729 0.08798 .
## BiologicalMaterial10 -4.643e-01 1.771e+00 -0.262 0.79391
## BiologicalMaterial11 -9.107e-02 1.008e-01 -0.904 0.36897
## BiologicalMaterial12 7.872e-01 7.897e-01 0.997 0.32203
## ManufacturingProcess01 1.898e-01 1.396e-01 1.360 0.17795
## ManufacturingProcess02 -5.480e-02 6.184e-02 -0.886 0.37836
## ManufacturingProcess03 1.652e-01 6.872e+00 0.024 0.98089
## ManufacturingProcess04 3.383e-02 4.228e-02 0.800 0.42621
## ManufacturingProcess05 3.062e-03 4.465e-03 0.686 0.49486
## ManufacturingProcess06 1.207e-01 8.263e-02 1.460 0.14839
## ManufacturingProcess07 -1.552e-01 2.726e-01 -0.569 0.57099
## ManufacturingProcess08 -2.466e-01 3.308e-01 -0.745 0.45838
## ManufacturingProcess09 3.161e-01 2.404e-01 1.315 0.19254
## ManufacturingProcess10 -6.921e-01 7.025e-01 -0.985 0.32767
## ManufacturingProcess11 4.460e-02 8.668e-01 0.051 0.95910
## ManufacturingProcess12 3.869e-05 1.436e-04 0.269 0.78836
## ManufacturingProcess13 -4.554e-01 4.738e-01 -0.961 0.33955
## ManufacturingProcess14 -6.512e-03 1.327e-02 -0.491 0.62508
## ManufacturingProcess15 2.713e-03 1.159e-02 0.234 0.81554
## ManufacturingProcess16 -2.433e-05 3.747e-04 -0.065 0.94841
## ManufacturingProcess17 -6.513e-02 3.907e-01 -0.167 0.86803
## ManufacturingProcess18 7.537e-03 5.537e-03 1.361 0.17751
## ManufacturingProcess19 -1.508e-02 1.077e-02 -1.401 0.16546
## ManufacturingProcess20 -7.739e-03 5.836e-03 -1.326 0.18880
## ManufacturingProcess21 NA NA NA NA
## ManufacturingProcess22 -7.860e-02 5.510e-02 -1.427 0.15782
## ManufacturingProcess23 1.338e-01 1.104e-01 1.213 0.22899
## ManufacturingProcess24 -6.122e-02 3.100e-02 -1.975 0.05197 .
## ManufacturingProcess25 -5.284e-03 1.832e-02 -0.288 0.77378
## ManufacturingProcess26 1.483e-03 1.337e-02 0.111 0.91195
## ManufacturingProcess27 -3.732e-03 9.485e-03 -0.393 0.69512
## ManufacturingProcess28 -1.051e-01 4.166e-02 -2.524 0.01373 *
## ManufacturingProcess29 2.233e+00 1.017e+00 2.196 0.03120 *
## ManufacturingProcess30 -1.131e+00 7.711e-01 -1.467 0.14660
## ManufacturingProcess31 7.921e-03 1.340e-01 0.059 0.95303
## ManufacturingProcess32 2.794e-01 8.577e-02 3.258 0.00169 **
## ManufacturingProcess33 -2.789e-01 1.629e-01 -1.713 0.09089 .
## ManufacturingProcess34 1.871e-01 3.726e+00 0.050 0.96008
## ManufacturingProcess35 -1.756e-02 2.471e-02 -0.710 0.47969
## ManufacturingProcess36 5.057e+02 4.177e+02 1.211 0.22981
## ManufacturingProcess37 -8.858e-01 4.078e-01 -2.172 0.03303 *
## ManufacturingProcess38 -1.191e-01 2.939e-01 -0.405 0.68633
## ManufacturingProcess39 -2.072e-03 1.718e-01 -0.012 0.99041
## ManufacturingProcess40 3.166e+00 8.926e+00 0.355 0.72380
## ManufacturingProcess41 -6.858e-01 6.972e+00 -0.098 0.92190
## ManufacturingProcess42 2.395e-01 2.685e-01 0.892 0.37511
## ManufacturingProcess43 4.897e-01 4.674e-01 1.048 0.29812
## ManufacturingProcess44 -1.838e+00 1.558e+00 -1.179 0.24199
## ManufacturingProcess45 1.137e+00 7.006e-01 1.623 0.10889
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.082 on 75 degrees of freedom
## Multiple R-squared: 0.8036, Adjusted R-squared: 0.657
## F-statistic: 5.481 on 56 and 75 DF, p-value: 1.127e-11
# Draw the diagnostic plots for the fitted linear model.
plot(lmFitAllPredictors)
Adjusted Rsquared is 0.657
We see a lot of insignificant predictors so we should probably apply a PLS or reduce the collinearity
We can also see that there is some nonlinearity in our data set and that the residuals are not normally distributed around the mean. So let’s try a different model.
Let’s check for other model parameters
# Resampling scheme for the caret fits: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)
# Separate the training predictors from the Yield response.
solTrainXtrans <- train[, setdiff(names(train), "Yield")]
solTrainY <- train[["Yield"]]
# Seed the RNG so the resampling is reproducible.
set.seed(100)
lmFit1 <- train(
  x = solTrainXtrans,
  y = solTrainY,
  method = "lm",
  trControl = ctrl
)
lmFit1
## Linear Regression
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 118, 117, 119, 119, 119, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 3.922679 0.332014 1.981117
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
We can see we have an RMSE of 3.922 and Rsquared of 0.33 and MAE of 1.98. Tuning parameter ‘intercept’ was held constant at a value of TRUE
Let’s find highly correlated predictors and see the difference it makes to the Rsquared. We will filter out the collinear predictors.
# Pairwise absolute-correlation cutoff above which a predictor is dropped.
corThresh <- 0.9
# Screen predictors only. Including Yield in the correlation matrix lets the
# response influence which predictor of a correlated pair is removed, and
# Yield itself could be flagged, which would break the extraction below.
predictorNames <- setdiff(names(train), "Yield")
tooHigh <- findCorrelation(cor(train[, predictorNames]), cutoff = corThresh)
corrPred <- predictorNames[tooHigh]
# Remove columns by name: `train[, -tooHigh]` would drop every column if
# `tooHigh` were empty.
keepCols <- setdiff(names(train), corrPred)
trainXfiltered <- train[, keepCols, drop = FALSE]
testXfiltered <- test[, keepCols, drop = FALSE]
solTrainXtransfilter <- trainXfiltered[, setdiff(keepCols, "Yield"), drop = FALSE]
solTrainYfilter <- trainXfiltered[["Yield"]]
# Re-seed before resampling so the fit is reproducible.
set.seed(100)
lmFiltered <- train(
  x = solTrainXtransfilter,
  y = solTrainYfilter,
  method = "lm",
  trControl = ctrl
)
lmFiltered
## Linear Regression
##
## 132 samples
## 48 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 118, 117, 119, 119, 119, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 6.30165 0.3542627 2.564754
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
We can see that the RMSE increased to 6.30, the Rsquared increased slightly to 0.35, and the MAE increased to 2.56.
# Fit a partial least squares regression of Yield on every other training
# column and report the variance explained per component.
# NOTE(review): no scaling or validation argument is passed here — confirm
# that the plsr() defaults are intended.
plsFit <- plsr(Yield ~ ., data = train)
summary(plsFit)
## Data: X dimension: 132 57
## Y dimension: 132 1
## Fit method: kernelpls
## Number of components considered: 57
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 73.17 81.78 93.57 97.91 99.86 99.90 99.94
## Yield 10.91 11.71 12.69 17.53 23.64 38.97 44.04
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 99.96 99.99 99.99 99.99 99.99 100.00 100.00
## Yield 45.83 47.30 54.78 60.73 63.66 64.61 66.03
## 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## X 100.0 100.00 100.0 100.00 100.00 100.00
## Yield 70.4 71.01 72.3 72.96 73.25 73.95
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps
## X 100.00 100.00 100.00 100.00 100.00 100.00
## Yield 74.48 74.85 75.09 75.73 76.46 76.83
## 27 comps 28 comps 29 comps 30 comps 31 comps 32 comps
## X 100.00 100.00 100.00 100.00 100.00 100.00
## Yield 76.98 77.13 77.41 77.91 78.32 78.56
## 33 comps 34 comps 35 comps 36 comps 37 comps 38 comps
## X 100.0 100.00 100.0 100.00 100.00 100.00
## Yield 78.9 79.12 79.2 79.35 79.46 79.65
## 39 comps 40 comps 41 comps 42 comps 43 comps 44 comps
## X 100.00 100.00 100.00 100.0 100.00 100.00
## Yield 79.78 79.83 79.87 79.9 79.93 79.95
## 45 comps 46 comps 47 comps 48 comps 49 comps 50 comps
## X 100.00 100.00 100.00 100.00 100.00 100.00
## Yield 79.96 79.97 79.98 79.98 79.98 79.98
## 51 comps 52 comps 53 comps 54 comps 55 comps 56 comps
## X 100.00 100.00 100.00 100.00 100.00 100.00
## Yield 79.98 79.98 79.98 79.98 79.98 80.36
## 57 comps
## X 100.00
## Yield 80.36
# Plot the PLS fit produced by plsr().
plot(plsFit)
# Cross-validated PLS tuning (tuneLength = 20) on centered and scaled
# predictors: first with all predictors, then with the correlation-filtered set.
pls_cv_all <- trainControl(method = "cv")
pls_model <- train(
  x = solTrainXtrans,
  y = solTrainY,
  method = "pls",
  tuneLength = 20,
  trControl = pls_cv_all,
  preProcess = c("center", "scale")
)
pls_cv_filtered <- trainControl(method = "cv")
pls_model_filtered <- train(
  x = solTrainXtransfilter,
  y = solTrainYfilter,
  method = "pls",
  tuneLength = 20,
  trControl = pls_cv_filtered,
  preProcess = c("center", "scale")
)
# Separate the test predictors from the Yield response.
solTestXtrans <- test[, setdiff(names(test), "Yield")]
solTestY <- test[["Yield"]]
# Evaluate the cross-validated linear model (lmFit1) on the test set.
lmPred1 <- predict(lmFit1, newdata = solTestXtrans)
lmValues1 <- data.frame(
  obs = solTestY,
  pred = lmPred1
)
defaultSummary(lmValues1)
## RMSE Rsquared MAE
## 1.4409881 0.5114828 1.1757253
The result for OLS on the test set looks better than the model diagnostic performance metrics from the training set.
The most important predictors can be seen by looking at the results of the linear model output. Listed with their estimate, standard error, t value, and p-value, they are: ManufacturingProcess37 (-8.858e-01, 4.078e-01, -2.172, 0.03303); ManufacturingProcess32 (2.794e-01, 8.577e-02, 3.258, 0.00169); ManufacturingProcess28 (-1.051e-01, 4.166e-02, -2.524, 0.01373); ManufacturingProcess29 (2.233e+00, 1.017e+00, 2.196, 0.03120); BiologicalMaterial03 (7.080e-01, 3.306e-01, 2.141, 0.03549).
We clearly see the manufacturingProcess dominates the list
#
# Scatter plot of Yield against each selected predictor, with the simple
# least-squares line overlaid in red. A loop replaces five copies of the same
# plot()/abline() pair, and the axes are labelled with the column names
# instead of the raw `df$col` expressions.
top_predictors <- c(
  "ManufacturingProcess37",
  "ManufacturingProcess32",
  "ManufacturingProcess28",
  "ManufacturingProcess29",
  "BiologicalMaterial03"
)
for (predictor in top_predictors) {
  plot(
    ChemicalManuf_df[[predictor]],
    ChemicalManuf_df[["Yield"]],
    xlab = predictor,
    ylab = "Yield"
  )
  # reformulate() builds `Yield ~ <predictor>` from the column name.
  abline(
    lm(reformulate(predictor, response = "Yield"), data = ChemicalManuf_df),
    col = "red"
  )
}
We can say that ManufacturingProcess37 has a slight negative correlation with yield, while ManufacturingProcess32 and BiologicalMaterial03 have a decent positive correlation with yield. If we increase them, the yield will increase.