Juan Olazaba 12/03/2023
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. Yield contains the percent yield for each run.
FirstHalf <- ChemicalManufacturingProcess[,1:29]
SecondHalf <- ChemicalManufacturingProcess[,30:58]
par(mfrow=c(1,3))
hist(ChemicalManufacturingProcess$ManufacturingProcess11,type = "count",main="Distribution MP11")## Warning in plot.window(xlim, ylim, "", ...): graphical parameter "type" is
## obsolete
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## graphical parameter "type" is obsolete
## Warning in axis(1, ...): graphical parameter "type" is obsolete
## Warning in axis(2, at = yt, ...): graphical parameter "type" is obsolete
## Warning in plot.window(xlim, ylim, "", ...): graphical parameter "type" is
## obsolete
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## graphical parameter "type" is obsolete
## Warning in axis(1, ...): graphical parameter "type" is obsolete
## Warning in axis(2, at = yt, ...): graphical parameter "type" is obsolete
## Warning in plot.window(xlim, ylim, "", ...): graphical parameter "type" is
## obsolete
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## graphical parameter "type" is obsolete
## Warning in axis(1, ...): graphical parameter "type" is obsolete
## Warning in axis(2, at = yt, ...): graphical parameter "type" is obsolete
data_im <- ChemicalManufacturingProcess[c("ManufacturingProcess11", "ManufacturingProcess03", "ManufacturingProcess10")]
imputed_values <- mice(data_im, method = "pmm", printFlag = F, seed =112)
completedData <- complete(imputed_values, 1)
ChemicalManufacturingProcess$ManufacturingProcess11 <- completedData$ManufacturingProcess11
ChemicalManufacturingProcess$ManufacturingProcess10 <- completedData$ManufacturingProcess10
ChemicalManufacturingProcess$ManufacturingProcess03 <- completedData$ManufacturingProcess03
new_data <- na.omit(ChemicalManufacturingProcess)
plot_missing(new_data) # Clean data :)Ridge Regression
x <- model.matrix(Yield ~., new_data)[, -1]
y <- new_data$Yield
# data splitting
set.seed(96)
train <- sample(1:nrow(new_data), nrow(new_data) / 2)
test <- (-train)
y.test <- y[test]
## Model fitting
grid <- 10^seq(10, -2, length = 100)
###Ridge Regression, used a random lambda value test the power of the model.
ridge.mod <- glmnet(x[train , ], y[train], alpha = 0,
lambda = grid, thresh = 1e-12)
ridge.pred <- predict(ridge.mod , s = 20, newx = x[test , ])
mean((ridge.pred - y.test)^2) #MSE: 2.467558## [1] 2.467558
#Finding the optimal lambda utilizing cross-validation for ridge regression.
set.seed(6)
cv.out <- cv.glmnet(x[train , ], y[train], alpha = 0)
plot(cv.out)## [1] 0.460014
# Coefficients using optimal lambda
out <- glmnet(x, y, alpha = 0)
predict(out , type = "coefficients", s = bestlam)[1:57, ]## (Intercept) BiologicalMaterial01 BiologicalMaterial02
## 8.910848e+01 1.029318e-01 7.244448e-03
## BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
## 3.090888e-02 3.581320e-02 3.858972e-02
## BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
## 2.698948e-02 -9.329293e-01 7.808427e-02
## BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
## -2.455511e-01 -7.938717e-02 -7.200355e-04
## BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
## 1.603601e-02 2.071566e-02 2.314465e-03
## ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
## -3.332547e+00 2.737263e-02 -9.747124e-04
## ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
## 4.328888e-02 -2.602450e-01 3.128700e-02
## ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
## 2.275637e-01 4.166123e-02 1.565552e-01
## ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
## 6.306160e-05 -2.370125e-01 4.426441e-04
## ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
## 2.625980e-03 -1.384694e-03 -1.754136e-01
## ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
## 8.057177e-05 3.750315e-03 1.821843e-05
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## -4.961324e-02 1.077821e-05 -3.063913e-02
## ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
## -1.548427e-02 -4.427763e-05 1.085923e-05
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## -6.651666e-05 -3.766564e-02 4.911396e-02
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
## 3.974552e-02 -5.000457e-03 7.477786e-02
## ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
## 1.860308e-02 4.148849e+00 1.377736e-03
## ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
## -2.712846e+02 -4.788460e-01 -1.097401e-01
## ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
## 1.032430e-01 6.804015e-02 -4.699237e-01
## ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
## -2.628558e-02 1.021157e-01 6.090091e-03
LASSO
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
#Cross validation for optimal lambda
set.seed(19)
cv.outl <- cv.glmnet(x[train , ], y[train], alpha = 1)
plot(cv.outl)PCR
### PCR
set.seed(27)
pcr.fit <- pcr(Yield ~ ., data = new_data , subset = train,
scale = TRUE , validation = "CV")
validationplot(pcr.fit , val.type = "MSEP") #Lowest cross-validation error is with 10 components## Data: X dimension: 166 57
## Y dimension: 166 1
## Fit method: svdpc
## Number of components considered: 10
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 18.64 28.61 37.59 43.70 49.06 53.66 58.05 61.94
## y 26.41 30.54 42.87 46.64 48.02 48.96 49.06 52.43
## 9 comps 10 comps
## X 65.50 68.77
## y 59.55 60.29
PLS
### PLS
set.seed(45)
pls.fit <- plsr(Yield ~ ., data = new_data , subset = train , scale
= TRUE, validation = "CV")
summary(pls.fit)## Data: X dimension: 83 57
## Y dimension: 83 1
## Fit method: kernelpls
## Number of components considered: 57
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 1.951 1.352 1.289 1.266 1.331 1.346 1.318
## adjCV 1.951 1.349 1.281 1.248 1.297 1.306 1.284
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1.291 1.282 1.266 1.272 1.302 1.317 1.298
## adjCV 1.251 1.240 1.227 1.232 1.260 1.274 1.255
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 1.330 1.357 1.356 1.338 1.348 1.381 1.396
## adjCV 1.285 1.310 1.310 1.293 1.302 1.333 1.348
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 1.43 1.458 1.481 1.515 1.556 1.617 1.676
## adjCV 1.38 1.406 1.428 1.460 1.498 1.555 1.611
## 28 comps 29 comps 30 comps 31 comps 32 comps 33 comps 34 comps
## CV 1.725 1.793 1.850 1.947 1.986 2.046 2.084
## adjCV 1.656 1.720 1.773 1.864 1.900 1.956 1.992
## 35 comps 36 comps 37 comps 38 comps 39 comps 40 comps 41 comps
## CV 2.157 2.265 2.316 2.350 2.388 2.429 2.443
## adjCV 2.060 2.161 2.209 2.241 2.277 2.315 2.328
## 42 comps 43 comps 44 comps 45 comps 46 comps 47 comps 48 comps
## CV 2.508 2.556 2.642 2.654 2.695 2.789 2.835
## adjCV 2.389 2.435 2.515 2.526 2.565 2.653 2.695
## 49 comps 50 comps 51 comps 52 comps 53 comps 54 comps 55 comps
## CV 2.856 2.887 2.903 2.914 2.924 2.924 2.916
## adjCV 2.716 2.745 2.760 2.769 2.779 2.779 2.772
## 56 comps 57 comps
## CV 2.911 2.914
## adjCV 2.767 2.769
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 19.92 31.92 37.38 40.83 44.62 52.33 54.73 58.74
## Yield 56.56 67.75 76.40 81.97 84.94 85.69 87.62 88.85
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 63.03 65.62 68.38 71.09 73.20 74.98 76.63
## Yield 89.85 90.75 91.17 91.53 91.93 92.35 92.60
## 16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps
## X 78.17 79.38 80.43 81.39 83.15 84.70 85.97
## Yield 92.75 92.91 93.07 93.18 93.21 93.25 93.29
## 23 comps 24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
## X 87.46 88.50 90.00 90.68 91.45 92.44 93.33
## Yield 93.32 93.36 93.38 93.41 93.43 93.45 93.48
## 30 comps 31 comps 32 comps 33 comps 34 comps 35 comps 36 comps
## X 94.39 95.16 95.83 96.32 96.86 97.21 97.75
## Yield 93.51 93.54 93.57 93.59 93.60 93.62 93.64
## 37 comps 38 comps 39 comps 40 comps 41 comps 42 comps 43 comps
## X 98.46 98.94 99.13 99.28 99.35 99.45 99.54
## Yield 93.65 93.68 93.71 93.73 93.75 93.76 93.78
## 44 comps 45 comps 46 comps 47 comps 48 comps 49 comps 50 comps
## X 99.62 99.70 99.79 99.82 99.85 99.88 99.91
## Yield 93.78 93.79 93.79 93.81 93.81 93.81 93.81
## 51 comps 52 comps 53 comps 54 comps 55 comps 56 comps 57 comps
## X 99.94 99.95 99.97 99.98 99.99 100.00 100.08
## Yield 93.81 93.81 93.81 93.81 93.81 93.81 93.81
## Data: X dimension: 166 57
## Y dimension: 166 1
## Fit method: kernelpls
## Number of components considered: 4
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps
## X 16.95 25.96 31.47 36.37
## y 49.70 64.07 69.95 72.69
## (Intercept) BiologicalMaterial01 BiologicalMaterial02
## 1.366141e+02 0.000000e+00 0.000000e+00
## BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
## 1.275920e-02 0.000000e+00 7.025633e-02
## BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
## 1.201891e-01 -1.122690e+00 0.000000e+00
## BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
## 0.000000e+00 -2.898049e-01 0.000000e+00
## BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
## 0.000000e+00 0.000000e+00 0.000000e+00
## ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
## -4.759549e+00 7.229469e-02 -8.837081e-04
## ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
## 8.216265e-03 -5.891644e-01 -1.673217e-01
## ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
## 5.069050e-01 0.000000e+00 0.000000e+00
## ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
## 0.000000e+00 0.000000e+00 0.000000e+00
## ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
## 0.000000e+00 0.000000e+00 -4.597614e-01
## ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
## -2.062011e-04 0.000000e+00 0.000000e+00
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## 0.000000e+00 -3.114361e-02 -7.218787e-02
## ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
## -2.184447e-02 0.000000e+00 6.408898e-03
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 0.000000e+00 -3.611126e-02 1.722118e-01
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
## 0.000000e+00 2.468144e-02 2.368757e-01
## ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
## -1.448558e-01 0.000000e+00 0.000000e+00
## ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
## 5.138167e+01 -9.543530e-01 -3.312017e-01
## ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
## 2.085729e-01 2.283147e+00 0.000000e+00
## ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
## 5.489576e-01 1.553575e-01 -1.796786e+00
## Number of zeros in the vector: 25
## (Intercept) BiologicalMaterial03 BiologicalMaterial05
## 1.366141e+02 1.275920e-02 7.025633e-02
## BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial10
## 1.201891e-01 -1.122690e+00 -2.898049e-01
## ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
## -4.759549e+00 7.229469e-02 -8.837081e-04
## ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
## 8.216265e-03 -5.891644e-01 -1.673217e-01
## ManufacturingProcess09 ManufacturingProcess17 ManufacturingProcess18
## 5.069050e-01 -4.597614e-01 -2.062011e-04
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## -3.114361e-02 -7.218787e-02 -2.184447e-02
## ManufacturingProcess26 ManufacturingProcess28 ManufacturingProcess29
## 6.408898e-03 -3.611126e-02 1.722118e-01
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 2.468144e-02 2.368757e-01 -1.448558e-01
## ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
## 5.138167e+01 -9.543530e-01 -3.312017e-01
## ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess42
## 2.085729e-01 2.283147e+00 5.489576e-01
## ManufacturingProcess43 ManufacturingProcess44
## 1.553575e-01 -1.796786e+00
LassoPredictors <- new_data[c("Yield","BiologicalMaterial03", "BiologicalMaterial05", "BiologicalMaterial06",
"BiologicalMaterial07", "BiologicalMaterial10", "ManufacturingProcess03",
"ManufacturingProcess04", "ManufacturingProcess05", "ManufacturingProcess06",
"ManufacturingProcess07", "ManufacturingProcess08", "ManufacturingProcess09",
"ManufacturingProcess17", "ManufacturingProcess18", "ManufacturingProcess22",
"ManufacturingProcess23", "ManufacturingProcess24", "ManufacturingProcess26",
"ManufacturingProcess28", "ManufacturingProcess29", "ManufacturingProcess31",
"ManufacturingProcess32", "ManufacturingProcess33", "ManufacturingProcess36",
"ManufacturingProcess37", "ManufacturingProcess38", "ManufacturingProcess39",
"ManufacturingProcess40", "ManufacturingProcess42", "ManufacturingProcess43",
"ManufacturingProcess44")]
Matrix <- cor(LassoPredictors)
corrplot(Matrix, method = 'color')