Start R and use these commands to load the data:
## Warning: package 'AppliedPredictiveModeling' was built under R version 3.6.3
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
# Keep columns 2 through 58 of the raw data as the predictor set; the report
# text describes these as the 57 predictors for the 176 manufacturing runs.
# NOTE(review): column 1 is presumably Yield -- confirm against the data set.
processPredictors <- ChemicalManufacturingProcess[, 2:58]
print(
  paste0(
    "The number of columns is ", ncol(processPredictors),
    " and the number of rows is ", nrow(processPredictors)
  )
)
## [1] "The number of columns is 57 and the number of rows is 176"
## BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## Min. :4.580 Min. :46.87 Min. :56.97
## 1st Qu.:5.978 1st Qu.:52.68 1st Qu.:64.98
## Median :6.305 Median :55.09 Median :67.22
## Mean :6.411 Mean :55.69 Mean :67.70
## 3rd Qu.:6.870 3rd Qu.:58.74 3rd Qu.:70.43
## Max. :8.810 Max. :64.75 Max. :78.25
##
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## Min. : 9.38 Min. :13.24 Min. :40.60
## 1st Qu.:11.24 1st Qu.:17.23 1st Qu.:46.05
## Median :12.10 Median :18.49 Median :48.46
## Mean :12.35 Mean :18.60 Mean :48.91
## 3rd Qu.:13.22 3rd Qu.:19.90 3rd Qu.:51.34
## Max. :23.09 Max. :24.85 Max. :59.38
##
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## Min. :100.0 Min. :15.88 Min. :11.44
## 1st Qu.:100.0 1st Qu.:17.06 1st Qu.:12.60
## Median :100.0 Median :17.51 Median :12.84
## Mean :100.0 Mean :17.49 Mean :12.85
## 3rd Qu.:100.0 3rd Qu.:17.88 3rd Qu.:13.13
## Max. :100.8 Max. :19.14 Max. :14.08
##
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## Min. :1.770 Min. :135.8 Min. :18.35
## 1st Qu.:2.460 1st Qu.:143.8 1st Qu.:19.73
## Median :2.710 Median :146.1 Median :20.12
## Mean :2.801 Mean :147.0 Mean :20.20
## 3rd Qu.:2.990 3rd Qu.:149.6 3rd Qu.:20.75
## Max. :6.870 Max. :158.7 Max. :22.21
##
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## Min. : 0.00 Min. : 0.00 Min. :1.47
## 1st Qu.:10.80 1st Qu.:19.30 1st Qu.:1.53
## Median :11.40 Median :21.00 Median :1.54
## Mean :11.21 Mean :16.68 Mean :1.54
## 3rd Qu.:12.15 3rd Qu.:21.50 3rd Qu.:1.55
## Max. :14.10 Max. :22.50 Max. :1.60
## NA's :1 NA's :3 NA's :15
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## Min. :911.0 Min. : 923.0 Min. :203.0
## 1st Qu.:928.0 1st Qu.: 986.8 1st Qu.:205.7
## Median :934.0 Median : 999.2 Median :206.8
## Mean :931.9 Mean :1001.7 Mean :207.4
## 3rd Qu.:936.0 3rd Qu.:1008.9 3rd Qu.:208.7
## Max. :946.0 Max. :1175.3 Max. :227.4
## NA's :1 NA's :1 NA's :2
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## Min. :177.0 Min. :177.0 Min. :38.89
## 1st Qu.:177.0 1st Qu.:177.0 1st Qu.:44.89
## Median :177.0 Median :178.0 Median :45.73
## Mean :177.5 Mean :177.6 Mean :45.66
## 3rd Qu.:178.0 3rd Qu.:178.0 3rd Qu.:46.52
## Max. :178.0 Max. :178.0 Max. :49.36
## NA's :1 NA's :1
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## Min. : 7.500 Min. : 7.500 Min. : 0.0
## 1st Qu.: 8.700 1st Qu.: 9.000 1st Qu.: 0.0
## Median : 9.100 Median : 9.400 Median : 0.0
## Mean : 9.179 Mean : 9.386 Mean : 857.8
## 3rd Qu.: 9.550 3rd Qu.: 9.900 3rd Qu.: 0.0
## Max. :11.600 Max. :11.500 Max. :4549.0
## NA's :9 NA's :10 NA's :1
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## Min. :32.10 Min. :4701 Min. :5904
## 1st Qu.:33.90 1st Qu.:4828 1st Qu.:6010
## Median :34.60 Median :4856 Median :6032
## Mean :34.51 Mean :4854 Mean :6039
## 3rd Qu.:35.20 3rd Qu.:4882 3rd Qu.:6061
## Max. :38.60 Max. :5055 Max. :6233
## NA's :1
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## Min. : 0 Min. :31.30 Min. : 0
## 1st Qu.:4561 1st Qu.:33.50 1st Qu.:4813
## Median :4588 Median :34.40 Median :4835
## Mean :4566 Mean :34.34 Mean :4810
## 3rd Qu.:4619 3rd Qu.:35.10 3rd Qu.:4862
## Max. :4852 Max. :40.00 Max. :4971
##
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## Min. :5890 Min. : 0 Min. :-1.8000
## 1st Qu.:6001 1st Qu.:4553 1st Qu.:-0.6000
## Median :6022 Median :4582 Median :-0.3000
## Mean :6028 Mean :4556 Mean :-0.1642
## 3rd Qu.:6050 3rd Qu.:4610 3rd Qu.: 0.0000
## Max. :6146 Max. :4759 Max. : 3.6000
##
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## Min. : 0.000 Min. :0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.:2.000 1st Qu.: 4.000
## Median : 5.000 Median :3.000 Median : 8.000
## Mean : 5.406 Mean :3.017 Mean : 8.834
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.:14.000
## Max. :12.000 Max. :6.000 Max. :23.000
## NA's :1 NA's :1 NA's :1
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.:4832 1st Qu.:6020 1st Qu.:4560
## Median :4855 Median :6047 Median :4587
## Mean :4828 Mean :6016 Mean :4563
## 3rd Qu.:4877 3rd Qu.:6070 3rd Qu.:4609
## Max. :4990 Max. :6161 Max. :4710
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.:19.70 1st Qu.: 8.800
## Median :10.400 Median :19.90 Median : 9.100
## Mean : 6.592 Mean :20.01 Mean : 9.161
## 3rd Qu.:10.750 3rd Qu.:20.40 3rd Qu.: 9.700
## Max. :11.500 Max. :22.00 Max. :11.200
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## Min. : 0.00 Min. :143.0 Min. :56.00
## 1st Qu.:70.10 1st Qu.:155.0 1st Qu.:62.00
## Median :70.80 Median :158.0 Median :64.00
## Mean :70.18 Mean :158.5 Mean :63.54
## 3rd Qu.:71.40 3rd Qu.:162.0 3rd Qu.:65.00
## Max. :72.50 Max. :173.0 Max. :70.00
## NA's :5 NA's :5
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## Min. :2.300 Min. :463.0 Min. :0.01700
## 1st Qu.:2.500 1st Qu.:490.0 1st Qu.:0.01900
## Median :2.500 Median :495.0 Median :0.02000
## Mean :2.494 Mean :495.6 Mean :0.01957
## 3rd Qu.:2.500 3rd Qu.:501.5 3rd Qu.:0.02000
## Max. :2.600 Max. :522.0 Max. :0.02200
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.700 1st Qu.:2.000 1st Qu.:7.100
## Median :1.000 Median :3.000 Median :7.200
## Mean :1.014 Mean :2.534 Mean :6.851
## 3rd Qu.:1.300 3rd Qu.:3.000 3rd Qu.:7.300
## Max. :2.300 Max. :3.000 Max. :7.500
##
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## Min. :0.00000 Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:11.40
## Median :0.00000 Median :0.00000 Median :11.60
## Mean :0.01771 Mean :0.02371 Mean :11.21
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:11.70
## Max. :0.10000 Max. :0.20000 Max. :12.10
## NA's :1 NA's :1
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## Min. : 0.0000 Min. :0.000 Min. :0.000
## 1st Qu.: 0.6000 1st Qu.:1.800 1st Qu.:2.100
## Median : 0.8000 Median :1.900 Median :2.200
## Mean : 0.9119 Mean :1.805 Mean :2.138
## 3rd Qu.: 1.0250 3rd Qu.:1.900 3rd Qu.:2.300
## Max. :11.0000 Max. :2.100 Max. :2.600
##
A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values.
Let’s visually look at the missing data:
# Count the missing values in every predictor column and keep only the
# predictors that have at least one NA, sorted from most to fewest missing.
# The table is built directly from the named count vector, which avoids the
# rowname shuffling and the unsafe 1:nrow() sequence.
na_counts <- colSums(is.na(processPredictors))
missingData <- data.frame(
  Predictors = names(na_counts),
  NAs = unname(na_counts)
)
missingData <- missingData[missingData$NAs != 0, ] %>%
  arrange(desc(NAs))
head(missingData)
## Predictors NAs
## 1 ManufacturingProcess03 15
## 2 ManufacturingProcess11 10
## 3 ManufacturingProcess10 9
## 4 ManufacturingProcess25 5
## 5 ManufacturingProcess26 5
## 6 ManufacturingProcess27 5
# Horizontal bar chart of the NA count per predictor, ordered by count.
# geom_col() draws bars whose heights are the y values, replacing
# geom_bar(stat = "identity") and its stray trailing empty argument.
missingData %>%
  ggplot(aes(x = reorder(Predictors, NAs), y = NAs, fill = factor(NAs))) +
  geom_col() +
  labs(x = "Predictor", y = "NAs", title = "Number of missing values") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()
I used a kNN imputation strategy to fill in for the missing predictors I also used the default value of k=5.
# Fit a kNN imputation pre-processing step on the predictors and apply it to
# fill in the missing values.
# Note: the values printed below are on a standardized scale rather than the
# raw units shown in the summary above.
set.seed(24)
knnImputedValues <- preProcess(processPredictors, method = "knnImpute")
# Call predict() directly: wrapping it in try(silent = TRUE) would store a
# "try-error" object in processPredictors_imputed and hide the real failure
# until some later, unrelated-looking step broke.
processPredictors_imputed <- predict(knnImputedValues, processPredictors)
head(processPredictors_imputed)
## BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 -0.2261036 -1.5140979 -2.68303622
## 2 2.2391498 1.3089960 -0.05623504
## 3 2.2391498 1.3089960 -0.05623504
## 4 2.2391498 1.3089960 -0.05623504
## 5 1.4827653 1.8939391 1.13594780
## 6 -0.4081962 0.6620886 -0.59859075
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1 0.2201765 0.4941942 -1.3828880
## 2 1.2964386 0.4128555 1.1290767
## 3 1.2964386 0.4128555 1.1290767
## 4 1.2964386 0.4128555 1.1290767
## 5 0.9414412 -0.3734185 1.5348350
## 6 1.5894524 1.7305423 0.6192092
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1 -0.1313107 -1.233131 -3.3962895
## 2 -0.1313107 2.282619 -0.7227225
## 3 -0.1313107 2.282619 -0.7227225
## 4 -0.1313107 2.282619 -0.7227225
## 5 -0.1313107 1.071310 -0.1205678
## 6 -0.1313107 1.189487 -1.7343424
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1 1.1005296 -1.838655 -1.7709224
## 2 1.1005296 1.393395 1.0989855
## 3 1.1005296 1.393395 1.0989855
## 4 1.1005296 1.393395 1.0989855
## 5 0.4162193 0.136256 1.0989855
## 6 1.6346255 1.022062 0.7240877
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1 0.2154105 0.5662872 0.3765810
## 2 -6.1497028 -1.9692525 0.1979962
## 3 -6.1497028 -1.9692525 0.1087038
## 4 -6.1497028 -1.9692525 0.4658734
## 5 -0.2784345 -1.9692525 0.1087038
## 6 0.4348971 -1.9692525 0.5551658
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1 0.5655598 -0.44593467 -0.5414997
## 2 -2.3669726 0.99933318 0.9625383
## 3 -3.1638563 0.06246417 -0.1117745
## 4 -3.3232331 0.42279841 2.1850322
## 5 -2.2075958 0.84537219 -0.6304083
## 6 -1.2513352 0.49486525 0.5550403
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1 -0.1596700 -0.3095182 -1.7201524
## 2 -0.9580199 0.8941637 0.5883746
## 3 1.0378549 0.8941637 -0.3815947
## 4 -0.9580199 -1.1119728 -0.4785917
## 5 1.0378549 0.8941637 -0.4527258
## 6 1.0378549 0.8941637 -0.2199332
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1 -0.07700901 -0.09157342 -0.4806937
## 2 0.52297397 1.08204765 -0.4806937
## 3 0.31428424 0.55112383 -0.4806937
## 4 -0.02483658 0.80261406 -0.4806937
## 5 -0.39004361 0.10403009 -0.4806937
## 6 0.28819802 1.41736795 -0.4806937
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1 0.97711512 0.8093999 1.1846438
## 2 -0.50030980 0.2775205 0.9617071
## 3 0.28765016 0.4425865 0.8245152
## 4 0.28765016 0.7910592 1.0817499
## 5 0.09066017 2.5334227 3.3282665
## 6 -0.50030980 2.4050380 3.1396277
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1 0.3303945 0.9263296 0.1505348
## 2 0.1455765 -0.2753953 0.1559773
## 3 0.1455765 0.3655246 0.1831898
## 4 0.1967569 0.3655246 0.1695836
## 5 0.4754056 -0.3555103 0.2076811
## 6 0.6261033 -0.7560852 0.1423710
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1 0.4563798 0.3109942 0.2109804
## 2 1.5095063 0.1849230 0.2109804
## 3 1.0926437 0.1849230 0.2109804
## 4 0.9829430 0.1562704 0.2109804
## 5 1.6192070 0.2938027 -0.6884239
## 6 1.9044287 0.3998171 -0.5599376
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1 0.05833309 0.8317688 0.8907291
## 2 -0.72230090 -1.8147683 -1.0060115
## 3 -0.42205706 -1.2132826 -0.8335805
## 4 -0.12181322 -0.6117969 -0.6611496
## 5 0.77891831 0.5911745 1.5804530
## 6 1.07916216 -1.2132826 -1.3508734
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1 0.1200183 0.1256347 0.3460352
## 2 0.1093082 0.1966227 0.1906613
## 3 0.1842786 0.2159831 0.2104362
## 4 0.1708910 0.2052273 0.1906613
## 5 0.2726365 0.2912733 0.3432102
## 6 0.1146633 0.2417969 0.3516852
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1 0.7826636 0.5943242 0.7566948
## 2 0.8779201 0.8347250 0.7566948
## 3 0.8588688 0.7746248 0.2444430
## 4 0.8588688 0.7746248 0.2444430
## 5 0.8969714 0.9549255 -0.1653585
## 6 0.9160227 1.0150257 0.9615956
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1 -0.1952552 -0.4568829 0.9890307
## 2 -0.2672523 1.9517531 0.9890307
## 3 -0.1592567 2.6928719 0.9890307
## 4 -0.1592567 2.3223125 1.7943843
## 5 -0.1412574 2.3223125 2.5997378
## 6 -0.3572486 2.6928719 2.5997378
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1 -1.7202722 -0.88694718 -0.6557774
## 2 1.9568096 1.14638329 -0.6557774
## 3 1.9568096 1.23880740 -1.8000420
## 4 0.1182687 0.03729394 -1.8000420
## 5 0.1182687 -2.55058120 -2.9443066
## 6 0.1182687 -0.51725073 -1.8000420
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1 -1.1540243 0.7174727 0.2317270
## 2 2.2161351 -0.8224687 0.2317270
## 3 -0.7046697 -0.8224687 0.2317270
## 4 0.4187168 -0.8224687 0.2317270
## 5 -1.8280562 -0.8224687 0.2981503
## 6 -1.3787016 -0.8224687 0.2317270
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1 0.05969714 -0.06900773 0.20279570
## 2 2.14909691 2.34626280 -0.05472265
## 3 -0.46265281 -0.44058781 0.40881037
## 4 -0.46265281 -0.44058781 -0.31224099
## 5 -0.46265281 -0.44058781 -0.10622632
## 6 -0.46265281 -0.44058781 0.15129203
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1 2.40564734 -0.01588055 0.64371849
## 2 -0.01374656 0.29467248 0.15220242
## 3 0.10146268 -0.01588055 0.39796046
## 4 0.21667191 -0.01588055 -0.09355562
## 5 0.21667191 -0.32643359 -0.09355562
## 6 1.48397347 -0.01588055 -0.33931365
# Re-count NAs per predictor after imputation; an empty table confirms that
# no missing values remain. The table is built directly from the named count
# vector instead of reshuffling rownames with 1:nrow().
na_counts_imputed <- colSums(is.na(processPredictors_imputed))
missingData <- data.frame(
  Predictors = names(na_counts_imputed),
  NAs = unname(na_counts_imputed)
)
missingData <- missingData[missingData$NAs != 0, ]
head(missingData)
## [1] Predictors NAs
## <0 rows> (or 0-length row.names)
Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
## [1] "BiologicalMaterial07"
## [1] -0.1313107 -0.1313107 -0.1313107 -0.1313107 -0.1313107 -0.1313107
## [7] -0.1313107 -0.1313107 -0.1313107 -0.1313107 -0.1313107 -0.1313107
## [13] -0.1313107 -0.1313107 -0.1313107 -0.1313107 -0.1313107 -0.1313107
## [19] -0.1313107 -0.1313107
We can remove these variables to reduce multicollinearity:
“BiologicalMaterial02” “BiologicalMaterial04”
“BiologicalMaterial12”
“ManufacturingProcess29” “ManufacturingProcess42” “ManufacturingProcess27” “ManufacturingProcess25” “ManufacturingProcess31” “ManufacturingProcess18” “ManufacturingProcess40”
# Look at correlation between variables
# The correlation matrix is computed once: the plot uses a copy rounded to one
# decimal, while findCorrelation() works on the unrounded values.
predictor_cor <- cor(processPredictors_imputed)
corr <- round(predictor_cor, 1)
# The data plotted here is the full imputed predictor set (no train/test split
# has been made yet), so the title says so instead of "Training Data Set".
ggcorrplot(
  corr,
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  method = "circle",
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlation of Imputed Predictors (All Runs)",
  ggtheme = theme_bw
)
# Names of the predictors that findCorrelation() suggests removing at a 0.9
# correlation cutoff.
removePredictors <- findCorrelation(predictor_cor, cutoff = 0.9, names = TRUE)
removePredictors
## [1] "BiologicalMaterial02" "BiologicalMaterial04" "BiologicalMaterial12"
## [4] "ManufacturingProcess29" "ManufacturingProcess42" "ManufacturingProcess27"
## [7] "ManufacturingProcess25" "ManufacturingProcess31" "ManufacturingProcess18"
## [10] "ManufacturingProcess40"
## [1] "BiologicalMaterial01" "BiologicalMaterial03" "BiologicalMaterial05"
## [4] "BiologicalMaterial06" "BiologicalMaterial08" "BiologicalMaterial09"
## [7] "BiologicalMaterial10" "BiologicalMaterial11" "ManufacturingProcess01"
## [10] "ManufacturingProcess02" "ManufacturingProcess03" "ManufacturingProcess04"
## [13] "ManufacturingProcess05" "ManufacturingProcess06" "ManufacturingProcess07"
## [16] "ManufacturingProcess08" "ManufacturingProcess09" "ManufacturingProcess10"
## [19] "ManufacturingProcess11" "ManufacturingProcess12" "ManufacturingProcess13"
## [22] "ManufacturingProcess14" "ManufacturingProcess15" "ManufacturingProcess16"
## [25] "ManufacturingProcess17" "ManufacturingProcess19" "ManufacturingProcess20"
## [28] "ManufacturingProcess21" "ManufacturingProcess22" "ManufacturingProcess23"
## [31] "ManufacturingProcess24" "ManufacturingProcess26" "ManufacturingProcess28"
## [34] "ManufacturingProcess30" "ManufacturingProcess32" "ManufacturingProcess33"
## [37] "ManufacturingProcess34" "ManufacturingProcess35" "ManufacturingProcess36"
## [40] "ManufacturingProcess37" "ManufacturingProcess38" "ManufacturingProcess39"
## [43] "ManufacturingProcess41" "ManufacturingProcess43" "ManufacturingProcess44"
## [46] "ManufacturingProcess45"
## BiologicalMaterial01 BiologicalMaterial03 BiologicalMaterial05
## 2.733165e-01 2.851075e-02 3.040053e-01
## BiologicalMaterial06 BiologicalMaterial08 BiologicalMaterial09
## 3.685344e-01 2.200539e-01 -2.684177e-01
## BiologicalMaterial10 BiologicalMaterial11 ManufacturingProcess01
## 2.402378e+00 3.588211e-01 -3.933603e+00
## ManufacturingProcess02 ManufacturingProcess03 ManufacturingProcess04
## -1.457407e+00 -5.780286e-01 -7.066169e-01
## ManufacturingProcess05 ManufacturingProcess06 ManufacturingProcess07
## 2.597437e+00 3.059522e+00 8.228220e-02
## ManufacturingProcess08 ManufacturingProcess09 ManufacturingProcess10
## -2.119250e-01 -9.406685e-01 6.162450e-01
## ManufacturingProcess11 ManufacturingProcess12 ManufacturingProcess13
## -5.276473e-02 1.587654e+00 4.802776e-01
## ManufacturingProcess14 ManufacturingProcess15 ManufacturingProcess16
## 1.522719e-05 6.743478e-01 -1.242022e+01
## ManufacturingProcess17 ManufacturingProcess19 ManufacturingProcess20
## 1.162972e+00 2.973414e-01 -1.263833e+01
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## 1.729114e+00 3.148052e-01 1.853733e-01
## ManufacturingProcess24 ManufacturingProcess26 ManufacturingProcess28
## 3.468427e-01 -1.285807e+01 -4.012169e-01
## ManufacturingProcess30 ManufacturingProcess32 ManufacturingProcess33
## -4.821694e+00 2.112252e-01 -1.175389e-01
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## -2.468109e-01 -1.091573e-01 1.758717e-01
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 3.783578e-01 -1.681805e+00 -4.269121e+00
## ManufacturingProcess41 ManufacturingProcess43 ManufacturingProcess44
## 2.176066e+00 9.054875e+00 -4.970355e+00
## ManufacturingProcess45
## -4.077941e+00
## Created from 176 samples and 46 variables
##
## Pre-processing:
## - centered (46)
## - ignored (0)
## - scaled (46)
## BiologicalMaterial01 BiologicalMaterial03 BiologicalMaterial05
## 2.733165e-01 2.851075e-02 3.040053e-01
## BiologicalMaterial06 BiologicalMaterial08 BiologicalMaterial09
## 3.685344e-01 2.200539e-01 -2.684177e-01
## BiologicalMaterial10 BiologicalMaterial11 ManufacturingProcess01
## 2.402378e+00 3.588211e-01 -3.933603e+00
## ManufacturingProcess02 ManufacturingProcess03 ManufacturingProcess04
## -1.457407e+00 -5.780286e-01 -7.066169e-01
## ManufacturingProcess05 ManufacturingProcess06 ManufacturingProcess07
## 2.597437e+00 3.059522e+00 8.228220e-02
## ManufacturingProcess08 ManufacturingProcess09 ManufacturingProcess10
## -2.119250e-01 -9.406685e-01 6.162450e-01
## ManufacturingProcess11 ManufacturingProcess12 ManufacturingProcess13
## -5.276473e-02 1.587654e+00 4.802776e-01
## ManufacturingProcess14 ManufacturingProcess15 ManufacturingProcess16
## 1.522719e-05 6.743478e-01 -1.242022e+01
## ManufacturingProcess17 ManufacturingProcess19 ManufacturingProcess20
## 1.162972e+00 2.973414e-01 -1.263833e+01
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## 1.729114e+00 3.148052e-01 1.853733e-01
## ManufacturingProcess24 ManufacturingProcess26 ManufacturingProcess28
## 3.468427e-01 -1.285807e+01 -4.012169e-01
## ManufacturingProcess30 ManufacturingProcess32 ManufacturingProcess33
## -4.821694e+00 2.112252e-01 -1.175389e-01
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## -2.468109e-01 -1.091573e-01 1.758717e-01
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 3.783578e-01 -1.681805e+00 -4.269121e+00
## ManufacturingProcess41 ManufacturingProcess43 ManufacturingProcess44
## 2.176066e+00 9.054875e+00 -4.970355e+00
## ManufacturingProcess45
## -4.077941e+00
# Combine the response and the transformed predictors into one data frame with
# Yield as the first column. The column is named in the cbind() call itself;
# renaming afterwards by matching the auto-generated column name would silently
# do nothing if that generated name ever differed.
chemmfgproc <- cbind(Yield = ChemicalManufacturingProcess$Yield, processPredictors_transformed)
5. Split the data into Training and Test Set
# Split the modelling data 80/20 into training and test sets, stratifying on
# the Yield column.
# NOTE(review): no set.seed() call is visible immediately before this random
# split -- confirm the split is reproducible.
chemmfgproc_train <- initial_split(
  chemmfgproc,
  prop = 0.8,
  strata = "Yield"
)
train_chemmfgproc <- training(chemmfgproc_train)
test_chemmfgproc <- testing(chemmfgproc_train)
print(
  paste0(
    "The number of observations in the training set is ",
    nrow(train_chemmfgproc)
  )
)
## [1] "The number of observations in the training set is 144"
## [1] "The number of observations in the test set is 32"
6. Tune a model of your choice from this chapter
I tried two models. The first is the traditional linear model (“lm”), and the second is Partial Least Squares (“pls”).
A summary of the linear model showed that the most important predictor was ManufacturingProcess32. None of the biological processes were among the most important. The residuals showed independence. The RMSE for the linear model was 1.36.
For the pls model, there were three main components to the model and a RMSE of 1.23 for the training set.
Linear Model
# Fit an ordinary linear regression with 10-fold cross-validation.
set.seed(100)
y_train <- train_chemmfgproc$Yield
# Select the predictors by excluding the response by name rather than by a
# hard-coded 2:47 column range, so the selection stays correct if the number
# or order of columns changes.
x_train <- train_chemmfgproc[, names(train_chemmfgproc) != "Yield", drop = FALSE]
ctrl <- trainControl(method = "cv", number = 10)
lmFit1 <- train(x_train, y_train, method = "lm", trControl = ctrl)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.84468 -0.61001 -0.03228 0.49874 2.04023
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.788648 0.359047 110.817 < 2e-16 ***
## BiologicalMaterial01 0.017688 0.244626 0.072 0.94250
## BiologicalMaterial03 0.249893 0.729100 0.343 0.73253
## BiologicalMaterial05 0.278578 0.220180 1.265 0.20879
## BiologicalMaterial06 -0.050823 0.858274 -0.059 0.95290
## BiologicalMaterial08 0.469859 0.444451 1.057 0.29303
## BiologicalMaterial09 -0.235983 0.434054 -0.544 0.58790
## BiologicalMaterial10 -0.275980 0.237208 -1.163 0.24747
## BiologicalMaterial11 -0.186049 0.312601 -0.595 0.55311
## ManufacturingProcess01 0.163776 0.172301 0.951 0.34419
## ManufacturingProcess02 -0.131226 0.338340 -0.388 0.69897
## ManufacturingProcess03 -0.091327 0.135101 -0.676 0.50064
## ManufacturingProcess04 0.287464 0.207875 1.383 0.16985
## ManufacturingProcess05 -0.002509 0.118187 -0.021 0.98310
## ManufacturingProcess06 0.056070 0.122569 0.457 0.64836
## ManufacturingProcess07 -0.073481 0.122789 -0.598 0.55093
## ManufacturingProcess08 -0.133036 0.136932 -0.972 0.33367
## ManufacturingProcess09 0.602144 0.310813 1.937 0.05559 .
## ManufacturingProcess10 -0.375694 0.443904 -0.846 0.39942
## ManufacturingProcess11 0.127286 0.528856 0.241 0.81030
## ManufacturingProcess12 0.060058 0.191207 0.314 0.75411
## ManufacturingProcess13 -0.577263 0.421208 -1.370 0.17366
## ManufacturingProcess14 0.563152 0.591145 0.953 0.34311
## ManufacturingProcess15 -0.397312 0.586895 -0.677 0.50002
## ManufacturingProcess16 -1.196599 1.598202 -0.749 0.45582
## ManufacturingProcess17 0.041370 0.378977 0.109 0.91330
## ManufacturingProcess19 -0.083440 0.381114 -0.219 0.82715
## ManufacturingProcess20 0.061164 0.139187 0.439 0.66131
## ManufacturingProcess21 NA NA NA NA
## ManufacturingProcess22 0.034855 0.143656 0.243 0.80880
## ManufacturingProcess23 -0.061469 0.152284 -0.404 0.68735
## ManufacturingProcess24 -0.104537 0.146012 -0.716 0.47572
## ManufacturingProcess26 6.273486 3.947793 1.589 0.11526
## ManufacturingProcess28 -0.388547 0.165019 -2.355 0.02054 *
## ManufacturingProcess30 0.210706 0.848653 0.248 0.80444
## ManufacturingProcess32 1.674154 0.378523 4.423 2.52e-05 ***
## ManufacturingProcess33 -0.803240 0.340619 -2.358 0.02035 *
## ManufacturingProcess34 0.024774 0.164074 0.151 0.88029
## ManufacturingProcess35 -0.247740 0.216063 -1.147 0.25434
## ManufacturingProcess36 0.354261 0.294344 1.204 0.23166
## ManufacturingProcess37 -0.386424 0.142533 -2.711 0.00792 **
## ManufacturingProcess38 -0.143298 0.164293 -0.872 0.38522
## ManufacturingProcess39 0.145553 0.223052 0.653 0.51557
## ManufacturingProcess41 0.071153 0.102427 0.695 0.48891
## ManufacturingProcess43 0.211614 0.109615 1.931 0.05643 .
## ManufacturingProcess44 -0.126021 0.290241 -0.434 0.66510
## ManufacturingProcess45 0.476900 0.229657 2.077 0.04046 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.047 on 98 degrees of freedom
## Multiple R-squared: 0.7825, Adjusted R-squared: 0.6826
## F-statistic: 7.833 on 45 and 98 DF, p-value: < 2.2e-16
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 1.356479 0.5463053 1.075012 0.1687234 0.1300263 0.08993038
Partial Least Squares or PLS
# Tune a partial least squares model on the training predictors using 10-fold
# cross-validation; tuneLength = 25 requests 25 candidate tuning values.
set.seed(100)
plsFit1 <- train(
  x = x_train,
  y = y_train,
  method = "pls",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 25
)
## Data: X dimension: 144 46
## Y dimension: 144 1
## Fit method: oscorespls
## Number of components considered: 3
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps
## X 17.79 27.71 35.03
## .outcome 51.70 62.91 67.69
## ncomp
## 3 3
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 3 1.227284 0.5832037 1.018451 0.1561764 0.1052495 0.1289533
Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
The RMSE for the test set is 1.076 compared to the RMSE of the training set 1.23.
# Predict Yield for the held-out test set with the tuned PLS model and compare
# the test RMSE with the resampled training RMSE.
# The previous ncomp=4 argument is dropped: predict() on a caret train object
# does not forward it to the final model, so it had no effect and contradicted
# the 3-component model selected by tuning.
plsPredict <- predict(plsFit1, newdata = test_chemmfgproc)
print(paste0("The RMSE for the test set is: ", RMSE(plsPredict, test_chemmfgproc$Yield), " and the RMSE of the training set is: ", train_set_results$RMSE))
## [1] "The RMSE for the test set is: 1.0763904615528 and the RMSE of the training set is: 1.22728403618774"
Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
The top 20 predictors are listed below. The top predictor is ManufacturingProcess32. Only 5 of the top 20 predictors are biological, so the process predictors dominate the list.
## pls variable importance
##
## only 20 most important variables shown (out of 46)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess09 92.25
## ManufacturingProcess13 89.29
## ManufacturingProcess17 79.76
## ManufacturingProcess36 76.59
## BiologicalMaterial06 71.81
## BiologicalMaterial08 68.31
## ManufacturingProcess06 65.81
## ManufacturingProcess33 62.39
## BiologicalMaterial03 60.56
## ManufacturingProcess11 60.20
## BiologicalMaterial01 60.06
## BiologicalMaterial11 59.69
## ManufacturingProcess12 53.26
## ManufacturingProcess04 50.16
## ManufacturingProcess28 46.57
## ManufacturingProcess24 44.75
## ManufacturingProcess02 42.85
## ManufacturingProcess30 38.67
## ManufacturingProcess37 37.93
Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
From the varImp function, we see the top 20 predictors for the model. The most important predictor is ManufacturingProcess32. The top 5 predictors with the most positive correlation with Yield are listed below:
0.6083321 ManufacturingProcess32
0.5034705 ManufacturingProcess09
0.4781634 BiologicalMaterial06
0.4450860 BiologicalMaterial03
0.4249171 ManufacturingProcess33
Conversely, the bottom 3 predictors with the most negative correlation with Yield are listed below:
-0.4258069 ManufacturingProcess17
-0.5036797 ManufacturingProcess13
-0.5237389 ManufacturingProcess36
Further investigation into how these predictors are positively and negatively correlated with the Yield may lead to more information on how to maximize the Yield.
# Rank the PLS variable importances and keep exactly the 20 highest.
# The earlier hard-coded cutoff (Overall >= 32.33072) let 23 predictors through,
# so "top 20" did not match its name. Base subsetting is used because it
# keeps the predictor names stored in the rownames.
# NOTE: the recorded output printed below this chunk was produced with the old
# cutoff and therefore lists 23 rows; re-knit to refresh it.
pls_importance <- varImp(plsFit1)$importance
top20Predictors <- head(
  pls_importance[order(pls_importance$Overall, decreasing = TRUE), , drop = FALSE],
  20
)
mostImportantPredictors <- chemmfgproc[colnames(chemmfgproc) %in% rownames(top20Predictors)]
# Name the response column in the cbind() call rather than renaming afterwards
# by the auto-generated column name.
mostImportant_df <- cbind(Yield = chemmfgproc$Yield, mostImportantPredictors)
# Correlation of each selected predictor with Yield, most positive first.
top20_correlations_to_Yield <- as.data.frame(cor(mostImportantPredictors, chemmfgproc$Yield))
top20_correlations_to_Yield["Predictors"] <- rownames(top20_correlations_to_Yield)
rownames(top20_correlations_to_Yield) <- seq_len(nrow(top20_correlations_to_Yield))
top20_correlations_to_Yield %>%
  arrange(desc(V1))
## V1 Predictors
## 1 0.6083321 ManufacturingProcess32
## 2 0.5034705 ManufacturingProcess09
## 3 0.4781634 BiologicalMaterial06
## 4 0.4450860 BiologicalMaterial03
## 5 0.4249171 ManufacturingProcess33
## 6 0.3918329 ManufacturingProcess06
## 7 0.3809402 BiologicalMaterial08
## 8 0.3589380 BiologicalMaterial01
## 9 0.3549143 BiologicalMaterial11
## 10 0.3525799 ManufacturingProcess11
## 11 0.3513037 ManufacturingProcess12
## 12 0.2655854 ManufacturingProcess28
## 13 0.2304898 ManufacturingProcess30
## 14 0.2161880 ManufacturingProcess15
## 15 0.2133838 ManufacturingProcess10
## 16 0.2008305 BiologicalMaterial10
## 17 -0.1593141 ManufacturingProcess37
## 18 -0.2146715 ManufacturingProcess02
## 19 -0.2148909 ManufacturingProcess24
## 20 -0.2660733 ManufacturingProcess04
## 21 -0.4258069 ManufacturingProcess17
## 22 -0.5036797 ManufacturingProcess13
## 23 -0.5237389 ManufacturingProcess36
# Correlation plot of Yield together with the selected top predictors.
# mostImportant_df is built from the full data set (all runs), not from the
# training split, so the title says so instead of "Training Data Set".
corr <- round(cor(mostImportant_df), 1)
ggcorrplot(
  corr,
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  method = "circle",
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlation of Yield and Top PLS Predictors (All Runs)",
  ggtheme = theme_bw
)