Load packages

library(zoo)
library(forecast)
library(readxl)
library(tidyverse)
library(imputeTS)
library(openxlsx)
library(fpp3)
library(lubridate)
library(caret)
library(pls)
library(RANN)
library(glmnet)

Homework 7

In Kuhn and Johnson do problems 6.2 and 6.3. There are only two but they consist of many parts. Please submit a link to your Rpubs and submit the .rmd file as well.

Question 6.2

Developing a model to predict permeability (see Sect.1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug

  1. Start R and use these commands to load the data
library(AppliedPredictiveModeling)
data(permeability)
  1. The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

After filtering, the predictors were decreased from 1107 to 388.

fingerprints_filtered <- fingerprints[,-nearZeroVar(fingerprints)]
ncol(fingerprints_filtered)
## [1] 388
ncol(fingerprints)
## [1] 1107
  1. Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

According to the RMSE, 6 latent variables are optimal. The corresponding resampled estimate of R-squared is approximately 0.429.

smp_size <- floor(0.80 * nrow(permeability))
set.seed(63)

trainingDataindex <- sample(seq_len(nrow(permeability)), 
                            size = smp_size)

trainY <- permeability[trainingDataindex,]
testY <- permeability[-trainingDataindex,]
trainX <- fingerprints_filtered[trainingDataindex,]
testX <- fingerprints_filtered[-trainingDataindex,]

ctrl <- trainControl(method = "cv", number = 10)
plsTune_RMSE <- train(trainX,trainY, 
                      method = "pls", 
                      tuneLength = 20, 
                      trControl = ctrl, 
                      preProcess = c("center","scale"))
plot(plsTune_RMSE)

plsTune_RMSE
## Partial Least Squares 
## 
## 132 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 119, 117, 118, 120, 119, 120, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     12.41154  0.3010846   9.278115
##    2     11.66428  0.4336483   8.283584
##    3     11.50006  0.4126173   8.460753
##    4     11.43274  0.3955810   8.662347
##    5     11.25505  0.4033256   8.394702
##    6     11.20513  0.4028330   8.492960
##    7     11.32395  0.3773739   8.522066
##    8     11.65818  0.3606319   8.884521
##    9     11.86653  0.3580183   9.131870
##   10     11.95114  0.3613253   9.112079
##   11     12.28667  0.3527594   9.199224
##   12     12.43209  0.3417924   9.240733
##   13     12.52618  0.3405473   9.146879
##   14     12.63797  0.3423856   9.135995
##   15     12.97477  0.3311300   9.248560
##   16     13.39126  0.3128652   9.576581
##   17     13.77485  0.2967797   9.828394
##   18     14.07538  0.2862263  10.008771
##   19     14.22405  0.2860567  10.135767
##   20     14.46824  0.2798174  10.324863
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.
plsTune <- train(trainX,trainY, 
                 method = "pls", 
                 tuneLength = 6, 
                 trControl = ctrl, 
                 preProcess = c("center","scale"))
  1. Predict the response for the test set. What is the test set estimate of R2?

The test set estimate of R-squared is approximately 0.504.

pls_predict <- predict(plsTune, testX)
pls_predict
##  [1] -1.4473856  9.9575712  2.8106381  8.2235579  9.9880304  8.4958148
##  [7]  7.1877076 -0.8662342 16.4210439 -4.4841389 25.3687203 26.3084614
## [13] 31.0026934 11.5372793 14.1859472 12.2208902 11.3135970 24.2265975
## [19]  3.8374952 35.5745105 31.5721502 26.1629810 24.0523809 24.5488152
## [25]  8.0418396 -2.8073497 28.5688430 24.3252861 12.3707114  0.6701642
## [31]  9.9781934  1.9271097 -0.1488661
R_pls <- cor(pls_predict, testY)
R2_pls <- R_pls^2
R2_pls
## [1] 0.5044235
  1. Try building other models discussed in this chapter. Do any have better predictive performance?

I tested an RLM PCA model, a regular linear regression model, a penalized linear regression model, and two other PLS models using wide kernel PLS and simpls. The PLS models, regardless of method, all had the same results and therefore the same R-squared values. The RLM PCA, LM PCA, and penalized LM PCA model had worse predictive performance, with an R-squared in the 0.43-0.45 range.

rlmPCA <- train(trainX, trainY, 
                method = "rlm", 
                preProcess = "pca", 
                trControl = ctrl)
rlmPCA
## Robust Linear Model 
## 
## 132 samples
## 388 predictors
## 
## Pre-processing: principal component signal extraction (388), centered
##  (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 118, 119, 118, 120, 118, ... 
## Resampling results across tuning parameters:
## 
##   intercept  psi           RMSE      Rsquared   MAE      
##   FALSE      psi.huber     15.72736  0.3848468  11.792842
##   FALSE      psi.hampel    15.81550  0.3687765  12.195135
##   FALSE      psi.bisquare  16.38937  0.3370810  12.115602
##    TRUE      psi.huber     11.53158  0.3692087   7.746914
##    TRUE      psi.hampel    12.24233  0.3237643   8.197525
##    TRUE      psi.bisquare  12.20349  0.3525813   7.698339
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were intercept = TRUE and psi = psi.huber.
rlmPCA_predict <- predict(rlmPCA, testX)
rlmPCA_predict
##          4         10         16         20         26         31         32 
##  5.3504665  7.8025038  4.1207200  4.6605423  6.9609123  8.6222278  4.6229334 
##         44         54         57         61         64         69         70 
##  4.1210598 11.7232769  0.4590693 17.5560333 17.4450841 30.7811297  2.6633294 
##         71         73        103        111        114        118        119 
##  5.2084823  7.2038731  8.0640462 16.9028794  7.0124822 33.6809822 30.6976745 
##        122        123        125        127        137        143        144 
## 16.9189746 19.4481503 19.4822334  5.0142965  2.1891482 17.9551952 16.0998784 
##        146        147        152        153        162 
## 10.1643241  0.6669210  4.5106037  6.3765577  4.5897811
cor(rlmPCA_predict, testY)^2
## [1] 0.4325927
lmPCA <- train(trainX, trainY, 
               method = "lm", 
               preProcess = "pca", 
               trControl = ctrl)

lmPCA
## Linear Regression 
## 
## 132 samples
## 388 predictors
## 
## Pre-processing: principal component signal extraction (388), centered
##  (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 120, 118, 118, 118, 119, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   11.59802  0.4057905  8.464152
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
lmPCA_predict <- predict(lmPCA, testX)
lmPCA_predict
##          4         10         16         20         26         31         32 
##  5.8641117 11.2797243  3.7312966  6.8545965  8.7622541  8.1302083  5.6235174 
##         44         54         57         61         64         69         70 
##  2.5425950 17.9098644 -0.5072346 23.7630747 24.4381638 37.5365796  3.7799848 
##         71         73        103        111        114        118        119 
##  6.2479900  7.3673932 11.1351881 22.8509422  7.1159582 35.6591850 34.8216506 
##        122        123        125        127        137        143        144 
## 22.5322271 24.8076985 25.0477358  5.0954433  4.2287213 20.8421848 17.2891533 
##        146        147        152        153        162 
## 13.7362779  2.2214386  5.1881920  5.3367736  3.3627942
cor(lmPCA_predict, testY)^2
## [1] 0.4358732
lasso_PCA <- train(trainX, trainY, 
                   method = "glmnet", 
                   preProcess = "pca", 
                   trControl = ctrl)

lasso_PCA
## glmnet 
## 
## 132 samples
## 388 predictors
## 
## Pre-processing: principal component signal extraction (388), centered
##  (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 119, 119, 118, 119, 117, 119, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda       RMSE      Rsquared   MAE     
##   0.10   0.009925983  10.94436  0.3972746  8.297547
##   0.10   0.099259834  10.93834  0.3972680  8.289601
##   0.10   0.992598335  10.83689  0.3969846  8.123197
##   0.55   0.009925983  10.94640  0.3972670  8.297941
##   0.55   0.099259834  10.91826  0.3971908  8.252090
##   0.55   0.992598335  10.97338  0.3973776  8.116889
##   1.00   0.009925983  10.94710  0.3972666  8.298406
##   1.00   0.099259834  10.90362  0.3969845  8.220084
##   1.00   0.992598335  11.46074  0.3821102  8.611531
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.1 and lambda = 0.9925983.
lasso_PCA_predict <- predict(lasso_PCA, testX)
lasso_PCA_predict
##          4         10         16         20         26         31         32 
##  5.6752371 11.3128403  4.4079335  7.2921218  9.2833210  8.1421702  6.0192593 
##         44         54         57         61         64         69         70 
##  3.1639307 17.0011321  0.7143191 22.8398747 23.4664922 34.8976863  4.4996435 
##         71         73        103        111        114        118        119 
##  6.4964998  7.4765327 10.7029216 22.0934818  7.3246423 33.3764055 32.7563820 
##        122        123        125        127        137        143        144 
## 21.7381260 23.9584169 24.0962090  5.8624514  4.8783337 20.1771736 17.0846203 
##        146        147        152        153        162 
## 13.2562092  2.6802196  6.2595230  5.8271646  4.2450245
cor(lasso_PCA_predict, testY)^2
## [1] 0.4428676
wk_pls <- train(trainX, trainY, 
                method = "widekernelpls", 
                trControl = ctrl, 
                preProcess = c("center","scale"))

wk_pls
## Partial Least Squares 
## 
## 132 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 119, 117, 119, 119, 120, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##   1      12.25574  0.3235845  9.260325
##   2      11.33268  0.4179886  8.124855
##   3      11.17124  0.4640138  8.386148
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
wk_pls_predict <- predict(wk_pls, testX)
wk_pls_predict
##  [1] -1.4473856  9.9575712  2.8106381  8.2235579  9.9880304  8.4958148
##  [7]  7.1877076 -0.8662342 16.4210439 -4.4841389 25.3687203 26.3084614
## [13] 31.0026934 11.5372793 14.1859472 12.2208902 11.3135970 24.2265975
## [19]  3.8374952 35.5745105 31.5721502 26.1629810 24.0523809 24.5488152
## [25]  8.0418396 -2.8073497 28.5688430 24.3252861 12.3707114  0.6701642
## [31]  9.9781934  1.9271097 -0.1488661
cor(wk_pls_predict, testY)^2
## [1] 0.5044235
simpls <- train(trainX, trainY, 
                method = "simpls", 
                trControl = ctrl, 
                preProcess = c("center","scale"))

simpls
## Partial Least Squares 
## 
## 132 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 119, 120, 119, 116, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##   1      12.16414  0.3078650  9.109961
##   2      11.39583  0.4257089  8.227754
##   3      11.23258  0.4594413  8.467877
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
simpls_predict <- predict(simpls, testX)
simpls_predict
##  [1] -1.4473856  9.9575712  2.8106381  8.2235579  9.9880304  8.4958148
##  [7]  7.1877076 -0.8662342 16.4210439 -4.4841389 25.3687203 26.3084614
## [13] 31.0026934 11.5372793 14.1859472 12.2208902 11.3135970 24.2265975
## [19]  3.8374952 35.5745105 31.5721502 26.1629810 24.0523809 24.5488152
## [25]  8.0418396 -2.8073497 28.5688430 24.3252861 12.3707114  0.6701642
## [31]  9.9781934  1.9271097 -0.1488661
cor(simpls_predict, testY)^2
## [1] 0.5044235
  1. Would you recommend any of your models to replace the permeability laboratory experiment?

Since the permeability dataset is used for pharmaceutical drug development and all models had an R-squared of approximately 0.5 or less, I would not recommend any of the models to replace the permeability laboratory experiment. Since R squared is a metric for how much variance in the response is accounted for by the predictors, an upper level of 50% does not seem like a strong enough correlation to justify replacing the permeability laboratory experiment.

Question 6.3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect.1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

  1. Start R and use these commands to load the data:
data(ChemicalManufacturingProcess)

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

  1. A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect.3.8).

Assuming that the small percentage of missing values in the predictor set is missing at random, KNN imputation was used to fill in these missing values. This was done with the preProcess function, with knnImpute as the method. The transformed data was piped into a new variable and placed into a dataframe.

ChemicalManufacturingProcess_preProc <- 
  preProcess(ChemicalManufacturingProcess, 
             method = "knnImpute")
transformed_ChemMan <- 
  predict(ChemicalManufacturingProcess_preProc, 
          newdata = ChemicalManufacturingProcess)
df <- as.data.frame(transformed_ChemMan$Yield) %>% 
  rename(Yield = `transformed_ChemMan$Yield`)
  1. Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

The optimal value of the performance metric is 3, as shown by the root mean squared error. This was compared also to the R-squared metric was marginally higher for 4 components. Since RMSE was lower by a larger percentage, 3 components was chosen.

trainingDataindex <- 
  sample(seq_len(nrow(df)), 
         size = smp_size)

trainY <- df[trainingDataindex,]
testY <- df[-trainingDataindex,]
trainX <- 
  transformed_ChemMan[trainingDataindex,] %>% 
  select(-Yield)
testX <- 
  transformed_ChemMan[-trainingDataindex,] %>% 
  select(-Yield)

plsTune_RMSE <- train(trainX,trainY, 
                      method = "pls", 
                      tuneLength = 20, 
                      trControl = ctrl, 
                      preProcess = c("center","scale"))

plsTune_R2 <- train(trainX,trainY, 
                    method = "pls", 
                    tuneLength = 20, 
                    trControl = ctrl, 
                    preProcess = c("center","scale"), 
                    metric = "Rsquared")

plot(plsTune_RMSE)

plot(plsTune_R2)

plsTune_RMSE
## Partial Least Squares 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 119, 119, 119, 118, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##    1     0.7957411  0.4568994  0.6329938
##    2     1.0607101  0.5170700  0.6737357
##    3     0.6766499  0.6354155  0.5499443
##    4     0.7330987  0.6134668  0.5696902
##    5     0.9459309  0.5441647  0.6386960
##    6     1.0457239  0.5341147  0.6725087
##    7     1.1376994  0.5441714  0.7010528
##    8     1.1809492  0.5514285  0.7158071
##    9     1.1476238  0.5753177  0.7034479
##   10     1.2268688  0.5762581  0.7271974
##   11     1.3248746  0.5569062  0.7657038
##   12     1.4368352  0.5340744  0.8072475
##   13     1.4677461  0.5202478  0.8218249
##   14     1.5441504  0.5060173  0.8493337
##   15     1.6107929  0.5025954  0.8701117
##   16     1.6423380  0.4985809  0.8830953
##   17     1.7012126  0.4975336  0.9013705
##   18     1.7483179  0.4947957  0.9150027
##   19     1.8175242  0.4919380  0.9328794
##   20     1.9208661  0.4859960  0.9620018
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plsTune <- train(trainX,trainY, 
                 method = "pls", 
                 tuneLength = 3, 
                 trControl = ctrl, 
                 preProcess = c("center","scale"))
  1. Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

The R-squared for the test set was approximately 0.572, while the R-squared for the training set was approximately 0.703. This might suggest that the model is overfitted to the training data, as it performs significantly worse with the test set.

pls_predict <- predict(plsTune, testX)
pls_predict
##  [1]  1.29806184  0.55636066  0.80279836  0.28818217 -0.18855507  1.08427054
##  [7]  0.86931808 -2.03273060  1.08531817  1.26129301  1.27493440  1.27144629
## [13]  1.68652739  1.27753021  1.22464153  0.78358794  0.76177475  0.43232898
## [19] -0.67437431  0.42106356  0.18908059  1.13797085  0.29383488 -0.22494436
## [25] -0.16509792 -0.50397460 -1.11348963 -0.60453328 -0.07350641 -0.37819609
## [31]  0.21881107 -1.19358586 -0.64790765 -0.18592081  0.11092909 -0.02771271
## [37] -0.26700358 -0.40840581 -0.82490558 -0.70773987 -0.37292066 -1.48941619
## [43] -1.68618406 -0.20650018
R_pls <- cor(pls_predict, testY)
R2_pls <- R_pls^2
R2_pls
## [1] 0.4749322
pls_predict_train <- predict(plsTune, trainX)
pls_predict_train
##   [1]  0.86131310  0.25343129  0.27375521  1.65940147  1.30835075  0.56904209
##   [7]  2.31181306 -0.35946562  0.08574419  1.23008163 -0.74508749 -1.23271543
##  [13]  0.18871160  0.36328648  0.15468841 -0.80416809  2.31691312  1.08970847
##  [19] -1.25619740  0.99108071 -0.21921038  1.64510493 -0.97023431 -0.46179792
##  [25] -0.18087492 -0.98784066 -1.06698125  1.21701205  1.44414651  1.08292000
##  [31] -0.93853727  0.29031217 -0.08838669  0.31176577 -0.43605302  0.23723719
##  [37] -0.46097806 -1.03977261 -0.90104285  1.64031008  0.93891238 -0.20070777
##  [43] -0.11974701  1.38485429  0.06632316  0.59506895 -0.20564090 -0.71015975
##  [49] -0.20416836 -0.87629135 -0.49675808 -0.56795062 -1.06893600  1.20877729
##  [55]  1.82816946 -0.93573300  0.22942453 -0.34982307  0.18186760 -0.42196600
##  [61] -0.73292206 -0.37377120 -0.80814721 -0.70546476  1.04473573 -0.31284927
##  [67]  0.02704369  0.14043138  0.01997861 -0.36878589 -0.38368525  1.42115846
##  [73]  0.66958262 -1.35629528  0.14002318 -1.00290941  0.47150009 -0.75994800
##  [79] -0.64000968 -0.49160445  0.23241167  1.10139833  1.38036029  1.21786572
##  [85] -1.20310842  0.16134000 -0.13512792  0.02181715 -1.17294866 -0.33917872
##  [91] -1.21866728 -0.22725213 -1.75751954 -2.01040046 -1.30554108 -0.50583053
##  [97] -0.10650532 -0.58467841  0.13587493 -0.34235985  0.05783961  0.92205002
## [103] -0.54452255 -0.92363909  0.24475056 -0.27138206 -0.44131432 -0.56001015
## [109]  0.40472318 -0.54536698 -0.79023150  0.37286529 -0.39925454 -0.46946102
## [115] -0.19819236  0.37595831 -0.29754441  2.45005717 -1.33052883 -0.29455247
## [121] -0.52589964  0.22023500 -0.13538004 -0.85294833  0.85851881  0.21194579
## [127] -0.49551964 -0.61960165 -0.69086999 -0.12293395  0.07078805  0.93167723
R_pls_train <- cor(pls_predict_train, trainY)
R2_pls_train <- R_pls_train^2
R2_pls_train
## [1] 0.7138012
  1. Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

It looks like the most important predictors were ManufacturingProcess32, ManufacturingProcess13, ManufacturingProcess36, ManufacturingProcess17, ManufacturingProcess09. It looks like the process predictors dominate the list.

print(varImp(plsTune))
## pls variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess36   79.66
## ManufacturingProcess13   79.51
## ManufacturingProcess17   75.12
## ManufacturingProcess09   72.98
## ManufacturingProcess33   63.73
## ManufacturingProcess06   58.65
## BiologicalMaterial02     58.04
## BiologicalMaterial08     55.31
## BiologicalMaterial06     54.95
## BiologicalMaterial04     52.23
## ManufacturingProcess11   51.73
## BiologicalMaterial12     50.49
## BiologicalMaterial11     49.42
## BiologicalMaterial01     49.00
## ManufacturingProcess12   48.29
## BiologicalMaterial03     47.84
## ManufacturingProcess28   41.25
## BiologicalMaterial10     38.00
## ManufacturingProcess04   37.96
  1. Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

It seems like the strongest relationship between the yield and predictor is found with ManufacturingProcess32. Since the predictor is positively correlated to yield, this information could be helpful in improving yield by determining how to influence/coerce higher values of ManufacturingProcess32.

data <- cbind(trainX, Yield = trainY)

ManufacturingProcess32lm <- 
  lm(Yield ~ ManufacturingProcess32, data = data)

summary(ManufacturingProcess32lm)
## 
## Call:
## lm(formula = Yield ~ ManufacturingProcess32, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.13937 -0.55251 -0.06372  0.40474  2.26401 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.009744   0.068004   0.143    0.886    
## ManufacturingProcess32 0.673967   0.068123   9.893   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7797 on 130 degrees of freedom
## Multiple R-squared:  0.4295, Adjusted R-squared:  0.4251 
## F-statistic: 97.88 on 1 and 130 DF,  p-value: < 2.2e-16
ggplot(data, 
       aes(x = ManufacturingProcess32, y = Yield)) +
  geom_point() +
  labs(x = "ManufacturingProcess32",
      y = "Yield") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() 

ManufacturingProcess13lm <- 
  lm(Yield ~ ManufacturingProcess13, data = data)

summary(ManufacturingProcess13lm)
## 
## Call:
## lm(formula = Yield ~ ManufacturingProcess13, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.67353 -0.68280 -0.01469  0.61956  2.19859 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.02250    0.07950  -0.283    0.778    
## ManufacturingProcess13 -0.49050    0.08161  -6.010 1.75e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9132 on 130 degrees of freedom
## Multiple R-squared:  0.2174, Adjusted R-squared:  0.2114 
## F-statistic: 36.12 on 1 and 130 DF,  p-value: 1.75e-08
ggplot(data, 
       aes(x = ManufacturingProcess13, y = Yield)) +
  geom_point() +
  labs(x = "ManufacturingProcess13",
      y = "Yield") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() 

ManufacturingProcess36lm <- 
  lm(Yield ~ ManufacturingProcess36, data = data)

summary(ManufacturingProcess36lm)
## 
## Call:
## lm(formula = Yield ~ ManufacturingProcess36, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.17235 -0.59533 -0.03074  0.57708  2.46159 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.02050    0.07442  -0.275    0.783    
## ManufacturingProcess36 -0.57895    0.07499  -7.720 2.72e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8548 on 130 degrees of freedom
## Multiple R-squared:  0.3144, Adjusted R-squared:  0.3091 
## F-statistic:  59.6 on 1 and 130 DF,  p-value: 2.723e-12
ggplot(data, 
       aes(x = ManufacturingProcess36, y = Yield)) +
  geom_point() +
  labs(x = "ManufacturingProcess36",
      y = "Yield") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() 

ManufacturingProcess17lm <- 
  lm(Yield ~ ManufacturingProcess17, data = data)

summary(ManufacturingProcess17lm)
## 
## Call:
## lm(formula = Yield ~ ManufacturingProcess17, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.70036 -0.71964 -0.09345  0.63767  2.43915 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.03799    0.08284  -0.459    0.647    
## ManufacturingProcess17 -0.41183    0.08593  -4.793 4.42e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9517 on 130 degrees of freedom
## Multiple R-squared:  0.1502, Adjusted R-squared:  0.1436 
## F-statistic: 22.97 on 1 and 130 DF,  p-value: 4.424e-06
ggplot(data, 
       aes(x = ManufacturingProcess17, y = Yield)) +
  geom_point() +
  labs(x = "ManufacturingProcess17",
      y = "Yield") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() 

ManufacturingProcess09lm <- 
  lm(Yield ~ ManufacturingProcess09, data = data)

summary(ManufacturingProcess09lm)
## 
## Call:
## lm(formula = Yield ~ ManufacturingProcess09, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.68689 -0.67439 -0.01261  0.61499  2.82175 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.03719    0.07973  -0.466    0.642    
## ManufacturingProcess09  0.47939    0.08089   5.926 2.62e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.916 on 130 degrees of freedom
## Multiple R-squared:  0.2127, Adjusted R-squared:  0.2066 
## F-statistic: 35.12 on 1 and 130 DF,  p-value: 2.618e-08
ggplot(data, 
       aes(x = ManufacturingProcess09, y = Yield)) +
  geom_point() +
  labs(x = "ManufacturingProcess09",
      y = "Yield") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()