library(zoo)
library(forecast)
library(readxl)
library(tidyverse)
library(imputeTS)
library(openxlsx)
library(fpp3)
library(lubridate)
library(caret)
library(pls)
library(RANN)
library(glmnet)
In Kuhn and Johnson do problems 6.2 and 6.3. There are only two but they consist of many parts. Please submit a link to your Rpubs and submit the .rmd file as well.
Developing a model to predict permeability (see Sect.1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug
library(AppliedPredictiveModeling)
data(permeability)
After filtering, the predictors were decreased from 1107 to 388.
fingerprints_filtered <- fingerprints[,-nearZeroVar(fingerprints)]
ncol(fingerprints_filtered)
## [1] 388
ncol(fingerprints)
## [1] 1107
According to the RMSE, 6 latent variables are optimal. The corresponding resampled estimate of R-squared is approximately 0.429.
smp_size <- floor(0.80 * nrow(permeability))
set.seed(63)
trainingDataindex <- sample(seq_len(nrow(permeability)),
size = smp_size)
trainY <- permeability[trainingDataindex,]
testY <- permeability[-trainingDataindex,]
trainX <- fingerprints_filtered[trainingDataindex,]
testX <- fingerprints_filtered[-trainingDataindex,]
ctrl <- trainControl(method = "cv", number = 10)
plsTune_RMSE <- train(trainX,trainY,
method = "pls",
tuneLength = 20,
trControl = ctrl,
preProcess = c("center","scale"))
plot(plsTune_RMSE)
plsTune_RMSE
## Partial Least Squares
##
## 132 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 117, 118, 120, 119, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.41154 0.3010846 9.278115
## 2 11.66428 0.4336483 8.283584
## 3 11.50006 0.4126173 8.460753
## 4 11.43274 0.3955810 8.662347
## 5 11.25505 0.4033256 8.394702
## 6 11.20513 0.4028330 8.492960
## 7 11.32395 0.3773739 8.522066
## 8 11.65818 0.3606319 8.884521
## 9 11.86653 0.3580183 9.131870
## 10 11.95114 0.3613253 9.112079
## 11 12.28667 0.3527594 9.199224
## 12 12.43209 0.3417924 9.240733
## 13 12.52618 0.3405473 9.146879
## 14 12.63797 0.3423856 9.135995
## 15 12.97477 0.3311300 9.248560
## 16 13.39126 0.3128652 9.576581
## 17 13.77485 0.2967797 9.828394
## 18 14.07538 0.2862263 10.008771
## 19 14.22405 0.2860567 10.135767
## 20 14.46824 0.2798174 10.324863
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.
plsTune <- train(trainX,trainY,
method = "pls",
tuneLength = 6,
trControl = ctrl,
preProcess = c("center","scale"))
The test set estimate of R-squared is approximately 0.504.
pls_predict <- predict(plsTune, testX)
pls_predict
## [1] -1.4473856 9.9575712 2.8106381 8.2235579 9.9880304 8.4958148
## [7] 7.1877076 -0.8662342 16.4210439 -4.4841389 25.3687203 26.3084614
## [13] 31.0026934 11.5372793 14.1859472 12.2208902 11.3135970 24.2265975
## [19] 3.8374952 35.5745105 31.5721502 26.1629810 24.0523809 24.5488152
## [25] 8.0418396 -2.8073497 28.5688430 24.3252861 12.3707114 0.6701642
## [31] 9.9781934 1.9271097 -0.1488661
R_pls <- cor(pls_predict, testY)
R2_pls <- R_pls^2
R2_pls
## [1] 0.5044235
I tested an RLM PCA model, a regular linear regression model, a penalized linear regression model, and two other PLS models using wide kernel PLS and simpls. The PLS models, regardless of method, all had the same results and therefore the same R-squared values. The RLM PCA, LM PCA, and penalized LM PCA model had worse predictive performance, with an R-squared in the 0.43-0.45 range.
rlmPCA <- train(trainX, trainY,
method = "rlm",
preProcess = "pca",
trControl = ctrl)
rlmPCA
## Robust Linear Model
##
## 132 samples
## 388 predictors
##
## Pre-processing: principal component signal extraction (388), centered
## (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 118, 119, 118, 120, 118, ...
## Resampling results across tuning parameters:
##
## intercept psi RMSE Rsquared MAE
## FALSE psi.huber 15.72736 0.3848468 11.792842
## FALSE psi.hampel 15.81550 0.3687765 12.195135
## FALSE psi.bisquare 16.38937 0.3370810 12.115602
## TRUE psi.huber 11.53158 0.3692087 7.746914
## TRUE psi.hampel 12.24233 0.3237643 8.197525
## TRUE psi.bisquare 12.20349 0.3525813 7.698339
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were intercept = TRUE and psi = psi.huber.
rlmPCA_predict <- predict(rlmPCA, testX)
rlmPCA_predict
## 4 10 16 20 26 31 32
## 5.3504665 7.8025038 4.1207200 4.6605423 6.9609123 8.6222278 4.6229334
## 44 54 57 61 64 69 70
## 4.1210598 11.7232769 0.4590693 17.5560333 17.4450841 30.7811297 2.6633294
## 71 73 103 111 114 118 119
## 5.2084823 7.2038731 8.0640462 16.9028794 7.0124822 33.6809822 30.6976745
## 122 123 125 127 137 143 144
## 16.9189746 19.4481503 19.4822334 5.0142965 2.1891482 17.9551952 16.0998784
## 146 147 152 153 162
## 10.1643241 0.6669210 4.5106037 6.3765577 4.5897811
cor(rlmPCA_predict, testY)^2
## [1] 0.4325927
lmPCA <- train(trainX, trainY,
method = "lm",
preProcess = "pca",
trControl = ctrl)
lmPCA
## Linear Regression
##
## 132 samples
## 388 predictors
##
## Pre-processing: principal component signal extraction (388), centered
## (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 120, 118, 118, 118, 119, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 11.59802 0.4057905 8.464152
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
lmPCA_predict <- predict(lmPCA, testX)
lmPCA_predict
## 4 10 16 20 26 31 32
## 5.8641117 11.2797243 3.7312966 6.8545965 8.7622541 8.1302083 5.6235174
## 44 54 57 61 64 69 70
## 2.5425950 17.9098644 -0.5072346 23.7630747 24.4381638 37.5365796 3.7799848
## 71 73 103 111 114 118 119
## 6.2479900 7.3673932 11.1351881 22.8509422 7.1159582 35.6591850 34.8216506
## 122 123 125 127 137 143 144
## 22.5322271 24.8076985 25.0477358 5.0954433 4.2287213 20.8421848 17.2891533
## 146 147 152 153 162
## 13.7362779 2.2214386 5.1881920 5.3367736 3.3627942
cor(lmPCA_predict, testY)^2
## [1] 0.4358732
lasso_PCA <- train(trainX, trainY,
method = "glmnet",
preProcess = "pca",
trControl = ctrl)
lasso_PCA
## glmnet
##
## 132 samples
## 388 predictors
##
## Pre-processing: principal component signal extraction (388), centered
## (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 119, 118, 119, 117, 119, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.10 0.009925983 10.94436 0.3972746 8.297547
## 0.10 0.099259834 10.93834 0.3972680 8.289601
## 0.10 0.992598335 10.83689 0.3969846 8.123197
## 0.55 0.009925983 10.94640 0.3972670 8.297941
## 0.55 0.099259834 10.91826 0.3971908 8.252090
## 0.55 0.992598335 10.97338 0.3973776 8.116889
## 1.00 0.009925983 10.94710 0.3972666 8.298406
## 1.00 0.099259834 10.90362 0.3969845 8.220084
## 1.00 0.992598335 11.46074 0.3821102 8.611531
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.1 and lambda = 0.9925983.
lasso_PCA_predict <- predict(lasso_PCA, testX)
lasso_PCA_predict
## 4 10 16 20 26 31 32
## 5.6752371 11.3128403 4.4079335 7.2921218 9.2833210 8.1421702 6.0192593
## 44 54 57 61 64 69 70
## 3.1639307 17.0011321 0.7143191 22.8398747 23.4664922 34.8976863 4.4996435
## 71 73 103 111 114 118 119
## 6.4964998 7.4765327 10.7029216 22.0934818 7.3246423 33.3764055 32.7563820
## 122 123 125 127 137 143 144
## 21.7381260 23.9584169 24.0962090 5.8624514 4.8783337 20.1771736 17.0846203
## 146 147 152 153 162
## 13.2562092 2.6802196 6.2595230 5.8271646 4.2450245
cor(lasso_PCA_predict, testY)^2
## [1] 0.4428676
wk_pls <- train(trainX, trainY,
method = "widekernelpls",
trControl = ctrl,
preProcess = c("center","scale"))
wk_pls
## Partial Least Squares
##
## 132 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 119, 117, 119, 119, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.25574 0.3235845 9.260325
## 2 11.33268 0.4179886 8.124855
## 3 11.17124 0.4640138 8.386148
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
wk_pls_predict <- predict(wk_pls, testX)
wk_pls_predict
## [1] -1.4473856 9.9575712 2.8106381 8.2235579 9.9880304 8.4958148
## [7] 7.1877076 -0.8662342 16.4210439 -4.4841389 25.3687203 26.3084614
## [13] 31.0026934 11.5372793 14.1859472 12.2208902 11.3135970 24.2265975
## [19] 3.8374952 35.5745105 31.5721502 26.1629810 24.0523809 24.5488152
## [25] 8.0418396 -2.8073497 28.5688430 24.3252861 12.3707114 0.6701642
## [31] 9.9781934 1.9271097 -0.1488661
cor(wk_pls_predict, testY)^2
## [1] 0.5044235
simpls <- train(trainX, trainY,
method = "simpls",
trControl = ctrl,
preProcess = c("center","scale"))
simpls
## Partial Least Squares
##
## 132 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 119, 120, 119, 116, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.16414 0.3078650 9.109961
## 2 11.39583 0.4257089 8.227754
## 3 11.23258 0.4594413 8.467877
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
simpls_predict <- predict(simpls, testX)
simpls_predict
## [1] -1.4473856 9.9575712 2.8106381 8.2235579 9.9880304 8.4958148
## [7] 7.1877076 -0.8662342 16.4210439 -4.4841389 25.3687203 26.3084614
## [13] 31.0026934 11.5372793 14.1859472 12.2208902 11.3135970 24.2265975
## [19] 3.8374952 35.5745105 31.5721502 26.1629810 24.0523809 24.5488152
## [25] 8.0418396 -2.8073497 28.5688430 24.3252861 12.3707114 0.6701642
## [31] 9.9781934 1.9271097 -0.1488661
cor(simpls_predict, testY)^2
## [1] 0.5044235
Since the permeability dataset is used for pharmaceutical drug development and all models had an R-squared of approximately 0.5 or less, I would not recommend any of the models to replace the permeability laboratory experiment. Since R squared is a metric for how much variance in the response is accounted for by the predictors, an upper level of 50% does not seem like a strong enough correlation to justify replacing the permeability laboratory experiment.
A chemical manufacturing process for a pharmaceutical product was discussed in Sect.1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
data(ChemicalManufacturingProcess)
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
Assuming that the small percentage of missing values in the predictor set is missing at random, KNN imputation was used to fill in these missing values. This was done with the preProcess function, with knnImpute as the method. The transformed data was piped into a new variable and placed into a dataframe.
ChemicalManufacturingProcess_preProc <-
preProcess(ChemicalManufacturingProcess,
method = "knnImpute")
transformed_ChemMan <-
predict(ChemicalManufacturingProcess_preProc,
newdata = ChemicalManufacturingProcess)
df <- as.data.frame(transformed_ChemMan$Yield) %>%
rename(Yield = `transformed_ChemMan$Yield`)
The optimal value of the performance metric is 3, as shown by the root mean squared error. This was compared also to the R-squared metric was marginally higher for 4 components. Since RMSE was lower by a larger percentage, 3 components was chosen.
trainingDataindex <-
sample(seq_len(nrow(df)),
size = smp_size)
trainY <- df[trainingDataindex,]
testY <- df[-trainingDataindex,]
trainX <-
transformed_ChemMan[trainingDataindex,] %>%
select(-Yield)
testX <-
transformed_ChemMan[-trainingDataindex,] %>%
select(-Yield)
plsTune_RMSE <- train(trainX,trainY,
method = "pls",
tuneLength = 20,
trControl = ctrl,
preProcess = c("center","scale"))
plsTune_R2 <- train(trainX,trainY,
method = "pls",
tuneLength = 20,
trControl = ctrl,
preProcess = c("center","scale"),
metric = "Rsquared")
plot(plsTune_RMSE)
plot(plsTune_R2)
plsTune_RMSE
## Partial Least Squares
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 119, 119, 119, 118, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.7957411 0.4568994 0.6329938
## 2 1.0607101 0.5170700 0.6737357
## 3 0.6766499 0.6354155 0.5499443
## 4 0.7330987 0.6134668 0.5696902
## 5 0.9459309 0.5441647 0.6386960
## 6 1.0457239 0.5341147 0.6725087
## 7 1.1376994 0.5441714 0.7010528
## 8 1.1809492 0.5514285 0.7158071
## 9 1.1476238 0.5753177 0.7034479
## 10 1.2268688 0.5762581 0.7271974
## 11 1.3248746 0.5569062 0.7657038
## 12 1.4368352 0.5340744 0.8072475
## 13 1.4677461 0.5202478 0.8218249
## 14 1.5441504 0.5060173 0.8493337
## 15 1.6107929 0.5025954 0.8701117
## 16 1.6423380 0.4985809 0.8830953
## 17 1.7012126 0.4975336 0.9013705
## 18 1.7483179 0.4947957 0.9150027
## 19 1.8175242 0.4919380 0.9328794
## 20 1.9208661 0.4859960 0.9620018
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plsTune <- train(trainX,trainY,
method = "pls",
tuneLength = 3,
trControl = ctrl,
preProcess = c("center","scale"))
The R-squared for the test set was approximately 0.572, while the R-squared for the training set was approximately 0.703. This might suggest that the model is overfitted to the training data, as it performs significantly worse with the test set.
pls_predict <- predict(plsTune, testX)
pls_predict
## [1] 1.29806184 0.55636066 0.80279836 0.28818217 -0.18855507 1.08427054
## [7] 0.86931808 -2.03273060 1.08531817 1.26129301 1.27493440 1.27144629
## [13] 1.68652739 1.27753021 1.22464153 0.78358794 0.76177475 0.43232898
## [19] -0.67437431 0.42106356 0.18908059 1.13797085 0.29383488 -0.22494436
## [25] -0.16509792 -0.50397460 -1.11348963 -0.60453328 -0.07350641 -0.37819609
## [31] 0.21881107 -1.19358586 -0.64790765 -0.18592081 0.11092909 -0.02771271
## [37] -0.26700358 -0.40840581 -0.82490558 -0.70773987 -0.37292066 -1.48941619
## [43] -1.68618406 -0.20650018
R_pls <- cor(pls_predict, testY)
R2_pls <- R_pls^2
R2_pls
## [1] 0.4749322
pls_predict_train <- predict(plsTune, trainX)
pls_predict_train
## [1] 0.86131310 0.25343129 0.27375521 1.65940147 1.30835075 0.56904209
## [7] 2.31181306 -0.35946562 0.08574419 1.23008163 -0.74508749 -1.23271543
## [13] 0.18871160 0.36328648 0.15468841 -0.80416809 2.31691312 1.08970847
## [19] -1.25619740 0.99108071 -0.21921038 1.64510493 -0.97023431 -0.46179792
## [25] -0.18087492 -0.98784066 -1.06698125 1.21701205 1.44414651 1.08292000
## [31] -0.93853727 0.29031217 -0.08838669 0.31176577 -0.43605302 0.23723719
## [37] -0.46097806 -1.03977261 -0.90104285 1.64031008 0.93891238 -0.20070777
## [43] -0.11974701 1.38485429 0.06632316 0.59506895 -0.20564090 -0.71015975
## [49] -0.20416836 -0.87629135 -0.49675808 -0.56795062 -1.06893600 1.20877729
## [55] 1.82816946 -0.93573300 0.22942453 -0.34982307 0.18186760 -0.42196600
## [61] -0.73292206 -0.37377120 -0.80814721 -0.70546476 1.04473573 -0.31284927
## [67] 0.02704369 0.14043138 0.01997861 -0.36878589 -0.38368525 1.42115846
## [73] 0.66958262 -1.35629528 0.14002318 -1.00290941 0.47150009 -0.75994800
## [79] -0.64000968 -0.49160445 0.23241167 1.10139833 1.38036029 1.21786572
## [85] -1.20310842 0.16134000 -0.13512792 0.02181715 -1.17294866 -0.33917872
## [91] -1.21866728 -0.22725213 -1.75751954 -2.01040046 -1.30554108 -0.50583053
## [97] -0.10650532 -0.58467841 0.13587493 -0.34235985 0.05783961 0.92205002
## [103] -0.54452255 -0.92363909 0.24475056 -0.27138206 -0.44131432 -0.56001015
## [109] 0.40472318 -0.54536698 -0.79023150 0.37286529 -0.39925454 -0.46946102
## [115] -0.19819236 0.37595831 -0.29754441 2.45005717 -1.33052883 -0.29455247
## [121] -0.52589964 0.22023500 -0.13538004 -0.85294833 0.85851881 0.21194579
## [127] -0.49551964 -0.61960165 -0.69086999 -0.12293395 0.07078805 0.93167723
R_pls_train <- cor(pls_predict_train, trainY)
R2_pls_train <- R_pls_train^2
R2_pls_train
## [1] 0.7138012
It looks like the most important predictors were ManufacturingProcess32, ManufacturingProcess13, ManufacturingProcess36, ManufacturingProcess17, ManufacturingProcess09. It looks like the process predictors dominate the list.
print(varImp(plsTune))
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 79.66
## ManufacturingProcess13 79.51
## ManufacturingProcess17 75.12
## ManufacturingProcess09 72.98
## ManufacturingProcess33 63.73
## ManufacturingProcess06 58.65
## BiologicalMaterial02 58.04
## BiologicalMaterial08 55.31
## BiologicalMaterial06 54.95
## BiologicalMaterial04 52.23
## ManufacturingProcess11 51.73
## BiologicalMaterial12 50.49
## BiologicalMaterial11 49.42
## BiologicalMaterial01 49.00
## ManufacturingProcess12 48.29
## BiologicalMaterial03 47.84
## ManufacturingProcess28 41.25
## BiologicalMaterial10 38.00
## ManufacturingProcess04 37.96
It seems like the strongest relationship between the yield and predictor is found with ManufacturingProcess32. Since the predictor is positively correlated to yield, this information could be helpful in improving yield by determining how to influence/coerce higher values of ManufacturingProcess32.
data <- cbind(trainX, Yield = trainY)
ManufacturingProcess32lm <-
lm(Yield ~ ManufacturingProcess32, data = data)
summary(ManufacturingProcess32lm)
##
## Call:
## lm(formula = Yield ~ ManufacturingProcess32, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.13937 -0.55251 -0.06372 0.40474 2.26401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.009744 0.068004 0.143 0.886
## ManufacturingProcess32 0.673967 0.068123 9.893 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7797 on 130 degrees of freedom
## Multiple R-squared: 0.4295, Adjusted R-squared: 0.4251
## F-statistic: 97.88 on 1 and 130 DF, p-value: < 2.2e-16
ggplot(data,
aes(x = ManufacturingProcess32, y = Yield)) +
geom_point() +
labs(x = "ManufacturingProcess32",
y = "Yield") +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()
ManufacturingProcess13lm <-
lm(Yield ~ ManufacturingProcess13, data = data)
summary(ManufacturingProcess13lm)
##
## Call:
## lm(formula = Yield ~ ManufacturingProcess13, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.67353 -0.68280 -0.01469 0.61956 2.19859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.02250 0.07950 -0.283 0.778
## ManufacturingProcess13 -0.49050 0.08161 -6.010 1.75e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9132 on 130 degrees of freedom
## Multiple R-squared: 0.2174, Adjusted R-squared: 0.2114
## F-statistic: 36.12 on 1 and 130 DF, p-value: 1.75e-08
ggplot(data,
aes(x = ManufacturingProcess13, y = Yield)) +
geom_point() +
labs(x = "ManufacturingProcess13",
y = "Yield") +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()
ManufacturingProcess36lm <-
lm(Yield ~ ManufacturingProcess36, data = data)
summary(ManufacturingProcess36lm)
##
## Call:
## lm(formula = Yield ~ ManufacturingProcess36, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.17235 -0.59533 -0.03074 0.57708 2.46159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.02050 0.07442 -0.275 0.783
## ManufacturingProcess36 -0.57895 0.07499 -7.720 2.72e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8548 on 130 degrees of freedom
## Multiple R-squared: 0.3144, Adjusted R-squared: 0.3091
## F-statistic: 59.6 on 1 and 130 DF, p-value: 2.723e-12
ggplot(data,
aes(x = ManufacturingProcess36, y = Yield)) +
geom_point() +
labs(x = "ManufacturingProcess36",
y = "Yield") +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()
ManufacturingProcess17lm <-
lm(Yield ~ ManufacturingProcess17, data = data)
summary(ManufacturingProcess17lm)
##
## Call:
## lm(formula = Yield ~ ManufacturingProcess17, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.70036 -0.71964 -0.09345 0.63767 2.43915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03799 0.08284 -0.459 0.647
## ManufacturingProcess17 -0.41183 0.08593 -4.793 4.42e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9517 on 130 degrees of freedom
## Multiple R-squared: 0.1502, Adjusted R-squared: 0.1436
## F-statistic: 22.97 on 1 and 130 DF, p-value: 4.424e-06
ggplot(data,
aes(x = ManufacturingProcess17, y = Yield)) +
geom_point() +
labs(x = "ManufacturingProcess17",
y = "Yield") +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()
ManufacturingProcess09lm <-
lm(Yield ~ ManufacturingProcess09, data = data)
summary(ManufacturingProcess09lm)
##
## Call:
## lm(formula = Yield ~ ManufacturingProcess09, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.68689 -0.67439 -0.01261 0.61499 2.82175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03719 0.07973 -0.466 0.642
## ManufacturingProcess09 0.47939 0.08089 5.926 2.62e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.916 on 130 degrees of freedom
## Multiple R-squared: 0.2127, Adjusted R-squared: 0.2066
## F-statistic: 35.12 on 1 and 130 DF, p-value: 2.618e-08
ggplot(data,
aes(x = ManufacturingProcess09, y = Yield)) +
geom_point() +
labs(x = "ManufacturingProcess09",
y = "Yield") +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()