DATA 624 Homework 7

Problem 6.2

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

library(caret)
library(AppliedPredictiveModeling)
library(tidyr)
library(dplyr)
library(e1071)
library(elasticnet)
library(knitr)
library(pls)
library(ggplot2)
library(tidyverse)
library(kableExtra)
library(RANN)
library(corrplot)

Part A

Start R and use these commands to load the data

data(permeability)
head(permeability) %>%
  kable() %>%
    kable_styling()

permeability
12.520
1.120
19.405
1.730
1.680
0.510

Permeability data: This pharmaceutical data set was used to develop a model for predicting compounds’ permeability (i.e. a molecule’s ability to cross a membrane). It holds 165 unique compounds; 1107 molecular fingerprints

Part B

The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

nzvar <- nearZeroVar(fingerprints)
fpfilter <- fingerprints[,-nzvar]
ncol(fpfilter)

## [1] 388

The nearZeroVar function can be used to identify near zero-variance. They have very few unique values relative to the number of samples and the ratio of the frequency of the most common value to the frequency of the second most common value is large.

Part C

Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

fingerprintsdf <- as.data.frame(fpfilter)
df <- as.data.frame(fingerprintsdf) %>% mutate(permeability = permeability)
head(df) %>%
  kable() %>%
    kable_styling()

X6	X12	X86	X87	X88	X96	X97	X98	X101	X102	X103	X108	X111	X141	X143	X150	X152	X153	X154	X156	X157	X158	X159	X162	X163	X167	X168	X169	X170	X171	X172	X173	X174	X175	X176	X177	X178	X179	X180	X181	X182	X183	X184	X185	X186	X187	X188	X189	X190	X191	X192	X193	X194	X195	X196	X197	X198	X199	X200	X201	X202	X203	X204	X205	X206	X207	X208	X209	X210	X211	X212	X213	X214	X215	X221	X223	X224	X225	X226	X227	X228	X229	X230	X231	X232	X233	X234	X235	X236	X237	X238	X239	X240	X241	X242	X244	X245	X246	X247	X248	X249	X250	X251	X253	X254	X255	X256	X257	X258	X260	X261	X262	X263	X264	X265	X266	X267	X268	X269	X270	X271	X272	X274	X276	X278	X279	X280	X281	X284	X285	X286	X290	X291	X293	X294	X295	X296	X297	X298	X299	X300	X301	X302	X303	X304	X305	X306	X307	X308	X309	X310	X311	X312	X313	X314	X315	X316	X317	X318	X319	X320	X321	X322	X323	X324	X325	X326	X327	X328	X329	X330	X331	X332	X333	X334	X335	X336	X337	X338	X339	X340	X341	X342	X343	X344	X345	permeability
1	1	1	1	1	1	1	1	1	1	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	12.520
0	1	1	0	0	1	0	0	1	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	1	1	1	1	0	1	1	1	1	1	1	1	0	1	1	1	0	1	1	1	1	1	1	0	0	1	1	1	1	1	0	1	1	1	1	0	1	1	0	1	1	1	0	0	1	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1.120
1	0	1	0	0	1	0	0	1	0	1	1	1	0	1	1	1	1	1	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	1	1	1	1	0	1	1	1	1	1	1	0	0	1	1	1	1	1	0	1	1	1	1	0	1	1	0	1	1	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	1	1	1	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	19.405
0	1	1	0	0	1	0	0	1	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	1	1	1	1	0	1	1	1	1	1	1	1	1	1	1	1	0	1	1	1	1	1	1	0	0	1	1	1	1	1	0	1	1	1	1	0	1	1	0	1	1	1	0	0	1	0	0	0	1	1	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	1	1	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	0	1	0	0	0	0	0	0	0	0	0	1	1.730
0	1	1	0	0	1	0	0	1	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	1	1	1	1	0	1	1	1	1	1	1	1	0	1	1	1	0	1	1	1	1	1	1	0	0	1	1	1	1	1	0	1	1	1	1	0	1	1	0	1	1	1	0	0	1	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1.680
0	1	1	0	0	1	0	0	1	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	1	1	1	1	0	1	1	1	1	1	1	1	1	1	1	1	0	1	1	1	1	1	1	0	0	1	1	1	1	1	0	1	1	1	1	0	1	1	0	1	1	1	0	0	1	0	0	0	1	1	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	0	1	1	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	0	1	0	0	0	0	0	0	0	0	0	1	0.510

#Set random seed
set.seed(10)

#Create test/ train at 80-20
in_train <- createDataPartition(df$permeability, p = 0.8, times =1, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]


pmodel <- train(permeability ~ ., data = train_df, method = "pls",  center = TRUE,  trControl = trainControl("cv", number = 10),  tuneLength = 10)

## Plot model RMSE vs different values of components
ggplot(pmodel) + 
  xlab("Number of Variables") +
    ggtitle("PLS Model")

pmodel$bestTune$ncomp

## [1] 6

Best tuning parameter ncomp that minimizes the cross-validation error, RMSE is 6

summary(pmodel$finalModel)

## Data:    X dimension: 133 388 
##  Y dimension: 133 1
## Fit method: oscorespls
## Number of components considered: 6
## TRAINING: % variance explained
##           1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## X           27.14    43.27    50.06    52.74    56.36    65.55
## .outcome    31.03    51.39    59.75    69.77    75.26    77.04

pmodel$results %>%
  kable() %>%
    kable_styling()

ncomp	RMSE	Rsquared	MAE	RMSESD	RsquaredSD	MAESD
1	13.09201	0.3906819	10.321637	3.908703	0.2666489	2.876551
2	11.72473	0.5206390	8.445616	3.400597	0.1985467	2.163855
3	11.43216	0.5331825	8.482856	2.581718	0.1581227	1.931290
4	11.46963	0.5377636	8.951449	1.753067	0.1380532	1.541152
5	11.17025	0.5475971	8.604173	1.782655	0.1617694	1.294497
6	10.78514	0.5696499	8.346736	1.787969	0.1587404	1.319356
7	10.81335	0.5702843	8.402793	1.972811	0.1430146	1.230677
8	10.88491	0.5658647	8.377881	1.810761	0.1495770	1.357485
9	10.98097	0.5651603	8.386326	2.151343	0.1519615	1.618178
10	11.01367	0.5787205	8.371964	2.198121	0.1270145	1.692152

6 components captures 65.55% of information contained in the predictors. It captures 77.04% of information in the outcome variable.

Part D

Predict the response for the test set. What is the test set estimate of R2?

# Make predictions
predictions <- pmodel %>% predict(test_df)
results <- data.frame(Model = "PLS Model",
                      RMSE = caret::RMSE(predictions, test_df$permeability),
                      Rsquared = caret::R2(predictions, test_df$permeability))
results %>%
  kable() %>%
  kable_styling()

	Model	RMSE	Rsquared
permeability	PLS Model	13.40002	0.2414261

R2 is 0.24 and RMSE is 13.40

plot(predictions)

Part E

Try building other models discussed in this chapter. Do any have better predictive performance?

fit <-lm(permeability ~ ., train_df)

data_clear  <- fingerprints[, -nearZeroVar(fingerprints)]
data_clear <- cbind(data.frame(permeability),data_clear) #adding permeability
number <-  floor(0.70 * nrow(data_clear)) # 70/30 split
idx <- sample(seq_len(nrow(data_clear)), size = number)
train_df <- data_clear[idx, ]
test_df <- data_clear[-idx, ]

#train the Elastic Net model
elastic_model <-  train(x=train_df[,-1],
                 y=train_df$permeability,
                 method='enet',
                 metric='RMSE', # error mettric
                 tuneGrid=expand.grid(.fraction = seq(0, 1, by=0.2), 
                                      .lambda = seq(0, 1, by=0.2)),
                 trControl=trainControl(method='cv',number=10),
                 preProcess=c('center','scale'))

plot(elastic_model)

# Best params
elastic_model$bestTune

##   fraction lambda
## 8      0.2    0.2

# Perf of best params
getTrainPerf(elastic_model)

##   TrainRMSE TrainRsquared TrainMAE method
## 1  11.20948     0.5283559 7.925961   enet

Elastic net is a popular type of regularized linear regression that combines two popular penalties, specifically the L1 and L2 penalty functions. As we see in the chart there is a decline in R2

Part F

Would you recommend any of your models to replace the permeability laboratory experiment?

I would not use the Elastic net because the lab experiment numbers are better.

Problem 6.3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), 6.5 Computing 139 measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

Part A

Start R and use these commands to load the data:

data(ChemicalManufacturingProcess)
chem <- ChemicalManufacturingProcess

head(chem) %>%
  kable() %>%
    kable_styling()

Yield	BiologicalMaterial01	BiologicalMaterial02	BiologicalMaterial03	BiologicalMaterial04	BiologicalMaterial05	BiologicalMaterial06	BiologicalMaterial07	BiologicalMaterial08	BiologicalMaterial09	BiologicalMaterial10	BiologicalMaterial11	BiologicalMaterial12	ManufacturingProcess01	ManufacturingProcess02	ManufacturingProcess03	ManufacturingProcess04	ManufacturingProcess05	ManufacturingProcess06	ManufacturingProcess07	ManufacturingProcess08	ManufacturingProcess09	ManufacturingProcess10	ManufacturingProcess11	ManufacturingProcess12	ManufacturingProcess13	ManufacturingProcess14	ManufacturingProcess15	ManufacturingProcess16	ManufacturingProcess17	ManufacturingProcess18	ManufacturingProcess19	ManufacturingProcess20	ManufacturingProcess21	ManufacturingProcess22	ManufacturingProcess23	ManufacturingProcess24	ManufacturingProcess25	ManufacturingProcess26	ManufacturingProcess27	ManufacturingProcess28	ManufacturingProcess29	ManufacturingProcess30	ManufacturingProcess31	ManufacturingProcess32	ManufacturingProcess33	ManufacturingProcess34	ManufacturingProcess35	ManufacturingProcess36	ManufacturingProcess37	ManufacturingProcess38	ManufacturingProcess39	ManufacturingProcess40	ManufacturingProcess41	ManufacturingProcess42	ManufacturingProcess43	ManufacturingProcess44	ManufacturingProcess45
38.00	6.25	49.58	56.97	12.74	19.51	43.73	100	16.66	11.44	3.46	138.09	18.83	NA	NA	NA	NA	NA	NA	NA	NA	43.00	NA	NA	NA	35.5	4898	6108	4682	35.5	4865	6049	4665	0.0	NA	NA	NA	4873	6074	4685	10.7	21.0	9.9	69.1	156	66	2.4	486	0.019	0.5	3	7.2	NA	NA	11.6	3.0	1.8	2.4
42.44	8.01	60.97	67.48	14.65	19.36	53.14	100	19.04	12.55	3.46	153.67	21.05	0.0	0	NA	917	1032.2	210.0	177	178	46.57	NA	NA	0	34.0	4869	6095	4617	34.0	4867	6097	4621	0.0	3	0	3	4869	6107	4630	11.2	21.4	9.9	68.7	169	66	2.6	508	0.019	2.0	2	7.2	0.1	0.15	11.1	0.9	1.9	2.2
42.03	8.01	60.97	67.48	14.65	19.36	53.14	100	19.04	12.55	3.46	153.67	21.05	0.0	0	NA	912	1003.6	207.1	178	178	45.07	NA	NA	0	34.8	4878	6087	4617	34.8	4877	6078	4621	0.0	4	1	4	4897	6116	4637	11.1	21.3	9.4	69.3	173	66	2.6	509	0.018	0.7	2	7.2	0.0	0.00	12.0	1.0	1.8	2.3
41.42	8.01	60.97	67.48	14.65	19.36	53.14	100	19.04	12.55	3.46	153.67	21.05	0.0	0	NA	911	1014.6	213.3	177	177	44.92	NA	NA	0	34.8	4897	6102	4635	34.8	4872	6073	4611	0.0	5	2	5	4892	6111	4630	11.1	21.3	9.4	69.3	171	68	2.5	496	0.018	1.2	2	7.2	0.0	0.00	10.6	1.1	1.8	2.1
42.49	7.47	63.33	72.25	14.02	17.91	54.66	100	18.22	12.80	3.05	147.61	21.05	10.7	0	NA	918	1027.5	205.7	178	178	44.96	NA	NA	0	34.6	4992	6233	4733	33.9	4886	6102	4659	-0.7	8	4	18	4930	6151	4684	11.3	21.6	9.0	69.4	171	70	2.5	468	0.017	0.2	2	7.3	0.0	0.00	11.0	1.1	1.7	2.1
43.57	6.12	58.36	65.31	15.17	21.79	51.23	100	18.30	12.13	3.78	151.88	20.76	12.0	0	NA	924	1016.8	208.9	178	178	45.32	NA	NA	0	34.0	4985	6222	4786	33.4	4862	6115	4696	-0.6	9	1	1	4871	6128	4687	11.4	21.7	10.1	68.2	173	70	2.5	490	0.018	0.4	2	7.2	0.0	0.00	11.5	2.2	1.8	2.0

This data set contains information about a chemical manufacturing process, in which the goal is to understand the relationship between the process and the resulting final product yield. Raw material in this process is put through a sequence of 27 steps to generate the final pharmaceutical product. The starting material is generated from a biological unit and has a range of quality and characteristics. The objective in this project was to develop a model to predict percent yield of the manufacturing process. The data set consisted of 177 samples of biological material for which 57 characteristics were measured. Of the 57 characteristics, there were 12 measurements of the biological starting material, and 45 measurements of the manufacturing process. The process variables included measurements such as temperature, drying time, washing time, and concentrations of by–products at various steps. Some of the process measurements can be controlled, while others are observed. Predictors are continuous, count, categorical; some are correlated, and some contain missing values. Samples are not independent because sets of samples come from the same batch of biological starting material.

Source: https://rdrr.io/rforge/AppliedPredictiveModeling/man/ChemicalManufacturingProcess.html

Part B

A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).

# Make this reproducible
set.seed(42)
knn_model <- preProcess(ChemicalManufacturingProcess, "knnImpute")
df_no_missing <- predict(knn_model, ChemicalManufacturingProcess)

Impute missing data using nearest-neighbor method

Part C

Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

Split data at 70:30 ratio.

number <-  floor(0.70 * nrow(df_no_missing)) # 70/30 split
idx <- sample(seq_len(nrow(df_no_missing)), size = number)
training_df <- df_no_missing[idx, ]
testing_df <- df_no_missing[-idx, ]

# build PLS model
pmodel <- train(
  Yield  ~ ., data = training_df, method = "pls",
  center = TRUE,
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
#pls model results
plot(pmodel$results$Rsquared,
     xlab = "ncomp",
     ylab = "Rsquared"
     )

pmodel$results %>%
  filter(ncomp == pmodel$bestTune$ncomp) %>%
  select(ncomp, RMSE, Rsquared) %>%
  kable() %>%
  kable_styling()

ncomp	RMSE	Rsquared
3	0.6430892	0.613201

The ncomp is 3 RMSE is 0.64 and RSquared value is 0.61. The optimal number of model is 3. The model captures 6% of the yield.

Part D

Predict the response for the test set.What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

# Make predictions
pred <- predict(pmodel, testing_df)

# Model Evaluation
results <- data.frame(Model = "PLS Model",
                      RMSE = caret::RMSE(pred, testing_df$Yield),
                      Rsquared = caret::R2(pred, testing_df$Yield))
results %>%
  kable() %>%
  kable_styling()

Model	RMSE	Rsquared
PLS Model	0.6557724	0.6132977

RMSE is root mean squared error. It is based the assumption that data error follow normal distribution. It is 0.66 while RSquared if 0.61

Part E

Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

pls_importance <- varImp(pmodel)$importance %>%
  as.data.frame() %>%
  rownames_to_column("Variable") %>%
  filter(Overall >= 50) %>% # set a threshold for vairables importance
  arrange(desc(Overall)) %>%
  mutate(importance = row_number())
varImp(pmodel) %>%
  plot(., top = max(pls_importance$importance), main = "PLS Model Feature Importance")

ManufacturingProcess32 look important. We can set a threshold and only pass the variables that threshold for example 50%

Part F

Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future rounds of the manufacturing process?

important_vars <- df_no_missing %>%
  select_at(vars(Yield, pls_importance$Variable))

important_vars_p <- cor.mtest(important_vars)$p
important_vars %>%
  cor() %>%
  corrplot(method = "color", type = "lower", order = "hclust",
           tl.cex = 0.8, tl.col = "black", tl.srt = 45,
           addCoef.col = "black", number.cex = 0.7,
           p.mat = important_vars_p,  sig.level = 0.05, diag = FALSE)

The correlation heat map shows that variables are positively correleted with Yield respond. The Manufacuring process 32 is the most correleted variable to respond variable. Some variables are negatively correleted to othe explanatory variable. For example, Manufacuring process 32 is negatively correlated with manufacturing process 13.

DATA 624 Homework 7

Monu Chacko

4/18/2021

Problem 6.2

Part A

Part B

Part C

Part D

Part E

Part F

Problem 6.3

Part A

Part B

Part C

Part D

Part E

Part F