DATA 624: Homework 07
Libraries
Exercise 6.2
Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:
(a) Start R and use these commands to load the data:
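The loading commands referred to here (not echoed in the output) are the standard ones from the text:
library(AppliedPredictiveModeling)
data(permeability)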
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
(b) The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
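The dimension check below presumably comes from a call like:
dim(fingerprints)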
## [1] 165 1107
We can see there are 1,107 predictors in the fingerprints matrix. Let’s filter out predictors that have low frequencies using the nearZeroVar function.
fingerprints_nearZeroVar <- fingerprints[, -nearZeroVar(fingerprints)]
dim(fingerprints_nearZeroVar)
## [1] 165 388
There are 388 predictors left for modeling.
(c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of \(R^2\)?
I am only considering the predictors that remained after filtering with nearZeroVar().
tooHighCor <- findCorrelation(cor(fingerprints_nearZeroVar), 0.90)
fingerprints_filtered <- fingerprints_nearZeroVar[, -tooHighCor]
dim(fingerprints_filtered)
## [1] 165 110
In these data, a significant number of predictors had pair-wise absolute correlations larger than 0.90. Because of this, a high-correlation filter was used on the predictor set to remove these highly redundant predictors from the data. In the end, 278 predictors were eliminated for this reason. I will do an 80/20 training/test split.
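Before splitting, a quick missing-value check; the output below presumably comes from something like:
table(is.na(fingerprints_filtered))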
##
## FALSE
## 18150
There are no NA values in the dataset.
# Set seed
set.seed(43)
Split <- createDataPartition(permeability, p=0.8, list=FALSE)
x_train <- fingerprints_filtered[Split, ]
y_train <- permeability[Split, ]
x_test <- fingerprints_filtered[-Split, ]
y_test <- permeability[-Split, ]
# Pre-processing was done using preProc = c("center", "scale")
pls_model <- train(x=x_train, y=y_train, method = "pls",
preProc = c("center", "scale"),
trControl = trainControl("cv", number = 10),
tuneLength = 25
)
# Plot model RMSE against different values of components
title <- paste("Training Set RMSE Minimized at",
pls_model$bestTune$ncomp,
"Components")
plot(pls_model, main = title)
## ncomp RMSE Rsquared
## 1 6 10.56881 0.5866853
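The resampled estimates shown above can be pulled from the fitted model object; a sketch (assuming dplyr is loaded, as in the summary below):
pls_model$results %>%
    filter(ncomp == pls_model$bestTune$ncomp) %>%
    select(ncomp, RMSE, Rsquared)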
Six latent variables are optimal, capturing about 59% of the variation in permeability (resampled \(R^2\) of roughly 0.59).
(d) Predict the response for the test set. What is the test set estimate of \(R^2\)?
# Make predictions
pls_predictions <- predict(pls_model, newdata = x_test)
# Model performance metrics
results <- data.frame(Model = "PLS",
RMSE = caret::RMSE(pls_predictions, y_test),
Rsquared = caret::R2(pls_predictions, y_test))
results
## Model RMSE Rsquared
## 1 PLS 12.40905 0.4161315
(e) Try building other models discussed in this chapter. Do any have better predictive performance?
Ridge Regression
ridge_fit <- train(x=x_train,
y= y_train,
method='ridge',
metric='Rsquared',
tuneGrid=data.frame(.lambda = seq(0, 1, by=0.1)),
trControl=trainControl(method='cv'),
preProcess=c('center','scale')
)
plot(ridge_fit)
# Make predictions
rr_predictions <- predict(ridge_fit, newdata = x_test)
# Model performance metrics
rr_results <- data.frame(Model = "Ridge Regression",
RMSE = caret::RMSE(rr_predictions, y_test),
Rsquared = caret::R2(rr_predictions, y_test))
rr_results
## Model RMSE Rsquared
## 1 Ridge Regression 13.70153 0.3899949
PLS performed better than ridge regression.
Lasso Regression
lasso_fit <- train(x=x_train,
y= y_train,
method='lasso',
metric='Rsquared',
tuneGrid=data.frame(.fraction = seq(0, 0.5, by=0.05)),
trControl=trainControl(method='cv'),
preProcess=c('center','scale')
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## Warning in train.default(x = x_train, y = y_train, method = "lasso", metric =
## "Rsquared", : missing values found in aggregated results
# Make predictions
lr_predictions <- predict(lasso_fit, newdata = x_test)
# Model performance metrics
lr_results <- data.frame(Model = "Lasso Regression",
RMSE = caret::RMSE(lr_predictions, y_test),
Rsquared = caret::R2(lr_predictions, y_test))
lr_results
## Model RMSE Rsquared
## 1 Lasso Regression 943.8564 0.004716371
PLS is better than lasso regression.
Elastic Net Regression
en_fit <- train(x=x_train,
y= y_train,
method='enet',
metric='Rsquared',
tuneGrid=expand.grid(.fraction = seq(0, 1, by=0.1),
.lambda = seq(0, 1, by=0.1)),
trControl=trainControl(method='cv'),
preProcess=c('center','scale')
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## Warning in train.default(x = x_train, y = y_train, method = "enet", metric =
## "Rsquared", : missing values found in aggregated results
# Make predictions
en_predictions <- predict(en_fit, newdata = x_test)
# Model performance metrics
en_results <- data.frame(Model = "Elastic Net Regression",
RMSE = caret::RMSE(en_predictions, y_test),
Rsquared = caret::R2(en_predictions, y_test))
en_results
## Model RMSE Rsquared
## 1 Elastic Net Regression 16.05876 0.3745884
There is no improvement compared to PLS.
Summary
pls_model$results %>%
filter(ncomp == pls_model$bestTune$ncomp) %>%
mutate("Model" = "PLS") %>%
select(Model, RMSE, Rsquared) %>%
as.data.frame() %>%
bind_rows(rr_results) %>%
bind_rows(lr_results) %>%
bind_rows(en_results) %>%
arrange(desc(Rsquared)) %>%
kable()

| Model | RMSE | Rsquared |
|---|---|---|
| PLS | 10.56881 | 0.5866853 |
| Ridge Regression | 13.70153 | 0.3899949 |
| Elastic Net Regression | 16.05876 | 0.3745884 |
| Lasso Regression | 943.85642 | 0.0047164 |
(f) Would you recommend any of your models to replace the permeability laboratory experiment?
No, I would not recommend replacing the permeability laboratory experiment with any of these models. The summary above shows that PLS has the lowest RMSE and the highest \(R^2\), which makes it the best model here, but even its test-set \(R^2\) of roughly 0.42 is too low to stand in for the laboratory measurement.
Exercise 6.3
A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch:
(a) Start R and use these commands to load the data:
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176 58
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
(b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
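The missing-value count below presumably comes from a check such as:
table(is.na(ChemicalManufacturingProcess))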
##
## FALSE TRUE
## 10102 106
We can see there are 106 missing values (around 1% of the cells) in our dataset. I will impute them using the mice package, which tends to give better imputations than the other techniques mentioned in the book.
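The exact imputation call is not shown in the output; a minimal sketch of how chem_imputed might be created, assuming predictive mean matching and an arbitrary seed:
library(mice)
# Impute missing cells with predictive mean matching (assumed settings)
imp <- mice(ChemicalManufacturingProcess, method = "pmm", printFlag = FALSE, seed = 43)
chem_imputed <- complete(imp)
# Verify that no missing values remain
sum(is.na(chem_imputed))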
## Warning: Number of logged events: 135
There are no more NAs in our dataset.
(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
tooHighCor <- findCorrelation(cor(chem_imputed), 0.90)
chem_filtered <- chem_imputed[, -tooHighCor]
dim(chem_filtered)
## [1] 176 48
In these data, a significant number of predictors had pair-wise absolute correlations that were larger than 0.90. Because of this, a high-correlation filter was used on the predictor set to remove these highly redundant predictors from the data. In the end, 10 predictors were eliminated from the data for this reason.
Let’s check the histogram of all fields for skewness.
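The histogram grid itself is not reproduced here; one way it could be generated with base graphics (a sketch):
# Draw a histogram of every column in the filtered data to eyeball skewness
par(mfrow = c(7, 7), mar = c(2, 2, 2, 1))
for (col in names(chem_filtered)) {
    hist(chem_filtered[[col]], main = col, xlab = "", ylab = "")
}
par(mfrow = c(1, 1))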
I will pre-process the data using preProc = c("center", "scale") to center and scale the predictors.
# Set seed
set.seed(43)
train <- createDataPartition(chem_filtered$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- chem_filtered[train, ]
test_df <- chem_filtered[-train, ]
pls_model <- train(
Yield ~ ., data = train_df, method = "pls",
preProc = c("center", "scale"),
trControl = trainControl("cv", number = 10),
tuneLength = 25
)
# Plot model RMSE against different values of components
title <- paste("Training Set RMSE Minimized at",
pls_model$bestTune$ncomp,
"Components")
plot(pls_model, main = title)
I will look at the RMSE and the \(R^2\) as the performance metrics. Three latent variables are optimal, capturing about 61% of the variation in the data.
## ncomp RMSE Rsquared
## 1 3 1.18063 0.6122402
(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
# predictions
pls_predictions <- predict(pls_model, test_df)
# Performance metrics
results <- data.frame(RMSE = caret::RMSE(pls_predictions, test_df$Yield),
Rsquared = caret::R2(pls_predictions, test_df$Yield))
results
## RMSE Rsquared
## 1 2.294512 0.2428515
We have an RMSE of about 2.29 and an \(R^2\) of roughly 0.24 on the test set. While the RMSE is not bad, the \(R^2\) dropped from about 61% in resampling to about 24% on the test set, which indicates this is not an optimal model. We should try other models in the future to find one that generalizes better.
(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
I will identify the important variables as those with a varImp score of at least 50.
pls_importance <- varImp(pls_model)$importance %>%
as.data.frame() %>%
rownames_to_column("Variable") %>%
filter(Overall >= 50) %>%
arrange(desc(Overall)) %>%
mutate(importance = row_number())
varImp(pls_model) %>%
plot(., top = max(pls_importance$importance), main = "Important Variables")
pls_importance %>%
mutate(Variable = gsub("[0-9]+", "", Variable)) %>%
group_by(Variable) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 2 x 2
## Variable n
## <chr> <int>
## 1 ManufacturingProcess 8
## 2 BiologicalMaterial 5
There are 13 important variables and the process predictors dominate the list.
(f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
important_vars <- chem_filtered %>% select_at(vars(Yield, pls_importance$Variable))
important_vars %>% cor() %>%
corrplot(method = "color", type = "lower", order = "hclust",
tl.cex = 0.8, tl.col = "black", tl.srt = 45,
addCoef.col = "black", number.cex = 0.7, sig.level = 0.05, diag = FALSE)
We can see from the correlation plot above that there are significant correlations between the important predictors and Yield (the response variable). This information would be valuable for improving yield in future runs of the manufacturing process: the manufacturer can increase the process variables that are positively correlated with yield and decrease those that are negatively correlated with it.