Developing a model to predict permeability could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
#install.packages('AppliedPredictiveModeling')
# A)
library(AppliedPredictiveModeling)
data(permeability)
permeability <- as_tibble(permeability)
class(fingerprints)
## [1] "matrix" "array"
fingerprints <- as_tibble(fingerprints)
head(fingerprints)
## # A tibble: 6 x 1,107
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 0 0 0 1 1 1 0 0 0 1 0
## 2 0 0 0 0 0 0 1 1 0 0 0 1 1
## 3 0 0 0 0 0 1 1 1 0 0 0 0 1
## 4 0 0 0 0 0 0 1 1 0 0 0 1 1
## 5 0 0 0 0 0 0 1 1 0 0 0 1 1
## 6 0 0 0 0 0 0 1 1 0 0 0 1 1
## # … with 1,094 more variables: X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>, X22 <dbl>, X23 <dbl>,
## # X24 <dbl>, X25 <dbl>, X26 <dbl>, X27 <dbl>, X28 <dbl>, X29 <dbl>,
## # X30 <dbl>, X31 <dbl>, X32 <dbl>, X33 <dbl>, X34 <dbl>, X35 <dbl>,
## # X36 <dbl>, X37 <dbl>, X38 <dbl>, X39 <dbl>, X40 <dbl>, X41 <dbl>,
## # X42 <dbl>, X43 <dbl>, X44 <dbl>, X45 <dbl>, X46 <dbl>, X47 <dbl>,
## # X48 <dbl>, X49 <dbl>, X50 <dbl>, X51 <dbl>, X52 <dbl>, X53 <dbl>,
## # X54 <dbl>, X55 <dbl>, X56 <dbl>, X57 <dbl>, X58 <dbl>, X59 <dbl>,
## # X60 <dbl>, X61 <dbl>, X62 <dbl>, X63 <dbl>, X64 <dbl>, X65 <dbl>,
## # X66 <dbl>, X67 <dbl>, X68 <dbl>, X69 <dbl>, X70 <dbl>, X71 <dbl>,
## # X72 <dbl>, X73 <dbl>, X74 <dbl>, X75 <dbl>, X76 <dbl>, X77 <dbl>,
## # X78 <dbl>, X79 <dbl>, X80 <dbl>, X81 <dbl>, X82 <dbl>, X83 <dbl>,
## # X84 <dbl>, X85 <dbl>, X86 <dbl>, X87 <dbl>, X88 <dbl>, X89 <dbl>,
## # X90 <dbl>, X91 <dbl>, X92 <dbl>, X93 <dbl>, X94 <dbl>, X95 <dbl>,
## # X96 <dbl>, X97 <dbl>, X98 <dbl>, X99 <dbl>, X100 <dbl>, X101 <dbl>,
## # X102 <dbl>, X103 <dbl>, X104 <dbl>, X105 <dbl>, X106 <dbl>, X107 <dbl>,
## # X108 <dbl>, X109 <dbl>, X110 <dbl>, X111 <dbl>, X112 <dbl>, X113 <dbl>, …
head(permeability)
## # A tibble: 6 x 1
## permeability
## <dbl>
## 1 12.5
## 2 1.12
## 3 19.4
## 4 1.73
## 5 1.68
## 6 0.51
# B)
# the nearZeroVar function identifies the predictors with almost zero variance so we can remove them from the model
fingerprints_w_near_zero <- caret::nearZeroVar(fingerprints)
fingerprints <- fingerprints[,-fingerprints_w_near_zero] # 388 variables
# 388 predictors are left for modeling
# C)
fingerprints <- tibble::rowid_to_column(fingerprints, "index")
permeability <- tibble::rowid_to_column(permeability, "index")
fingerprints_and_permeability <- left_join(fingerprints, permeability, by = "index") %>%
dplyr::select(-index)
train.rows <- sample(nrow(fingerprints_and_permeability), nrow(fingerprints_and_permeability) * .7)
fingerprints_train <- fingerprints_and_permeability[train.rows,]
fingerprints_test <- fingerprints_and_permeability[-train.rows,]
set.seed(1024)
permeability <- permeability %>%
dplyr::select(-index)
partial_least_squares_model <- train(permeability ~ ., data = fingerprints_train, method = "pls", tuneLength = 10,
trControl = trainControl(method = "repeatedcv", repeats = 8), preProcess = c("center"))
ggplot(partial_least_squares_model) + xlab("# of Variables") +
labs(title = "Permeability", subtitle = "Training Data") +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
theme(plot.subtitle = element_text(hjust = 0.5))
# Our training set gets the lowest RMSE with 6 variables
# D)
prediction <- predict(partial_least_squares_model, fingerprints_test)
# Model performance metrics
postResample(pred = prediction, obs = fingerprints_test$permeability)
## RMSE Rsquared MAE
## 11.3068627 0.3920157 7.9204677
# Our test set estimate for r-squared is 0.3888396
## RMSE Rsquared MAE
## 1.97627936 0.07697501 1.56062576