Objective

  1. To develop a function that relates descriptors to toxicity.
  2. To compare relative importance of QuickProp descriptors on toxicity prediction.

Outline

In this experiment, regression models relating QuickProp descriptors to predict toxicity are built from a data set consists of 322 compounds that were experimentally assessed.

Require packages

start <- function(pkg){
  npkg <- pkg[!(pkg %in% installed.packages()[,"Package"])]
  if (length(npkg))
    install.packages(npkg, dependencies = TRUE)
  lapply(pkg, require, character.only=TRUE)
}

pkgs <- c("QSARdata",'caret','ggplot2','pls')
start(pkgs)
## Loading required package: QSARdata
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
## Loading required package: pls
## 
## Attaching package: 'pls'
## 
## The following object is masked from 'package:caret':
## 
##     R2
## 
## The following object is masked from 'package:stats':
## 
##     loadings
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE

Load data

data(AquaticTox)
head(AquaticTox_Outcome)
##                                             Molecule Activity
## 1                                       (d)-limonene     5.29
## 2 111-trichloro-2-methyl-2-propanolol(chlorobytanol)     3.12
## 3                                111-trichloroethane     3.40
## 4                             1122-tetrachloroethane     3.92
## 5                                112-trichloroethane     3.21
## 6                     11-dichloroethylene(vinylidene     2.84
descriptors<-AquaticTox_QuickProp
data<-cbind(descriptors,AquaticTox_Outcome$Activity)
str(data)
## 'data.frame':    322 obs. of  51 variables:
##  $ Molecule                          : Factor w/ 320 levels "(d)-limonene",..: 1 31 32 34 33 30 42 40 41 45 ...
##  $ QikProp_.stars                    : int  7 6 12 11 12 12 7 7 11 8 ...
##  $ QikProp_.amine                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QikProp_.acid                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QikProp_.amide                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QikProp_.rotor                    : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ QikProp_.rtvFG                    : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ QikProp_CNS                       : int  2 2 1 1 1 1 1 1 1 1 ...
##  $ QikProp_mol_MW                    : num  136 177 133 168 133 ...
##  $ QikProp_dipole                    : num  0.436 3.953 2.475 3.058 3.978 ...
##  $ QikProp_SASA                      : num  382 317 271 290 270 ...
##  $ QikProp_FOSA                      : num  316.5 125.8 80.3 46.3 79.9 ...
##  $ QikProp_FISA                      : num  0 35.7 0 0 0 ...
##  $ QikProp_PISA                      : num  65.3 0 0 0 0 ...
##  $ QikProp_WPSA                      : num  0 156 190 244 190 ...
##  $ QikProp_volume                    : num  613 492 387 425 385 ...
##  $ QikProp_donorHB                   : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ QikProp_accptHB                   : num  0 0.75 0 0 0 0 0 0 0 0 ...
##  $ QikProp_dip.2.V                   : num  0.000309 0.03175 0.015848 0.022004 0.041087 ...
##  $ QikProp_ACxDN..5.SA               : num  0 0.00236 0 0 0 ...
##  $ QikProp_glob                      : num  0.914 0.95 0.949 0.943 0.948 ...
##  $ QikProp_QPpolrz                   : num  18.38 12.9 9.31 10.86 9.26 ...
##  $ QikProp_QPlogPC16                 : num  4.5 4.7 3.24 4 3.22 ...
##  $ QikProp_QPlogPoct                 : num  4.93 6.2 2.9 3.78 3.23 ...
##  $ QikProp_QPlogPw                   : num  -0.289 2.763 0.437 0.502 0.435 ...
##  $ QikProp_QPlogPo.w                 : num  3.91 2.12 2.49 2.58 2.49 ...
##  $ QikProp_QPlogS                    : num  -4.01 -1.96 -1.96 -2.07 -1.96 ...
##  $ QikProp_CIQPlogS                  : num  -4.2 -2.39 -1.84 -2.24 -1.82 ...
##  $ QikProp_QPlogHERG                 : num  -3.23 -2.34 -2.37 -2.45 -2.37 ...
##  $ QikProp_QPPCaco                   : num  9906 4542 9906 9906 9906 ...
##  $ QikProp_QPlogBB                   : num  0.826 0.622 0.353 0.285 0.352 0.289 0.192 0.192 0.412 0.192 ...
##  $ QikProp_QPPMDCK                   : num  5899 10000 10000 10000 10000 ...
##  $ QikProp_QPlogKp                   : num  -1.19 -2.08 -1.52 -1.52 -1.52 ...
##  $ QikProp_IP.eV.                    : num  9.63 10.6 10.83 10.79 10.6 ...
##  $ QikProp_EA.eV.                    : num  -1.181 0.26 0.343 0.191 -0.107 ...
##  $ QikProp_.metab                    : int  5 1 0 0 0 0 0 0 0 0 ...
##  $ QikProp_QPlogKhsa                 : num  0.356 -0.278 -0.26 -0.148 -0.264 -0.436 0.187 0.072 -0.094 0.2 ...
##  $ QikProp_HumanOralAbsorption       : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ QikProp_PercentHumanOralAbsorption: num  100 100 100 100 100 100 100 100 100 100 ...
##  $ QikProp_SAfluorine                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ QikProp_SAamideO                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ QikProp_PSA                       : num  0 20.1 0 0 0 ...
##  $ QikProp_.NandO                    : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ QikProp_RuleOfFive                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QikProp_.ringatoms                : int  6 0 0 0 0 0 6 6 0 6 ...
##  $ QikProp_.in56                     : int  6 0 0 0 0 0 6 6 0 6 ...
##  $ QikProp_.noncon                   : int  4 0 0 0 0 0 0 0 0 0 ...
##  $ QikProp_.nonHatm                  : int  10 8 5 6 5 4 10 9 6 10 ...
##  $ QikProp_RuleOfThree               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QikProp_ACxDN..5.SAxSASA.MW       : num  0 0.00423 0 0 0 ...
##  $ AquaticTox_Outcome$Activity       : num  5.29 3.12 3.4 3.92 3.21 2.84 5.29 4.89 3.41 5.83 ...
colnames(data)[51]<- "Activity"
colSums(is.na(data))/nrow(data)
##                           Molecule                     QikProp_.stars 
##                         0.00000000                         0.00931677 
##                     QikProp_.amine                      QikProp_.acid 
##                         0.00931677                         0.00931677 
##                     QikProp_.amide                     QikProp_.rotor 
##                         0.00931677                         0.00931677 
##                     QikProp_.rtvFG                        QikProp_CNS 
##                         0.00931677                         0.00931677 
##                     QikProp_mol_MW                     QikProp_dipole 
##                         0.00931677                         0.00931677 
##                       QikProp_SASA                       QikProp_FOSA 
##                         0.00931677                         0.00931677 
##                       QikProp_FISA                       QikProp_PISA 
##                         0.00931677                         0.00931677 
##                       QikProp_WPSA                     QikProp_volume 
##                         0.00931677                         0.00931677 
##                    QikProp_donorHB                    QikProp_accptHB 
##                         0.00931677                         0.00931677 
##                    QikProp_dip.2.V                QikProp_ACxDN..5.SA 
##                         0.00931677                         0.00931677 
##                       QikProp_glob                    QikProp_QPpolrz 
##                         0.00931677                         0.00931677 
##                  QikProp_QPlogPC16                  QikProp_QPlogPoct 
##                         0.00931677                         0.00931677 
##                    QikProp_QPlogPw                  QikProp_QPlogPo.w 
##                         0.00931677                         0.00931677 
##                     QikProp_QPlogS                   QikProp_CIQPlogS 
##                         0.00931677                         0.00931677 
##                  QikProp_QPlogHERG                    QikProp_QPPCaco 
##                         0.00931677                         0.00931677 
##                    QikProp_QPlogBB                    QikProp_QPPMDCK 
##                         0.00931677                         0.00931677 
##                    QikProp_QPlogKp                     QikProp_IP.eV. 
##                         0.00931677                         0.00931677 
##                     QikProp_EA.eV.                     QikProp_.metab 
##                         0.00931677                         0.00931677 
##                  QikProp_QPlogKhsa        QikProp_HumanOralAbsorption 
##                         0.00931677                         0.00931677 
## QikProp_PercentHumanOralAbsorption                 QikProp_SAfluorine 
##                         0.00931677                         0.00931677 
##                   QikProp_SAamideO                        QikProp_PSA 
##                         0.00931677                         0.00931677 
##                     QikProp_.NandO                 QikProp_RuleOfFive 
##                         0.00931677                         0.00931677 
##                 QikProp_.ringatoms                      QikProp_.in56 
##                         0.00931677                         0.00931677 
##                    QikProp_.noncon                   QikProp_.nonHatm 
##                         0.00931677                         0.00931677 
##                QikProp_RuleOfThree        QikProp_ACxDN..5.SAxSASA.MW 
##                         0.00931677                         0.00931677 
##                           Activity 
##                         0.00000000

The outcome is the negative log of activity labeled as “Activity”.

Missing values in each column can be ignored.

Cleaning

Variables with correlations larger than 0.29 are omitted.

data<-na.omit(data)

descs <- data[, !apply(data, 2, function(x) any(is.na(x)) )]

descs <- descs[, !apply( descs, 2, function(x) length(unique(x)) == 1 )]

r2 <- which(cor(descs[2:50])^2 > .29, arr.ind=TRUE)

r2 <- r2[ r2[,1] > r2[,2] , ]
d <- descs[, -unique(r2[,2])]

Preprocessing

Box-Cox transformation was performed on each column for better normality.

Dataset was split into training and test sets on a ratio of 8:2.

Tran <- preProcess(d[,-1],method = "BoxCox")
data <-predict(Tran,d[,-1])

ind<-sample(2,nrow(data),replace=TRUE,prob=c(0.8,0.2))
training<-data[ind==1,-1]
test<-data[ind==2,-1]

Linear least squares

Starting from a simple regression model.

lm.fit <- lm(Activity ~ ., data = training)
summary(lm.fit)
## 
## Call:
## lm(formula = Activity ~ ., data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.08201 -0.37379 -0.02004  0.32489  2.92244 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)                         1.244e+00  4.627e-01   2.690  0.00765
## QikProp_accptHB                    -5.331e-01  8.253e-02  -6.459 5.61e-10
## QikProp_EA.eV.                      2.635e-01  4.600e-02   5.728 2.97e-08
## QikProp_PercentHumanOralAbsorption -1.455e-04  9.709e-05  -1.498  0.13537
## QikProp_SAfluorine                 -5.648e-03  2.201e-03  -2.566  0.01087
## QikProp_PSA                         2.371e-02  5.799e-03   4.090 5.86e-05
## QikProp_.NandO                     -4.148e-01  1.268e-01  -3.272  0.00122
## QikProp_.in56                      -1.043e-01  1.795e-02  -5.810 1.94e-08
## QikProp_.noncon                     6.224e-02  4.287e-02   1.452  0.14789
## QikProp_.nonHatm                    1.295e+00  7.529e-02  17.206  < 2e-16
## QikProp_RuleOfThree                -5.015e-01  5.196e-01  -0.965  0.33539
## QikProp_ACxDN..5.SAxSASA.MW        -5.679e+00  4.504e+00  -1.261  0.20855
##                                       
## (Intercept)                        ** 
## QikProp_accptHB                    ***
## QikProp_EA.eV.                     ***
## QikProp_PercentHumanOralAbsorption    
## QikProp_SAfluorine                 *  
## QikProp_PSA                        ***
## QikProp_.NandO                     ** 
## QikProp_.in56                      ***
## QikProp_.noncon                       
## QikProp_.nonHatm                   ***
## QikProp_RuleOfThree                   
## QikProp_ACxDN..5.SAxSASA.MW           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6489 on 245 degrees of freedom
## Multiple R-squared:  0.7493, Adjusted R-squared:  0.7381 
## F-statistic: 66.58 on 11 and 245 DF,  p-value: < 2.2e-16
names(lm.fit)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
coef(lm.fit)
##                        (Intercept)                    QikProp_accptHB 
##                       1.2443895692                      -0.5330794430 
##                     QikProp_EA.eV. QikProp_PercentHumanOralAbsorption 
##                       0.2634919821                      -0.0001454556 
##                 QikProp_SAfluorine                        QikProp_PSA 
##                      -0.0056479779                       0.0237149692 
##                     QikProp_.NandO                      QikProp_.in56 
##                      -0.4147678027                      -0.1042807192 
##                    QikProp_.noncon                   QikProp_.nonHatm 
##                       0.0622350566                       1.2954261710 
##                QikProp_RuleOfThree        QikProp_ACxDN..5.SAxSASA.MW 
##                      -0.5015286851                      -5.6794663964
confint(lm.fit)
##                                            2.5 %        97.5 %
## (Intercept)                          0.333066799  2.155712e+00
## QikProp_accptHB                     -0.695632677 -3.705262e-01
## QikProp_EA.eV.                       0.172885231  3.540987e-01
## QikProp_PercentHumanOralAbsorption  -0.000336687  4.577585e-05
## QikProp_SAfluorine                  -0.009982664 -1.313292e-03
## QikProp_PSA                          0.012293646  3.513629e-02
## QikProp_.NandO                      -0.664468631 -1.650670e-01
## QikProp_.in56                       -0.139635840 -6.892560e-02
## QikProp_.noncon                     -0.022212043  1.466822e-01
## QikProp_.nonHatm                     1.147125521  1.443727e+00
## QikProp_RuleOfThree                 -1.524979954  5.219226e-01
## QikProp_ACxDN..5.SAxSASA.MW        -14.551710450  3.192778e+00

Variables with high correlations are dropped in data cleaning section. So only 12 descriptors are used, among which the largest weight is -5.24, indicating an estimated 5.24 degrees decrease in activity for every unit increase of that variable holding the remaining variables constant.

Residual plots:

par(mfrow = c(2,2))
plot(lm.fit)

plot(predict(lm.fit), residuals(lm.fit))

plot(predict(lm.fit), rstudent(lm.fit))

No systematic patterns or large outlying observations is detected from above residual plots.

Plot residuals versus molecules to zoom in the performance of the model:

e <- resid(lm.fit)
n <- length(e)
x <- 1:n

plot(x, e,
     xlab = "Molecule index", 
     ylab = "Residuals", 
     bg = "lightblue", 
     col = "black", cex = 2, pch = 21,frame = FALSE)
abline(h = 0, lwd = 2)
for (i in 1 : n) 
  lines(c(x[i], x[i]), c(e[i], 0), col = "blue" , lwd = 2)

Examining leverage values:

plot(hatvalues(lm.fit))

which.max(hatvalues(lm.fit))
## 253 
## 196

Molecules with indexes 128 and 179 have much higher leverage values. We further check their dfbetas values:

dfbetas(lm.fit)[c(128,179),2]
##         162         230 
## -0.08238692  0.02695810
summary(dfbetas(lm.fit)[-c(128,179),2])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.277500 -0.022660 -0.000899  0.001515  0.014490  0.590600

With large leverage values and dfbetas, these two molecules are exerted.

Final model:

newlm <- lm(Activity~., data = training[-c(128, 179),])

Visualising the performance of the final model:

p1 <- data.frame(predict(newlm, test, interval = "confidence"))
p2 <- data.frame(predict(newlm, test, interval = "prediction"))
p1$interval = "confidence"
p2$interval = "prediction"
p1$x = 1:nrow(test)
p1$Activity <- test$Activity
p2$x = 1:nrow(test)
p2$Activity <- test$Activity
dat = rbind(p1, p2)
names(dat)[1] = "yhat"

ggplot(dat, aes(x, yhat)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, fill = interval), alpha = 0.2) +
  geom_line() +
  geom_point(aes(x, Activity), size = 4)

Partial Linear Squares

pls.fit <- plsr(Activity ~., data = training, scale = TRUE, validation = "CV")
summary(pls.fit) # The lowest cross-validation error occurs when M=5
## Data:    X dimension: 257 11 
##  Y dimension: 257 1
## Fit method: kernelpls
## Number of components considered: 11
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            1.27   0.9503   0.9625   0.9873   0.9457   0.9014   0.8746
## adjCV         1.27   0.9467   0.9502   0.9628   0.9206   0.8796   0.8550
##        7 comps  8 comps  9 comps  10 comps  11 comps
## CV      0.8844   0.8863   0.8777    0.8829    0.8819
## adjCV   0.8640   0.8657   0.8578    0.8625    0.8616
## 
## TRAINING: % variance explained
##           1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X           30.27    52.47    59.67    64.62    71.45    80.25    84.74
## Activity    49.00    58.11    69.92    74.19    74.81    74.86    74.91
##           8 comps  9 comps  10 comps  11 comps
## X           89.98    96.89     98.85    100.00
## Activity    74.93    74.93     74.93     74.93
validationplot(pls.fit, val.type = "MSEP")

pls.pred <- predict(pls.fit, test, ncomp = 5)
mean((pls.pred-test$Activity)^2)
## [1] 0.6072227
pls.all <- plsr(Activity ~., data = data, scale = TRUE, ncomp = 5)
summary(pls.all)
## Data:    X dimension: 319 12 
##  Y dimension: 319 1
## Fit method: kernelpls
## Number of components considered: 5
## TRAINING: % variance explained
##           1 comps  2 comps  3 comps  4 comps  5 comps
## X           25.57    48.15    59.31    66.07    72.91
## Activity    53.08    62.63    69.05    72.75    73.87