In this experiment, regression models relating QuickProp descriptors to predict toxicity are built from a data set consists of 322 compounds that were experimentally assessed.
start <- function(pkg){
npkg <- pkg[!(pkg %in% installed.packages()[,"Package"])]
if (length(npkg))
install.packages(npkg, dependencies = TRUE)
lapply(pkg, require, character.only=TRUE)
}
pkgs <- c("QSARdata",'caret','ggplot2')
start(pkgs)
## Loading required package: QSARdata
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
data(AquaticTox)
head(AquaticTox_Outcome)
## Molecule Activity
## 1 (d)-limonene 5.29
## 2 111-trichloro-2-methyl-2-propanolol(chlorobytanol) 3.12
## 3 111-trichloroethane 3.40
## 4 1122-tetrachloroethane 3.92
## 5 112-trichloroethane 3.21
## 6 11-dichloroethylene(vinylidene 2.84
descriptors<-AquaticTox_QuickProp
data<-cbind(descriptors,AquaticTox_Outcome$Activity)
str(data)
## 'data.frame': 322 obs. of 51 variables:
## $ Molecule : Factor w/ 320 levels "(d)-limonene",..: 1 31 32 34 33 30 42 40 41 45 ...
## $ QikProp_.stars : int 7 6 12 11 12 12 7 7 11 8 ...
## $ QikProp_.amine : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.acid : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.amide : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.rotor : int 1 1 0 0 0 0 0 0 0 0 ...
## $ QikProp_.rtvFG : int 0 0 0 0 0 0 0 0 1 0 ...
## $ QikProp_CNS : int 2 2 1 1 1 1 1 1 1 1 ...
## $ QikProp_mol_MW : num 136 177 133 168 133 ...
## $ QikProp_dipole : num 0.436 3.953 2.475 3.058 3.978 ...
## $ QikProp_SASA : num 382 317 271 290 270 ...
## $ QikProp_FOSA : num 316.5 125.8 80.3 46.3 79.9 ...
## $ QikProp_FISA : num 0 35.7 0 0 0 ...
## $ QikProp_PISA : num 65.3 0 0 0 0 ...
## $ QikProp_WPSA : num 0 156 190 244 190 ...
## $ QikProp_volume : num 613 492 387 425 385 ...
## $ QikProp_donorHB : num 0 1 0 0 0 0 0 0 0 0 ...
## $ QikProp_accptHB : num 0 0.75 0 0 0 0 0 0 0 0 ...
## $ QikProp_dip.2.V : num 0.000309 0.03175 0.015848 0.022004 0.041087 ...
## $ QikProp_ACxDN..5.SA : num 0 0.00236 0 0 0 ...
## $ QikProp_glob : num 0.914 0.95 0.949 0.943 0.948 ...
## $ QikProp_QPpolrz : num 18.38 12.9 9.31 10.86 9.26 ...
## $ QikProp_QPlogPC16 : num 4.5 4.7 3.24 4 3.22 ...
## $ QikProp_QPlogPoct : num 4.93 6.2 2.9 3.78 3.23 ...
## $ QikProp_QPlogPw : num -0.289 2.763 0.437 0.502 0.435 ...
## $ QikProp_QPlogPo.w : num 3.91 2.12 2.49 2.58 2.49 ...
## $ QikProp_QPlogS : num -4.01 -1.96 -1.96 -2.07 -1.96 ...
## $ QikProp_CIQPlogS : num -4.2 -2.39 -1.84 -2.24 -1.82 ...
## $ QikProp_QPlogHERG : num -3.23 -2.34 -2.37 -2.45 -2.37 ...
## $ QikProp_QPPCaco : num 9906 4542 9906 9906 9906 ...
## $ QikProp_QPlogBB : num 0.826 0.622 0.353 0.285 0.352 0.289 0.192 0.192 0.412 0.192 ...
## $ QikProp_QPPMDCK : num 5899 10000 10000 10000 10000 ...
## $ QikProp_QPlogKp : num -1.19 -2.08 -1.52 -1.52 -1.52 ...
## $ QikProp_IP.eV. : num 9.63 10.6 10.83 10.79 10.6 ...
## $ QikProp_EA.eV. : num -1.181 0.26 0.343 0.191 -0.107 ...
## $ QikProp_.metab : int 5 1 0 0 0 0 0 0 0 0 ...
## $ QikProp_QPlogKhsa : num 0.356 -0.278 -0.26 -0.148 -0.264 -0.436 0.187 0.072 -0.094 0.2 ...
## $ QikProp_HumanOralAbsorption : int 3 3 3 3 3 3 3 3 3 3 ...
## $ QikProp_PercentHumanOralAbsorption: num 100 100 100 100 100 100 100 100 100 100 ...
## $ QikProp_SAfluorine : num 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_SAamideO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_PSA : num 0 20.1 0 0 0 ...
## $ QikProp_.NandO : int 0 1 0 0 0 0 0 0 0 0 ...
## $ QikProp_RuleOfFive : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.ringatoms : int 6 0 0 0 0 0 6 6 0 6 ...
## $ QikProp_.in56 : int 6 0 0 0 0 0 6 6 0 6 ...
## $ QikProp_.noncon : int 4 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.nonHatm : int 10 8 5 6 5 4 10 9 6 10 ...
## $ QikProp_RuleOfThree : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_ACxDN..5.SAxSASA.MW : num 0 0.00423 0 0 0 ...
## $ AquaticTox_Outcome$Activity : num 5.29 3.12 3.4 3.92 3.21 2.84 5.29 4.89 3.41 5.83 ...
colnames(data)[51]<- "Activity"
colSums(is.na(data))/nrow(data)
## Molecule QikProp_.stars
## 0.00000000 0.00931677
## QikProp_.amine QikProp_.acid
## 0.00931677 0.00931677
## QikProp_.amide QikProp_.rotor
## 0.00931677 0.00931677
## QikProp_.rtvFG QikProp_CNS
## 0.00931677 0.00931677
## QikProp_mol_MW QikProp_dipole
## 0.00931677 0.00931677
## QikProp_SASA QikProp_FOSA
## 0.00931677 0.00931677
## QikProp_FISA QikProp_PISA
## 0.00931677 0.00931677
## QikProp_WPSA QikProp_volume
## 0.00931677 0.00931677
## QikProp_donorHB QikProp_accptHB
## 0.00931677 0.00931677
## QikProp_dip.2.V QikProp_ACxDN..5.SA
## 0.00931677 0.00931677
## QikProp_glob QikProp_QPpolrz
## 0.00931677 0.00931677
## QikProp_QPlogPC16 QikProp_QPlogPoct
## 0.00931677 0.00931677
## QikProp_QPlogPw QikProp_QPlogPo.w
## 0.00931677 0.00931677
## QikProp_QPlogS QikProp_CIQPlogS
## 0.00931677 0.00931677
## QikProp_QPlogHERG QikProp_QPPCaco
## 0.00931677 0.00931677
## QikProp_QPlogBB QikProp_QPPMDCK
## 0.00931677 0.00931677
## QikProp_QPlogKp QikProp_IP.eV.
## 0.00931677 0.00931677
## QikProp_EA.eV. QikProp_.metab
## 0.00931677 0.00931677
## QikProp_QPlogKhsa QikProp_HumanOralAbsorption
## 0.00931677 0.00931677
## QikProp_PercentHumanOralAbsorption QikProp_SAfluorine
## 0.00931677 0.00931677
## QikProp_SAamideO QikProp_PSA
## 0.00931677 0.00931677
## QikProp_.NandO QikProp_RuleOfFive
## 0.00931677 0.00931677
## QikProp_.ringatoms QikProp_.in56
## 0.00931677 0.00931677
## QikProp_.noncon QikProp_.nonHatm
## 0.00931677 0.00931677
## QikProp_RuleOfThree QikProp_ACxDN..5.SAxSASA.MW
## 0.00931677 0.00931677
## Activity
## 0.00000000
The outcome is the negative log of activity labeled as “Activity”.
Missing values in each column can be ignored.
Variables with correlations larger than 0.29 are omitted.
data<-na.omit(data)
descs <- data[, !apply(data, 2, function(x) any(is.na(x)) )]
descs <- descs[, !apply( descs, 2, function(x) length(unique(x)) == 1 )]
r2 <- which(cor(descs[2:50])^2 > .29, arr.ind=TRUE)
r2 <- r2[ r2[,1] > r2[,2] , ]
d <- descs[, -unique(r2[,2])]
Box-Cox transformation was performed on each column for better normality.
Dataset was split into training and test sets on a ratio of 8:2.
Tran <- preProcess(d[,-1],method = "BoxCox")
data <-predict(Tran,d[,-1])
set.seed(909)
ind<-sample(2,nrow(data),replace=TRUE,prob=c(0.8,0.2))
training<-data[ind==1,-1]
training$ID <- seq.int(nrow(training))
test<-data[ind==2,-1]
Starting from a simple regression model.
lm.fit <- lm(Activity ~ ., data = training[,-13])
summary(lm.fit)
##
## Call:
## lm(formula = Activity ~ ., data = training[, -13])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94521 -0.34591 -0.03278 0.30608 2.89760
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.765e-01 4.949e-01 1.771 0.07782
## QikProp_accptHB -4.594e-01 9.078e-02 -5.061 8.34e-07
## QikProp_EA.eV. 3.052e-01 5.102e-02 5.981 8.01e-09
## QikProp_PercentHumanOralAbsorption -7.354e-05 1.064e-04 -0.691 0.49001
## QikProp_SAfluorine -6.059e-03 2.356e-03 -2.572 0.01071
## QikProp_PSA 1.681e-02 6.826e-03 2.463 0.01450
## QikProp_.NandO -3.641e-01 1.399e-01 -2.603 0.00982
## QikProp_.in56 -1.070e-01 1.890e-02 -5.660 4.32e-08
## QikProp_.noncon 5.547e-02 5.505e-02 1.008 0.31459
## QikProp_.nonHatm 1.310e+00 8.264e-02 15.850 < 2e-16
## QikProp_RuleOfThree -8.072e-01 7.471e-01 -1.080 0.28103
## QikProp_ACxDN..5.SAxSASA.MW 2.971e+00 4.959e+00 0.599 0.54957
##
## (Intercept) .
## QikProp_accptHB ***
## QikProp_EA.eV. ***
## QikProp_PercentHumanOralAbsorption
## QikProp_SAfluorine *
## QikProp_PSA *
## QikProp_.NandO **
## QikProp_.in56 ***
## QikProp_.noncon
## QikProp_.nonHatm ***
## QikProp_RuleOfThree
## QikProp_ACxDN..5.SAxSASA.MW
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6967 on 239 degrees of freedom
## Multiple R-squared: 0.7283, Adjusted R-squared: 0.7158
## F-statistic: 58.24 on 11 and 239 DF, p-value: < 2.2e-16
names(lm.fit)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit)
## (Intercept) QikProp_accptHB
## 0.8765159293 -0.4593854929
## QikProp_EA.eV. QikProp_PercentHumanOralAbsorption
## 0.3051614092 -0.0000735434
## QikProp_SAfluorine QikProp_PSA
## -0.0060594938 0.0168111569
## QikProp_.NandO QikProp_.in56
## -0.3641379505 -0.1069772166
## QikProp_.noncon QikProp_.nonHatm
## 0.0554739717 1.3098810193
## QikProp_RuleOfThree QikProp_ACxDN..5.SAxSASA.MW
## -0.8072460767 2.9713774773
confint(lm.fit)
## 2.5 % 97.5 %
## (Intercept) -0.0984261421 1.8514580007
## QikProp_accptHB -0.6382127931 -0.2805581928
## QikProp_EA.eV. 0.2046594114 0.4056634069
## QikProp_PercentHumanOralAbsorption -0.0002830943 0.0001360075
## QikProp_SAfluorine -0.0107000084 -0.0014189792
## QikProp_PSA 0.0033637973 0.0302585166
## QikProp_.NandO -0.6397189039 -0.0885569972
## QikProp_.in56 -0.1442102849 -0.0697441483
## QikProp_.noncon -0.0529658474 0.1639137909
## QikProp_.nonHatm 1.1470806464 1.4726813922
## QikProp_RuleOfThree -2.2790703791 0.6645782257
## QikProp_ACxDN..5.SAxSASA.MW -6.7965830552 12.7393380098
Variables with high correlations are dropped in data cleaning section. So only 12 descriptors are used, among which the largest weight is -5.24, indicating an estimated 5.24 degrees decrease in activity for every unit increase of that variable holding the remaining variables constant.
The original sum of squares of the model with just an intercept is:
mu1 <- mean(training$Activity)
initss<- sum((training$Activity-mu1)^2)
initss
## [1] 426.9428
F-statistic: 58.24 on 11 and 239 DF, p-value: < 2.2e-16 tells that model is useful and statistically significant.
formula <- cat("Activity~",names(training[,-c(3,4,12,13)]), sep="+")
## Activity~+QikProp_accptHB+QikProp_EA.eV.+QikProp_PSA+QikProp_.NandO+QikProp_.in56+QikProp_.noncon+QikProp_.nonHatm+QikProp_RuleOfThree+QikProp_ACxDN..5.SAxSASA.MW
reduced <- lm(Activity ~QikProp_PercentHumanOralAbsorption+QikProp_SAfluorine, data =training[,-13])
anova(reduced, lm.fit)
## Analysis of Variance Table
##
## Model 1: Activity ~ QikProp_PercentHumanOralAbsorption + QikProp_SAfluorine
## Model 2: Activity ~ QikProp_accptHB + QikProp_EA.eV. + QikProp_PercentHumanOralAbsorption +
## QikProp_SAfluorine + QikProp_PSA + QikProp_.NandO + QikProp_.in56 +
## QikProp_.noncon + QikProp_.nonHatm + QikProp_RuleOfThree +
## QikProp_ACxDN..5.SAxSASA.MW
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 248 343.21
## 2 239 116.00 9 227.21 52.013 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The first column “DF” shows the “degrees of freedom” with each row. Here, DF value is 9.
The column “F value” indicates the mean of squares for the inclusion of the terms of interest (the sum of squares divided by the degrees of freedom) divided by the mean squared residuals (from the bottom row).
Null Hypothesis The subset of coefficients are equal to 0.
The results of the partial F-test F=52.013(p-value = 2.2e-16) indicates stronger evidence against the null hypothesis and thus we accept the alternative hypothesis. It appears that variables other than “QikProp_PercentHumanOralAbsorption” and “QikProp _SAfluorine" contribute significant information to Activity in addition to “QikProp _PercentHumanOralAbsorption" and “QikProp _SAfluorine“.
par(mfrow = c(2,2))
plot(lm.fit)
## Warning: not plotting observations with leverage one:
## 136
## Warning: not plotting observations with leverage one:
## 136
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))
No systematic patterns or large outlying observations is detected from above residual plots.
Plot residuals versus molecules to zoom in the performance of the model:
e <- resid(lm.fit)
n <- length(e)
x <- 1:n
plot(x, e,
xlab = "Molecule index",
ylab = "Residuals",
bg = "lightblue",
col = "black", cex = 2, pch = 21,frame = FALSE)
abline(h = 0, lwd = 2)
for (i in 1 : n)
lines(c(x[i], x[i]), c(e[i], 0), col = "blue" , lwd = 2)
Examining leverage values:
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
## 170
## 136
Molecules with indexes 170, 136 have much higher leverage values and produce larger residuals. We further check their dfbetas values:
dfb <- data.frame(dfbetas(lm.fit))
summary(dfbetas(lm.fit)[-c(136,170),1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3060000 -0.0148100 0.0007689 0.0001814 0.0177600 0.3420000
With large leverage values and dfbetas, these two molecules are exerted.
Final model:
newlm <- lm(Activity~., data = training[-c(170, 136),-13])
Visualising the performance of the final model:
p1 <- data.frame(predict(newlm, test, interval = "confidence"))
## Warning in predict.lm(newlm, test, interval = "confidence"): prediction
## from a rank-deficient fit may be misleading
p2 <- data.frame(predict(newlm, test, interval = "prediction"))
## Warning in predict.lm(newlm, test, interval = "prediction"): prediction
## from a rank-deficient fit may be misleading
p1$interval = "confidence"
p2$interval = "prediction"
p1$x = 1:nrow(test)
p1$Activity <- test$Activity
p2$x = 1:nrow(test)
p2$Activity <- test$Activity
dat = rbind(p1, p2)
names(dat)[1] = "yhat"
ggplot(dat, aes(x, yhat)) +
geom_ribbon(aes(ymin = lwr, ymax = upr, fill = interval), alpha = 0.2) +
geom_line() +
geom_point(aes(x, Activity), size = 4)
The 95% prediction interval is much wider than the confidence interval, indicating that the variation about the mean is fairly large.