In this experiment, regression models relating QuickProp descriptors to predict toxicity are built from a data set consists of 322 compounds that were experimentally assessed.
start <- function(pkg){
npkg <- pkg[!(pkg %in% installed.packages()[,"Package"])]
if (length(npkg))
install.packages(npkg, dependencies = TRUE)
lapply(pkg, require, character.only=TRUE)
}
pkgs <- c("QSARdata",'caret','ggplot2','pls')
start(pkgs)
## Loading required package: QSARdata
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
## Loading required package: pls
##
## Attaching package: 'pls'
##
## The following object is masked from 'package:caret':
##
## R2
##
## The following object is masked from 'package:stats':
##
## loadings
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
data(AquaticTox)
head(AquaticTox_Outcome)
## Molecule Activity
## 1 (d)-limonene 5.29
## 2 111-trichloro-2-methyl-2-propanolol(chlorobytanol) 3.12
## 3 111-trichloroethane 3.40
## 4 1122-tetrachloroethane 3.92
## 5 112-trichloroethane 3.21
## 6 11-dichloroethylene(vinylidene 2.84
descriptors<-AquaticTox_QuickProp
data<-cbind(descriptors,AquaticTox_Outcome$Activity)
str(data)
## 'data.frame': 322 obs. of 51 variables:
## $ Molecule : Factor w/ 320 levels "(d)-limonene",..: 1 31 32 34 33 30 42 40 41 45 ...
## $ QikProp_.stars : int 7 6 12 11 12 12 7 7 11 8 ...
## $ QikProp_.amine : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.acid : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.amide : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.rotor : int 1 1 0 0 0 0 0 0 0 0 ...
## $ QikProp_.rtvFG : int 0 0 0 0 0 0 0 0 1 0 ...
## $ QikProp_CNS : int 2 2 1 1 1 1 1 1 1 1 ...
## $ QikProp_mol_MW : num 136 177 133 168 133 ...
## $ QikProp_dipole : num 0.436 3.953 2.475 3.058 3.978 ...
## $ QikProp_SASA : num 382 317 271 290 270 ...
## $ QikProp_FOSA : num 316.5 125.8 80.3 46.3 79.9 ...
## $ QikProp_FISA : num 0 35.7 0 0 0 ...
## $ QikProp_PISA : num 65.3 0 0 0 0 ...
## $ QikProp_WPSA : num 0 156 190 244 190 ...
## $ QikProp_volume : num 613 492 387 425 385 ...
## $ QikProp_donorHB : num 0 1 0 0 0 0 0 0 0 0 ...
## $ QikProp_accptHB : num 0 0.75 0 0 0 0 0 0 0 0 ...
## $ QikProp_dip.2.V : num 0.000309 0.03175 0.015848 0.022004 0.041087 ...
## $ QikProp_ACxDN..5.SA : num 0 0.00236 0 0 0 ...
## $ QikProp_glob : num 0.914 0.95 0.949 0.943 0.948 ...
## $ QikProp_QPpolrz : num 18.38 12.9 9.31 10.86 9.26 ...
## $ QikProp_QPlogPC16 : num 4.5 4.7 3.24 4 3.22 ...
## $ QikProp_QPlogPoct : num 4.93 6.2 2.9 3.78 3.23 ...
## $ QikProp_QPlogPw : num -0.289 2.763 0.437 0.502 0.435 ...
## $ QikProp_QPlogPo.w : num 3.91 2.12 2.49 2.58 2.49 ...
## $ QikProp_QPlogS : num -4.01 -1.96 -1.96 -2.07 -1.96 ...
## $ QikProp_CIQPlogS : num -4.2 -2.39 -1.84 -2.24 -1.82 ...
## $ QikProp_QPlogHERG : num -3.23 -2.34 -2.37 -2.45 -2.37 ...
## $ QikProp_QPPCaco : num 9906 4542 9906 9906 9906 ...
## $ QikProp_QPlogBB : num 0.826 0.622 0.353 0.285 0.352 0.289 0.192 0.192 0.412 0.192 ...
## $ QikProp_QPPMDCK : num 5899 10000 10000 10000 10000 ...
## $ QikProp_QPlogKp : num -1.19 -2.08 -1.52 -1.52 -1.52 ...
## $ QikProp_IP.eV. : num 9.63 10.6 10.83 10.79 10.6 ...
## $ QikProp_EA.eV. : num -1.181 0.26 0.343 0.191 -0.107 ...
## $ QikProp_.metab : int 5 1 0 0 0 0 0 0 0 0 ...
## $ QikProp_QPlogKhsa : num 0.356 -0.278 -0.26 -0.148 -0.264 -0.436 0.187 0.072 -0.094 0.2 ...
## $ QikProp_HumanOralAbsorption : int 3 3 3 3 3 3 3 3 3 3 ...
## $ QikProp_PercentHumanOralAbsorption: num 100 100 100 100 100 100 100 100 100 100 ...
## $ QikProp_SAfluorine : num 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_SAamideO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_PSA : num 0 20.1 0 0 0 ...
## $ QikProp_.NandO : int 0 1 0 0 0 0 0 0 0 0 ...
## $ QikProp_RuleOfFive : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.ringatoms : int 6 0 0 0 0 0 6 6 0 6 ...
## $ QikProp_.in56 : int 6 0 0 0 0 0 6 6 0 6 ...
## $ QikProp_.noncon : int 4 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_.nonHatm : int 10 8 5 6 5 4 10 9 6 10 ...
## $ QikProp_RuleOfThree : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QikProp_ACxDN..5.SAxSASA.MW : num 0 0.00423 0 0 0 ...
## $ AquaticTox_Outcome$Activity : num 5.29 3.12 3.4 3.92 3.21 2.84 5.29 4.89 3.41 5.83 ...
colnames(data)[51]<- "Activity"
colSums(is.na(data))/nrow(data)
## Molecule QikProp_.stars
## 0.00000000 0.00931677
## QikProp_.amine QikProp_.acid
## 0.00931677 0.00931677
## QikProp_.amide QikProp_.rotor
## 0.00931677 0.00931677
## QikProp_.rtvFG QikProp_CNS
## 0.00931677 0.00931677
## QikProp_mol_MW QikProp_dipole
## 0.00931677 0.00931677
## QikProp_SASA QikProp_FOSA
## 0.00931677 0.00931677
## QikProp_FISA QikProp_PISA
## 0.00931677 0.00931677
## QikProp_WPSA QikProp_volume
## 0.00931677 0.00931677
## QikProp_donorHB QikProp_accptHB
## 0.00931677 0.00931677
## QikProp_dip.2.V QikProp_ACxDN..5.SA
## 0.00931677 0.00931677
## QikProp_glob QikProp_QPpolrz
## 0.00931677 0.00931677
## QikProp_QPlogPC16 QikProp_QPlogPoct
## 0.00931677 0.00931677
## QikProp_QPlogPw QikProp_QPlogPo.w
## 0.00931677 0.00931677
## QikProp_QPlogS QikProp_CIQPlogS
## 0.00931677 0.00931677
## QikProp_QPlogHERG QikProp_QPPCaco
## 0.00931677 0.00931677
## QikProp_QPlogBB QikProp_QPPMDCK
## 0.00931677 0.00931677
## QikProp_QPlogKp QikProp_IP.eV.
## 0.00931677 0.00931677
## QikProp_EA.eV. QikProp_.metab
## 0.00931677 0.00931677
## QikProp_QPlogKhsa QikProp_HumanOralAbsorption
## 0.00931677 0.00931677
## QikProp_PercentHumanOralAbsorption QikProp_SAfluorine
## 0.00931677 0.00931677
## QikProp_SAamideO QikProp_PSA
## 0.00931677 0.00931677
## QikProp_.NandO QikProp_RuleOfFive
## 0.00931677 0.00931677
## QikProp_.ringatoms QikProp_.in56
## 0.00931677 0.00931677
## QikProp_.noncon QikProp_.nonHatm
## 0.00931677 0.00931677
## QikProp_RuleOfThree QikProp_ACxDN..5.SAxSASA.MW
## 0.00931677 0.00931677
## Activity
## 0.00000000
The outcome is the negative log of activity labeled as “Activity”.
Missing values in each column can be ignored.
Variables with correlations larger than 0.29 are omitted.
data<-na.omit(data)
descs <- data[, !apply(data, 2, function(x) any(is.na(x)) )]
descs <- descs[, !apply( descs, 2, function(x) length(unique(x)) == 1 )]
r2 <- which(cor(descs[2:50])^2 > .29, arr.ind=TRUE)
r2 <- r2[ r2[,1] > r2[,2] , ]
d <- descs[, -unique(r2[,2])]
Box-Cox transformation was performed on each column for better normality.
Dataset was split into training and test sets on a ratio of 8:2.
Tran <- preProcess(d[,-1],method = "BoxCox")
data <-predict(Tran,d[,-1])
ind<-sample(2,nrow(data),replace=TRUE,prob=c(0.8,0.2))
training<-data[ind==1,-1]
test<-data[ind==2,-1]
Starting from a simple regression model.
lm.fit <- lm(Activity ~ ., data = training)
summary(lm.fit)
##
## Call:
## lm(formula = Activity ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.08201 -0.37379 -0.02004 0.32489 2.92244
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.244e+00 4.627e-01 2.690 0.00765
## QikProp_accptHB -5.331e-01 8.253e-02 -6.459 5.61e-10
## QikProp_EA.eV. 2.635e-01 4.600e-02 5.728 2.97e-08
## QikProp_PercentHumanOralAbsorption -1.455e-04 9.709e-05 -1.498 0.13537
## QikProp_SAfluorine -5.648e-03 2.201e-03 -2.566 0.01087
## QikProp_PSA 2.371e-02 5.799e-03 4.090 5.86e-05
## QikProp_.NandO -4.148e-01 1.268e-01 -3.272 0.00122
## QikProp_.in56 -1.043e-01 1.795e-02 -5.810 1.94e-08
## QikProp_.noncon 6.224e-02 4.287e-02 1.452 0.14789
## QikProp_.nonHatm 1.295e+00 7.529e-02 17.206 < 2e-16
## QikProp_RuleOfThree -5.015e-01 5.196e-01 -0.965 0.33539
## QikProp_ACxDN..5.SAxSASA.MW -5.679e+00 4.504e+00 -1.261 0.20855
##
## (Intercept) **
## QikProp_accptHB ***
## QikProp_EA.eV. ***
## QikProp_PercentHumanOralAbsorption
## QikProp_SAfluorine *
## QikProp_PSA ***
## QikProp_.NandO **
## QikProp_.in56 ***
## QikProp_.noncon
## QikProp_.nonHatm ***
## QikProp_RuleOfThree
## QikProp_ACxDN..5.SAxSASA.MW
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6489 on 245 degrees of freedom
## Multiple R-squared: 0.7493, Adjusted R-squared: 0.7381
## F-statistic: 66.58 on 11 and 245 DF, p-value: < 2.2e-16
names(lm.fit)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit)
## (Intercept) QikProp_accptHB
## 1.2443895692 -0.5330794430
## QikProp_EA.eV. QikProp_PercentHumanOralAbsorption
## 0.2634919821 -0.0001454556
## QikProp_SAfluorine QikProp_PSA
## -0.0056479779 0.0237149692
## QikProp_.NandO QikProp_.in56
## -0.4147678027 -0.1042807192
## QikProp_.noncon QikProp_.nonHatm
## 0.0622350566 1.2954261710
## QikProp_RuleOfThree QikProp_ACxDN..5.SAxSASA.MW
## -0.5015286851 -5.6794663964
confint(lm.fit)
## 2.5 % 97.5 %
## (Intercept) 0.333066799 2.155712e+00
## QikProp_accptHB -0.695632677 -3.705262e-01
## QikProp_EA.eV. 0.172885231 3.540987e-01
## QikProp_PercentHumanOralAbsorption -0.000336687 4.577585e-05
## QikProp_SAfluorine -0.009982664 -1.313292e-03
## QikProp_PSA 0.012293646 3.513629e-02
## QikProp_.NandO -0.664468631 -1.650670e-01
## QikProp_.in56 -0.139635840 -6.892560e-02
## QikProp_.noncon -0.022212043 1.466822e-01
## QikProp_.nonHatm 1.147125521 1.443727e+00
## QikProp_RuleOfThree -1.524979954 5.219226e-01
## QikProp_ACxDN..5.SAxSASA.MW -14.551710450 3.192778e+00
Variables with high correlations are dropped in data cleaning section. So only 12 descriptors are used, among which the largest weight is -5.24, indicating an estimated 5.24 degrees decrease in activity for every unit increase of that variable holding the remaining variables constant.
Residual plots:
par(mfrow = c(2,2))
plot(lm.fit)
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))
No systematic patterns or large outlying observations is detected from above residual plots.
Plot residuals versus molecules to zoom in the performance of the model:
e <- resid(lm.fit)
n <- length(e)
x <- 1:n
plot(x, e,
xlab = "Molecule index",
ylab = "Residuals",
bg = "lightblue",
col = "black", cex = 2, pch = 21,frame = FALSE)
abline(h = 0, lwd = 2)
for (i in 1 : n)
lines(c(x[i], x[i]), c(e[i], 0), col = "blue" , lwd = 2)
Examining leverage values:
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
## 253
## 196
Molecules with indexes 128 and 179 have much higher leverage values. We further check their dfbetas values:
dfbetas(lm.fit)[c(128,179),2]
## 162 230
## -0.08238692 0.02695810
summary(dfbetas(lm.fit)[-c(128,179),2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.277500 -0.022660 -0.000899 0.001515 0.014490 0.590600
With large leverage values and dfbetas, these two molecules are exerted.
Final model:
newlm <- lm(Activity~., data = training[-c(128, 179),])
Visualising the performance of the final model:
p1 <- data.frame(predict(newlm, test, interval = "confidence"))
p2 <- data.frame(predict(newlm, test, interval = "prediction"))
p1$interval = "confidence"
p2$interval = "prediction"
p1$x = 1:nrow(test)
p1$Activity <- test$Activity
p2$x = 1:nrow(test)
p2$Activity <- test$Activity
dat = rbind(p1, p2)
names(dat)[1] = "yhat"
ggplot(dat, aes(x, yhat)) +
geom_ribbon(aes(ymin = lwr, ymax = upr, fill = interval), alpha = 0.2) +
geom_line() +
geom_point(aes(x, Activity), size = 4)
pls.fit <- plsr(Activity ~., data = training, scale = TRUE, validation = "CV")
summary(pls.fit) # The lowest cross-validation error occurs when M=5
## Data: X dimension: 257 11
## Y dimension: 257 1
## Fit method: kernelpls
## Number of components considered: 11
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 1.27 0.9503 0.9625 0.9873 0.9457 0.9014 0.8746
## adjCV 1.27 0.9467 0.9502 0.9628 0.9206 0.8796 0.8550
## 7 comps 8 comps 9 comps 10 comps 11 comps
## CV 0.8844 0.8863 0.8777 0.8829 0.8819
## adjCV 0.8640 0.8657 0.8578 0.8625 0.8616
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 30.27 52.47 59.67 64.62 71.45 80.25 84.74
## Activity 49.00 58.11 69.92 74.19 74.81 74.86 74.91
## 8 comps 9 comps 10 comps 11 comps
## X 89.98 96.89 98.85 100.00
## Activity 74.93 74.93 74.93 74.93
validationplot(pls.fit, val.type = "MSEP")
pls.pred <- predict(pls.fit, test, ncomp = 5)
mean((pls.pred-test$Activity)^2)
## [1] 0.6072227
pls.all <- plsr(Activity ~., data = data, scale = TRUE, ncomp = 5)
summary(pls.all)
## Data: X dimension: 319 12
## Y dimension: 319 1
## Fit method: kernelpls
## Number of components considered: 5
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps
## X 25.57 48.15 59.31 66.07 72.91
## Activity 53.08 62.63 69.05 72.75 73.87