library(rcdk)
## Loading required package: rcdklibs
## Loading required package: rJava
#Calculation of tpsa,xlogp,alogP for the CCl4
molecule_1 <- parse.smiles('C(Cl)(Cl)(Cl)Cl')[[1]]
get.tpsa(molecule_1)
## [1] 0
get.xlogp(molecule_1)
## [1] 2.864
get.alogp(molecule_1)
## [1] 3.5846
# QSPR activity
# Introduction of Descriptors
data(bpdata)
dc <- get.desc.categories()
dc
## [1] "hybrid" "constitutional" "topological" "electronic"
## [5] "geometrical"
dn <- get.desc.names(dc[4])
dn
## [1] "org.openscience.cdk.qsar.descriptors.molecular.FractionalPSADescriptor"
## [2] "org.openscience.cdk.qsar.descriptors.molecular.TPSADescriptor"
## [3] "org.openscience.cdk.qsar.descriptors.molecular.HBondDonorCountDescriptor"
## [4] "org.openscience.cdk.qsar.descriptors.molecular.HBondAcceptorCountDescriptor"
## [5] "org.openscience.cdk.qsar.descriptors.molecular.CPSADescriptor"
## [6] "org.openscience.cdk.qsar.descriptors.molecular.BPolDescriptor"
## [7] "org.openscience.cdk.qsar.descriptors.molecular.APolDescriptor"
aDesc <- eval.desc(molecule_1, dn[4])
aDesc
## nHBAcc
## 1 0
allDescs <- eval.desc(molecule_1, dn)
allDescs
## tpsaEfficiency TopoPSA nHBDon nHBAcc PPSA.1 PPSA.2 PPSA.3 PNSA.1 PNSA.2
## 1 0 0 0 0 NA NA NA NA NA
## PNSA.3 DPSA.1 DPSA.2 DPSA.3 FPSA.1 FPSA.2 FPSA.3 FNSA.1 FNSA.2 FNSA.3 WPSA.1
## 1 NA NA NA NA NA NA NA NA NA NA NA
## WPSA.2 WPSA.3 WNSA.1 WNSA.2 WNSA.3 RPCG RNCG RPCS RNCS THSA TPSA RHSA RPSA
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA
## bpol apol
## 1 1.68 10.48
#Parsing multiple compounds and picking up unique descriptors
mol_inb <- parse.smiles(bpdata[,1])
#Choosing descriptors
descNames <- c(
'org.openscience.cdk.qsar.descriptors.molecular.KierHallSmartsDescriptor',
'org.openscience.cdk.qsar.descriptors.molecular.APolDescriptor',
'org.openscience.cdk.qsar.descriptors.molecular.HBondDonorCountDescriptor')
#Evaluating descriptors
descs <- eval.desc(mol_inb, descNames)
#Removing descriptors with NA's
descs <- descs[, !apply(descs, 2, function(x) any(is.na(x)) )]
#Removing descriptors with zero as their values
descs <- descs[, !apply( descs, 2, function(x) length(unique(x)) == 1 )]
#Now we can neglect all those correlations which have value < 0.6 as they might not be
#having strong correlation which is done below
#Plotting Molecular weight vs Boiling point
#Getting molecular weights of bpdata
mol_inb <- parse.smiles(bpdata[,1])
form_mass = NULL
for(i in 1:length(mol_inb)){
molecs <- mol_inb[[i]]
convert.implicit.to.explicit(molecs)
formulex <- get.mol2formula(molecs,charge = 0)
form_mass[i] = formulex@mass
}
plot(bpdata$BP,form_mass,pch = 19,xlab = "Boiling Point",ylab = "Molecularweight")

#Adding Molecular wieghts to your data
bpdata[,"MW"] = form_mass
head(bpdata,5)
## SMILES BP MW
## bromo-trichloro-methane C(Br)(Cl)(Cl)Cl 378.0 195.82490
## chloro-trifluoro-methane ClC(F)(F)F 191.7 103.96406
## carbon tetrachloride C(Cl)(Cl)(Cl)Cl 349.8 151.87541
## tetrafluoromethane C(F)(F)(F)F 145.1 87.99361
## bromoform BrC(Br)Br 422.3 249.76284
#Plotting model parameters vs parameters
plot(bpdata$BP,descs$khs.sCH3,pch = 19)

plot(bpdata$BP,descs$apol,pch = 19)

plot(bpdata$BP,descs$khs.sF,pch = 19)

plot(bpdata$BP,descs$nHBDon,pch = 19)

r2 <- which(cor(descs)^2 > .6, arr.ind=TRUE)
r2
## row col
## khs.sCH3 1 1
## khs.dCH2 2 2
## khs.ssCH2 3 3
## khs.dsCH 4 4
## khs.aaCH 5 5
## khs.sssCH 6 6
## khs.dssC 7 7
## khs.dO 11 7
## khs.aasC 8 8
## khs.ssssC 9 9
## khs.sOH 10 10
## nHBDon 22 10
## khs.dssC 7 11
## khs.dO 11 11
## khs.ssO 12 12
## khs.aaO 13 13
## khs.sF 14 14
## khs.sSH 15 15
## khs.ssS 16 16
## khs.aaS 17 17
## khs.sCl 18 18
## khs.sBr 19 19
## khs.sI 20 20
## apol 21 21
## khs.sOH 10 22
## nHBDon 22 22
#Above you might have observed that most of the parameters have correlations with itself i.e
#corr = 1.Above method is a rude and basic method you can come up with much more good methods
#for predicting the correlations between molecules
#Removing upper triangle elements in correlation matrix
r2 <- r2[ r2[,1] > r2[,2] , ]
r2
## row col
## khs.dO 11 7
## nHBDon 22 10
#Applying them to descriptors (- indicates removal of upper triangular columns)
descs <- descs[, -unique(r2[,2])]
#Building the Model with out MW
model <- lm(BP ~ khs.sCH3 + khs.sF + apol + nHBDon, data.frame(bpdata, descs))
summary(model)
##
## Call:
## lm(formula = BP ~ khs.sCH3 + khs.sF + apol + nHBDon, data = data.frame(bpdata,
## descs))
##
## Residuals:
## Min 1Q Median 3Q Max
## -94.395 -20.911 -1.168 19.574 114.237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 267.3135 6.0006 44.548 <2e-16 ***
## khs.sCH3 -22.7948 2.0676 -11.025 <2e-16 ***
## khs.sF -24.4121 2.6548 -9.196 <2e-16 ***
## apol 8.6211 0.3132 27.523 <2e-16 ***
## nHBDon 47.1187 3.7061 12.714 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.08 on 272 degrees of freedom
## Multiple R-squared: 0.837, Adjusted R-squared: 0.8346
## F-statistic: 349.1 on 4 and 272 DF, p-value: < 2.2e-16
plot(bpdata$BP,predict(model,data.frame(bpdata,descs)),pch = 19,xlab = "Observed BP",ylab = "Predicted BP")
abline(0,1,col = "red")

#Building Model with MW
model_1 <- lm(BP ~ khs.sCH3 + khs.sF + apol + nHBDon+MW, data.frame(bpdata, descs))
summary(model_1)
##
## Call:
## lm(formula = BP ~ khs.sCH3 + khs.sF + apol + nHBDon + MW, data = data.frame(bpdata,
## descs))
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.637 -19.775 1.448 17.315 115.749
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 230.73812 6.04515 38.169 < 2e-16 ***
## khs.sCH3 -13.57662 1.92665 -7.047 1.52e-11 ***
## khs.sF -27.85514 2.24299 -12.419 < 2e-16 ***
## apol 5.72411 0.37399 15.305 < 2e-16 ***
## nHBDon 57.32190 3.23918 17.696 < 2e-16 ***
## MW 0.62922 0.05797 10.854 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.5 on 271 degrees of freedom
## Multiple R-squared: 0.8864, Adjusted R-squared: 0.8843
## F-statistic: 422.8 on 5 and 271 DF, p-value: < 2.2e-16
plot(bpdata$BP,predict(model_1,data.frame(bpdata,descs)),pch = 19,xlab = "Observed BP",ylab = "Predicted BP")
abline(0,1,col = "red")

#Comparing models
compare_0 = cbind(summary(model),summary(model_1))
colnames(compare_0) = c("Model With MW","Model without MW")
compare_0[c("r.squared","adj.r.squared"),]
## Model With MW Model without MW
## r.squared 0.8369626 0.8863644
## adj.r.squared 0.834565 0.8842678
#We can see there is an improvement in model with Molecular weight
#Now we will be using the above model to predict the BP of Dichlorodifluoromethane (CF2Cl4)
tetclo <- parse.smiles("C(F)(F)(Cl)Cl")[[1]]
desc_tet <- eval.desc(tetclo,descNames)
tet_dat = desc_tet[c("khs.sCH3","khs.sF","apol","nHBDon")]
tet_dat
## khs.sCH3 khs.sF apol nHBDon
## 1 0 2 7.234 0
#from above we can see only apol and khs.sF has the non-zero value
predBP_teclo = predict(model,tet_dat)
predBP_teclo
## 1
## 280.8542
#We can see the predicted boling point of the Dichlorodifluoromethane is 280.8 K where as
#the original value is 243.2 K wh
#Now we will be using the above model to predict the BP of Tetrachloroethylene (C2Cl4)
tetclo_0 <- parse.smiles("C(=C(Cl)Cl)(Cl)Cl")[[1]]
desc_tet_0 <- eval.desc(tetclo_0,descNames)
tet_dat_0 = desc_tet_0[c("khs.sCH3","khs.sF","apol","nHBDon")]
tet_dat_0
## khs.sCH3 khs.sF apol nHBDon
## 1 0 0 12.24 0
#from above we can see only apol has the non-zero value
predBP_teclo_0 = predict(model,tet_dat_0)
predBP_teclo_0
## 1
## 372.8355
#We can see the predicted boiling point of the Tetrachloroethylene is 372.8K where as
#the original value is 394.1K
#Therefore from above two examples we can see predicted values are varying from the originals
# around error magnitude of 20-30.If we could come up with much more efficient model
#we can move closer to the experiment value.