library(rcdk)
## Loading required package: rcdklibs
## Loading required package: rJava
#Calculation of tpsa,xlogp,alogP for the CCl4
molecule_1 <- parse.smiles('C(Cl)(Cl)(Cl)Cl')[[1]]
get.tpsa(molecule_1)
## [1] 0
get.xlogp(molecule_1)
## [1] 2.864
get.alogp(molecule_1)
## [1] 3.5846
# QSPR activity
# Introduction of Descriptors
data(bpdata)
dc <- get.desc.categories()
dc
## [1] "hybrid"         "constitutional" "topological"    "electronic"    
## [5] "geometrical"
dn <- get.desc.names(dc[4])
dn
## [1] "org.openscience.cdk.qsar.descriptors.molecular.FractionalPSADescriptor"     
## [2] "org.openscience.cdk.qsar.descriptors.molecular.TPSADescriptor"              
## [3] "org.openscience.cdk.qsar.descriptors.molecular.HBondDonorCountDescriptor"   
## [4] "org.openscience.cdk.qsar.descriptors.molecular.HBondAcceptorCountDescriptor"
## [5] "org.openscience.cdk.qsar.descriptors.molecular.CPSADescriptor"              
## [6] "org.openscience.cdk.qsar.descriptors.molecular.BPolDescriptor"              
## [7] "org.openscience.cdk.qsar.descriptors.molecular.APolDescriptor"
aDesc <- eval.desc(molecule_1, dn[4])
aDesc
##   nHBAcc
## 1      0
allDescs <- eval.desc(molecule_1, dn)
allDescs
##   tpsaEfficiency TopoPSA nHBDon nHBAcc PPSA.1 PPSA.2 PPSA.3 PNSA.1 PNSA.2
## 1              0       0      0      0     NA     NA     NA     NA     NA
##   PNSA.3 DPSA.1 DPSA.2 DPSA.3 FPSA.1 FPSA.2 FPSA.3 FNSA.1 FNSA.2 FNSA.3 WPSA.1
## 1     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
##   WPSA.2 WPSA.3 WNSA.1 WNSA.2 WNSA.3 RPCG RNCG RPCS RNCS THSA TPSA RHSA RPSA
## 1     NA     NA     NA     NA     NA   NA   NA   NA   NA   NA   NA   NA   NA
##   bpol  apol
## 1 1.68 10.48
#Parsing multiple compounds and picking up unique descriptors
mol_inb <- parse.smiles(bpdata[,1])
#Choosing descriptors
descNames <- c(
  'org.openscience.cdk.qsar.descriptors.molecular.KierHallSmartsDescriptor',
  'org.openscience.cdk.qsar.descriptors.molecular.APolDescriptor',
  'org.openscience.cdk.qsar.descriptors.molecular.HBondDonorCountDescriptor')
#Evaluating descriptors
descs <- eval.desc(mol_inb, descNames)
#Removing descriptors with NA's
descs <- descs[, !apply(descs, 2, function(x) any(is.na(x)) )]
#Removing descriptors with zero as their values
descs <- descs[, !apply( descs, 2, function(x) length(unique(x)) == 1 )]
#Now we can neglect all those correlations which have value < 0.6 as they might not be 
#having strong correlation which is done below
#Plotting Molecular weight vs Boiling point
#Getting molecular weights of bpdata
mol_inb <- parse.smiles(bpdata[,1])
form_mass = NULL
for(i in 1:length(mol_inb)){
  molecs <- mol_inb[[i]]
  convert.implicit.to.explicit(molecs)
  formulex <- get.mol2formula(molecs,charge = 0)
  form_mass[i] = formulex@mass 
  
}
plot(bpdata$BP,form_mass,pch = 19,xlab = "Boiling Point",ylab = "Molecularweight")

#Adding Molecular wieghts to your data
bpdata[,"MW"] = form_mass
head(bpdata,5)
##                                   SMILES    BP        MW
## bromo-trichloro-methane  C(Br)(Cl)(Cl)Cl 378.0 195.82490
## chloro-trifluoro-methane      ClC(F)(F)F 191.7 103.96406
## carbon tetrachloride     C(Cl)(Cl)(Cl)Cl 349.8 151.87541
## tetrafluoromethane           C(F)(F)(F)F 145.1  87.99361
## bromoform                      BrC(Br)Br 422.3 249.76284
#Plotting model parameters vs parameters
plot(bpdata$BP,descs$khs.sCH3,pch = 19)

plot(bpdata$BP,descs$apol,pch = 19)

plot(bpdata$BP,descs$khs.sF,pch = 19)

plot(bpdata$BP,descs$nHBDon,pch = 19)

r2 <- which(cor(descs)^2 > .6, arr.ind=TRUE)
r2
##           row col
## khs.sCH3    1   1
## khs.dCH2    2   2
## khs.ssCH2   3   3
## khs.dsCH    4   4
## khs.aaCH    5   5
## khs.sssCH   6   6
## khs.dssC    7   7
## khs.dO     11   7
## khs.aasC    8   8
## khs.ssssC   9   9
## khs.sOH    10  10
## nHBDon     22  10
## khs.dssC    7  11
## khs.dO     11  11
## khs.ssO    12  12
## khs.aaO    13  13
## khs.sF     14  14
## khs.sSH    15  15
## khs.ssS    16  16
## khs.aaS    17  17
## khs.sCl    18  18
## khs.sBr    19  19
## khs.sI     20  20
## apol       21  21
## khs.sOH    10  22
## nHBDon     22  22
#Above you might have observed that most of the parameters have correlations with itself i.e
#corr = 1.Above method is a rude and basic method you can come up with much more good methods
#for predicting the correlations between molecules
#Removing upper triangle elements in correlation matrix
r2 <- r2[ r2[,1] > r2[,2] , ]
r2
##        row col
## khs.dO  11   7
## nHBDon  22  10
#Applying them to descriptors (- indicates removal of upper triangular columns)
descs <- descs[, -unique(r2[,2])]
#Building the Model with out MW
model <- lm(BP ~ khs.sCH3 + khs.sF + apol + nHBDon, data.frame(bpdata, descs))
summary(model)
## 
## Call:
## lm(formula = BP ~ khs.sCH3 + khs.sF + apol + nHBDon, data = data.frame(bpdata, 
##     descs))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -94.395 -20.911  -1.168  19.574 114.237 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 267.3135     6.0006  44.548   <2e-16 ***
## khs.sCH3    -22.7948     2.0676 -11.025   <2e-16 ***
## khs.sF      -24.4121     2.6548  -9.196   <2e-16 ***
## apol          8.6211     0.3132  27.523   <2e-16 ***
## nHBDon       47.1187     3.7061  12.714   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.08 on 272 degrees of freedom
## Multiple R-squared:  0.837,  Adjusted R-squared:  0.8346 
## F-statistic: 349.1 on 4 and 272 DF,  p-value: < 2.2e-16
plot(bpdata$BP,predict(model,data.frame(bpdata,descs)),pch = 19,xlab = "Observed BP",ylab = "Predicted BP")
abline(0,1,col = "red")

#Building Model with MW
model_1 <- lm(BP ~ khs.sCH3 + khs.sF + apol + nHBDon+MW, data.frame(bpdata, descs))
summary(model_1)
## 
## Call:
## lm(formula = BP ~ khs.sCH3 + khs.sF + apol + nHBDon + MW, data = data.frame(bpdata, 
##     descs))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.637 -19.775   1.448  17.315 115.749 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 230.73812    6.04515  38.169  < 2e-16 ***
## khs.sCH3    -13.57662    1.92665  -7.047 1.52e-11 ***
## khs.sF      -27.85514    2.24299 -12.419  < 2e-16 ***
## apol          5.72411    0.37399  15.305  < 2e-16 ***
## nHBDon       57.32190    3.23918  17.696  < 2e-16 ***
## MW            0.62922    0.05797  10.854  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.5 on 271 degrees of freedom
## Multiple R-squared:  0.8864, Adjusted R-squared:  0.8843 
## F-statistic: 422.8 on 5 and 271 DF,  p-value: < 2.2e-16
plot(bpdata$BP,predict(model_1,data.frame(bpdata,descs)),pch = 19,xlab = "Observed BP",ylab = "Predicted BP")
abline(0,1,col = "red")

#Comparing models
compare_0 = cbind(summary(model),summary(model_1))
colnames(compare_0) = c("Model With MW","Model without MW")
compare_0[c("r.squared","adj.r.squared"),]
##               Model With MW Model without MW
## r.squared     0.8369626     0.8863644       
## adj.r.squared 0.834565      0.8842678
#We can see there is an improvement in model with Molecular weight
#Now we will be using the above model to predict the BP of Dichlorodifluoromethane (CF2Cl4)
tetclo <- parse.smiles("C(F)(F)(Cl)Cl")[[1]]
desc_tet <- eval.desc(tetclo,descNames)
tet_dat = desc_tet[c("khs.sCH3","khs.sF","apol","nHBDon")]
tet_dat
##   khs.sCH3 khs.sF  apol nHBDon
## 1        0      2 7.234      0
#from above we can see only apol and khs.sF has the non-zero value
predBP_teclo = predict(model,tet_dat)
predBP_teclo
##        1 
## 280.8542
#We can see the predicted boling point of the Dichlorodifluoromethane is 280.8 K where as 
#the original value is 243.2 K wh
#Now we will be using the above model to predict the BP of Tetrachloroethylene (C2Cl4)
tetclo_0 <- parse.smiles("C(=C(Cl)Cl)(Cl)Cl")[[1]]
desc_tet_0 <- eval.desc(tetclo_0,descNames)
tet_dat_0 = desc_tet_0[c("khs.sCH3","khs.sF","apol","nHBDon")]
tet_dat_0
##   khs.sCH3 khs.sF  apol nHBDon
## 1        0      0 12.24      0
#from above we can see only apol has the non-zero value
predBP_teclo_0 = predict(model,tet_dat_0)
predBP_teclo_0
##        1 
## 372.8355
#We can see the predicted boiling point of the Tetrachloroethylene is 372.8K where as 
#the original value is 394.1K
#Therefore from above two examples we can see predicted values are varying from the originals 
# around error magnitude of 20-30.If we could come up with much more efficient model
#we can move closer to the experiment value.