# NOTE: the workspace is deliberately NOT cleared with rm(list = ls()) here.
# Every object used in this script is assigned before it is read, and wiping
# the caller's global environment is a destructive side effect when the file is
# source()d from an interactive session. Start from a fresh R session instead.
#
# One-time installation (run manually if ChemmineR is missing):
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#   install.packages("BiocManager")
# }
# BiocManager::install("ChemmineR")
library("ChemmineR")
# Optional help lookups (left commented out):
#library(help="ChemmineR") # Lists all functions and classes
#vignette("ChemmineR") # Opens the package vignette from R
# Load the example compound set and work on a copy named sdfset.
data(sdfsample)
sdfset <- sdfsample
sdfset # Auto-prints a one-line summary of the SDFset
## An instance of "SDFset" with 100 molecules
sdfset[[1]] # Auto-prints the header/atom/bond/data blocks of the first SDF
## An instance of "SDF"
##
## <<header>>
## Molecule_Name
## "650001"
## Source
## " -OEChem-07071010512D"
## Comment
## ""
## Counts_Line
## " 61 64 0 0 0 0 0 0 0999 V2000"
##
## <<atomblock>>
## C1 C2 C3 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16
## O_1 7.0468 0.0839 0 0 0 0 0 0 0 0 0 0 0 0 0
## O_2 12.2708 1.0492 0 0 0 0 0 0 0 0 0 0 0 0 0
## ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
## H_60 1.8411 -1.5985 0 0 0 0 0 0 0 0 0 0 0 0 0
## H_61 2.6597 -1.2843 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## <<bondblock>>
## C1 C2 C3 C4 C5 C6 C7
## 1 1 16 2 0 0 0 0
## 2 2 23 1 0 0 0 0
## ... ... ... ... ... ... ... ...
## 63 33 60 1 0 0 0 0
## 64 33 61 1 0 0 0 0
##
## <<datablock>> (33 data items)
## PUBCHEM_COMPOUND_CID PUBCHEM_COMPOUND_CANONICALIZED
## "650001" "1"
## PUBCHEM_CACTVS_COMPLEXITY PUBCHEM_CACTVS_HBOND_ACCEPTOR
## "700" "7"
##
## "..."
length(sdfset) # Number of molecules stored in the SDFset
## [1] 100
##############################################
# Directory holding the input/output structure files (machine-specific). The
# trailing separator is required because later paths are built with paste0().
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\"
# Full paths of every file below dir_path, including sub-directories.
# TRUE is spelled out because T is an ordinary, reassignable variable; the
# former pattern ".*" matched every file name, so it is simply omitted.
dir_path_name <- list.files(
  path = dir_path,
  full.names = TRUE,
  recursive = TRUE
)
dir_path_name
## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\2022-11-14_2244.sdf"
## [2] "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\2022-11-14_2244_2DD.sdf"
## [3] "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\2022-11-14_2244_3DD.sdf"
## [4] "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\2022-11-14_sub.smi"
## [5] "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\chemminer.R"
## [6] "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\chemminer.spin.R"
## [7] "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\chemminer.spin.Rmd"
## [8] "C:\\Users\\liyix\\OneDrive\\Desktop\\sdf\\PubChem_records.sdf"
# Locate the PubChem download by its literal file name (fixed = TRUE so the "."
# is not treated as a regex wildcard) and import it as an SDFset.
pubchem_sdf_path <- grep(
  "PubChem_records.sdf", dir_path_name,
  value = TRUE, fixed = TRUE
)
# Fail early with a clear condition if the file is missing or ambiguous rather
# than passing zero or several paths on to read.SDFset().
stopifnot(length(pubchem_sdf_path) == 1L)
sdfset <- read.SDFset(pubchem_sdf_path)
sdfset
## An instance of "SDFset" with 31 molecules
sdfset[[1]]
## An instance of "SDF"
##
## <<header>>
## Molecule_Name
## "2244"
## Source
## " -OEChem-11142211343D"
## Comment
## ""
## Counts_Line
## " 21 21 0 0 0 0 0 0 0999 V2000"
##
## <<atomblock>>
## C1 C2 C3 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16
## O_1 1.2333 0.554 0.7792 0 0 0 0 0 0 0 0 0 0 0 0
## O_2 -0.6952 -2.7148 -0.7502 0 0 0 0 0 0 0 0 0 0 0 0
## ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
## H_20 3.7105 -0.3659 0.6426 0 0 0 0 0 0 0 0 0 0 0 0
## H_21 -0.2555 -3.5916 -0.7337 0 0 0 0 0 0 0 0 0 0 0 0
##
## <<bondblock>>
## C1 C2 C3 C4 C5 C6 C7
## 1 1 5 1 0 0 0 0
## 2 1 12 1 0 0 0 0
## ... ... ... ... ... ... ... ...
## 20 13 19 1 0 0 0 0
## 21 13 20 1 0 0 0 0
##
## <<datablock>> (22 data items)
## PUBCHEM_COMPOUND_CID
## "2244"
## PUBCHEM_CONFORMER_RMSD
## "0.6"
## PUBCHEM_CONFORMER_DIVERSEORDER
## "1 __ 11 __ 10 __ 3 __ 15 __ 17 __ 13 __ 5 __ 16 __ 7 __ 14 __ 9 __ 8 __ 4 __ 18 __ 6 __ 12 __ 2"
## PUBCHEM_MMFF94_PARTIAL_CHARGES
## "18 __ 1 -0.23 __ 10 -0.15 __ 11 0.63 __ 12 0.66 __ 13 0.06 __ 14 0.15 __ 15 0.15 __ 16 0.15 __ 17 0.15 __ 2 -0.65 __ 21 0.5 __ 3 -0.57 __ 4 -0.57 __ 5 0.08 __ 6 0.09 __ 7 -0.15 __ 8 -0.15 __ 9 -0.15"
##
## "..."
# Header block (name, source, comment, counts line) of the first molecule.
header(sdfset[[1]])
## Molecule_Name
## "2244"
## Source
## " -OEChem-11142211343D"
## Comment
## ""
## Counts_Line
## " 21 21 0 0 0 0 0 0 0999 V2000"
# Container-level IDs of the SDFset; for this file they are the generic
# "CMP1".."CMP31" labels rather than the PubChem CIDs.
cid(sdfset) # Returns IDs from SDFset object
## [1] "CMP1" "CMP2" "CMP3" "CMP4" "CMP5" "CMP6" "CMP7" "CMP8" "CMP9"
## [10] "CMP10" "CMP11" "CMP12" "CMP13" "CMP14" "CMP15" "CMP16" "CMP17" "CMP18"
## [19] "CMP19" "CMP20" "CMP21" "CMP22" "CMP23" "CMP24" "CMP25" "CMP26" "CMP27"
## [28] "CMP28" "CMP29" "CMP30" "CMP31"
# Per-molecule IDs; for this file they equal the Molecule_Name header field
# (the PubChem CIDs).
sdfid(sdfset)
## [1] "2244" "53040" "133472" "156866" "199027" "9818919"
## [7] "15280939" "25157143" "51404094" "91819941" "91820043" "91820534"
## [13] "1983" "9841438" "131953074" "132282528" "119032" "4064"
## [19] "68484" "68749" "10745" "15110" "21102" "67256"
## [25] "526502" "530150" "12280114" "16126783" "29971035" "46780045"
## [31] "71309054"
# Pass the molecule IDs through makeUnique(); it reports when no duplicates
# are present (as in the recorded run).
unique_ids <- makeUnique(sdfid(sdfset))
## [1] "No duplicates detected!"
unique_ids
## [1] "2244" "53040" "133472" "156866" "199027" "9818919"
## [7] "15280939" "25157143" "51404094" "91819941" "91820043" "91820534"
## [13] "1983" "9841438" "131953074" "132282528" "119032" "4064"
## [19] "68484" "68749" "10745" "15110" "21102" "67256"
## [25] "526502" "530150" "12280114" "16126783" "29971035" "46780045"
## [31] "71309054"
# Flatten the per-molecule data blocks into one matrix (one row per compound).
blockmatrix <- datablock2ma(datablock(sdfset))
# Separate that matrix into its numeric and character parts.
numchar <- splitNumChar(blockmatrix)
# First list element: the numeric matrix.
numchar[[1]]
## PUBCHEM_COMPOUND_CID PUBCHEM_CONFORMER_RMSD PUBCHEM_EFFECTIVE_ROTOR_COUNT
## CMP1 2244 0.6 3.0
## CMP2 53040 1.0 8.0
## CMP3 133472 0.8 8.0
## CMP4 156866 1.0 10.0
## CMP5 199027 0.8 6.0
## CMP6 9818919 0.8 8.0
## CMP7 15280939 0.8 6.0
## CMP8 25157143 0.8 8.0
## CMP9 51404094 1.4 14.0
## CMP10 91819941 1.4 14.0
## CMP11 91820043 1.4 14.0
## CMP12 91820534 1.4 14.0
## CMP13 1983 0.4 2.0
## CMP14 9841438 1.2 14.0
## CMP15 131953074 1.2 14.0
## CMP16 132282528 1.4 14.0
## CMP17 119032 0.8 8.0
## CMP18 4064 0.8 8.0
## CMP19 68484 0.6 4.0
## CMP20 68749 0.8 6.0
## CMP21 10745 0.8 6.0
## CMP22 15110 0.8 8.0
## CMP23 21102 0.8 7.0
## CMP24 67256 0.8 5.0
## CMP25 526502 0.8 6.0
## CMP26 530150 0.6 5.0
## CMP27 12280114 0.6 3.0
## CMP28 16126783 1.2 14.0
## CMP29 29971035 0.8 7.2
## CMP30 46780045 0.6 3.0
## CMP31 71309054 0.6 3.0
## PUBCHEM_HEAVY_ATOM_COUNT PUBCHEM_ATOM_DEF_STEREO_COUNT
## CMP1 13 0
## CMP2 30 0
## CMP3 21 0
## CMP4 26 1
## CMP5 19 0
## CMP6 24 0
## CMP7 19 0
## CMP8 24 0
## CMP9 23 1
## CMP10 27 3
## CMP11 27 3
## CMP12 27 3
## CMP13 11 0
## CMP14 25 3
## CMP15 27 3
## CMP16 26 2
## CMP17 24 0
## CMP18 15 0
## CMP19 14 0
## CMP20 21 0
## CMP21 22 0
## CMP22 25 0
## CMP23 23 0
## CMP24 19 0
## CMP25 20 0
## CMP26 17 0
## CMP27 13 0
## CMP28 27 3
## CMP29 25 5
## CMP30 13 0
## CMP31 13 0
## PUBCHEM_ATOM_UDEF_STEREO_COUNT PUBCHEM_BOND_DEF_STEREO_COUNT
## CMP1 0 0
## CMP2 0 0
## CMP3 0 0
## CMP4 0 0
## CMP5 1 0
## CMP6 0 0
## CMP7 0 0
## CMP8 0 0
## CMP9 0 4
## CMP10 0 6
## CMP11 0 6
## CMP12 0 6
## CMP13 0 0
## CMP14 0 4
## CMP15 0 6
## CMP16 0 6
## CMP17 0 0
## CMP18 0 0
## CMP19 0 0
## CMP20 0 0
## CMP21 0 0
## CMP22 0 0
## CMP23 0 0
## CMP24 0 0
## CMP25 0 0
## CMP26 0 0
## CMP27 0 0
## CMP28 0 6
## CMP29 0 0
## CMP30 0 0
## CMP31 0 0
## PUBCHEM_BOND_UDEF_STEREO_COUNT PUBCHEM_ISOTOPIC_ATOM_COUNT
## CMP1 0 0
## CMP2 0 0
## CMP3 0 0
## CMP4 0 0
## CMP5 0 0
## CMP6 0 0
## CMP7 0 0
## CMP8 0 0
## CMP9 0 0
## CMP10 0 0
## CMP11 0 0
## CMP12 0 0
## CMP13 0 0
## CMP14 0 0
## CMP15 0 0
## CMP16 0 0
## CMP17 0 0
## CMP18 0 0
## CMP19 0 0
## CMP20 0 0
## CMP21 0 0
## CMP22 0 0
## CMP23 0 0
## CMP24 0 0
## CMP25 0 0
## CMP26 0 0
## CMP27 0 3
## CMP28 0 0
## CMP29 0 0
## CMP30 0 4
## CMP31 0 1
## PUBCHEM_COMPONENT_COUNT PUBCHEM_CACTVS_TAUTO_COUNT PUBCHEM_MMFF94_ENERGY
## CMP1 1 1 39.5952
## CMP2 1 2 88.1673
## CMP3 1 2 57.9614
## CMP4 1 2 65.5355
## CMP5 1 1 48.8369
## CMP6 1 1 79.9223
## CMP7 1 1 48.3873
## CMP8 1 1 83.4603
## CMP9 1 1 18.9872
## CMP10 1 1 30.3720
## CMP11 1 1 31.3779
## CMP12 1 1 36.2472
## CMP13 1 4 29.3225
## CMP14 1 1 30.0059
## CMP15 1 2 33.8375
## CMP16 1 -1 25.6253
## CMP17 1 1 79.8120
## CMP18 1 3 27.3092
## CMP19 1 1 42.8427
## CMP20 1 1 81.0989
## CMP21 1 1 77.2451
## CMP22 1 1 82.5085
## CMP23 1 2 79.1474
## CMP24 1 1 65.1057
## CMP25 1 1 53.5987
## CMP26 1 1 41.6475
## CMP27 1 1 39.6471
## CMP28 1 1 33.0035
## CMP29 1 1 69.4599
## CMP30 1 1 39.6583
## CMP31 1 1 39.6538
## PUBCHEM_FEATURE_SELFOVERLAP PUBCHEM_SHAPE_SELFOVERLAP
## CMP1 25.432 513.037
## CMP2 50.805 1240.284
## CMP3 50.865 805.561
## CMP4 30.447 1050.804
## CMP5 50.862 733.918
## CMP6 30.519 957.153
## CMP7 20.309 720.335
## CMP8 30.447 987.001
## CMP9 40.655 863.511
## CMP10 50.802 1015.662
## CMP11 50.814 1016.908
## CMP12 50.807 1019.673
## CMP13 20.297 432.756
## CMP14 55.886 926.934
## CMP15 50.811 1015.648
## CMP16 40.652 983.605
## CMP17 30.519 957.139
## CMP18 30.447 525.824
## CMP19 15.223 550.807
## CMP20 25.373 864.537
## CMP21 35.580 905.871
## CMP22 30.459 1020.760
## CMP23 30.447 939.361
## CMP24 20.297 792.177
## CMP25 20.297 815.716
## CMP26 15.223 688.837
## CMP27 25.429 512.920
## CMP28 50.814 1014.680
## CMP29 71.127 965.933
## CMP30 25.429 512.934
## CMP31 25.430 512.939
## PUBCHEM_SHAPE_VOLUME
## CMP1 136.0
## CMP2 320.8
## CMP3 215.9
## CMP4 279.2
## CMP5 195.9
## CMP6 244.0
## CMP7 189.2
## CMP8 254.7
## CMP9 279.0
## CMP10 316.5
## CMP11 316.1
## CMP12 315.1
## CMP13 120.2
## CMP14 295.2
## CMP15 316.2
## CMP16 309.0
## CMP17 244.5
## CMP18 167.5
## CMP19 149.5
## CMP20 219.1
## CMP21 223.4
## CMP22 255.6
## CMP23 237.9
## CMP24 198.9
## CMP25 241.9
## CMP26 204.9
## CMP27 135.7
## CMP28 316.7
## CMP29 248.2
## CMP30 135.7
## CMP31 135.7
# View(numchar[[1]])  # optional interactive inspection
# Per-compound property table: molecular formula, molecular weight, and one
# column of atom counts per element.
propma <- data.frame(
  MF = MF(sdfset),
  MW = MW(sdfset),
  atomcountMA(sdfset)
)
propma
## MF MW C H O N F Si
## CMP1 C9H8O4 180.1574 9 8 4 0 0 0
## CMP2 C23H25N3O4 407.4623 23 25 4 3 0 0
## CMP3 C13H13NO7 295.2448 13 13 7 1 0 0
## CMP4 C20H21NO5 355.3844 20 21 5 1 0 0
## CMP5 C12H13NO6 267.2347 12 13 6 1 0 0
## CMP6 C16H13NO7 331.2769 16 13 7 1 0 0
## CMP7 C11H7F3O5 276.1655 11 7 5 0 3 0
## CMP8 C19H18O5 326.3432 19 18 5 0 0 0
## CMP9 C20H31O3 319.4583 20 31 3 0 0 0
## CMP10 C22H31O5 375.4785 22 31 5 0 0 0
## CMP11 C22H31O5 375.4785 22 31 5 0 0 0
## CMP12 C22H31O5 375.4785 22 31 5 0 0 0
## CMP13 C8H9NO2 151.1626 8 9 2 1 0 0
## CMP14 C20H32O5 352.4651 20 32 5 0 0 0
## CMP15 C22H32O5 376.4865 22 32 5 0 0 0
## CMP16 C22H31O4 359.4791 22 31 4 0 0 0
## CMP17 C16H13NO7 331.2769 16 13 7 1 0 0
## CMP18 C9H18N2O4 218.2502 9 18 4 2 0 0
## CMP19 C10H10O4 194.1840 10 10 4 0 0 0
## CMP20 C16H14O5 286.2794 16 14 5 0 0 0
## CMP21 C16H12O6 300.2629 16 12 6 0 0 0
## CMP22 C18H14O7 342.2996 18 14 7 0 0 0
## CMP23 C17H15NO5 313.3047 17 15 5 1 0 0
## CMP24 C15H12O4 256.2534 15 12 4 0 0 0
## CMP25 C15H22O4Si 294.4183 15 22 4 0 0 1
## CMP26 C12H16O4Si 252.3385 12 16 4 0 0 1
## CMP27 C9H8O4 180.1574 9 8 4 0 0 0
## CMP28 C22H32O5 376.4865 22 32 5 0 0 0
## CMP29 C15H16O10 356.2815 15 16 10 0 0 0
## CMP30 C9H8O4 180.1574 9 8 4 0 0 0
## CMP31 C9H8O4 180.1574 9 8 4 0 0 0
class(propma) # Confirms the property table is a plain data.frame
## [1] "data.frame"
# View(sdfset[[1]])  # optional interactive inspection
# Export the first molecule as <dir_path><date>_<Molecule_Name>.sdf. The name is
# read through the header() accessor instead of reaching into the S4 slot.
first_mol_name <- header(sdfset[[1]])[["Molecule_Name"]]
write.SDF(
  sdfset[[1]],
  file = paste0(dir_path, Sys.Date(), "_", first_mol_name, ".sdf"),
  sig = TRUE
)
# FALSE is spelled out because F is an ordinary, reassignable variable.
plot(sdfset[1:3], print = FALSE) # Plots structures to R graphics device
# NOTE(review): the recorded result is a chemmine.ucr.edu URL, so this
# presumably uploads the compounds to that external service - confirm before
# using it on confidential structures.
sdf.visualize(sdfset) # Compound viewing in web browser
## [1] "http://chemmine.ucr.edu/ChemmineR/showJob/781f218a-644c-48de-9414-a57aa1f5e13b"
################### OpenBabel functions
# NOTE(review): the calls in this section presumably need the ChemmineOB
# package to be installed (one-time install shown below) - confirm.
#BiocManager::install("ChemmineOB")
#library(ChemmineOB)
# Per-compound descriptor table (canonical SMILES, formula, logP, TPSA, ...).
propOB(sdfset)
## cansmi
## CMP1 CC(=O)Oc1ccccc1C(=O)O
## CMP2 OC(=O)Cc1ccccc1C(=O)NCc1c(C(C)C)c(=O)n(n1C)c1ccccc1
## CMP3 OC(=O)CCC(=O)Nc1ccc(c(c1)C(=O)O)OC(=O)C
## CMP4 CCOC(=O)[C@@H](NC(=O)c1ccccc1OC(=O)C)Cc1ccccc1
## CMP5 CC(=O)Oc1ccc(cc1C(=O)O)C[C@@H](C(=O)O)N
## CMP6 [O-][N+](=O)OCc1ccc(cc1)OC(=O)c1ccccc1OC(=O)C
## CMP7 CC(=O)Oc1ccccc1C(=O)OC(=O)C(F)(F)F
## CMP8 C=CCc1ccc(c(c1)OC)OC(=O)c1ccccc1OC(=O)C
## CMP9 CCCCC[C@H](/C=C/C=C\\C/C=C\\C/C=C\\CCCC(=O)[O-])O
## CMP10 CC/C=C\\C[C@H](/C=C/C=C\\C[C@H](/C=C/C=C/C=C\\[C@H](CCC(=O)[O-])O)O)O
## CMP11 CC/C=C\\C[C@H](/C=C/C=C\\C=C\\C=C\\[C@H]([C@H](C/C=C\\CCC(=O)[O-])O)O)O
## CMP12 CC/C=C\\C[C@H]([C@@H](/C=C/C=C/C=C\\C=C\\[C@H](C/C=C\\CCC(=O)[O-])O)O)O
## CMP13 CC(=O)Nc1ccc(cc1)O
## CMP14 CCCCC[C@H](/C=C/C=C\\C=C\\C=C\\[C@H]([C@H](CCCC(=O)O)O)O)O
## CMP15 CC/C=C\\C[C@H](/C=C/C=C\\C/C=C\\C=C\\C=C\\[C@H]([C@H](CCC(=O)O)O)O)O
## CMP16 CC/C=C\\C[C@H](/C=C\\C=C\\C=C\\[C@@H](C/C=C\\C/C=C\\CCC(=O)[O-])O)O
## CMP17 [O-][N+](=O)OCc1cccc(c1)OC(=O)c1ccccc1OC(=O)C
## CMP18 CCCC(COC(=O)N)(COC(=O)N)C
## CMP19 COC(=O)c1ccccc1OC(=O)C
## CMP20 COc1ccccc1OC(=O)c1ccccc1OC(=O)C
## CMP21 CC(=O)Oc1ccccc1C(=O)Oc1ccccc1C(=O)O
## CMP22 CC(=O)Oc1ccccc1C(=O)OC(=O)c1ccccc1OC(=O)C
## CMP23 CC(=O)Nc1ccc(cc1)OC(=O)c1ccccc1OC(=O)C
## CMP24 CC(=O)Oc1ccccc1C(=O)Oc1ccccc1
## CMP25 CC(=O)Oc1ccccc1C(=O)O[Si](C(C)(C)C)(C)C
## CMP26 CC(=O)Oc1ccccc1C(=O)O[Si](C)(C)C
## CMP27 OC(=O)c1ccccc1OC(=O)C([2H])([2H])[2H]
## CMP28 CC/C=C\\C[C@H](/C=C/C=C\\C=C\\C=C\\[C@H]([C@H](C/C=C\\CCC(=O)O)O)O)O
## CMP29 CC(=O)Oc1ccccc1C(=O)O[C@@H]1O[C@H](C(=O)O)[C@H]([C@@H]([C@H]1O)O)O
## CMP30 CC(=O)Oc1c([2H])c([2H])c(c(c1C(=O)O)[2H])[2H]
## CMP31 CC(=O)Oc1ccccc1[13C](=O)O
## cansmiNS formula title
## CMP1 CC(=O)Oc1ccccc1C(=O)O C9H8O4 2244
## CMP2 OC(=O)Cc1ccccc1C(=O)NCc1c(C(C)C)c(=O)n(n1C)c1ccccc1 C23H25N3O4 53040
## CMP3 OC(=O)CCC(=O)Nc1ccc(c(c1)C(=O)O)OC(=O)C C13H13NO7 133472
## CMP4 CCOC(=O)C(NC(=O)c1ccccc1OC(=O)C)Cc1ccccc1 C20H21NO5 156866
## CMP5 CC(=O)Oc1ccc(cc1C(=O)O)CC(C(=O)O)N C12H13NO6 199027
## CMP6 [O-][N+](=O)OCc1ccc(cc1)OC(=O)c1ccccc1OC(=O)C C16H13NO7 9818919
## CMP7 CC(=O)Oc1ccccc1C(=O)OC(=O)C(F)(F)F C11H7F3O5 15280939
## CMP8 C=CCc1ccc(c(c1)OC)OC(=O)c1ccccc1OC(=O)C C19H18O5 25157143
## CMP9 CCCCCC(C=CC=CCC=CCC=CCCCC(=O)[O-])O C20H31O3- 51404094
## CMP10 CCC=CCC(C=CC=CCC(C=CC=CC=CC(CCC(=O)[O-])O)O)O C22H31O5- 91819941
## CMP11 CCC=CCC(C=CC=CC=CC=CC(C(CC=CCCC(=O)[O-])O)O)O C22H31O5- 91820043
## CMP12 CCC=CCC(C(C=CC=CC=CC=CC(CC=CCCC(=O)[O-])O)O)O C22H31O5- 91820534
## CMP13 CC(=O)Nc1ccc(cc1)O C8H9NO2 1983
## CMP14 CCCCCC(C=CC=CC=CC=CC(C(CCCC(=O)O)O)O)O C20H32O5 9841438
## CMP15 CCC=CCC(C=CC=CCC=CC=CC=CC(C(CCC(=O)O)O)O)O C22H32O5 131953074
## CMP16 CCC=CCC(C=CC=CC=CC(CC=CCC=CCCC(=O)[O-])O)O C22H31O4- 132282528
## CMP17 [O-][N+](=O)OCc1cccc(c1)OC(=O)c1ccccc1OC(=O)C C16H13NO7 119032
## CMP18 CCCC(COC(=O)N)(COC(=O)N)C C9H18N2O4 4064
## CMP19 COC(=O)c1ccccc1OC(=O)C C10H10O4 68484
## CMP20 COc1ccccc1OC(=O)c1ccccc1OC(=O)C C16H14O5 68749
## CMP21 CC(=O)Oc1ccccc1C(=O)Oc1ccccc1C(=O)O C16H12O6 10745
## CMP22 CC(=O)Oc1ccccc1C(=O)OC(=O)c1ccccc1OC(=O)C C18H14O7 15110
## CMP23 CC(=O)Nc1ccc(cc1)OC(=O)c1ccccc1OC(=O)C C17H15NO5 21102
## CMP24 CC(=O)Oc1ccccc1C(=O)Oc1ccccc1 C15H12O4 67256
## CMP25 CC(=O)Oc1ccccc1C(=O)O[Si](C(C)(C)C)(C)C C15H22O4Si 526502
## CMP26 CC(=O)Oc1ccccc1C(=O)O[Si](C)(C)C C12H16O4Si 530150
## CMP27 CC(=O)Oc1ccccc1C(=O)O C9H5D3O4 12280114
## CMP28 CCC=CCC(C=CC=CC=CC=CC(C(CC=CCCC(=O)O)O)O)O C22H32O5 16126783
## CMP29 CC(=O)Oc1ccccc1C(=O)OC1OC(C(=O)O)C(C(C1O)O)O C15H16O10 29971035
## CMP30 CC(=O)Oc1ccccc1C(=O)O C9H4D4O4 46780045
## CMP31 CC(=O)Oc1ccccc1C(=O)O C9H8O4 71309054
## InChI HBA1 HBA2 HBD logP MR MW nF TPSA
## CMP1 12 4 1 1.3101 44.9003 180.1574 0 63.60
## CMP2 30 7 2 3.2474 114.8990 407.4623 0 93.33
## CMP3 21 8 3 1.1864 70.5988 295.2448 0 130.00
## CMP4 27 6 1 2.9070 95.9372 355.3844 0 81.70
## CMP5 20 7 3 0.9648 63.9595 267.2347 0 126.92
## CMP6 19 4 0 3.0626 83.4975 331.2769 0 107.65
## CMP7 12 5 0 1.8576 54.4185 276.1655 3 69.67
## CMP8 23 5 0 3.5682 89.9395 326.3432 0 61.83
## CMP9 33 3 1 3.8528 97.3493 319.4583 0 60.36
## CMP10 35 5 3 2.1266 108.3389 375.4785 0 100.82
## CMP11 35 5 3 2.1266 108.3389 375.4785 0 100.82
## CMP12 35 5 3 2.1266 108.3389 375.4785 0 100.82
## CMP13 12 3 2 1.4236 42.7777 151.1626 0 49.33
## CMP14 37 5 4 3.1291 101.6152 352.4651 0 97.99
## CMP15 37 5 4 3.4613 110.2812 376.4865 0 97.99
## CMP16 34 4 2 3.1558 107.1771 359.4791 0 80.59
## CMP17 19 4 0 3.0626 83.4975 331.2769 0 107.65
## CMP18 24 6 2 2.3840 53.8798 218.2502 0 104.64
## CMP19 14 4 0 1.3985 49.2205 194.1840 0 52.60
## CMP20 19 5 0 2.8397 75.8335 286.2794 0 61.83
## CMP21 18 6 1 2.5293 76.3008 300.2629 0 89.90
## CMP22 21 7 0 2.5344 85.6280 342.2996 0 95.97
## CMP23 21 6 1 2.8625 83.6542 313.3047 0 81.70
## CMP24 16 4 0 2.8311 69.3415 256.2534 0 52.60
## CMP25 27 4 0 3.7739 81.1235 294.4183 0 52.60
## CMP26 21 4 0 2.6036 66.7025 252.3385 0 52.60
## CMP27 12 4 1 1.3101 44.9003 183.1759 0 63.60
## CMP28 37 5 4 3.4613 110.2812 376.4865 0 97.99
## CMP29 26 10 4 -1.3391 77.4837 356.2815 0 159.82
## CMP30 12 4 1 1.3101 44.9003 184.1821 0 63.60
## CMP31 12 4 1 1.3101 44.9003 181.1501 0 63.60
# Compute the FP4 fingerprints once and reuse the result for both inspections
# instead of fingerprinting the whole set twice.
# Fingerprint names listed for this call: "FP2", "FP3", "FP4", and "MACCS".
fp4_set <- fingerprintOB(sdfset, "FP4")
fp4_set[[1]]
## An instance of "FP" of type "unknown-3526"
## <<fingerprint>>
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... length: 512
class(fp4_set)
## [1] "FPset"
## attr(,"package")
## [1] "ChemmineR"
# Number of SMARTS matches per compound; the pattern targets rotatable bonds.
smartsSearchOB(sdfset,"[!$(*#*)&!D1]-!@[!$(*#*)&!D1]",uniqueMatches=FALSE) ## count rotatable bonds
## CMP1 CMP2 CMP3 CMP4 CMP5 CMP6 CMP7 CMP8 CMP9 CMP10 CMP11 CMP12 CMP13
## 10 24 22 24 20 18 14 20 32 36 36 36 8
## CMP14 CMP15 CMP16 CMP17 CMP18 CMP19 CMP20 CMP21 CMP22 CMP23 CMP24 CMP25 CMP26
## 38 38 34 18 24 12 16 16 20 18 12 24 18
## CMP27 CMP28 CMP29 CMP30 CMP31
## 10 38 22 10 10
exactMassOB(sdfset) # Monoisotopic (exact) mass of each compound in the set
## CMP1 CMP2 CMP3 CMP4 CMP5 CMP6 CMP7 CMP8
## 180.0423 407.1845 295.0692 355.1420 267.0743 331.0692 276.0246 326.1154
## CMP9 CMP10 CMP11 CMP12 CMP13 CMP14 CMP15 CMP16
## 319.2273 375.2171 375.2171 375.2171 151.0633 352.2250 376.2250 359.2222
## CMP17 CMP18 CMP19 CMP20 CMP21 CMP22 CMP23 CMP24
## 331.0692 218.1267 194.0579 286.0841 300.0634 342.0740 313.0950 256.0736
## CMP25 CMP26 CMP27 CMP28 CMP29 CMP30 CMP31
## 294.1287 252.0818 183.0611 376.2250 356.0743 184.0674 181.0456
# regenerateCoords: re-compute the 2D coordinates of each compound using
# Open Babel. This can sometimes improve the quality of the compound plots.
# Assignment uses `<-`, the conventional R assignment operator.
sdfset2 <- regenerateCoords(sdfset)
plot(sdfset2[1], regenCoords = TRUE, print = FALSE)


# Open Babel can also be used to plot compounds directly.
openBabelPlot(sdfset2[1],regenCoords=TRUE)

# generate3DCoords: generate 3D coordinates for compounds that only have 2D
# coordinates.
sdf3D <- generate3DCoords(sdfset2[1])
# Both exports share the <dir_path><date>_<Molecule_Name> stem; build it once so
# the two file names cannot drift apart (e.g. if the date changes mid-run). The
# name is read through header() rather than the raw S4 slot.
export_stem <- paste0(
  dir_path, Sys.Date(), "_", header(sdfset[[1]])[["Molecule_Name"]]
)
write.SDF(sdf3D[[1]], file = paste0(export_stem, "_3DD.sdf"), sig = TRUE)
write.SDF(sdfset2[[1]], file = paste0(export_stem, "_2DD.sdf"), sig = TRUE)
# canonicalize: compute a canonical atom numbering so that compounds with the
# same structure but different atom numberings can be compared properly.
canonicalSdf <- canonicalize(sdfset[1])
canonicalSdf[[1]]
## An instance of "SDF"
##
## <<header>>
## Molecule_Name
## "2244"
## Source
## " OpenBabel11142214443D"
## Comment
## ""
## Counts_Line
## " 21 21 0 0 0 0 0 0 0 0999 V2000"
##
## <<atomblock>>
## C1 C2 C3 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16
## H_1 -0.2555 -3.5916 -0.7337 0 0 0 0 0 0 0 0 0 0 0 0
## O_2 -0.6952 -2.7148 -0.7502 0 0 0 0 0 0 0 0 0 0 0 0
## ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
## H_20 4.2045 0.6969 -0.6924 0 0 0 0 0 0 0 0 0 0 0 0
## H_21 3.7105 -0.3659 0.6426 0 0 0 0 0 0 0 0 0 0 0 0
##
## <<bondblock>>
## C1 C2 C3 C4 C5 C6 C7
## 1 2 3 1 0 0 0 0
## 2 2 1 1 0 0 0 0
## ... ... ... ... ... ... ... ...
## 20 16 21 1 0 0 0 0
## 21 17 12 2 0 0 0 0
##
## <<datablock>> (22 data items)
## PUBCHEM_COMPOUND_CID
## "2244"
## PUBCHEM_CONFORMER_RMSD
## "0.6"
## PUBCHEM_CONFORMER_DIVERSEORDER
## "1 __ 11 __ 10 __ 3 __ 15 __ 17 __ 13 __ 5 __ 16 __ 7 __ 14 __ 9 __ 8 __ 4 __ 18 __ 6 __ 12 __ 2"
## PUBCHEM_MMFF94_PARTIAL_CHARGES
## "18 __ 1 -0.23 __ 10 -0.15 __ 11 0.63 __ 12 0.66 __ 13 0.06 __ 14 0.15 __ 15 0.15 __ 16 0.15 __ 17 0.15 __ 2 -0.65 __ 21 0.5 __ 3 -0.57 __ 4 -0.57 __ 5 0.08 __ 6 0.09 __ 7 -0.15 __ 8 -0.15 __ 9 -0.15"
##
## "..."
########################################### SMILES import
# Load the example SMILES collection and keep a working copy.
data("smisample")
smiset <- smisample
# First entry of the set (a single SMI object).
smiset[[1]]
## An instance of "SMI"
## [1] "O=C(NC1CCCC1)CN(c1cc2OCCOc2cc1)C(=O)CCC(=O)Nc1noc(c1)C"
# Write the first four SMILES to a dated file and read that exact file back.
# The path is kept in a variable rather than rediscovered with
# grep("sub.smi", ...): that pattern also matches "*_sub.smi" files left by runs
# on other dates, which would hand read.SMIset() several paths at once.
smi_path <- paste0(dir_path, Sys.Date(), "_", "sub.smi")
write.SMI(smiset[1:4], file = smi_path)
# Refreshed directory listing, kept for any later code that inspects it.
dir_path_name1 <- list.files(
  path = dir_path,
  full.names = TRUE,
  recursive = TRUE
)
smiset <- read.SMIset(smi_path)
smiset@smilist
## $`650001`
## [1] "O=C(NC1CCCC1)CN(c1cc2OCCOc2cc1)C(=O)CCC(=O)Nc1noc(c1)C"
##
## $`650002`
## [1] "O=c1[nH]c(=O)n(c2nc(n(CCCc3ccccc3)c12)NCCCO)C"
##
## $`650003`
## [1] "s1c(nnc1NC(=O)c1c(O)n2c3c(CCC2)cccc3c1=O)C(C)C"
##
## $`650004`
## [1] "S(c1n(c2c(n(c(=O)n(c2=O)C)C)n1)CC)CC(=O)NCCc1cc(OC)c(OC)cc1"
# IDs of the four re-imported SMILES entries.
cid(smiset[1:4])
## [1] "650001" "650002" "650003" "650004"
# SMILES strings of the first two entries as a one-column data frame; the
# column name is auto-generated from the unnamed expression.
data.frame(as.character(smiset[1:2]))
## as.character.smiset.1.2..
## 650001 O=C(NC1CCCC1)CN(c1cc2OCCOc2cc1)C(=O)CCC(=O)Nc1noc(c1)C
## 650002 O=c1[nH]c(=O)n(c2nc(n(CCCc3ccccc3)c12)NCCCO)C
######################################### Format interconversions
# Reload the example compounds; sdfset is overwritten here with a one-molecule
# subset for the conversion demo.
data(sdfsample)
sdfset <- sdfsample[1]
# SDF -> SMILES.
smiles <- sdf2smiles(sdfset)
# NOTE(review): in the recorded output the list name is "650001\r", i.e. the ID
# carries a trailing carriage return - strip it before using it as a key.
smiles@smilist
## $`650001\r`
## [1] "O=C(NC1CCCC1)CN(c1cc2OCCOc2cc1)C(=O)CCC(=O)Nc1noc(c1)C"
# SMILES -> SDF for a single SMILES string given inline.
sdf <- smiles2sdf("CC(=O)OC1=CC=CC=C1C(=O)O")
sdf@SDF
## [[1]]
## An instance of "SDF"
##
## <<header>>
## Molecule_Name
## ""
## Source
## " OpenBabel11142214442D"
## Comment
## ""
## Counts_Line
## " 13 13 0 0 0 0 0 0 0 0999 V2000"
##
## <<atomblock>>
## C1 C2 C3 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16
## C_1 -2.5981 -1.5 0 0 0 0 0 0 0 0 0 0 0 0 0
## C_2 -1.7321 -2 0 0 0 0 0 0 0 0 0 0 0 0 0
## ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
## O_12 0.866 -1.5 0 0 0 0 0 0 0 0 0 0 0 0 0
## O_13 1.7321 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## <<bondblock>>
## C1 C2 C3 C4 C5 C6 C7
## 1 1 2 1 0 0 0 0
## 2 2 3 2 0 0 0 0
## ... ... ... ... ... ... ... ...
## 12 11 12 2 0 0 0 0
## 13 11 13 1 0 0 0 0
##
## <<datablock>> (0 data items)
## character(0)
# ID slot of the converted set; empty here, presumably because the input
# SMILES string carried no name.
sdf@ID
## [1] ""
# Generic format conversion: SMILES in, SDF-formatted string out. 2D
# coordinates are also computed and included in the resulting SDF string.
sdfStr <- convertFormat("SMI","SDF","CC(=O)OC1=CC=CC=C1C(=O)O")
######################################### Search PubChem
# Fetch the SDF records for two PubChem compound IDs (presumably an online
# query, so network access is needed).
compounds <- pubchemCidToSDF(c(650001, 650002))
header(compounds[[1]]) # Header of the first returned record
## Molecule_Name
## "650001"
## Source
## " -OEChem-11142214442D"
## Comment
## ""
## Counts_Line
## " 61 64 0 0 0 0 0 0 0999 V2000"
header(compounds[[2]]) # Header of the second returned record
## Molecule_Name
## "650002"
## Source
## " -OEChem-11142214442D"
## Comment
## ""
## Counts_Line
## " 49 51 0 0 0 0 0 0 0999 V2000"
#?pubchemCidToSDF
# Get compound SDFs from PubChem by InChIKey.
# NOTE(review): the remark kept with this example reads "You should only have
# 2 SDF returned, 2 other not found", but only two keys are listed - it looks
# carried over from a longer example; confirm the expected result.
inchikeys <- c(
  "DFGRXLCBXZBMEZ-UHFFFAOYSA-N",
  "JIOQFGLCEPFQHB-UHFFFAOYSA-N"
)
#inchikey_query <- pubchemInchikey2sdf(inchikeys)
#inchikey_query$sdf_set ##2D SDF
#write.SDF(inchikey_query$sdf_set[[1]], file=paste0(dir_path,Sys.Date(),"_","sub_2.sdf"), sig=TRUE)
#compounds@SDF
#sdf3D = generate3DCoords(compounds[[1]])
#?generate3DCoords
#? pubchemInchikey2sdf
# successful queries
#inchikey_query_index <- inchikey_query$sdf_index[inchikey_query$sdf_index != 0]
#inchikey_query_index
# get CID of these queries
#inchikey_query_cid <- cid(inchikey_query$sdf_set[inchikey_query_index])
#names(inchikey_query_cid) <- names(inchikey_query_index)
#inchikey_query_cid
# Launch the "pubchemID2SDF" tool as a job for ID 2244.
job1 <- launchCMTool("pubchemID2SDF", 2244)
status(job1) # Job state at the time of the call
## [1] "FINISHED"
# NOTE(review): if the job can still be running at this point, check whether
# result() waits for completion - not visible from this script.
result1 <- result(job1)
result1[[1]] # First SDF of the returned set
## An instance of "SDF"
##
## <<header>>
## Molecule_Name
## "2244"
## Source
## " -OEChem-11142214442D"
## Comment
## ""
## Counts_Line
## " 21 21 0 0 0 0 0 0 0999 V2000"
##
## <<atomblock>>
## C1 C2 C3 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16
## O_1 3.732 -0.06 0 0 0 0 0 0 0 0 0 0 0 0 0
## O_2 6.3301 1.44 0 0 0 0 0 0 0 0 0 0 0 0 0
## ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
## H_20 1.69 -0.5969 0 0 0 0 0 0 0 0 0 0 0 0 0
## H_21 6.3301 2.06 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## <<bondblock>>
## C1 C2 C3 C4 C5 C6 C7
## 1 1 5 1 0 0 0 0
## 2 1 12 1 0 0 0 0
## ... ... ... ... ... ... ... ...
## 20 13 19 1 0 0 0 0
## 21 13 20 1 0 0 0 0
##
## <<datablock>> (34 data items)
## PUBCHEM_COMPOUND_CID PUBCHEM_COMPOUND_CANONICALIZED
## "2244" "1"
## PUBCHEM_CACTVS_COMPLEXITY PUBCHEM_CACTVS_HBOND_ACCEPTOR
## "212" "4"
##
## "..."
#REF https://www.bioconductor.org/packages/release/bioc/vignettes/ChemmineR/inst/doc/ChemmineR.html