# Load the two input tables from dir_path and inner-join them on Mapping.ID.
#
# Deliberately no rm(list = ls()) here: clearing the global environment is a
# destructive side effect for anyone who source()s this script, and nothing
# below depends on the workspace being empty.
library(openxlsx)

############################## input data
# Folder holding the inputs. The trailing separator is intentional: the path is
# concatenated directly with the output file name via paste0() further down.
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\"
dir_path_name <- list.files(dir_path, full.names = TRUE, recursive = FALSE)
#dir_path_name

# fixed = TRUE matches the literal file name; as a regex the "." would match
# any character. Require exactly one hit so that a missing or ambiguous input
# fails here with a clear message instead of inside read.csv().
csv_file <- grep("aaa.csv", dir_path_name, value = TRUE, fixed = TRUE)
if (length(csv_file) != 1L) {
  stop("Expected exactly one file matching 'aaa.csv' in ", dir_path,
       ", found ", length(csv_file), call. = FALSE)
}
data_1 <- read.csv(csv_file, header = TRUE, stringsAsFactors = FALSE)
#dim(data_1) #[1] 247   3
#head(data_1,2)
#length(unique(data_1$Mapping.ID)) #[1] 247

# Same literal-match / single-hit check for the Excel workbook.
xlsx_file <- grep("bbb.xlsx", dir_path_name, value = TRUE, fixed = TRUE)
if (length(xlsx_file) != 1L) {
  stop("Expected exactly one file matching 'bbb.xlsx' in ", dir_path,
       ", found ", length(xlsx_file), call. = FALSE)
}
data_2 <- read.xlsx(xlsx_file, sheet = 1)
#dim(data_2) #[1] 268  40
#View(head(data_2))

# Only the first two columns of the workbook are kept; one of them must be
# Mapping.ID, which is the join key below.
data_2 <- data_2[, 1:2]
#colnames(data_2)

# Inner join (all = FALSE): rows without a partner in the other table are
# dropped.
# NOTE(review): the recorded row count (268) exceeds the 247 unique IDs of
# data_1, so data_2 presumably repeats some Mapping.IDs -- confirm this
# fan-out is intended.
data_3 <- merge(data_1, data_2, by = "Mapping.ID", all = FALSE)
#dim(data_3) #[1] 268   4
#head(data_3,3)

# Work on the first three rows only. head() never invents all-NA rows when
# fewer than three rows are available, unlike data_3[1:3, ].
data_3 <- head(data_3, 3L)
data_3
##       Mapping.ID Sample.Name                                      Primary.MOA
## 1 AABFWJDLCCDJJN      SP 141 Tumour suppressor p53/oncoprotein Mdm2 Modulator
## 2 ACCFLVVUVBJNGT  LY-3023414                                   PI3K Inhibitor
## 3 AGYCYVPAEQVWJT   MPT-0B098                 Tubulin polymerization Inhibitor
##         Sample.ID
## 1 NCGC00387481-01
## 2 NCGC00485487-01
## 3 NCGC00510350-01
##################################################################
library(webchem)
## Warning: package 'webchem' was built under R version 4.0.3
## Help us improve the package
## Fill out our 2020 survey at https://forms.gle/V7dfGGn73dkesn5L6
# Look up each Sample.ID in PubChem and record its CID, SMILES and first
# synonym in the new columns cid / smile / name of data_3.
#
# The result columns are created up front, filled with NA. Without this, the
# first `data_3$col[i] <- value` assignment creates the column from a length-1
# vector, which the data.frame recycles to every row; any later row whose
# lookup raised an error would then silently keep row 1's values.
data_3$cid <- rep(NA_character_, nrow(data_3))
data_3$smile <- rep(NA_character_, nrow(data_3))
data_3$name <- rep(NA_character_, nrow(data_3))

# seq_len() so that an empty data frame yields zero iterations (1:0 would run
# the body for i = 1 and i = 0).
for (i in seq_len(nrow(data_3))) {
  #print(i)
  #i = 1
  tryCatch({
    # match = "first" replaces the deprecated `first = TRUE` argument and keeps
    # a single CID per query.
    data_3$cid[i] <- get_cid(data_3$Sample.ID[i], match = "first")$cid

    # An NA CID means PubChem had no hit; smile / name then stay NA.
    if (!is.na(data_3$cid[i])) {
      data_3$smile[i] <- as.character(
        pc_prop(data_3$cid[i], properties = "CanonicalSMILES")$CanonicalSMILES[1]
      )
      data_3$name[i] <- as.character(
        pc_synonyms(data_3$cid[i], from = "cid", match = "first")
      )
    }
  }, error = function(e) {
    # Best effort: report the failure on the console and append it to
    # error.txt in the working directory, then carry on with the next row.
    cat("ERROR :", conditionMessage(e), "\n")
    cat("ERROR :", conditionMessage(e), file = "error.txt", append = TRUE, "\n")
  })
}
## `first = TRUE` is deprecated. Use `match = 'first'` instead
## Querying NCGC00387481-01. OK (HTTP 200).
## Querying. OK (HTTP 200).
## Querying 59620153. OK (HTTP 200).
##  Multiple found. Returning first.
## `first = TRUE` is deprecated. Use `match = 'first'` instead
## Querying NCGC00485487-01. OK (HTTP 200).
## Querying. OK (HTTP 200).
## Querying 57519748. OK (HTTP 200).
##  Multiple found. Returning first.
## `first = TRUE` is deprecated. Use `match = 'first'` instead
## Querying NCGC00510350-01. Not Found (HTTP 404).
#View(data_3)
# Sanity check of the enriched table: show its class, then auto-print its
# contents. The `##` lines that follow each expression are the console output
# recorded from an earlier run.
class(data_3)
## [1] "data.frame"
data_3
##       Mapping.ID Sample.Name                                      Primary.MOA
## 1 AABFWJDLCCDJJN      SP 141 Tumour suppressor p53/oncoprotein Mdm2 Modulator
## 2 ACCFLVVUVBJNGT  LY-3023414                                   PI3K Inhibitor
## 3 AGYCYVPAEQVWJT   MPT-0B098                 Tubulin polymerization Inhibitor
##         Sample.ID      cid
## 1 NCGC00387481-01 59620153
## 2 NCGC00485487-01 57519748
## 3 NCGC00510350-01     <NA>
##                                                         smile      name
## 1            COC1=CC2=C(C=C1)NC3=C2C=CN=C3C4=CC=CC5=CC=CC=C54    SP-141
## 2 CC(CN1C2=C3C=C(C=CC3=NC=C2N(C1=O)C)C4=CC(=CN=C4)C(C)(C)O)OC LY3023414
## 3                                                        <NA>      <NA>
# Save the enriched table next to the inputs. The file name is prefixed with
# today's date; dir_path already ends in a path separator, so plain
# concatenation yields the full path.
out_file <- paste0(dir_path, Sys.Date(), "-", "name_ncgc_all.csv")
write.csv(data_3, file = out_file, row.names = FALSE)
##REF https://github.com/ropensci/webchem