Hackathon: CPD Salt Screening

This project contains our historical salt screening data obtained through the Solubility and Crystallization Workflow managed by Jun Qiu. The aim of this project is to draw insights from our historical data so that we can continue to improve the workflow.

Data Exploration

# Read the acid screen workbook; `acid` starts out as a copy of `acid_raw`
# and is the table the cleaning steps below write their results to.
acid_raw <- read_xlsx("Acid Screen Data Table.xlsx")
acid <- acid_raw
# Show the column names and types of the working copy.
str(acid)
## Classes 'tbl_df', 'tbl' and 'data.frame':    12121 obs. of  7 variables:
##  $ Screen #       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Well #         : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Compound Name  : chr  "BMT-385562-01" "BMT-385562-01" "BMT-385562-01" "BMT-385562-01" ...
##  $ Counter-ion    : chr  "HCl" "HCl" "HCl" "HCl" ...
##  $ Solvent        : chr  "MIBK" "i-PrOAc" "Toluene" "2-MeTHF" ...
##  $ Outcome (Yield): num  39 45 0 79 35 21 0 0 0 0 ...
##  $ Equivalence    : num  1 1 1 1 1 1 1 1 1 1 ...

Data cleaning - acids

Find typos and unify counter-ion and solvent names.

# Lower-case the counter-ion labels in place so that spellings differing only
# in case collapse into one value, then list the distinct labels that remain.
acid_raw[["Counter-ion"]] <- tolower(acid_raw[["Counter-ion"]])
unique(acid_raw[["Counter-ion"]])
##  [1] "hcl"                            "h2so4"                         
##  [3] "hbr"                            "h3po4"                         
##  [5] "1,5-naphthalenedisulfonic acid" "benzenesulfonic acid"          
##  [7] "methanesulfonic acid"           "p-toluenesulfonic acid"        
##  [9] "s-camphorsulfonic acid"         "2-naphthalenesulfonic acid"    
## [11] "ethanesulfonic acid"            "acetic acid"                   
## [13] "benzoic acid"                   "citric acid"                   
## [15] "dichloroacetic acid"            "fumaric acid"                  
## [17] "hbf4"                           "succinic acid"                 
## [19] "l-tartaric acid"                "maleic acid"                   
## [21] "dibenzoyl-d-tartaric acid"      "oxalic acid"                   
## [23] "r-mandelic acid"                "salicylic acid"                
## [25] "tfa"                            "4-nitrobenzoic acid"           
## [27] "1,5-napthlanenedisulfonic acid" "trichloroacetic acid"          
## [29] "d-tartaric acid"                "s-mandelic acid"               
## [31] "boric acid"                     "l-matic acid"                  
## [33] "h3po5"                          "h3po6"                         
## [35] "h3po7"                          "h3po8"                         
## [37] "h3po9"                          "glycolic acid"                 
## [39] "malonic acid"                   "l-malic acid"                  
## [41] "l-lactic acid"                  "adipic acid"                   
## [43] "propionic acid"                 "d-dibenzoyl-d-tartaric acid"   
## [45] "2-butynoic acid"                "benzensulfonic acid"
# Correct the misspelled counter-ion labels seen in the unique() listing and
# store the cleaned column on `acid`. The input is the lower-cased column of
# `acid_raw`, so every pattern is written in lower case.
acid$`Counter-ion` <- acid_raw$`Counter-ion` %>%
  str_replace(
    "1,5-napthlanenedisulfonic acid",
    "1,5-naphthalenedisulfonic acid"
  ) %>%
  # The raw data contains h3po5 .. h3po9, which are mapped back to h3po4.
  # Only those digits are matched: a bare "." would also rewrite any other
  # "h3po<x>" label (e.g. h3po3) into h3po4.
  str_replace("h3po[5-9]", "h3po4") %>%
  str_replace("benzensulfonic acid", "benzenesulfonic acid") %>%
  str_replace("l-matic acid", "l-malic acid")
# List the distinct labels again to confirm the corrections took effect.
unique(acid$`Counter-ion`)
##  [1] "hcl"                            "h2so4"                         
##  [3] "hbr"                            "h3po4"                         
##  [5] "1,5-naphthalenedisulfonic acid" "benzenesulfonic acid"          
##  [7] "methanesulfonic acid"           "p-toluenesulfonic acid"        
##  [9] "s-camphorsulfonic acid"         "2-naphthalenesulfonic acid"    
## [11] "ethanesulfonic acid"            "acetic acid"                   
## [13] "benzoic acid"                   "citric acid"                   
## [15] "dichloroacetic acid"            "fumaric acid"                  
## [17] "hbf4"                           "succinic acid"                 
## [19] "l-tartaric acid"                "maleic acid"                   
## [21] "dibenzoyl-d-tartaric acid"      "oxalic acid"                   
## [23] "r-mandelic acid"                "salicylic acid"                
## [25] "tfa"                            "4-nitrobenzoic acid"           
## [27] "trichloroacetic acid"           "d-tartaric acid"               
## [29] "s-mandelic acid"                "boric acid"                    
## [31] "l-malic acid"                   "glycolic acid"                 
## [33] "malonic acid"                   "l-lactic acid"                 
## [35] "adipic acid"                    "propionic acid"                
## [37] "d-dibenzoyl-d-tartaric acid"    "2-butynoic acid"

Data cleaning - solvents

# List every distinct solvent label in the raw data to spot typos and
# inconsistent notations for mixtures.
unique(acid_raw[["Solvent"]])
##  [1] "MIBK"                           "i-PrOAc"                       
##  [3] "Toluene"                        "2-MeTHF"                       
##  [5] "MeCN"                           "CPME"                          
##  [7] "IPA"                            "50% IPA/Water"                 
##  [9] "Heptane"                        "THF"                           
## [11] "MTBE"                           "Anisole"                       
## [13] "50% THF/Heptane"                "50% 2-MeTHF/Heptane"           
## [15] "50% MTBE/Heptane"               "50% Anisole/Heptane"           
## [17] "MeTHF"                          "50% DCM/Heptane"               
## [19] "EtOAc"                          "t-Amyl alcohol"                
## [21] "10% IPA/MTBE"                   "20% IPA/MTBE"                  
## [23] "90% IPA/water"                  "20% DCM:CPME"                  
## [25] "50% DCM:Heptane"                "33% MeTHF:Heptane"             
## [27] "33% IPA:Heptane"                "Chlorobenzene"                 
## [29] "80% MeTHF/20% Heptane"          "50% MeTHF/50% Heptane"         
## [31] "20% MeTHF/80% Heptane"          "80% i-PrOAc/20% Heptane"       
## [33] "50% i-PrOAc/50% Heptane"        "20% i-PrOAc/80% Heptane"       
## [35] "Acetone"                        "EtOH"                          
## [37] "MeOH"                           "1,2-DCE"                       
## [39] "DCM"                            "1:1 DCM/MeOH"                  
## [41] "1:1 DCM/EtOAc"                  "1:1 DCM/MeCN"                  
## [43] "1:1 DCM/IPA"                    "25% 1:1 Formamide/NMP /Water"  
## [45] "25% 1:1 Formamide/NMP /IPA"     "25% 1:1 Formamide/NMP /MIBK"   
## [47] "25% 1:1 Formamide/NMP /MeCN"    "25% 1:1 Formamide/NMP /i-PrOAc"
## [49] "25% 1:1 Formamide/NMP /CPME"    "25% 1:1 Formamide/NMP /Anisole"
## [51] "25% 1:1 Formamide/NMP /Toluene" "80% Water"                     
## [53] "60% Water"                      "80% Heptane"                   
## [55] "60% Heptane"                    "20% Heptane"                   
## [57] "water"                          "50% IPA/water"                 
## [59] "50% MeCN/water"                 "50% DMF/water"                 
## [61] "50% NMP/water"                  "50% Dioxane/water"             
## [63] "50% acetic acid/water"          "50% propionic acid/water"      
## [65] "Cylcohexane"                    "95% EtOH/water"                
## [67] "75% IPA/water"
# Normalise the solvent labels and store them on `acid`. The steps run in
# order and each one rewrites only the first match in a label (str_replace):
#   1. "2-MeTHF"     -> "MeTHF"        merge with the existing "MeTHF" label
#   2. "Cylcohexane" -> "Cyclohexane"  typo
#   3. "/NN% "       -> "/"            "80% MeTHF/20% Heptane" -> "80% MeTHF/Heptane"
#   4. ":"           -> "/"            "20% DCM:CPME" -> "20% DCM/CPME"
#   5. "1/1"         -> "50%"          "1:1 DCM/MeOH" -> "50% DCM/MeOH" (after step 4)
acid$Solvent <- acid_raw$Solvent %>%
  str_replace("2-MeTHF", "MeTHF") %>%
  str_replace("Cylcohexane", "Cyclohexane") %>%
  str_replace("/\\d\\d%\\s", "/") %>%
  str_replace(":", "/") %>%
  str_replace("1/1", "50%")
# Drop the rows whose label still carries two percentages in a row: the
# "25% 1:1 Formamide/NMP /..." mixtures have become "25% 50% ..." above.
# str_detect() returns NA for a missing solvent, and indexing rows with a
# logical NA yields an all-NA row; `%in% TRUE` turns NA into FALSE so only
# rows that positively match the pattern are removed.
acid <- acid[!(str_detect(acid$Solvent, "..% ..%") %in% TRUE), ]
# NOTE(review): labels that differ only in case (e.g. "50% IPA/Water" and
# "50% IPA/water") are still distinct after this step - confirm whether they
# should be merged as well.
unique(acid$Solvent)
##  [1] "MIBK"                     "i-PrOAc"                 
##  [3] "Toluene"                  "MeTHF"                   
##  [5] "MeCN"                     "CPME"                    
##  [7] "IPA"                      "50% IPA/Water"           
##  [9] "Heptane"                  "THF"                     
## [11] "MTBE"                     "Anisole"                 
## [13] "50% THF/Heptane"          "50% MeTHF/Heptane"       
## [15] "50% MTBE/Heptane"         "50% Anisole/Heptane"     
## [17] "50% DCM/Heptane"          "EtOAc"                   
## [19] "t-Amyl alcohol"           "10% IPA/MTBE"            
## [21] "20% IPA/MTBE"             "90% IPA/water"           
## [23] "20% DCM/CPME"             "33% MeTHF/Heptane"       
## [25] "33% IPA/Heptane"          "Chlorobenzene"           
## [27] "80% MeTHF/Heptane"        "20% MeTHF/Heptane"       
## [29] "80% i-PrOAc/Heptane"      "50% i-PrOAc/Heptane"     
## [31] "20% i-PrOAc/Heptane"      "Acetone"                 
## [33] "EtOH"                     "MeOH"                    
## [35] "1,2-DCE"                  "DCM"                     
## [37] "50% DCM/MeOH"             "50% DCM/EtOAc"           
## [39] "50% DCM/MeCN"             "50% DCM/IPA"             
## [41] "80% Water"                "60% Water"               
## [43] "80% Heptane"              "60% Heptane"             
## [45] "20% Heptane"              "water"                   
## [47] "50% IPA/water"            "50% MeCN/water"          
## [49] "50% DMF/water"            "50% NMP/water"           
## [51] "50% Dioxane/water"        "50% acetic acid/water"   
## [53] "50% propionic acid/water" "Cyclohexane"             
## [55] "95% EtOH/water"           "75% IPA/water"