library(readr)
OA_SerieA_top50 <- read_csv("Results/OA_SerieA_top50.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   index = col_double(),
##   UniProt = col_character(),
##   Prob = col_double(),
##   `Unnamed: 0` = col_double(),
##   Entry.name = col_character(),
##   Gene = col_character(),
##   Gene.synonym = col_character(),
##   Ensembl = col_character()
## )

Figure

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v dplyr   1.0.2
## v tibble  3.0.4     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## # A tibble: 50 x 9
##    index UniProt  Prob `Unnamed: 0` Entry.name  Gene  Gene.synonym Ensembl Label
##    <dbl> <chr>   <dbl>        <dbl> <chr>       <chr> <chr>        <chr>   <chr>
##  1  1504 Q9GZP0   78.2         9325 PDGFD_HUMAN PDGFD IEGF, MSTP0~ ENSG00~ F001 
##  2  1582 Q9NRA1   55.8         9727 PDGFC_HUMAN PDGFC fallotein, ~ ENSG00~ F002 
##  3  2345 Q15113   41.7        16073 PCOC1_HUMAN PCOL~ PCPE, PCPE1  ENSG00~ F003 
##  4  2394 P98066   38.2        16573 TSG6_HUMAN  TNFA~ TSG-6, TSG6  ENSG00~ F004 
##  5  1875 Q9UKZ9   37.6        11906 PCOC2_HUMAN PCOL~ PCPE2        ENSG00~ F005 
##  6   804 Q5VXM1   32.0         5044 CDCP2_HUMAN CDCP2 NaN          ENSG00~ F006 
##  7   731 P09038   29.6         4609 FGF2_HUMAN  FGF2  FGFB         ENSG00~ F007 
##  8  2303 P01137   25.9        15657 TGFB1_HUMAN TGFB1 CED, DPD1, ~ ENSG00~ F008 
##  9  1181 Q8TDF5   23.3         7270 NETO1_HUMAN NETO1 BCTL1, BTCL1 ENSG00~ F009 
## 10  2329 P10600   23.0        15905 TGFB3_HUMAN TGFB3 ARVD, ARVD1  ENSG00~ F010 
## # ... with 40 more rows
## # A tibble: 50 x 2
##    `%Edu FC (Pred)` Label
##               <dbl> <chr>
##  1             78.2 F001 
##  2             55.8 F002 
##  3             41.7 F003 
##  4             38.2 F004 
##  5             37.6 F005 
##  6             32.0 F006 
##  7             29.6 F007 
##  8             25.9 F008 
##  9             23.3 F009 
## 10             23.0 F010 
## # ... with 40 more rows

Formate table

#Load the libraries

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(dplyr)
library(formattable)
library(tidyr)

customGreen0 = "#DeF7E9"
customGreen = "#71CA97"
customRed = "#ff7f7f"

a = formattable(head(temp_df, 15), 
    align =c("c","l"), 
    list(`Label` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
         `%Edu FC (Pred)` = color_tile(customGreen0, customGreen)
))

Plot figure

library(ggplot2)

p <- ggplot(data = temp_df, aes(`Label`, `%Edu FC (Pred)`)) +
        geom_point() +
        scale_x_discrete(guide = guide_axis(angle = 90)) +
        theme_classic()
p

ggsave(plot=p, filename="result.png", width=7, height=4)

OA Figure

library(readr)
library(tidyverse)

setwd("~/Juvena")

raw <- read_csv("Results/AE_NN_OA_072620_E5 - AE_NN_OA_072620_E5.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_double(),
##   UniProt = col_character(),
##   Prob = col_double(),
##   Indicator = col_character(),
##   `Unnamed: 0` = col_double(),
##   Entry.name = col_character(),
##   Gene = col_character(),
##   Gene.synonym = col_character(),
##   Ensembl = col_character()
## )
raw2 <- read_csv("Results/AE_NN_OA_071520_full - AE_NN_OA_071520_full.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_double(),
##   UniProt = col_character(),
##   Prob = col_double(),
##   Indicator = col_character(),
##   `Unnamed: 0` = col_double(),
##   Entry.name = col_character(),
##   Gene = col_character(),
##   Gene.synonym = col_character(),
##   Ensembl = col_character()
## )
# Generate ground truth
temp <- raw %>% select(c(3, 4, 7))
temp_ground_truth <-  temp[temp$Indicator %in% c("True Positive", "True Negative"),]
label_ls <- c(1:length(temp_ground_truth$Prob))
temp_ground_truth$Label <- sprintf("L%03d", label_ls)
colnames(temp_ground_truth)[1] <- "Hit Candidate Probability"
temp_ground_truth
## # A tibble: 25 x 4
##    `Hit Candidate Probability` Indicator     Gene  Label
##                          <dbl> <chr>         <chr> <chr>
##  1                       1     True Positive SHH   L001 
##  2                       1     True Positive FGF9  L002 
##  3                       1     True Positive FGF8  L003 
##  4                       1     True Positive IHH   L004 
##  5                       1     True Positive FGF18 L005 
##  6                       1     True Positive FGF2  L006 
##  7                       1.00  True Positive WNT5A L007 
##  8                       1.00  True Positive WNT3A L008 
##  9                       1.00  True Positive PTHLH L009 
## 10                       0.997 True Positive DLL1  L010 
## # ... with 15 more rows
# generated prediction
temp <- raw2 %>% select(c(3, 4, 7))
temp <-  temp[temp$Indicator=="Secreted",]

temp <- temp[order(-temp$Prob),]
temp_top_50 <- temp[c(1:50),]

candidate_ls <- c(1:50)

temp_top_50$Label <- sprintf("F%03d", candidate_ls)
temp_top_50
## # A tibble: 50 x 4
##     Prob Indicator Gene  Label
##    <dbl> <chr>     <chr> <chr>
##  1 0.999 Secreted  FGF1  F001 
##  2 0.999 Secreted  DHH   F002 
##  3 0.998 Secreted  FGF17 F003 
##  4 0.998 Secreted  FGF22 F004 
##  5 0.997 Secreted  FGF4  F005 
##  6 0.997 Secreted  FGF6  F006 
##  7 0.997 Secreted  FGF16 F007 
##  8 0.997 Secreted  FGF5  F008 
##  9 0.997 Secreted  FGF3  F009 
## 10 0.996 Secreted  FGF20 F010 
## # ... with 40 more rows
temp_top_50 = select(temp_top_50, Prob, Label)
colnames(temp_top_50) <- c("Hit Candidate Probability", "Label")
temp_top_50
## # A tibble: 50 x 2
##    `Hit Candidate Probability` Label
##                          <dbl> <chr>
##  1                       0.999 F001 
##  2                       0.999 F002 
##  3                       0.998 F003 
##  4                       0.998 F004 
##  5                       0.997 F005 
##  6                       0.997 F006 
##  7                       0.997 F007 
##  8                       0.997 F008 
##  9                       0.997 F009 
## 10                       0.996 F010 
## # ... with 40 more rows
library(data.table)
library(dplyr)
library(formattable)
library(tidyr)

customGreen0 = "#DeF7E9"
customGreen = "#71CA97"
customRed = "#ff7f7f"

a = formattable(head(temp_top_50, 15), 
    align =c("c","l"), 
    list(`Label` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
         `Hit Candidate Probability` = color_tile(customGreen0, customGreen)
))
library(ggplot2)
library(ggrepel)

p <- ggplot(data = temp_ground_truth, aes(`Label`, `Hit Candidate Probability`)) +
        geom_point(aes(color = `Indicator`)) +
        scale_color_manual(values = c("red", "green")) +
        scale_x_discrete(guide = guide_axis(angle = 90)) +
        # geom_text(aes(label = `Gene`), hjust = 0, nudge_x = 0.05) +
        theme_classic()
p

p <- ggplot(data = head(temp_top_50, 25), aes(`Label`, `Hit Candidate Probability`)) +
        geom_point() +
        scale_x_discrete(guide = guide_axis(angle = 90)) +
        theme_classic()
p  

OA in Vitro

library(readr)
CA016_combined <- read_csv("Results/CA016_combines_dosed.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   well = col_character(),
##   FID = col_character(),
##   Condition = col_character(),
##   FactorName = col_character(),
##   Category = col_character(),
##   Cell.ID = col_character(),
##   exp_title = col_character(),
##   plate = col_character(),
##   combo = col_logical(),
##   use_factordb = col_logical(),
##   timepoint = col_character(),
##   emy_fc = col_logical(),
##   PercentArea_eMyHC = col_logical(),
##   FileName_Single.Channel.Blue = col_character(),
##   FileName_Single.Channel.Red = col_character(),
##   SupID = col_logical(),
##   Dose = col_character(),
##   Unit = col_character()
## )
## i Use `spec()` for the full column specifications.
CA015_combined <- read_csv("Results/CA015_combines_dosed.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   well = col_character(),
##   FID = col_character(),
##   Condition = col_character(),
##   `Unnamed: 4` = col_logical(),
##   FactorName = col_character(),
##   Category = col_character(),
##   Cell.ID = col_character(),
##   exp_title = col_character(),
##   plate = col_character(),
##   combo = col_character(),
##   use_factordb = col_logical(),
##   timepoint = col_character(),
##   emy_fc = col_logical(),
##   PercentArea_eMyHC = col_logical(),
##   FileName_Single.Channel.Blue = col_character(),
##   FileName_Single.Channel.Red = col_character(),
##   SupID = col_logical(),
##   Dose = col_character(),
##   Unit = col_character()
## )
## i Use `spec()` for the full column specifications.
usable_column <- c("FactorName", "edu_count_fc", "Value")

CA016_combined <- CA016_combined[usable_column]
CA015_combined <- CA015_combined[usable_column]

overall_results <- rbind(CA015_combined, CA016_combined)

# overall_results$edu_count_fc

library(tidyverse)
overall_results_rm_na <- na.omit(overall_results)
result_summary <- overall_results_rm_na %>% group_by(`FactorName`, `Value`) %>% summarise(max_val = max(edu_count_fc))
## `summarise()` regrouping output by 'FactorName' (override with `.groups` argument)
range(result_summary$Value)
## [1]  0.01 10.00
result_summary$Value <- as.factor(result_summary$Value)
library(ggplot2)
# overall_results_rm_na

p <- ggplot(result_summary, aes(x = `FactorName`, y = `Value`, fill = `Value`)) +
    geom_bar(stat = "identity") +
    # theme_classic() +
    labs(
        x = "Factors",
        y = "Edu Count Fc",
        title = paste("")
    )
p

result_summary_sorted <- result_summary[order(-result_summary$max_val),]

result_summary_sorted
## # A tibble: 101 x 3
## # Groups:   FactorName [101]
##    FactorName Value max_val
##    <chr>      <fct>   <dbl>
##  1 PDGFD      0.5    257.  
##  2 FGF1       0.5     95.9 
##  3 FGF18      0.1     16.1 
##  4 PAMR1      0.2     11.6 
##  5 TGFBI      0.2     11.0 
##  6 FSTL1      1        7.86
##  7 NDNF       0.5      7.43
##  8 PLAU       2        6.57
##  9 Anastellin 1.2      6.43
## 10 SFRP1      1        5.84
## # ... with 91 more rows
# Series_A_Chondrocyte_Prediction_vs_in_vitro <- read_csv("Juvena/Results/Series_A_Chondrocyte _Prediction_vs_in_vitro.csv")
# 
# candidate_ls <- c(1:15)
# 
# Series_A_Chondrocyte_Prediction_vs_in_vitro$Label <- sprintf("F%03d", candidate_ls)
# Series_A_Chondrocyte_Prediction_vs_in_vitro
# 
# edu_min <- range(Series_A_Chondrocyte_Prediction_vs_in_vitro$`%edu_fc`)[1]
# edu_max <- range(Series_A_Chondrocyte_Prediction_vs_in_vitro$`%edu_fc`)[2]
# 
# Series_A_Chondrocyte_Prediction_vs_in_vitro$`%edu_fc_scalled` <- (Series_A_Chondrocyte_Prediction_vs_in_vitro$`%edu_fc`-edu_min)/(edu_max - edu_min)
# 
# 
# 
# prob_min <- range(Series_A_Chondrocyte_Prediction_vs_in_vitro$Probability)[1]
# prob_max <- range(Series_A_Chondrocyte_Prediction_vs_in_vitro$Probability)[2]
# 
# Series_A_Chondrocyte_Prediction_vs_in_vitro$`Prob_scaled` <- (Series_A_Chondrocyte_Prediction_vs_in_vitro$Probability - prob_min)/(prob_max - prob_min)
# Series_A_Chondrocyte_Prediction_vs_in_vitro <- data.frame(apply(Series_A_Chondrocyte_Prediction_vs_in_vitro, 4, unclass))
# library(hrbrthemes)
# p <- ggplot(Series_A_Chondrocyte_Prediction_vs_in_vitro, aes(x=Label)) +
#                 geom_bar( aes(y=Prob_scaled), stat="identity", size=2, fill="grey0", alpha=.2) + 
#                 geom_bar( aes(y=`%edu_fc_scalled`), stat="identity", size=2, fill=rgb(0.2, 0.6, 0.9, 1)) +
#           
#           scale_y_continuous(
#             
#             # Features of the first axis
#             name = expression("Normalized Hit Candidate Probability"),
#             
#             # Add a second axis and specify its features
#             sec.axis = sec_axis(~.*(edu_max-edu_min)+edu_min, name=expression(bold("In Vitro %Edu Fc")))
#           ) + 
#           
#           # theme_ipsum() +
#           theme_classic() +
#         
#           theme(
#               axis.title.x = element_text(size=20),
#             axis.title.y = element_text(color = "grey0", size=20),
#             axis.title.y.right = element_text(color = rgb(0.2, 0.6, 0.9, 1), size=20),
#             axis.text.x = element_text(size = 18, angle = 90),
#             axis.text.y = element_text(size = 18),
#             axis.text.y.right = element_text(size = 18)
#             )
#         + 
# 
#           ggtitle("QSAR prediction vs. in vitro")
# p

Figure for prediction and actual results overlapping

“QSAR+ model predicted hit candidates vs. in vitro results from proliferation analysis in human chondrocyte”

library(ggplot2)
library(readr)
library(scales)
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:formattable':
## 
##     comma, percent, scientific
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
Chondrocyte_QSAR_candidates <- read_csv("Results/Chondrocyte_QSAR_candidates.csv")
## Warning: Missing column names filled in: 'X1' [1], 'X10' [10]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_double(),
##   UniProt = col_character(),
##   Prob = col_double(),
##   Indicator = col_character(),
##   `Unnamed: 0` = col_double(),
##   Entry.name = col_character(),
##   Gene = col_character(),
##   Gene.synonym = col_character(),
##   Ensembl = col_character(),
##   X10 = col_logical(),
##   Rank = col_double(),
##   Protein = col_character(),
##   Probability = col_double(),
##   FC_EdU = col_logical(),
##   Condition = col_character(),
##   edu_fc = col_double()
## )
tested_protein <- Chondrocyte_QSAR_candidates[c('Condition', 'edu_fc')]

predicted_protein <- na.omit(Chondrocyte_QSAR_candidates[c('Protein', 'Probability')])


temp_cols <- data.frame(do.call("rbind", strsplit(as.character(tested_protein$Condition), ":", fixed = TRUE)))
tested_protein$Protein <- temp_cols$X1

tested_protein_correct <- merge(x=tested_protein, y=predicted_protein, by="Protein", all.x=FALSE, all.y=TRUE)

# tested_protein_correct[sort(tested_protein_correct$edu_fc),]

tested_protein_correct_narm_original <- na.omit(tested_protein_correct) # 70 rows

tested_protein_correct_narm_original <- tested_protein_correct_narm_original[order(-tested_protein_correct_narm_original$Probability),]
# temp_df$ground_truth <- temp_df$edu_fc >=2.0
# 
# cut_off_3_count <- sum(temp_df$edu_fc >=3.0, na.rm = TRUE)
# cut_off_2_count <- sum(temp_df$edu_fc >=2.0, na.rm = TRUE)
# cut_off_1_5_count <- sum(temp_df$edu_fc >=1.5, na.rm = TRUE)
# length(temp_df$Protein)
# 
# temp_df
# 
# temp_df$cutoff_3 <- 'Off'
# temp_df$cutoff_3[temp_df$edu_fc >=3.0] <- 'overlap'
# 
# temp_df$cutoff_2 <- 'Off'
# temp_df$cutoff_2[temp_df$edu_fc >=2.0] <- 'overlap'
# 
# temp_df$cutoff_1_5 <- 'Off'
# temp_df$cutoff_1_5[temp_df$edu_fc >=1.5] <- 'overlap'

tested_protein_correct_narm <- head(tested_protein_correct_narm_original, 25)
tested_protein_correct_narm$edu_fc[tested_protein_correct_narm$edu_fc >5] <- 5

range(tested_protein_correct_narm$Probability)
## [1] 0.9106392 1.0000000
edu_min <- range(tested_protein_correct_narm$`edu_fc`)[1]
edu_max <- range(tested_protein_correct_narm$`edu_fc`)[2]

tested_protein_correct_narm$edu_fc_scalled <- (tested_protein_correct_narm$edu_fc-edu_min)/(edu_max - edu_min)



candidate_ls <- c(1:length(tested_protein_correct_narm$Protein))
tested_protein_correct_narm$Label <- sprintf("F%03d", candidate_ls)


tested_protein_correct_narm[tested_protein_correct_narm$edu_fc>=1.5 & tested_protein_correct_narm$edu_fc>=1.6,]
##    Protein         Condition   edu_fc Probability edu_fc_scalled Label
## 7     APOB     APOB: 1 ug/mL 1.603599   1.0000000      0.2572685  F001
## 22    FGF1   FGF1: 0.5 ug/mL 5.000000   1.0000000      1.0000000  F002
## 24   FGF18 FGF18: 0.10 ug/mL 4.557981   1.0000000      0.9033385  F003
## 26  FGFBP3   FGFBP3: 1 ug/mL 1.664598   1.0000000      0.2706080  F004
## 41   LAMA1    LAMA1: 2 ug/mL 2.340812   0.9998283      0.4184837  F010
## 45    NDNF   NDNF: 0.5 ug/mL 5.000000   0.9991577      1.0000000  F013
## 3     AGRN     AGRN: 2 ug/mL 2.711375   0.9973943      0.4995191  F018
## 51  PDGFRL   PDGFRL: 3 ug/mL 3.727818   0.9867464      0.7217968  F021
## 20  DRAXIN   DRAXIN: 1 ug/mL 1.860166   0.9845426      0.3133752  F022
## 59   SFRP2    SFRP2: 1 ug/mL 2.753778   0.9747477      0.5087918  F024
h_line_mark <- (1.5-edu_min)/(edu_max - edu_min)

tested_protein_correct_narm$actual_label <- "Negative"
tested_protein_correct_narm$actual_label[tested_protein_correct_narm$edu_fc>=1.5] <- "Positive"


tested_protein_correct_narm$pred_label <- "Negative"
tested_protein_correct_narm$pred_label[tested_protein_correct_narm$Probability>=0.244439] <- "Positive"
# blank_theme <- theme_minimal()+
#   theme(
#   axis.title.x = element_blank(),
#   axis.title.y = element_blank(),
#   panel.border = element_blank(),
#   panel.grid=element_blank(),
#   axis.ticks = element_blank(),
#   plot.title=element_text(size=14, face="bold")
#   )
# 
# ggplot(temp_df, aes(x=factor(1), fill=cutoff_1_5))+
#   geom_bar(width = 1)+
#   coord_polar("y") +
#   scale_fill_brewer("Blues") + blank_theme +
#   theme(axis.text.x=element_blank())+
#   geom_text(aes(y = value/3 + c(0, cumsum(value)[-length(value)]), 
#                 label = percent(value/100)), size=5)




library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
p <- ggplot(tested_protein_correct_narm, aes(x=Label)) +
                geom_bar( aes(y=Probability), stat="identity", size=2, fill="grey0", alpha=.2) + 
                geom_bar( aes(y=edu_fc_scalled), stat="identity", size=2, fill=rgb(0.2, 0.6, 0.9, 1), alpha=.6) +
          
          scale_y_continuous(
            
            # Features of the first axis
            name = expression("Hit Candidate Probability"),
            
            # Add a second axis and specify its features
            sec.axis = sec_axis(~.*(edu_max-edu_min)+edu_min, name=expression(bold("In Vitro Edu")))
          ) + 
          
          # theme_ipsum() +
          theme_classic() +
        
          theme(
              axis.title.x = element_text(size=20),
            axis.title.y = element_text(color = "grey0", size=20),
            axis.title.y.right = element_text(color = rgb(0.2, 0.6, 0.9, 1), size=20),
            axis.text.x = element_text(size = 10, angle = 90),
            axis.text.y = element_text(size = 18),
            axis.text.y.right = element_text(size = 18)
            ) +
          geom_hline(yintercept = h_line_mark, color='red')
        
p

# confusion matrix.
library(ggplot2)
library(readr)
library(scales)
library(tidyverse)

Chondrocyte_QSAR_candidates <- read_csv("Results/Chondrocyte_QSAR_candidates - Sheet3.csv")
## Warning: Missing column names filled in: 'X7' [7], 'X8' [8], 'X9' [9],
## 'X10' [10]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   HTS_Rank = col_double(),
##   QSAR_Rank = col_double(),
##   Protein = col_character(),
##   Probability = col_double(),
##   `%edu_fc` = col_double(),
##   `Log10(edu_FC)` = col_double(),
##   X7 = col_character(),
##   X8 = col_character(),
##   X9 = col_character(),
##   X10 = col_character()
## )
tested_protein <- Chondrocyte_QSAR_candidates[c('Protein', 'Probability', '%edu_fc')]
tested_protein <- tested_protein[order(-tested_protein$Probability) ,]

# tested_protein <- head(tested_protein, 25)
tested_protein$edu_fc <- tested_protein$`%edu_fc`
tested_protein$edu_fc[tested_protein$`%edu_fc` >20] <- 20

range(tested_protein$edu_fc)
## [1]  0.261 20.000
edu_min <- min(tested_protein$edu_fc)
edu_max <- max(tested_protein$edu_fc)
# tested_protein$edu_fc_scalled <- (tested_protein$edu_fc - edu_min)/(edu_max - edu_min)

# tested_protein$edu_fc[tested_protein$edu_fc>20] <-20

tested_protein$edu_fc_scaled <- rescale(tested_protein$edu_fc, to=c(0,1))


range(tested_protein$edu_fc_scaled)
## [1] 0 1
candidate_ls <- c(1:length(tested_protein$Protein))
tested_protein$Label <- sprintf("F%03d", candidate_ls)

tested_protein[tested_protein$`%edu_fc`>20,] # doing labeling the outliers
## # A tibble: 3 x 6
##   Protein Probability `%edu_fc` edu_fc edu_fc_scaled Label
##   <chr>         <dbl>     <dbl>  <dbl>         <dbl> <chr>
## 1 FGF1       1.00e+ 0      21.1     20             1 F001 
## 2 PDGFD      7.18e- 3      43.0     20             1 F061 
## 3 TGFB1      1.63e-10      26.2     20             1 F070
h_line_mark <- (1.5- edu_min)/(edu_max - edu_min)

length(tested_protein$Protein)
## [1] 70
############################################################################
cutoff_val <- 
cutoff_edu_min <- min(tested_protein$`%edu_fc`)
cutoff_edu_max <- max(tested_protein$`%edu_fc`)

prob_cut <- (cutoff_val-cutoff_edu_min)/(cutoff_edu_max - cutoff_edu_min)

tested_protein$actual_label <- "TN"
tested_protein$actual_label[tested_protein$`%edu_fc`>=1.5] <- "TP"


tested_protein$pred_label <- "TN"
tested_protein$pred_label[tested_protein$Probability>=0.3] <- "TP"


library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
conf_m <- NULL
conf_m <- confusionMatrix(factor(tested_protein$pred_label), factor(tested_protein$actual_label))

conf_table <- data.frame(conf_m$table)

plotTable <- conf_table %>%
  mutate(goodbad = ifelse(conf_table$Prediction == conf_table$Reference, "good", "bad")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq/sum(Freq))


# 342 x 274
p <- ggplot(plotTable, mapping=aes(Prediction , Reference, fill= goodbad, alpha = prop)) +
        geom_tile() + 
        geom_text(aes(label=prop), vjust = .5, fontface  = "bold", alpha = 1) +
        scale_fill_gradient(low="white", high="white") +
        labs(x = "Reference",y = "Prediction")  +
        scale_fill_manual(values = c(good = "green", bad = "red")) +
        xlim(rev(levels(plotTable$Reference)))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
        # scale_x_discrete(labels=c("Class_1","Class_2","Class_3","Class_4")) +
        # scale_y_discrete(labels=c("Class_4","Class_3","Class_2","Class_1"))
p

p <- ggplot(plotTable, mapping=aes(Prediction , Reference, fill= goodbad, alpha = prop)) +
        geom_tile() + 
        geom_text(aes(label=Freq), vjust = .5, fontface  = "bold", alpha = 1) +
        scale_fill_gradient(low="white", high="white") +
        labs(x = "Reference",y = "Prediction")  +
        scale_fill_manual(values = c(good = "green", bad = "red")) +
        xlim(rev(levels(plotTable$Reference)))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
        # scale_x_discrete(labels=c("Class_1","Class_2","Class_3","Class_4")) +
        # scale_y_discrete(labels=c("Class_4","Class_3","Class_2","Class_1"))
p

library(hrbrthemes)
p <- ggplot(tested_protein, aes(x=Label)) +
                geom_bar( aes(y=Probability), stat="identity", size=2, fill="grey0", alpha=.2) + 
                geom_bar( aes(y=edu_fc_scaled), stat="identity", size=2, fill=rgb(0.2, 0.6, 0.9, 1), alpha=.6) +
          
          scale_y_continuous(
            
            # Features of the first axis
            name = expression("Hit Candidate Probability"),
            
            # Add a second axis and specify its features
            sec.axis = sec_axis(~.*(edu_max-edu_min)+edu_min, name=expression(bold("In Vitro %Edu Fc")))
          ) + 
          
          # theme_ipsum() +
          theme_classic() +
        
          theme(
              axis.title.x = element_text(size=20),
              # axis.title.x=element_blank(),
            axis.title.y = element_text(color = "grey0", size=20),
            axis.title.y.right = element_text(color = rgb(0.2, 0.6, 0.9, 1), size=20),
            # axis.text.x = element_text(size = 10, angle = 90),
            axis.text.x = element_blank(),
            axis.text.y = element_text(size = 18),
            axis.text.y.right = element_text(size = 18)
            ) +
          geom_hline(yintercept = 0.5, color='black')+
        geom_hline(yintercept = h_line_mark, color="blue")
        
p

# 862x537

Generate ROC and AUC

#load necessary packages
library(ggplot2)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# data <- ISLR::Default
# sample <- sample(c(TRUE, FALSE), nrow(data), replace=TRUE, prob=c(0.7,0.3))
# train <- data[sample, ]
# test <- data[!sample, ]
# 
# #fit logistic regression model to training set
# model <- glm(default~student+balance+income, family="binomial", data=train)
# 
# #use model to make predictions on test set
# predicted <- predict(model, test, type="response")
# 
# #define object to plot
# rocobj <- roc(test$default, predicted)
# auc <- round(auc(test$default, predicted),4)
# 
# #create ROC plot
# ggroc(rocobj, colour = 'steelblue', size = 2) +
#   ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) +
#   theme_minimal()

tested_protein
## # A tibble: 70 x 8
##    Protein Probability `%edu_fc` edu_fc edu_fc_scaled Label actual_label
##    <chr>         <dbl>     <dbl>  <dbl>         <dbl> <chr> <chr>       
##  1 FGF1           1       21.1   20            1      F001  TP          
##  2 FGF17          1       18     18            0.899  F002  TP          
##  3 FGF18          1        8.06   8.06         0.395  F003  TP          
##  4 APOB           1        4.13   4.13         0.196  F004  TP          
##  5 FGFBP3         1        3.37   3.37         0.158  F005  TP          
##  6 ADAMTS7        1.00     3.07   3.07         0.142  F006  TP          
##  7 HGFAC          1.00     0.919  0.919        0.0333 F007  TN          
##  8 ADAMTS~        1.00     2.94   2.94         0.136  F008  TP          
##  9 FLT1           1.00     1.92   1.92         0.0840 F009  TP          
## 10 FN1            1.00     3.11   3.11         0.144  F010  TP          
## # ... with 60 more rows, and 1 more variable: pred_label <chr>
rocobj <- roc(tested_protein$actual_label, tested_protein$Probability)
## Setting levels: control = TN, case = TP
## Setting direction: controls < cases
auc <- round(auc(tested_protein$actual_label, tested_protein$Probability),4)
## Setting levels: control = TN, case = TP
## Setting direction: controls < cases
# p <- ggroc(rocobj, colour = 'steelblue', size = 2) +
#   ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) +
#   theme_classic()
# 
# p


library(plotROC)
## 
## Attaching package: 'plotROC'
## The following object is masked from 'package:pROC':
## 
##     ggroc
p <- ggplot(tested_protein, aes(d = actual_label, m = Probability)) + 
    geom_roc(n.cuts = 0, labels = FALSE) +
    # geom_rocci(ci.at = quantile(tested_protein$Probability, c(.1, .4, .5, .6, .9))) +
    style_roc(xlab = "1 - Specificity", ylab= "Sensitivity") +
    geom_abline(intercept = 0, slope = 1, color="red") +
    ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')'))
    # theme_classic()
p # 470 x 438
## Warning in verify_d(data$d): D not labeled 0/1, assuming TN = 0 and TP = 1!

Thach later using the top hits from the above results %edu to run in the OA cell line.

library(ggplot2)
library(readr)
library(scales)


OA_df <- read_csv("Results/CA017 Top Hits - Sheet1.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_double(),
##   Condition = col_character(),
##   Cell.ID = col_character(),
##   Category = col_character(),
##   N = col_double(),
##   Percent_EdU = col_double(),
##   sd = col_double(),
##   se = col_double(),
##   ci = col_double(),
##   `adj-pval` = col_double()
## )
OA_protein <- OA_df[c('Category', 'Percent_EdU')]
colnames(OA_protein) <- c('Protein', 'Percent_EdU')
# need to run the previous 2 cells
tested_protein_correct_narm_original 
##     Protein          Condition      edu_fc  Probability
## 7      APOB      APOB: 1 ug/mL   1.6035988 1.000000e+00
## 22     FGF1    FGF1: 0.5 ug/mL  64.2626545 1.000000e+00
## 24    FGF18  FGF18: 0.10 ug/mL   4.5579813 1.000000e+00
## 26   FGFBP3    FGFBP3: 1 ug/mL   1.6645980 1.000000e+00
## 2   ADAMTS7   ADAMTS7: 1 ug/mL   1.5131683 9.999999e-01
## 35    HGFAC     HGFAC: 2 ug/mL   0.7402934 9.999884e-01
## 1  ADAMTS19  ADAMTS19: 1 ug/mL   1.4493507 9.999758e-01
## 27     FLT1    FLT1: 0.5 ug/mL   0.8774524 9.999677e-01
## 28      FN1      FN1: 10 ug/mL   1.5348433 9.999155e-01
## 41    LAMA1     LAMA1: 2 ug/mL   2.3408121 9.998283e-01
## 47      NTS     NTS: 0.2 ug/mL   0.8503963 9.998029e-01
## 21    FBLN1     FBLN1: 1 ug/mL   0.8120715 9.996300e-01
## 45     NDNF    NDNF: 0.5 ug/mL   5.2407153 9.991577e-01
## 60    SPON1   SPON1: 0.5 ug/mL   1.1093920 9.989819e-01
## 72      TNC       TNC: 3 ug/mL   1.2622126 9.981275e-01
## 67    THBS2     THBS2: 1 ug/mL   0.4271477 9.980863e-01
## 6      APLN    APLN: 2.5 ug/mL   0.8572318 9.976410e-01
## 3      AGRN      AGRN: 2 ug/mL   2.7113750 9.973943e-01
## 5     ANOS1     ANOS1: 1 ug/mL   1.2784966 9.969820e-01
## 62     STC2      STC2: 1 ug/mL   1.2922563 9.925379e-01
## 51   PDGFRL    PDGFRL: 3 ug/mL   3.7278178 9.867464e-01
## 20   DRAXIN    DRAXIN: 1 ug/mL   1.8601663 9.845426e-01
## 63     TCN2      TCN2: 1 ug/mL   1.4196174 9.785267e-01
## 59    SFRP2     SFRP2: 1 ug/mL   2.7537777 9.747477e-01
## 12   CHRDL1    CHRDL1: 1 ug/mL   1.4833789 9.106392e-01
## 56    PTPRS   PTPRS: 2.5 ug/mL   2.4477719 9.048828e-01
## 34      HGF     HGF: 0.2 ug/mL   2.5292465 7.912208e-01
## 42    LTBP1     LTBP1: 1 ug/mL   1.5764840 7.367217e-01
## 69    THBS4     THBS4: 1 ug/mL   2.7343697 7.352176e-01
## 70    TIMP1  TIMP1: 0.50 ug/mL   3.6504186 7.302567e-01
## 73     VASN      VASN: 1 ug/mL   1.7205264 6.740634e-01
## 10      BTC     BTC: 0.5 ug/mL   1.0506944 6.189895e-01
## 30      FST    FST: 0.25 ug/mL   0.3676806 5.924256e-01
## 58    SFRP1     SFRP1: 1 ug/mL   1.9092366 5.274271e-01
## 57  RARRES2 RARRES2: 0.5 ug/mL   1.5516599 4.392245e-01
## 19     DAG1    DAG1: 2.5 ug/mL   0.7964278 4.278212e-01
## 55      PTN       PTN: 1 ug/mL   0.3720418 4.246091e-01
## 43    MATN2     MATN2: 1 ug/mL   1.1671517 3.794087e-01
## 37   IGFBP3  IGFBP3: 0.3 ug/mL   1.0924732 3.032465e-01
## 18   CXCL12    CXCL12: 1 ug/mL   2.3504654 2.808731e-01
## 74      VTN       VTN: 3 ug/mL   1.6996553 2.640201e-01
## 68    THBS3     THBS3: 1 ug/mL   1.8962563 2.544644e-01
## 16     CSF2      CSF2: 1 ug/mL   0.5257165 2.463287e-01
## 75    WISP2     WISP2: 1 ug/mL   2.2976018 2.450493e-01
## 49    PAMR1   PAMR1: 0.2 ug/mL   4.1632041 2.066439e-01
## 46      NOV     NOV: 1.5 ug/mL   1.6418589 1.691551e-01
## 38   IGFBP4  IGFBP4: 0.2 ug/mL   0.8694056 1.593942e-01
## 71    TIMP2   TIMP2: 0.5 ug/mL   2.3550088 1.534895e-01
## 44    NAMPT     NAMPT: 2 ug/mL   0.4533916 1.521385e-01
## 4       ANG       ANG: 1 ug/mL   2.0935040 1.474305e-01
## 52     PLAT      PLAT: 2 ug/mL   1.0712790 1.469910e-01
## 66    THBS1     THBS1: 1 ug/mL   0.6435273 1.200906e-01
## 53     PLAU      PLAU: 2 ug/mL   3.0352391 7.373726e-02
## 54    POSTN     POSTN: 1 ug/mL   0.9935289 7.317355e-02
## 11    CCL14     CCL14: 1 ug/mL   0.8223549 7.117944e-02
## 39   IGFBP5  IGFBP5: 0.5 ug/mL   0.9490404 5.785878e-02
## 65    TGFBI   TGFBI: 0.2 ug/mL  10.0223719 2.326525e-02
## 17     CTGF      CTGF: 3 ug/mL   0.5622329 2.042650e-02
## 61     STC1      STC1: 1 ug/mL   0.2107510 1.495876e-02
## 48   OLFML3 OLFML3: 0.75 ug/mL   0.3038827 1.307027e-02
## 31    FSTL1     FSTL1: 1 ug/mL   3.6943920 7.923565e-03
## 50    PDGFD   PDGFD: 0.5 ug/mL 131.1052279 7.183722e-03
## 14     COCH      COCH: 1 ug/mL   0.9398051 6.497679e-03
## 13      CLC       CLC: 1 ug/mL   1.4293817 6.226064e-03
## 36   IGFBP2  IGFBP2: 0.5 ug/mL   0.8417063 3.844506e-03
## 40   IGFBP7  IGFBP7: 0.1 ug/mL   1.1754256 7.894397e-04
## 33     GDNF    GDNF: 0.1 ug/mL   0.7590671 3.652716e-04
## 29     FRZB      FRZB: 1 ug/mL   0.9119177 3.556593e-04
## 32    GDF15     GDF15: 1 ug/mL   3.8664053 3.780000e-07
## 64    TGFB1  TGFB1: 0.01 ug/mL   1.4574688 1.630000e-10
OA_protein_df <- merge(x=OA_protein, y=tested_protein_correct_narm_original, by="Protein", all.x=FALSE, all.y=FALSE)


OA_protein_df <- OA_protein_df[order(-OA_protein_df$Probability),]
OA_protein_df <- OA_protein_df[c("Protein", "Percent_EdU", "edu_fc", "Probability")]
colnames(OA_protein_df) <- c("Protein", "OA_%edU", "Ch_edu", "Probability")
# temp_df$ground_truth <- temp_df$edu_fc >=2.0
# 
# cut_off_3_count <- sum(temp_df$edu_fc >=3.0, na.rm = TRUE)
# cut_off_2_count <- sum(temp_df$edu_fc >=2.0, na.rm = TRUE)
# cut_off_1_5_count <- sum(temp_df$edu_fc >=1.5, na.rm = TRUE)
# length(temp_df$Protein)
# 
# temp_df
# 
# temp_df$cutoff_3 <- 'Off'
# temp_df$cutoff_3[temp_df$edu_fc >=3.0] <- 'overlap'
# 
# temp_df$cutoff_2 <- 'Off'
# temp_df$cutoff_2[temp_df$edu_fc >=2.0] <- 'overlap'
# 
# temp_df$cutoff_1_5 <- 'Off'
# temp_df$cutoff_1_5[temp_df$edu_fc >=1.5] <- 'overlap'




OA_protein_df$OA_pedU_capped <- OA_protein_df$`OA_%edU`
OA_protein_df$OA_pedU_capped[OA_protein_df$OA_pedU_capped>6] <-6

edu_min <- range(OA_protein_df$OA_pedU_capped)[1]
edu_max <- range(OA_protein_df$OA_pedU_capped)[2]

OA_protein_df$p_edu_fc_scalled <- (OA_protein_df$OA_pedU_capped-edu_min)/(edu_max - edu_min)

# OA_protein_df <- OA_protein_df[sort(-OA_protein_df$Probability),]

candidate_ls <- c(1:length(OA_protein_df$Protein))
OA_protein_df$Label <- sprintf("OA%03d", candidate_ls)
library(hrbrthemes)
p <- ggplot(OA_protein_df, aes(x=Label)) +
                geom_bar( aes(y=Probability), stat="identity", size=2, fill="grey0", alpha=.2) + 
                geom_bar( aes(y=p_edu_fc_scalled), stat="identity", size=2, fill=rgb(0.2, 0.6, 0.9, 1), alpha=.6) +
          
          scale_y_continuous(
            
            # Features of the first axis
            name = expression("Hit Candidate Probability"),
            
            # Add a second axis and specify its features
            sec.axis = sec_axis(~.*(edu_max-edu_min)+edu_min, name=expression(bold("In Vitro %Edu Fc")))
          ) + 
          
          # theme_ipsum() +
          theme_classic() +
        
          theme(
              axis.title.x = element_text(size=20),
            axis.title.y = element_text(color = "grey0", size=20),
            axis.title.y.right = element_text(color = rgb(0.2, 0.6, 0.9, 1), size=20),
            axis.text.x = element_text(size = 18, angle = 90),
            axis.text.y = element_text(size = 18),
            axis.text.y.right = element_text(size = 18)
            ) 
          # geom_hline(yintercept = h_line_mark, color='red')
        
p