library(readr)
OA_SerieA_top50 <- read_csv("Results/OA_SerieA_top50.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## index = col_double(),
## UniProt = col_character(),
## Prob = col_double(),
## `Unnamed: 0` = col_double(),
## Entry.name = col_character(),
## Gene = col_character(),
## Gene.synonym = col_character(),
## Ensembl = col_character()
## )
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v dplyr 1.0.2
## v tibble 3.0.4 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## # A tibble: 50 x 9
## index UniProt Prob `Unnamed: 0` Entry.name Gene Gene.synonym Ensembl Label
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1504 Q9GZP0 78.2 9325 PDGFD_HUMAN PDGFD IEGF, MSTP0~ ENSG00~ F001
## 2 1582 Q9NRA1 55.8 9727 PDGFC_HUMAN PDGFC fallotein, ~ ENSG00~ F002
## 3 2345 Q15113 41.7 16073 PCOC1_HUMAN PCOL~ PCPE, PCPE1 ENSG00~ F003
## 4 2394 P98066 38.2 16573 TSG6_HUMAN TNFA~ TSG-6, TSG6 ENSG00~ F004
## 5 1875 Q9UKZ9 37.6 11906 PCOC2_HUMAN PCOL~ PCPE2 ENSG00~ F005
## 6 804 Q5VXM1 32.0 5044 CDCP2_HUMAN CDCP2 NaN ENSG00~ F006
## 7 731 P09038 29.6 4609 FGF2_HUMAN FGF2 FGFB ENSG00~ F007
## 8 2303 P01137 25.9 15657 TGFB1_HUMAN TGFB1 CED, DPD1, ~ ENSG00~ F008
## 9 1181 Q8TDF5 23.3 7270 NETO1_HUMAN NETO1 BCTL1, BTCL1 ENSG00~ F009
## 10 2329 P10600 23.0 15905 TGFB3_HUMAN TGFB3 ARVD, ARVD1 ENSG00~ F010
## # ... with 40 more rows
## # A tibble: 50 x 2
## `%Edu FC (Pred)` Label
## <dbl> <chr>
## 1 78.2 F001
## 2 55.8 F002
## 3 41.7 F003
## 4 38.2 F004
## 5 37.6 F005
## 6 32.0 F006
## 7 29.6 F007
## 8 25.9 F008
## 9 23.3 F009
## 10 23.0 F010
## # ... with 40 more rows
#Load the libraries
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(dplyr)
library(formattable)
library(tidyr)
customGreen0 = "#DeF7E9"
customGreen = "#71CA97"
customRed = "#ff7f7f"
a = formattable(head(temp_df, 15),
align =c("c","l"),
list(`Label` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
`%Edu FC (Pred)` = color_tile(customGreen0, customGreen)
))
library(ggplot2)
p <- ggplot(data = temp_df, aes(`Label`, `%Edu FC (Pred)`)) +
geom_point() +
scale_x_discrete(guide = guide_axis(angle = 90)) +
theme_classic()
p
ggsave(plot=p, filename="result.png", width=7, height=4)
library(readr)
library(tidyverse)
setwd("~/Juvena")
raw <- read_csv("Results/AE_NN_OA_072620_E5 - AE_NN_OA_072620_E5.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_double(),
## UniProt = col_character(),
## Prob = col_double(),
## Indicator = col_character(),
## `Unnamed: 0` = col_double(),
## Entry.name = col_character(),
## Gene = col_character(),
## Gene.synonym = col_character(),
## Ensembl = col_character()
## )
raw2 <- read_csv("Results/AE_NN_OA_071520_full - AE_NN_OA_071520_full.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_double(),
## UniProt = col_character(),
## Prob = col_double(),
## Indicator = col_character(),
## `Unnamed: 0` = col_double(),
## Entry.name = col_character(),
## Gene = col_character(),
## Gene.synonym = col_character(),
## Ensembl = col_character()
## )
# Generate ground truth
temp <- raw %>% select(c(3, 4, 7))
temp_ground_truth <- temp[temp$Indicator %in% c("True Positive", "True Negative"),]
label_ls <- c(1:length(temp_ground_truth$Prob))
temp_ground_truth$Label <- sprintf("L%03d", label_ls)
colnames(temp_ground_truth)[1] <- "Hit Candidate Probability"
temp_ground_truth
## # A tibble: 25 x 4
## `Hit Candidate Probability` Indicator Gene Label
## <dbl> <chr> <chr> <chr>
## 1 1 True Positive SHH L001
## 2 1 True Positive FGF9 L002
## 3 1 True Positive FGF8 L003
## 4 1 True Positive IHH L004
## 5 1 True Positive FGF18 L005
## 6 1 True Positive FGF2 L006
## 7 1.00 True Positive WNT5A L007
## 8 1.00 True Positive WNT3A L008
## 9 1.00 True Positive PTHLH L009
## 10 0.997 True Positive DLL1 L010
## # ... with 15 more rows
# generated prediction
temp <- raw2 %>% select(c(3, 4, 7))
temp <- temp[temp$Indicator=="Secreted",]
temp <- temp[order(-temp$Prob),]
temp_top_50 <- temp[c(1:50),]
candidate_ls <- c(1:50)
temp_top_50$Label <- sprintf("F%03d", candidate_ls)
temp_top_50
## # A tibble: 50 x 4
## Prob Indicator Gene Label
## <dbl> <chr> <chr> <chr>
## 1 0.999 Secreted FGF1 F001
## 2 0.999 Secreted DHH F002
## 3 0.998 Secreted FGF17 F003
## 4 0.998 Secreted FGF22 F004
## 5 0.997 Secreted FGF4 F005
## 6 0.997 Secreted FGF6 F006
## 7 0.997 Secreted FGF16 F007
## 8 0.997 Secreted FGF5 F008
## 9 0.997 Secreted FGF3 F009
## 10 0.996 Secreted FGF20 F010
## # ... with 40 more rows
temp_top_50 = select(temp_top_50, Prob, Label)
colnames(temp_top_50) <- c("Hit Candidate Probability", "Label")
temp_top_50
## # A tibble: 50 x 2
## `Hit Candidate Probability` Label
## <dbl> <chr>
## 1 0.999 F001
## 2 0.999 F002
## 3 0.998 F003
## 4 0.998 F004
## 5 0.997 F005
## 6 0.997 F006
## 7 0.997 F007
## 8 0.997 F008
## 9 0.997 F009
## 10 0.996 F010
## # ... with 40 more rows
library(data.table)
library(dplyr)
library(formattable)
library(tidyr)
customGreen0 = "#DeF7E9"
customGreen = "#71CA97"
customRed = "#ff7f7f"
a = formattable(head(temp_top_50, 15),
align =c("c","l"),
list(`Label` = formatter("span", style = ~ style(color = "grey",font.weight = "bold")),
`Hit Candidate Probability` = color_tile(customGreen0, customGreen)
))
library(ggplot2)
library(ggrepel)
p <- ggplot(data = temp_ground_truth, aes(`Label`, `Hit Candidate Probability`)) +
geom_point(aes(color = `Indicator`)) +
scale_color_manual(values = c("red", "green")) +
scale_x_discrete(guide = guide_axis(angle = 90)) +
# geom_text(aes(label = `Gene`), hjust = 0, nudge_x = 0.05) +
theme_classic()
p
p <- ggplot(data = head(temp_top_50, 25), aes(`Label`, `Hit Candidate Probability`)) +
geom_point() +
scale_x_discrete(guide = guide_axis(angle = 90)) +
theme_classic()
p
library(readr)
CA016_combined <- read_csv("Results/CA016_combines_dosed.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## well = col_character(),
## FID = col_character(),
## Condition = col_character(),
## FactorName = col_character(),
## Category = col_character(),
## Cell.ID = col_character(),
## exp_title = col_character(),
## plate = col_character(),
## combo = col_logical(),
## use_factordb = col_logical(),
## timepoint = col_character(),
## emy_fc = col_logical(),
## PercentArea_eMyHC = col_logical(),
## FileName_Single.Channel.Blue = col_character(),
## FileName_Single.Channel.Red = col_character(),
## SupID = col_logical(),
## Dose = col_character(),
## Unit = col_character()
## )
## i Use `spec()` for the full column specifications.
CA015_combined <- read_csv("Results/CA015_combines_dosed.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## well = col_character(),
## FID = col_character(),
## Condition = col_character(),
## `Unnamed: 4` = col_logical(),
## FactorName = col_character(),
## Category = col_character(),
## Cell.ID = col_character(),
## exp_title = col_character(),
## plate = col_character(),
## combo = col_character(),
## use_factordb = col_logical(),
## timepoint = col_character(),
## emy_fc = col_logical(),
## PercentArea_eMyHC = col_logical(),
## FileName_Single.Channel.Blue = col_character(),
## FileName_Single.Channel.Red = col_character(),
## SupID = col_logical(),
## Dose = col_character(),
## Unit = col_character()
## )
## i Use `spec()` for the full column specifications.
usable_column <- c("FactorName", "edu_count_fc", "Value")
CA016_combined <- CA016_combined[usable_column]
CA015_combined <- CA015_combined[usable_column]
overall_results <- rbind(CA015_combined, CA016_combined)
# overall_results$edu_count_fc
library(tidyverse)
overall_results_rm_na <- na.omit(overall_results)
result_summary <- overall_results_rm_na %>% group_by(`FactorName`, `Value`) %>% summarise(max_val = max(edu_count_fc))
## `summarise()` regrouping output by 'FactorName' (override with `.groups` argument)
range(result_summary$Value)
## [1] 0.01 10.00
result_summary$Value <- as.factor(result_summary$Value)
library(ggplot2)
# overall_results_rm_na
p <- ggplot(result_summary, aes(x = `FactorName`, y = `Value`, fill = `Value`)) +
geom_bar(stat = "identity") +
# theme_classic() +
labs(
x = "Factors",
y = "Edu Count Fc",
title = paste("")
)
p
result_summary_sorted <- result_summary[order(-result_summary$max_val),]
result_summary_sorted
## # A tibble: 101 x 3
## # Groups: FactorName [101]
## FactorName Value max_val
## <chr> <fct> <dbl>
## 1 PDGFD 0.5 257.
## 2 FGF1 0.5 95.9
## 3 FGF18 0.1 16.1
## 4 PAMR1 0.2 11.6
## 5 TGFBI 0.2 11.0
## 6 FSTL1 1 7.86
## 7 NDNF 0.5 7.43
## 8 PLAU 2 6.57
## 9 Anastellin 1.2 6.43
## 10 SFRP1 1 5.84
## # ... with 91 more rows
# Series_A_Chondrocyte_Prediction_vs_in_vitro <- read_csv("Juvena/Results/Series_A_Chondrocyte _Prediction_vs_in_vitro.csv")
#
# candidate_ls <- c(1:15)
#
# Series_A_Chondrocyte_Prediction_vs_in_vitro$Label <- sprintf("F%03d", candidate_ls)
# Series_A_Chondrocyte_Prediction_vs_in_vitro
#
# edu_min <- range(Series_A_Chondrocyte_Prediction_vs_in_vitro$`%edu_fc`)[1]
# edu_max <- range(Series_A_Chondrocyte_Prediction_vs_in_vitro$`%edu_fc`)[2]
#
# Series_A_Chondrocyte_Prediction_vs_in_vitro$`%edu_fc_scalled` <- (Series_A_Chondrocyte_Prediction_vs_in_vitro$`%edu_fc`-edu_min)/(edu_max - edu_min)
#
#
#
# prob_min <- range(Series_A_Chondrocyte_Prediction_vs_in_vitro$Probability)[1]
# prob_max <- range(Series_A_Chondrocyte_Prediction_vs_in_vitro$Probability)[2]
#
# Series_A_Chondrocyte_Prediction_vs_in_vitro$`Prob_scaled` <- (Series_A_Chondrocyte_Prediction_vs_in_vitro$Probability - prob_min)/(prob_max - prob_min)
# Series_A_Chondrocyte_Prediction_vs_in_vitro <- data.frame(apply(Series_A_Chondrocyte_Prediction_vs_in_vitro, 4, unclass))
# library(hrbrthemes)
# p <- ggplot(Series_A_Chondrocyte_Prediction_vs_in_vitro, aes(x=Label)) +
# geom_bar( aes(y=Prob_scaled), stat="identity", size=2, fill="grey0", alpha=.2) +
# geom_bar( aes(y=`%edu_fc_scalled`), stat="identity", size=2, fill=rgb(0.2, 0.6, 0.9, 1)) +
#
# scale_y_continuous(
#
# # Features of the first axis
# name = expression("Normalized Hit Candidate Probability"),
#
# # Add a second axis and specify its features
# sec.axis = sec_axis(~.*(edu_max-edu_min)+edu_min, name=expression(bold("In Vitro %Edu Fc")))
# ) +
#
# # theme_ipsum() +
# theme_classic() +
#
# theme(
# axis.title.x = element_text(size=20),
# axis.title.y = element_text(color = "grey0", size=20),
# axis.title.y.right = element_text(color = rgb(0.2, 0.6, 0.9, 1), size=20),
# axis.text.x = element_text(size = 18, angle = 90),
# axis.text.y = element_text(size = 18),
# axis.text.y.right = element_text(size = 18)
# )
# +
#
# ggtitle("QSAR prediction vs. in vitro")
# p
“QSAR+ model predicted hit candidates vs. in vitro results from proliferation analysis in human chondrocyte”
library(ggplot2)
library(readr)
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:formattable':
##
## comma, percent, scientific
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
Chondrocyte_QSAR_candidates <- read_csv("Results/Chondrocyte_QSAR_candidates.csv")
## Warning: Missing column names filled in: 'X1' [1], 'X10' [10]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_double(),
## UniProt = col_character(),
## Prob = col_double(),
## Indicator = col_character(),
## `Unnamed: 0` = col_double(),
## Entry.name = col_character(),
## Gene = col_character(),
## Gene.synonym = col_character(),
## Ensembl = col_character(),
## X10 = col_logical(),
## Rank = col_double(),
## Protein = col_character(),
## Probability = col_double(),
## FC_EdU = col_logical(),
## Condition = col_character(),
## edu_fc = col_double()
## )
tested_protein <- Chondrocyte_QSAR_candidates[c('Condition', 'edu_fc')]
predicted_protein <- na.omit(Chondrocyte_QSAR_candidates[c('Protein', 'Probability')])
temp_cols <- data.frame(do.call("rbind", strsplit(as.character(tested_protein$Condition), ":", fixed = TRUE)))
tested_protein$Protein <- temp_cols$X1
tested_protein_correct <- merge(x=tested_protein, y=predicted_protein, by="Protein", all.x=FALSE, all.y=TRUE)
# tested_protein_correct[sort(tested_protein_correct$edu_fc),]
tested_protein_correct_narm_original <- na.omit(tested_protein_correct) # 70 rows
tested_protein_correct_narm_original <- tested_protein_correct_narm_original[order(-tested_protein_correct_narm_original$Probability),]
# temp_df$ground_truth <- temp_df$edu_fc >=2.0
#
# cut_off_3_count <- sum(temp_df$edu_fc >=3.0, na.rm = TRUE)
# cut_off_2_count <- sum(temp_df$edu_fc >=2.0, na.rm = TRUE)
# cut_off_1_5_count <- sum(temp_df$edu_fc >=1.5, na.rm = TRUE)
# length(temp_df$Protein)
#
# temp_df
#
# temp_df$cutoff_3 <- 'Off'
# temp_df$cutoff_3[temp_df$edu_fc >=3.0] <- 'overlap'
#
# temp_df$cutoff_2 <- 'Off'
# temp_df$cutoff_2[temp_df$edu_fc >=2.0] <- 'overlap'
#
# temp_df$cutoff_1_5 <- 'Off'
# temp_df$cutoff_1_5[temp_df$edu_fc >=1.5] <- 'overlap'
tested_protein_correct_narm <- head(tested_protein_correct_narm_original, 25)
tested_protein_correct_narm$edu_fc[tested_protein_correct_narm$edu_fc >5] <- 5
range(tested_protein_correct_narm$Probability)
## [1] 0.9106392 1.0000000
edu_min <- range(tested_protein_correct_narm$`edu_fc`)[1]
edu_max <- range(tested_protein_correct_narm$`edu_fc`)[2]
tested_protein_correct_narm$edu_fc_scalled <- (tested_protein_correct_narm$edu_fc-edu_min)/(edu_max - edu_min)
candidate_ls <- c(1:length(tested_protein_correct_narm$Protein))
tested_protein_correct_narm$Label <- sprintf("F%03d", candidate_ls)
tested_protein_correct_narm[tested_protein_correct_narm$edu_fc>=1.5 & tested_protein_correct_narm$edu_fc>=1.6,]
## Protein Condition edu_fc Probability edu_fc_scalled Label
## 7 APOB APOB: 1 ug/mL 1.603599 1.0000000 0.2572685 F001
## 22 FGF1 FGF1: 0.5 ug/mL 5.000000 1.0000000 1.0000000 F002
## 24 FGF18 FGF18: 0.10 ug/mL 4.557981 1.0000000 0.9033385 F003
## 26 FGFBP3 FGFBP3: 1 ug/mL 1.664598 1.0000000 0.2706080 F004
## 41 LAMA1 LAMA1: 2 ug/mL 2.340812 0.9998283 0.4184837 F010
## 45 NDNF NDNF: 0.5 ug/mL 5.000000 0.9991577 1.0000000 F013
## 3 AGRN AGRN: 2 ug/mL 2.711375 0.9973943 0.4995191 F018
## 51 PDGFRL PDGFRL: 3 ug/mL 3.727818 0.9867464 0.7217968 F021
## 20 DRAXIN DRAXIN: 1 ug/mL 1.860166 0.9845426 0.3133752 F022
## 59 SFRP2 SFRP2: 1 ug/mL 2.753778 0.9747477 0.5087918 F024
h_line_mark <- (1.5-edu_min)/(edu_max - edu_min)
tested_protein_correct_narm$actual_label <- "Negative"
tested_protein_correct_narm$actual_label[tested_protein_correct_narm$edu_fc>=1.5] <- "Positive"
tested_protein_correct_narm$pred_label <- "Negative"
tested_protein_correct_narm$pred_label[tested_protein_correct_narm$Probability>=0.244439] <- "Positive"
# blank_theme <- theme_minimal()+
# theme(
# axis.title.x = element_blank(),
# axis.title.y = element_blank(),
# panel.border = element_blank(),
# panel.grid=element_blank(),
# axis.ticks = element_blank(),
# plot.title=element_text(size=14, face="bold")
# )
#
# ggplot(temp_df, aes(x=factor(1), fill=cutoff_1_5))+
# geom_bar(width = 1)+
# coord_polar("y") +
# scale_fill_brewer("Blues") + blank_theme +
# theme(axis.text.x=element_blank())+
# geom_text(aes(y = value/3 + c(0, cumsum(value)[-length(value)]),
# label = percent(value/100)), size=5)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
p <- ggplot(tested_protein_correct_narm, aes(x=Label)) +
geom_bar( aes(y=Probability), stat="identity", size=2, fill="grey0", alpha=.2) +
geom_bar( aes(y=edu_fc_scalled), stat="identity", size=2, fill=rgb(0.2, 0.6, 0.9, 1), alpha=.6) +
scale_y_continuous(
# Features of the first axis
name = expression("Hit Candidate Probability"),
# Add a second axis and specify its features
sec.axis = sec_axis(~.*(edu_max-edu_min)+edu_min, name=expression(bold("In Vitro Edu")))
) +
# theme_ipsum() +
theme_classic() +
theme(
axis.title.x = element_text(size=20),
axis.title.y = element_text(color = "grey0", size=20),
axis.title.y.right = element_text(color = rgb(0.2, 0.6, 0.9, 1), size=20),
axis.text.x = element_text(size = 10, angle = 90),
axis.text.y = element_text(size = 18),
axis.text.y.right = element_text(size = 18)
) +
geom_hline(yintercept = h_line_mark, color='red')
p
# confusion matrix.
library(ggplot2)
library(readr)
library(scales)
library(tidyverse)
Chondrocyte_QSAR_candidates <- read_csv("Results/Chondrocyte_QSAR_candidates - Sheet3.csv")
## Warning: Missing column names filled in: 'X7' [7], 'X8' [8], 'X9' [9],
## 'X10' [10]
##
## -- Column specification --------------------------------------------------------
## cols(
## HTS_Rank = col_double(),
## QSAR_Rank = col_double(),
## Protein = col_character(),
## Probability = col_double(),
## `%edu_fc` = col_double(),
## `Log10(edu_FC)` = col_double(),
## X7 = col_character(),
## X8 = col_character(),
## X9 = col_character(),
## X10 = col_character()
## )
tested_protein <- Chondrocyte_QSAR_candidates[c('Protein', 'Probability', '%edu_fc')]
tested_protein <- tested_protein[order(-tested_protein$Probability) ,]
# tested_protein <- head(tested_protein, 25)
tested_protein$edu_fc <- tested_protein$`%edu_fc`
tested_protein$edu_fc[tested_protein$`%edu_fc` >20] <- 20
range(tested_protein$edu_fc)
## [1] 0.261 20.000
edu_min <- min(tested_protein$edu_fc)
edu_max <- max(tested_protein$edu_fc)
# tested_protein$edu_fc_scalled <- (tested_protein$edu_fc - edu_min)/(edu_max - edu_min)
# tested_protein$edu_fc[tested_protein$edu_fc>20] <-20
tested_protein$edu_fc_scaled <- rescale(tested_protein$edu_fc, to=c(0,1))
range(tested_protein$edu_fc_scaled)
## [1] 0 1
candidate_ls <- c(1:length(tested_protein$Protein))
tested_protein$Label <- sprintf("F%03d", candidate_ls)
tested_protein[tested_protein$`%edu_fc`>20,] # doing labeling the outliers
## # A tibble: 3 x 6
## Protein Probability `%edu_fc` edu_fc edu_fc_scaled Label
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 FGF1 1.00e+ 0 21.1 20 1 F001
## 2 PDGFD 7.18e- 3 43.0 20 1 F061
## 3 TGFB1 1.63e-10 26.2 20 1 F070
h_line_mark <- (1.5- edu_min)/(edu_max - edu_min)
length(tested_protein$Protein)
## [1] 70
############################################################################
cutoff_val <-
cutoff_edu_min <- min(tested_protein$`%edu_fc`)
cutoff_edu_max <- max(tested_protein$`%edu_fc`)
prob_cut <- (cutoff_val-cutoff_edu_min)/(cutoff_edu_max - cutoff_edu_min)
tested_protein$actual_label <- "TN"
tested_protein$actual_label[tested_protein$`%edu_fc`>=1.5] <- "TP"
tested_protein$pred_label <- "TN"
tested_protein$pred_label[tested_protein$Probability>=0.3] <- "TP"
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
conf_m <- NULL
conf_m <- confusionMatrix(factor(tested_protein$pred_label), factor(tested_protein$actual_label))
conf_table <- data.frame(conf_m$table)
plotTable <- conf_table %>%
mutate(goodbad = ifelse(conf_table$Prediction == conf_table$Reference, "good", "bad")) %>%
group_by(Reference) %>%
mutate(prop = Freq/sum(Freq))
# 342 x 274
p <- ggplot(plotTable, mapping=aes(Prediction , Reference, fill= goodbad, alpha = prop)) +
geom_tile() +
geom_text(aes(label=prop), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_gradient(low="white", high="white") +
labs(x = "Reference",y = "Prediction") +
scale_fill_manual(values = c(good = "green", bad = "red")) +
xlim(rev(levels(plotTable$Reference)))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
# scale_x_discrete(labels=c("Class_1","Class_2","Class_3","Class_4")) +
# scale_y_discrete(labels=c("Class_4","Class_3","Class_2","Class_1"))
p
p <- ggplot(plotTable, mapping=aes(Prediction , Reference, fill= goodbad, alpha = prop)) +
geom_tile() +
geom_text(aes(label=Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_gradient(low="white", high="white") +
labs(x = "Reference",y = "Prediction") +
scale_fill_manual(values = c(good = "green", bad = "red")) +
xlim(rev(levels(plotTable$Reference)))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
# scale_x_discrete(labels=c("Class_1","Class_2","Class_3","Class_4")) +
# scale_y_discrete(labels=c("Class_4","Class_3","Class_2","Class_1"))
p
library(hrbrthemes)
p <- ggplot(tested_protein, aes(x=Label)) +
geom_bar( aes(y=Probability), stat="identity", size=2, fill="grey0", alpha=.2) +
geom_bar( aes(y=edu_fc_scaled), stat="identity", size=2, fill=rgb(0.2, 0.6, 0.9, 1), alpha=.6) +
scale_y_continuous(
# Features of the first axis
name = expression("Hit Candidate Probability"),
# Add a second axis and specify its features
sec.axis = sec_axis(~.*(edu_max-edu_min)+edu_min, name=expression(bold("In Vitro %Edu Fc")))
) +
# theme_ipsum() +
theme_classic() +
theme(
axis.title.x = element_text(size=20),
# axis.title.x=element_blank(),
axis.title.y = element_text(color = "grey0", size=20),
axis.title.y.right = element_text(color = rgb(0.2, 0.6, 0.9, 1), size=20),
# axis.text.x = element_text(size = 10, angle = 90),
axis.text.x = element_blank(),
axis.text.y = element_text(size = 18),
axis.text.y.right = element_text(size = 18)
) +
geom_hline(yintercept = 0.5, color='black')+
geom_hline(yintercept = h_line_mark, color="blue")
p
# 862x537
#load necessary packages
library(ggplot2)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# data <- ISLR::Default
# sample <- sample(c(TRUE, FALSE), nrow(data), replace=TRUE, prob=c(0.7,0.3))
# train <- data[sample, ]
# test <- data[!sample, ]
#
# #fit logistic regression model to training set
# model <- glm(default~student+balance+income, family="binomial", data=train)
#
# #use model to make predictions on test set
# predicted <- predict(model, test, type="response")
#
# #define object to plot
# rocobj <- roc(test$default, predicted)
# auc <- round(auc(test$default, predicted),4)
#
# #create ROC plot
# ggroc(rocobj, colour = 'steelblue', size = 2) +
# ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) +
# theme_minimal()
tested_protein
## # A tibble: 70 x 8
## Protein Probability `%edu_fc` edu_fc edu_fc_scaled Label actual_label
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 FGF1 1 21.1 20 1 F001 TP
## 2 FGF17 1 18 18 0.899 F002 TP
## 3 FGF18 1 8.06 8.06 0.395 F003 TP
## 4 APOB 1 4.13 4.13 0.196 F004 TP
## 5 FGFBP3 1 3.37 3.37 0.158 F005 TP
## 6 ADAMTS7 1.00 3.07 3.07 0.142 F006 TP
## 7 HGFAC 1.00 0.919 0.919 0.0333 F007 TN
## 8 ADAMTS~ 1.00 2.94 2.94 0.136 F008 TP
## 9 FLT1 1.00 1.92 1.92 0.0840 F009 TP
## 10 FN1 1.00 3.11 3.11 0.144 F010 TP
## # ... with 60 more rows, and 1 more variable: pred_label <chr>
rocobj <- roc(tested_protein$actual_label, tested_protein$Probability)
## Setting levels: control = TN, case = TP
## Setting direction: controls < cases
auc <- round(auc(tested_protein$actual_label, tested_protein$Probability),4)
## Setting levels: control = TN, case = TP
## Setting direction: controls < cases
# p <- ggroc(rocobj, colour = 'steelblue', size = 2) +
# ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) +
# theme_classic()
#
# p
library(plotROC)
##
## Attaching package: 'plotROC'
## The following object is masked from 'package:pROC':
##
## ggroc
p <- ggplot(tested_protein, aes(d = actual_label, m = Probability)) +
geom_roc(n.cuts = 0, labels = FALSE) +
# geom_rocci(ci.at = quantile(tested_protein$Probability, c(.1, .4, .5, .6, .9))) +
style_roc(xlab = "1 - Specificity", ylab= "Sensitivity") +
geom_abline(intercept = 0, slope = 1, color="red") +
ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')'))
# theme_classic()
p # 470 x 438
## Warning in verify_d(data$d): D not labeled 0/1, assuming TN = 0 and TP = 1!
library(ggplot2)
library(readr)
library(scales)
OA_df <- read_csv("Results/CA017 Top Hits - Sheet1.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_double(),
## Condition = col_character(),
## Cell.ID = col_character(),
## Category = col_character(),
## N = col_double(),
## Percent_EdU = col_double(),
## sd = col_double(),
## se = col_double(),
## ci = col_double(),
## `adj-pval` = col_double()
## )
OA_protein <- OA_df[c('Category', 'Percent_EdU')]
colnames(OA_protein) <- c('Protein', 'Percent_EdU')
# need to run the previous 2 cells
tested_protein_correct_narm_original
## Protein Condition edu_fc Probability
## 7 APOB APOB: 1 ug/mL 1.6035988 1.000000e+00
## 22 FGF1 FGF1: 0.5 ug/mL 64.2626545 1.000000e+00
## 24 FGF18 FGF18: 0.10 ug/mL 4.5579813 1.000000e+00
## 26 FGFBP3 FGFBP3: 1 ug/mL 1.6645980 1.000000e+00
## 2 ADAMTS7 ADAMTS7: 1 ug/mL 1.5131683 9.999999e-01
## 35 HGFAC HGFAC: 2 ug/mL 0.7402934 9.999884e-01
## 1 ADAMTS19 ADAMTS19: 1 ug/mL 1.4493507 9.999758e-01
## 27 FLT1 FLT1: 0.5 ug/mL 0.8774524 9.999677e-01
## 28 FN1 FN1: 10 ug/mL 1.5348433 9.999155e-01
## 41 LAMA1 LAMA1: 2 ug/mL 2.3408121 9.998283e-01
## 47 NTS NTS: 0.2 ug/mL 0.8503963 9.998029e-01
## 21 FBLN1 FBLN1: 1 ug/mL 0.8120715 9.996300e-01
## 45 NDNF NDNF: 0.5 ug/mL 5.2407153 9.991577e-01
## 60 SPON1 SPON1: 0.5 ug/mL 1.1093920 9.989819e-01
## 72 TNC TNC: 3 ug/mL 1.2622126 9.981275e-01
## 67 THBS2 THBS2: 1 ug/mL 0.4271477 9.980863e-01
## 6 APLN APLN: 2.5 ug/mL 0.8572318 9.976410e-01
## 3 AGRN AGRN: 2 ug/mL 2.7113750 9.973943e-01
## 5 ANOS1 ANOS1: 1 ug/mL 1.2784966 9.969820e-01
## 62 STC2 STC2: 1 ug/mL 1.2922563 9.925379e-01
## 51 PDGFRL PDGFRL: 3 ug/mL 3.7278178 9.867464e-01
## 20 DRAXIN DRAXIN: 1 ug/mL 1.8601663 9.845426e-01
## 63 TCN2 TCN2: 1 ug/mL 1.4196174 9.785267e-01
## 59 SFRP2 SFRP2: 1 ug/mL 2.7537777 9.747477e-01
## 12 CHRDL1 CHRDL1: 1 ug/mL 1.4833789 9.106392e-01
## 56 PTPRS PTPRS: 2.5 ug/mL 2.4477719 9.048828e-01
## 34 HGF HGF: 0.2 ug/mL 2.5292465 7.912208e-01
## 42 LTBP1 LTBP1: 1 ug/mL 1.5764840 7.367217e-01
## 69 THBS4 THBS4: 1 ug/mL 2.7343697 7.352176e-01
## 70 TIMP1 TIMP1: 0.50 ug/mL 3.6504186 7.302567e-01
## 73 VASN VASN: 1 ug/mL 1.7205264 6.740634e-01
## 10 BTC BTC: 0.5 ug/mL 1.0506944 6.189895e-01
## 30 FST FST: 0.25 ug/mL 0.3676806 5.924256e-01
## 58 SFRP1 SFRP1: 1 ug/mL 1.9092366 5.274271e-01
## 57 RARRES2 RARRES2: 0.5 ug/mL 1.5516599 4.392245e-01
## 19 DAG1 DAG1: 2.5 ug/mL 0.7964278 4.278212e-01
## 55 PTN PTN: 1 ug/mL 0.3720418 4.246091e-01
## 43 MATN2 MATN2: 1 ug/mL 1.1671517 3.794087e-01
## 37 IGFBP3 IGFBP3: 0.3 ug/mL 1.0924732 3.032465e-01
## 18 CXCL12 CXCL12: 1 ug/mL 2.3504654 2.808731e-01
## 74 VTN VTN: 3 ug/mL 1.6996553 2.640201e-01
## 68 THBS3 THBS3: 1 ug/mL 1.8962563 2.544644e-01
## 16 CSF2 CSF2: 1 ug/mL 0.5257165 2.463287e-01
## 75 WISP2 WISP2: 1 ug/mL 2.2976018 2.450493e-01
## 49 PAMR1 PAMR1: 0.2 ug/mL 4.1632041 2.066439e-01
## 46 NOV NOV: 1.5 ug/mL 1.6418589 1.691551e-01
## 38 IGFBP4 IGFBP4: 0.2 ug/mL 0.8694056 1.593942e-01
## 71 TIMP2 TIMP2: 0.5 ug/mL 2.3550088 1.534895e-01
## 44 NAMPT NAMPT: 2 ug/mL 0.4533916 1.521385e-01
## 4 ANG ANG: 1 ug/mL 2.0935040 1.474305e-01
## 52 PLAT PLAT: 2 ug/mL 1.0712790 1.469910e-01
## 66 THBS1 THBS1: 1 ug/mL 0.6435273 1.200906e-01
## 53 PLAU PLAU: 2 ug/mL 3.0352391 7.373726e-02
## 54 POSTN POSTN: 1 ug/mL 0.9935289 7.317355e-02
## 11 CCL14 CCL14: 1 ug/mL 0.8223549 7.117944e-02
## 39 IGFBP5 IGFBP5: 0.5 ug/mL 0.9490404 5.785878e-02
## 65 TGFBI TGFBI: 0.2 ug/mL 10.0223719 2.326525e-02
## 17 CTGF CTGF: 3 ug/mL 0.5622329 2.042650e-02
## 61 STC1 STC1: 1 ug/mL 0.2107510 1.495876e-02
## 48 OLFML3 OLFML3: 0.75 ug/mL 0.3038827 1.307027e-02
## 31 FSTL1 FSTL1: 1 ug/mL 3.6943920 7.923565e-03
## 50 PDGFD PDGFD: 0.5 ug/mL 131.1052279 7.183722e-03
## 14 COCH COCH: 1 ug/mL 0.9398051 6.497679e-03
## 13 CLC CLC: 1 ug/mL 1.4293817 6.226064e-03
## 36 IGFBP2 IGFBP2: 0.5 ug/mL 0.8417063 3.844506e-03
## 40 IGFBP7 IGFBP7: 0.1 ug/mL 1.1754256 7.894397e-04
## 33 GDNF GDNF: 0.1 ug/mL 0.7590671 3.652716e-04
## 29 FRZB FRZB: 1 ug/mL 0.9119177 3.556593e-04
## 32 GDF15 GDF15: 1 ug/mL 3.8664053 3.780000e-07
## 64 TGFB1 TGFB1: 0.01 ug/mL 1.4574688 1.630000e-10
OA_protein_df <- merge(x=OA_protein, y=tested_protein_correct_narm_original, by="Protein", all.x=FALSE, all.y=FALSE)
OA_protein_df <- OA_protein_df[order(-OA_protein_df$Probability),]
OA_protein_df <- OA_protein_df[c("Protein", "Percent_EdU", "edu_fc", "Probability")]
colnames(OA_protein_df) <- c("Protein", "OA_%edU", "Ch_edu", "Probability")
# temp_df$ground_truth <- temp_df$edu_fc >=2.0
#
# cut_off_3_count <- sum(temp_df$edu_fc >=3.0, na.rm = TRUE)
# cut_off_2_count <- sum(temp_df$edu_fc >=2.0, na.rm = TRUE)
# cut_off_1_5_count <- sum(temp_df$edu_fc >=1.5, na.rm = TRUE)
# length(temp_df$Protein)
#
# temp_df
#
# temp_df$cutoff_3 <- 'Off'
# temp_df$cutoff_3[temp_df$edu_fc >=3.0] <- 'overlap'
#
# temp_df$cutoff_2 <- 'Off'
# temp_df$cutoff_2[temp_df$edu_fc >=2.0] <- 'overlap'
#
# temp_df$cutoff_1_5 <- 'Off'
# temp_df$cutoff_1_5[temp_df$edu_fc >=1.5] <- 'overlap'
OA_protein_df$OA_pedU_capped <- OA_protein_df$`OA_%edU`
OA_protein_df$OA_pedU_capped[OA_protein_df$OA_pedU_capped>6] <-6
edu_min <- range(OA_protein_df$OA_pedU_capped)[1]
edu_max <- range(OA_protein_df$OA_pedU_capped)[2]
OA_protein_df$p_edu_fc_scalled <- (OA_protein_df$OA_pedU_capped-edu_min)/(edu_max - edu_min)
# OA_protein_df <- OA_protein_df[sort(-OA_protein_df$Probability),]
candidate_ls <- c(1:length(OA_protein_df$Protein))
OA_protein_df$Label <- sprintf("OA%03d", candidate_ls)
library(hrbrthemes)
p <- ggplot(OA_protein_df, aes(x=Label)) +
geom_bar( aes(y=Probability), stat="identity", size=2, fill="grey0", alpha=.2) +
geom_bar( aes(y=p_edu_fc_scalled), stat="identity", size=2, fill=rgb(0.2, 0.6, 0.9, 1), alpha=.6) +
scale_y_continuous(
# Features of the first axis
name = expression("Hit Candidate Probability"),
# Add a second axis and specify its features
sec.axis = sec_axis(~.*(edu_max-edu_min)+edu_min, name=expression(bold("In Vitro %Edu Fc")))
) +
# theme_ipsum() +
theme_classic() +
theme(
axis.title.x = element_text(size=20),
axis.title.y = element_text(color = "grey0", size=20),
axis.title.y.right = element_text(color = rgb(0.2, 0.6, 0.9, 1), size=20),
axis.text.x = element_text(size = 18, angle = 90),
axis.text.y = element_text(size = 18),
axis.text.y.right = element_text(size = 18)
)
# geom_hline(yintercept = h_line_mark, color='red')
p