library(tidyverse)
source("../6.paper_figures/viz_themes.R")
path <- "https://raw.githubusercontent.com/broadinstitute/lincs-profiling-complementarity/master/6.paper_figures/data/significant_moas_by_threshold_both_assays.tsv.gz"
moa_performance <- read_tsv(path)
Rows: 1100 Columns: 9── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): moa, dose, moa_color_passing
dbl (3): Cell Painting, L1000, avg_replicate_count
lgl (3): pass_thresh_cellpainting, pass_thresh_l1000, pass_both
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
path <- "https://raw.githubusercontent.com/broadinstitute/lincs-profiling-complementarity/3f39cdd2e97d98c394e1031e338cab072d106a1a/6.paper_figures/results/moa_scores.tsv"
moa_scores <- read_tsv(path)
Rows: 2200 Columns: 8── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): moa, dose, assay
dbl (4): no_of_replicates, matching_score, p_value, neg_log_10_p_val
lgl (1): pass_thresh
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# To add analysis facet of all dose percent matching
updated_dose_order <- c(dose_order, "All*", "All")
# Load precision scores
precision_file <- file.path("..", "1.Data-exploration", "results", "moa_target_precision.tsv.gz")
precision_cols <- readr::cols(
drug_impact = readr::col_character(),
dose = readr::col_character(),
avg_precision = readr::col_double(),
impact_category = readr::col_character(),
assay = readr::col_character()
)
# Load and process data for plotting
precision_df <-
readr::read_tsv(precision_file, col_types = precision_cols) %>%
filter(impact_category == "moa") %>%
rename(moa = "drug_impact") %>%
select(-impact_category) %>%
mutate(assay = forcats::fct_recode(assay, "Cell Painting" = "cell_painting"))
# Separate the dose comparison and recode dose separately
same_dose_precision_df <- precision_df %>%
dplyr::filter(dose_comparison == "same_dose")
same_dose_precision_df$dose <- as.numeric(paste(same_dose_precision_df$dose))
same_dose_precision_df$dose <- dplyr::recode_factor(same_dose_precision_df$dose, !!!dose_rename)
precision_df <- dplyr::bind_rows(
same_dose_precision_df,
precision_df %>%
dplyr::filter(dose_comparison != "same_dose")
) %>%
tidyr::drop_na()
precision_df$dose <- factor(precision_df$dose, levels = updated_dose_order)
moa_scores %>%
skimr::skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 2200
Number of columns 8
_______________________
Column type frequency:
character 3
logical 1
numeric 4
________________________
Group variables None
moa_scores %>%
group_by(pass_thresh, assay) %>%
skimr::skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 2200
Number of columns 8
_______________________
Column type frequency:
character 2
numeric 4
________________________
Group variables pass_thresh, assay
moa_scores %>%
ggplot(aes(no_of_replicates)) +
geom_histogram(bins = 50) +
theme_bw() +
scale_x_log10() +
facet_grid(assay~dose)
precision_df %>%
skimr::skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 2933
Number of columns 5
_______________________
Column type frequency:
character 2
factor 2
numeric 1
________________________
Group variables None
precision_df %>%
anti_join(moa_scores, by = c("moa" = "moa")) %>%
arrange(moa)
moa_scores %>%
anti_join(precision_df, by = c("moa" = "moa")) %>%
arrange(moa)
moa_scores <-
moa_scores %>%
inner_join(precision_df, by = c("moa", "dose", "assay")) %>%
arrange(moa)
model_fit <- function(data, output_variable, input_variable = "log(no_of_replicates)") {
lm(formula = as.formula(paste0(
output_variable, "~", input_variable
)),
data = data)
}
avg_precision depends on number of replicates
moa_scores %>%
mutate(all_dose = dose %in% c("All", "All*")) %>%
ggplot(aes(no_of_replicates, avg_precision)) +
geom_bin2d() +
geom_smooth(method = "lm", formula = "y~x") +
theme_bw() +
facet_grid(assay~all_dose, labeller = labeller(.cols = label_both), scales = "free_x")
moa_scores %>%
mutate(all_dose = dose %in% c("All", "All*")) %>%
ggplot(aes(log(no_of_replicates), log(avg_precision))) +
geom_bin2d() +
geom_smooth(method = "lm", formula = "y~x") +
theme_bw() +
facet_grid(assay~all_dose, labeller = labeller(.cols = label_both), scales = "free_x")
moa_summary <-
moa_scores %>%
mutate(all_dose = dose %in% c("All", "All*")) %>%
group_nest(assay, all_dose) %>%
mutate(model = map(data, model_fit, output_variable = "log(avg_precision)", input_variable = "log(no_of_replicates)")) %>%
mutate(model_summary = map(model, broom::tidy))
moa_summary %>%
unnest(model_summary) %>%
filter(term != "(Intercept)") %>%
dplyr::select(assay, all_dose, term, estimate:p.value)
moa_summary <-
moa_scores %>%
mutate(all_dose = dose %in% c("All", "All*")) %>%
group_nest(assay, all_dose) %>%
mutate(model = map(data, model_fit, output_variable = "log(avg_precision)", input_variable = "no_of_replicates")) %>%
mutate(model_summary = map(model, broom::tidy))
moa_summary %>%
unnest(model_summary) %>%
filter(term != "(Intercept)") %>%
dplyr::select(assay, all_dose, term, estimate:p.value)
median pairwise correlation (matching_score) does not depend on number of replicates
moa_scores %>%
mutate(all_dose = dose %in% c("All", "All*")) %>%
ggplot(aes(log(no_of_replicates), matching_score)) +
geom_bin2d() +
geom_smooth(method = "lm", formula = "y~x") +
theme_bw() +
facet_grid(assay~all_dose, labeller = labeller(.cols = label_both))
moa_summary <-
moa_scores %>%
mutate(all_dose = dose %in% c("All", "All*")) %>%
group_nest(assay, all_dose) %>%
mutate(model = map(data, model_fit, output_variable = "matching_score")) %>%
mutate(model_summary = map(model, broom::tidy))
moa_summary %>%
unnest(model_summary) %>%
filter(term != "(Intercept)") %>%
dplyr::select(assay, all_dose, term, estimate:p.value)
p-value of matching_score does depend on number of replicates
moa_scores %>%
mutate(all_dose = dose %in% c("All", "All*")) %>%
ggplot(aes(log(no_of_replicates), neg_log_10_p_val)) +
geom_bin2d() +
geom_smooth(method = "lm", formula = "y~x") +
theme_bw() +
facet_grid(assay~all_dose, labeller = labeller(.cols = label_both))
moa_summary <-
moa_scores %>%
mutate(all_dose = dose %in% c("All", "All*")) %>%
group_nest(assay, all_dose) %>%
mutate(model = map(data, model_fit, output_variable = "neg_log_10_p_val")) %>%
mutate(model_summary = map(model, broom::tidy))
moa_summary %>%
unnest(model_summary) %>%
filter(term != "(Intercept)") %>%
dplyr::select(assay, all_dose, term, estimate:p.value)