library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
In this R Markdown document, you and your team will create a fully reproducible analysis with the goal of assessing and interpreting the replicability of two pharmacogenomic experiments. This document should contain all of the text and code of your analyses, which will allow others to run, interpret, and reuse your work.
The questions below will help guide you in your analyses and interpretation of results. You don’t need to answer every question, but for the problems you do complete, make sure that you completely justify your conclusions by explaining your reasoning and including numerical summaries and data visualizations wherever possible. There are four tutorials (also R Markdown documents) that will help you learn new tools to tackle these problems, and the questions are divided into four sections corresponding to the tutorials (though many overlap with more than one tutorial). If questions arise during your analyses that do not fit into these problems, feel free to include those as well.
For each answer, include text by simply typing below the question. Include code in code blocks (include three back ticks at the start and end of each code block):
#Your code goes here
You may find it helpful to use the version control and code sharing system called GitHub to work together with your team so that all of you can edit the same document and keep track of its changes. Here is a setup guide and brief introduction to Git and GitHub from another course. The mentors will be able to help if you run into problems.
rawPharmacoData <- readRDS("C:/Users/HP/Desktop/PR2019replicathon/data/rawPharmacoData.rds")
unique(rawPharmacoData$cellLine)
## [1] "22RV1" "5637" "639-V" "697"
## [5] "769-P" "786-0" "8-MG-BA" "8305C"
## [9] "8505C" "A172" "A204" "A2058"
## [13] "A253" "A2780" "A375" "A549"
## [17] "A673" "ACHN" "AN3-CA" "AsPC-1"
## [21] "AU565" "BCPAP" "BFTC-909" "BL-41"
## [25] "BL-70" "BT-20" "BT-474" "BT-549"
## [29] "BxPC-3" "C2BBe1" "C32" "C3A"
## [33] "CAL-12T" "CAL-27" "CAL-85-1" "Calu-3"
## [37] "Calu-6" "CAMA-1" "Capan-2" "CAS-1"
## [41] "CHL-1" "CHP-212" "COLO-205" "COLO-320-HSR"
## [45] "COLO-678" "COLO-679" "COLO-741" "COR-L105"
## [49] "COR-L23" "Daoy" "DBTRG-05MG" "DEL"
## [53] "Detroit562" "DK-MG" "DMS-114" "DOHH-2"
## [57] "DU-145" "EB2" "EFE-184" "EFM-19"
## [61] "EFO-21" "EFO-27" "EM-2" "ESS-1"
## [65] "FADU" "G-361" "G-401" "G-402"
## [69] "GAMG" "GB-1" "GCIY" "GCT"
## [73] "GI-1" "GMS-10" "H4" "HCC1187"
## [77] "HCC1395" "HCC1569" "HCC1806" "HCC1954"
## [81] "HCC70" "HCT-116" "HCT-15" "HD-MY-Z"
## [85] "HEC-1" "HGC-27" "HH" "HLE"
## [89] "HOS" "HPAF-II" "Hs-578-T" "HSC-2"
## [93] "HT" "HT-1080" "HT-1197" "HT-1376"
## [97] "HT-144" "HT-29" "HuCCT1" "HuP-T3"
## [101] "HuP-T4" "IA-LM" "IGROV-1" "IPC-298"
## [105] "IST-MES1" "J82" "JVM-3" "K052"
## [109] "KALS-1" "KARPAS-299" "KARPAS-422" "KG-1"
## [113] "KLE" "KNS-42" "KNS-62" "KNS-81-FD"
## [117] "KP-4" "KU812" "KURAMOCHI" "KYSE-140"
## [121] "KYSE-150" "KYSE-180" "KYSE-410" "KYSE-450"
## [125] "KYSE-510" "KYSE-520" "KYSE-70" "L-363"
## [129] "L-428" "LCLC-103H" "LOXIMVI" "LP-1"
## [133] "LS-123" "LS-411N" "LS-513" "LU-99A"
## [137] "M059J" "MC116" "MCF7" "MDA-MB-157"
## [141] "MDA-MB-175-VII" "MDA-MB-415" "MDA-MB-453" "MDA-MB-468"
## [145] "MEG-01" "MEL-HO" "MES-SA" "Mewo"
## [149] "MFE-280" "MFE-296" "MG-63" "MHH-ES-1"
## [153] "MIA-PaCa-2" "MKN45" "MKN7" "MOLT-16"
## [157] "MPP-89" "MSTO-211H" "NCI-H1048" "NCI-H1092"
## [161] "NCI-H1155" "NCI-H1299" "NCI-H1355" "NCI-H1563"
## [165] "NCI-H1573" "NCI-H1581" "NCI-H1648" "NCI-H1650"
## [169] "NCI-H1651" "NCI-H1666" "NCI-H1693" "NCI-H1694"
## [173] "NCI-H1703" "NCI-H1792" "NCI-H1793" "NCI-H1975"
## [177] "NCI-H2009" "NCI-H2030" "NCI-H2052" "NCI-H2087"
## [181] "NCI-H2122" "NCI-H2170" "NCI-H2228" "NCI-H226"
## [185] "NCI-H23" "NCI-H2452" "NCI-H28" "NCI-H358"
## [189] "NCI-H441" "NCI-H460" "NCI-H520" "NCI-H522"
## [193] "NCI-H650" "NCI-H661" "NCI-H727" "NCI-H747"
## [197] "NCI-H810" "NCI-N87" "NCI-SNU-1" "NCI-SNU-16"
## [201] "NUGC-3" "OC-314" "OCI-AML2" "OE33"
## [205] "ONS-76" "OPM-2" "OVCAR-3" "OVCAR-4"
## [209] "OVCAR-8" "P12-ICHIKAWA" "P31-FUJ" "Panc 03.27"
## [213] "Panc 10.05" "PC-14" "PC-3" "PSN1"
## [217] "Raji" "RD" "REH" "RERF-LC-MS"
## [221] "RKO" "RL95-2" "RPMI-7951" "RPMI-8402"
## [225] "RT-112" "RT4" "RVH-421" "Saos-2"
## [229] "SBC-5" "SCC-25" "SCC-9" "SF126"
## [233] "SF295" "SHP-77" "SIG-M5" "SIMA"
## [237] "SJRH30" "SJSA-1" "SK-CO-1" "SK-HEP-1"
## [241] "SK-LMS-1" "SK-LU-1" "SK-MEL-2" "SK-MEL-24"
## [245] "SK-MEL-30" "SK-MEL-5" "SK-MES-1" "SK-MM-2"
## [249] "SK-N-AS" "SK-N-DZ" "SK-N-FI" "SK-OV-3"
## [253] "SNG-M" "SNU-387" "SNU-423" "SNU-449"
## [257] "SNU-C2B" "SUP-T1" "SW1088" "SW1417"
## [261] "SW1573" "SW1990" "SW48" "SW620"
## [265] "SW780" "SW900" "T-24" "T47D"
## [269] "T84" "T98G" "TCCSUP" "TE-1"
## [273] "TE-11" "TE-15" "TE-5" "TE-9"
## [277] "TYK-nu" "U-118-MG" "U-2-OS" "U-87-MG"
## [281] "UACC-257" "UACC-62" "UACC-812" "UM-UC-3"
## [285] "VMRC-RCZ" "WM-115" "YKG-1" "ZR-75-30"
Celllines <- unique(rawPharmacoData$cellLine)
length(Celllines)
## [1] 288
Drug <- unique(rawPharmacoData$drug)
length(Drug)
## [1] 15
rawPharmacoData %>%
group_by(study) %>%
summarize(Concentration = n_distinct(concentration))
## # A tibble: 2 x 2
## study Concentration
## <chr> <int>
## 1 CCLE 8
## 2 GDSC 32
rawPharmacoData %>%
group_by(study) %>%
summarize(Concentration = n_distinct(concentration), droga = n_distinct(drug))
## # A tibble: 2 x 3
## study Concentration droga
## <chr> <int> <int>
## 1 CCLE 8 15
## 2 GDSC 32 15
rawPharmacoData %>%
group_by(study)%>%
count(concentration)
## # A tibble: 40 x 3
## # Groups: study [2]
## study concentration n
## <chr> <dbl> <int>
## 1 CCLE 0.0025 2530
## 2 CCLE 0.008 2543
## 3 CCLE 0.025 2557
## 4 CCLE 0.08 2557
## 5 CCLE 0.25 2557
## 6 CCLE 0.8 2557
## 7 CCLE 2.53 2557
## 8 CCLE 8 2556
## 9 GDSC 0.0004 89
## 10 GDSC 0.0008 89
## # ... with 30 more rows
rawPharmacoData %>%
ggplot(aes(x = log2(viability))) +
geom_histogram(fill = "gray", color = "black") +
facet_wrap(~ drug) +
ggtitle("Distributions of viability by drug")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
rawPharmacoData %>%
ggplot(aes(x = viability)) +
geom_histogram(aes( color = study), binwidth = 0.1) +
facet_wrap(~ drug) +
ggtitle("Distributions of viability by drug")
rawPharmacoData %>%
group_by(study)%>%
summarize(min_viability = min(viability),
max_viability = max(viability),
n_too_small = sum(viability < 0),
n_too_big = sum(viability > 100),
n_middle = sum(viability >0 & viability < 100))
## # A tibble: 2 x 6
## study min_viability max_viability n_too_small n_too_big n_middle
## <chr> <dbl> <dbl> <int> <int> <int>
## 1 CCLE -20 201 13 7056 13330
## 2 GDSC -14.2 319. 10 8722 14281
summarizedPharmacoData <-readRDS("C:/Users/HP/Desktop/PR2019replicathon/data/summarizedPharmacoData.rds")
str(summarizedPharmacoData)
## 'data.frame': 2557 obs. of 6 variables:
## $ cellLine : chr "22RV1" "5637" "639-V" "697" ...
## $ drug : chr "Nilotinib" "Nilotinib" "Nilotinib" "Nilotinib" ...
## $ ic50_CCLE: num 8 7.48 8 1.91 8 ...
## $ auc_CCLE : num 0 0.00726 0.07101 0.15734 0 ...
## $ ic50_GDSC: num 155.27 219.93 92.18 3.06 19.63 ...
## $ auc_GDSC : num 0.00394 0.00362 0.00762 0.06927 0.02876 ...
-No necesariamente mira la numero 10
rawPharmacoData %>%
ggplot(aes(x = viability)) +
geom_histogram(aes( color = drug), binwidth = 0.1) +
facet_wrap(~ concentration) +
ggtitle("Distributions of viability by different concentrations of drugs")
ggplot(summarizedPharmacoData, aes(x = auc_GDSC, y = auc_CCLE, col = drug)) +
geom_jitter(alpha = 1/2) +
xlab("GDSC AUC") +
ylab("CCLE AUC") +
ggtitle("Comparing AUC in GDSC and CCLE for all cell lines")
ggplot(summarizedPharmacoData, aes(x = auc_GDSC, y = auc_CCLE, col = drug)) +
geom_jitter(alpha = 1/2) +
xlab("GDSC AUC") +
ylab("CCLE AUC") +
facet_wrap(~ drug) +
geom_smooth(method = "lm", col = "black") +
ggtitle("Comparing AUC in GDSC and CCLE for all drugs")
2. Calculate correlation coefficients of the AUC in GDSC and CCLE for each drug (hint: code from Tutorial 1b may help).
drugCorrs <- summarizedPharmacoData%>%
group_by(drug) %>%
summarize(Pearson_ic50 = cor(-log10(ic50_GDSC / 10^6), -log10(ic50_CCLE / 10^6), method = "pearson"),
Spearman_ic50 = cor(-log10(ic50_GDSC / 10^6), -log10(ic50_CCLE / 10^6), method = "spearman"))
drugCorrs
## # A tibble: 15 x 3
## drug Pearson_ic50 Spearman_ic50
## <chr> <dbl> <dbl>
## 1 17-AAG 0.543 0.586
## 2 AZD0530 0.455 0.360
## 3 AZD6244 0.320 0.244
## 4 Crizotinib 0.409 0.106
## 5 Erlotinib 0.0812 0.0800
## 6 lapatinib 0.427 0.289
## 7 Nilotinib 0.611 0.122
## 8 Nutlin-3 0.143 0.306
## 9 paclitaxel 0.211 0.350
## 10 PD-0325901 0.625 0.580
## 11 PD-0332991 0.240 0.141
## 12 PHA-665752 0.118 0.0554
## 13 PLX4720 0.456 0.358
## 14 Sorafenib 0.277 0.329
## 15 TAE684 0.287 0.268
drugCorrs %>%
tidyr::spread(measure, correlation) %>%
ggplot(aes(x = Pearson_ic50, y = Spearman_ic50, label = drug)) +
geom_point(alpha = 1/2) +
geom_text() +
ggtitle("Correlation of cell line IC50 summaries between studies for each drug")
drugCorrs <- gather(drugCorrs, measure, correlation, -drug)
drugCorrs
drugCorrs <- gather(drugCorrs, measure, correlation, -drug)
drugCorrs
## # A tibble: 30 x 3
## drug measure correlation
## <chr> <chr> <dbl>
## 1 17-AAG Pearson_ic50 0.543
## 2 AZD0530 Pearson_ic50 0.455
## 3 AZD6244 Pearson_ic50 0.320
## 4 Crizotinib Pearson_ic50 0.409
## 5 Erlotinib Pearson_ic50 0.0812
## 6 lapatinib Pearson_ic50 0.427
## 7 Nilotinib Pearson_ic50 0.611
## 8 Nutlin-3 Pearson_ic50 0.143
## 9 paclitaxel Pearson_ic50 0.211
## 10 PD-0325901 Pearson_ic50 0.625
## # ... with 20 more rows
drugCorrs %>%
ggplot(aes(x = drug, y = correlation, fill = measure, group = measure)) +
geom_bar(stat = "identity", position = position_dodge(), colour = "black") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_grey() +
ylim(0, 1) +
ggtitle("Correlation of cell line IC50 summaries between studies for each drug")