library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1     v purrr   0.3.2
## v tibble  2.1.1     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Instructions:

In this R Markdown document, you and your team will create a fully reproducible analysis with the goal of assessing and interpreting the replicability of two pharmacogenomic experiments. This document should contain all of the text and code of your analyses, which will allow others to run, interpret, and reuse your work.

The questions below will help guide you in your analyses and interpretation of results. You don’t need to answer every question, but for the problems you do complete, make sure that you completely justify your conclusions by explaining your reasoning and including numerical summaries and data visualizations wherever possible. There are four tutorials (also R Markdown documents) that will help you learn new tools to tackle these problems, and the questions are divided into four sections corresponding to the tutorials (though many overlap with more than one tutorial). If questions arise during your analyses that do not fit into these problems, feel free to include those as well.

For each answer, include text by simply typing below the question. Include code in code blocks (include three back ticks at the start and end of each code block):

#Your code goes here

You may find it helpful to use the version control and code sharing system called GitHub to work together with your team so that all of you can edit the same document and keep track of its changes. Here is a setup guide and brief introduction to Git and GitHub from another course. The mentors will be able to help if you run into problems.

Questions:

Exploratory analysis of pharmacogenomic data

  1. How many cell-lines are contained in the data?
rawPharmacoData <- readRDS("C:/Users/HP/Desktop/PR2019replicathon/data/rawPharmacoData.rds")
unique(rawPharmacoData$cellLine)
##   [1] "22RV1"          "5637"           "639-V"          "697"           
##   [5] "769-P"          "786-0"          "8-MG-BA"        "8305C"         
##   [9] "8505C"          "A172"           "A204"           "A2058"         
##  [13] "A253"           "A2780"          "A375"           "A549"          
##  [17] "A673"           "ACHN"           "AN3-CA"         "AsPC-1"        
##  [21] "AU565"          "BCPAP"          "BFTC-909"       "BL-41"         
##  [25] "BL-70"          "BT-20"          "BT-474"         "BT-549"        
##  [29] "BxPC-3"         "C2BBe1"         "C32"            "C3A"           
##  [33] "CAL-12T"        "CAL-27"         "CAL-85-1"       "Calu-3"        
##  [37] "Calu-6"         "CAMA-1"         "Capan-2"        "CAS-1"         
##  [41] "CHL-1"          "CHP-212"        "COLO-205"       "COLO-320-HSR"  
##  [45] "COLO-678"       "COLO-679"       "COLO-741"       "COR-L105"      
##  [49] "COR-L23"        "Daoy"           "DBTRG-05MG"     "DEL"           
##  [53] "Detroit562"     "DK-MG"          "DMS-114"        "DOHH-2"        
##  [57] "DU-145"         "EB2"            "EFE-184"        "EFM-19"        
##  [61] "EFO-21"         "EFO-27"         "EM-2"           "ESS-1"         
##  [65] "FADU"           "G-361"          "G-401"          "G-402"         
##  [69] "GAMG"           "GB-1"           "GCIY"           "GCT"           
##  [73] "GI-1"           "GMS-10"         "H4"             "HCC1187"       
##  [77] "HCC1395"        "HCC1569"        "HCC1806"        "HCC1954"       
##  [81] "HCC70"          "HCT-116"        "HCT-15"         "HD-MY-Z"       
##  [85] "HEC-1"          "HGC-27"         "HH"             "HLE"           
##  [89] "HOS"            "HPAF-II"        "Hs-578-T"       "HSC-2"         
##  [93] "HT"             "HT-1080"        "HT-1197"        "HT-1376"       
##  [97] "HT-144"         "HT-29"          "HuCCT1"         "HuP-T3"        
## [101] "HuP-T4"         "IA-LM"          "IGROV-1"        "IPC-298"       
## [105] "IST-MES1"       "J82"            "JVM-3"          "K052"          
## [109] "KALS-1"         "KARPAS-299"     "KARPAS-422"     "KG-1"          
## [113] "KLE"            "KNS-42"         "KNS-62"         "KNS-81-FD"     
## [117] "KP-4"           "KU812"          "KURAMOCHI"      "KYSE-140"      
## [121] "KYSE-150"       "KYSE-180"       "KYSE-410"       "KYSE-450"      
## [125] "KYSE-510"       "KYSE-520"       "KYSE-70"        "L-363"         
## [129] "L-428"          "LCLC-103H"      "LOXIMVI"        "LP-1"          
## [133] "LS-123"         "LS-411N"        "LS-513"         "LU-99A"        
## [137] "M059J"          "MC116"          "MCF7"           "MDA-MB-157"    
## [141] "MDA-MB-175-VII" "MDA-MB-415"     "MDA-MB-453"     "MDA-MB-468"    
## [145] "MEG-01"         "MEL-HO"         "MES-SA"         "Mewo"          
## [149] "MFE-280"        "MFE-296"        "MG-63"          "MHH-ES-1"      
## [153] "MIA-PaCa-2"     "MKN45"          "MKN7"           "MOLT-16"       
## [157] "MPP-89"         "MSTO-211H"      "NCI-H1048"      "NCI-H1092"     
## [161] "NCI-H1155"      "NCI-H1299"      "NCI-H1355"      "NCI-H1563"     
## [165] "NCI-H1573"      "NCI-H1581"      "NCI-H1648"      "NCI-H1650"     
## [169] "NCI-H1651"      "NCI-H1666"      "NCI-H1693"      "NCI-H1694"     
## [173] "NCI-H1703"      "NCI-H1792"      "NCI-H1793"      "NCI-H1975"     
## [177] "NCI-H2009"      "NCI-H2030"      "NCI-H2052"      "NCI-H2087"     
## [181] "NCI-H2122"      "NCI-H2170"      "NCI-H2228"      "NCI-H226"      
## [185] "NCI-H23"        "NCI-H2452"      "NCI-H28"        "NCI-H358"      
## [189] "NCI-H441"       "NCI-H460"       "NCI-H520"       "NCI-H522"      
## [193] "NCI-H650"       "NCI-H661"       "NCI-H727"       "NCI-H747"      
## [197] "NCI-H810"       "NCI-N87"        "NCI-SNU-1"      "NCI-SNU-16"    
## [201] "NUGC-3"         "OC-314"         "OCI-AML2"       "OE33"          
## [205] "ONS-76"         "OPM-2"          "OVCAR-3"        "OVCAR-4"       
## [209] "OVCAR-8"        "P12-ICHIKAWA"   "P31-FUJ"        "Panc 03.27"    
## [213] "Panc 10.05"     "PC-14"          "PC-3"           "PSN1"          
## [217] "Raji"           "RD"             "REH"            "RERF-LC-MS"    
## [221] "RKO"            "RL95-2"         "RPMI-7951"      "RPMI-8402"     
## [225] "RT-112"         "RT4"            "RVH-421"        "Saos-2"        
## [229] "SBC-5"          "SCC-25"         "SCC-9"          "SF126"         
## [233] "SF295"          "SHP-77"         "SIG-M5"         "SIMA"          
## [237] "SJRH30"         "SJSA-1"         "SK-CO-1"        "SK-HEP-1"      
## [241] "SK-LMS-1"       "SK-LU-1"        "SK-MEL-2"       "SK-MEL-24"     
## [245] "SK-MEL-30"      "SK-MEL-5"       "SK-MES-1"       "SK-MM-2"       
## [249] "SK-N-AS"        "SK-N-DZ"        "SK-N-FI"        "SK-OV-3"       
## [253] "SNG-M"          "SNU-387"        "SNU-423"        "SNU-449"       
## [257] "SNU-C2B"        "SUP-T1"         "SW1088"         "SW1417"        
## [261] "SW1573"         "SW1990"         "SW48"           "SW620"         
## [265] "SW780"          "SW900"          "T-24"           "T47D"          
## [269] "T84"            "T98G"           "TCCSUP"         "TE-1"          
## [273] "TE-11"          "TE-15"          "TE-5"           "TE-9"          
## [277] "TYK-nu"         "U-118-MG"       "U-2-OS"         "U-87-MG"       
## [281] "UACC-257"       "UACC-62"        "UACC-812"       "UM-UC-3"       
## [285] "VMRC-RCZ"       "WM-115"         "YKG-1"          "ZR-75-30"
Celllines <- unique(rawPharmacoData$cellLine) 
length(Celllines)
## [1] 288
  1. What drug concentrations were used in each study?
Drug <- unique(rawPharmacoData$drug) 
length(Drug)
## [1] 15
rawPharmacoData %>%
    group_by(study) %>%
    summarize(Concentration = n_distinct(concentration))
## # A tibble: 2 x 2
##   study Concentration
##   <chr>         <int>
## 1 CCLE              8
## 2 GDSC             32
rawPharmacoData %>%
    group_by(study) %>%
    summarize(Concentration = n_distinct(concentration), droga = n_distinct(drug))
## # A tibble: 2 x 3
##   study Concentration droga
##   <chr>         <int> <int>
## 1 CCLE              8    15
## 2 GDSC             32    15
rawPharmacoData %>%
  group_by(study)%>%
  count(concentration)
## # A tibble: 40 x 3
## # Groups:   study [2]
##    study concentration     n
##    <chr>         <dbl> <int>
##  1 CCLE         0.0025  2530
##  2 CCLE         0.008   2543
##  3 CCLE         0.025   2557
##  4 CCLE         0.08    2557
##  5 CCLE         0.25    2557
##  6 CCLE         0.8     2557
##  7 CCLE         2.53    2557
##  8 CCLE         8       2556
##  9 GDSC         0.0004    89
## 10 GDSC         0.0008    89
## # ... with 30 more rows
  1. Histograms, appart from telling us how the data is distributed, can also make evident potential problems with the data. Plot a histogram of drug viabilities. Does it look as one would expect from the description of the data?
rawPharmacoData %>%
    ggplot(aes(x = log2(viability))) +
    geom_histogram(fill = "gray", color = "black") +
    facet_wrap(~ drug) +
    ggtitle("Distributions of viability by drug")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

rawPharmacoData %>%
    ggplot(aes(x = viability)) +
    geom_histogram(aes( color = study), binwidth = 0.1) +
    facet_wrap(~ drug) +
    ggtitle("Distributions of viability by drug")

  1. How many viability data points are within the expected range according to the definition of viability (e.g. above 0 and below 100)? Try to come up with explanations about the values that are out of range. Are these due to technical issues? Biology?
rawPharmacoData %>%
    group_by(study)%>%
    summarize(min_viability = min(viability),
              max_viability = max(viability),
              n_too_small   = sum(viability < 0),
              n_too_big     = sum(viability > 100),
              n_middle      = sum(viability >0 & viability < 100))
## # A tibble: 2 x 6
##   study min_viability max_viability n_too_small n_too_big n_middle
##   <chr>         <dbl>         <dbl>       <int>     <int>    <int>
## 1 CCLE          -20            201           13      7056    13330
## 2 GDSC          -14.2          319.          10      8722    14281
  1. Read the csv file containing the summarized data. What kinds of variables are in the data? What does each column represent
summarizedPharmacoData <-readRDS("C:/Users/HP/Desktop/PR2019replicathon/data/summarizedPharmacoData.rds")
str(summarizedPharmacoData)
## 'data.frame':    2557 obs. of  6 variables:
##  $ cellLine : chr  "22RV1" "5637" "639-V" "697" ...
##  $ drug     : chr  "Nilotinib" "Nilotinib" "Nilotinib" "Nilotinib" ...
##  $ ic50_CCLE: num  8 7.48 8 1.91 8 ...
##  $ auc_CCLE : num  0 0.00726 0.07101 0.15734 0 ...
##  $ ic50_GDSC: num  155.27 219.93 92.18 3.06 19.63 ...
##  $ auc_GDSC : num  0.00394 0.00362 0.00762 0.06927 0.02876 ...
  1. Plot histograms of the viability scores at different levels of drug doses. Are stronger drug concentrations consistent with lower viability scores?

-No necesariamente mira la numero 10

rawPharmacoData %>%
    ggplot(aes(x = viability)) +
    geom_histogram(aes( color = drug), binwidth = 0.1) +
    facet_wrap(~ concentration) +
    ggtitle("Distributions of viability by different concentrations of drugs")

Using Correlation Measures to Assess Replicability of Drug Response Studies

  1. Create a scatterplot for each drug comparing the AUC in GDSC and CCLE for all cell lines (hint: code from Tutorial 1b may help).
ggplot(summarizedPharmacoData, aes(x = auc_GDSC, y = auc_CCLE, col = drug)) +
    geom_jitter(alpha = 1/2) +
    xlab("GDSC AUC") +
    ylab("CCLE AUC") +
    ggtitle("Comparing AUC in GDSC and CCLE for all cell lines")

ggplot(summarizedPharmacoData, aes(x = auc_GDSC, y = auc_CCLE, col = drug)) +
    geom_jitter(alpha = 1/2) +
    xlab("GDSC AUC") +
    ylab("CCLE AUC") +
    facet_wrap(~ drug) +
    geom_smooth(method = "lm", col = "black") +
    ggtitle("Comparing AUC in GDSC and CCLE for all drugs")

2. Calculate correlation coefficients of the AUC in GDSC and CCLE for each drug (hint: code from Tutorial 1b may help).

drugCorrs <- summarizedPharmacoData%>% 
    group_by(drug) %>%
    summarize(Pearson_ic50  = cor(-log10(ic50_GDSC / 10^6), -log10(ic50_CCLE / 10^6), method = "pearson"),
              Spearman_ic50 = cor(-log10(ic50_GDSC / 10^6), -log10(ic50_CCLE / 10^6), method = "spearman"))

drugCorrs
## # A tibble: 15 x 3
##    drug       Pearson_ic50 Spearman_ic50
##    <chr>             <dbl>         <dbl>
##  1 17-AAG           0.543         0.586 
##  2 AZD0530          0.455         0.360 
##  3 AZD6244          0.320         0.244 
##  4 Crizotinib       0.409         0.106 
##  5 Erlotinib        0.0812        0.0800
##  6 lapatinib        0.427         0.289 
##  7 Nilotinib        0.611         0.122 
##  8 Nutlin-3         0.143         0.306 
##  9 paclitaxel       0.211         0.350 
## 10 PD-0325901       0.625         0.580 
## 11 PD-0332991       0.240         0.141 
## 12 PHA-665752       0.118         0.0554
## 13 PLX4720          0.456         0.358 
## 14 Sorafenib        0.277         0.329 
## 15 TAE684           0.287         0.268
drugCorrs %>%
  tidyr::spread(measure, correlation) %>%
  ggplot(aes(x = Pearson_ic50, y = Spearman_ic50, label = drug)) +
    geom_point(alpha = 1/2) +
    geom_text() +
    ggtitle("Correlation of cell line IC50 summaries between studies for each drug")

drugCorrs <- gather(drugCorrs, measure, correlation, -drug)

drugCorrs

drugCorrs <- gather(drugCorrs, measure, correlation, -drug)

drugCorrs
## # A tibble: 30 x 3
##    drug       measure      correlation
##    <chr>      <chr>              <dbl>
##  1 17-AAG     Pearson_ic50      0.543 
##  2 AZD0530    Pearson_ic50      0.455 
##  3 AZD6244    Pearson_ic50      0.320 
##  4 Crizotinib Pearson_ic50      0.409 
##  5 Erlotinib  Pearson_ic50      0.0812
##  6 lapatinib  Pearson_ic50      0.427 
##  7 Nilotinib  Pearson_ic50      0.611 
##  8 Nutlin-3   Pearson_ic50      0.143 
##  9 paclitaxel Pearson_ic50      0.211 
## 10 PD-0325901 Pearson_ic50      0.625 
## # ... with 20 more rows
drugCorrs %>%
    ggplot(aes(x = drug, y = correlation, fill = measure, group = measure)) +
    geom_bar(stat = "identity", position = position_dodge(), colour = "black") + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_fill_grey() +
    ylim(0, 1) + 
    ggtitle("Correlation of cell line IC50 summaries between studies for each drug")