Overview of biotidyr

Phil Chapman

2016-08-01

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Introduction

Exemplify functions for generating tidy data from PharmacoSet and MultiAssayExperient objects.

Quick example

A PharmacoSet object contains molecular profiling and drug response data for collections of cell lines. The molecular profiling data is stored in an ExpressionSet object

data('CCLEsmall', package='PharmacoGx')
CCLEsmall
## Name:  CCLE 
## Date Created:  Fri Nov  6 14:00:53 2015 
## Number of cell lines:  1061 
## Number of drug compounds:  24 
## RNA: 
##  Dim:  50 1036 
## RNASeq: 
##  Dim:  50 935 
## CNV: 
##  Dim:  50 744 
## Drug pertubation: 
##  Please look at pertNumber(pSet) to determine number of experiments for each drug-cell combination.
## Drug sensitivity: 
##  Number of Experiments:  11670 
##  Please look at sensNumber(pSet) to determine number of experiments for each drug-cell combination.
eset <- CCLEsmall@molecularProfiles$rna
class(eset)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"

The gather.ExpressionSet function converts an ExpressionSet into a tidy data frame:

df <- gather.ExpressionSet(eset, sample_ids=c('143B', '23132-87'), gene_ids=c('BRAF', 'EGFR'), 
                           sample_col = "cellid", gene_col = "Symbol")
df
## # A tibble: 2 x 5
##         fData_row_id                                pData_row_id    value
##                <chr>                                       <chr>    <dbl>
## 1 ENSG00000157764_at MAKER_P_NCLE_RNA7_HG-U133_PLUS_2_F09_454702 6.553047
## 2 ENSG00000157764_at WATCH_P_NCLE_RNA8_HG-U133_PLUS_2_E11_474718 7.462726
## # ... with 2 more variables: fData_gene_id <chr>, pData_sample_id <chr>
as.data.frame(df)
##         fData_row_id                                pData_row_id    value
## 1 ENSG00000157764_at MAKER_P_NCLE_RNA7_HG-U133_PLUS_2_F09_454702 6.553047
## 2 ENSG00000157764_at WATCH_P_NCLE_RNA8_HG-U133_PLUS_2_E11_474718 7.462726
##   fData_gene_id pData_sample_id
## 1          BRAF            143B
## 2          BRAF        23132-87

Note that we can customise which identifier we want to use by specifying the sample_col and gene_col parameters.

gather.PharmacoSet

The gather.PharmacoSet function combined data from multiple sources and reports in a tidy format with a single data point per row:

gather.PharmacoSet(CCLEsmall, sample_ids=c('143B', '23132-87'), gene_ids=c('BRAF', 'EGFR'),
  data_types=c('rna', 'mutation'), gene_col=c('Symbol', 'Symbol'))
## # A tibble: 6 x 5
##   unified_id assayed_id data_type         original    value
##        <chr>      <chr>     <chr>            <chr>    <dbl>
## 1       143B       BRAF       rna 6.55304746313814 6.553047
## 2   23132-87       BRAF       rna 7.46272601915643 7.462726
## 3       143B       BRAF  mutation               wt 0.000000
## 4       143B       EGFR  mutation               wt 0.000000
## 5   23132-87       BRAF  mutation               wt 0.000000
## 6   23132-87       EGFR  mutation               wt 0.000000

makeGeneticVsGeneticTibble.PharmacoSet

The makeGeneticVsGeneticTibble.PharmacoSet function generates a data frame with one pair of genetic feature data per row.

gvg_df <- makeGeneticVsGeneticTibble.PharmacoSet(CCLEsmall, sample_ids=cellNames(CCLEsmall), gene1='RBM5', gene2='RBM5',
data_type1='rna', data_type2='rnaseq', gene_col1 = "Symbol", gene_col2 = "gene_name")
gvg_df
## # A tibble: 921 x 11
##    unified_id gene1 feature_type1 feature_name1 feature_value1
##         <chr> <chr>         <chr>         <chr>          <dbl>
## 1       22RV1  RBM5           rna      RBM5_rna       7.435970
## 2    23132-87  RBM5           rna      RBM5_rna       7.448929
## 3     253J-BV  RBM5           rna      RBM5_rna       6.882137
## 4        253J  RBM5           rna      RBM5_rna       6.619413
## 5    42-MG-BA  RBM5           rna      RBM5_rna       7.584633
## 6        5637  RBM5           rna      RBM5_rna       6.607861
## 7         59M  RBM5           rna      RBM5_rna       6.833055
## 8       639-V  RBM5           rna      RBM5_rna       8.237640
## 9       647-V  RBM5           rna      RBM5_rna       6.207275
## 10        697  RBM5           rna      RBM5_rna       8.403592
## # ... with 911 more rows, and 6 more variables: feature_original1 <chr>,
## #   gene2 <chr>, feature_type2 <chr>, feature_name2 <chr>,
## #   feature_value2 <dbl>, feature_original2 <chr>

This is useful for creating plots of one gene vs another in ggplot2:

ggplot(gvg_df, aes(x=feature_value1, y=feature_value2)) + 
    geom_point() + 
    theme_bw()

However, you can also supply multiple genes to this function:

genes <- c('RBM5', 'NQO1', "STPG1", "NIPAL3","LAS1L","ENPP4","SEMA3F","CFTR")
gvg_df2 <- makeGeneticVsGeneticTibble.PharmacoSet(CCLEsmall, sample_ids=cellNames(CCLEsmall), gene1=genes,  gene2=genes, data_type1='rna', data_type2='rnaseq', gene_col1 = "Symbol", gene_col2 = "gene_name")
nrow(gvg_df2)
## [1] 51576

This data format is then in a convenient form to manage models in a data frame using the dplyr and broom packages - see Managing Many Models by Hadley Wickham. In the example below we are exploring how well RNAseq and Affymetrix data correlate across a number of genes.

mod_df <- gvg_df2 %>%
    dplyr::filter(gene1==gene2) %>%
    dplyr::group_by(feature_name1, feature_name2) %>%
    tidyr::nest() %>%
    dplyr::mutate(mod=purrr::map(data, function(x) lm(feature_value1 ~ feature_value2, data=x)),
                  res=purrr::map(mod, broom::glance))
mod_df
## # A tibble: 7 x 5
##   feature_name1 feature_name2               data      mod
##           <chr>         <chr>             <list>   <list>
## 1     STPG1_rna  STPG1_rnaseq <tibble [921 x 9]> <S3: lm>
## 2    NIPAL3_rna NIPAL3_rnaseq <tibble [921 x 9]> <S3: lm>
## 3     LAS1L_rna  LAS1L_rnaseq <tibble [921 x 9]> <S3: lm>
## 4     ENPP4_rna  ENPP4_rnaseq <tibble [921 x 9]> <S3: lm>
## 5    SEMA3F_rna SEMA3F_rnaseq <tibble [921 x 9]> <S3: lm>
## 6      CFTR_rna   CFTR_rnaseq <tibble [921 x 9]> <S3: lm>
## 7      RBM5_rna   RBM5_rnaseq <tibble [921 x 9]> <S3: lm>
## # ... with 1 more variables: res <list>
mod_df %>% dplyr::select(-data,-mod) %>% tidyr::unnest()
## # A tibble: 7 x 13
##   feature_name1 feature_name2 r.squared adj.r.squared     sigma statistic
##           <chr>         <chr>     <dbl>         <dbl>     <dbl>     <dbl>
## 1     STPG1_rna  STPG1_rnaseq 0.5868825     0.5864330 0.2530020  1305.549
## 2    NIPAL3_rna NIPAL3_rnaseq 0.8396522     0.8394777 0.3080574  4812.291
## 3     LAS1L_rna  LAS1L_rnaseq 0.7786612     0.7784203 0.2541849  3233.006
## 4     ENPP4_rna  ENPP4_rnaseq 0.9482628     0.9482065 0.3559182 16843.845
## 5    SEMA3F_rna SEMA3F_rnaseq 0.8636539     0.8635055 0.2975912  5821.198
## 6      CFTR_rna   CFTR_rnaseq 0.8649105     0.8647635 0.3088132  5883.897
## 7      RBM5_rna   RBM5_rnaseq 0.8327128     0.8325308 0.2458873  4574.547
## # ... with 7 more variables: p.value <dbl>, df <int>, logLik <dbl>,
## #   AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>