##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Exemplify functions for generating tidy data from PharmacoSet and MultiAssayExperient objects.
A PharmacoSet object contains molecular profiling and drug response data for collections of cell lines. The molecular profiling data is stored in an ExpressionSet object
data('CCLEsmall', package='PharmacoGx')
CCLEsmall## Name: CCLE
## Date Created: Fri Nov 6 14:00:53 2015
## Number of cell lines: 1061
## Number of drug compounds: 24
## RNA:
## Dim: 50 1036
## RNASeq:
## Dim: 50 935
## CNV:
## Dim: 50 744
## Drug pertubation:
## Please look at pertNumber(pSet) to determine number of experiments for each drug-cell combination.
## Drug sensitivity:
## Number of Experiments: 11670
## Please look at sensNumber(pSet) to determine number of experiments for each drug-cell combination.
eset <- CCLEsmall@molecularProfiles$rna
class(eset)## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
The gather.ExpressionSet function converts an ExpressionSet into a tidy data frame:
df <- gather.ExpressionSet(eset, sample_ids=c('143B', '23132-87'), gene_ids=c('BRAF', 'EGFR'),
sample_col = "cellid", gene_col = "Symbol")
df## # A tibble: 2 x 5
## fData_row_id pData_row_id value
## <chr> <chr> <dbl>
## 1 ENSG00000157764_at MAKER_P_NCLE_RNA7_HG-U133_PLUS_2_F09_454702 6.553047
## 2 ENSG00000157764_at WATCH_P_NCLE_RNA8_HG-U133_PLUS_2_E11_474718 7.462726
## # ... with 2 more variables: fData_gene_id <chr>, pData_sample_id <chr>
as.data.frame(df)## fData_row_id pData_row_id value
## 1 ENSG00000157764_at MAKER_P_NCLE_RNA7_HG-U133_PLUS_2_F09_454702 6.553047
## 2 ENSG00000157764_at WATCH_P_NCLE_RNA8_HG-U133_PLUS_2_E11_474718 7.462726
## fData_gene_id pData_sample_id
## 1 BRAF 143B
## 2 BRAF 23132-87
Note that we can customise which identifier we want to use by specifying the sample_col and gene_col parameters.
The gather.PharmacoSet function combined data from multiple sources and reports in a tidy format with a single data point per row:
gather.PharmacoSet(CCLEsmall, sample_ids=c('143B', '23132-87'), gene_ids=c('BRAF', 'EGFR'),
data_types=c('rna', 'mutation'), gene_col=c('Symbol', 'Symbol'))## # A tibble: 6 x 5
## unified_id assayed_id data_type original value
## <chr> <chr> <chr> <chr> <dbl>
## 1 143B BRAF rna 6.55304746313814 6.553047
## 2 23132-87 BRAF rna 7.46272601915643 7.462726
## 3 143B BRAF mutation wt 0.000000
## 4 143B EGFR mutation wt 0.000000
## 5 23132-87 BRAF mutation wt 0.000000
## 6 23132-87 EGFR mutation wt 0.000000
The makeGeneticVsGeneticTibble.PharmacoSet function generates a data frame with one pair of genetic feature data per row.
gvg_df <- makeGeneticVsGeneticTibble.PharmacoSet(CCLEsmall, sample_ids=cellNames(CCLEsmall), gene1='RBM5', gene2='RBM5',
data_type1='rna', data_type2='rnaseq', gene_col1 = "Symbol", gene_col2 = "gene_name")
gvg_df## # A tibble: 921 x 11
## unified_id gene1 feature_type1 feature_name1 feature_value1
## <chr> <chr> <chr> <chr> <dbl>
## 1 22RV1 RBM5 rna RBM5_rna 7.435970
## 2 23132-87 RBM5 rna RBM5_rna 7.448929
## 3 253J-BV RBM5 rna RBM5_rna 6.882137
## 4 253J RBM5 rna RBM5_rna 6.619413
## 5 42-MG-BA RBM5 rna RBM5_rna 7.584633
## 6 5637 RBM5 rna RBM5_rna 6.607861
## 7 59M RBM5 rna RBM5_rna 6.833055
## 8 639-V RBM5 rna RBM5_rna 8.237640
## 9 647-V RBM5 rna RBM5_rna 6.207275
## 10 697 RBM5 rna RBM5_rna 8.403592
## # ... with 911 more rows, and 6 more variables: feature_original1 <chr>,
## # gene2 <chr>, feature_type2 <chr>, feature_name2 <chr>,
## # feature_value2 <dbl>, feature_original2 <chr>
This is useful for creating plots of one gene vs another in ggplot2:
ggplot(gvg_df, aes(x=feature_value1, y=feature_value2)) +
geom_point() +
theme_bw()However, you can also supply multiple genes to this function:
genes <- c('RBM5', 'NQO1', "STPG1", "NIPAL3","LAS1L","ENPP4","SEMA3F","CFTR")
gvg_df2 <- makeGeneticVsGeneticTibble.PharmacoSet(CCLEsmall, sample_ids=cellNames(CCLEsmall), gene1=genes, gene2=genes, data_type1='rna', data_type2='rnaseq', gene_col1 = "Symbol", gene_col2 = "gene_name")
nrow(gvg_df2)## [1] 51576
This data format is then in a convenient form to manage models in a data frame using the dplyr and broom packages - see Managing Many Models by Hadley Wickham. In the example below we are exploring how well RNAseq and Affymetrix data correlate across a number of genes.
mod_df <- gvg_df2 %>%
dplyr::filter(gene1==gene2) %>%
dplyr::group_by(feature_name1, feature_name2) %>%
tidyr::nest() %>%
dplyr::mutate(mod=purrr::map(data, function(x) lm(feature_value1 ~ feature_value2, data=x)),
res=purrr::map(mod, broom::glance))
mod_df## # A tibble: 7 x 5
## feature_name1 feature_name2 data mod
## <chr> <chr> <list> <list>
## 1 STPG1_rna STPG1_rnaseq <tibble [921 x 9]> <S3: lm>
## 2 NIPAL3_rna NIPAL3_rnaseq <tibble [921 x 9]> <S3: lm>
## 3 LAS1L_rna LAS1L_rnaseq <tibble [921 x 9]> <S3: lm>
## 4 ENPP4_rna ENPP4_rnaseq <tibble [921 x 9]> <S3: lm>
## 5 SEMA3F_rna SEMA3F_rnaseq <tibble [921 x 9]> <S3: lm>
## 6 CFTR_rna CFTR_rnaseq <tibble [921 x 9]> <S3: lm>
## 7 RBM5_rna RBM5_rnaseq <tibble [921 x 9]> <S3: lm>
## # ... with 1 more variables: res <list>
mod_df %>% dplyr::select(-data,-mod) %>% tidyr::unnest()## # A tibble: 7 x 13
## feature_name1 feature_name2 r.squared adj.r.squared sigma statistic
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 STPG1_rna STPG1_rnaseq 0.5868825 0.5864330 0.2530020 1305.549
## 2 NIPAL3_rna NIPAL3_rnaseq 0.8396522 0.8394777 0.3080574 4812.291
## 3 LAS1L_rna LAS1L_rnaseq 0.7786612 0.7784203 0.2541849 3233.006
## 4 ENPP4_rna ENPP4_rnaseq 0.9482628 0.9482065 0.3559182 16843.845
## 5 SEMA3F_rna SEMA3F_rnaseq 0.8636539 0.8635055 0.2975912 5821.198
## 6 CFTR_rna CFTR_rnaseq 0.8649105 0.8647635 0.3088132 5883.897
## 7 RBM5_rna RBM5_rnaseq 0.8327128 0.8325308 0.2458873 4574.547
## # ... with 7 more variables: p.value <dbl>, df <int>, logLik <dbl>,
## # AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>