coursera_genomics_stats_wk2_quiz

Dependencies

This document depends on the following packages:

library(devtools)
library(Biobase)

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

library(broom)

module 2, quiz question #5

Download the data

con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
load(file = con)
close(con)
mp = montpick.eset
pdata = pData(mp)
edata = as.data.frame(exprs(mp))
fdata = fData(mp)
ls()

## [1] "con"           "edata"         "fdata"         "montpick.eset"
## [5] "mp"            "pdata"

Question #5

Perform the log2(data + 1) transform. Then fit a regression model to each sample using population as the outcome. Do this using the lm.fit function (hint: don’t forget the intercept). What is the dimension of the residual matrix, the effects matrix and the coefficients matrix?

A. Residual matrix: 52580 x 129 Effects matrix: 52580 x129 Coefficients matrix: 2 x 52580

B. Residual matrix: 129 x 52580 Effects matrix: 129 x 52580 Coefficients matrix: 2 x 52580

C. Residual matrix: 52580 x 129 Effects matrix: 129 x 52580 Coefficients matrix: 2 x 52580

D. Residual matrix: 52580 x 129 Effects matrix: 52580 x129 Coefficients matrix: 52580 x 2

names(pdata)

## [1] "sample.id"     "num.tech.reps" "population"    "study"

Fit many regression models at once

edata = as.matrix(edata)
edata = log2(edata + 1)

mod = model.matrix(~ pdata$population)
fit = lm.fit(mod, t(edata))
names(fit)

## [1] "coefficients"  "residuals"     "effects"       "rank"         
## [5] "fitted.values" "assign"        "qr"            "df.residual"

Compare to output of lm

fit$coefficients[,1]

##         (Intercept) pdata$populationYRI 
##          0.08333333          0.00608587

tidy(lm(as.numeric(edata[1,]) ~ pdata$population))

##                  term   estimate  std.error  statistic    p.value
## 1         (Intercept) 0.08333333 0.04470046 1.86426110 0.06459378
## 2 pdata$populationYRI 0.00608587 0.06111986 0.09957271 0.92084057

Look at the coefficients across genes

par(mfrow = c(1, 2))
hist(fit$coefficients[1, ], breaks = 10, col = 2, ylim = c(0, 50000), xlab = "Intercept")
hist(fit$coefficients[2, ], breaks = 10, col = 2, ylim = c(0, 50000), xlab = "PopulationYRI")
abline(v = 0, lwd = 2, col = 1)

sessionInfo()

## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] broom_0.4.2         Biobase_2.34.0      BiocGenerics_0.20.0
## [4] devtools_1.13.2    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.11     knitr_1.16       magrittr_1.5     mnormt_1.5-5    
##  [5] lattice_0.20-35  R6_2.2.1         rlang_0.1.1      stringr_1.2.0   
##  [9] plyr_1.8.4       dplyr_0.5.0      tools_3.3.1      grid_3.3.1      
## [13] nlme_3.1-131     psych_1.7.5      DBI_0.6-1        withr_1.0.2     
## [17] htmltools_0.3.6  yaml_2.1.14      rprojroot_1.2    digest_0.6.12   
## [21] assertthat_0.2.0 tibble_1.3.3     tidyr_0.6.3      reshape2_1.4.2  
## [25] memoise_1.1.0    evaluate_0.10    rmarkdown_1.5    stringi_1.1.5   
## [29] backports_1.1.0  foreign_0.8-68

coursera_genomics_stats_wk2_quiz_ques5

MCC

June 26, 2017