Dependencies

This document depends on the following packages:

library(devtools)
library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

module 2, quiz question #2

Download the data

con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
load(file = con)
close(con)
mp = montpick.eset
pdata=pData(mp) # Phenotype data set
edata=as.data.frame(exprs(mp)) # Experiment data set
fdata = fData(mp) # Feature data set
ls()
## [1] "con"           "edata"         "fdata"         "montpick.eset"
## [5] "mp"            "pdata"
dim(edata)
## [1] 52580   129
edata
summary(edata)

Calculate k-means (two clusters) singular vectors & percent variance explained

Q. Perform the log2(data + 1) transform and subtract row means from the samples. Set the seed to 333 and use k-means to cluster the samples into two clusters. Use svd to calculate the singular vectors. What is the correlation between the first singular vector and the sample clustering indicator?

K-means clustering: Now we can perform k-means clustering. By default, the rows are clustered. You can either input the cluster means (often unknown) or the number of clusters.

edata = log2(edata + 1)
edata_centered = edata - rowMeans(edata)

set.seed(333)
kmeans1 = kmeans(edata, centers = 2)
names(kmeans1)
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
matplot(t(kmeans1$centers), col = 1:3, type = "l", lwd = 3)

svd_centered = svd(edata_centered)
names(svd_centered)
## [1] "d" "u" "v"
table(kmeans1$cluster)
## 
##     1     2 
## 47061  5519
47061/(47061+5519)
## [1] 0.8950361
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] Biobase_2.34.0      BiocGenerics_0.20.0 devtools_1.13.2    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.11    digest_0.6.12   withr_1.0.2     rprojroot_1.2  
##  [5] backports_1.1.0 magrittr_1.5    evaluate_0.10   stringi_1.1.5  
##  [9] rmarkdown_1.5   tools_3.3.1     stringr_1.2.0   yaml_2.1.14    
## [13] memoise_1.1.0   htmltools_0.3.6 knitr_1.16

EOF