Exercises

For the remaining parts of this book we will be downloading larger datasets than those we have been using. Most of these datasets are not available as part of the standard R installation or packages such as UsingR. For some of these datasets, we have created packages and offer them via GitHub. To download these you will need to install the devtools package. Once you do this, you can install packages such as GSE5859Subset, which we will be using here:

# devtools supplies install_github(), used to install a package from GitHub.
library(devtools)
# Install the data package from the genomicsclass GitHub account.
install_github("genomicsclass/GSE5859Subset")
# Attach the package, then load its dataset into the workspace.
library(GSE5859Subset)
data(GSE5859Subset)

This package loads three tables: geneAnnotation, geneExpression, and sampleInfo. Answer the following questions to familiarize yourself with the data set:

  1. How many samples were processed on 2005-06-27?
# Keep only the samples whose processing date is 2005-06-27, then show them.
samples_on_date <- subset(sampleInfo, date == "2005-06-27")
samples_on_date
##     ethnicity       date         filename group
## 122       ASN 2005-06-27 GSM136530.CEL.gz     1
## 113       ASN 2005-06-27 GSM136517.CEL.gz     1
## 118       ASN 2005-06-27 GSM136523.CEL.gz     0
## 117       ASN 2005-06-27 GSM136522.CEL.gz     0
## 119       ASN 2005-06-27 GSM136524.CEL.gz     0
# The first element of dim() is the number of matching samples.
dim(samples_on_date)
## [1] 5 4
  1. Question: How many of the genes represented in this particular technology are on chromosome Y?
# Preview the first rows to see which columns the annotation table provides.
head(geneAnnotation)
##      PROBEID  CHR     CHRLOC SYMBOL
## 1  1007_s_at chr6   30852327   DDR1
## 30   1053_at chr7  -73645832   RFC2
## 31    117_at chr1  161494036  HSPA6
## 32    121_at chr2 -113973574   PAX8
## 33 1255_g_at chr6   42123144 GUCA1A
## 34   1294_at chr3  -49842638   UBA7
# List the distinct chromosome labels; the listing includes "chrY" and also NA.
unique(geneAnnotation$CHR)
##  [1] "chr6"  "chr7"  "chr1"  "chr2"  "chr3"  "chr17" "chr14" "chr10"
##  [9] "chr11" "chr19" "chr13" "chr16" "chr9"  "chr22" "chr15" "chr12"
## [17] "chr4"  "chr18" "chrX"  "chr8"  "chr5"  "chr21" "chr20" "chrY" 
## [25] NA      "chrUn"
# Select the annotation rows on chromosome Y. subset() treats a missing
# condition as FALSE, so rows whose CHR is NA are left out.
chr_y_genes <- subset(geneAnnotation, CHR == "chrY")
chr_y_genes
##          PROBEID  CHR    CHRLOC  SYMBOL
## 1149   201909_at chrY   2709623  RPS4Y1
## 3079 204409_s_at chrY  22737611  EIF1AY
## 3550   205000_at chrY  15016019   DDX3Y
## 4728   206279_at chrY   7142013    PRKY
## 5034   206624_at chrY  14813160   USP9Y
## 5149 206700_s_at chrY -21867301   KDM5D
## 5210   206769_at chrY  15815447  TMSB4Y
## 5353   206922_at chrY -16097652     VCY
## 5786 207247_s_at chrY   2803518     ZFY
## 6133   207647_at chrY -26191940    CDY1
## 6389   207893_at chrY  -2654896     SRY
## 6402 207912_s_at chrY -25275502    DAZ1
## 6417 207918_s_at chrY   9304564   TSPY1
## 6862 208220_x_at chrY  -6733959   AMELY
## 6913 208281_x_at chrY -26909216    DAZ3
## 6923 208282_x_at chrY  25365604    DAZ2
## 6968   208307_at chrY  23673258 RBMY1A1
## 7001   208331_at chrY  25130410    BPY2
## 7004   208332_at chrY -24217903     PRY
## 7015   208339_at chrY -19880860    XKRY
## 8856   211149_at chrY -15434914     UTY
# The first element of dim() is the number of chromosome Y entries.
dim(chr_y_genes)
## [1] 21  4
  1. What is the log expression value for the gene ARPC1A on the one subject that we measured on 2005-06-10?
# geneExpression is indexed by probe ID (rows) and CEL filename (columns), so
# translate the gene symbol and the processing date into those keys first.
arpc1a_probe <- subset(geneAnnotation, SYMBOL == "ARPC1A")$PROBEID
sample_file <- subset(sampleInfo, date == "2005-06-10")$filename
matched_row <- match(arpc1a_probe, rownames(geneExpression))
matched_col <- match(sample_file, colnames(geneExpression))
# The stored values are already on the log scale, so report the entry as is;
# wrapping it in log() would take the logarithm a second time.
geneExpression[matched_row, matched_col]
## [1] 8.233599