Rebecca Johnston 22/01/13
library("ggplot2") # for graphing
library("hexbin") # for making hexbin objects
## Loading required package: grid
## Loading required package: lattice
library("reshape2") # for reshaping data from wide to tall format (melt)
Save rds file from STAT540 github. Note this is a different data structure compared to sem01. There are now 7 variables including sidChar and sidNum.
kDat <- readRDS("GSE4051_MINI.rds")
Sanity checks to ensure the data loaded successfully:
str(kDat)
## 'data.frame': 39 obs. of 7 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage : Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
## $ crabHammer: num 10.22 10.02 9.64 9.65 8.58 ...
## $ eggBomb : num 7.46 6.89 6.72 6.53 6.47 ...
## $ poisonFang: num 7.37 7.18 7.35 7.04 7.49 ...
head(kDat)
## sidChar sidNum devStage gType crabHammer eggBomb poisonFang
## 12 Sample_20 20 E16 wt 10.220 7.462 7.370
## 13 Sample_21 21 E16 wt 10.020 6.890 7.177
## 14 Sample_22 22 E16 wt 9.642 6.720 7.350
## 15 Sample_23 23 E16 wt 9.652 6.529 7.040
## 9 Sample_16 16 E16 NrlKO 8.583 6.470 7.494
## 10 Sample_17 17 E16 NrlKO 10.140 7.065 7.005
table(kDat$devStage, kDat$gType)
##
## wt NrlKO
## E16 4 3
## P2 4 4
## P6 4 4
## P10 4 4
## 4_weeks 4 4
Scatter plots “geom_point”
(p <- ggplot(kDat, aes(x = crabHammer, y = eggBomb)) + geom_point())
Add smoothing line “stat_smooth”
(p <- p + stat_smooth())
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
Change background and add labels:
(p <- p + theme_bw() + # white background
xlab("Expression of crabHammer") + # x axis label
ylab("Expression of eggBomb") + # y axis label
ggtitle("Scatterplot for expression levels")) # plot title
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
RESHAPE DATA:
Convert into long format so there is a single column of gene expression values. Only keep genes “eggBomb” and “poisonFang”. N.B. CrabHammer kept in separate column as a reference.
nDat <- with(kDat, data.frame(sidChar, sidNum, devStage, gType, crabHammer,
probeset = factor(rep(c("eggBomb", "poisonFang"), each = nrow(kDat))), geneExp = c(eggBomb,
poisonFang)))
str(nDat)
## 'data.frame': 78 obs. of 7 variables:
## $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage : Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
## $ crabHammer: num 10.22 10.02 9.64 9.65 8.58 ...
## $ probeset : Factor w/ 2 levels "eggBomb","poisonFang": 1 1 1 1 1 1 1 1 1 1 ...
## $ geneExp : num 7.46 6.89 6.72 6.53 6.47 ...
ADD COLOUR AND SMOOTHING LINES:
Assign colour by probeset column under aes arg
(p <- ggplot(nDat, aes(crabHammer, geneExp, color = probeset)) + geom_point())
Add a smoothing line: “se” parameter displays standard error around smooth
(p <- ggplot(nDat, aes(crabHammer, geneExp, color = probeset)) + geom_point() +
stat_smooth(se = FALSE))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
Overrule groupings in aes by specifiying new group so only one smoothing line is plotted:
(p <- ggplot(nDat, aes(crabHammer, geneExp, color = probeset)) + geom_point() +
stat_smooth(se = F, aes(group = 1)))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
FACETTING:
(p <- ggplot(nDat, aes(crabHammer, geneExp)) + geom_point() + facet_wrap(~probeset))
Add colour by gType:
(p <- ggplot(nDat, aes(crabHammer, geneExp, color = gType)) + geom_point() +
facet_wrap(~probeset))
Add colour by devStage:
(p <- ggplot(nDat, aes(crabHammer, geneExp, color = devStage)) + geom_point() +
facet_wrap(~probeset))
Also use geom_point for strip plots, but coordinates are mapped to a factor.
RESHAPE DATA:
Long format so all genes in “probeset” this time!
oDat <- with(kDat, data.frame(sidChar, sidNum, devStage, gType, probeset = factor(rep(c("crabHammer",
"eggBomb", "poisonFang"), each = nrow(kDat))), geneExp = c(crabHammer, eggBomb,
poisonFang)))
str(oDat)
## 'data.frame': 117 obs. of 6 variables:
## $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
## $ probeset: Factor w/ 3 levels "crabHammer","eggBomb",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ geneExp : num 10.22 10.02 9.64 9.65 8.58 ...
Plot geneExp by probe using geom_point:
(p <- ggplot(oDat, aes(geneExp, probeset)) + geom_point())
Add jitter:
(p <- ggplot(oDat, aes(geneExp, probeset)) + geom_point(position = position_jitter(height = 0.1)))
Plot geneExp by devStage. Note order of x and y in aes matters!
(p <- ggplot(oDat, aes(devStage, geneExp)) + geom_point())
Facetting by probeset:
(p <- p + facet_wrap(~probeset))
Colour by gType:
(p <- p + aes(color = gType))
Add some stats, mean for each group:
(p <- p + stat_summary(fun.y = mean, geom = "point", shape = 4, size = 4))
Gene expression across entire dataset:
(p <- ggplot(oDat, aes(geneExp)) + geom_density())
Remove border lines:
(p <- ggplot(oDat, aes(geneExp)) + stat_density(geom = "line", position = "identity"))
Add strip plot with jitter:
(p <- ggplot(oDat, aes(geneExp)) + stat_density(geom = "line", position = "identity") +
geom_point(aes(y = 0.05), position = position_jitter(height = 0.005)))
Change bandwidth with adjust arg in stat_density:
(p <- ggplot(oDat, aes(geneExp)) + stat_density(geom = "line", position = "identity",
adjust = 0.5) + geom_point(aes(y = 0.05), position = position_jitter(height = 0.005)))
Define color by gType:
(p <- ggplot(oDat, aes(geneExp, color = gType)) + stat_density(geom = "line",
position = "identity") + geom_point(aes(y = 0.05), position = position_jitter(height = 0.005)))
Define colour by devStage:
(p <- ggplot(oDat, aes(geneExp, color = devStage)) + stat_density(geom = "line",
position = "identity", adjust = 0.75) + geom_point(aes(y = 0.05), position = position_jitter(height = 0.01)))
Add facet wrap by devStage:
(p <- p + facet_wrap(~devStage))
Plot geneExp by devStage using geom_boxplot:
(p <- ggplot(oDat, aes(devStage, geneExp)) + geom_boxplot())
Add facet wrap by gType:
(p <- p + facet_wrap(~gType))
As a violin plot instead:
(p <- ggplot(oDat, aes(devStage, geneExp)) + geom_violin())
Load complete photorec dataset and its design matrix into R after saving files locally from STAT540 github.
Gene expression data:
prDat <- read.table("GSE4051_data.tsv")
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
Design matrix:
prDes <- readRDS("GSE4051_design.rds")
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
Choose 2 samples (columns) at random:
set.seed(2)
(yo <- sample(1:ncol(prDat), size = 2))
## [1] 8 27
Create new dataframe with only these 2 random samples:
bDat <- data.frame(y = prDat[[yo[1]]], z = prDat[[yo[2]]])
str(bDat)
## 'data.frame': 29949 obs. of 2 variables:
## $ y: num 7.11 9.75 9.39 8.37 8.36 ...
## $ z: num 7.15 9.22 10.06 8.35 8.45 ...
Plot samples against each other using geom_point:
(p <- ggplot(bDat, aes(z, y)) + geom_point())
Reduce transparency using alpha arg in geom_point:
(p <- ggplot(bDat, aes(z, y)) + geom_point(alpha = 0.1))
Use stat_density2d function:
(p <- ggplot(bDat, aes(z, y)) + stat_density2d())
Fill with colour. N.B. Some functions (especially stat) return their own calculated values. Use these values by calling ..[value name].., e.g. the use of fill = ..density.. here under aes arg:
(p <- ggplot(bDat, aes(z, y)) + stat_density2d(geom = "tile", contour = FALSE,
aes(fill = ..density..)) + scale_fill_gradient(low = "white", high = "blue"))
Use stat_binhex:
(p <- ggplot(bDat, aes(z, y)) + stat_binhex())
Use plotmatrix for pairwise scatterplots:
set.seed(3)
(yo <- sample(1:ncol(prDat), size = 4))
## [1] 7 31 15 12
pairDat <- subset(prDat, select = yo)
str(pairDat)
## 'data.frame': 29949 obs. of 4 variables:
## $ Sample_6 : num 7.24 9.71 10.17 8.84 8.54 ...
## $ Sample_19: num 7.21 9.21 9.59 8.31 8.31 ...
## $ Sample_8 : num 7.11 8.24 9.13 8.13 8.33 ...
## $ Sample_14: num 7.09 9.56 9.88 8.57 8.59 ...
(p <- plotmatrix(pairDat) + stat_binhex())
## This function is deprecated. For a replacement, see the ggpairs function in the GGally package. (Deprecated; last used in version 0.9.2)
The full photoRec dataset has 39 samples and 29,949 probesets. Choose 2 or 20 or 200 random probesets/genes and look for gene expression differences between the two genotypes, wild type versus knockout. Make use of the graphing techniques discussed this week such as scatter plots, box plot, etc.
Choose 20 random genes to start:
set.seed(50)
(someGene <- sample(nrow(prDat), 20))
## [1] 21226 13108 5990 22971 15367 1339 20958 19353 1259 3220 11693
## [12] 8077 19187 2322 8301 20239 25006 10913 2219 5057
Create new dataframe with only these 20 random genes:
tDat <- prDat[someGene, ]
str(tDat, max.level = 0)
## 'data.frame': 20 obs. of 39 variables:
What structure do we need the data to be in? To use ggplot2, we will need separate columns for: probeID, sampleID, gExp, gType.
What about creating a dataframe manually by writing a function? First “stack” the gene expression values:
stDat <- stack(tDat)
head(stDat) # Lose probeID
## values ind
## 1 7.582 Sample_20
## 2 6.711 Sample_20
## 3 8.169 Sample_20
## 4 6.291 Sample_20
## 5 6.484 Sample_20
## 6 9.889 Sample_20
Write function:
makeDf <- function(a, b, c) {
# a = tDat b = stDat c = prDes
probeID <- as.factor(row.names(a))
gExp <- b$values # I don't like this code :S better way to get gExp values?
gType <- c$gType
sID <- c$sidChar
data.frame(probeID, gExp, gType)
}
# Call function
newDf <- makeDf(tDat, stDat, prDes)
head(newDf)
## probeID gExp gType
## 1 1447589_at 7.582 wt
## 2 1434502_x_at 6.711 wt
## 3 1424102_at 8.169 wt
## 4 1450094_at 6.291 wt
## 5 1437425_at 6.484 NrlKO
## 6 1417242_at 9.889 NrlKO
I don't know if I trust the above result… particularly whether the gene expression values match the correct sample… Is there a simple way I could check this?
I will use the reshape package instead. First, need to make row names a column:
tDat <- data.frame(probeID = row.names(tDat), tDat)
head(tDat, n = 3)
## probeID Sample_20 Sample_21 Sample_22 Sample_23
## 1447589_at 1447589_at 7.582 7.106 7.551 7.313
## 1434502_x_at 1434502_x_at 6.711 6.774 6.412 6.573
## 1424102_at 1424102_at 8.169 8.265 8.565 8.419
## Sample_16 Sample_17 Sample_6 Sample_24 Sample_25 Sample_26
## 1447589_at 8.139 7.374 7.430 7.836 7.894 7.707
## 1434502_x_at 7.456 6.835 6.561 7.081 6.960 7.237
## 1424102_at 8.432 8.022 8.388 8.388 8.425 8.344
## Sample_27 Sample_14 Sample_3 Sample_5 Sample_8 Sample_28
## 1447589_at 7.849 7.943 7.927 8.028 8.281 7.791
## 1434502_x_at 7.131 6.938 7.323 7.311 7.203 6.999
## 1424102_at 8.368 8.719 8.645 8.792 8.164 8.393
## Sample_29 Sample_30 Sample_31 Sample_1 Sample_10 Sample_4
## 1447589_at 7.495 7.704 7.885 7.904 7.956 7.897
## 1434502_x_at 6.771 6.911 7.080 7.151 6.857 7.153
## 1424102_at 8.322 8.548 8.550 8.467 8.340 8.576
## Sample_7 Sample_32 Sample_33 Sample_34 Sample_35 Sample_13
## 1447589_at 8.293 7.410 7.968 8.261 7.657 7.748
## 1434502_x_at 7.321 6.592 7.149 7.358 6.913 6.849
## 1424102_at 8.316 8.506 8.032 8.192 8.673 7.951
## Sample_15 Sample_18 Sample_19 Sample_36 Sample_37 Sample_38
## 1447589_at 7.967 7.464 7.952 7.435 7.647 7.317
## 1434502_x_at 6.895 6.685 6.679 6.649 6.763 7.018
## 1424102_at 7.986 8.071 8.329 8.807 8.371 8.258
## Sample_39 Sample_11 Sample_12 Sample_2 Sample_9
## 1447589_at 7.506 7.672 7.693 7.613 7.576
## 1434502_x_at 6.759 6.671 7.003 6.894 6.576
## 1424102_at 8.184 8.353 8.388 8.033 8.306
Reshape to tall format using melt, where ID variable is “probeID”:
tDat <- melt(tDat, id.vars = "probeID")
head(tDat, n = 3)
## probeID variable value
## 1 1447589_at Sample_20 7.582
## 2 1434502_x_at Sample_20 6.711
## 3 1424102_at Sample_20 8.169
names(tDat) <- c("probeID", "sidChar", "gExp") # Rename columns
Merge prDes to obtain gType using “sidChar” as common column:
tDat <- merge(tDat, prDes, by = "sidChar")
For simplicity, rename probes to smaller names:
levels(tDat$probeID) <- c(LETTERS[1:length(levels(tDat$probeID))])
STRIP PLOT:
Gene expression per gene:
(strPlot <- ggplot(tDat, aes(gExp, probeID, colour = gType)) + geom_point())
This is a bit messy… Let's reorder probeID based on gExp and replot:
tDat <- within(tDat, probeID <- reorder(probeID, gExp))
(strPlot <- ggplot(tDat, aes(gExp, probeID, colour = gType)) + geom_point())
BOX PLOT:
Gene expression across all probes comparing gType:
(bxPlot <- ggplot(tDat, aes(gType, gExp)) + geom_boxplot())
Gene expression per probeID comparing gType:
(bxPlot <- ggplot(tDat, aes(probeID, gExp, fill = interaction(gType))) + geom_boxplot() +
theme())
DENSITY PLOT:
Gene expression across all 20 genes comparing gType:
(densPlot <- ggplot(tDat, aes(gExp, colour = gType)) + stat_density(geom = "line",
position = "identity"))
Gene expression per probeID comparing gType:
(densPlot <- ggplot(tDat, aes(gExp, colour = gType)) + stat_density(geom = "line",
position = "identity") + facet_wrap(~probeID))
Using ggplot visualisation tools alone, it appears that for the 20 probes that have been randomly chosen here, there is no significant difference between expression for wt and KO mice.