Rahul Banerjee, Sanjeev Dahal, Ben Ernest, Reka Kelemen, Ann Wells, and Qian Zhang
April 8, 2014
# Store data in variables
# "<-" is global, and "=" is local.
# For most purposes, they're completely
# interchangeable.
x <- 2
z = 5
y <- x + z
y
[1] 7
print(y)
[1] 7
cat("y = ", y, sep = "")
y = 7
# Create a vector by hand
x = c(48,49,51,50,49,41,40,38,35,40)
# Print the first element of the vector.
x[1]
[1] 48
# Print values from indicated positions.
x[c(1,3,5)]
[1] 48 51 49
# Add 1 to each value (if two vectors
# have unequal lengths, R typically
# recycles the smaller one.)
x + 1
[1] 49 50 52 51 50 42 41 39 36 41
# Print the class of an object.
class(x)
[1] "numeric"
# Convert x into a matrix.
x = matrix(x, nrow = 5, ncol = 2)
class(x)
[1] "matrix"
# Convert to a data frame
# (each column can be a different type
# (i.e., text, factors, numeric, integers,
# etc)
x = as.data.frame(x)
class(x)
[1] "data.frame"
x
V1 V2
1 48 41
2 49 40
3 51 38
4 50 35
5 49 40
x = t(x)
x
[,1] [,2] [,3] [,4] [,5]
V1 48 49 51 50 49
V2 41 40 38 35 40
x = as.data.frame(x)
colnames(x)
[1] "V1" "V2" "V3" "V4" "V5"
colnames(x) = paste("Column", 1:5, sep = "")
colnames(x)
[1] "Column1" "Column2" "Column3" "Column4" "Column5"
x$Column2
[1] 49 40
x[,2]
[1] 49 40
help(as.matrix)
# (1) Set working directory.
#setwd("C:/Users/Ben/Desktop/teamr")
# (2) Read in data containing column
# headings.
mouse_data = read.table("mouse_data.txt",
header = T)
# (3) Print number of rows and columns of
# mouse_data.
dim(mouse_data)
[1] 21 94
# (4) Print mouse_data rows 1 through 5,
# columns 1 through 5.
mouse_data[1:5,1:5]
Group Mouse Strain Fatness Actb
1 lean 1110158 11 0.3197 0.6438
2 lean 5010378 50 0.3568 1.0473
3 lean 2010026 20 0.4042 1.5298
4 lean 5010379 50 0.4104 1.2554
5 lean 1110180 11 0.4419 0.6798
# (5) Calculate the means of mouse_data
# columns 5 through the last column.
mouse_means = colMeans(mouse_data[,5:ncol(mouse_data)])
# (6) Print mouse_means elements 1 through 5.
mouse_means[1:5]
Actb Cdkn1a Lep Cfd Lpl
1.78129 0.03827 0.77928 26.86592 8.30270
# (7) Install dependencies that gplots
# package needs, and install gplots.
# install.packages(pkgs =
# c("gdata", "gtools"))
# install.packages("gplots")
# (8) Load gplots package.
library(gplots)
# (9) Open png file for plotting.
png("group-v-fatness.png")
# (10) Set plotting parameters
# (font size, margins, axis labels,
# etc).
par(
font = 2,
font.lab = 2,
font.axis = 2,
cex.main = 2,
cex.lab = 1.5,
cex.axis = 1.5,
mar = c(6,5,4,2)+0.1
)
# (11) Draw plot.
plotmeans(Fatness ~ Group,
data = mouse_data,
connect = F,
barwidth = 4,
n.label = F,
main = "Group vs. Fatness",
xlab = "Group",
ylab = "Fatness"
)
# (12) Close plotting device.
dev.off()
# (13) Open png file.
png("strain-v-fatness.png", width = 600,
height = 480
)
# (14) Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2,
cex.main = 2, cex.lab = 1.5,
cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)
# (15) Draw plot.
plotmeans(Fatness ~ Strain,
data = mouse_data, connect = F,
barwidth = 4, n.label = F,
main = "Strain vs. Fatness",
xlab = "Strain",
ylab = "Fatness"
)
# (16) Close plotting device.
dev.off()
# (17) Open png file.
png("strain-v-fatness_stripchart.png",
width = 600, height = 480
)
# (18) Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2,
cex.main = 2, cex.lab = 1.5,
cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)
# (19) Draw plot.
stripchart(Fatness ~ Strain,
data = mouse_data, vertical = T,
main = "Strain vs. Fatness",
xlab = "Strain",
ylab = "Fatness", pch = 20
)
dev.off()
# (20) Calculate pairwise correlation
# among genes in indicated columns.
# We need to convert our data frame
# into a matrix first.
gene_corrs = cor(as.matrix(mouse_data[,5:ncol(mouse_data)]))
# (21) Print gene_corrs rows 1 through 5,
# columns 1 through 5.
gene_corrs[1:3,1:5]
Actb Cdkn1a Lep Cfd Lpl
Actb 1.0000 0.7949 0.6130 -0.6168 0.6528
Cdkn1a 0.7949 1.0000 0.8246 -0.5787 0.8808
Lep 0.6130 0.8246 1.0000 -0.3658 0.7923
# (22) Begin with gene1 and gene2 = 1,
# and corr_max_all = 0.
gene1 = 1
gene2 = 1
corr_max_all = 0
# (23) Loop over the correlation matrix
# rows. If the row contains
# correlations >= corr_max_all,
# replace gene1 and gene2 with
# those column numbers and
# corr_max_all with that
# correlation value.
for(i in 1:nrow(gene_corrs)) {
row_max = max(abs(gene_corrs[i,gene_corrs[i,]!=1]))
if (row_max >= abs(corr_max_all)) {
corr_max_all = row_max
gene1 = i
gene2 = which(gene_corrs[i,] == row_max)
}
}
# (24) Use column numbers from previous
# step to get gene names from
# correlation matrix.
gene1 = colnames(gene_corrs)[gene1]
gene2 = colnames(gene_corrs)[gene2]
cat("gene1: ", gene1, ", gene2: ", gene2,
sep = ""
)
gene1: Wnt1, gene2: Dkk1
# (25) Use gene names as references to
# access gene expression values
# from mouse_data, and compute
# correlation.
cor_max = cor(mouse_data[,gene1],
mouse_data[,gene2]
)
# (26) Open png file.
png("scatterplot.png")
# (27) Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2,
cex.main = 2, cex.lab = 1.5,
cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)
# (28) Draw plot
plot(mouse_data[,gene1],
mouse_data[,gene2],
main = paste(gene1, " vs. ", gene2,
sep = ""),
xlab = gene1, ylab = gene2
)
# (29) Fit a linear model predicting gene2
# based on gene1.
fit = lm(mouse_data[,gene2] ~
mouse_data[,gene1]
)
# summary(fit)
# (30) Limit correlation value to 6 digits.
cor_max_form = format(cor_max^2, digits = 6
)
# (31) Write text on plot--formatted
# in italics, as well as bold.
mtext(
bquote(italic(R)^2 == .(cor_max_form)),
font = 4
)
# (32) Add line fit to plot
abline(fit)
# (32) Close plotting device.
dev.off()
# Create list of lean strains.
lean = c("11", "12", "20", "42", "50")
# Create factor variable "strain"
strain = as.factor(mouse_data$Strain)
# Create vector of colors. It has as
# many colors as we have strains.
strain_colors = rainbow(length(levels(strain)))
# Create empty vectors that we will fill
# with colors and point styles.
color_assignments = vector(length = length(strain))
style_assignments = vector(length = length(strain))
# Create an empty vector with length
# equal to the number of strains.
fat_levels = vector(length = length(levels(strain)))
# Loop from 1 to the number of strains,
# filling the color and style vectors.
# If the strain number in the lean list,
# fill the fat_levels vector at that
# position with "lean", otherwise "fat".
for (i in 1:length(levels(strain))) {
color_assignments[(strain == levels(strain)[i])] = strain_colors[i]
style_assignments [(strain == levels(strain)[i])] = i
if (levels(strain)[i] %in% lean) {
fat_levels[i] = "lean"
}
else {
fat_levels[i] = "fat"
}
}
# Open png file.
png("scatterplot_colors.png")
# Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2,
cex.main = 2, cex.lab = 1.5,
cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)
# Draw plot.
plot(mouse_data[,gene1],
mouse_data[,gene2],
main = paste(gene1, " vs. ", gene2,
sep = ""),
xlab = gene1, ylab = gene2,
col = color_assignments,
pch = style_assignments, lwd = 4
)
# Write formatted R-squared, as before.
mtext(bquote(italic(R)^2 == .(cor_max_form)), font = 4)
# Add legend in bottom right. It
# contains the strains, and the colors
# and point styles corresponding to each
# one.
legend("bottomright",
legend = paste(levels(strain),
"--", fat_levels,
sep = ""),
col = strain_colors,
text.col = strain_colors,
pch = 1:length(strain_colors),
cex = 1, pt.lwd = 4
)
abline(fit)
dev.off()
# (33) Sort expression means, lowest
# to highest.
mouse_means_sorted = sort(mouse_means)
# (34) Choose lower and upper percentiles.
# Get integer values corresponding
# to the indexes of these
# percentiles. Last, return only
# elements of mouse_means within
# those percentiles.
lb = 0.21
ub = 0.80
lb = floor(lb*length(mouse_means_sorted))
ub = ceiling(ub*length(mouse_means_sorted))
mouse_means_mid = mouse_means_sorted[lb:ub]
# (35) Open png file.
png("histogram_means.png")
# (36) Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2,
cex.main = 1.5, cex.lab = 1.5,
cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)
# (37) Draw histogram.
hist(mouse_means_mid,
main = "Mean mRNA Abundance",
xlab = "mRNA Abundance",
col = "lightgray",
labels = T, ylim = c(0,20)
)
dev.off()
# (38) Open pdf file.
pdf("box-plots.pdf")
# (39) Set plotting parameters. NOTE:
# "mfrow" splits each page into 2
# rows and 2 columns, so 4 windows.
par(mfrow = c(2,2), font = 2, font.lab = 2,
font.axis = 2, cex.main = 1.5,
cex.lab = 1.5, cex.axis = 1.5,
mar = c(6,5,4,2)+0.1
)
# (40) Draw boxplots.
boxplot(mouse_data$Fatness, main = "Fatness")
boxplot(mouse_means, main = "Mean mRNA Abundance")
# (41) Continue plotting into same pdf.
boxplot(mouse_means_mid,
main = "Mean mRNA Abundance"
)
boxplot(Fatness ~ Group, data = mouse_data,
main = "Fat and Lean Mice",
ylab = "Fatness"
)
boxplot(Actb ~ Group, data = mouse_data,
main = "Fat and Lean Mice",
ylab = "Actb"
)
boxplot(mouse_data$Actb, main="Actb")
dev.off()
# (42) Open new png file.
png("boxplot_fatness-v-actb.png")
# (43) Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2,
cex.main = 1.5, cex.lab = 1.5,
cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)
# (44) Draw boxplot.
boxplot(Actb ~ Group, data = mouse_data,
main = "Fat and Lean Mice",
ylab = "Actb"
)
dev.off()
# (45) Open png file.
png("pairwise-corrs.png")
# (46) Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2,
cex.main = 1.5, cex.lab = 1.5,
cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)
# (47) Plot pairwise scatterplots
# for genes in designated columns.
pairs(mouse_data[,6:9])
dev.off()
# (48) Force expression data into matrix.
expr_matrix = as.matrix(mouse_data[,5:ncol(mouse_data)])
# (49) Transpose rows into columns, and
# vice-versa.
expr_matrix = t(expr_matrix)
# (50) Use paste to concatenate mouse
# number and "lean" or "fat"
# as appropriate. Replace
# column names with these text
# strings.
colnames(expr_matrix) = paste(mouse_data$Mouse, "--", mouse_data$Group, sep = "")
# (51) Open png file.
png("mouse_heatmap.png",
width = 980, height = 980
)
# (52) Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2,
cex.main = 2
)
# (53) Draw heatmap.
heatmap.2(
expr_matrix,
col = colorpanel(10, "yellow",
"orange", "red"),
scale = "row", trace = "none",
density.info = "none",
margins = c(14,7),
main = "Heatmap of Gene Expression",
cexCol = 2
)
dev.off()