Using R for Analyzing Biological Data

Rahul Banerjee, Sanjeev Dahal, Ben Ernest, Reka Kelemen, Ann Wells, and Qian Zhang

April 8, 2014

R Syntax

# Store data in variables
# "<-" is global, and "=" is local.
# For most purposes, they're completely
# interchangeable.
x <- 2
z = 5 
y <- x + z 
y

[1] 7

print(y)

[1] 7

cat("y = ", y, sep = "")

y = 7

Vectors are just lists of values or text

# Create a vector by hand
x = c(48,49,51,50,49,41,40,38,35,40)

# Print the first element of the vector.
x[1]

[1] 48


# Print values from indicated positions.
x[c(1,3,5)]

[1] 48 51 49


# Add 1 to each value (if two vectors 
# have unequal lengths, R typically
# recycles the smaller one.)
x + 1

 [1] 49 50 52 51 50 42 41 39 36 41

Data types

# Print the class of an object.
class(x)

[1] "numeric"


# Convert x into a matrix.
x = matrix(x, nrow = 5, ncol = 2)
class(x)

[1] "matrix"


# Convert to a data frame
# (each column can be a different type
# (i.e., text, factors, numeric, integers, 
# etc)
x = as.data.frame(x)
class(x)

[1] "data.frame"

Transpose a data frame

x = t(x)
x

   [,1] [,2] [,3] [,4] [,5]
V1   48   49   51   50   49
V2   41   40   38   35   40

x = as.data.frame(x)

Accessing data in a data frame

colnames(x)

[1] "V1" "V2" "V3" "V4" "V5"

colnames(x) = paste("Column", 1:5, sep = "")
colnames(x)

[1] "Column1" "Column2" "Column3" "Column4" "Column5"

x$Column2

[1] 49 40

x[,2]

[1] 49 40

Help

help(as.matrix)

Let's get started!

Read in data

# (1)   Set working directory.
#setwd("C:/Users/Ben/Desktop/teamr")

# (2)   Read in data containing column 
#       headings.
mouse_data = read.table("mouse_data.txt", 
    header = T)

Summarize the data

# (3)   Print number of rows and columns of 
#       mouse_data.
dim(mouse_data)

[1] 21 94

# (4)   Print mouse_data rows 1 through 5, 
#       columns 1 through 5.
mouse_data[1:5,1:5]

  Group   Mouse Strain Fatness   Actb
1  lean 1110158     11  0.3197 0.6438
2  lean 5010378     50  0.3568 1.0473
3  lean 2010026     20  0.4042 1.5298
4  lean 5010379     50  0.4104 1.2554
5  lean 1110180     11  0.4419 0.6798

Calculate mean expression for each gene

# (5)   Calculate the means of mouse_data 
#       columns 5 through the last column.
mouse_means = colMeans(mouse_data[,5:ncol(mouse_data)])

# (6)   Print mouse_means elements 1 through 5.
mouse_means[1:5]

    Actb   Cdkn1a      Lep      Cfd      Lpl 
 1.78129  0.03827  0.77928 26.86592  8.30270

Install Packages

# (7)   Install dependencies that gplots 
#       package needs, and install gplots.
# install.packages(pkgs = 
#   c("gdata", "gtools"))
# install.packages("gplots")

# (8)   Load gplots package.
library(gplots)

Open a file and set plotting parameters

# (9)  Open png file for plotting.
png("group-v-fatness.png")

# (10)  Set plotting parameters
#       (font size, margins, axis labels, 
#       etc).
par(
    font = 2, 
    font.lab = 2, 
    font.axis = 2, 
    cex.main = 2, 
    cex.lab = 1.5, 
    cex.axis = 1.5, 
    mar = c(6,5,4,2)+0.1
)

Plot mean fatness for lean vs. fat mice

# (11)  Draw plot.
plotmeans(Fatness ~ Group, 
          data = mouse_data, 
          connect = F, 
          barwidth = 4, 
          n.label = F,
          main = "Group vs. Fatness", 
          xlab = "Group", 
          ylab = "Fatness"
)

# (12)  Close plotting device.
dev.off()

Plot fatness by strain

# (13)  Open png file. 
png("strain-v-fatness.png", width = 600, 
      height = 480
)

# (14)  Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2, 
    cex.main = 2, cex.lab = 1.5, 
    cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)

# (15)  Draw plot.
plotmeans(Fatness ~ Strain, 
          data = mouse_data, connect = F, 
          barwidth = 4, n.label = F, 
          main = "Strain vs. Fatness", 
          xlab = "Strain", 
          ylab = "Fatness"
)

# (16)  Close plotting device.
dev.off()

Stripchart

# (17)  Open png file.
png("strain-v-fatness_stripchart.png", 
    width = 600, height = 480
)

# (18)  Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2, 
    cex.main = 2, cex.lab = 1.5, 
    cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)

# (19)  Draw plot.
stripchart(Fatness ~ Strain, 
           data = mouse_data, vertical = T, 
           main = "Strain vs. Fatness", 
            xlab = "Strain", 
           ylab = "Fatness", pch = 20
)
dev.off()

Find the 2 most correlated genes

# (20)  Calculate pairwise correlation 
#       among genes in indicated columns.
#       We need to convert our data frame
#       into a matrix first.
gene_corrs = cor(as.matrix(mouse_data[,5:ncol(mouse_data)]))

# (21)  Print gene_corrs rows 1 through 5,
#       columns 1 through 5.
gene_corrs[1:3,1:5]

         Actb Cdkn1a    Lep     Cfd    Lpl
Actb   1.0000 0.7949 0.6130 -0.6168 0.6528
Cdkn1a 0.7949 1.0000 0.8246 -0.5787 0.8808
Lep    0.6130 0.8246 1.0000 -0.3658 0.7923

Find the 2 most correlated genes

# (22)  Begin with gene1 and gene2 = 1, 
#       and corr_max_all = 0.  
gene1 = 1
gene2 = 1
corr_max_all = 0

# (23)  Loop over the correlation matrix 
#       rows.  If the row contains 
#       correlations >= corr_max_all, 
#       replace gene1 and gene2 with 
#       those column numbers and 
#       corr_max_all with that 
#       correlation value.
for(i in 1:nrow(gene_corrs)) {
  row_max = max(abs(gene_corrs[i,gene_corrs[i,]!=1]))
  if (row_max >= abs(corr_max_all)) {
    corr_max_all = row_max
    gene1 = i
    gene2 = which(gene_corrs[i,] == row_max)
  }
}

Find the 2 most correlated genes

# (24)  Use column numbers from previous
#       step to get gene names from 
#       correlation matrix.        
gene1 = colnames(gene_corrs)[gene1]
gene2 = colnames(gene_corrs)[gene2]

cat("gene1: ", gene1, ", gene2: ", gene2, 
    sep = ""
)

gene1: Wnt1, gene2: Dkk1


# (25)  Use gene names as references to
#       access gene expression values
#       from mouse_data, and compute 
#       correlation.
cor_max = cor(mouse_data[,gene1], 
              mouse_data[,gene2]
)

Plot gene 1 vs. gene 2

# (26)  Open png file.
png("scatterplot.png")

# (27)  Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2, 
    cex.main = 2, cex.lab = 1.5, 
    cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)

# (28)  Draw plot
plot(mouse_data[,gene1], 
      mouse_data[,gene2], 
      main = paste(gene1, " vs. ", gene2,
                  sep = ""),
      xlab = gene1, ylab = gene2
)

Plot gene 1 vs. gene 2

# (29)  Fit a linear model predicting gene2
#       based on gene1.
fit = lm(mouse_data[,gene2] ~ 
           mouse_data[,gene1]
)
# summary(fit)

Plot gene 1 vs. gene 2

# (30)  Limit correlation value to 6 digits.
cor_max_form = format(cor_max^2, digits = 6
)

# (31)  Write text on plot--formatted 
#       in italics, as well as bold.
mtext(
  bquote(italic(R)^2 == .(cor_max_form)), 
  font = 4
)

# (32)  Add line fit to plot
abline(fit)

# (32)  Close plotting device.
dev.off()

Assign each strain a different color and point style

#   Create list of lean strains.
lean = c("11", "12", "20", "42", "50")

#   Create factor variable "strain"
strain = as.factor(mouse_data$Strain)

#   Create vector of colors.  It has as 
#   many colors as we have strains.  
strain_colors = rainbow(length(levels(strain)))

#   Create empty vectors that we will fill 
#   with colors and point styles.
color_assignments = vector(length = length(strain))
style_assignments = vector(length = length(strain))

Assign each strain a different color and point style

#   Create an empty vector with length 
#   equal to the number of strains.
fat_levels = vector(length = length(levels(strain)))

Assign each strain a different color and point style

#   Loop from 1 to the number of strains, 
#   filling the color and style vectors.  
#   If the strain number in the lean list, 
#   fill the fat_levels vector at that 
#   position with "lean", otherwise "fat".
for (i in 1:length(levels(strain))) {
  color_assignments[(strain == levels(strain)[i])] = strain_colors[i]
  style_assignments [(strain == levels(strain)[i])] = i
  if (levels(strain)[i] %in% lean) {
    fat_levels[i] = "lean"
  }
  else {
    fat_levels[i] = "fat"
  }
}

Assign each strain a different color and point style

#   Open png file.
png("scatterplot_colors.png")

#   Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2, 
    cex.main = 2, cex.lab = 1.5, 
    cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)

#   Draw plot.
plot(mouse_data[,gene1], 
      mouse_data[,gene2], 
      main = paste(gene1, " vs. ", gene2, 
                  sep = ""),
      xlab = gene1, ylab = gene2, 
      col = color_assignments,
      pch = style_assignments, lwd = 4
)

Assign each strain a different color and point style

#   Write formatted R-squared, as before.
mtext(bquote(italic(R)^2 == .(cor_max_form)), font = 4)

#   Add legend in bottom right.  It
#   contains the strains, and the colors 
#   and point styles corresponding to each
#   one.
legend("bottomright", 
       legend = paste(levels(strain), 
                "--", fat_levels, 
                sep = ""), 
      col = strain_colors, 
      text.col = strain_colors, 
      pch = 1:length(strain_colors), 
      cex = 1, pt.lwd = 4
)

abline(fit)

dev.off()

Get middle quantile of mean gene expression

# (33)  Sort expression means, lowest
#       to highest.
mouse_means_sorted = sort(mouse_means)

# (34)  Choose lower and upper percentiles.
#       Get integer values corresponding 
#       to the indexes of these 
#       percentiles.  Last, return only 
#       elements of mouse_means within
#       those percentiles.
lb = 0.21
ub = 0.80
lb = floor(lb*length(mouse_means_sorted))
ub = ceiling(ub*length(mouse_means_sorted))
mouse_means_mid = mouse_means_sorted[lb:ub]

Plot histogram of mean gene expression

# (35)  Open png file.
png("histogram_means.png")

# (36)  Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2, 
    cex.main = 1.5, cex.lab = 1.5, 
    cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)

# (37)  Draw histogram.
hist(mouse_means_mid, 
     main = "Mean mRNA Abundance",
     xlab = "mRNA Abundance", 
     col = "lightgray", 
     labels = T, ylim = c(0,20)
)

dev.off()

Boxplots

# (38)  Open pdf file.
pdf("box-plots.pdf")

# (39)  Set plotting parameters.  NOTE: 
#       "mfrow" splits each page into 2 
#       rows and 2 columns, so 4 windows.
par(mfrow = c(2,2), font = 2, font.lab = 2, 
    font.axis = 2, cex.main = 1.5, 
    cex.lab = 1.5, cex.axis = 1.5, 
    mar = c(6,5,4,2)+0.1
)

# (40)  Draw boxplots.
boxplot(mouse_data$Fatness, main = "Fatness")

boxplot(mouse_means, main = "Mean mRNA Abundance")

Boxplots

# (41)  Continue plotting into same pdf.
boxplot(mouse_means_mid, 
        main = "Mean mRNA Abundance"
)

boxplot(Fatness ~ Group, data = mouse_data, 
        main = "Fat and Lean Mice",
        ylab = "Fatness"
)

boxplot(Actb ~ Group, data = mouse_data, 
        main = "Fat and Lean Mice",
       ylab = "Actb"
)

boxplot(mouse_data$Actb, main="Actb")

dev.off()

More boxplots

# (42)  Open new png file.
png("boxplot_fatness-v-actb.png")

# (43)  Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2, 
    cex.main = 1.5, cex.lab = 1.5, 
    cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)

# (44)  Draw boxplot.
boxplot(Actb ~ Group, data = mouse_data, 
        main = "Fat and Lean Mice",
        ylab = "Actb"
)

dev.off()

Pairwise scatterplots

# (45)  Open png file.
png("pairwise-corrs.png")

# (46)  Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2, 
    cex.main = 1.5, cex.lab = 1.5, 
    cex.axis = 1.5, mar = c(6,5,4,2)+0.1
)

# (47)  Plot pairwise scatterplots 
#       for genes in designated columns.
pairs(mouse_data[,6:9])

dev.off()

Heatmap

# (48)  Force expression data into matrix.
expr_matrix = as.matrix(mouse_data[,5:ncol(mouse_data)])

# (49)  Transpose rows into columns, and 
#       vice-versa.
expr_matrix = t(expr_matrix)

# (50)  Use paste to concatenate mouse 
#       number and "lean" or "fat"
#       as appropriate.  Replace 
#       column names with these text 
#       strings.
colnames(expr_matrix) = paste(mouse_data$Mouse, "--", mouse_data$Group, sep = "")

Heatmap

# (51)  Open png file.
png("mouse_heatmap.png", 
    width = 980, height = 980
)

# (52)  Set plotting parameters.
par(font = 2, font.lab = 2, font.axis = 2, 
    cex.main = 2
)

# (53)  Draw heatmap.
heatmap.2(
    expr_matrix, 
    col = colorpanel(10, "yellow", 
                     "orange", "red"), 
    scale = "row", trace = "none", 
    density.info = "none", 
    margins = c(14,7),
    main = "Heatmap of Gene Expression",
    cexCol = 2
)
dev.off()

Using R for Analyzing Biological Data

R Syntax

Vectors are just lists of values or text

Data types

Transpose a data frame

Accessing data in a data frame

Help

Let's get started!

Read in data

Summarize the data

Calculate mean expression for each gene

Install Packages

Open a file and set plotting parameters

Plot mean fatness for lean vs. fat mice

Plot fatness by strain

Stripchart

Find the 2 most correlated genes

Find the 2 most correlated genes

Find the 2 most correlated genes

Plot gene 1 vs. gene 2

Plot gene 1 vs. gene 2

Plot gene 1 vs. gene 2

Assign each strain a different color and point style

Assign each strain a different color and point style

Assign each strain a different color and point style

Assign each strain a different color and point style

Assign each strain a different color and point style

Get middle quantile of mean gene expression

Plot histogram of mean gene expression

Boxplots

Boxplots

More boxplots

Pairwise scatterplots

Heatmap

Heatmap

The End!