Week 4

MONDAY August 14 2023

Today we will cover two new stats: t-tests and ANOVA, and two new graphs: Volcano plots and heatmaps.

A) Practice t-test

1: Clear environment, Load libraries, Import data

#this clears the environment from any holdover data or values we may have stored and no longer need
rm(list = ls());

#No libraries needed; this section is all done in base R
data("mtcars")
df <- mtcars
head(df)

2. Examine features and identify the groups we want to compare

#examine and get a sense of your data
#View(df)

# Subset data for two groups we find interesting (mpg of 4 cylinder and 6 cylinder cars)
cyl4 <- mtcars$mpg[mtcars$cyl == 4] #mpg of 4 cyl cars
cyl6 <- mtcars$mpg[mtcars$cyl == 6] #mpg of 6 cyl cars
#Plot our two groups to see if we have any visual trends
boxplot(cyl4, cyl6,
        names = c("4-cylinder", "6-cylinder"),
        main = "MPG of 4-cylinder vs. 6-cylinder Cars",
        xlab = "Cylinder Type", ylab = "MPG",
        col = c("skyblue", "violetred"))

t-test

# Looks like the two groups might be different. Let's run a t-test 
t.test(cyl4, cyl6)
## 
##  Welch Two Sample t-test
## 
## data:  cyl4 and cyl6
## t = 4.7191, df = 12.956, p-value = 0.0004048
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   3.751376 10.090182
## sample estimates:
## mean of x mean of y 
##  26.66364  19.74286




B) Practice ANOVA

1: Clear environment, Load libraries, Import data

#clear the environment
rm(list = ls());

#load libraries
#Remember to install these packages if you haven't already!
library(tidyverse) #for data manipulation 
library(ggplot2) #for plotting
library(ggpubr) #for summary table
library(multcomp) #for ANOVA post-hoc
#import built-in data using data() function

data(PlantGrowth)
df <- PlantGrowth

2. Check out data

head(df)
summary(df)
##      weight       group   
##  Min.   :3.590   ctrl:10  
##  1st Qu.:4.550   trt1:10  
##  Median :5.155   trt2:10  
##  Mean   :5.073            
##  3rd Qu.:5.530            
##  Max.   :6.310
#using library ggpubr, this is a different way to view a summary of our data

df %>%
  group_by(group) %>%
  get_summary_stats(weight, type = "mean_sd")

3. Quick Box plots of data using two methods

#boxplot in base R 
boxplot(weight ~ group, data = df,
        main = "Plant Growth by Treatment Group",
        xlab = "Treatment Group", ylab = "Weight (units)",
        col = c("lightpink", "lightgreen", "lightblue"),
        border = "black", notch = FALSE)

#boxplot using ggplot 
ggplot(df, aes(x = group, y = weight, fill = group)) +
  geom_boxplot() +
  labs(title = "Plant Growth by Treatment Group",
       x = "Treatment Group",
       y = "Plant Weight (units?)")

4. Stats tests

#ANOVA method 1 using aov() function
res_aov <- aov(weight ~ group, data = df)

summary(res_aov)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## group        2  3.766  1.8832   4.846 0.0159 *
## Residuals   27 10.492  0.3886                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANOVA method 2 using oneway.test() function
oneway.test(weight ~ group, data = df, var.equal = TRUE)
## 
##  One-way analysis of means
## 
## data:  weight and group
## F = 4.8461, num df = 2, denom df = 27, p-value = 0.01591

5. Post-hoc analysis

#post-hoc using multcomp library


# Tukey HSD test:
#Dunnet and other methods can be used with this function as well. Check the library info to learn more!

post_test <- glht(res_aov, linfct = mcp(group = "Tukey")) 

summary(post_test)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = weight ~ group, data = df)
## 
## Linear Hypotheses:
##                  Estimate Std. Error t value Pr(>|t|)  
## trt1 - ctrl == 0  -0.3710     0.2788  -1.331   0.3909  
## trt2 - ctrl == 0   0.4940     0.2788   1.772   0.1980  
## trt2 - trt1 == 0   0.8650     0.2788   3.103   0.0121 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

What do these results tell us? Which groups are significantly different from one another?





C) Practice Volcano Plot

1. Clear environment and load packages

#Clear environment
rm(list = ls());

#load packages used 
library(tidyverse) #manipulate data
library(ggplot2) #make ggplots
library(ggrepel) #add labels to ggplot

2. Upload CSV sheet with read.csv(). Clean up data by trimming excess columns and renaming columns with colnames().

df <- read.csv("BreastCancer_DESeq_Results.csv");

#pull only the necessary columns from the excel file
df <- df[,c(1,3,6)]

#rename columns so they are easier to work with
colnames(df) <- c("Gene","log2","pval")

#Another useful function for cleaning up is round() 

3. Quick view data

head(df) #checking that it looks right

4. Plots! First, create plot parameters that will be used in our ggplot. Then create volcano plot.

#plot elements
gghisto <- list(
  theme(axis.text.x = element_text(face="bold", color="royalblue4", size=12),
          axis.text.y = element_text(face="bold", color="royalblue4", size=12),
          axis.title=element_text(size=14,face="italic"),
          plot.title = element_text(size=14,face="bold")))

Next we create a volcano plot with all of our genes. I use ggplot to make a scatter plot. Using the shape=21 parameter, each plotted point has an outer edge color and an inner fill color. I make the outer edge color dark pink, and the inner fill lighter pink. I also make the inner fill color transparent by adjusting the alpha parameter to 0.3. This way we can get a better sense of the density of the points where they overlap.

#Volcano plot

ggplot(df, aes(x = log2, y = -log10(pval))) +
  geom_point(fill = "violetred1",color= "violetred4", alpha = 0.3,size=3,shape=21) +
  labs(x = "log2 fold change", y = "-log10(p-val)", title = "Volcano Plot for Breast Cancer RNAseq") +
  gghisto

This shows all of our genes in the same color. What if we want some color-coding with thresholds? We can use logical statements to create thresholds that are color coded to better see signficant and non-significant genes.

5. Volcano plot with thresholds

significance_threshold <- 0.05
fold_change_threshold <- 1

# Create volcano plot with thresholding
ggplot(df, aes(x = log2, y = -log10(pval))) +
  geom_point(aes(color = ifelse(pval < significance_threshold & abs(log2) > fold_change_threshold, "Significant", "Not significant")), size = 2, alpha=.1) +
  geom_vline(xintercept = c(-fold_change_threshold, fold_change_threshold), linetype = "dashed", color = "gray50") +
  geom_hline(yintercept = -log10(significance_threshold), linetype = "dashed", color = "gray50") +
  scale_color_manual(values = c("Significant" = "violetred1", "Not significant" = "skyblue3")) +
  labs(title = "Volcano Plot for Breast Cancer RNAseq",
       x = "Log2 Fold Change",
       y = "-log10(P-value)") + guides(color = "none") + gghisto

Plotting with labels

#We will make our volcano plot but with labels. Here, we label the top 20 genes based on their transformed p-values.


#created a new df called top20 with a column with the transformed pvalues
top20 <- df %>% mutate(log10pval = -log10(pval))

# Subset to the top 20 genes with the highest -log10(pval)
top20 <- top20 %>% arrange(desc(log10pval)) %>% head(20)


#we already created these thresholds above but showing again so we know what the ggplot parameters are referencing
significance_threshold <- 0.05
fold_change_threshold <- 1

#p1 is short for plot 1
p1 <- ggplot(df, aes(x = log2, y = -log10(pval))) +
  geom_point(aes(color = ifelse(pval < significance_threshold & abs(log2) > fold_change_threshold, "Significant", "Not significant")), size = 2, alpha=.1) +
  geom_vline(xintercept = c(-fold_change_threshold, fold_change_threshold), linetype = "dashed", color = "gray50") +
  geom_hline(yintercept = -log10(significance_threshold), linetype = "dashed", color = "gray50") +
  scale_color_manual(values = c("Significant" = "violetred1", "Not significant" = "skyblue3")) +
  labs(title = "Volcano Plot for Breast Cancer RNAseq",
       x = "Log2 Fold Change",
       y = "-log10(P-value)") + guides(color = "none") + gghisto

# Add labels for the top 20 genes. Do this by taking our volcano plot named p1 and adding the geom_label_repel() function. Here, we pass the list of genes in our top20 df that we created above. What do you think the warning means? 
p1 + geom_label_repel(data = top20, aes(label = Gene))
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

D) Practice Heatmaps

1: Clear environment, Load libraries, Import data

#clear the environment 
rm(list = ls());

#install libraries you don't have
library("gplots")
library("pheatmap")

Back to the cars data. Load data and use scale() to normalize data for use in heatmap.

#Load data already in R
data(mtcars)

#Use the scale function as a quick shortcut to standardizing the data
df <- scale(mtcars)

2. Make your heatmap with heatmap()

#use the heatmap function to make our plot. The parameter scale is equal to none because we already scaled it. 
heatmap(df, scale = "none")

3. Make your heatmap with heatmap.2()

#Another option: use library gplots (note difference in spelling) for the function heatmap.2()

heatmap.2(df, scale = "none", col = bluered(100), 
          trace = "none", density.info = "none")

2. Make your heatmap with pheatmap()

#Another heatmap using the library pheatmap
pheatmap(df, cutree_rows = 4)