Today we will cover two new stats: t-tests and ANOVA, and two new graphs: Volcano plots and heatmaps.
#this clears the environment from any holdover data or values we may have stored and no longer need
rm(list = ls());
#No libraries needed; this section is all done in base R
data("mtcars")
df <- mtcars
head(df)
#examine and get a sense of your data
#View(df)
# Subset data for two groups we find interesting (mpg of 4 cylinder and 6 cylinder cars)
cyl4 <- mtcars$mpg[mtcars$cyl == 4] #mpg of 4 cyl cars
cyl6 <- mtcars$mpg[mtcars$cyl == 6] #mpg of 6 cyl cars
#Plot our two groups to see if we have any visual trends
boxplot(cyl4, cyl6,
names = c("4-cylinder", "6-cylinder"),
main = "MPG of 4-cylinder vs. 6-cylinder Cars",
xlab = "Cylinder Type", ylab = "MPG",
col = c("skyblue", "violetred"))
# Looks like the two groups might be different. Let's run a t-test
t.test(cyl4, cyl6)
##
## Welch Two Sample t-test
##
## data: cyl4 and cyl6
## t = 4.7191, df = 12.956, p-value = 0.0004048
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.751376 10.090182
## sample estimates:
## mean of x mean of y
## 26.66364 19.74286
#clear the environment
rm(list = ls());
#load libraries
#Remember to install these packages if you haven't already!
library(tidyverse) #for data manipulation
library(ggplot2) #for plotting
library(ggpubr) #for summary table
library(multcomp) #for ANOVA post-hoc
#import built-in data using data() function
data(PlantGrowth)
df <- PlantGrowth
head(df)
summary(df)
## weight group
## Min. :3.590 ctrl:10
## 1st Qu.:4.550 trt1:10
## Median :5.155 trt2:10
## Mean :5.073
## 3rd Qu.:5.530
## Max. :6.310
#using library ggpubr, this is a different way to view a summary of our data
df %>%
group_by(group) %>%
get_summary_stats(weight, type = "mean_sd")
#boxplot in base R
boxplot(weight ~ group, data = df,
main = "Plant Growth by Treatment Group",
xlab = "Treatment Group", ylab = "Weight (units)",
col = c("lightpink", "lightgreen", "lightblue"),
border = "black", notch = FALSE)
#boxplot using ggplot
ggplot(df, aes(x = group, y = weight, fill = group)) +
geom_boxplot() +
labs(title = "Plant Growth by Treatment Group",
x = "Treatment Group",
y = "Plant Weight (units?)")
#ANOVA method 1 using aov() function
res_aov <- aov(weight ~ group, data = df)
summary(res_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 3.766 1.8832 4.846 0.0159 *
## Residuals 27 10.492 0.3886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANOVA method 2 using oneway.test() function
oneway.test(weight ~ group, data = df, var.equal = TRUE)
##
## One-way analysis of means
##
## data: weight and group
## F = 4.8461, num df = 2, denom df = 27, p-value = 0.01591
#post-hoc using multcomp library
# Tukey HSD test:
#Dunnet and other methods can be used with this function as well. Check the library info to learn more!
post_test <- glht(res_aov, linfct = mcp(group = "Tukey"))
summary(post_test)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: aov(formula = weight ~ group, data = df)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## trt1 - ctrl == 0 -0.3710 0.2788 -1.331 0.3909
## trt2 - ctrl == 0 0.4940 0.2788 1.772 0.1980
## trt2 - trt1 == 0 0.8650 0.2788 3.103 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
What do these results tell us? Which groups are significantly different from one another?
#Clear environment
rm(list = ls());
#load packages used
library(tidyverse) #manipulate data
library(ggplot2) #make ggplots
library(ggrepel) #add labels to ggplot
read.csv(). Clean up data by
trimming excess columns and renaming columns with
colnames().df <- read.csv("BreastCancer_DESeq_Results.csv");
#pull only the necessary columns from the excel file
df <- df[,c(1,3,6)]
#rename columns so they are easier to work with
colnames(df) <- c("Gene","log2","pval")
#Another useful function for cleaning up is round()
head(df) #checking that it looks right
#plot elements
gghisto <- list(
theme(axis.text.x = element_text(face="bold", color="royalblue4", size=12),
axis.text.y = element_text(face="bold", color="royalblue4", size=12),
axis.title=element_text(size=14,face="italic"),
plot.title = element_text(size=14,face="bold")))
Next we create a volcano plot with all of our genes. I use ggplot to
make a scatter plot. Using the shape=21 parameter, each
plotted point has an outer edge color and an inner fill color. I make
the outer edge color dark pink, and the inner fill lighter pink. I also
make the inner fill color transparent by adjusting the
alpha parameter to 0.3. This way we can get a better sense
of the density of the points where they overlap.
#Volcano plot
ggplot(df, aes(x = log2, y = -log10(pval))) +
geom_point(fill = "violetred1",color= "violetred4", alpha = 0.3,size=3,shape=21) +
labs(x = "log2 fold change", y = "-log10(p-val)", title = "Volcano Plot for Breast Cancer RNAseq") +
gghisto
This shows all of our genes in the same color. What if we want some
color-coding with thresholds? We can use logical statements to create
thresholds that are color coded to better see signficant and
non-significant genes.
significance_threshold <- 0.05
fold_change_threshold <- 1
# Create volcano plot with thresholding
ggplot(df, aes(x = log2, y = -log10(pval))) +
geom_point(aes(color = ifelse(pval < significance_threshold & abs(log2) > fold_change_threshold, "Significant", "Not significant")), size = 2, alpha=.1) +
geom_vline(xintercept = c(-fold_change_threshold, fold_change_threshold), linetype = "dashed", color = "gray50") +
geom_hline(yintercept = -log10(significance_threshold), linetype = "dashed", color = "gray50") +
scale_color_manual(values = c("Significant" = "violetred1", "Not significant" = "skyblue3")) +
labs(title = "Volcano Plot for Breast Cancer RNAseq",
x = "Log2 Fold Change",
y = "-log10(P-value)") + guides(color = "none") + gghisto
Plotting with labels
#We will make our volcano plot but with labels. Here, we label the top 20 genes based on their transformed p-values.
#created a new df called top20 with a column with the transformed pvalues
top20 <- df %>% mutate(log10pval = -log10(pval))
# Subset to the top 20 genes with the highest -log10(pval)
top20 <- top20 %>% arrange(desc(log10pval)) %>% head(20)
#we already created these thresholds above but showing again so we know what the ggplot parameters are referencing
significance_threshold <- 0.05
fold_change_threshold <- 1
#p1 is short for plot 1
p1 <- ggplot(df, aes(x = log2, y = -log10(pval))) +
geom_point(aes(color = ifelse(pval < significance_threshold & abs(log2) > fold_change_threshold, "Significant", "Not significant")), size = 2, alpha=.1) +
geom_vline(xintercept = c(-fold_change_threshold, fold_change_threshold), linetype = "dashed", color = "gray50") +
geom_hline(yintercept = -log10(significance_threshold), linetype = "dashed", color = "gray50") +
scale_color_manual(values = c("Significant" = "violetred1", "Not significant" = "skyblue3")) +
labs(title = "Volcano Plot for Breast Cancer RNAseq",
x = "Log2 Fold Change",
y = "-log10(P-value)") + guides(color = "none") + gghisto
# Add labels for the top 20 genes. Do this by taking our volcano plot named p1 and adding the geom_label_repel() function. Here, we pass the list of genes in our top20 df that we created above. What do you think the warning means?
p1 + geom_label_repel(data = top20, aes(label = Gene))
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#clear the environment
rm(list = ls());
#install libraries you don't have
library("gplots")
library("pheatmap")
scale() to
normalize data for use in heatmap.#Load data already in R
data(mtcars)
#Use the scale function as a quick shortcut to standardizing the data
df <- scale(mtcars)
heatmap()#use the heatmap function to make our plot. The parameter scale is equal to none because we already scaled it.
heatmap(df, scale = "none")
heatmap.2()#Another option: use library gplots (note difference in spelling) for the function heatmap.2()
heatmap.2(df, scale = "none", col = bluered(100),
trace = "none", density.info = "none")
pheatmap()#Another heatmap using the library pheatmap
pheatmap(df, cutree_rows = 4)