Data Viz in R - class script ** https://rpubs.com/aparada14/data-viz-feb2026)
Class data and code ** https://goto.stanford.edu/data-viz-R
This will not only organize all of your work under one folder, but it will also set the working directory for this script.
We should only need to do this once per R version, so can run code directly from console. Alternatively, you can comment it out.
#install.packages("readxl") # load packaged with library() command
#install.packages("tidyverse")
#install.packages("patchwork")
We could load each package only when we need it, but I like knowing what packages we need, so that I can install if I don’t have them.
library(tidyverse)
library(patchwork)
library(readxl)
library(ggplot2) #you likely don't need to do this, but can run this line if needed
#We will primarily work with ggplot2 today, but lots of options
#Read in the data
sea_chem<-read.table("data/sim_data.csv",header=T,sep=",",na.strings="NA",row.names=1)
samp_data<-read_xlsx("data/sim_data.xlsx","samples")
sea_comb<-merge(sea_chem,samp_data,by.x="row.names",by.y="sample_id")
#View(sea_comb) #note that the row names are now a column, we could
#change this, but for dataframes it's usually better to keep this valuable
#information in a column
head(sea_comb)
#Create new column to group by based on the values of another
sea_comb$NH4_stat <- ifelse(sea_comb$NH4_nanoM<20,"depleted","replete")
unique(sea_comb$NH4_stat)
# Check the data type and some basic stats
str(sea_comb)
summary(sea_comb) #Note the NA's this could cause some problems
mean(sea_comb$NH4_nanoM) #note answer, it is due to NAs
mean(sea_comb$NH4_nanoM,na.rm = TRUE) #this is an option for many stats functions
#Remove the na's from the dataset
#could filter the table to remove all NAs, but would lose all rows with missing values
sea_filt<-sea_comb[complete.cases(sea_comb),]
#remove the row for a specific variable
sea_NH4<-sea_comb %>% drop_na(NH4_nanoM)
#easy way to specify which column to drop NA's for using tidyR
#Check and count NAs in each column
colSums(is.na(sea_NH4))
colSums(is.na(sea_filt))
#Create a simple bar graph
#ggplot makes figures by adding layers
ggplot(sea_filt, mapping = aes(x = NH4_stat)) +
geom_bar()
#use the inherent groups in the variable to add color
ggplot(sea_filt, mapping = aes(x = NH4_stat,fill=NH4_stat)) +
geom_bar()
#can specify the colors instead of using the default
bar <- ggplot(sea_filt, mapping = aes(x = NH4_stat,fill=NH4_stat)) +
geom_bar()+
scale_fill_manual(values=c("depleted"="red","replete"="blue"))
bar
#many options for customization will be available
#if you either call the help pages or hit tab
?geom_bar()
#Like stacking if have multiple groups
ggplot(sea_filt, mapping = aes(x=location,fill=NH4_stat)) +
geom_bar(position="stack") +
scale_fill_manual(values=c("depleted"="red","replete"="blue"))
Color Brewer 2.0 ** www.colorbrewer2.org
#Improve the layout with factors, default is alphabetical
unique(sea_filt$location) #default of unique is the order
#they appear in the table
# we may want to organize these by distance from shore
sea_filt$location <- factor(sea_filt$location,
levels=c("coastal","offshore","openOcean","gyre"))
unique(sea_filt$location)
#Plot again using the new order and color
bar2 <-
ggplot(sea_filt, mapping = aes(x=location,fill=NH4_stat)) +
geom_bar(position="stack")
bar2 + scale_fill_brewer(palette = "Set1")
#Improve labels and plot background
#Create a vector of the new desired labels
ocean_zones<- c("coastal"="Coastal","offshore"="Off Shore",
"openOcean"="Open Ocean","gyre"="Gyre")
bar2 +
scale_fill_brewer(palette = "Set1",labels=c("Depleted","Replete")) +
labs(
x= "Location",
y= "Sample Count (n)",
title = "Ammonium status across samples",
fill=bquote(NH[4]^"+")
) +
scale_x_discrete(labels=ocean_zones) +
theme_bw()
#theme - the non-data part of figure (grid, legend,ticks, fonts)
#can fully customize or use built-in features
?theme
#Once satisfied with the look, can save to a new object
bar3 <- bar2 +
scale_fill_brewer(palette = "Set1",labels=c("Depleted","Replete")) +
labs(
x= "Location",
y= "Sample Count (n)",
title = "Ammonium status across samples",
fill=bquote(NH[4]^"+")
) +
scale_x_discrete(labels=ocean_zones) +
theme_bw(base_size = 16,base_family = "Arial")
#Can use the "Export" button in the "Plots" tab
#Print to png
#modify dimensions and resolution
png(file="sea_bar_sized.png",
units = "in",width=2,height=4,res=300)
print(bar3)
dev.off()
#Print to eps
postscript(file="sea_bar.eps")
print(bar3)
dev.off()
#EPS good for modifying later in Illustrator
#and so on for jpg, tiff, pdf, etc.
#don't forget to check help pages for more options
?pdf
#Using ggsave
ggsave("sea_bar.png",plot=bar3,
units="in",width = 5, height=8,dpi = 300)
#Create a simple histogram
ggplot(sea_NH4, aes(x = NH4_nanoM)) +
geom_histogram()
#Change the binwidth
ggplot(sea_NH4, aes(x = NH4_nanoM)) +
geom_histogram(binwidth = 10) # each bin covers 10nM
#Create a density plot
ggplot(sea_NH4, aes(x = NH4_nanoM)) +
geom_density()
#Histogram & density
# Histogram with kernel density
hist_NH4 <- ggplot(sea_NH4, aes(x = NH4_nanoM)) +
geom_histogram(aes(y =after_stat(density))) +
#density of points in bin, scaled to integrate to 1.
geom_density()
hist_NH4
#Create a scatterplot
scat_Micro <- ggplot(sea_comb,aes(x=Bact,y=Virus)) +
geom_point()
scat_Micro
# add a trendline - and modify line types
scat_Micro +
geom_smooth(method=lm,colour = "red",linewidth=1,linetype=2) +
theme_light()
?geom_smooth #check more options
#no easy way to add the equation, many "solutions" online, here will show you
#how to calculate, could then add manually in illustrator
lm_VirBac<-lm(Virus~Bact, data = sea_comb)
lm_VirBac
summary(lm_VirBac)
#Can color groups still
scat_Micro <-
ggplot(sea_comb,aes(x=Bact,y=Virus,col=location)) +
geom_point(size=3,alpha=0.5) +
scale_color_brewer(palette = "RdBu")+
geom_smooth(method=lm,colour = "black",linewidth=0.5) +
theme_light()
scat_Micro
#1 Plot the nitrogen data against each other in a scatterplot
ggplot(sea_filt,aes(x=NH4_nanoM,y=NO3_nanoM)) +
geom_point()
#2 Customize the labels and color by group
ggplot(sea_filt,aes(x=NH4_nanoM,y=NO3_nanoM,col=location)) +
geom_point() +
labs( x=bquote(NH[4]^"+"~" (nM)"),
y=bquote(NO[3]^"-"~" (nM)")
) +
scale_color_brewer(palette="Dark2")
#3 Customize the shape of the points by the depth sampled
scatter_nitrogen<-
ggplot(sea_filt,aes(x=NH4_nanoM,y=NO3_nanoM,col=location)) +
geom_point(aes(shape=as.factor(depth))) +
labs( x=bquote(NH[4]^"+"~" (nM)"),
y=bquote(NO[3]^"-"~" (nM)"),
color="Location",
shape="Depth (m)"
) +
scale_color_brewer(palette="Dark2")
scatter_nitrogen
#4 Save the final plot
png(file="nitrogen_species_scatter-plot.png", units = "in",width=8,height=4,res=300)
print(scatter_nitrogen)
dev.off()
#using built-in command par() & layout() cannot be used with ggplot2
attach(sea_comb) # allows you to call variables from the attached data set
# figures arranged in 3 rows and 1 columns and specify margins
#mar=c(b,l,t,r) or mai for inches
par(mfrow=c(3,1),mar=c(2,2,2,0))
hist(NH4_nanoM)
hist(NO3_nanoM)
plot(Bact,Virus)
dev.off()
#must turn off device driver to actually get plots, and move to another type
#Skipping during workshop, will leave for reference
#Use layout to place a single figure across row 1 and two figures in row 2
#layout(matrix(c(1,1,2,3),nrow=2,ncol=2,byrow=T))
#creating a 2 row 2 column plotting space, adding figures across a row first
#put figure 1 in all of row 1, then place the 2, & 3 in the remaining spots
#hist(NH4_nanoM)
#hist(NO3_nanoM)
#plot(Bact,Virus)
#detach(sea_comb) #detach to separate the variables from the objects in
#your environment
#Use patchwork package to plot graphs in one figure, works with ggplot2,
#Can also try grid.arrange from the girdExtra package
combinedPlots<-(bar3 + hist_NH4) / scat_Micro
ggsave("combinedPlotsSized.png",plot=combinedPlots,
units="in",width = 10, height=8,dpi = 300)
ggplot(data = sea_filt, aes(x=location,y=NO3_nanoM,fill=as.factor(depth))) +
geom_boxplot() +
coord_flip()+
scale_fill_manual(values=c("#999999","#E69F00")) +
theme_bw() +
theme(legend.position = c(0.75,0.25),
legend.box.background = element_rect(color = "black", fill = "white", linewidth = 1)) +
geom_jitter()
#Facet wrap - wraps list of plots in rows and columns and chooses best layout
ggplot(data = sea_filt, aes(x=location,y=Bact)) +
geom_boxplot() +
facet_wrap(~as.factor(depth))
#Facet grid - forms a matrix of panels defined by row and column variables (2 discrete variables)
ggplot(data = sea_filt, aes(x=Bact,y=Virus)) +
geom_point() +
facet_grid(as.factor(depth) ~ location) # row_variable ~ column_variable
#Scales are fixed by default, but can be changed
ggplot(data = sea_filt, aes(x=location,y=Bact)) +
geom_boxplot() +
geom_jitter() +
facet_grid(as.factor(depth) ~ .,scales="free_y") #different y scales
head(sea_filt)
sea_long<- sea_filt %>%
pivot_longer(
cols= c(NH4_nanoM,NO3_nanoM),
names_to ="nitrogen_species",
values_to="nitrogen_conc"
)
print(sea_long)
ggplot(data = sea_long, aes(x=nitrogen_species,y=nitrogen_conc)) +
geom_point() +
facet_grid(as.factor(depth) ~ location)
ggplot(data = sea_long, aes(x=location,y=nitrogen_conc)) +
geom_point() +
facet_grid(nitrogen_species~ as.factor(depth), scales="free_y") +
theme_light()
#Reshape data to be able to facet by microbial community
sea_longer<- sea_long %>%
pivot_longer(
cols= c(Bact,Virus),
names_to ="microbe",
values_to="microbial_count"
)
head(sea_longer)
#Create a multi-panel boxplot that separates the microbial community counts
#by depth,location, and microbe
ggplot(data = sea_longer, aes(x=location,y=microbial_count)) +
geom_boxplot() +
geom_jitter() +
facet_grid(microbe~ as.factor(depth), scales="free_y") +
theme_light()
# Load a community abundance matrix
community<-read.csv("data/sim_community.csv",header = T,row.names = 1)
head(community)
# Calculate a distance matrix and nmds
#install.packages("vegan")
library(vegan)
dist_mat <- vegdist(t(community), method = "bray")
nmds <- metaMDS(dist_mat, k = 2)
str(nmds)
# Extract NMDS scores
scores_df <- as.data.frame(scores(nmds, display = "sites"))
scores_df$sample_id <- rownames(scores_df)
# Add metadata to the NMDS scores
nmds_df <- merge(scores_df, as.data.frame(samp_data), by = "sample_id")
head(nmds_df)
# Re-establish the factors for each of the grouping variables
nmds_df$location <- factor(nmds_df$location)
nmds_df$depth <- factor(nmds_df$depth)
# Plot
ggplot(nmds_df, aes(x = NMDS1, y = NMDS2,
color = location,
shape = depth)) +
geom_point(size = 3, alpha = 0.85) +
scale_color_brewer(palette = "Dark2",type="seq")+
stat_ellipse(aes(group = interaction(location, depth),
color = location),
level = 0.95,
linewidth = 0.8,
linetype = 2) +
theme_bw(base_size = 14) +
labs(title = "NMDS (Bray-Curtis)",
subtitle = "Simulated microbial community",
color = "Location",
shape = "Depth (m)") +
theme(panel.grid = element_blank())