Data Viz in R - class script ** https://rpubs.com/aparada14/data-viz-may2026)
Class data and code ** https://goto.stanford.edu/data-viz-R
This will not only organize all of your work under one folder, but it will also set the working directory for this script.
We should only need to do this once per R version, so can run code directly from console. Alternatively, you can comment it out.
#install.packages("tidyverse")
#install.packages("patchwork")
We could load each package only when we need it, but I like knowing what packages we need, so that I can install if I don’t have them.
library(tidyverse)
library(patchwork)
library(ggplot2) #you likely don't need to do this, but can run this line if needed
#We will primarily work with ggplot2 today, but lots of options
#Read in the data
sea_comb<-read.table("data/sim_sea_data.csv",header=T,sep=",",na.strings="NA",row.names=1)
#Create new column to group by based on the values of another
sea_comb$NH4_stat <- ifelse(sea_comb$NH4_nanoM<20,"depleted","replete")
unique(sea_comb$NH4_stat)
# Check the data type and some basic stats
str(sea_comb)
summary(sea_comb) #Note the NA's this could cause some problems
mean(sea_comb$NH4_nanoM) #note answer, it is due to NAs
mean(sea_comb$NH4_nanoM,na.rm = TRUE) #this is an option for many stats functions
#Remove the na's from the dataset
#could filter the table to remove all NAs, but would lose all rows with missing values
sea_filt<-sea_comb[complete.cases(sea_comb),]
#remove the row for a specific variable
sea_NH4<-sea_comb %>% drop_na(NH4_nanoM)
#easy way to specify which column to drop NA's for using tidyR
#Check and count NAs in each column
colSums(is.na(sea_NH4))
colSums(is.na(sea_filt))
#Create a simple bar graph
#ggplot makes figures by adding layers
ggplot(sea_filt, mapping = aes(x = NH4_stat)) +
geom_bar()
#use the inherent groups in the variable to add color
ggplot(sea_filt, mapping = aes(x = NH4_stat,fill=NH4_stat)) +
geom_bar()
#can specify the colors instead of using the default
bar <- ggplot(sea_filt, mapping = aes(x = NH4_stat,fill=NH4_stat)) +
geom_bar()+
scale_fill_manual(values=c("depleted"="red","replete"="blue"))
bar
#many options for customization will be available
#if you either call the help pages or hit tab
?geom_bar()
#Like stacking if have multiple groups
ggplot(sea_filt, mapping = aes(x=location,fill=NH4_stat)) +
geom_bar(position="stack") +
scale_fill_manual(values=c("depleted"="red","replete"="blue"))
#note order of x axis categories
Color Brewer 2.0 ** www.colorbrewer2.org
#Improve the layout with factors, default is alphabetical
unique(sea_filt$location) #default of unique is the order
#they appear in the table
# we may want to organize these by distance from shore
sea_filt$location <- factor(sea_filt$location,
levels=c("coastal","offshore","openOcean","gyre"))
unique(sea_filt$location)
#Plot again using the new order and color
bar2 <-
ggplot(sea_filt, mapping = aes(x=location,fill=NH4_stat)) +
geom_bar(position="stack")
bar2
bar2 + scale_fill_brewer(palette = "Set1") #uses built-in color palette
#Improve labels and plot background
#Create a vector of the new desired labels
ocean_zones<- c("coastal"="Coastal","offshore"="Off Shore",
"openOcean"="Open Ocean","gyre"="Gyre")
#Update layers to improve labels,colors, and background
bar2 +
scale_fill_brewer(palette = "Set1",labels=c("Depleted","Replete")) +
labs(
x= "Location",
y= "Sample Count (n)",
title = "Ammonium status across samples",
fill=bquote(NH[4]^"+") #use bquote() for include mathematical notations
) +
scale_x_discrete(labels=ocean_zones) +
theme_bw() #theme - the non-data part of figure (grid, legend,ticks, fonts)
#can fully customize or use built-in features
?theme
?bquote
#Once satisfied with the look, can save to a new object
bar3 <- bar2 +
scale_fill_brewer(palette = "Set1",labels=c("Depleted","Replete")) +
labs(
x= "Location",
y= "Sample Count (n)",
title = "Ammonium status across samples",
fill=bquote(NH[4]^"+")
) +
scale_x_discrete(labels=ocean_zones) +
theme_bw(base_size = 16,base_family = "Arial")
#If we get to it there are other methods to create a multipanel figure,
#not all work with ggplot figures, so can use the patchwork package to do it
bar + bar3
bar / bar3
bar_comparison <- bar / bar3
#Can use the "Export" button in the "Plots" tab
#try with bar_comparison
#Print to png
#modify dimensions and resolution
png(file="sea_bar_sized.png",
units = "in",width=2,height=4,res=300)
print(bar3)
dev.off()
#Print to eps
postscript(file="sea_bar.eps")
print(bar3)
dev.off()
#EPS good for modifying later in Illustrator
#and so on for jpg, tiff, pdf, etc.
#don't forget to check help pages for more options
?pdf
#Using ggsave
ggsave("sea_bar.png",plot=bar3,
units="in",width = 5, height=8,dpi = 300)
#Create a simple histogram
ggplot(sea_NH4, aes(x = NH4_nanoM)) +
geom_histogram()
#Change the binwidth
ggplot(sea_NH4, aes(x = NH4_nanoM)) +
geom_histogram(binwidth = 10) # each bin covers 10nM
#Create a histogram and density plot
hist_NH4 <- ggplot(sea_NH4, aes(x = NH4_nanoM)) +
geom_histogram(aes(y =after_stat(density))) +
#sets the histogram to the density scale
geom_density()
hist_NH4
#Create a scatterplot
scat_Micro <- ggplot(sea_comb,aes(x=Bact,y=Virus)) +
geom_point()
scat_Micro
# add a trendline - and modify line types
scat_Micro +
geom_smooth(method=lm,colour = "red",linewidth=1,linetype=2) +
theme_light()
?geom_smooth #check more options
#no easy way to add the equation, many "solutions" online, here will show you
#how to calculate, could then add manually in illustrator
lm_VirBac<-lm(Virus~Bact, data = sea_comb)
lm_VirBac
summary(lm_VirBac)
#Can color groups still
scat_Micro <-
ggplot(sea_comb,aes(x=Bact,y=Virus,col=location)) +
geom_point(size=3,alpha=0.5) +
scale_color_brewer(palette = "RdBu")+
geom_smooth(method=lm,colour = "black",linewidth=0.5) +
theme_light()
scat_Micro
# Plot the nitrogen data against each other in a scatterplot, customize the labels,
# color by group, and custmize the shape of the points by the depth sampled, then
# save the plot
scatter_nitrogen<-
ggplot(sea_filt,aes(x=NH4_nanoM,y=NO3_nanoM,col=location)) +
geom_point(aes(shape=as.factor(depth))) +
labs( x="Nitrate Concentration (nM)",
y= "Ammonium Concentration (nM)",
color="Location",
shape="Depth (m)"
) +
scale_color_brewer(palette="Dark2")
scatter_nitrogen
#4 Save the final plot
png(file="nitrogen_species_scatter-plot.png", units = "in",width=8,height=4,res=300)
print(scatter_nitrogen)
dev.off()
ggplot(data = sea_filt, aes(x=location,y=NO3_nanoM,fill=as.factor(depth))) +
geom_boxplot() +
coord_flip()+
scale_fill_manual(values=c("#999999","#E69F00")) +
labs(fill="Depth (m)")+
theme_bw() +
theme(legend.position.inside = c(0.75,0.25),
legend.box.background = element_rect(color = "black", fill = "white", linewidth = 1)) +
geom_jitter()
#Facet wrap - wraps list of plots in rows and columns and chooses best layout
# can help when you have data that should be separate or have their own axes
ggplot(data = sea_filt, aes(x=location,y=Bact,fill=as.factor(depth))) +
geom_boxplot() +
#coord_flip()+
scale_fill_manual(values=c("#999999","#E69F00")) +
labs(fill="Depth (m)")+
theme_bw() +
geom_jitter(alpha=0.5)+
facet_wrap(~as.factor(depth))
#Facet grid - forms a matrix of panels defined by row and column variables (2 discrete variables)
ggplot(data = sea_filt, aes(x=Bact,y=Virus)) +
geom_point() +
facet_grid(as.factor(depth) ~ location) # row_variable ~ column_variable
#Scales are fixed by default, but can be changed
ggplot(data = sea_filt, aes(x=location,y=Bact)) +
geom_boxplot() +
geom_jitter() +
facet_grid(as.factor(depth) ~ .,scales="free_y") #different y scales
# Load a community abundance matrix
community<-read.csv("data/sim_community.csv",header = T,row.names = 1)
head(community)
# Calculate a distance matrix and nmds
#install.packages("vegan")
library(vegan)
dist_mat <- vegdist(t(community), method = "bray")
nmds <- metaMDS(dist_mat, k = 2)
str(nmds)
# Extract NMDS scores
scores_df <- as.data.frame(scores(nmds, display = "sites"))
scores_df$sample_id <- rownames(scores_df)
# Add metadata to the NMDS scores
samp_data<- sea_comb[,c("depth","location")]
nmds_df <- merge(scores_df, samp_data, by.x = "sample_id",by.y="row.names")
head(nmds_df)
# Re-establish the factors for each of the grouping variables
nmds_df$location <- factor(nmds_df$location)
nmds_df$depth <- factor(nmds_df$depth)
# Plot
ggplot(nmds_df, aes(x = NMDS1, y = NMDS2,
color = location,
shape = depth)) +
geom_point(size = 3, alpha = 0.85) +
scale_color_brewer(palette = "Dark2",type="seq")+
stat_ellipse(aes(group = interaction(location, depth),
color = location),
level = 0.95,
linewidth = 0.8,
linetype = 2) +
theme_bw(base_size = 14) +
labs(title = "NMDS (Bray-Curtis)",
subtitle = "Simulated microbial community",
color = "Location",
shape = "Depth (m)") +
theme(panel.grid = element_blank())
head(sea_filt)
sea_long<- sea_filt %>%
pivot_longer(
cols= c(NH4_nanoM,NO3_nanoM),
names_to ="nitrogen_species",
values_to="nitrogen_conc"
)
print(sea_long)
ggplot(data = sea_long, aes(x=nitrogen_species,y=nitrogen_conc)) +
geom_point() +
facet_grid(as.factor(depth) ~ location)
ggplot(data = sea_long, aes(x=location,y=nitrogen_conc)) +
geom_point() +
facet_grid(nitrogen_species~ as.factor(depth), scales="free_y") +
theme_light()
#Reshape data to be able to facet by microbial community
sea_longer<- sea_long %>%
pivot_longer(
cols= c(Bact,Virus),
names_to ="microbe",
values_to="microbial_count"
)
head(sea_longer)
#Create a multi-panel boxplot that separates the microbial community counts
#by depth,location, and microbe
ggplot(data = sea_longer, aes(x=location,y=microbial_count)) +
geom_boxplot() +
geom_jitter() +
facet_grid(microbe~ as.factor(depth), scales="free_y") +
theme_light()
#using built-in command par() & layout() cannot be used with ggplot2
attach(sea_comb) # allows you to call variables from the attached data set
# figures arranged in 3 rows and 1 columns and specify margins
#mar=c(b,l,t,r) or mai for inches
par(mfrow=c(3,1),mar=c(2,2,2,0))
hist(NH4_nanoM)
hist(NO3_nanoM)
plot(Bact,Virus)
dev.off()
#must turn off device driver to actually get plots, and move to another type
#Skipping during workshop, will leave for reference
#Use layout to place a single figure across row 1 and two figures in row 2
#layout(matrix(c(1,1,2,3),nrow=2,ncol=2,byrow=T))
#creating a 2 row 2 column plotting space, adding figures across a row first
#put figure 1 in all of row 1, then place the 2, & 3 in the remaining spots
#hist(NH4_nanoM)
#hist(NO3_nanoM)
#plot(Bact,Virus)
#detach(sea_comb) #detach to separate the variables from the objects in
#your environment
#Use patchwork package to plot graphs in one figure, works with ggplot2,
#Can also try grid.arrange from the girdExtra package
combinedPlots<-(bar3 + hist_NH4) / scat_Micro
ggsave("combinedPlotsSized.png",plot=combinedPlots,
units="in",width = 10, height=8,dpi = 300)