Course Materials for today

Data Viz in R - class script ** https://rpubs.com/aparada14/data-viz-feb2026)
Class data and code ** https://goto.stanford.edu/data-viz-R

Create a new project

This will not only organize all of your work under one folder, but it will also set the working directory for this script.

Install packages we will be using today

We should only need to do this once per R version, so can run code directly from console. Alternatively, you can comment it out.

#install.packages("readxl") # load packaged with library() command
#install.packages("tidyverse")
#install.packages("patchwork")

Load packages we will be using today

We could load each package only when we need it, but I like knowing what packages we need, so that I can install if I don’t have them.

library(tidyverse)
library(patchwork)
library(readxl)
library(ggplot2) #you likely don't need to do this, but can run this line if needed
#We will primarily work with ggplot2 today, but lots of options

Read in data and prepare

#Read in the data
sea_chem<-read.table("data/sim_data.csv",header=T,sep=",",na.strings="NA",row.names=1)

samp_data<-read_xlsx("data/sim_data.xlsx","samples")

sea_comb<-merge(sea_chem,samp_data,by.x="row.names",by.y="sample_id")
#View(sea_comb) #note that the row names are now a column, we could 
#change this, but for dataframes it's usually better to keep this valuable
#information in a column
head(sea_comb)


#Create new column to group by based on the values of another
sea_comb$NH4_stat <- ifelse(sea_comb$NH4_nanoM<20,"depleted","replete")
unique(sea_comb$NH4_stat)


# Check the data type and some basic stats
str(sea_comb)
summary(sea_comb) #Note the NA's this could cause some problems
mean(sea_comb$NH4_nanoM) #note answer, it is due to NAs
mean(sea_comb$NH4_nanoM,na.rm = TRUE) #this is an option for many stats functions

#Remove the na's from the dataset
#could filter the table to remove all NAs, but would lose all rows with missing values
sea_filt<-sea_comb[complete.cases(sea_comb),]

#remove the row for a specific variable
sea_NH4<-sea_comb %>% drop_na(NH4_nanoM) 
    #easy way to specify which column to drop NA's for using tidyR

#Check and count NAs in each column
colSums(is.na(sea_NH4))
colSums(is.na(sea_filt))

ggplot2 Overview

ggplot2 is a system for creating graphics.

You provide the data,
tell ggplot2 how to map variables to aesthetics (assigning variables to visual properties),
what building blocks to use (like points,
lines, curves), and
it takes care of the details.

In most cases you start with ggplot():

supply a dataset and aesthetic mapping (with aes()).
You then add on layers (like geom_point() or geom_histogram()),
scales (like scale_colour_brewer()),
faceting specifications (like facet_wrap()), and
coordinate systems (like coord_flip()).

Bar Graphs

#Create a simple bar graph
#ggplot makes figures by adding layers
ggplot(sea_filt, mapping = aes(x = NH4_stat)) +
  geom_bar()

#use the inherent groups in the variable to add color
ggplot(sea_filt, mapping = aes(x = NH4_stat,fill=NH4_stat)) +
  geom_bar()


#can specify the colors instead of using the default
bar <- ggplot(sea_filt, mapping = aes(x = NH4_stat,fill=NH4_stat)) +
  geom_bar()+
  scale_fill_manual(values=c("depleted"="red","replete"="blue"))

bar

#many options for customization will be available 
#if you either call the help pages or hit tab
?geom_bar()

#Like stacking if have multiple groups
ggplot(sea_filt, mapping = aes(x=location,fill=NH4_stat)) +
  geom_bar(position="stack") +
  scale_fill_manual(values=c("depleted"="red","replete"="blue"))

Modifications to improve aesthetics

Many customizations apply across ggplot2, will practice on bar graphs

Color cheatsheets

Color Palette cheatsheet
Color Brewer 2.0 ** www.colorbrewer2.org

#Improve the layout with factors, default is alphabetical
unique(sea_filt$location) #default of unique is the order 
                          #they appear in the table
# we may want to organize these by distance from shore
sea_filt$location <- factor(sea_filt$location,
                    levels=c("coastal","offshore","openOcean","gyre"))
unique(sea_filt$location)


#Plot again using the new order and color
bar2 <-
  ggplot(sea_filt, mapping = aes(x=location,fill=NH4_stat)) +
  geom_bar(position="stack")

bar2 + scale_fill_brewer(palette = "Set1")  

#Improve labels and plot background

#Create a vector of the new desired labels
ocean_zones<- c("coastal"="Coastal","offshore"="Off Shore",
  "openOcean"="Open Ocean","gyre"="Gyre")

bar2 +
    scale_fill_brewer(palette = "Set1",labels=c("Depleted","Replete")) +
    labs(
    x= "Location",
    y= "Sample Count (n)",
    title = "Ammonium status across samples",
    fill=bquote(NH[4]^"+")
  ) +
  scale_x_discrete(labels=ocean_zones) +
  theme_bw()

#theme - the non-data part of figure (grid, legend,ticks, fonts)
  #can fully customize or use built-in features
?theme


#Once satisfied with the look, can save to a new object
bar3 <- bar2 +
    scale_fill_brewer(palette = "Set1",labels=c("Depleted","Replete")) +
    labs(
    x= "Location",
    y= "Sample Count (n)",
    title = "Ammonium status across samples",
    fill=bquote(NH[4]^"+")
  ) +
  scale_x_discrete(labels=ocean_zones) +
  theme_bw(base_size = 16,base_family = "Arial")

Saving figures

#Can use the "Export" button in the "Plots" tab

#Print to png
#modify dimensions and resolution
png(file="sea_bar_sized.png",
    units = "in",width=2,height=4,res=300)
print(bar3)
dev.off()

#Print to eps
postscript(file="sea_bar.eps")
print(bar3)
dev.off()
#EPS good for modifying later in Illustrator

#and so on for jpg, tiff, pdf, etc.
#don't forget to check help pages for more options
?pdf

#Using ggsave
ggsave("sea_bar.png",plot=bar3,
       units="in",width = 5, height=8,dpi = 300)

Apply what was learned to new graph types

Histograms for data distributions

#Create a simple histogram
ggplot(sea_NH4, aes(x = NH4_nanoM)) +
  geom_histogram()

#Change the binwidth
ggplot(sea_NH4, aes(x = NH4_nanoM)) +
  geom_histogram(binwidth = 10) # each bin covers 10nM

#Create a density plot
ggplot(sea_NH4, aes(x = NH4_nanoM)) +
  geom_density()

#Histogram & density
# Histogram with kernel density
hist_NH4 <- ggplot(sea_NH4, aes(x = NH4_nanoM)) + 
  geom_histogram(aes(y =after_stat(density))) + 
  #density of points in bin, scaled to integrate to 1.
  geom_density()
hist_NH4

Scatterplots for identifying relationships

#Create a scatterplot
scat_Micro <- ggplot(sea_comb,aes(x=Bact,y=Virus)) +
            geom_point()
scat_Micro

# add a trendline - and modify line types
scat_Micro +
  geom_smooth(method=lm,colour = "red",linewidth=1,linetype=2) +
  theme_light()

?geom_smooth #check more options
#no easy way to add the equation, many "solutions" online, here will show you
#how to calculate, could then add manually in illustrator

lm_VirBac<-lm(Virus~Bact, data = sea_comb)
lm_VirBac
summary(lm_VirBac)

#Can color groups still
scat_Micro <- 
  ggplot(sea_comb,aes(x=Bact,y=Virus,col=location)) +
  geom_point(size=3,alpha=0.5) +
  scale_color_brewer(palette = "RdBu")+
  geom_smooth(method=lm,colour = "black",linewidth=0.5) +
  theme_light() 

scat_Micro

Exericse 1

#1 Plot the nitrogen data against each other in a scatterplot

ggplot(sea_filt,aes(x=NH4_nanoM,y=NO3_nanoM)) +
  geom_point()

#2 Customize the labels and color by group

ggplot(sea_filt,aes(x=NH4_nanoM,y=NO3_nanoM,col=location)) +
  geom_point() +
  labs( x=bquote(NH[4]^"+"~" (nM)"),
        y=bquote(NO[3]^"-"~" (nM)")
      ) +
  scale_color_brewer(palette="Dark2")

#3 Customize the shape of the points by the depth sampled
scatter_nitrogen<-
  ggplot(sea_filt,aes(x=NH4_nanoM,y=NO3_nanoM,col=location)) +
  geom_point(aes(shape=as.factor(depth))) +
  labs( x=bquote(NH[4]^"+"~" (nM)"),
        y=bquote(NO[3]^"-"~" (nM)"),
        color="Location",
        shape="Depth (m)"
      ) +
  scale_color_brewer(palette="Dark2")

scatter_nitrogen

#4 Save the final plot
png(file="nitrogen_species_scatter-plot.png", units = "in",width=8,height=4,res=300)
print(scatter_nitrogen)
dev.off()

Plotting multiple figures together

#using built-in command par() & layout() cannot be used with ggplot2
attach(sea_comb) # allows you to call variables from the attached data set

# figures arranged in 3 rows and 1 columns and specify margins
#mar=c(b,l,t,r) or mai for inches
par(mfrow=c(3,1),mar=c(2,2,2,0))
hist(NH4_nanoM)
hist(NO3_nanoM)
plot(Bact,Virus)

dev.off() 
#must turn off device driver to actually get plots, and move to another type

#Skipping during workshop, will leave for reference
#Use layout to place a single figure across row 1 and two figures in row 2
#layout(matrix(c(1,1,2,3),nrow=2,ncol=2,byrow=T))
  #creating a 2 row 2 column plotting space, adding figures across a row first
  #put figure 1 in all of row 1, then place the 2, & 3 in the remaining spots
#hist(NH4_nanoM)
#hist(NO3_nanoM)
#plot(Bact,Virus)
#detach(sea_comb) #detach to separate the variables from the objects in 
                 #your environment


#Use patchwork package to plot graphs in one figure, works with ggplot2, 
#Can also try grid.arrange from the girdExtra package

combinedPlots<-(bar3 + hist_NH4) / scat_Micro

ggsave("combinedPlotsSized.png",plot=combinedPlots,
       units="in",width = 10, height=8,dpi = 300)

Boxplots and modifying legends

ggplot(data = sea_filt, aes(x=location,y=NO3_nanoM,fill=as.factor(depth))) +
  geom_boxplot() +
  coord_flip()+
  scale_fill_manual(values=c("#999999","#E69F00")) +
  theme_bw() +
  theme(legend.position = c(0.75,0.25),
        legend.box.background = element_rect(color = "black", fill = "white", linewidth = 1)) + 
  geom_jitter()

Multi-panel plots

#Facet wrap - wraps list of plots in rows and columns and chooses best layout
ggplot(data = sea_filt, aes(x=location,y=Bact)) +
  geom_boxplot() +
  facet_wrap(~as.factor(depth))


#Facet grid - forms a matrix of panels defined by row and column variables (2 discrete variables)
ggplot(data = sea_filt, aes(x=Bact,y=Virus)) +
  geom_point() +
  facet_grid(as.factor(depth) ~ location) # row_variable ~ column_variable

#Scales are fixed by default, but can be changed
ggplot(data = sea_filt, aes(x=location,y=Bact)) +
  geom_boxplot() +
  geom_jitter() +
  facet_grid(as.factor(depth) ~ .,scales="free_y") #different y scales

Reshape data to create different plots

head(sea_filt)
sea_long<- sea_filt %>%
  pivot_longer(
    cols= c(NH4_nanoM,NO3_nanoM),
    names_to ="nitrogen_species",
    values_to="nitrogen_conc"
  )

print(sea_long)

ggplot(data = sea_long, aes(x=nitrogen_species,y=nitrogen_conc)) +
  geom_point() +
  facet_grid(as.factor(depth) ~ location)

ggplot(data = sea_long, aes(x=location,y=nitrogen_conc)) +
  geom_point() +
  facet_grid(nitrogen_species~ as.factor(depth), scales="free_y") +
  theme_light()

Exercise 2

#Reshape data to be able to facet by microbial community
sea_longer<- sea_long %>%
  pivot_longer(
    cols= c(Bact,Virus),
    names_to ="microbe",
    values_to="microbial_count"
  )

head(sea_longer)

#Create a multi-panel boxplot that separates the microbial community counts
#by depth,location, and microbe
ggplot(data = sea_longer, aes(x=location,y=microbial_count)) +
  geom_boxplot() +
  geom_jitter() +
  facet_grid(microbe~ as.factor(depth), scales="free_y") +
  theme_light()

Ordination plot - NMDS

# Load a community abundance matrix
community<-read.csv("data/sim_community.csv",header = T,row.names = 1)
head(community)

# Calculate a distance matrix and nmds
#install.packages("vegan")
library(vegan)

dist_mat <- vegdist(t(community), method = "bray")
nmds <- metaMDS(dist_mat, k = 2)

str(nmds)

# Extract NMDS scores
scores_df <- as.data.frame(scores(nmds, display = "sites"))
scores_df$sample_id <- rownames(scores_df)

# Add metadata to the NMDS scores
nmds_df <- merge(scores_df, as.data.frame(samp_data), by = "sample_id")
head(nmds_df)

# Re-establish the factors for each of the grouping variables
nmds_df$location <- factor(nmds_df$location)
nmds_df$depth <- factor(nmds_df$depth)

# Plot
ggplot(nmds_df, aes(x = NMDS1, y = NMDS2,
                    color = location,
                    shape = depth)) +
  geom_point(size = 3, alpha = 0.85) +
  scale_color_brewer(palette = "Dark2",type="seq")+
  stat_ellipse(aes(group = interaction(location, depth),
                   color = location),
               level = 0.95,
               linewidth = 0.8,
               linetype = 2) +
  theme_bw(base_size = 14) +
  labs(title = "NMDS (Bray-Curtis)",
       subtitle = "Simulated microbial community",
       color = "Location",
       shape = "Depth (m)") +
  theme(panel.grid = element_blank())

Carpentries R Data Viz: 2hr workshop

Alma Parada

2026-02-24