Thank you so much to Ilan Rubin for his development of the functions for this lab
DO NOT EDIT LINES 10 TO 174, JUST RUN THEM
### Observed Richness ###
# A simple count of the number of species found.
# Usually denoted by R or S
richness = function(x){
return(sum(x>0))
}
### Abundance ###
# Simply the number of sequences in the sample
# Because of sampling and sequencing biases this is less useful than one would hope.
abundance = function(x){
return(sum(x,na.rm=TRUE))
}
### Shannon's Diversity Index (entropy) ###
# Usually denoted by H'
# Does not have a specific mathematical explanation, but increases as diversity increases.
# Assumes infinite, well-mixed population.
# Also a measure of both richness and evenness.
shannons = function(x){
present = x[x>0]
p = present/sum(present)
-sum(p*log(p))
}
### Simpson's Diversity Index ###
# Usually denoted by lambda (or l)
# Equals the probability any two randomly chosen individuals in the population are the same.
# Therefore, actually a similarity index. To get diversity, D = 1/l
# Measure of both richness and evenness.
simpsons = function(x){
p = x/sum(x,na.rm=TRUE)
sum(p^2)
}
### Pielou's evennenss function ###
# Often denoted J
# Measures community evenness such that 1 is completely even and 0 is uneven.
# Completely even means that all taxa have exactly the same abundance.
# An uneven community is one dominated by one or a few large taxa and many rare taxa.
# Often used in conjunction with shannons or simpsons.
evenness = function(x){
present = x[x>0]
p = present/sum(present)
-sum(p*log(p))/log(sum(x>0))
}
### Chao1 ###
# Chao1 is a non-parametric estimator of richness. That means it makes no assumptions (infinite population,
# well mixed, specific model to fit and find an asymptote). It is observed richness, corrected
# with the number of singletons (species only seen once) and doubletons (species seen twice).
# There is actually some fancy statistics behind using singletons and doubletons as correctors.
# This may just return observed richness if your dataset was already cleaned to remove rare ASVs
chao1 = function(x){
f1 = sum(x == 1)
f2 = sum(x == 2)
S = sum(x > 0)
return(S + f1*(f1-1)/(2*(f2+1)))
}
### Collectors curve ###
# This function take an abundance (OTU) table and calculators a random collectors curve
# for each sample. It then calculates the mean and 95% confidence of the number of taxa
# observed for every individual sampled. You can also enter a variable column to group.
# This will calculate a separate collector's curve for each group of that variable.
# abundance_table: your otu table
# metadata_cols: a vector of the columns that contain metadata and not abundance data
# leave alone if your your table has no metadata in it
# sample_ID_col: the column name for a column with sample IDs. If you leave it blank
# the function will just use row numbers instead
# group_by: a column name for a metadata column to calculate separate curves for each
# group of the variable. If left blank you will get one collector's curve for
# for the entire data set
# verbose: if TRUE it will keep you updated on progess. Set to FALSE to quiet.
collectors.curve = function(abundance_table,
metadata_cols = F,
sample_ID_col = "",
group_by = "",
verbose=FALSE){
# abundance per sample
N = rowSums(abundance_table[,-metadata_cols])
# total richness of all samples
total_taxa = ncol(abundance_table[,-metadata_cols])
# array to hold collector's curve counts
count = matrix(NA,max(N)+1,nrow(abundance_table))
count[1,] = 0
color_group = rep("",nrow(abundance_table))
sample_ID = rep("",nrow(abundance_table))
# loop over every sample in the data set
for (r in 1:nrow(abundance_table)){
if (verbose) print(paste("Calculating collector's curve ",r,"/",nrow(abundance_table),sep=""))
abundance = as.numeric(abundance_table[r,-metadata_cols])
# create vector of each individual and which taxa they are
abundance_long = rep(0,total_taxa)
c = 1
for (t in 1:total_taxa){
if (abundance[t]>0){
abundance_long[c:(c+abundance[t])] = t
c = c+abundance[t] + 1
}
}
#
found = rep(0,total_taxa)
order_sampled = sample(1:N[r])
for (i in 2:(N[r]+1)){
if (found[abundance_long[order_sampled[i-1]]] == 0){
found[abundance_long[order_sampled[i-1]]] = 1
count[i,r] = count[i-1,r]+1
}else{
count[i,r] = count[i-1,r]
}
}
# record the sample ID and color group
if (sample_ID_col==""){
sample_ID[r] = r
}else{
sample_ID[r] = abundance_table[[sample_ID_col]][r]
}
if (group_by!="") color_group[r] = abundance_table[[group_by]][r]
}
# remove the 0th individual sampled
count = count[-1,]
# count = data.frame(count)
# calculate the mean and 95 confidence interval of the mean for each color group
groups = unique(color_group)
observed_taxa_mean = data.frame(matrix(0,max(N),length(groups)))
colnames(observed_taxa_mean) = groups
observed_taxa_95 = data.frame(matrix(0,max(N),length(groups)))
colnames(observed_taxa_95) = groups
observed_taxa_N = data.frame(matrix(0,max(N),length(groups)))
colnames(observed_taxa_N) = groups
for (g in 1:length(groups)){
if (verbose) print(paste("Calculating mean and 95% confidence interval for",groups[g]))
observed_taxa_mean[,g] = apply(count[,color_group==groups[g]],1,mean,na.rm=TRUE)
observed_taxa_95[,g] = 1.96*apply(count[,color_group==groups[g]],1,sd,na.rm=TRUE)/rowSums(!is.na(count[,color_group==groups[g]]))
observed_taxa_N[,g] = rowSums(!is.na(count[,color_group==groups[g]]))
}
# create output data frame
observed_taxa_mean$individuals_sampled = 1:max(N)
observed_taxa_mean = pivot_longer(observed_taxa_mean,cols=1:length(groups),names_to=group_by,values_to="observed_taxa")
observed_taxa_95 = pivot_longer(observed_taxa_95,cols=1:length(groups),names_to=group_by,values_to="confidence_interval_95")
observed_taxa_N = pivot_longer(observed_taxa_N,cols=1:length(groups),names_to=group_by,values_to="N_samples")
confidence_interval_95 = observed_taxa_95$confidence_interval_95
N_samples = observed_taxa_N$N_samples
output = cbind(observed_taxa_mean,confidence_interval_95,N_samples)
return(output)
}
Set up your workplace
# Load packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(RColorBrewer)
library(viridis)
## Loading required package: viridisLite
# set working directory
setwd("C:/Users/siobh/OneDrive - The University Of British Columbia/TA/Biol 403 - Parfrey 2020/403 lab scripts")
# Load otu table
seagrass_otu = read.table("Seagrass_2015_otu_metadata_clean.txt",
header=TRUE,
sep="\t")
# Load full metadata table
seagrass_metadata = read.table("Seagrass_2015_all_metadata.txt",
header=TRUE,
sep="\t")
# Calculate collectors curve for all samples in the data set
seagrass_collectors_region = collectors.curve(seagrass_otu,
metadata_cols=1:3,
sample_ID_col="SampleID",
group_by="region")
# Plot a rarefaction curve for each region
p1 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=observed_taxa)) +
geom_ribbon(aes(ymin=observed_taxa-confidence_interval_95,
ymax=observed_taxa+confidence_interval_95,
fill=region),alpha=0.2) +
geom_line(aes(color=region),alpha=0.8,size=1.5) +
theme_classic()+
ylab("Observed taxa") +
xlab("Individuals sampled") +
scale_x_log10() +
scale_y_log10() +
scale_color_viridis(discrete = TRUE, option = "D")+
scale_fill_viridis(discrete = TRUE)
p1
## Warning: Removed 62577 row(s) containing missing values (geom_path).
# Plot the number of samples with a given sampling depth
p2 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=N_samples,color=region)) +
geom_line(size=1.5,alpha=0.8) +
ylab("Number of samples") +
xlab("Individuals sampled") +
theme_classic()+
scale_color_viridis(discrete = TRUE, option = "D")+
scale_fill_viridis(discrete = TRUE)
p2
# save rarefaction and sampling depth curve
p3=grid.arrange(p1,p2, nrow=1)
## Warning: Removed 62577 row(s) containing missing values (geom_path).
ggsave("seagrass_rarefaction_region.pdf",p3,width=7,height=7)
## plot p1 (rarefaction curve) but on a log-log scale
p4 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=N_samples,color=region)) +
geom_line(size=1.5,alpha=0.8) +
ylab("Number of samples") +
xlab("Individuals sampled") +
theme_classic()+
scale_x_log10() +
scale_y_log10()+
scale_color_viridis(discrete = TRUE, option = "D")+
scale_fill_viridis(discrete = TRUE)
p4
## Warning: Transformation introduced infinite values in continuous y-axis
p5=grid.arrange(p2,p4, nrow=1)
## Warning: Transformation introduced infinite values in continuous y-axis
ggsave("seagrass_p2_p4.pdf",p5,width=7,height=7)
# Use the rrarefy function from vegan to rarefy samples down to desired sampling depth
rarefied_table = rrarefy(seagrass_otu[,-(1:3)],sample=1000)
# put metadata back on the rarefied table
rarefied_table = cbind(seagrass_otu[,1:3],rarefied_table)
# remove any samples with depth less than rarefaction depth
# Be careful with this. Doing with this will ensure the samples you
# keep are all of a certain depth, but you will lose samples.
rarefied_table = rarefied_table[apply(seagrass_otu[,-(1:3)],1,abundance) >= 1000,]
# combine abundance table and metadata into one data frame for plotting
# If you chose to rarefy your data, just replace the following seagrass_otu
# with rarefied_table
seagrass = full_join(seagrass_otu,seagrass_metadata,
by=c())
## Joining, by = c("SampleID", "region", "sample_growth")
Make at least 5 changes to the boxplot in this section. If you color your boxplot and your color pallet is not color blind friendly, you will get 0 on this question. Upload this modified graph to the lab 2 checkpoint question discussion.
Add abundance, simpsons, evenness, and chao1 to the summarize function section. Save this new data frame as a CSV and upload it along with your script for the lab assignment submission
# Calculate alpha diversity and save it to the abundance table
seagrass$richness = apply(seagrass_otu[,-(1:3)],1,richness)
seagrass$abundance = apply(seagrass_otu[,-(1:3)],1,abundance)
seagrass$shannons = apply(seagrass_otu[,-(1:3)],1,shannons)
seagrass$simpsons = apply(seagrass_otu[,-(1:3)],1,simpsons)
seagrass$evenness = apply(seagrass_otu[,-(1:3)],1,evenness)
seagrass$chao1 = apply(seagrass_otu[,-(1:3)],1,chao1)
# Look at the names of metadata available to plot
names(seagrass_metadata)
## [1] "SampleID"
## [2] "BarcodeSequence"
## [3] "LinkerPrimerSequence"
## [4] "swab_id"
## [5] "barcode_plate"
## [6] "barcode_well"
## [7] "project_name"
## [8] "person_responsible"
## [9] "date"
## [10] "Run_id"
## [11] "sequence_file"
## [12] "region"
## [13] "site"
## [14] "latitude"
## [15] "longitude"
## [16] "sample_method"
## [17] "host_species"
## [18] "host_type"
## [19] "survey_type"
## [20] "dna_extraction_plate"
## [21] "dna_extraction_well"
## [22] "growth"
## [23] "sample_growth"
## [24] "quadrat_id"
## [25] "meso_quadrat_density_shoots_m2"
## [26] "meso_quadrat_biomass_g_m2"
## [27] "meso_quadrats_epimass"
## [28] "meso_shoot_id"
## [29] "meso_shoot_length"
## [30] "meso_shoot_width"
## [31] "meso_number_of_blades"
## [32] "meso_total_dry_wt"
## [33] "meso_blade_dry_wt"
## [34] "meso_microepiphyte_wt"
## [35] "meso_macroepiphyte_wt"
## [36] "meso_smithora_wt"
## [37] "meso_bryozoans"
## [38] "broken"
## [39] "smithora_shoot"
## [40] "smithora_region"
## [41] "ocean_id"
## [42] "mean_depth"
## [43] "ctd_depth"
## [44] "conductivity"
## [45] "temperature"
## [46] "pressure"
## [47] "par"
## [48] "fluorometry_chlorophyll"
## [49] "turbidity"
## [50] "dissolved_oxygen"
## [51] "salinity"
## [52] "speed_of_sound"
## [53] "dissolved_oxygen_concentration"
## [54] "chl_a"
## [55] "phaeo"
## [56] "nitrate_nitrite_ugL"
## [57] "phosphate_ugL"
## [58] "silicate_ugL"
## [59] "mg.region.mean.zm.density.plot"
## [60] "mg.region.se.zm.density.plot"
## [61] "mg.region.mean.zm.density.msq"
## [62] "mg.site.mean.zm.density.plot"
## [63] "mg.site.se.mean.zm.density.plot"
## [64] "mg.site.zm.shoot.biomass.site.g"
## [65] "mg.site.sd.zm.shoot.biomass.g"
## [66] "mg.region.mean.zm.shoot.biomass.g"
## [67] "mg.region.sd.zm.shoot.biomass.g"
## [68] "mg.site.mean.zm.shoot.biomass.g.msq"
## [69] "mg.site.se.zm.shoot.biomass.g.msq"
## [70] "mg.region.mean.zm.shoot.biomass.g.msq"
## [71] "mg.region.se.zm.shoot.biomass.g.msq"
## [72] "mg.site.mean.shoot.length.cm"
## [73] "mg.site.se.shoot.length.cm"
## [74] "mg.region.mean.shoot.length.cm"
## [75] "mg.region.se.shoot.length.cm"
## [76] "mg.site.mean.shoot.epi.biomass.g"
## [77] "mg.site.se.shoot.epi.biomass.g"
## [78] "mg.region.mean.shoot.epi.biomass.g"
## [79] "mg.region.se.shoot.epi.biomass.g"
## [80] "mg.site.mean.meso.biomass.g.per.zm.g"
## [81] "mg.site.se.meso.biomass.g.per.zm.g"
## [82] "mg.region.mean.meso.biomass.g.per.zm.g"
## [83] "mg.region.se.meso.biomass.g.per.zm.g"
## [84] "bed_area_m2"
## [85] "Description"
# Plot an alpha diversity measure versus a discrete variable
ggplot(seagrass,aes(x=region,y=richness, fill=region))+
geom_boxplot()+
theme_classic()+
scale_fill_brewer(labels=c("Choked", "Goose", "McMullin", "Triquet"))+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
text=element_text(size = 15, face= "bold"),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
legend.text = element_text(size=12))+
guides(fill=guide_legend(title="Region of Sample Collection"))+
labs(x = "Region", y="Species Richness")+
scale_x_discrete(labels=c("Choked", "Goose", "McMullin", "Triquet"))
# Get alpha diversity summaries for each region
# You can add lines to calculate any of the alpha diversity measures
# or any other summaries (e.g., max and min)
seagrass = group_by(seagrass,region)
alpha_diversity_summary = summarise(seagrass,
richness_mean = mean(richness),
richness_median = median(richness),
richness_sd = sd(richness),
shannons_mean = mean(shannons),
shannons_median = median(shannons),
shannons_sd = sd(shannons),
abundance_mean = mean(abundance),
abundance_median = median(abundance),
abundance_sd = sd(abundance),
simpsons_mean = mean(simpsons),
simpsons_median = median(simpsons),
simpsons_sd = sd(simpsons),
evenness_mean = mean(evenness),
evenness_median = median(evenness),
evenness_sd = sd(evenness),
chao1_mean = mean(chao1),
chao1_median = median(chao1),
chao1_sd = sd(chao1))
alpha_diversity_summary
## # A tibble: 4 x 19
## region richness_mean richness_median richness_sd shannons_mean shannons_median
## * <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 choked 285. 278 126. 4.05 3.99
## 2 goose 290. 266 150. 3.94 4.05
## 3 mcmul~ 323. 304 151. 4.37 4.27
## 4 triqu~ 340. 318. 132. 4.30 4.60
## # ... with 13 more variables: shannons_sd <dbl>, abundance_mean <dbl>,
## # abundance_median <dbl>, abundance_sd <dbl>, simpsons_mean <dbl>,
## # simpsons_median <dbl>, simpsons_sd <dbl>, evenness_mean <dbl>,
## # evenness_median <dbl>, evenness_sd <dbl>, chao1_mean <dbl>,
## # chao1_median <dbl>, chao1_sd <dbl>
write.csv(alpha_diversity_summary, "diveristy_summary.csv")
EXTRA R
Run a linear regression model or and ANOVA on any of the seagrass variables and discuss your results
## Running a linear regression model
lm1=lm(seagrass$temperature~seagrass$pressure)
# view the linear model output
summary(lm1)
##
## Call:
## lm(formula = seagrass$temperature ~ seagrass$pressure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8809 -1.6896 0.6236 1.3006 2.4385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.1082 1.6224 13.010 < 2e-16 ***
## seagrass$pressure -0.5345 0.1123 -4.759 4.6e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.758 on 147 degrees of freedom
## Multiple R-squared: 0.1335, Adjusted R-squared: 0.1276
## F-statistic: 22.65 on 1 and 147 DF, p-value: 4.604e-06
# view the plots of the linear mode. What do you notice about them? What do these points tell you about these variables?
plot(lm1)
/ / / Linear model Look at how the points are grouped in the Normal QQ plot. This shows that the data are clustered, which can be a problem. Look at points 29 and 30 in the residuals plot. They also have very high leverage. I would be very concerned about the points specifically when conducting an analysis.
Note that a linear model also works with discrete variables (you could have done temperature~site for example).
## Running an ANOVA
# generate anova
a1 = aov(richness~region, seagrass)
a1
## Call:
## aov(formula = richness ~ region, data = seagrass)
##
## Terms:
## region Residuals
## Sum of Squares 72931.9 2764043.0
## Deg. of Freedom 3 145
##
## Residual standard error: 138.0665
## Estimated effects may be unbalanced
# look at anova results
summary(a1)
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 72932 24311 1.275 0.285
## Residuals 145 2764043 19062
plot(a1)
coefficients(a1)
## (Intercept) regiongoose regionmcmullin regiontriquet
## 285.118644 4.631356 37.409134 54.996741
# look for significant differences between then groups
TukeyHSD(a1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = richness ~ region, data = seagrass)
##
## $region
## diff lwr upr p adj
## goose-choked 4.631356 -77.71827 86.98098 0.9988816
## mcmullin-choked 37.409134 -38.48210 113.30036 0.5763807
## triquet-choked 54.996741 -29.47352 139.46700 0.3315201
## mcmullin-goose 32.777778 -57.64269 123.19825 0.7821689
## triquet-goose 50.365385 -47.36694 148.09771 0.5394578
## triquet-mcmullin 17.587607 -74.76836 109.94357 0.9601029
/ / / ANOVA F-value very small, so p-value not significant. The normal Q-Q plot looks fairly normal except the ends. Point 65 and 56 highlighted there also have leverage, which tou can see in the residuals vs leverage plots. Conducted post-hoc test anyways (Tukey HSD) but no sites are significantly from eachother, which we know from the ANOVA results