Lab 2: Alpha diversity with seagrass data

Thank you so much to Ilan Rubin for his development of the functions for this lab

DO NOT EDIT LINES 10 TO 174, JUST RUN THEM

### Observed Richness ###
# A simple count of the number of species found.
# Usually denoted by R or S
richness = function(x){
  return(sum(x>0))
}

### Abundance ###
# Simply the number of sequences in the sample
# Because of sampling and sequencing biases this is less useful than one would hope.
abundance = function(x){
  return(sum(x,na.rm=TRUE))
}

### Shannon's Diversity Index (entropy) ###
# Usually denoted by H'
# Does not have a specific mathematical explanation, but increases as diversity increases.
# Assumes infinite, well-mixed population.
# Also a measure of both richness and evenness.
shannons = function(x){
  present = x[x>0]
  p = present/sum(present)
  -sum(p*log(p))
}

### Simpson's Diversity Index ###
# Usually denoted by lambda (or l)
# Equals the probability any two randomly chosen individuals in the population are the same.
# Therefore, actually a similarity index. To get diversity, D = 1/l
# Measure of both richness and evenness.
simpsons = function(x){
  p = x/sum(x,na.rm=TRUE)
  sum(p^2)
}

### Pielou's evennenss function ###
# Often denoted J
# Measures community evenness such that 1 is completely even and 0 is uneven.
# Completely even means that all taxa have exactly the same abundance. 
# An uneven community is one dominated by one or a few large taxa and many rare taxa.
# Often used in conjunction with shannons or simpsons.
evenness = function(x){
  present = x[x>0]
  p = present/sum(present)
  -sum(p*log(p))/log(sum(x>0))
}

### Chao1 ###
# Chao1 is a non-parametric estimator of richness. That means it makes no assumptions (infinite population,
# well mixed, specific model to fit and find an asymptote). It is observed richness, corrected
# with the number of singletons (species only seen once) and doubletons (species seen twice).
# There is actually some fancy statistics behind using singletons and doubletons as correctors.
# This may just return observed richness if your dataset was already cleaned to remove rare ASVs
chao1 = function(x){
  f1 = sum(x == 1)
  f2 = sum(x == 2)
  S = sum(x > 0)
  return(S + f1*(f1-1)/(2*(f2+1)))
} 



### Collectors curve ###
# This function take an abundance (OTU) table and calculators a random collectors curve
# for each sample. It then calculates the mean and 95% confidence of the number of taxa
# observed for every individual sampled. You can also enter a variable column to group.
# This will calculate a separate collector's curve for each group of that variable.
# abundance_table: your otu table
# metadata_cols: a vector of the columns that contain metadata and not abundance data
#                leave alone if your your table has no metadata in it
# sample_ID_col: the column name for a column with sample IDs. If you leave it blank
#                the function will just use row numbers instead
# group_by: a column name for a metadata column to calculate separate curves for each
#           group of the variable. If left blank you will get one collector's curve for
#           for the entire data set
# verbose: if TRUE it will keep you updated on progess. Set to FALSE to quiet.
collectors.curve = function(abundance_table,
                            metadata_cols = F,
                            sample_ID_col = "",
                            group_by = "",
                            verbose=FALSE){
  # abundance per sample
  N = rowSums(abundance_table[,-metadata_cols])
  # total richness of all samples
  total_taxa = ncol(abundance_table[,-metadata_cols])

  # array to hold collector's curve counts
  count = matrix(NA,max(N)+1,nrow(abundance_table))
  count[1,] = 0
  
  color_group = rep("",nrow(abundance_table))
  sample_ID = rep("",nrow(abundance_table))
  
  # loop over every sample in the data set
  for (r in 1:nrow(abundance_table)){
    if (verbose) print(paste("Calculating collector's curve ",r,"/",nrow(abundance_table),sep=""))
    abundance = as.numeric(abundance_table[r,-metadata_cols])
    
    # create vector of each individual and which taxa they are
    abundance_long = rep(0,total_taxa)
    c = 1
    for (t in 1:total_taxa){
      if (abundance[t]>0){
        abundance_long[c:(c+abundance[t])] = t
        c = c+abundance[t] + 1
      }
    }
    
    #
    found = rep(0,total_taxa)
    order_sampled = sample(1:N[r])
    for (i in 2:(N[r]+1)){
      if (found[abundance_long[order_sampled[i-1]]] == 0){
        found[abundance_long[order_sampled[i-1]]] = 1
        count[i,r] = count[i-1,r]+1
      }else{
        count[i,r] = count[i-1,r]
      }
    }
    
    # record the sample ID and color group
    if (sample_ID_col==""){
      sample_ID[r] = r
    }else{
      sample_ID[r] = abundance_table[[sample_ID_col]][r]
    }
    if (group_by!="") color_group[r] = abundance_table[[group_by]][r]
  }
  
  # remove the 0th individual sampled
  count = count[-1,]
  # count = data.frame(count)
  

  # calculate the mean and 95 confidence interval of the mean for each color group
  groups = unique(color_group)
  observed_taxa_mean = data.frame(matrix(0,max(N),length(groups)))
  colnames(observed_taxa_mean) = groups
  observed_taxa_95 = data.frame(matrix(0,max(N),length(groups)))
  colnames(observed_taxa_95) = groups
  observed_taxa_N = data.frame(matrix(0,max(N),length(groups)))
  colnames(observed_taxa_N) = groups

  for (g in 1:length(groups)){
    if (verbose) print(paste("Calculating mean and 95% confidence interval for",groups[g]))
    observed_taxa_mean[,g] = apply(count[,color_group==groups[g]],1,mean,na.rm=TRUE)
    observed_taxa_95[,g] = 1.96*apply(count[,color_group==groups[g]],1,sd,na.rm=TRUE)/rowSums(!is.na(count[,color_group==groups[g]]))
    observed_taxa_N[,g] = rowSums(!is.na(count[,color_group==groups[g]]))
  }
  
  # create output data frame
  observed_taxa_mean$individuals_sampled = 1:max(N)
  observed_taxa_mean = pivot_longer(observed_taxa_mean,cols=1:length(groups),names_to=group_by,values_to="observed_taxa")
  observed_taxa_95 = pivot_longer(observed_taxa_95,cols=1:length(groups),names_to=group_by,values_to="confidence_interval_95")
  observed_taxa_N = pivot_longer(observed_taxa_N,cols=1:length(groups),names_to=group_by,values_to="N_samples")
  confidence_interval_95 = observed_taxa_95$confidence_interval_95
  N_samples = observed_taxa_N$N_samples
  
  output = cbind(observed_taxa_mean,confidence_interval_95,N_samples)

  return(output)
}

Set up your workplace

working directory in script
both files are read in an named properly

# Load packages
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.5-7

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(RColorBrewer)
library(viridis)

## Loading required package: viridisLite

# set working directory
setwd("C:/Users/siobh/OneDrive - The University Of British Columbia/TA/Biol 403 - Parfrey 2020/403 lab scripts")


# Load otu table
seagrass_otu = read.table("Seagrass_2015_otu_metadata_clean.txt",
                          header=TRUE,
                          sep="\t")
# Load full metadata table
seagrass_metadata = read.table("Seagrass_2015_all_metadata.txt",
                          header=TRUE,
                          sep="\t")

export p2 as a log-log plot and as a non log plot in the same form as p3 (in the same file with grid.arrange) (1 bonus) explain what rarefaction is -> normalizing the data but arbitrary threshold selected by the person doing the analysis so can be biased

# Calculate collectors curve for all samples in the data set
seagrass_collectors_region = collectors.curve(seagrass_otu,
                                              metadata_cols=1:3,
                                              sample_ID_col="SampleID",
                                              group_by="region")

# Plot a rarefaction curve for each region
p1 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=observed_taxa)) +
  geom_ribbon(aes(ymin=observed_taxa-confidence_interval_95,
                  ymax=observed_taxa+confidence_interval_95,
                  fill=region),alpha=0.2) +
  geom_line(aes(color=region),alpha=0.8,size=1.5) +
  theme_classic()+
  ylab("Observed taxa") +
  xlab("Individuals sampled") +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_viridis(discrete = TRUE, option = "D")+
  scale_fill_viridis(discrete = TRUE) 
p1

## Warning: Removed 62577 row(s) containing missing values (geom_path).

# Plot the number of samples with a given sampling depth
p2 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=N_samples,color=region)) +
  geom_line(size=1.5,alpha=0.8) +
  ylab("Number of samples") +
  xlab("Individuals sampled") +
  theme_classic()+
  scale_color_viridis(discrete = TRUE, option = "D")+
  scale_fill_viridis(discrete = TRUE) 
p2

# save rarefaction and sampling depth curve
p3=grid.arrange(p1,p2, nrow=1)

## Warning: Removed 62577 row(s) containing missing values (geom_path).

ggsave("seagrass_rarefaction_region.pdf",p3,width=7,height=7)

## plot p1 (rarefaction curve) but on a log-log scale
p4 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=N_samples,color=region)) +
  geom_line(size=1.5,alpha=0.8) +
  ylab("Number of samples") +
  xlab("Individuals sampled") +
  theme_classic()+
  scale_x_log10() +
  scale_y_log10()+
  scale_color_viridis(discrete = TRUE, option = "D")+
  scale_fill_viridis(discrete = TRUE) 
p4

## Warning: Transformation introduced infinite values in continuous y-axis

p5=grid.arrange(p2,p4, nrow=1)

## Warning: Transformation introduced infinite values in continuous y-axis

ggsave("seagrass_p2_p4.pdf",p5,width=7,height=7)

# Use the rrarefy function from vegan to rarefy samples down to desired sampling depth
rarefied_table = rrarefy(seagrass_otu[,-(1:3)],sample=1000)

# put metadata back on the rarefied table
rarefied_table = cbind(seagrass_otu[,1:3],rarefied_table)

# remove any samples with depth less than rarefaction depth
# Be careful with this. Doing with this will ensure the samples you
# keep are all of a certain depth, but you will lose samples.
rarefied_table = rarefied_table[apply(seagrass_otu[,-(1:3)],1,abundance) >= 1000,]

# combine abundance table and metadata into one data frame for plotting
# If you chose to rarefy your data, just replace the following seagrass_otu
# with rarefied_table
seagrass = full_join(seagrass_otu,seagrass_metadata,
                      by=c())

## Joining, by = c("SampleID", "region", "sample_growth")

Make at least 5 changes to the boxplot in this section. If you color your boxplot and your color pallet is not color blind friendly, you will get 0 on this question. Upload this modified graph to the lab 2 checkpoint question discussion.
Add abundance, simpsons, evenness, and chao1 to the summarize function section. Save this new data frame as a CSV and upload it along with your script for the lab assignment submission

# Calculate alpha diversity and save it to the abundance table
seagrass$richness = apply(seagrass_otu[,-(1:3)],1,richness)
seagrass$abundance = apply(seagrass_otu[,-(1:3)],1,abundance)
seagrass$shannons = apply(seagrass_otu[,-(1:3)],1,shannons)
seagrass$simpsons = apply(seagrass_otu[,-(1:3)],1,simpsons)
seagrass$evenness = apply(seagrass_otu[,-(1:3)],1,evenness)
seagrass$chao1 = apply(seagrass_otu[,-(1:3)],1,chao1)

# Look at the names of metadata available to plot
names(seagrass_metadata)

##  [1] "SampleID"                              
##  [2] "BarcodeSequence"                       
##  [3] "LinkerPrimerSequence"                  
##  [4] "swab_id"                               
##  [5] "barcode_plate"                         
##  [6] "barcode_well"                          
##  [7] "project_name"                          
##  [8] "person_responsible"                    
##  [9] "date"                                  
## [10] "Run_id"                                
## [11] "sequence_file"                         
## [12] "region"                                
## [13] "site"                                  
## [14] "latitude"                              
## [15] "longitude"                             
## [16] "sample_method"                         
## [17] "host_species"                          
## [18] "host_type"                             
## [19] "survey_type"                           
## [20] "dna_extraction_plate"                  
## [21] "dna_extraction_well"                   
## [22] "growth"                                
## [23] "sample_growth"                         
## [24] "quadrat_id"                            
## [25] "meso_quadrat_density_shoots_m2"        
## [26] "meso_quadrat_biomass_g_m2"             
## [27] "meso_quadrats_epimass"                 
## [28] "meso_shoot_id"                         
## [29] "meso_shoot_length"                     
## [30] "meso_shoot_width"                      
## [31] "meso_number_of_blades"                 
## [32] "meso_total_dry_wt"                     
## [33] "meso_blade_dry_wt"                     
## [34] "meso_microepiphyte_wt"                 
## [35] "meso_macroepiphyte_wt"                 
## [36] "meso_smithora_wt"                      
## [37] "meso_bryozoans"                        
## [38] "broken"                                
## [39] "smithora_shoot"                        
## [40] "smithora_region"                       
## [41] "ocean_id"                              
## [42] "mean_depth"                            
## [43] "ctd_depth"                             
## [44] "conductivity"                          
## [45] "temperature"                           
## [46] "pressure"                              
## [47] "par"                                   
## [48] "fluorometry_chlorophyll"               
## [49] "turbidity"                             
## [50] "dissolved_oxygen"                      
## [51] "salinity"                              
## [52] "speed_of_sound"                        
## [53] "dissolved_oxygen_concentration"        
## [54] "chl_a"                                 
## [55] "phaeo"                                 
## [56] "nitrate_nitrite_ugL"                   
## [57] "phosphate_ugL"                         
## [58] "silicate_ugL"                          
## [59] "mg.region.mean.zm.density.plot"        
## [60] "mg.region.se.zm.density.plot"          
## [61] "mg.region.mean.zm.density.msq"         
## [62] "mg.site.mean.zm.density.plot"          
## [63] "mg.site.se.mean.zm.density.plot"       
## [64] "mg.site.zm.shoot.biomass.site.g"       
## [65] "mg.site.sd.zm.shoot.biomass.g"         
## [66] "mg.region.mean.zm.shoot.biomass.g"     
## [67] "mg.region.sd.zm.shoot.biomass.g"       
## [68] "mg.site.mean.zm.shoot.biomass.g.msq"   
## [69] "mg.site.se.zm.shoot.biomass.g.msq"     
## [70] "mg.region.mean.zm.shoot.biomass.g.msq" 
## [71] "mg.region.se.zm.shoot.biomass.g.msq"   
## [72] "mg.site.mean.shoot.length.cm"          
## [73] "mg.site.se.shoot.length.cm"            
## [74] "mg.region.mean.shoot.length.cm"        
## [75] "mg.region.se.shoot.length.cm"          
## [76] "mg.site.mean.shoot.epi.biomass.g"      
## [77] "mg.site.se.shoot.epi.biomass.g"        
## [78] "mg.region.mean.shoot.epi.biomass.g"    
## [79] "mg.region.se.shoot.epi.biomass.g"      
## [80] "mg.site.mean.meso.biomass.g.per.zm.g"  
## [81] "mg.site.se.meso.biomass.g.per.zm.g"    
## [82] "mg.region.mean.meso.biomass.g.per.zm.g"
## [83] "mg.region.se.meso.biomass.g.per.zm.g"  
## [84] "bed_area_m2"                           
## [85] "Description"

# Plot an alpha diversity measure versus a discrete variable
ggplot(seagrass,aes(x=region,y=richness, fill=region))+
  geom_boxplot()+
  theme_classic()+
  scale_fill_brewer(labels=c("Choked", "Goose", "McMullin", "Triquet"))+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        text=element_text(size = 15, face= "bold"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.text = element_text(size=12))+
  guides(fill=guide_legend(title="Region of Sample Collection"))+
  labs(x = "Region", y="Species Richness")+
  scale_x_discrete(labels=c("Choked", "Goose", "McMullin", "Triquet"))

# Get alpha diversity summaries for each region
# You can add lines to calculate any of the alpha diversity measures
# or any other summaries (e.g., max and min)
seagrass = group_by(seagrass,region)
alpha_diversity_summary = summarise(seagrass,
                                    richness_mean = mean(richness),
                                    richness_median = median(richness),
                                    richness_sd = sd(richness),
                                    shannons_mean = mean(shannons),
                                    shannons_median = median(shannons),
                                    shannons_sd = sd(shannons),
                                    abundance_mean = mean(abundance),
                                    abundance_median = median(abundance),
                                    abundance_sd = sd(abundance),
                                    simpsons_mean = mean(simpsons),
                                    simpsons_median = median(simpsons),
                                    simpsons_sd = sd(simpsons),
                                    evenness_mean = mean(evenness),
                                    evenness_median = median(evenness),
                                    evenness_sd = sd(evenness),
                                    chao1_mean = mean(chao1),
                                    chao1_median = median(chao1),
                                    chao1_sd = sd(chao1))
alpha_diversity_summary

## # A tibble: 4 x 19
##   region richness_mean richness_median richness_sd shannons_mean shannons_median
## * <chr>          <dbl>           <dbl>       <dbl>         <dbl>           <dbl>
## 1 choked          285.            278         126.          4.05            3.99
## 2 goose           290.            266         150.          3.94            4.05
## 3 mcmul~          323.            304         151.          4.37            4.27
## 4 triqu~          340.            318.        132.          4.30            4.60
## # ... with 13 more variables: shannons_sd <dbl>, abundance_mean <dbl>,
## #   abundance_median <dbl>, abundance_sd <dbl>, simpsons_mean <dbl>,
## #   simpsons_median <dbl>, simpsons_sd <dbl>, evenness_mean <dbl>,
## #   evenness_median <dbl>, evenness_sd <dbl>, chao1_mean <dbl>,
## #   chao1_median <dbl>, chao1_sd <dbl>

write.csv(alpha_diversity_summary, "diveristy_summary.csv")

EXTRA R

Run a linear regression model or and ANOVA on any of the seagrass variables and discuss your results

## Running a linear regression model 
lm1=lm(seagrass$temperature~seagrass$pressure)

# view the linear model output
summary(lm1)

## 
## Call:
## lm(formula = seagrass$temperature ~ seagrass$pressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8809 -1.6896  0.6236  1.3006  2.4385 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        21.1082     1.6224  13.010  < 2e-16 ***
## seagrass$pressure  -0.5345     0.1123  -4.759  4.6e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.758 on 147 degrees of freedom
## Multiple R-squared:  0.1335, Adjusted R-squared:  0.1276 
## F-statistic: 22.65 on 1 and 147 DF,  p-value: 4.604e-06

# view the plots of the linear mode. What do you notice about them? What do these points tell you about these variables?
plot(lm1)

/ / / Linear model Look at how the points are grouped in the Normal QQ plot. This shows that the data are clustered, which can be a problem. Look at points 29 and 30 in the residuals plot. They also have very high leverage. I would be very concerned about the points specifically when conducting an analysis.

Note that a linear model also works with discrete variables (you could have done temperature~site for example).

## Running an ANOVA
  # generate anova
a1 = aov(richness~region, seagrass)
a1

## Call:
##    aov(formula = richness ~ region, data = seagrass)
## 
## Terms:
##                    region Residuals
## Sum of Squares    72931.9 2764043.0
## Deg. of Freedom         3       145
## 
## Residual standard error: 138.0665
## Estimated effects may be unbalanced

  # look at anova results
summary(a1)

##              Df  Sum Sq Mean Sq F value Pr(>F)
## region        3   72932   24311   1.275  0.285
## Residuals   145 2764043   19062

plot(a1)

coefficients(a1)

##    (Intercept)    regiongoose regionmcmullin  regiontriquet 
##     285.118644       4.631356      37.409134      54.996741

  # look for significant differences between then groups
TukeyHSD(a1)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = richness ~ region, data = seagrass)
## 
## $region
##                       diff       lwr       upr     p adj
## goose-choked      4.631356 -77.71827  86.98098 0.9988816
## mcmullin-choked  37.409134 -38.48210 113.30036 0.5763807
## triquet-choked   54.996741 -29.47352 139.46700 0.3315201
## mcmullin-goose   32.777778 -57.64269 123.19825 0.7821689
## triquet-goose    50.365385 -47.36694 148.09771 0.5394578
## triquet-mcmullin 17.587607 -74.76836 109.94357 0.9601029

/ / / ANOVA F-value very small, so p-value not significant. The normal Q-Q plot looks fairly normal except the ends. Point 65 and 56 highlighted there also have leverage, which tou can see in the residuals vs leverage plots. Conducted post-hoc test anyways (Tukey HSD) but no sites are significantly from eachother, which we know from the ANOVA results