Thank you so much to Ilan Rubin for his development of the functions for this lab

DO NOT EDIT LINES 10 TO 174, JUST RUN THEM

### Observed Richness ###
# A simple count of the number of species found.
# Usually denoted by R or S
richness = function(x){
  return(sum(x>0))
}

### Abundance ###
# Simply the number of sequences in the sample
# Because of sampling and sequencing biases this is less useful than one would hope.
abundance = function(x){
  return(sum(x,na.rm=TRUE))
}

### Shannon's Diversity Index (entropy) ###
# Usually denoted by H'
# Does not have a specific mathematical explanation, but increases as diversity increases.
# Assumes infinite, well-mixed population.
# Also a measure of both richness and evenness.
shannons = function(x){
  present = x[x>0]
  p = present/sum(present)
  -sum(p*log(p))
}

### Simpson's Diversity Index ###
# Usually denoted by lambda (or l)
# Equals the probability any two randomly chosen individuals in the population are the same.
# Therefore, actually a similarity index. To get diversity, D = 1/l
# Measure of both richness and evenness.
simpsons = function(x){
  p = x/sum(x,na.rm=TRUE)
  sum(p^2)
}

### Pielou's evennenss function ###
# Often denoted J
# Measures community evenness such that 1 is completely even and 0 is uneven.
# Completely even means that all taxa have exactly the same abundance. 
# An uneven community is one dominated by one or a few large taxa and many rare taxa.
# Often used in conjunction with shannons or simpsons.
evenness = function(x){
  present = x[x>0]
  p = present/sum(present)
  -sum(p*log(p))/log(sum(x>0))
}

### Chao1 ###
# Chao1 is a non-parametric estimator of richness. That means it makes no assumptions (infinite population,
# well mixed, specific model to fit and find an asymptote). It is observed richness, corrected
# with the number of singletons (species only seen once) and doubletons (species seen twice).
# There is actually some fancy statistics behind using singletons and doubletons as correctors.
# This may just return observed richness if your dataset was already cleaned to remove rare ASVs
chao1 = function(x){
  f1 = sum(x == 1)
  f2 = sum(x == 2)
  S = sum(x > 0)
  return(S + f1*(f1-1)/(2*(f2+1)))
} 



### Collectors curve ###
# This function take an abundance (OTU) table and calculators a random collectors curve
# for each sample. It then calculates the mean and 95% confidence of the number of taxa
# observed for every individual sampled. You can also enter a variable column to group.
# This will calculate a separate collector's curve for each group of that variable.
# abundance_table: your otu table
# metadata_cols: a vector of the columns that contain metadata and not abundance data
#                leave alone if your your table has no metadata in it
# sample_ID_col: the column name for a column with sample IDs. If you leave it blank
#                the function will just use row numbers instead
# group_by: a column name for a metadata column to calculate separate curves for each
#           group of the variable. If left blank you will get one collector's curve for
#           for the entire data set
# verbose: if TRUE it will keep you updated on progess. Set to FALSE to quiet.
collectors.curve = function(abundance_table,
                            metadata_cols = F,
                            sample_ID_col = "",
                            group_by = "",
                            verbose=FALSE){
  # abundance per sample
  N = rowSums(abundance_table[,-metadata_cols])
  # total richness of all samples
  total_taxa = ncol(abundance_table[,-metadata_cols])

  # array to hold collector's curve counts
  count = matrix(NA,max(N)+1,nrow(abundance_table))
  count[1,] = 0
  
  color_group = rep("",nrow(abundance_table))
  sample_ID = rep("",nrow(abundance_table))
  
  # loop over every sample in the data set
  for (r in 1:nrow(abundance_table)){
    if (verbose) print(paste("Calculating collector's curve ",r,"/",nrow(abundance_table),sep=""))
    abundance = as.numeric(abundance_table[r,-metadata_cols])
    
    # create vector of each individual and which taxa they are
    abundance_long = rep(0,total_taxa)
    c = 1
    for (t in 1:total_taxa){
      if (abundance[t]>0){
        abundance_long[c:(c+abundance[t])] = t
        c = c+abundance[t] + 1
      }
    }
    
    #
    found = rep(0,total_taxa)
    order_sampled = sample(1:N[r])
    for (i in 2:(N[r]+1)){
      if (found[abundance_long[order_sampled[i-1]]] == 0){
        found[abundance_long[order_sampled[i-1]]] = 1
        count[i,r] = count[i-1,r]+1
      }else{
        count[i,r] = count[i-1,r]
      }
    }
    
    # record the sample ID and color group
    if (sample_ID_col==""){
      sample_ID[r] = r
    }else{
      sample_ID[r] = abundance_table[[sample_ID_col]][r]
    }
    if (group_by!="") color_group[r] = abundance_table[[group_by]][r]
  }
  
  # remove the 0th individual sampled
  count = count[-1,]
  # count = data.frame(count)
  

  # calculate the mean and 95 confidence interval of the mean for each color group
  groups = unique(color_group)
  observed_taxa_mean = data.frame(matrix(0,max(N),length(groups)))
  colnames(observed_taxa_mean) = groups
  observed_taxa_95 = data.frame(matrix(0,max(N),length(groups)))
  colnames(observed_taxa_95) = groups
  observed_taxa_N = data.frame(matrix(0,max(N),length(groups)))
  colnames(observed_taxa_N) = groups

  for (g in 1:length(groups)){
    if (verbose) print(paste("Calculating mean and 95% confidence interval for",groups[g]))
    observed_taxa_mean[,g] = apply(count[,color_group==groups[g]],1,mean,na.rm=TRUE)
    observed_taxa_95[,g] = 1.96*apply(count[,color_group==groups[g]],1,sd,na.rm=TRUE)/rowSums(!is.na(count[,color_group==groups[g]]))
    observed_taxa_N[,g] = rowSums(!is.na(count[,color_group==groups[g]]))
  }
  
  # create output data frame
  observed_taxa_mean$individuals_sampled = 1:max(N)
  observed_taxa_mean = pivot_longer(observed_taxa_mean,cols=1:length(groups),names_to=group_by,values_to="observed_taxa")
  observed_taxa_95 = pivot_longer(observed_taxa_95,cols=1:length(groups),names_to=group_by,values_to="confidence_interval_95")
  observed_taxa_N = pivot_longer(observed_taxa_N,cols=1:length(groups),names_to=group_by,values_to="N_samples")
  confidence_interval_95 = observed_taxa_95$confidence_interval_95
  N_samples = observed_taxa_N$N_samples
  
  output = cbind(observed_taxa_mean,confidence_interval_95,N_samples)

  return(output)
}

Set up your workplace

  1. working directory in script
  2. both files are read in an named properly
# Load packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(RColorBrewer)
library(viridis)
## Loading required package: viridisLite
# set working directory
setwd("C:/Users/siobh/OneDrive - The University Of British Columbia/TA/Biol 403 - Parfrey 2020/403 lab scripts")


# Load otu table
seagrass_otu = read.table("Seagrass_2015_otu_metadata_clean.txt",
                          header=TRUE,
                          sep="\t")
# Load full metadata table
seagrass_metadata = read.table("Seagrass_2015_all_metadata.txt",
                          header=TRUE,
                          sep="\t")
  1. export p2 as a log-log plot and as a non log plot in the same form as p3 (in the same file with grid.arrange) (1 bonus) explain what rarefaction is -> normalizing the data but arbitrary threshold selected by the person doing the analysis so can be biased
# Calculate collectors curve for all samples in the data set
seagrass_collectors_region = collectors.curve(seagrass_otu,
                                              metadata_cols=1:3,
                                              sample_ID_col="SampleID",
                                              group_by="region")

# Plot a rarefaction curve for each region
p1 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=observed_taxa)) +
  geom_ribbon(aes(ymin=observed_taxa-confidence_interval_95,
                  ymax=observed_taxa+confidence_interval_95,
                  fill=region),alpha=0.2) +
  geom_line(aes(color=region),alpha=0.8,size=1.5) +
  theme_classic()+
  ylab("Observed taxa") +
  xlab("Individuals sampled") +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_viridis(discrete = TRUE, option = "D")+
  scale_fill_viridis(discrete = TRUE) 
p1
## Warning: Removed 62577 row(s) containing missing values (geom_path).

# Plot the number of samples with a given sampling depth
p2 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=N_samples,color=region)) +
  geom_line(size=1.5,alpha=0.8) +
  ylab("Number of samples") +
  xlab("Individuals sampled") +
  theme_classic()+
  scale_color_viridis(discrete = TRUE, option = "D")+
  scale_fill_viridis(discrete = TRUE) 
p2

# save rarefaction and sampling depth curve
p3=grid.arrange(p1,p2, nrow=1)
## Warning: Removed 62577 row(s) containing missing values (geom_path).

ggsave("seagrass_rarefaction_region.pdf",p3,width=7,height=7)

## plot p1 (rarefaction curve) but on a log-log scale
p4 = ggplot(seagrass_collectors_region,aes(x=individuals_sampled,y=N_samples,color=region)) +
  geom_line(size=1.5,alpha=0.8) +
  ylab("Number of samples") +
  xlab("Individuals sampled") +
  theme_classic()+
  scale_x_log10() +
  scale_y_log10()+
  scale_color_viridis(discrete = TRUE, option = "D")+
  scale_fill_viridis(discrete = TRUE) 
p4
## Warning: Transformation introduced infinite values in continuous y-axis

p5=grid.arrange(p2,p4, nrow=1)
## Warning: Transformation introduced infinite values in continuous y-axis

ggsave("seagrass_p2_p4.pdf",p5,width=7,height=7)

# Use the rrarefy function from vegan to rarefy samples down to desired sampling depth
rarefied_table = rrarefy(seagrass_otu[,-(1:3)],sample=1000)

# put metadata back on the rarefied table
rarefied_table = cbind(seagrass_otu[,1:3],rarefied_table)

# remove any samples with depth less than rarefaction depth
# Be careful with this. Doing with this will ensure the samples you
# keep are all of a certain depth, but you will lose samples.
rarefied_table = rarefied_table[apply(seagrass_otu[,-(1:3)],1,abundance) >= 1000,]

# combine abundance table and metadata into one data frame for plotting
# If you chose to rarefy your data, just replace the following seagrass_otu
# with rarefied_table
seagrass = full_join(seagrass_otu,seagrass_metadata,
                      by=c())
## Joining, by = c("SampleID", "region", "sample_growth")
  1. Make at least 5 changes to the boxplot in this section. If you color your boxplot and your color pallet is not color blind friendly, you will get 0 on this question. Upload this modified graph to the lab 2 checkpoint question discussion.

  2. Add abundance, simpsons, evenness, and chao1 to the summarize function section. Save this new data frame as a CSV and upload it along with your script for the lab assignment submission

# Calculate alpha diversity and save it to the abundance table
seagrass$richness = apply(seagrass_otu[,-(1:3)],1,richness)
seagrass$abundance = apply(seagrass_otu[,-(1:3)],1,abundance)
seagrass$shannons = apply(seagrass_otu[,-(1:3)],1,shannons)
seagrass$simpsons = apply(seagrass_otu[,-(1:3)],1,simpsons)
seagrass$evenness = apply(seagrass_otu[,-(1:3)],1,evenness)
seagrass$chao1 = apply(seagrass_otu[,-(1:3)],1,chao1)

# Look at the names of metadata available to plot
names(seagrass_metadata)
##  [1] "SampleID"                              
##  [2] "BarcodeSequence"                       
##  [3] "LinkerPrimerSequence"                  
##  [4] "swab_id"                               
##  [5] "barcode_plate"                         
##  [6] "barcode_well"                          
##  [7] "project_name"                          
##  [8] "person_responsible"                    
##  [9] "date"                                  
## [10] "Run_id"                                
## [11] "sequence_file"                         
## [12] "region"                                
## [13] "site"                                  
## [14] "latitude"                              
## [15] "longitude"                             
## [16] "sample_method"                         
## [17] "host_species"                          
## [18] "host_type"                             
## [19] "survey_type"                           
## [20] "dna_extraction_plate"                  
## [21] "dna_extraction_well"                   
## [22] "growth"                                
## [23] "sample_growth"                         
## [24] "quadrat_id"                            
## [25] "meso_quadrat_density_shoots_m2"        
## [26] "meso_quadrat_biomass_g_m2"             
## [27] "meso_quadrats_epimass"                 
## [28] "meso_shoot_id"                         
## [29] "meso_shoot_length"                     
## [30] "meso_shoot_width"                      
## [31] "meso_number_of_blades"                 
## [32] "meso_total_dry_wt"                     
## [33] "meso_blade_dry_wt"                     
## [34] "meso_microepiphyte_wt"                 
## [35] "meso_macroepiphyte_wt"                 
## [36] "meso_smithora_wt"                      
## [37] "meso_bryozoans"                        
## [38] "broken"                                
## [39] "smithora_shoot"                        
## [40] "smithora_region"                       
## [41] "ocean_id"                              
## [42] "mean_depth"                            
## [43] "ctd_depth"                             
## [44] "conductivity"                          
## [45] "temperature"                           
## [46] "pressure"                              
## [47] "par"                                   
## [48] "fluorometry_chlorophyll"               
## [49] "turbidity"                             
## [50] "dissolved_oxygen"                      
## [51] "salinity"                              
## [52] "speed_of_sound"                        
## [53] "dissolved_oxygen_concentration"        
## [54] "chl_a"                                 
## [55] "phaeo"                                 
## [56] "nitrate_nitrite_ugL"                   
## [57] "phosphate_ugL"                         
## [58] "silicate_ugL"                          
## [59] "mg.region.mean.zm.density.plot"        
## [60] "mg.region.se.zm.density.plot"          
## [61] "mg.region.mean.zm.density.msq"         
## [62] "mg.site.mean.zm.density.plot"          
## [63] "mg.site.se.mean.zm.density.plot"       
## [64] "mg.site.zm.shoot.biomass.site.g"       
## [65] "mg.site.sd.zm.shoot.biomass.g"         
## [66] "mg.region.mean.zm.shoot.biomass.g"     
## [67] "mg.region.sd.zm.shoot.biomass.g"       
## [68] "mg.site.mean.zm.shoot.biomass.g.msq"   
## [69] "mg.site.se.zm.shoot.biomass.g.msq"     
## [70] "mg.region.mean.zm.shoot.biomass.g.msq" 
## [71] "mg.region.se.zm.shoot.biomass.g.msq"   
## [72] "mg.site.mean.shoot.length.cm"          
## [73] "mg.site.se.shoot.length.cm"            
## [74] "mg.region.mean.shoot.length.cm"        
## [75] "mg.region.se.shoot.length.cm"          
## [76] "mg.site.mean.shoot.epi.biomass.g"      
## [77] "mg.site.se.shoot.epi.biomass.g"        
## [78] "mg.region.mean.shoot.epi.biomass.g"    
## [79] "mg.region.se.shoot.epi.biomass.g"      
## [80] "mg.site.mean.meso.biomass.g.per.zm.g"  
## [81] "mg.site.se.meso.biomass.g.per.zm.g"    
## [82] "mg.region.mean.meso.biomass.g.per.zm.g"
## [83] "mg.region.se.meso.biomass.g.per.zm.g"  
## [84] "bed_area_m2"                           
## [85] "Description"
# Plot an alpha diversity measure versus a discrete variable
ggplot(seagrass,aes(x=region,y=richness, fill=region))+
  geom_boxplot()+
  theme_classic()+
  scale_fill_brewer(labels=c("Choked", "Goose", "McMullin", "Triquet"))+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        text=element_text(size = 15, face= "bold"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.text = element_text(size=12))+
  guides(fill=guide_legend(title="Region of Sample Collection"))+
  labs(x = "Region", y="Species Richness")+
  scale_x_discrete(labels=c("Choked", "Goose", "McMullin", "Triquet"))

# Get alpha diversity summaries for each region
# You can add lines to calculate any of the alpha diversity measures
# or any other summaries (e.g., max and min)
seagrass = group_by(seagrass,region)
alpha_diversity_summary = summarise(seagrass,
                                    richness_mean = mean(richness),
                                    richness_median = median(richness),
                                    richness_sd = sd(richness),
                                    shannons_mean = mean(shannons),
                                    shannons_median = median(shannons),
                                    shannons_sd = sd(shannons),
                                    abundance_mean = mean(abundance),
                                    abundance_median = median(abundance),
                                    abundance_sd = sd(abundance),
                                    simpsons_mean = mean(simpsons),
                                    simpsons_median = median(simpsons),
                                    simpsons_sd = sd(simpsons),
                                    evenness_mean = mean(evenness),
                                    evenness_median = median(evenness),
                                    evenness_sd = sd(evenness),
                                    chao1_mean = mean(chao1),
                                    chao1_median = median(chao1),
                                    chao1_sd = sd(chao1))
alpha_diversity_summary
## # A tibble: 4 x 19
##   region richness_mean richness_median richness_sd shannons_mean shannons_median
## * <chr>          <dbl>           <dbl>       <dbl>         <dbl>           <dbl>
## 1 choked          285.            278         126.          4.05            3.99
## 2 goose           290.            266         150.          3.94            4.05
## 3 mcmul~          323.            304         151.          4.37            4.27
## 4 triqu~          340.            318.        132.          4.30            4.60
## # ... with 13 more variables: shannons_sd <dbl>, abundance_mean <dbl>,
## #   abundance_median <dbl>, abundance_sd <dbl>, simpsons_mean <dbl>,
## #   simpsons_median <dbl>, simpsons_sd <dbl>, evenness_mean <dbl>,
## #   evenness_median <dbl>, evenness_sd <dbl>, chao1_mean <dbl>,
## #   chao1_median <dbl>, chao1_sd <dbl>
write.csv(alpha_diversity_summary, "diveristy_summary.csv")

EXTRA R

Run a linear regression model or and ANOVA on any of the seagrass variables and discuss your results

## Running a linear regression model 
lm1=lm(seagrass$temperature~seagrass$pressure)

# view the linear model output
summary(lm1)
## 
## Call:
## lm(formula = seagrass$temperature ~ seagrass$pressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8809 -1.6896  0.6236  1.3006  2.4385 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        21.1082     1.6224  13.010  < 2e-16 ***
## seagrass$pressure  -0.5345     0.1123  -4.759  4.6e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.758 on 147 degrees of freedom
## Multiple R-squared:  0.1335, Adjusted R-squared:  0.1276 
## F-statistic: 22.65 on 1 and 147 DF,  p-value: 4.604e-06
# view the plots of the linear mode. What do you notice about them? What do these points tell you about these variables?
plot(lm1)

/ / / Linear model Look at how the points are grouped in the Normal QQ plot. This shows that the data are clustered, which can be a problem. Look at points 29 and 30 in the residuals plot. They also have very high leverage. I would be very concerned about the points specifically when conducting an analysis.

Note that a linear model also works with discrete variables (you could have done temperature~site for example).

## Running an ANOVA
  # generate anova
a1 = aov(richness~region, seagrass)
a1
## Call:
##    aov(formula = richness ~ region, data = seagrass)
## 
## Terms:
##                    region Residuals
## Sum of Squares    72931.9 2764043.0
## Deg. of Freedom         3       145
## 
## Residual standard error: 138.0665
## Estimated effects may be unbalanced
  # look at anova results
summary(a1)
##              Df  Sum Sq Mean Sq F value Pr(>F)
## region        3   72932   24311   1.275  0.285
## Residuals   145 2764043   19062
plot(a1)

coefficients(a1)
##    (Intercept)    regiongoose regionmcmullin  regiontriquet 
##     285.118644       4.631356      37.409134      54.996741
  # look for significant differences between then groups
TukeyHSD(a1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = richness ~ region, data = seagrass)
## 
## $region
##                       diff       lwr       upr     p adj
## goose-choked      4.631356 -77.71827  86.98098 0.9988816
## mcmullin-choked  37.409134 -38.48210 113.30036 0.5763807
## triquet-choked   54.996741 -29.47352 139.46700 0.3315201
## mcmullin-goose   32.777778 -57.64269 123.19825 0.7821689
## triquet-goose    50.365385 -47.36694 148.09771 0.5394578
## triquet-mcmullin 17.587607 -74.76836 109.94357 0.9601029

/ / / ANOVA F-value very small, so p-value not significant. The normal Q-Q plot looks fairly normal except the ends. Point 65 and 56 highlighted there also have leverage, which tou can see in the residuals vs leverage plots. Conducted post-hoc test anyways (Tukey HSD) but no sites are significantly from eachother, which we know from the ANOVA results