Rarefaction shows we have a sufficient sample size

What follows is basically a rarefaction analysis.

Import data from Excel

coding_wb <- loadWorkbook("01 Sample Size Sufficiency/Subsetting Coding Sequences/200_Seq_output.xlsx")
coding_lst = readWorksheet(coding_wb, sheet = getSheets(coding_wb))
coding <- as_tibble(coding_lst$`200_Seq_output`)

Plot diversity statistics by number of alleles (n)

coding %>%
  ggplot(mapping = aes(x = n, y = S)) + 
  geom_jitter(alpha = 0.5, height = 0) +
  labs(x = "Number of Alleles", y = "S (Segregating Sites)") +
  geom_smooth(color = "red")
## `geom_smooth()` using method = 'loess'

coding %>%
  ggplot(mapping = aes(x = n, y = Pi)) + 
  geom_jitter(alpha = 0.5, height = 0) +
  labs(x = "Number of Alleles", y = "Nucleotide Diversity (Pi)") +
  geom_smooth(color = "red")
## `geom_smooth()` using method = 'loess'

coding %>%
  ggplot(mapping = aes(x = n, y = ThetaWattNuc)) + 
  geom_jitter(alpha = 0.5, height = 0) +
  labs(x = "Number of Alleles", y = "Watterson's Theta") +
  geom_smooth(color = "red")
## `geom_smooth()` using method = 'loess'

coding %>%
  mutate(Uniq_hap = Hap / n) %>%
  ggplot(mapping = aes(x = n, y = Uniq_hap)) + 
  geom_jitter(alpha = 0.5, height = 0) +
  labs(x = "Number of Alleles", y = "Fraction of Unique Haplotypes") +
  geom_smooth(color = "red")
## `geom_smooth()` using method = 'loess'

I think it might be sufficient just to show the above plots for Pi and Theta and say, look, they don’t change that much once you have more than ~15 alleles, which we definitely do. For S, I can model the relationship with an R^2 of 96% (see first linear model below). While we’re not saturated for the number of segregating sites we could possibly find, I’d argue that 400 is more than enough to make a SFS, look at the distribution across the gene sequence, etc etc.

NB: This is as far as you probably need to read. If you’ve made it this far, thanks!



Linear models for these diversity statistics.

Model for S

coding <- coding %>%
  mutate(Uniq_hap = Hap / n,
         lS = log2(S),
         lUniq_hap = log2(Uniq_hap),
         ln = log2(n),
         lPi = log2(Pi)
  )

coding %>% 
  ggplot(mapping = aes(x = ln, y = S)) + 
  geom_jitter(alpha = 0.5, height = 0) +
  labs(x = "Log(Number of Alleles)", y = "S (Segregating Sites)")

simS <- lm(S ~ ln, data = coding)
summary(simS)
## 
## Call:
## lm(formula = S ~ ln, data = coding)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.178  -7.357   1.985   9.717  35.453 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -9.926      4.869  -2.039   0.0428 *  
## ln            89.658      1.185  75.639   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.24 on 199 degrees of freedom
## Multiple R-squared:  0.9664, Adjusted R-squared:  0.9662 
## F-statistic:  5721 on 1 and 199 DF,  p-value: < 2.2e-16
grid <- coding %>%
  add_predictions(simS, "S_pred") %>%
  add_residuals(simS, "S_resid")

ggplot(grid, aes(S, S_pred)) + 
  geom_point() + geom_abline(slope = 1, intercept = 0) + 
  labs(x = "Observed S", y = "Predicted S")

ggplot(grid, aes(n, S_resid)) + 
  geom_point() + geom_abline(slope = 0, intercept = 0) + 
  labs(x = "Number of Alleles", y = "Residuals for Model of S (In Number of Sites)")

Model for Pi

coding %>%
  filter(n >= 5) %>%
  ggplot(mapping = aes(x = n, y = Pi)) + 
  geom_jitter(alpha = 0.5, height = 0) +
  labs(x = "Number of Alleles", y = "Nucleotide Diversity (Pi)")

simPi <- lm(Pi ~ n, data = coding[coding$n >= 5,])
summary(simPi)
## 
## Call:
## lm(formula = Pi ~ n, data = coding[coding$n >= 5, ])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.297e-03 -4.726e-04  3.912e-05  4.735e-04  2.338e-03 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.215e-02  1.654e-04 254.858  < 2e-16 ***
## n           -5.935e-05  7.746e-06  -7.662 9.63e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000857 on 187 degrees of freedom
## Multiple R-squared:  0.2389, Adjusted R-squared:  0.2348 
## F-statistic:  58.7 on 1 and 187 DF,  p-value: 9.627e-13
grid <- coding %>%
  add_predictions(simPi, "Pi_pred") %>%
  add_residuals(simPi, "Pi_resid")

ggplot(grid, aes(Pi, Pi_pred)) + 
  geom_point() + geom_abline(slope = 1, intercept = 0) + 
  labs(x = "Observed Values for Pi", y = "Predicted Values for Pi")

ggplot(grid, aes(n, Pi_resid)) + 
  geom_point() + geom_abline(slope = 0, intercept = 0) + 
  labs(x = "Number of Alleles", y = "Residuals for Model of Pi")

Model for Watterson’s Theta

coding %>%
  filter(n >= 6) %>%
  ggplot(mapping = aes(x = n, y = ThetaWattNuc)) + 
  geom_jitter(alpha = 0.5, height = 0) +
  labs(x = "Number of Alleles", y = "Nucleotide Diversity (Pi)")

simThetaWattNuc <- lm(ThetaWattNuc ~ n, data = coding)
summary(simThetaWattNuc)
## 
## Call:
## lm(formula = ThetaWattNuc ~ n, data = coding)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.490  -3.285   0.313   3.558  14.409 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 115.78986    1.05137  110.13   <2e-16 ***
## n             0.56673    0.05073   11.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.262 on 199 degrees of freedom
## Multiple R-squared:  0.3854, Adjusted R-squared:  0.3823 
## F-statistic: 124.8 on 1 and 199 DF,  p-value: < 2.2e-16
grid <- coding %>%
  add_predictions(simThetaWattNuc, "ThetaWattNuc_pred") %>%
  add_residuals(simThetaWattNuc, "ThetaWattNuc_resid")

ggplot(grid, aes(ThetaWattNuc, ThetaWattNuc_pred)) + 
  geom_point() + geom_abline(slope = 1, intercept = 0) + 
  labs(x = "Observed Theta", y = "Predicted Theta")

ggplot(grid, aes(n, ThetaWattNuc_resid)) + 
  geom_point() + geom_abline(slope = 0, intercept = 0) + 
  labs(x = "Number of Alleles", y = "Residuals for Model of Theta")