What follows is basically a rarefaction analysis.
I have run a bunch of diversity analyses on different subsets of the coding region for Rpp8 data (removing alleles in MEGA so I have 200 data subsets with anywhere between 2 and 32 coding sequences for Rpp8).
Here I plot the diversity results by the number of alleles included in the analysis.
Then I get the linear model coefficients that correspond to the relationships you see.
In the letter, I will make some statements about how well the diversity summary statistics behave given my number of alleles and the relationships that I’ve plotted and modeled. I still need to write that part up.
coding_wb <- loadWorkbook("01 Sample Size Sufficiency/Subsetting Coding Sequences/200_Seq_output.xlsx")
coding_lst = readWorksheet(coding_wb, sheet = getSheets(coding_wb))
coding <- as_tibble(coding_lst$`200_Seq_output`)
n)coding %>%
ggplot(mapping = aes(x = n, y = S)) +
geom_jitter(alpha = 0.5, height = 0) +
labs(x = "Number of Alleles", y = "S (Segregating Sites)") +
geom_smooth(color = "red")
## `geom_smooth()` using method = 'loess'
coding %>%
ggplot(mapping = aes(x = n, y = Pi)) +
geom_jitter(alpha = 0.5, height = 0) +
labs(x = "Number of Alleles", y = "Nucleotide Diversity (Pi)") +
geom_smooth(color = "red")
## `geom_smooth()` using method = 'loess'
coding %>%
ggplot(mapping = aes(x = n, y = ThetaWattNuc)) +
geom_jitter(alpha = 0.5, height = 0) +
labs(x = "Number of Alleles", y = "Watterson's Theta") +
geom_smooth(color = "red")
## `geom_smooth()` using method = 'loess'
coding %>%
mutate(Uniq_hap = Hap / n) %>%
ggplot(mapping = aes(x = n, y = Uniq_hap)) +
geom_jitter(alpha = 0.5, height = 0) +
labs(x = "Number of Alleles", y = "Fraction of Unique Haplotypes") +
geom_smooth(color = "red")
## `geom_smooth()` using method = 'loess'
I think it might be sufficient just to show the above plots for Pi and Theta and say, look, they don’t change that much once you have more than ~15 alleles, which we definitely do. For S, I can model the relationship with an R^2 of 96% (see first linear model below). While we’re not saturated for the number of segregating sites we could possibly find, I’d argue that 400 is more than enough to make a SFS, look at the distribution across the gene sequence, etc etc.
coding <- coding %>%
mutate(Uniq_hap = Hap / n,
lS = log2(S),
lUniq_hap = log2(Uniq_hap),
ln = log2(n),
lPi = log2(Pi)
)
coding %>%
ggplot(mapping = aes(x = ln, y = S)) +
geom_jitter(alpha = 0.5, height = 0) +
labs(x = "Log(Number of Alleles)", y = "S (Segregating Sites)")
simS <- lm(S ~ ln, data = coding)
summary(simS)
##
## Call:
## lm(formula = S ~ ln, data = coding)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.178 -7.357 1.985 9.717 35.453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.926 4.869 -2.039 0.0428 *
## ln 89.658 1.185 75.639 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.24 on 199 degrees of freedom
## Multiple R-squared: 0.9664, Adjusted R-squared: 0.9662
## F-statistic: 5721 on 1 and 199 DF, p-value: < 2.2e-16
grid <- coding %>%
add_predictions(simS, "S_pred") %>%
add_residuals(simS, "S_resid")
ggplot(grid, aes(S, S_pred)) +
geom_point() + geom_abline(slope = 1, intercept = 0) +
labs(x = "Observed S", y = "Predicted S")
ggplot(grid, aes(n, S_resid)) +
geom_point() + geom_abline(slope = 0, intercept = 0) +
labs(x = "Number of Alleles", y = "Residuals for Model of S (In Number of Sites)")
coding %>%
filter(n >= 5) %>%
ggplot(mapping = aes(x = n, y = Pi)) +
geom_jitter(alpha = 0.5, height = 0) +
labs(x = "Number of Alleles", y = "Nucleotide Diversity (Pi)")
simPi <- lm(Pi ~ n, data = coding[coding$n >= 5,])
summary(simPi)
##
## Call:
## lm(formula = Pi ~ n, data = coding[coding$n >= 5, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.297e-03 -4.726e-04 3.912e-05 4.735e-04 2.338e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.215e-02 1.654e-04 254.858 < 2e-16 ***
## n -5.935e-05 7.746e-06 -7.662 9.63e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.000857 on 187 degrees of freedom
## Multiple R-squared: 0.2389, Adjusted R-squared: 0.2348
## F-statistic: 58.7 on 1 and 187 DF, p-value: 9.627e-13
grid <- coding %>%
add_predictions(simPi, "Pi_pred") %>%
add_residuals(simPi, "Pi_resid")
ggplot(grid, aes(Pi, Pi_pred)) +
geom_point() + geom_abline(slope = 1, intercept = 0) +
labs(x = "Observed Values for Pi", y = "Predicted Values for Pi")
ggplot(grid, aes(n, Pi_resid)) +
geom_point() + geom_abline(slope = 0, intercept = 0) +
labs(x = "Number of Alleles", y = "Residuals for Model of Pi")
coding %>%
filter(n >= 6) %>%
ggplot(mapping = aes(x = n, y = ThetaWattNuc)) +
geom_jitter(alpha = 0.5, height = 0) +
labs(x = "Number of Alleles", y = "Nucleotide Diversity (Pi)")
simThetaWattNuc <- lm(ThetaWattNuc ~ n, data = coding)
summary(simThetaWattNuc)
##
## Call:
## lm(formula = ThetaWattNuc ~ n, data = coding)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.490 -3.285 0.313 3.558 14.409
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115.78986 1.05137 110.13 <2e-16 ***
## n 0.56673 0.05073 11.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.262 on 199 degrees of freedom
## Multiple R-squared: 0.3854, Adjusted R-squared: 0.3823
## F-statistic: 124.8 on 1 and 199 DF, p-value: < 2.2e-16
grid <- coding %>%
add_predictions(simThetaWattNuc, "ThetaWattNuc_pred") %>%
add_residuals(simThetaWattNuc, "ThetaWattNuc_resid")
ggplot(grid, aes(ThetaWattNuc, ThetaWattNuc_pred)) +
geom_point() + geom_abline(slope = 1, intercept = 0) +
labs(x = "Observed Theta", y = "Predicted Theta")
ggplot(grid, aes(n, ThetaWattNuc_resid)) +
geom_point() + geom_abline(slope = 0, intercept = 0) +
labs(x = "Number of Alleles", y = "Residuals for Model of Theta")