Library
Warning: package 'ggplot2' was built under R version 4.2.3
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ tibble 3.2.1 ✔ dplyr 1.1.4
✔ tidyr 1.2.1 ✔ stringr 1.5.0
✔ readr 2.1.3 ✔ forcats 1.0.0
✔ purrr 1.0.1
Warning: package 'tibble' was built under R version 4.2.3
Warning: package 'dplyr' was built under R version 4.2.3
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Warning: package 'e1071' was built under R version 4.2.3
Data
Update to your own file path from downloaded Dryad dataset
#Load data
DA <- read_csv("C:/Users/Sara/OneDrive - University of Utah/MS_Research/Dryad/D_alata.csv", show_col_types = FALSE)
DB <- read_csv("C:/Users/Sara/OneDrive - University of Utah/MS_Research/Dryad/D_bulbifera.csv", show_col_types = FALSE)
Data filter, 20% of granules for analysis sorted by Max Length
# D. alata filtered by length
percentage_to_keep <- 0.2
DA_TOP20L <- DA %>%
arrange(desc(`Max Length (Microns)`)) %>%
top_n(n = round(nrow(.) * percentage_to_keep), wt = `Max Length (Microns)`)
view(DA_TOP20L)
# D. bulbifera filtered by length
percentage_to_keep <- 0.2
DB_TOP20L <- DB %>%
arrange(desc(`Max Length (Microns)`)) %>%
top_n(n = round(nrow(.) * percentage_to_keep), wt = `Max Length (Microns)`)
Bind for inter-taxa comparison
total_DA_DB <- rbind(DA, DB) # 100% of sample population
top_20_DA_DB <- rbind(DA_TOP20L, DB_TOP20L) # top 20% sorted by length
Ranges of quantitative data for both D. alata and D. bulbifera
range(DA$`Max Length (Microns)`)
range(DA_TOP20L$`Max Length (Microns)`)
range(DA$`Eccentricity Ratio`)
[1] 0.08355523 0.50000000
range(DA_TOP20L$`Eccentricity Ratio`)
[1] 0.08355523 0.32810844
range(DA$`Hilum Angle (Degrees)`)
range(DA_TOP20L$`Hilum Angle (Degrees)`)
range(DA$`Hilum Angle (Degrees)`)
range(DA_TOP20L$`Hilum Angle (Degrees)`)
range(DB$`Max Length (Microns)`)
range(DB_TOP20L$`Max Length (Microns)`)
range(DB$`Eccentricity Ratio`)
[1] 0.05340962 0.41209204
range(DB_TOP20L$`Eccentricity Ratio`)
[1] 0.06225496 0.13425845
range(DB$`Hilum Angle (Degrees)`)
range(DB_TOP20L$`Hilum Angle (Degrees)`)
range(DB$`Hilum Angle (Degrees)`)
range(DB_TOP20L$`Hilum Angle (Degrees)`)
Mean for Eccentricty Ratio and Hilum Angle for both taxa
mean(DA_TOP20L$`Eccentricity Ratio`)
mean(DA_TOP20L$`Hilum Angle (Degrees)`)
mean(DB_TOP20L$`Eccentricity Ratio`)
mean(DB_TOP20L$`Hilum Angle (Degrees)`)
Two-sample Kolmogorov-Smirnov tests total population
ks.test(DA$`Max Length (Microns)`, DB$`Max Length (Microns)`)
Warning in ks.test.default(DA$`Max Length (Microns)`, DB$`Max Length
(Microns)`): p-value will be approximate in the presence of ties
Asymptotic two-sample Kolmogorov-Smirnov test
data: DA$`Max Length (Microns)` and DB$`Max Length (Microns)`
D = 0.1127, p-value = 0.04456
alternative hypothesis: two-sided
ks.test(DA$`Max Width (Microns)`, DB$`Max Width (Microns)`)
Warning in ks.test.default(DA$`Max Width (Microns)`, DB$`Max Width (Microns)`):
p-value will be approximate in the presence of ties
Asymptotic two-sample Kolmogorov-Smirnov test
data: DA$`Max Width (Microns)` and DB$`Max Width (Microns)`
D = 0.23566, p-value = 1.196e-07
alternative hypothesis: two-sided
ks.test(DA$`Hilum Angle (Degrees)` , DB$`Hilum Angle (Degrees)`)
Warning in ks.test.default(DA$`Hilum Angle (Degrees)`, DB$`Hilum Angle
(Degrees)`): p-value will be approximate in the presence of ties
Asymptotic two-sample Kolmogorov-Smirnov test
data: DA$`Hilum Angle (Degrees)` and DB$`Hilum Angle (Degrees)`
D = 0.66191, p-value < 2.2e-16
alternative hypothesis: two-sided
ks.test(DA$`Eccentricity Ratio`, DB$`Eccentricity Ratio`)
Asymptotic two-sample Kolmogorov-Smirnov test
data: DA$`Eccentricity Ratio` and DB$`Eccentricity Ratio`
D = 0.69976, p-value < 2.2e-16
alternative hypothesis: two-sided
Two-sample Kolmogorov-Smirnov tests largest 20% by length data subset
ks.test(DA_TOP20L$`Max Length (Microns)`, DB_TOP20L$`Max Length (Microns)`)
Exact two-sample Kolmogorov-Smirnov test
data: DA_TOP20L$`Max Length (Microns)` and DB_TOP20L$`Max Length (Microns)`
D = 0.18333, p-value = 0.2659
alternative hypothesis: two-sided
ks.test(DA_TOP20L$`Max Width (Microns)`, DB_TOP20L$`Max Width (Microns)`)
Exact two-sample Kolmogorov-Smirnov test
data: DA_TOP20L$`Max Width (Microns)` and DB_TOP20L$`Max Width (Microns)`
D = 0.3, p-value = 0.008631
alternative hypothesis: two-sided
ks.test(DA_TOP20L$`Hilum Angle (Degrees)`, DB_TOP20L$`Hilum Angle (Degrees)`)
Exact two-sample Kolmogorov-Smirnov test
data: DA_TOP20L$`Hilum Angle (Degrees)` and DB_TOP20L$`Hilum Angle (Degrees)`
D = 0.78333, p-value = 2.62e-14
alternative hypothesis: two-sided
ks.test(DA_TOP20L$`Eccentricity Ratio`, DB_TOP20L$`Eccentricity Ratio`)
Exact two-sample Kolmogorov-Smirnov test
data: DA_TOP20L$`Eccentricity Ratio` and DB_TOP20L$`Eccentricity Ratio`
D = 0.93333, p-value = 2.62e-14
alternative hypothesis: two-sided
Box plots for quantitative data
Top 20% length, both taxa
ggplot(top_20_DA_DB, aes(x = Species, y = `Max Length (Microns)`)) +
geom_boxplot(outlier.shape = NA) +
labs(color = "Species")+
geom_jitter(width = 0.1, shape = 1)+
scale_y_continuous(name = "Max Length (Microns)", limits = c(0, 70)) +
theme_minimal(base_size = 16)+
scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))
Total population length, both taxa
ggplot(total_DA_DB, aes(x = Species, y = `Max Length (Microns)`, width = 0.5)) +
geom_boxplot(outlier.shape = NA) +
labs(color = "Species")+
geom_jitter(width = 0.1, shape = 1)+
scale_y_continuous(name = "Max Length (Microns)", limits = c(0, 70)) +
theme_minimal(base_size = 16)+
scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))
Top 20% width, both taxa
ggplot(top_20_DA_DB, aes(x = Species, y = `Max Width (Microns)`)) +
geom_boxplot(outlier.shape = NA) +
labs(color = "Species")+
geom_jitter(width = 0.1, shape = 1)+
scale_y_continuous(name = "Max Width (Microns)", limits = c(0, 50)) +
theme_minimal(base_size = 16)+
scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))
Total population width, both taxa
ggplot(total_DA_DB, aes(x = Species, y = `Max Width (Microns)`)) +
geom_boxplot(outlier.shape = NA) +
labs(color = "Species")+
geom_jitter(width = 0.1, shape = 1)+
scale_y_continuous(name = "Max Width (Microns)", limits = c(0, 50)) +
theme_minimal(base_size = 16)+
scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))
Top 20% width, both taxa
ggplot(top_20_DA_DB, aes(x = Species, y = `Max Width (Microns)`)) +
geom_boxplot(outlier.shape = NA) +
labs(color = "Species")+
geom_jitter(width = 0.1, shape = 1)+
theme_minimal(base_size = 16)+
scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))
Top 20% hilum angle, both taxa
ggplot(top_20_DA_DB, aes(x = Species, y = `Hilum Angle (Degrees)`)) +
geom_boxplot(outlier.shape = NA) + # Boxplot without outliers
geom_jitter(width = 0.1, shape = 1) + # Add jitter with species-based colors
labs(y = "Hilum Angle (Degrees)", color = "Species") + # Add y-axis label and legend title
scale_y_continuous(name = "Hilum Angle (Degrees)", limits = c(0, 200)) +
theme_minimal(base_size = 16) + # Minimal theme
scale_x_discrete(labels = c(expression(italic("D. alata")), expression(italic("D. bulbifera")))) # Format x-axis labels
Total population hilum angle, both taxa
ggplot(total_DA_DB, aes(x = Species, y = `Hilum Angle (Degrees)`)) +
geom_boxplot(outlier.shape = NA) +
labs(color = "Species")+
scale_y_continuous(name = "Hilum Angle (Degrees)", limits = c(0, 200)) +
geom_jitter(width = 0.1, shape = 1)+
theme_minimal(base_size = 16)+
scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))
Total population eccentricity ratio, both taxa
ggplot(total_DA_DB, aes(x = Species, y = `Eccentricity Ratio`)) +
geom_boxplot(outlier.shape = NA) +
labs(color = "Species")+
scale_y_continuous(limits = c(0.0, 0.7)) +
geom_jitter(width = 0.1, shape = 1)+
theme_minimal(base_size = 16)+
scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))
Top 20% eccentricity ratio, both taxa
ggplot(top_20_DA_DB, aes(x = Species, y = `Eccentricity Ratio`,)) +
geom_boxplot(outlier.shape = NA) + # Boxplot with gradient fill
geom_boxplot(outlier.shape = NA) +
labs(color = "Species")+
scale_y_continuous(limits = c(0.0, 0.7)) +
geom_jitter(width = 0.1, shape = 1)+
theme_minimal(base_size = 16)+
scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))
Data Normalization and Distribution
Skewness of D. alata data
skewness(DA_TOP20L$`Max Length (Microns)`)
skewness(DA_TOP20L$`Max Width (Microns)`)
skewness(DA_TOP20L$`Eccentricity Ratio`)
skewness(DA_TOP20L$`Hilum to Prox (Microns)`)
skewness(DA_TOP20L$`Hilum Angle (Degrees)`)
D. alata distribution plots
ggplot(data = DA_TOP20L, aes(`Max Length (Microns)`))+
geom_histogram()+
ggtitle("D.alata top 20% length distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DA, aes(`Max Length (Microns)`))+
geom_histogram()+
ggtitle("D.alata total length distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DA_TOP20L, aes(`Max Width (Microns)`)) +
geom_histogram()+ #the most normal distributed metric
ggtitle("D.alata top 20% width distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DA, aes(`Max Width (Microns)`)) +
geom_histogram()+ #the most normal distributed metric
ggtitle("D.alata total population width distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DA_TOP20L, aes(`Eccentricity Ratio`)) +
geom_histogram()+ #Pretty normal
ggtitle("D.alata top 20% Eccentricity Ratio distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DA_TOP20L, aes(`Hilum Angle (Degrees)`)) + geom_histogram()+
ggtitle("D.alata top 20% hilum angle distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DA, aes(`Eccentricity Ratio`)) +
geom_histogram()+ #Pretty normal
ggtitle("D.alata total population Eccentricity Ratio distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DA, aes(`Hilum Angle (Degrees)`)) + geom_histogram()+
ggtitle("D.alata total population hilum angle distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DA, aes(`Max Length (Microns)`, `Max Width (Microns)`)) +
geom_point()
ggtitle("D.alata total population length by width distribution" )
$title
[1] "D.alata total population length by width distribution"
attr(,"class")
[1] "labels"
Normalizing D. alata data subset
DA.Norm = lm(`Max Length (Microns)` ~ `Max Width (Microns)`, data = DA_TOP20L)
summary(DA.Norm)
Call:
lm(formula = `Max Length (Microns)` ~ `Max Width (Microns)`,
data = DA_TOP20L)
Residuals:
Min 1Q Median 3Q Max
-6.297 -3.571 -1.326 2.140 18.291
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 32.1618 3.3490 9.603 1.36e-13 ***
`Max Width (Microns)` 0.3734 0.1241 3.010 0.00386 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.803 on 58 degrees of freedom
Multiple R-squared: 0.1351, Adjusted R-squared: 0.1202
F-statistic: 9.06 on 1 and 58 DF, p-value: 0.003864
DA_log.model <- lm(log1p(`Max Length (Microns)`) ~ log1p(`Max Width (Microns)`), data = DA_TOP20L)
summary(DA_log.model)
Call:
lm(formula = log1p(`Max Length (Microns)`) ~ log1p(`Max Width (Microns)`),
data = DA_TOP20L)
Residuals:
Min 1Q Median 3Q Max
-0.13745 -0.08373 -0.02694 0.05730 0.37367
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.10913 0.24357 12.765 <2e-16 ***
log1p(`Max Width (Microns)`) 0.19624 0.07373 2.662 0.01 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1065 on 58 degrees of freedom
Multiple R-squared: 0.1088, Adjusted R-squared: 0.09348
F-statistic: 7.084 on 1 and 58 DF, p-value: 0.01004
D. bulbifera skewness analysis
skewness(DB_TOP20L$`Max Length (Microns)`)
skewness(DB_TOP20L$`Max Width (Microns)`) #hecken skewed
skewness(DB_TOP20L$`Eccentricity Ratio`) #lowest skew
skewness(DB_TOP20L$`Hilum to Prox (Microns)`)
skewness(DB_TOP20L$`Hilum Angle (Degrees)`) #second lowest skew
Normalize data DB
DB.Norm = lm(`Max Length (Microns)` ~ `Eccentricity Ratio`, data = DB_TOP20L)
summary(DB.Norm)
Call:
lm(formula = `Max Length (Microns)` ~ `Eccentricity Ratio`, data = DB_TOP20L)
Residuals:
Min 1Q Median 3Q Max
-5.326 -3.542 -0.992 2.542 11.327
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 45.364 3.209 14.138 <2e-16 ***
`Eccentricity Ratio` -27.649 34.636 -0.798 0.428
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.157 on 58 degrees of freedom
Multiple R-squared: 0.01087, Adjusted R-squared: -0.006186
F-statistic: 0.6373 on 1 and 58 DF, p-value: 0.428
DB_log.model = lm(log1p(`Max Length (Microns)`) ~ log1p(`Eccentricity Ratio`), data = DB_TOP20L)
summary(DB_log.model)
Call:
lm(formula = log1p(`Max Length (Microns)`) ~ log1p(`Eccentricity Ratio`),
data = DB_TOP20L)
Residuals:
Min 1Q Median 3Q Max
-0.12505 -0.08038 -0.01841 0.06089 0.23250
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.84043 0.07437 51.639 <2e-16 ***
log1p(`Eccentricity Ratio`) -0.73476 0.84086 -0.874 0.386
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.09224 on 58 degrees of freedom
Multiple R-squared: 0.01299, Adjusted R-squared: -0.004023
F-statistic: 0.7636 on 1 and 58 DF, p-value: 0.3858
D. bulbifera distribution plots
ggplot(data = DB_TOP20L, aes(`Max Length (Microns)`))+
geom_histogram()+
ggtitle("D.bulbifera top 20% length distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DB, aes(`Max Length (Microns)`))+
geom_histogram()+
ggtitle("D.bulbifera total length distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DB_TOP20L, aes(`Max Width (Microns)`)) +
geom_histogram()+ #the most normal distributed metric
ggtitle("D.bulbifera top 20% width distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DB, aes(`Max Width (Microns)`)) +
geom_histogram()+ #the most normal distributed metric
ggtitle("D.bulbifera total population width distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DB_TOP20L, aes(`Eccentricity Ratio`)) +
geom_histogram()+ #Pretty normal
ggtitle("D.bulbifera top 20% Eccentricity Ratio distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DB, aes(`Eccentricity Ratio`)) +
geom_histogram()+ #Pretty normal
ggtitle("D.bulbifera total population Eccentricity Ratio distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DB_TOP20L, aes(`Hilum Angle (Degrees)`)) + geom_histogram()+
ggtitle("D.bulbifera top 20% hilum angle distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DB, aes(`Hilum Angle (Degrees)`)) + geom_histogram()+
ggtitle("D.bulbifera total population hilum angle distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = DB, aes(`Max Length (Microns)`, `Max Width (Microns)`)) +
geom_point()
ggtitle("D.bulbifera total population length by width distribution" )
$title
[1] "D.bulbifera total population length by width distribution"
attr(,"class")
[1] "labels"