A Morphometric Analysis of Starch Granules from Two Dioscorea Species

Author

Sara Rickett

Library

library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.2.3
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ tibble  3.2.1     ✔ dplyr   1.1.4
✔ tidyr   1.2.1     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 1.0.0
✔ purrr   1.0.1     
Warning: package 'tibble' was built under R version 4.2.3
Warning: package 'dplyr' was built under R version 4.2.3
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(e1071)
Warning: package 'e1071' was built under R version 4.2.3

Data

Update to your own file path from downloaded Dryad dataset

#Load data
DA <- read_csv("C:/Users/Sara/OneDrive - University of Utah/MS_Research/Dryad/D_alata.csv", show_col_types = FALSE)

DB <- read_csv("C:/Users/Sara/OneDrive - University of Utah/MS_Research/Dryad/D_bulbifera.csv", show_col_types = FALSE)

Data filter, 20% of granules for analysis sorted by Max Length

# D. alata filtered by length
percentage_to_keep <- 0.2
DA_TOP20L <- DA %>%
  arrange(desc(`Max Length (Microns)`)) %>%
  top_n(n = round(nrow(.) * percentage_to_keep), wt = `Max Length (Microns)`)

view(DA_TOP20L)
# D. bulbifera filtered by length
percentage_to_keep <- 0.2
DB_TOP20L <- DB %>%
  arrange(desc(`Max Length (Microns)`)) %>%
  top_n(n = round(nrow(.) * percentage_to_keep), wt = `Max Length (Microns)`)

Bind for inter-taxa comparison

total_DA_DB <- rbind(DA, DB) # 100% of sample population

top_20_DA_DB <- rbind(DA_TOP20L, DB_TOP20L) # top 20% sorted by length

Ranges of quantitative data for both D. alata and D. bulbifera

range(DA$`Max Length (Microns)`)
[1]  9.534 57.459
range(DA_TOP20L$`Max Length (Microns)`)
[1] 36.351 57.459
range(DA$`Eccentricity Ratio`)
[1] 0.08355523 0.50000000
range(DA_TOP20L$`Eccentricity Ratio`)
[1] 0.08355523 0.32810844
range(DA$`Hilum Angle (Degrees)`)
[1]  38.135 180.000
range(DA_TOP20L$`Hilum Angle (Degrees)`)
[1]  39.766 151.896
range(DA$`Hilum Angle (Degrees)`)
[1]  38.135 180.000
range(DA_TOP20L$`Hilum Angle (Degrees)`)
[1]  39.766 151.896
range(DB$`Max Length (Microns)`)
[1]  6.649 54.422
range(DB_TOP20L$`Max Length (Microns)`)
[1] 37.618 54.422
range(DB$`Eccentricity Ratio`)
[1] 0.05340962 0.41209204
range(DB_TOP20L$`Eccentricity Ratio`)
[1] 0.06225496 0.13425845
range(DB$`Hilum Angle (Degrees)`)
[1]  28.465 135.303
range(DB_TOP20L$`Hilum Angle (Degrees)`)
[1] 28.465 74.664
range(DB$`Hilum Angle (Degrees)`)
[1]  28.465 135.303
range(DB_TOP20L$`Hilum Angle (Degrees)`)
[1] 28.465 74.664

Mean for Eccentricty Ratio and Hilum Angle for both taxa

mean(DA_TOP20L$`Eccentricity Ratio`)
[1] 0.1896401
mean(DA_TOP20L$`Hilum Angle (Degrees)`)
[1] 103.4777
mean(DB_TOP20L$`Eccentricity Ratio`)
[1] 0.09133846
mean(DB_TOP20L$`Hilum Angle (Degrees)`)
[1] 57.5455

Two-sample Kolmogorov-Smirnov tests total population

ks.test(DA$`Max Length (Microns)`, DB$`Max Length (Microns)`)
Warning in ks.test.default(DA$`Max Length (Microns)`, DB$`Max Length
(Microns)`): p-value will be approximate in the presence of ties

    Asymptotic two-sample Kolmogorov-Smirnov test

data:  DA$`Max Length (Microns)` and DB$`Max Length (Microns)`
D = 0.1127, p-value = 0.04456
alternative hypothesis: two-sided
ks.test(DA$`Max Width (Microns)`, DB$`Max Width (Microns)`)
Warning in ks.test.default(DA$`Max Width (Microns)`, DB$`Max Width (Microns)`):
p-value will be approximate in the presence of ties

    Asymptotic two-sample Kolmogorov-Smirnov test

data:  DA$`Max Width (Microns)` and DB$`Max Width (Microns)`
D = 0.23566, p-value = 1.196e-07
alternative hypothesis: two-sided
ks.test(DA$`Hilum Angle (Degrees)` , DB$`Hilum Angle (Degrees)`)
Warning in ks.test.default(DA$`Hilum Angle (Degrees)`, DB$`Hilum Angle
(Degrees)`): p-value will be approximate in the presence of ties

    Asymptotic two-sample Kolmogorov-Smirnov test

data:  DA$`Hilum Angle (Degrees)` and DB$`Hilum Angle (Degrees)`
D = 0.66191, p-value < 2.2e-16
alternative hypothesis: two-sided
ks.test(DA$`Eccentricity Ratio`, DB$`Eccentricity Ratio`)

    Asymptotic two-sample Kolmogorov-Smirnov test

data:  DA$`Eccentricity Ratio` and DB$`Eccentricity Ratio`
D = 0.69976, p-value < 2.2e-16
alternative hypothesis: two-sided

Two-sample Kolmogorov-Smirnov tests largest 20% by length data subset

ks.test(DA_TOP20L$`Max Length (Microns)`, DB_TOP20L$`Max Length (Microns)`)

    Exact two-sample Kolmogorov-Smirnov test

data:  DA_TOP20L$`Max Length (Microns)` and DB_TOP20L$`Max Length (Microns)`
D = 0.18333, p-value = 0.2659
alternative hypothesis: two-sided
ks.test(DA_TOP20L$`Max Width (Microns)`, DB_TOP20L$`Max Width (Microns)`)

    Exact two-sample Kolmogorov-Smirnov test

data:  DA_TOP20L$`Max Width (Microns)` and DB_TOP20L$`Max Width (Microns)`
D = 0.3, p-value = 0.008631
alternative hypothesis: two-sided
ks.test(DA_TOP20L$`Hilum Angle (Degrees)`, DB_TOP20L$`Hilum Angle (Degrees)`)

    Exact two-sample Kolmogorov-Smirnov test

data:  DA_TOP20L$`Hilum Angle (Degrees)` and DB_TOP20L$`Hilum Angle (Degrees)`
D = 0.78333, p-value = 2.62e-14
alternative hypothesis: two-sided
ks.test(DA_TOP20L$`Eccentricity Ratio`, DB_TOP20L$`Eccentricity Ratio`)

    Exact two-sample Kolmogorov-Smirnov test

data:  DA_TOP20L$`Eccentricity Ratio` and DB_TOP20L$`Eccentricity Ratio`
D = 0.93333, p-value = 2.62e-14
alternative hypothesis: two-sided

Frequency analysis and figures

D. alata 20 % frequency subsets

Frequencies of lamellae, longitudinal fissures, granule curves, and ordinal shape (Triangular and Oval).

# Subset data
T20_Oval_DA <- subset(DA_TOP20L, `Oval (0=absent, 1=present)` == 1)


# Calculate the relative frequency of "Oval"
T20_RF_Oval_DA <- nrow(T20_Oval_DA) / nrow(DA_TOP20L)

#Subset Triangular granules
T20_Tri_DA <- subset(DA_TOP20L, `Triangular (0=absent, 1=present)` == 1)

#RF of Tra
T20_RF_Tri_DA <- nrow(T20_Tri_DA) / nrow(DA_TOP20L)


T20_Lon_DA <- subset(DA_TOP20L,  `Longitudinal Fissures (0=absent, 1=present)` == 1)

#Relative Frequency
T20_RF_Lon_DA <- nrow(T20_Lon_DA) / nrow(DA_TOP20L)

#Subset data
T20_Lam_DA <- subset(DA_TOP20L, `Lamellae (0=non-lamellated, 1=lamellated)` == 1)

#Relative Frequency
T20_RF_Lam_DA <- nrow(T20_Lam_DA) / nrow(DA_TOP20L)

#subset data
T20_Ben_DA <- subset(DA_TOP20L, `Granule curve (0=absent, 1=present)` == 1)
#Relative Frequency
T20_RF_Ben_DA <- nrow(T20_Ben_DA)/nrow(DA_TOP20L)

T20_RF_DA <- rbind(c(T20_RF_Lam_DA, T20_RF_Lon_DA, T20_RF_Ben_DA, T20_RF_Tri_DA, T20_RF_Oval_DA)) #order correct 2/1/25

D. alata total population frequency data subsets

# Subset data
Tot_Oval_DA <- subset(DA, `Oval (0=absent, 1=present)` == 1)

# Calculate the relative frequency of "Oval"
Tot_RF_Oval_DA <- nrow(Tot_Oval_DA) / nrow(DA)

#Subset triangular shape
Tot_Tri_DA <- subset(DA, `Triangular (0=absent, 1=present)` == 1)

#RF of Tri
Tot_RF_Tri_DA <- nrow(Tot_Tri_DA) / nrow(DA)

#Subset data
Lon_Tot_DA <- subset(DA,  `Longitudinal Fissures (0=absent, 1=present)` == 1)

#Relative Frequency
Tot_RF_Lon_DA <- nrow(Lon_Tot_DA) / nrow(DA)

#Subset data
Lam_Tot_DA <- subset(DA, `Lamellae (0=non-lamellated, 1=lamellated)` == 1)

#Relative Frequency
Tot_RF_Lam_DA <- nrow(Lam_Tot_DA) / nrow(DA)

#subset data
Tot_Ben_DA <- subset(DA, `Granule curve (0=absent, 1=present)` == 1)

#Relative Frequency
Tot_RF_Ben_DA <- nrow(Tot_Ben_DA)/nrow(DA)

print(Tot_RF_Ben_DA)
[1] 0.06375839
# Create a dataframe for plotting
Tot.Freq.Plot.DA <- data.frame(
  Category = c('Lamellae', "Longitudinal Fissures",'Curved', 'Triangular', 'Oval'),
  RF_Total_DA = c(Tot_RF_Lam_DA, Tot_RF_Lon_DA, Tot_RF_Ben_DA, Tot_RF_Tri_DA, Tot_RF_Oval_DA)
)
RF_Total_DA <- c(Tot_RF_Lam_DA, Tot_RF_Lon_DA, Tot_RF_Ben_DA, Tot_RF_Tri_DA, Tot_RF_Oval_DA)

D. alata total population frequency figure

Frequencies of lamellae, longitudinal fissures, granule curves, and ordinal shape (Triangular vs Oval).

Tot.Freq.Plot.DA$Category <- factor(Tot.Freq.Plot.DA$Category, 
                                    levels = c('Lamellae', 'Longitudinal Fissures', 'Curved', 'Triangular', 'Oval'))  


ggplot(Tot.Freq.Plot.DA, aes(x = Category, y = RF_Total_DA, fill = Category)) +
  geom_bar(stat = "identity") +
  labs(title = "D. alata", x = "", y = "") +
  theme_minimal(base_size = 20) +
  theme(plot.title = element_text(face = "italic")) +
  ylim(0.00, 1.00)+
  theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))

D. alata largest 20% by length Frequency plot

Frequencies of lamellae, longitudinal fissures, granule curves, and ordinal shape (Triangular vs Oval).

# Create a dataframe for plotting
T20.Freq.Plot.DA <- data.frame(
  Category = c('Lamellae', "Longitudunal Fissures",'Curved', 'Triangular', 'Oval'),
  T20_RF_DA = c(T20_RF_Lam_DA, T20_RF_Lon_DA, T20_RF_Ben_DA, T20_RF_Tri_DA, T20_RF_Oval_DA)
)


T20_RF_DA <- rbind(c(T20_RF_Lam_DA, T20_RF_Lon_DA, T20_RF_Ben_DA, T20_RF_Tri_DA, T20_RF_Oval_DA)) #order correct 2/1/25


T20.Freq.Plot.DA$Category <- factor(Tot.Freq.Plot.DA$Category, 
                                    levels = c('Lamellae', 'Longitudinal Fissures', 'Curved', 'Triangular', 'Oval'))  


ggplot(T20.Freq.Plot.DA, aes(x = Category, y = T20_RF_DA, fill = Category)) +
  geom_bar(stat = "identity") +
  labs(title = "D. alata", x = "", y = "") +
  theme_minimal(base_size = 20) +
  theme(plot.title = element_text(face = "italic")) +
  ylim(0.00, 1.00)+
  theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))

D. bulbifera total population frequency data subsets

# Subset data

Tot_Oval_DB <- subset(DB, `Oval (0=absent, 1=present)` == 1)

# Calculate the relative frequency of "Oval"
Tot_RF_Oval_DB <- nrow(Tot_Oval_DB) / nrow(DB)

#Subset Transverse fissures
Tot_Tri_DB <- subset(DB, `Triangular (0=absent, 1=present)` == 1)

#RF of Triangular
Tot_RF_Tri_DB <- nrow(Tot_Tri_DB) / nrow(DB)

#Subset data
Tot_LF_DB <- subset(DB, `Longitudinal Fissures (0=absent, 1=present)` == 1)

#Relative Frequency
Tot_RF_LF_DB <- nrow(Tot_LF_DB) / nrow(DB)

#Subset data
Tot_Lam_DB <- subset(DB, `Lamellae (0=non-lamellated, 1=lamellated)` == 1)

#Relative Frequency
Tot_RF_Lam_DB <- nrow(Tot_Lam_DB) / nrow(DB)

#subset data
Tot_Ben_DB <- subset(DB, `Granule curve (0=absent, 1=present)` == 1)

#Relative Frequency
Tot_RF_Ben_DB <- nrow(Tot_Ben_DB)/nrow(DB)

# Create a dataframe for plotting
Total.Freq.Plot.DB <- data.frame(
  Category = c('Lamellae', "Longitudunal Fissures",'Curved', 'Triangular', 'Oval'),
  RF_Total_DB = c(Tot_RF_Lam_DB, Tot_RF_LF_DB, Tot_RF_Ben_DB, Tot_RF_Tri_DB, Tot_RF_Oval_DB)
)

RF_Total_DB <- c(Tot_RF_Lam_DB, Tot_RF_LF_DB, Tot_RF_Ben_DB, Tot_RF_Tri_DB, Tot_RF_Oval_DB)

print(RF_Total_DB)
[1] 0.90365449 0.66112957 0.37541528 0.96677741 0.02990033

D. bulbifera total population frequency figure

Frequencies of lamellae, longitudinal fissures, granule curves, and ordinal shape (Triangular vs Oval).

Total.Freq.Plot.DB$Category <- factor(Total.Freq.Plot.DB$Category, 
                                    levels = c('Lamellae', 'Longitudunal Fissures', 'Curved', 'Triangular', 'Oval'))  

ggplot(Total.Freq.Plot.DB, aes(x = Category, y = RF_Total_DB, fill = Category)) +
  geom_bar(stat = "identity") +
  labs(title = "D. bulbifera", x = "", y = "") +
  theme_minimal(base_size = 20) +
  theme(plot.title = element_text(face = "italic")) +
  ylim(0.00, 1.00)+
  theme(legend.position = "none")+
  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))

D. bulbifera largest 20% frequency data subsets

T20_Oval_DB <- subset(DB_TOP20L, `Oval (0=absent, 1=present)` == 1)


# Calculate the relative frequency of "Oval"
T20_RF_Oval_DB <- nrow(T20_Oval_DB) / nrow(DB_TOP20L)


#Subset Triangular shape
T20_Tri_DB <- subset(DB_TOP20L, `Triangular (0=absent, 1=present)` == 1)

#RF of Tra
T20_RF_Tri_DB <- nrow(T20_Tri_DB) / nrow(DB_TOP20L)

#Subset DBta
T20_LF_DB <- subset(DB_TOP20L,  `Longitudinal Fissures (0=absent, 1=present)` == 1)

#Relative Frequency
T20_RF_LF_DB <- nrow(T20_LF_DB) / nrow(DB_TOP20L)

print(T20_RF_LF_DB)
[1] 0.9333333
#Subset data
T20_Lam_DB <- subset(DB_TOP20L, `Lamellae (0=non-lamellated, 1=lamellated)` == 1)

#Relative Frequency
T20_RF_Lam_DB <- nrow(T20_Lam_DB) / nrow(DB_TOP20L)

#subset data
T20_Ben_DB <- subset(DB_TOP20L, `Granule curve (0=absent, 1=present)` == 1)

#Relative Frequency
T20_RF_Ben_DB <- nrow(T20_Ben_DB)/nrow(DB_TOP20L)

# Create a dataframe for plotting
T20.Freq.Plot.DB <- data.frame(
  Category = c('Lamellae', 'Longitudunal Fissures','Curved', 'Triangular', 'Oval'),
  T20_RF_DB = c(T20_RF_Lam_DB, T20_RF_LF_DB, T20_RF_Ben_DB, T20_RF_Tri_DB, T20_RF_Oval_DB)
)

T20_RF_DB <- c(T20_RF_Lam_DB, T20_RF_LF_DB, T20_RF_Ben_DB, T20_RF_Tri_DB, T20_RF_Oval_DB)

print(T20_RF_DB)
[1] 0.95000000 0.93333333 0.53333333 0.98333333 0.01666667

D. bulbifera largest 20% by length frequency figure

Frequencies of lamellae, longitudinal fissures, granule curves, and ordinal shape (Triangular vs Oval).

T20.Freq.Plot.DB$Category <- factor(T20.Freq.Plot.DB$Category, 
                                    levels = c('Lamellae', 'Longitudunal Fissures', 'Curved', 'Triangular', 'Oval'))  

ggplot(T20.Freq.Plot.DB, aes(x = Category, y = T20_RF_DB, fill = Category)) +
  geom_bar(stat = "identity") +
  labs(title = "D. bulbifera", x = "", y = "") +
  theme_minimal(base_size = 20) +
  theme(plot.title = element_text(face = "italic")) +
  ylim(0.00, 1.00)+
  theme(legend.position = "none")+
  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))

D. alata frequency histogram for 20% subset and 100% population

# Create a dataframe with proper structure
Comp.DA <- data.frame(
  Category = rep(c('Lamellae', 'Longitudunal Fissures', 'Curved', 'Triangular', 'Oval'), 2), 
  Group = rep(c("T20_RF_DA", "RF_Total_DA"), each = 5),  # Define group labels properly
  Value = c(T20_RF_DA, RF_Total_DA)  # combine datasets
)

Comp.DA$Category <- factor(Comp.DA$Category, 
levels = c('Lamellae', 'Longitudunal Fissures', 'Curved', 'Triangular', 'Oval'))


# Create the overlay plot
ggplot(Comp.DA, aes(x = Category, y = Value, fill = Group)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.7) +  
  labs(title = "D. alata", x = "", y = "Frequency", ) +
  theme_minimal(base_size = 20) +
  theme(plot.title = element_text(face = "italic")) +
  ylim(0.00, 1.00) +
  scale_fill_manual(
    values = c("T20_RF_DA" = "blue", "RF_Total_DA" = "red"))+
   theme(legend.position = "top")+
  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))

D. bulbifera frequency histogram for 20% subset and 100% population

# Create a dataframe with proper structure
Comp.DB <- data.frame(
  Category = rep(c('Lamellae', 'Longitudunal Fissures', 'Curved', 'Triangular', 'Oval'), 2), 
  Group = rep(c("T20_RF_DB", "RF_Total_DB"), each = 5),  # Define group labels properly
  Value = c(T20_RF_DB, RF_Total_DB)  # Ensure both datasets are combined properly
)

Comp.DB$Category <- factor(Comp.DB$Category, 
levels = c('Lamellae', 'Longitudunal Fissures', 'Curved', 'Triangular', 'Oval'))


# Create the overlay plot
ggplot(Comp.DB, aes(x = Category, y = Value, fill = Group)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.7) +  # Adjust alpha for transparency
  labs(title = "D. bulbifera", x = "", y = "Frequency") +
  theme_minimal(base_size = 20) +
  theme(plot.title = element_text(face = "italic")) +
  ylim(0.00, 1.00) +
  scale_fill_manual(values = c("T20_RF_DB" = "blue", "RF_Total_DB" = "red")) +  # Custom colors
  theme(legend.position = "top")+  # Move legend to top for clarity
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))

Box plots for quantitative data

Top 20% length, both taxa

ggplot(top_20_DA_DB, aes(x = Species, y = `Max Length (Microns)`)) +
  geom_boxplot(outlier.shape = NA) +
  labs(color = "Species")+
  geom_jitter(width = 0.1, shape = 1)+
  scale_y_continuous(name = "Max Length (Microns)", limits = c(0, 70)) +
  theme_minimal(base_size = 16)+
  scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))

Total population length, both taxa

ggplot(total_DA_DB, aes(x = Species, y = `Max Length (Microns)`, width = 0.5)) +
  geom_boxplot(outlier.shape = NA) +
  labs(color = "Species")+
  geom_jitter(width = 0.1, shape = 1)+
  scale_y_continuous(name = "Max Length (Microns)", limits = c(0, 70)) +
  theme_minimal(base_size = 16)+
  scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))

Top 20% width, both taxa

ggplot(top_20_DA_DB, aes(x = Species, y = `Max Width (Microns)`)) +
  geom_boxplot(outlier.shape = NA) +
  labs(color = "Species")+
  geom_jitter(width = 0.1, shape = 1)+
  scale_y_continuous(name = "Max Width (Microns)", limits = c(0, 50)) +
  theme_minimal(base_size = 16)+
  scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))

Total population width, both taxa

ggplot(total_DA_DB, aes(x = Species, y = `Max Width (Microns)`)) +
  geom_boxplot(outlier.shape = NA) +
  labs(color = "Species")+
  geom_jitter(width = 0.1, shape = 1)+
   scale_y_continuous(name = "Max Width (Microns)", limits = c(0, 50)) +
  theme_minimal(base_size = 16)+
  scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera"))) 

Top 20% width, both taxa

ggplot(top_20_DA_DB, aes(x = Species, y = `Max Width (Microns)`)) +
  geom_boxplot(outlier.shape = NA) +
  labs(color = "Species")+
  geom_jitter(width = 0.1, shape = 1)+
  theme_minimal(base_size = 16)+
  scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera")))

Top 20% hilum angle, both taxa

  ggplot(top_20_DA_DB, aes(x = Species, y = `Hilum Angle (Degrees)`)) +
  geom_boxplot(outlier.shape = NA) + # Boxplot without outliers
  geom_jitter(width = 0.1, shape = 1) + # Add jitter with species-based colors
  labs(y = "Hilum Angle (Degrees)", color = "Species") + # Add y-axis label and legend title
  scale_y_continuous(name = "Hilum Angle (Degrees)", limits = c(0, 200)) +
  theme_minimal(base_size = 16) + # Minimal theme
  scale_x_discrete(labels = c(expression(italic("D. alata")), expression(italic("D. bulbifera")))) # Format x-axis labels

Total population hilum angle, both taxa

ggplot(total_DA_DB, aes(x = Species, y = `Hilum Angle (Degrees)`)) +
  geom_boxplot(outlier.shape = NA) +
  labs(color = "Species")+
  scale_y_continuous(name = "Hilum Angle (Degrees)", limits = c(0, 200)) +
  geom_jitter(width = 0.1, shape = 1)+
  theme_minimal(base_size = 16)+
  scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera"))) 

Total population eccentricity ratio, both taxa

ggplot(total_DA_DB, aes(x = Species, y = `Eccentricity Ratio`)) +
  geom_boxplot(outlier.shape = NA) +
  labs(color = "Species")+
  scale_y_continuous(limits = c(0.0, 0.7)) +
  geom_jitter(width = 0.1, shape = 1)+
  theme_minimal(base_size = 16)+
  scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera"))) 

Top 20% eccentricity ratio, both taxa

ggplot(top_20_DA_DB, aes(x = Species, y = `Eccentricity Ratio`,)) +
  geom_boxplot(outlier.shape = NA) + # Boxplot with gradient fill
 geom_boxplot(outlier.shape = NA) +
  labs(color = "Species")+
  scale_y_continuous(limits = c(0.0, 0.7)) +
  geom_jitter(width = 0.1, shape = 1)+
  theme_minimal(base_size = 16)+
  scale_x_discrete(labels = expression(italic("D. alata"), italic("D. bulbifera"))) 

Data Normalization and Distribution

Skewness of D. alata data

skewness(DA_TOP20L$`Max Length (Microns)`)
[1] 1.285376
skewness(DA_TOP20L$`Max Width (Microns)`)
[1] 0.1343228
skewness(DA_TOP20L$`Eccentricity Ratio`)
[1] 0.4274028
skewness(DA_TOP20L$`Hilum to Prox (Microns)`)
[1] 1.117609
skewness(DA_TOP20L$`Hilum Angle (Degrees)`)
[1] -0.6546644

D. alata distribution plots

ggplot(data = DA_TOP20L, aes(`Max Length (Microns)`))+
  geom_histogram()+
  ggtitle("D.alata top 20% length distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DA, aes(`Max Length (Microns)`))+
  geom_histogram()+
  ggtitle("D.alata total length distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DA_TOP20L, aes(`Max Width (Microns)`)) + 
  geom_histogram()+ #the most normal distributed metric
  ggtitle("D.alata top 20% width distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DA, aes(`Max Width (Microns)`)) + 
  geom_histogram()+ #the most normal distributed metric
  ggtitle("D.alata total population width distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DA_TOP20L, aes(`Eccentricity Ratio`)) + 
  geom_histogram()+ #Pretty normal
  ggtitle("D.alata top 20% Eccentricity Ratio distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DA_TOP20L, aes(`Hilum Angle (Degrees)`)) + geom_histogram()+
  ggtitle("D.alata top 20% hilum angle distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DA, aes(`Eccentricity Ratio`)) + 
  geom_histogram()+ #Pretty normal
  ggtitle("D.alata total population Eccentricity Ratio distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DA, aes(`Hilum Angle (Degrees)`)) + geom_histogram()+
  ggtitle("D.alata total population hilum angle distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DA, aes(`Max Length (Microns)`, `Max Width (Microns)`)) + 
  geom_point()

  ggtitle("D.alata total population length by width distribution" )
$title
[1] "D.alata total population length by width distribution"

attr(,"class")
[1] "labels"

Normalizing D. alata data subset

DA.Norm = lm(`Max Length (Microns)` ~ `Max Width (Microns)`, data = DA_TOP20L)

summary(DA.Norm)

Call:
lm(formula = `Max Length (Microns)` ~ `Max Width (Microns)`, 
    data = DA_TOP20L)

Residuals:
   Min     1Q Median     3Q    Max 
-6.297 -3.571 -1.326  2.140 18.291 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)            32.1618     3.3490   9.603 1.36e-13 ***
`Max Width (Microns)`   0.3734     0.1241   3.010  0.00386 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.803 on 58 degrees of freedom
Multiple R-squared:  0.1351,    Adjusted R-squared:  0.1202 
F-statistic:  9.06 on 1 and 58 DF,  p-value: 0.003864
DA_log.model <- lm(log1p(`Max Length (Microns)`) ~ log1p(`Max Width (Microns)`), data = DA_TOP20L)

summary(DA_log.model)

Call:
lm(formula = log1p(`Max Length (Microns)`) ~ log1p(`Max Width (Microns)`), 
    data = DA_TOP20L)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.13745 -0.08373 -0.02694  0.05730  0.37367 

Coefficients:
                             Estimate Std. Error t value Pr(>|t|)    
(Intercept)                   3.10913    0.24357  12.765   <2e-16 ***
log1p(`Max Width (Microns)`)  0.19624    0.07373   2.662     0.01 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1065 on 58 degrees of freedom
Multiple R-squared:  0.1088,    Adjusted R-squared:  0.09348 
F-statistic: 7.084 on 1 and 58 DF,  p-value: 0.01004

D. bulbifera skewness analysis

skewness(DB_TOP20L$`Max Length (Microns)`)
[1] 0.7162827
skewness(DB_TOP20L$`Max Width (Microns)`) #hecken skewed
[1] -0.8622865
skewness(DB_TOP20L$`Eccentricity Ratio`) #lowest skew
[1] 0.3724668
skewness(DB_TOP20L$`Hilum to Prox (Microns)`)
[1] 0.8306417
skewness(DB_TOP20L$`Hilum Angle (Degrees)`) #second lowest skew
[1] -0.597649

Normalize data DB

DB.Norm = lm(`Max Length (Microns)` ~ `Eccentricity Ratio`, data = DB_TOP20L)

summary(DB.Norm)

Call:
lm(formula = `Max Length (Microns)` ~ `Eccentricity Ratio`, data = DB_TOP20L)

Residuals:
   Min     1Q Median     3Q    Max 
-5.326 -3.542 -0.992  2.542 11.327 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)            45.364      3.209  14.138   <2e-16 ***
`Eccentricity Ratio`  -27.649     34.636  -0.798    0.428    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.157 on 58 degrees of freedom
Multiple R-squared:  0.01087,   Adjusted R-squared:  -0.006186 
F-statistic: 0.6373 on 1 and 58 DF,  p-value: 0.428
DB_log.model = lm(log1p(`Max Length (Microns)`) ~ log1p(`Eccentricity Ratio`), data = DB_TOP20L)

summary(DB_log.model)

Call:
lm(formula = log1p(`Max Length (Microns)`) ~ log1p(`Eccentricity Ratio`), 
    data = DB_TOP20L)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.12505 -0.08038 -0.01841  0.06089  0.23250 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  3.84043    0.07437  51.639   <2e-16 ***
log1p(`Eccentricity Ratio`) -0.73476    0.84086  -0.874    0.386    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.09224 on 58 degrees of freedom
Multiple R-squared:  0.01299,   Adjusted R-squared:  -0.004023 
F-statistic: 0.7636 on 1 and 58 DF,  p-value: 0.3858

D. bulbifera distribution plots

ggplot(data = DB_TOP20L, aes(`Max Length (Microns)`))+
  geom_histogram()+
  ggtitle("D.bulbifera top 20% length distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DB, aes(`Max Length (Microns)`))+
  geom_histogram()+
  ggtitle("D.bulbifera total length distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DB_TOP20L, aes(`Max Width (Microns)`)) + 
  geom_histogram()+ #the most normal distributed metric
  ggtitle("D.bulbifera top 20% width distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DB, aes(`Max Width (Microns)`)) + 
  geom_histogram()+ #the most normal distributed metric
  ggtitle("D.bulbifera total population width distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DB_TOP20L, aes(`Eccentricity Ratio`)) + 
  geom_histogram()+ #Pretty normal
  ggtitle("D.bulbifera top 20% Eccentricity Ratio distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DB, aes(`Eccentricity Ratio`)) + 
  geom_histogram()+ #Pretty normal
  ggtitle("D.bulbifera total population Eccentricity Ratio distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DB_TOP20L, aes(`Hilum Angle (Degrees)`)) + geom_histogram()+
  ggtitle("D.bulbifera top 20% hilum angle distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DB, aes(`Hilum Angle (Degrees)`)) + geom_histogram()+
  ggtitle("D.bulbifera total population hilum angle distribution" )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = DB, aes(`Max Length (Microns)`, `Max Width (Microns)`)) + 
  geom_point()

  ggtitle("D.bulbifera total population length by width distribution" )
$title
[1] "D.bulbifera total population length by width distribution"

attr(,"class")
[1] "labels"