BIOL 458 Final Project Frog

Investigating the relationship between parasite / pathogen (BdZE) loads and frogs body size (SVL).

# Setting up necessary library and importing data from the two studies
library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.1
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(patchwork)
Warning: package 'patchwork' was built under R version 4.5.2
sfsu_frog <- read.csv("SFSU_FrogsDisease_Data.csv")

anhui_frog_1 <- read.csv("265_2025_3654_MOESM2_ESM.csv")

anhui_frog_2 <- read.csv("265_2025_3654_MOESM4_ESM.csv")

anhui_frog_3 <- read.csv("265_2025_3654_MOESM3_ESM.csv")

Check the data structure and format

summary(sfsu_frog)
      ID              FileCall           Location          Call.Type        
 Length:484         Length:484         Length:484         Length:484        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
 Bd.Positive          Water.Temp        BdZE                SVL       
 Length:484         Min.   :10.0   Min.   :     0.00   Min.   :31.20  
 Class :character   1st Qu.:13.0   1st Qu.:    37.73   1st Qu.:33.30  
 Mode  :character   Median :14.0   Median :  7580.19   Median :34.90  
                    Mean   :14.5   Mean   : 42917.30   Mean   :35.18  
                    3rd Qu.:16.0   3rd Qu.: 43239.52   3rd Qu.:36.30  
                    Max.   :22.0   Max.   :549920.00   Max.   :41.10  
      Mass            Call       Call.Duration    InterCall.Interval
 Min.   :2.000   Min.   :1.000   Min.   :0.1438   Min.   :0.3346    
 1st Qu.:3.200   1st Qu.:3.000   1st Qu.:0.2040   1st Qu.:0.7102    
 Median :3.400   Median :5.000   Median :0.2232   Median :0.8834    
 Mean   :3.631   Mean   :4.926   Mean   :0.2259   Mean   :0.9204    
 3rd Qu.:3.900   3rd Qu.:7.000   3rd Qu.:0.2490   3rd Qu.:1.0938    
 Max.   :6.500   Max.   :9.000   Max.   :0.3653   Max.   :1.9883    
 Call.Duty.Cycle    Call.Rate       Call.Effort     Call.Dominant.Frequency
 Min.   :0.0960   Min.   :0.4309   Min.   : 371.3   Min.   :1680           
 1st Qu.:0.1709   1st Qu.:0.7545   1st Qu.: 737.0   1st Qu.:2067           
 Median :0.2032   Median :0.9019   Median : 968.7   Median :2196           
 Mean   :0.2082   Mean   :0.9431   Mean   : 973.7   Mean   :2162           
 3rd Qu.:0.2389   3rd Qu.:1.0604   3rd Qu.:1175.7   3rd Qu.:2282           
 Max.   :0.3454   Max.   :2.0574   Max.   :1975.1   Max.   :2730           
 Total.N.Pulses  P1.Peak.Pulse.Rate
 Min.   :10.00   Min.   :  8.51    
 1st Qu.:15.00   1st Qu.: 85.82    
 Median :17.00   Median : 92.08    
 Mean   :17.25   Mean   : 94.67    
 3rd Qu.:20.00   3rd Qu.:100.03    
 Max.   :24.00   Max.   :152.21    
summary(anhui_frog_1)
    forg_id          SVL           parasite      calling_rates         f0      
 Min.   : 1.0   Min.   :4.440   Min.   : 0.000   Min.   : 51.0   Min.   :2602  
 1st Qu.:19.5   1st Qu.:4.900   1st Qu.: 0.000   1st Qu.:100.0   1st Qu.:3255  
 Median :38.0   Median :5.160   Median : 1.000   Median :144.0   Median :3598  
 Mean   :38.0   Mean   :5.162   Mean   : 1.933   Mean   :156.9   Mean   :3636  
 3rd Qu.:56.5   3rd Qu.:5.455   3rd Qu.: 3.000   3rd Qu.:202.0   3rd Qu.:3942  
 Max.   :75.0   Max.   :5.820   Max.   :13.000   Max.   :382.0   Max.   :5362  
      SPL             Tem              RH         NLP_content    
 Min.   :65.00   Min.   : 9.50   Min.   :83.00   Min.   : 0.900  
 1st Qu.:71.00   1st Qu.:15.75   1st Qu.:88.70   1st Qu.: 4.150  
 Median :73.00   Median :17.00   Median :92.00   Median : 5.900  
 Mean   :72.84   Mean   :16.79   Mean   :91.31   Mean   : 6.629  
 3rd Qu.:75.00   3rd Qu.:18.70   3rd Qu.:93.90   3rd Qu.: 8.450  
 Max.   :79.00   Max.   :21.60   Max.   :98.90   Max.   :16.700  
 call_duration       X          
 Min.   :  82.0   Mode:logical  
 1st Qu.: 312.5   NA's:75       
 Median : 473.0                 
 Mean   : 535.3                 
 3rd Qu.: 708.0                 
 Max.   :1140.0                 
 Title..The.impact.of.parasitic.infection.on.nonlinear.vocalizations.and.mate.choice.in.two.torrent.frogs
 Length:75                                                                                               
 Class :character                                                                                        
 Mode  :character                                                                                        
                                                                                                         
                                                                                                         
                                                                                                         
     X.1           
 Length:75         
 Class :character  
 Mode  :character  
                   
                   
                   

Test if the BdZE is normally distributed using ggpubr library

## install.packages("ggpubr")
library(ggpubr)
Warning: package 'ggpubr' was built under R version 4.5.2
ggdensity(sfsu_frog$BdZE)

ggqqplot(sfsu_frog$BdZE)

sfsu_frog$log_BdZE <- log(sfsu_frog$BdZE)
ggdensity(sfsu_frog$log_BdZE)
Warning: Removed 70 rows containing non-finite outside the scale range
(`stat_density()`).

ggqqplot(sfsu_frog$log_BdZE)
Warning: Removed 70 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 70 rows containing non-finite outside the scale range
(`stat_qq_line()`).
Removed 70 rows containing non-finite outside the scale range
(`stat_qq_line()`).

Testing BdZE distribution using Cumulative distribution function

plot(ecdf(sfsu_frog$BdZE))

As we can see abot Test if parasite count is normally distributed

ggdensity(anhui_frog_1$parasite)

ggqqplot(anhui_frog_1$parasite)

Test the Q-Q distribution using Shapiro-wilk test

shapiro.test(sfsu_frog$BdZE)

    Shapiro-Wilk normality test

data:  sfsu_frog$BdZE
W = 0.46488, p-value < 2.2e-16
hist(sfsu_frog$BdZE)

P-value of the test is < 2.2e-16, which is way less that 0.05, so we reject that the data is normally distributed.

shapiro.test(sfsu_frog$log_BdZE)

    Shapiro-Wilk normality test

data:  sfsu_frog$log_BdZE
W = NaN, p-value = NA
hist(sfsu_frog$log_BdZE)

Lets see if parasite counts is normally distributed.

shapiro.test(anhui_frog_1$parasite)

    Shapiro-Wilk normality test

data:  anhui_frog_1$parasite
W = 0.74822, p-value = 5.111e-10

So the parasite counts in the Anhui study is also not normally distributed.

Since both of the dependent variables we want to investigate are both nonnormal, we need to try other methods. Let’s try bootstrapping.

#install.packages("mosaic")

library(mosaic)
Warning: package 'mosaic' was built under R version 4.5.2
Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: 'mosaic'
The following object is masked from 'package:Matrix':

    mean
The following objects are masked from 'package:dplyr':

    count, do, tally
The following object is masked from 'package:purrr':

    cross
The following object is masked from 'package:ggplot2':

    stat
The following objects are masked from 'package:stats':

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var
The following objects are masked from 'package:base':

    max, mean, min, prod, range, sample, sum
library(ggplot2)

BdZE <- sfsu_frog$BdZE

resample_BdZE <- resample(BdZE)

BdZE_means <- vector()

BdZE_means <- c(BdZE_means, mean(resample(BdZE)))

hist(BdZE_means)

library(boot)

Attaching package: 'boot'
The following object is masked from 'package:mosaic':

    logit
The following object is masked from 'package:lattice':

    melanoma
# function to obtain R-Squared from the data
rsq <- function(formula, data, indices)
{
  d <- data[indices,] # allows boot to select sample
  fit <- lm(formula, data=d)
  return(summary(fit)$r.square)
}
# bootstrapping with 1000 replications
results <- boot(data=sfsu_frog, statistic = rsq,
                R=1000, formula=SVL~BdZE)

# View results
results

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = sfsu_frog, statistic = rsq, R = 1000, formula = SVL ~ 
    BdZE)


Bootstrap Statistics :
       original      bias    std. error
t1* 0.006340548 0.001705556 0.007242887
plot(results)

Lets try Spearman ranks for the SFSU data since it is non-parametric data

corr <- cor.test(x=sfsu_frog$SVL, y=sfsu_frog$BdZE, method = 'spearman')
Warning in cor.test.default(x, y, ...): Cannot compute exact p-value with ties
corr

    Spearman's rank correlation rho

data:  x and y
S = 19980043, p-value = 0.208
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
-0.057337 

Lets visualize it

ggscatter(sfsu_frog, x = "SVL", y = "BdZE",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "spearman",
          xlab = "SVL", ylab = "BdZE")

Let’s see if theres a correlation in the Anhui Species 1 Study

corr2 <- cor.test(x=anhui_frog_1$SVL, y=anhui_frog_1$parasite, method = 'spearman')
Warning in cor.test.default(x, y, ...): Cannot compute exact p-value with ties
corr2

    Spearman's rank correlation rho

data:  x and y
S = 52816, p-value = 0.03143
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.2487081 

Interestingly, it does, Lets visualize it.

ggscatter(anhui_frog_1, x = "SVL", y = "parasite",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "spearman",
          xlab = "SVL", ylab = "parasite count")

Lets see the other species in the Anhui Study

corr3 <- cor.test(x=anhui_frog_2$SVL, y=anhui_frog_2$parasite, method = 'spearman')
Warning in cor.test.default(x, y, ...): Cannot compute exact p-value with ties
corr3

    Spearman's rank correlation rho

data:  x and y
S = 70794, p-value = 0.9523
alternative hypothesis: true rho is not equal to 0
sample estimates:
         rho 
-0.007023257 

Visualize it

ggscatter(anhui_frog_2, x = "SVL", y = "parasite",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "spearman",
          xlab = "SVL", ylab = "parasite count")

plotting the relationship between SVL and BdZe in the sfsu study

ggplot(data = sfsu_frog,
       mapping = aes(x = SVL, y = BdZE)) + 
        geom_point() + geom_smooth(method = "lm") 
`geom_smooth()` using formula = 'y ~ x'

Try to log transform it and then plot to see if they correlate

ggplot(data = sfsu_frog,
       mapping = aes(x = SVL, y = log_BdZE)) +
         geom_point() + geom_smooth(method = "lm") 
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 70 rows containing non-finite outside the scale range
(`stat_smooth()`).

log transforming the SVL as well to see the correlation

sfsu_frog$log_SVL <- log(sfsu_frog$SVL)
ggplot(data = sfsu_frog,
       mapping = aes(x = log_SVL, y = log_BdZE)) +
         geom_point() + geom_smooth(method = "lm") 
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 70 rows containing non-finite outside the scale range
(`stat_smooth()`).

Plotting SVL vs parasite count in the Anhui Study

ggplot(data = anhui_frog_1,
       mapping = aes(x = SVL, y = parasite)) +
         geom_point() + geom_smooth(method = "lm") 
`geom_smooth()` using formula = 'y ~ x'

As shown above, there is a positive correlation between SVL and parasite count in the Anhui Study, meaning in that specific sample, the larger the size of the frog, the more parasites it has.

Plotting the relation between SVL and Call Duration

sfsu1 <- ggplot(data = sfsu_frog,
       mapping = aes(x = SVL, y = Call.Duration)) +
         geom_point() + geom_smooth(method = "lm") 
sfsu1
`geom_smooth()` using formula = 'y ~ x'

As we can see, there is a positive relationship between SVL and Call Duration, meaning larger the frog, longer their call duration in the SFSU sample.

Now, lets see if the same relationship exist in the Anhui study.

anhui1 <- ggplot(data = anhui_frog_1,
       mapping = aes(x = SVL, y = call_duration)) +
         geom_point() + geom_smooth(method = "lm") 

anhui1
`geom_smooth()` using formula = 'y ~ x'

Interestingly, there is not much correlation between SVL and call_duration, a slightly positive one at most. Lets see the other specie they studied.

anhui2 <- ggplot(data = anhui_frog_2,
       mapping = aes(x = SVL, y = call_duration)) +
         geom_point() + geom_smooth(method = "lm") 

anhui2
`geom_smooth()` using formula = 'y ~ x'

For the other species, again, it is a rather flat regression line, however, it slightly trends downward.

Let’s see if SVL correlates with other call attributes.

sfsu2 <- ggplot(data = sfsu_frog,
       mapping = aes(x = SVL, y = Call.Rate)) +
         geom_point() + geom_smooth(method = "lm") 

sfsu3 <- ggplot(data = sfsu_frog,
       mapping = aes(x = SVL, y = Call.Effort)) +
         geom_point() + geom_smooth(method = "lm")

sfsu4 <- ggplot(data = sfsu_frog,
       mapping = aes(x = SVL, y = Call.Duty.Cycle )) +
         geom_point() + geom_smooth(method = "lm")

sfsu5 <- ggplot(data = sfsu_frog,
       mapping = aes(x = SVL, y = InterCall.Interval )) +
         geom_point() + geom_smooth(method = "lm")

sfsu2 + sfsu3 + sfsu4 + sfsu5
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

As we can see, only call duty cycle has an obvious correlation with SVL. Call rates and call effort has a flat or slightly upward regression line, while Intercall Interval has a slightly downward trend line.