## total abundance was not accurate in original file
#data <- read.csv("data-updated.csv", header=TRUE)
#data$Total_abund <- rowSums(data[,7:11]) 
#write.csv(data, "data-updated-2022.csv", row.names=F)

#data <- read.csv("data-updated-2022.csv", header=T)
data <- read.csv("parasite-RV-final-data.csv", header=T)
data<-data[,1:14]
# Creating tables
# Table 1
library(dplyr)
data %>%
  group_by(Town) %>%
  summarise(Infections=sum(Ranavirus), TotAbund = sum(Total_abund), AvgLoad=mean(RV_viral.load, na.rm=T) ,N=n())
## # A tibble: 9 × 5
##   Town                 Infections TotAbund AvgLoad     N
##   <chr>                     <int>    <int>   <dbl> <int>
## 1 Blumenau                      1      261 22.9        9
## 2 Chapecó                       1       30  5.64       3
## 3 Embu das Artes               10      853  2.98      13
## 4 Iporanga                      0      233  0          4
## 5 Piedade                       0      153  0         10
## 6 Piraquara                     0       52  0.0157     2
## 7 Quatro Barras                 4       35 20.6       10
## 8 São José dos Pinhais          0      218  0          4
## 9 Urubici                       0      295  0         10
# Macroparasite prevalence (# of bullfrogs with any parasites/total#)
data %>%
  group_by(Town) %>%
  summarise(Sum=sum(ifelse(Acantho>0 | Nema>0 | Trema>0 | Cest>0 | Pentastomida_larva>0, 1, 0)), N=n(), Prev = Sum/N)
## # A tibble: 9 × 4
##   Town                   Sum     N  Prev
##   <chr>                <dbl> <int> <dbl>
## 1 Blumenau                 8     9 0.889
## 2 Chapecó                  3     3 1    
## 3 Embu das Artes          12    13 0.923
## 4 Iporanga                 4     4 1    
## 5 Piedade                 10    10 1    
## 6 Piraquara                2     2 1    
## 7 Quatro Barras            4    10 0.4  
## 8 São José dos Pinhais     4     4 1    
## 9 Urubici                  7    10 0.7
table2 <- data %>%
  group_by(Town) %>%
  summarize(Acantho = sum(Acantho), Nema = sum(Nema), Trema = sum(Trema), Cest=sum(Cest), Pentastomida_larva=sum(Pentastomida_larva)) %>%
  select(-Town) %>%
  mutate(group_by(data, Town) %>% summarize(n=n()) %>% select(Town), Rich = rowSums(. > 0)) %>%
  select(Town, Rich, Cest, Acantho, Nema, Pentastomida_larva, Trema, everything())
table2
## # A tibble: 9 × 7
##   Town                  Rich  Cest Acantho  Nema Pentastomida_larva Trema
##   <chr>                <dbl> <int>   <int> <int>              <int> <int>
## 1 Blumenau                 2     0     115   146                  0     0
## 2 Chapecó                  2     0      13    17                  0     0
## 3 Embu das Artes           2     0       1   852                  0     0
## 4 Iporanga                 3     0      44   185                  0     4
## 5 Piedade                  3     0       1   150                  0     2
## 6 Piraquara                2     0       8    44                  0     0
## 7 Quatro Barras            4     7      12    15                  1     0
## 8 São José dos Pinhais     5    30      24   147                  3    14
## 9 Urubici                  2     0       0   292                  0     3

Boxplots - comparing individuals that were positive for Ranavirus to those that were negative (ANOVA p-values listed on plot)

Multiple logistic regression: predicting ranavirus presence in an individual

################################
# multiple logistic regression with Ranavirus presence/absence as response
mod <- glm(Ranavirus ~ Total_abund + Total_rich, family = "binomial", data=data)
# summary(mod) #no signif. terms 
# AIC(mod) #75.208
mod2 <- glm(Ranavirus ~ Length + Acantho + Nema + Trema + Cest + Pentastomida_larva, family = "binomial", data=data)
# summary(mod2) #Length p=0.0443; Nema p=0.0901
# AIC(mod2) #70.833
mod3 <- glm(Ranavirus ~ Length + Total_abund + Total_rich+ Length:Total_rich+ Length:Total_abund, family = "binomial", data=data)
# summary(mod3) # no sig. terms # Total_rich p=0.0586; Length:Total_rich p=0.0631
# AIC(mod3) #68.628


#######################################
#Testing this marginally sig. model
library(caret)
dataClean <- na.omit(data)
# Split the data into training and test set
set.seed(623)
training.samples <- dataClean$Ranavirus %>% 
  createDataPartition(p = 0.7, list = FALSE)
train.data  <- dataClean[training.samples, ]
test.data <- dataClean[-training.samples, ]

mod4 <- glm(Ranavirus ~ Total_rich + Length:Total_rich, family = "binomial", data=data)
# summary(mod4) # Total Rich p=0.0293; rich:length p=0.0330
# AIC(mod4) #70.757

library(MASS)
MOD2 <- glm(Ranavirus ~ Total_rich+ Length:Total_rich, family = "binomial", data=train.data) #AIC 42.81

#predicted1 <- predict(MOD1, test.data, type="response")
predicted2 <- predict(MOD2, test.data, type="response")
#find optimal cutoff probability to use to maximize accuracy
library(InformationValue)
optimal <- optimalCutoff(test.data$Ranavirus, predicted2)[1]

predicted.classes <- as.factor(ifelse(predicted2 > 0.26, "1", "0"))
#create confusion matrix
caret::confusionMatrix(as.factor(test.data$Ranavirus), predicted.classes) #Accuracy : 0.6667; p=0.6315; kappa=0.25; sensitivity=0.75; specificity 0.5; balanced accuracy = 0.6250
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 10  3
##          1  2  3
##                                           
##                Accuracy : 0.7222          
##                  95% CI : (0.4652, 0.9031)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.4122          
##                                           
##                   Kappa : 0.3478          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.8333          
##             Specificity : 0.5000          
##          Pos Pred Value : 0.7692          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.6667          
##          Detection Rate : 0.5556          
##    Detection Prevalence : 0.7222          
##       Balanced Accuracy : 0.6667          
##                                           
##        'Positive' Class : 0               
## 
## Let's plot
##getting predicted response from model
plotting_dfm <- expand.grid(Length = seq(from=0, to = 22, by=0.01),
                           Total_rich = seq(from=0, to = 4, by=1))
plotting_dfm$preds <- plogis(predict(MOD2 , newdata=plotting_dfm))

##plotting the predicted response on the two covariates
library(ggplot2)
library(viridis)
library(paletteer)
pl <- ggplot(plotting_dfm, aes(x=Length, y =preds, color=as.factor(Total_rich)))
pl + 
  geom_point(size=0.75) +
  ggtitle("GLM predicted Ranavirus status") + 
  ggplot2::ylab("Predicted Ranavirus Presence") +
  xlab("Bullfrog Length (cm?)") +
  scale_color_paletteer_d(`"beyonce::X2"`) +
  theme_bw() + 
  labs(color='Macroparasite Richness')

  #scale_color_grey() + 
  #scale_color_viridis(discrete=TRUE, option="viridis")
  #scale_color_brewer(palette = "Paired")

### More accurate/intuitive model
#########################
# Using interact plot
library(jtools)
library(interactions)
library(colorspace)
library(colorblindr)
summ(mod4)
## MODEL INFO:
## Observations: 64 (1 missing obs. deleted)
## Dependent Variable: Ranavirus
## Type: Generalized linear model
##   Family: binomial 
##   Link function: logit 
## 
## MODEL FIT:
## χ²(2) = 7.22, p = 0.03
## Pseudo-R² (Cragg-Uhler) = 0.16
## Pseudo-R² (McFadden) = 0.10
## AIC = 70.76, BIC = 77.23 
## 
## Standard errors: MLE
## ------------------------------------------------------
##                            Est.   S.E.   z val.      p
## ----------------------- ------- ------ -------- ------
## (Intercept)               -0.71   0.50    -1.42   0.16
## Total_rich                -6.26   2.87    -2.18   0.03
## Total_rich:Length          0.35   0.16     2.13   0.03
## ------------------------------------------------------
p2 <- interact_plot(mod4, pred = Total_rich, modx = Length)
p2<- p2 + xlab("Total Macroparasite Richness") + ylab("Ranavirus Presence Probability")
p2<-edit_colors(p2, desaturate)
cowplot::ggdraw(p2)

##################################
# GAM with Ranavirus as response
library(mgcv)
mod <- gam(Ranavirus ~ s(Total_abund) + s(Total_rich, k=4), family = "binomial", data=data)
# summary(mod) #no signif. terms
# AIC(mod) #71.57

mod2 <- gam(Ranavirus ~  s(Length), family = "binomial", data=data)
# summary(mod2) #Length p=0.356
# AIC(mod2) #55.96702

mod3 <- gam(Ranavirus ~ s(Length) + s(Total_abund) + s(Total_rich,k=4) + s(Length, by= Total_rich)+ s(Length, by=Total_abund) + s(Total_rich, k=4, by=Length), family = "binomial", data=data)
# summary(mod3) # no sig. terms
# AIC(mod3) #60.37

mod4 <- gam(Ranavirus ~ s(Total_abund) + s(Length, by=Total_abund), family = "binomial", data=data)
# summary(mod4) # s(Length):Total_abund p=0.0319
# AIC(mod4) #57.21132

# just length 
mod5 <- gam(Ranavirus ~  s(Length), family = "binomial", data=data)
# AIC(mod5) # 55.96702 #p=0.356


##################################
# linear regression with total macroparasite abundance as response variable 
mod00 <- lm(Total_abund ~ Length, data=data) #summary(mod00) #p=0.0359
mod6 <- lm(Total_abund ~ Length + Ranavirus + Length:Ranavirus, data=data)
# summary(mod6) # Length p=0.0119
# GAM
mod6g <- gam(Total_abund ~s(Length) + s(Length, by = Ranavirus), data=data)
# summary(mod6g) # s(Length) p=0.0109; s(Length):Ranavirus p=0.0976

mod7 <- lm(Total_abund ~ Length + RanavirusFac, data=data) # see below where log of total abundance makes a better model (and with Ranavirus interaction term)
#summary(mod7) # Length p=0.0211 with Ranavirus; length p=0.12 without ranavirus

mod8 <- lm(Total_abund ~ Length + RanavirusFac + Length:RanavirusFac, data=data)
#summary(mod8) #no sig other than length

Stepwise model analyses

library(caret)
# Inspect the data
#sample_n(data, 3)
data$Log_Total_abund <- log10(ifelse(data$Total_abund == 0, 1, data$Total_abund))
dataClean <- na.omit(data)
# Split the data into training and test set
set.seed(623)
training.samples <- dataClean$Ranavirus %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- dataClean[training.samples, ]
test.data <- dataClean[-training.samples, ]

# model
library(MASS)
# Fit the model
#model <- glm(Ranavirus ~ Acantho + Nema + Trema + Cest + Pentastomida_larva + Total_abund + Total_rich + Length + Log_Total_abund + Length:Total_rich, data = train.data, family = binomial) %>%
  #stepAIC(trace = FALSE, direction = "both")
# Summarize the final selected model
#summary(model) #AIC 49.07
# Make predictions
#probabilities <- model %>% predict(test.data, type = "response")
#predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
# Model accuracy
#mean(predicted.classes==test.data$Ranavirus)
# 0.66 #predicted all negatives

full.model <- glm(Ranavirus ~ Acantho + Nema + Trema + Cest + Pentastomida_larva + Total_abund + Total_rich + Length + Log_Total_abund + Length:Total_rich, data = train.data, family = binomial)
#summary(full.model)
#coef(full.model) #AIC=50.361 #total_rich and #total_rich:Length are sig.

# stepwise variable selection
library(MASS)
step.model <- full.model %>% stepAIC(trace = FALSE, direction = "both") #AIC 48
#summary(step.model) # Total Rich p=0.04, # Rich:length p=0.0277

# Make predictions for stepwise model
probabilities <- step.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
# Prediction accuracy
observed.classes <- test.data$Ranavirus
#mean(predicted.classes == observed.classes, na.rm = T) #0.6666

## hand selected variables
top.model <- glm(Ranavirus ~ Total_rich + Length:Total_rich, data = train.data, family = binomial) #AIC(top.model) 47.3; AIC 48.58 with log_abund

probabilities <- top.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
# # Prediction accuracy
observed.classes <- test.data$Ranavirus
#mean(predicted.classes == observed.classes)
# not great at predicting presences #0.6666

#### compare performance
# Make predictions for full model
probabilities <- full.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
# Prediction accuracy
observed.classes <- test.data$Ranavirus
#mean(predicted.classes == observed.classes)
#0.6666



###################################
# which is best model for predicting parasite abundance? Length variable alone
allVars <- lm(Log_Total_abund ~ Length + Ranavirus + Length:Ranavirus, data = train.data)

step.model2 <- allVars %>% stepAIC(trace = FALSE, direction="both") #just Length
# summary(step.model2) #Length; p=0.000458

################## using another method
# http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/
set.seed(123)
library(caret)
# Set up repeated k-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train the model
step.model <- train(Log_Total_abund ~ Length + Ranavirus + Length:Ranavirus, data = dataClean,
                    method = "leapBackward", 
                    tuneGrid = data.frame(nvmax = 1:3),
                    trControl = train.control
                    )
# step.model$results
# step.model$bestTune
# summary(step.model$finalModel)

#Length alone is best model

Linear regression: is Bullfrog Length affecting macroparasite abundance (log10 transformed?) and/or richness? Yes

# length and total abundance

# summary(lm(Total_abund ~ Length, data=data)) #p=0.0398
ggplot(data, aes(x=Length, y=Total_abund)) + geom_point() + theme_bw() + annotate("text", x=19, y= 500, label= "p = 0.04") + xlab("Length") + ylab("Total Macroparasite Abundance")

#data$Log_Total_abund <- log10(ifelse(data$Total_abund == 0, 1, data$Total_abund))

#summary(lm(Log_Total_abund~Length, data=data)) #p=3.15e-06

## Adding an interaction with Ranavirus is still a good model (AIC)
mod1<-lm(Log_Total_abund ~ Length + Ranavirus:Length, data=data);AIC(mod1) #AIC=124.35
## [1] 124.3454
summary(mod1) #length p=1.24e-06; Length:Ranavirus p=0.1432; mult. R2 = 0.3219
## 
## Call:
## lm(formula = Log_Total_abund ~ Length + Ranavirus:Length, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4131 -0.3784  0.1315  0.3482  1.3861 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.68877    0.31041  -2.219   0.0302 *  
## Length            0.10834    0.02013   5.381 1.24e-06 ***
## Length:Ranavirus -0.01605    0.01082  -1.483   0.1432    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6151 on 61 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3219, Adjusted R-squared:  0.2997 
## F-statistic: 14.48 on 2 and 61 DF,  p-value: 7.145e-06
# Plot
#devtools::install_github("cardiomoon/ggiraphExtra")
#from here: https://cran.r-project.org/web/packages/ggiraphExtra/vignettes/ggPredict.html
require(ggiraph)
require(ggiraphExtra)
require(plyr)

ggPredict(mod1) 

mod2<-lm(Log_Total_abund ~ Length, data=data); AIC(mod2) #AIC=128.6079
## [1] 124.612
summary(mod2) #p=3.15e-06; mult R2=0.2975
## 
## Call:
## lm(formula = Log_Total_abund ~ Length, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3133 -0.3680  0.0702  0.3083  1.4878 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.62461    0.31034  -2.013   0.0485 *  
## Length       0.09989    0.01950   5.124 3.15e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.621 on 62 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2975, Adjusted R-squared:  0.2861 
## F-statistic: 26.25 on 1 and 62 DF,  p-value: 3.154e-06
mod3<-lm(Log_Total_abund ~ Length:RanavirusFac, data=data); AIC(mod3) #AIC=124.3454
## [1] 124.3454
summary(mod3) #Length:RV0 p=1.24e-06; Length:RV1 p=2.04e-05; overall p=7.145e-06 #mult R2=0.3219
## 
## Call:
## lm(formula = Log_Total_abund ~ Length:RanavirusFac, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4131 -0.3784  0.1315  0.3482  1.3861 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.68877    0.31041  -2.219   0.0302 *  
## Length:RanavirusFac0  0.10834    0.02013   5.381 1.24e-06 ***
## Length:RanavirusFac1  0.09229    0.01998   4.620 2.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6151 on 61 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3219, Adjusted R-squared:  0.2997 
## F-statistic: 14.48 on 2 and 61 DF,  p-value: 7.145e-06
cols<- c("#440154FF","#3CBB75FF")
#bords <- c("black","black")
p<-ggPredict(mod3, se=T, show.summary = T)
## 
## Call:
## lm(formula = Log_Total_abund ~ Length:RanavirusFac, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4131 -0.3784  0.1315  0.3482  1.3861 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.68877    0.31041  -2.219   0.0302 *  
## Length:RanavirusFac0  0.10834    0.02013   5.381 1.24e-06 ***
## Length:RanavirusFac1  0.09229    0.01998   4.620 2.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6151 on 61 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3219, Adjusted R-squared:  0.2997 
## F-statistic: 14.48 on 2 and 61 DF,  p-value: 7.145e-06
p<- p + xlab("Length (cm)") + ylab("Log(Total macroparasite abundance)") + scale_fill_manual(values=cols, name="Ranavirus") + scale_color_manual(values=cols, name="Ranavirus") + theme_bw()

gg_des <- edit_colors(p, desaturate)
cowplot::ggdraw(gg_des)

p<-ggplot(data, aes(x=Length, y=Log_Total_abund)) + geom_point() + theme_bw() + annotate("text", x=7, y= 2.5, label= "p = 3.15E-06") +
  geom_smooth(method="lm")+
  ylab("Log(Total macroparasite abundance)") + xlab("Length (cm)")
gg_des <- edit_colors(p, desaturate)
cowplot::ggdraw(gg_des)

### transformed abundance ANOVA, logreg, lm 
#summary(aov(Log_Total_abund ~ Ranavirus, data=data)) #p=0.325
#summary(lm(Log_Total_abund~Ranavirus, data=data)) #p=0.3251

#summary(lm(Length~Acantho, data=data)) #p=0.4
summary(lm(Length~Nema, data=data)) #p=0.0452
## 
## Call:
## lm(formula = Length ~ Nema, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.336  -1.241   1.181   2.391   4.739 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.086544   0.514735  29.309   <2e-16 ***
## Nema         0.011439   0.005596   2.044   0.0452 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.916 on 62 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.06313,    Adjusted R-squared:  0.04802 
## F-statistic: 4.178 on 1 and 62 DF,  p-value: 0.04521
#summary(lm(Length~Trema, data=data)) #p=0.469
#summary(lm(Length~Cest, data=data)) #p=0.98
#summary(lm(Length~Pentastomida_larva, data=data)) #p=0.7

#summary(lm(Total_rich~Length, data=data)) #p=0.00145
ggplot(data, aes(x=Length, y=Total_rich)) + geom_point() + theme_bw() + annotate("text", x=6.5, y=3.5, label="p = 0.001")

#data$Total_rich_factor<-as.factor(data$Total_rich)
#ggplot(data, aes(x=Total_rich_factor, y=Length)) + geom_boxplot()
#mod<-aov(Length~Total_rich_factor, data=data)
#summary(mod) #p=2.73e-05
#TukeyHSD(mod) ## 0 is driving the differences

Logistic regression: does parasite abundance and richness predict presence/absence of ranavirus? (No) Does Bullfrog length predict presence/absence of ranavirus? (No)

## 
## Call:
## lm(formula = Log_Total_abund ~ Length, data = positives)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0251 -0.2166  0.1349  0.2975  0.4841 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.64680    0.48139  -1.344  0.20045   
## Length       0.08989    0.02841   3.164  0.00689 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4543 on 14 degrees of freedom
## Multiple R-squared:  0.417,  Adjusted R-squared:  0.3753 
## F-statistic: 10.01 on 1 and 14 DF,  p-value: 0.006892
## 
## Call:
## lm(formula = Log_Total_abund ~ Length, data = negatives)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4156 -0.4339  0.1145  0.3884  1.3834 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.70109    0.38020  -1.844   0.0716 .  
## Length       0.10911    0.02443   4.465 5.15e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6624 on 46 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3024, Adjusted R-squared:  0.2872 
## F-statistic: 19.94 on 1 and 46 DF,  p-value: 5.153e-05

Are there relationships at the site level? (No, not when log transformed)

ANOVA analyses at the site level (no significant data)

Logistic regression at the site level (no significant data)

Ranavirus Positive Sites