## total abundance was not accurate in original file
#data <- read.csv("data-updated.csv", header=TRUE)
#data$Total_abund <- rowSums(data[,7:11])
#write.csv(data, "data-updated-2022.csv", row.names=F)
#data <- read.csv("data-updated-2022.csv", header=T)
data <- read.csv("parasite-RV-final-data.csv", header=T)
data<-data[,1:14]
# Creating tables
# Table 1
library(dplyr)
data %>%
group_by(Town) %>%
summarise(Infections=sum(Ranavirus), TotAbund = sum(Total_abund), AvgLoad=mean(RV_viral.load, na.rm=T) ,N=n())
## # A tibble: 9 × 5
## Town Infections TotAbund AvgLoad N
## <chr> <int> <int> <dbl> <int>
## 1 Blumenau 1 261 22.9 9
## 2 Chapecó 1 30 5.64 3
## 3 Embu das Artes 10 853 2.98 13
## 4 Iporanga 0 233 0 4
## 5 Piedade 0 153 0 10
## 6 Piraquara 0 52 0.0157 2
## 7 Quatro Barras 4 35 20.6 10
## 8 São José dos Pinhais 0 218 0 4
## 9 Urubici 0 295 0 10
# Macroparasite prevalence (# of bullfrogs with any parasites/total#)
data %>%
group_by(Town) %>%
summarise(Sum=sum(ifelse(Acantho>0 | Nema>0 | Trema>0 | Cest>0 | Pentastomida_larva>0, 1, 0)), N=n(), Prev = Sum/N)
## # A tibble: 9 × 4
## Town Sum N Prev
## <chr> <dbl> <int> <dbl>
## 1 Blumenau 8 9 0.889
## 2 Chapecó 3 3 1
## 3 Embu das Artes 12 13 0.923
## 4 Iporanga 4 4 1
## 5 Piedade 10 10 1
## 6 Piraquara 2 2 1
## 7 Quatro Barras 4 10 0.4
## 8 São José dos Pinhais 4 4 1
## 9 Urubici 7 10 0.7
table2 <- data %>%
group_by(Town) %>%
summarize(Acantho = sum(Acantho), Nema = sum(Nema), Trema = sum(Trema), Cest=sum(Cest), Pentastomida_larva=sum(Pentastomida_larva)) %>%
select(-Town) %>%
mutate(group_by(data, Town) %>% summarize(n=n()) %>% select(Town), Rich = rowSums(. > 0)) %>%
select(Town, Rich, Cest, Acantho, Nema, Pentastomida_larva, Trema, everything())
table2
## # A tibble: 9 × 7
## Town Rich Cest Acantho Nema Pentastomida_larva Trema
## <chr> <dbl> <int> <int> <int> <int> <int>
## 1 Blumenau 2 0 115 146 0 0
## 2 Chapecó 2 0 13 17 0 0
## 3 Embu das Artes 2 0 1 852 0 0
## 4 Iporanga 3 0 44 185 0 4
## 5 Piedade 3 0 1 150 0 2
## 6 Piraquara 2 0 8 44 0 0
## 7 Quatro Barras 4 7 12 15 1 0
## 8 São José dos Pinhais 5 30 24 147 3 14
## 9 Urubici 2 0 0 292 0 3
Multiple logistic regression: predicting ranavirus presence in an
individual
################################
# multiple logistic regression with Ranavirus presence/absence as response
mod <- glm(Ranavirus ~ Total_abund + Total_rich, family = "binomial", data=data)
# summary(mod) #no signif. terms
# AIC(mod) #75.208
mod2 <- glm(Ranavirus ~ Length + Acantho + Nema + Trema + Cest + Pentastomida_larva, family = "binomial", data=data)
# summary(mod2) #Length p=0.0443; Nema p=0.0901
# AIC(mod2) #70.833
mod3 <- glm(Ranavirus ~ Length + Total_abund + Total_rich+ Length:Total_rich+ Length:Total_abund, family = "binomial", data=data)
# summary(mod3) # no sig. terms # Total_rich p=0.0586; Length:Total_rich p=0.0631
# AIC(mod3) #68.628
#######################################
#Testing this marginally sig. model
library(caret)
dataClean <- na.omit(data)
# Split the data into training and test set
set.seed(623)
training.samples <- dataClean$Ranavirus %>%
createDataPartition(p = 0.7, list = FALSE)
train.data <- dataClean[training.samples, ]
test.data <- dataClean[-training.samples, ]
mod4 <- glm(Ranavirus ~ Total_rich + Length:Total_rich, family = "binomial", data=data)
# summary(mod4) # Total Rich p=0.0293; rich:length p=0.0330
# AIC(mod4) #70.757
library(MASS)
MOD2 <- glm(Ranavirus ~ Total_rich+ Length:Total_rich, family = "binomial", data=train.data) #AIC 42.81
#predicted1 <- predict(MOD1, test.data, type="response")
predicted2 <- predict(MOD2, test.data, type="response")
#find optimal cutoff probability to use to maximize accuracy
library(InformationValue)
optimal <- optimalCutoff(test.data$Ranavirus, predicted2)[1]
predicted.classes <- as.factor(ifelse(predicted2 > 0.26, "1", "0"))
#create confusion matrix
caret::confusionMatrix(as.factor(test.data$Ranavirus), predicted.classes) #Accuracy : 0.6667; p=0.6315; kappa=0.25; sensitivity=0.75; specificity 0.5; balanced accuracy = 0.6250
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10 3
## 1 2 3
##
## Accuracy : 0.7222
## 95% CI : (0.4652, 0.9031)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.4122
##
## Kappa : 0.3478
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.8333
## Specificity : 0.5000
## Pos Pred Value : 0.7692
## Neg Pred Value : 0.6000
## Prevalence : 0.6667
## Detection Rate : 0.5556
## Detection Prevalence : 0.7222
## Balanced Accuracy : 0.6667
##
## 'Positive' Class : 0
##
## Let's plot
##getting predicted response from model
plotting_dfm <- expand.grid(Length = seq(from=0, to = 22, by=0.01),
Total_rich = seq(from=0, to = 4, by=1))
plotting_dfm$preds <- plogis(predict(MOD2 , newdata=plotting_dfm))
##plotting the predicted response on the two covariates
library(ggplot2)
library(viridis)
library(paletteer)
pl <- ggplot(plotting_dfm, aes(x=Length, y =preds, color=as.factor(Total_rich)))
pl +
geom_point(size=0.75) +
ggtitle("GLM predicted Ranavirus status") +
ggplot2::ylab("Predicted Ranavirus Presence") +
xlab("Bullfrog Length (cm?)") +
scale_color_paletteer_d(`"beyonce::X2"`) +
theme_bw() +
labs(color='Macroparasite Richness')

#scale_color_grey() +
#scale_color_viridis(discrete=TRUE, option="viridis")
#scale_color_brewer(palette = "Paired")
### More accurate/intuitive model
#########################
# Using interact plot
library(jtools)
library(interactions)
library(colorspace)
library(colorblindr)
summ(mod4)
## MODEL INFO:
## Observations: 64 (1 missing obs. deleted)
## Dependent Variable: Ranavirus
## Type: Generalized linear model
## Family: binomial
## Link function: logit
##
## MODEL FIT:
## χ²(2) = 7.22, p = 0.03
## Pseudo-R² (Cragg-Uhler) = 0.16
## Pseudo-R² (McFadden) = 0.10
## AIC = 70.76, BIC = 77.23
##
## Standard errors: MLE
## ------------------------------------------------------
## Est. S.E. z val. p
## ----------------------- ------- ------ -------- ------
## (Intercept) -0.71 0.50 -1.42 0.16
## Total_rich -6.26 2.87 -2.18 0.03
## Total_rich:Length 0.35 0.16 2.13 0.03
## ------------------------------------------------------
p2 <- interact_plot(mod4, pred = Total_rich, modx = Length)
p2<- p2 + xlab("Total Macroparasite Richness") + ylab("Ranavirus Presence Probability")
p2<-edit_colors(p2, desaturate)
cowplot::ggdraw(p2)

##################################
# GAM with Ranavirus as response
library(mgcv)
mod <- gam(Ranavirus ~ s(Total_abund) + s(Total_rich, k=4), family = "binomial", data=data)
# summary(mod) #no signif. terms
# AIC(mod) #71.57
mod2 <- gam(Ranavirus ~ s(Length), family = "binomial", data=data)
# summary(mod2) #Length p=0.356
# AIC(mod2) #55.96702
mod3 <- gam(Ranavirus ~ s(Length) + s(Total_abund) + s(Total_rich,k=4) + s(Length, by= Total_rich)+ s(Length, by=Total_abund) + s(Total_rich, k=4, by=Length), family = "binomial", data=data)
# summary(mod3) # no sig. terms
# AIC(mod3) #60.37
mod4 <- gam(Ranavirus ~ s(Total_abund) + s(Length, by=Total_abund), family = "binomial", data=data)
# summary(mod4) # s(Length):Total_abund p=0.0319
# AIC(mod4) #57.21132
# just length
mod5 <- gam(Ranavirus ~ s(Length), family = "binomial", data=data)
# AIC(mod5) # 55.96702 #p=0.356
##################################
# linear regression with total macroparasite abundance as response variable
mod00 <- lm(Total_abund ~ Length, data=data) #summary(mod00) #p=0.0359
mod6 <- lm(Total_abund ~ Length + Ranavirus + Length:Ranavirus, data=data)
# summary(mod6) # Length p=0.0119
# GAM
mod6g <- gam(Total_abund ~s(Length) + s(Length, by = Ranavirus), data=data)
# summary(mod6g) # s(Length) p=0.0109; s(Length):Ranavirus p=0.0976
mod7 <- lm(Total_abund ~ Length + RanavirusFac, data=data) # see below where log of total abundance makes a better model (and with Ranavirus interaction term)
#summary(mod7) # Length p=0.0211 with Ranavirus; length p=0.12 without ranavirus
mod8 <- lm(Total_abund ~ Length + RanavirusFac + Length:RanavirusFac, data=data)
#summary(mod8) #no sig other than length
Stepwise model analyses
library(caret)
# Inspect the data
#sample_n(data, 3)
data$Log_Total_abund <- log10(ifelse(data$Total_abund == 0, 1, data$Total_abund))
dataClean <- na.omit(data)
# Split the data into training and test set
set.seed(623)
training.samples <- dataClean$Ranavirus %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- dataClean[training.samples, ]
test.data <- dataClean[-training.samples, ]
# model
library(MASS)
# Fit the model
#model <- glm(Ranavirus ~ Acantho + Nema + Trema + Cest + Pentastomida_larva + Total_abund + Total_rich + Length + Log_Total_abund + Length:Total_rich, data = train.data, family = binomial) %>%
#stepAIC(trace = FALSE, direction = "both")
# Summarize the final selected model
#summary(model) #AIC 49.07
# Make predictions
#probabilities <- model %>% predict(test.data, type = "response")
#predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
# Model accuracy
#mean(predicted.classes==test.data$Ranavirus)
# 0.66 #predicted all negatives
full.model <- glm(Ranavirus ~ Acantho + Nema + Trema + Cest + Pentastomida_larva + Total_abund + Total_rich + Length + Log_Total_abund + Length:Total_rich, data = train.data, family = binomial)
#summary(full.model)
#coef(full.model) #AIC=50.361 #total_rich and #total_rich:Length are sig.
# stepwise variable selection
library(MASS)
step.model <- full.model %>% stepAIC(trace = FALSE, direction = "both") #AIC 48
#summary(step.model) # Total Rich p=0.04, # Rich:length p=0.0277
# Make predictions for stepwise model
probabilities <- step.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
# Prediction accuracy
observed.classes <- test.data$Ranavirus
#mean(predicted.classes == observed.classes, na.rm = T) #0.6666
## hand selected variables
top.model <- glm(Ranavirus ~ Total_rich + Length:Total_rich, data = train.data, family = binomial) #AIC(top.model) 47.3; AIC 48.58 with log_abund
probabilities <- top.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
# # Prediction accuracy
observed.classes <- test.data$Ranavirus
#mean(predicted.classes == observed.classes)
# not great at predicting presences #0.6666
#### compare performance
# Make predictions for full model
probabilities <- full.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
# Prediction accuracy
observed.classes <- test.data$Ranavirus
#mean(predicted.classes == observed.classes)
#0.6666
###################################
# which is best model for predicting parasite abundance? Length variable alone
allVars <- lm(Log_Total_abund ~ Length + Ranavirus + Length:Ranavirus, data = train.data)
step.model2 <- allVars %>% stepAIC(trace = FALSE, direction="both") #just Length
# summary(step.model2) #Length; p=0.000458
################## using another method
# http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/
set.seed(123)
library(caret)
# Set up repeated k-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train the model
step.model <- train(Log_Total_abund ~ Length + Ranavirus + Length:Ranavirus, data = dataClean,
method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:3),
trControl = train.control
)
# step.model$results
# step.model$bestTune
# summary(step.model$finalModel)
#Length alone is best model
Linear regression: is Bullfrog Length affecting macroparasite
abundance (log10 transformed?) and/or richness? Yes
# length and total abundance
# summary(lm(Total_abund ~ Length, data=data)) #p=0.0398
ggplot(data, aes(x=Length, y=Total_abund)) + geom_point() + theme_bw() + annotate("text", x=19, y= 500, label= "p = 0.04") + xlab("Length") + ylab("Total Macroparasite Abundance")

#data$Log_Total_abund <- log10(ifelse(data$Total_abund == 0, 1, data$Total_abund))
#summary(lm(Log_Total_abund~Length, data=data)) #p=3.15e-06
## Adding an interaction with Ranavirus is still a good model (AIC)
mod1<-lm(Log_Total_abund ~ Length + Ranavirus:Length, data=data);AIC(mod1) #AIC=124.35
## [1] 124.3454
summary(mod1) #length p=1.24e-06; Length:Ranavirus p=0.1432; mult. R2 = 0.3219
##
## Call:
## lm(formula = Log_Total_abund ~ Length + Ranavirus:Length, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4131 -0.3784 0.1315 0.3482 1.3861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.68877 0.31041 -2.219 0.0302 *
## Length 0.10834 0.02013 5.381 1.24e-06 ***
## Length:Ranavirus -0.01605 0.01082 -1.483 0.1432
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6151 on 61 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3219, Adjusted R-squared: 0.2997
## F-statistic: 14.48 on 2 and 61 DF, p-value: 7.145e-06
# Plot
#devtools::install_github("cardiomoon/ggiraphExtra")
#from here: https://cran.r-project.org/web/packages/ggiraphExtra/vignettes/ggPredict.html
require(ggiraph)
require(ggiraphExtra)
require(plyr)
ggPredict(mod1)

mod2<-lm(Log_Total_abund ~ Length, data=data); AIC(mod2) #AIC=128.6079
## [1] 124.612
summary(mod2) #p=3.15e-06; mult R2=0.2975
##
## Call:
## lm(formula = Log_Total_abund ~ Length, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3133 -0.3680 0.0702 0.3083 1.4878
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.62461 0.31034 -2.013 0.0485 *
## Length 0.09989 0.01950 5.124 3.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.621 on 62 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2975, Adjusted R-squared: 0.2861
## F-statistic: 26.25 on 1 and 62 DF, p-value: 3.154e-06
mod3<-lm(Log_Total_abund ~ Length:RanavirusFac, data=data); AIC(mod3) #AIC=124.3454
## [1] 124.3454
summary(mod3) #Length:RV0 p=1.24e-06; Length:RV1 p=2.04e-05; overall p=7.145e-06 #mult R2=0.3219
##
## Call:
## lm(formula = Log_Total_abund ~ Length:RanavirusFac, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4131 -0.3784 0.1315 0.3482 1.3861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.68877 0.31041 -2.219 0.0302 *
## Length:RanavirusFac0 0.10834 0.02013 5.381 1.24e-06 ***
## Length:RanavirusFac1 0.09229 0.01998 4.620 2.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6151 on 61 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3219, Adjusted R-squared: 0.2997
## F-statistic: 14.48 on 2 and 61 DF, p-value: 7.145e-06
cols<- c("#440154FF","#3CBB75FF")
#bords <- c("black","black")
p<-ggPredict(mod3, se=T, show.summary = T)
##
## Call:
## lm(formula = Log_Total_abund ~ Length:RanavirusFac, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4131 -0.3784 0.1315 0.3482 1.3861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.68877 0.31041 -2.219 0.0302 *
## Length:RanavirusFac0 0.10834 0.02013 5.381 1.24e-06 ***
## Length:RanavirusFac1 0.09229 0.01998 4.620 2.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6151 on 61 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3219, Adjusted R-squared: 0.2997
## F-statistic: 14.48 on 2 and 61 DF, p-value: 7.145e-06
p<- p + xlab("Length (cm)") + ylab("Log(Total macroparasite abundance)") + scale_fill_manual(values=cols, name="Ranavirus") + scale_color_manual(values=cols, name="Ranavirus") + theme_bw()
gg_des <- edit_colors(p, desaturate)
cowplot::ggdraw(gg_des)

p<-ggplot(data, aes(x=Length, y=Log_Total_abund)) + geom_point() + theme_bw() + annotate("text", x=7, y= 2.5, label= "p = 3.15E-06") +
geom_smooth(method="lm")+
ylab("Log(Total macroparasite abundance)") + xlab("Length (cm)")
gg_des <- edit_colors(p, desaturate)
cowplot::ggdraw(gg_des)

### transformed abundance ANOVA, logreg, lm
#summary(aov(Log_Total_abund ~ Ranavirus, data=data)) #p=0.325
#summary(lm(Log_Total_abund~Ranavirus, data=data)) #p=0.3251
#summary(lm(Length~Acantho, data=data)) #p=0.4
summary(lm(Length~Nema, data=data)) #p=0.0452
##
## Call:
## lm(formula = Length ~ Nema, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.336 -1.241 1.181 2.391 4.739
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.086544 0.514735 29.309 <2e-16 ***
## Nema 0.011439 0.005596 2.044 0.0452 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.916 on 62 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.06313, Adjusted R-squared: 0.04802
## F-statistic: 4.178 on 1 and 62 DF, p-value: 0.04521
#summary(lm(Length~Trema, data=data)) #p=0.469
#summary(lm(Length~Cest, data=data)) #p=0.98
#summary(lm(Length~Pentastomida_larva, data=data)) #p=0.7
#summary(lm(Total_rich~Length, data=data)) #p=0.00145
ggplot(data, aes(x=Length, y=Total_rich)) + geom_point() + theme_bw() + annotate("text", x=6.5, y=3.5, label="p = 0.001")

#data$Total_rich_factor<-as.factor(data$Total_rich)
#ggplot(data, aes(x=Total_rich_factor, y=Length)) + geom_boxplot()
#mod<-aov(Length~Total_rich_factor, data=data)
#summary(mod) #p=2.73e-05
#TukeyHSD(mod) ## 0 is driving the differences
ANOVA analyses at the site level (no significant data)

Ranavirus Positive Sites
