Set up stuff

rm(list=ls())
pacman::p_load(plyr,dplyr,ggplot2,lme4,car,multcomp,tidyr,reshape2,AICcmodavg)
setwd("C:/Users/Katy/Desktop/thesisr")

Data organization

# Read in data, add column calculating Calls Per Night
data.allparks <- read.csv("./MixedModel_allparks.csv")
data.allparks$Year <- as.factor(data.allparks$Year)
data.allparks$CallsPN <- round(data.allparks$Calls/data.allparks$Nights, 3)
str(data.allparks)
## 'data.frame':    2460 obs. of  7 variables:
##  $ Park   : Factor w/ 8 levels "APIS","GRPO",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Site   : Factor w/ 185 levels "APIS001A","APIS002A",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ Year   : Factor w/ 2 levels "2016","2017": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Nights : int  12 12 12 12 12 12 9 9 9 9 ...
##  $ Species: Factor w/ 9 levels "EPFU","LABO",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ Calls  : int  130 92 95 1496 2311 1 8 2 7 47 ...
##  $ CallsPN: num  10.83 7.67 7.92 124.67 192.58 ...
head(data.allparks)
##   Park     Site Year Nights Species Calls CallsPN
## 1 APIS APIS001A 2017     12    EPFU   130  10.833
## 2 APIS APIS001A 2017     12    LABO    92   7.667
## 3 APIS APIS001A 2017     12    LACI    95   7.917
## 4 APIS APIS001A 2017     12    LANO  1496 124.667
## 5 APIS APIS001A 2017     12    MYLU  2311 192.583
## 6 APIS APIS001A 2017     12    MYSE     1   0.083
# Temporarily subset the data by year. Rename columns to include year information.
# and remove the actual "Year" column. Then merge all data back to one wide
# data frame, with rows for each combination of Park, Site, and Species. Add a new
# column showing difference between Calls Per Night in 2017 vs 2016. 
data.allparks.2016 <- data.allparks %>% 
                      subset(Year == "2016") %>%
                      rename(Nights.2016 = Nights, Calls.2016 = Calls, 
                             CallsPN.2016 = CallsPN) %>%
                      subset(select = -c(Year))

data.allparks.2017 <- data.allparks %>% 
                      subset(Year == "2017") %>%
                      rename(Nights.2017 = Nights, Calls.2017 = Calls, 
                           CallsPN.2017 = CallsPN) %>%
                      subset(select = -c(Year))

data.allparks.wide <- merge(data.allparks.2016, data.allparks.2017, 
                            by = c("Park","Site","Species")) %>%
                      mutate(Diff = CallsPN.2017 - CallsPN.2016)
str(data.allparks.wide)
## 'data.frame':    1230 obs. of  10 variables:
##  $ Park        : Factor w/ 8 levels "APIS","GRPO",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Site        : Factor w/ 185 levels "APIS001A","APIS002A",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ Species     : Factor w/ 9 levels "EPFU","LABO",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ Nights.2016 : int  7 7 7 7 7 7 10 10 10 10 ...
##  $ Calls.2016  : int  17 91 35 144 2380 16 4 1 4 33 ...
##  $ CallsPN.2016: num  2.43 13 5 20.57 340 ...
##  $ Nights.2017 : int  12 12 12 12 12 12 9 9 9 9 ...
##  $ Calls.2017  : int  130 92 95 1496 2311 1 8 2 7 47 ...
##  $ CallsPN.2017: num  10.83 7.67 7.92 124.67 192.58 ...
##  $ Diff        : num  8.4 -5.33 2.92 104.1 -147.42 ...
head(data.allparks.wide)
##   Park     Site Species Nights.2016 Calls.2016 CallsPN.2016 Nights.2017
## 1 APIS APIS001A    EPFU           7         17        2.429          12
## 2 APIS APIS001A    LABO           7         91       13.000          12
## 3 APIS APIS001A    LACI           7         35        5.000          12
## 4 APIS APIS001A    LANO           7        144       20.571          12
## 5 APIS APIS001A    MYLU           7       2380      340.000          12
## 6 APIS APIS001A    MYSE           7         16        2.286          12
##   Calls.2017 CallsPN.2017     Diff
## 1        130       10.833    8.404
## 2         92        7.667   -5.333
## 3         95        7.917    2.917
## 4       1496      124.667  104.096
## 5       2311      192.583 -147.417
## 6          1        0.083   -2.203

Plot 1 - histogram of counts

gg.diff1 <- ggplot(data.allparks.wide, aes(data.allparks.wide$Diff)) + 
            geom_histogram(binwidth = 20, colour="black", fill="gray") + 
            theme_bw()
gg.diff1

Plot 2 - histogram of densities, density estimate, and probability density function

gg.diff2 <- ggplot(data.allparks.wide, aes(x=Diff)) + 
            geom_histogram(aes(y=..density..), binwidth = 20, 
                           colour="black", fill="gray") + 
            geom_density(alpha=0.4, color= "coral2", size=.7, fill= "coral2") + 
            theme_bw() 

mean(data.allparks.wide$Diff)
## [1] -6.78478
sd(data.allparks.wide$Diff)
## [1] 60.77969
gg.diff3 <- gg.diff2 + 
            stat_function(fun = dnorm, colour="blue", size=1.2,
                    args = list(mean = -6.78478, sd = 60.77969))
gg.diff3

# normal distribution seems to fit okay

Original attempt to test significance of “Park” variable

# Using a linear model
mod.null <- lm(data.allparks.wide$Diff ~ 1 , data= data.allparks.wide)
mod.park <- lm(data.allparks.wide$Diff ~ Park, data= data.allparks.wide)
anova(mod.null, mod.park)

# Using a linear mixed model
mod.null2 <- lmer(data.allparks.wide$Diff ~ 1 + (1|Site), data= data.allparks.wide)
mod.park2 <- lmer(data.allparks.wide$Diff ~ Park + (1|Site), data= data.allparks.wide)
anova(mod.null2, mod.park2)

# I wasn't sure whether to used the regular linear vs. mixed model for this test? Ran both and both gave a significant p-value. Therefore, conclude that there are significant differences by park, so I will run separate models for each park. 

Plot 3 - boxplots of Call Differences by park and species

# Plot Calls Per Night Differences (y axis) by park (x axis) for each species. Except leave out the two species MYSO and NYHU since they're only present at one park. 
diff.box <- ggplot(subset(data.allparks.wide, Species %in% c("EPFU","LABO","LACI","LANO", "MYLU", "MYSE", "PESU")), 
            aes(x=Park, y=Diff, color=Park)) + 
            geom_boxplot() + 
            scale_color_brewer(palette = "Dark2") +
            theme_bw() +
            facet_grid(cols = vars(Species)) + 
            theme(axis.text.x.bottom = element_text(angle=90)) +
            theme(legend.position = "bottom")
diff.box

Subset data by parks

park.list <- list("APIS", "GRPO", "INDU", "ISRO","MISS", "SACN", "SLBE", "VOYA")
for (i in park.list) {
  temp <- subset(data.allparks.wide, Park == i)
  temp <- droplevels(temp)
  assign(paste("data",i,sep="."), temp)
        }

Run model and hypothesis tests for each of the 8 parks

Apostle Islands

# This model yields a "singular fit" and lots of warnings when I get the confidence intervals. So still need to figure out what the problem is and how to resolve it.
model.APIS <- lmer(Diff ~ 0 + Species + (1|Site), data=data.APIS)
summary(model.APIS)
car::Anova(model.APIS)

hyptest.APIS <- glht(model.APIS, linfct= (Species = c("SpeciesEPFU = 0", "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0")))
summary(hyptest.APIS)
confint(model.APIS, oldNames=FALSE)

Grand Portage

model.GRPO <- lmer(Diff ~ 0 + Species + (1|Site), data=data.GRPO)
summary(model.GRPO)
car::Anova(model.GRPO)

hyptest.GRPO <- glht(model.GRPO, linfct= (Species = c("SpeciesEPFU = 0", "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0")))
summary(hyptest.GRPO)
confint(model.GRPO, oldNames=FALSE)

Indiana Dunes

model.INDU <- lmer(Diff ~ 0 + Species + (1|Site), data=data.INDU)
summary(model.INDU)
car::Anova(model.INDU)

hyptest.INDU <- glht(model.INDU, linfct= (Species = c("SpeciesEPFU = 0", "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0", "SpeciesMYSO = 0", "SpeciesNYHU = 0", "SpeciesPESU = 0")))
summary(hyptest.INDU)
confint(model.INDU, oldNames=FALSE)

Isle Royale

model.ISRO <- lmer(Diff ~ 0 + Species + (1|Site), data=data.ISRO)
summary(model.ISRO)
car::Anova(model.ISRO)

hyptest.ISRO <- glht(model.ISRO, linfct= (Species = c("SpeciesEPFU = 0", "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0")))
summary(hyptest.ISRO)
confint(model.ISRO, oldNames=FALSE)

Mississippi River

model.MISS <- lmer(Diff ~ 0 + Species + (1|Site), data=data.MISS)
summary(model.MISS)
car::Anova(model.MISS)

hyptest.MISS <- glht(model.MISS, linfct= (Species = c("SpeciesEPFU = 0", "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0", "SpeciesPESU = 0")))
summary(hyptest.MISS)
confint(model.MISS, oldNames=FALSE)

Saint Croix

model.SACN <- lmer(Diff ~ 0 + Species + (1|Site), data=data.SACN)
summary(model.SACN)
car::Anova(model.SACN)

hyptest.SACN <- glht(model.SACN, linfct= (Species = c("SpeciesEPFU = 0", "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0", "SpeciesPESU = 0")))
summary(hyptest.SACN)
confint(model.SACN, oldNames=FALSE)

Sleeping Bear Dunes

model.SLBE <- lmer(Diff ~ 0 + Species + (1|Site), data=data.SLBE)
summary(model.SLBE)
car::Anova(model.SLBE)

hyptest.SLBE <- glht(model.SLBE, linfct= (Species = c("SpeciesEPFU = 0", "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0")))
summary(hyptest.SLBE)
confint(model.SLBE, oldNames=FALSE)

Voyageurs

model.VOYA <- lmer(Diff ~ 0 + Species + (1|Site), data=data.VOYA)
summary(model.VOYA)
car::Anova(model.VOYA)

hyptest.VOYA <- glht(model.VOYA, linfct= (Species = c("SpeciesEPFU = 0", "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0")))
summary(hyptest.VOYA)
confint(model.VOYA, oldNames=FALSE)

Updated model to test significance of “Park” variable

# Added Park/Site as nested random effect
model.parkonly.up <- lmer(data.allparks.wide$Diff ~ 0 + Park + (1|Park/Site), 
                     data= data.allparks.wide, REML = FALSE)
## singular fit
summary(model.parkonly.up)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: data.allparks.wide$Diff ~ 0 + Park + (1 | Park/Site)
##    Data: data.allparks.wide
## 
##      AIC      BIC   logLik deviance df.resid 
##  13595.8  13652.0  -6786.9  13573.8     1219 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -10.7864  -0.0844   0.1029   0.2235   7.3730 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  Site:Park (Intercept)   33.88   5.821  
##  Park      (Intercept)    0.00   0.000  
##  Residual              3599.59  59.997  
## Number of obs: 1230, groups:  Site:Park, 185; Park, 8
## 
## Fixed effects:
##          Estimate Std. Error t value
## ParkAPIS   -9.760      5.139  -1.899
## ParkGRPO  -19.999      6.106  -3.275
## ParkINDU    5.327      4.085   1.304
## ParkISRO  -14.417      5.139  -2.805
## ParkMISS   -5.838      4.779  -1.222
## ParkSACN  -11.086      5.518  -2.009
## ParkSLBE   -2.189      4.318  -0.507
## ParkVOYA  -11.539      5.934  -1.945
## 
## Correlation of Fixed Effects:
##          PrAPIS PrGRPO PrINDU PrISRO PrMISS PrSACN PrSLBE
## ParkGRPO 0.000                                           
## ParkINDU 0.000  0.000                                    
## ParkISRO 0.000  0.000  0.000                             
## ParkMISS 0.000  0.000  0.000  0.000                      
## ParkSACN 0.000  0.000  0.000  0.000  0.000               
## ParkSLBE 0.000  0.000  0.000  0.000  0.000  0.000        
## ParkVOYA 0.000  0.000  0.000  0.000  0.000  0.000  0.000 
## convergence code: 0
## singular fit
car::Anova(model.parkonly.up)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: data.allparks.wide$Diff
##       Chisq Df Pr(>Chisq)    
## Park 33.472  8  5.063e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
hyptest.parkonly.up <- glht(model.parkonly.up, linfct= mcp(Park = "Tukey"))
# With this change, still only seeing one significant difference among all of the Tukey pairwise comparisons between parks.
summary(hyptest.parkonly.up)
## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: lmer(formula = data.allparks.wide$Diff ~ 0 + Park + (1 | Park/Site), 
##     data = data.allparks.wide, REML = FALSE)
## 
## Linear Hypotheses:
##                  Estimate Std. Error z value Pr(>|z|)  
## GRPO - APIS == 0 -10.2383     7.9807  -1.283   0.9039  
## INDU - APIS == 0  15.0872     6.5647   2.298   0.2908  
## ISRO - APIS == 0  -4.6562     7.2676  -0.641   0.9983  
## MISS - APIS == 0   3.9219     7.0176   0.559   0.9993  
## SACN - APIS == 0  -1.3254     7.5405  -0.176   1.0000  
## SLBE - APIS == 0   7.5711     6.7120   1.128   0.9499  
## VOYA - APIS == 0  -1.7785     7.8499  -0.227   1.0000  
## INDU - GRPO == 0  25.3254     7.3464   3.447   0.0129 *
## ISRO - GRPO == 0   5.5821     7.9807   0.699   0.9970  
## MISS - GRPO == 0  14.1601     7.7538   1.826   0.5980  
## SACN - GRPO == 0   8.9129     8.2300   1.083   0.9597  
## SLBE - GRPO == 0  17.8094     7.4783   2.381   0.2478  
## VOYA - GRPO == 0   8.4598     8.5144   0.994   0.9750  
## ISRO - INDU == 0 -19.7434     6.5647  -3.008   0.0524 .
## MISS - INDU == 0 -11.1653     6.2868  -1.776   0.6328  
## SACN - INDU == 0 -16.4126     6.8656  -2.391   0.2430  
## SLBE - INDU == 0  -7.5161     5.9437  -1.265   0.9104  
## VOYA - INDU == 0 -16.8656     7.2040  -2.341   0.2677  
## MISS - ISRO == 0   8.5781     7.0176   1.222   0.9242  
## SACN - ISRO == 0   3.3308     7.5405   0.442   0.9998  
## SLBE - ISRO == 0  12.2273     6.7120   1.822   0.6014  
## VOYA - ISRO == 0   2.8777     7.8499   0.367   1.0000  
## SACN - MISS == 0  -5.2472     7.2999  -0.719   0.9964  
## SLBE - MISS == 0   3.6493     6.4405   0.567   0.9992  
## VOYA - MISS == 0  -5.7003     7.6190  -0.748   0.9954  
## SLBE - SACN == 0   8.8965     7.0066   1.270   0.9086  
## VOYA - SACN == 0  -0.4531     8.1032  -0.056   1.0000  
## VOYA - SLBE == 0  -9.3496     7.3385  -1.274   0.9070  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

Candidate Models with all parks together

# Change of strategy. DO NOT separate by park. Leave all parks together. 
# Now try these 5 candidate models. Set REML = FALSE so that log-likelihood is used, 
# so that we can compare the models with AIC. 

# NULL MODEL
model.null <- lmer(data.allparks.wide$Diff ~ 1 + (1|Site), 
                   data= data.allparks.wide, REML = FALSE)
summary(model.null)


# MODEL 1, FIXED EFFECTS = 0 + PARK
model.parkonly <- lmer(data.allparks.wide$Diff ~ 0 + Park + (1|Site), 
                     data= data.allparks.wide, REML = FALSE)
summary(model.parkonly)
car::Anova(model.parkonly)
hyptest.parkonly <- glht(model.parkonly, linfct= mcp(Park = "Tukey"))
# Only see one significant difference among all of the Tukey pairwise comparisons
summary(hyptest.parkonly)


# MODEL 2, FIXED EFFECTS = 0 + SPECIES
model.speciesonly <- lmer(data.allparks.wide$Diff ~ 0 + Species + (1|Park/Site), 
                       data= data.allparks.wide, REML = FALSE)
summary(model.speciesonly)
car::Anova(model.speciesonly)
hyptest.speciesonly <- glht(model.speciesonly, 
                       linfct= (Species = c("SpeciesEPFU = 0",  "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0", "SpeciesMYSO = 0", "SpeciesNYHU = 0", "SpeciesPESU = 0")))
# Only see one species with estimate significantly different than zero (MYLU)
summary(hyptest.speciesonly)


# MODEL 3, FIXED EFFECTS = 0 + PARK + SPECIES
model.parkspecies <- lmer(data.allparks.wide$Diff ~ 0 + Park + Species + (1|Site), 
                       data= data.allparks.wide, REML = FALSE)
summary(model.parkspecies)
car::Anova(model.parkspecies)


# MODEL 4, FIXED EFFECTS = 0 + PARK*SPECIES
model.parkspeciesInt <- lmer(data.allparks.wide$Diff ~ 0 + Park*Species + (1|Site), 
                          data= data.allparks.wide, REML = FALSE)
# I think the "rank-deficient" error is ok because not all parks have all species.
summary(model.parkspeciesInt)
car::Anova(model.parkspeciesInt)

AIC Comparison Tables

# Standard AIC (using stats pkg) -> Model 3 has lowest AIC
AIC(model.null, model.parkonly, model.speciesonly, model.parkspecies, model.parkspeciesInt)


# Small sample AICc (using AICcmodavg pkg) -> Model 3 has lowest AIC
# (Note: aictab function automatically selected AICc instead of AIC, even when I did not include the argument "second.ord = TRUE".)
model.list <- list(model.null, model.parkonly, model.speciesonly, model.parkspecies, model.parkspeciesInt)
model.names <- c("model.null", "model.parkonly", "model.speciesonly", "model.parkspecies", "model.parkspeciesInt")
aictab(cand.set = model.list, modnames = model.names, sort = TRUE, second.ord = TRUE)