Set up stuff

rm(list=ls())
pacman::p_load(plyr,dplyr,ggplot2,lme4,car,multcomp,tidyr,reshape2,AICcmodavg)
setwd("C:/Users/Katy/Desktop/thesisr")

Data organization

# Read in data, add column calculating Calls Per Night
data.allparks <- read.csv("./MixedModel_allparks.csv")
data.allparks$Year <- as.factor(data.allparks$Year)
data.allparks$CallsPN <- round(data.allparks$Calls/data.allparks$Nights, 3)
str(data.allparks)
## 'data.frame':    2460 obs. of  7 variables:
##  $ Park   : Factor w/ 8 levels "APIS","GRPO",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Site   : Factor w/ 185 levels "APIS001A","APIS002A",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ Year   : Factor w/ 2 levels "2016","2017": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Nights : int  12 12 12 12 12 12 9 9 9 9 ...
##  $ Species: Factor w/ 9 levels "EPFU","LABO",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ Calls  : int  130 92 95 1496 2311 1 8 2 7 47 ...
##  $ CallsPN: num  10.83 7.67 7.92 124.67 192.58 ...
head(data.allparks)
##   Park     Site Year Nights Species Calls CallsPN
## 1 APIS APIS001A 2017     12    EPFU   130  10.833
## 2 APIS APIS001A 2017     12    LABO    92   7.667
## 3 APIS APIS001A 2017     12    LACI    95   7.917
## 4 APIS APIS001A 2017     12    LANO  1496 124.667
## 5 APIS APIS001A 2017     12    MYLU  2311 192.583
## 6 APIS APIS001A 2017     12    MYSE     1   0.083
# Temporarily subset the data by year. Rename columns to include year information.
# and remove the actual "Year" column. Then merge all data back to one wide
# data frame, with rows for each combination of Park, Site, and Species. Add a new
# column showing difference between Calls Per Night in 2017 vs 2016. 
data.allparks.2016 <- data.allparks %>% 
                      subset(Year == "2016") %>%
                      rename(Nights.2016 = Nights, Calls.2016 = Calls, 
                             CallsPN.2016 = CallsPN) %>%
                      subset(select = -c(Year))

data.allparks.2017 <- data.allparks %>% 
                      subset(Year == "2017") %>%
                      rename(Nights.2017 = Nights, Calls.2017 = Calls, 
                           CallsPN.2017 = CallsPN) %>%
                      subset(select = -c(Year))

data.allparks.wide <- merge(data.allparks.2016, data.allparks.2017, 
                            by = c("Park","Site","Species")) %>%
                      mutate(Diff = CallsPN.2017 - CallsPN.2016)
str(data.allparks.wide)
## 'data.frame':    1230 obs. of  10 variables:
##  $ Park        : Factor w/ 8 levels "APIS","GRPO",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Site        : Factor w/ 185 levels "APIS001A","APIS002A",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ Species     : Factor w/ 9 levels "EPFU","LABO",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ Nights.2016 : int  7 7 7 7 7 7 10 10 10 10 ...
##  $ Calls.2016  : int  17 91 35 144 2380 16 4 1 4 33 ...
##  $ CallsPN.2016: num  2.43 13 5 20.57 340 ...
##  $ Nights.2017 : int  12 12 12 12 12 12 9 9 9 9 ...
##  $ Calls.2017  : int  130 92 95 1496 2311 1 8 2 7 47 ...
##  $ CallsPN.2017: num  10.83 7.67 7.92 124.67 192.58 ...
##  $ Diff        : num  8.4 -5.33 2.92 104.1 -147.42 ...
head(data.allparks.wide)
##   Park     Site Species Nights.2016 Calls.2016 CallsPN.2016 Nights.2017
## 1 APIS APIS001A    EPFU           7         17        2.429          12
## 2 APIS APIS001A    LABO           7         91       13.000          12
## 3 APIS APIS001A    LACI           7         35        5.000          12
## 4 APIS APIS001A    LANO           7        144       20.571          12
## 5 APIS APIS001A    MYLU           7       2380      340.000          12
## 6 APIS APIS001A    MYSE           7         16        2.286          12
##   Calls.2017 CallsPN.2017     Diff
## 1        130       10.833    8.404
## 2         92        7.667   -5.333
## 3         95        7.917    2.917
## 4       1496      124.667  104.096
## 5       2311      192.583 -147.417
## 6          1        0.083   -2.203

Plot 1 - histogram of counts

gg.diff1 <- ggplot(data.allparks.wide, aes(data.allparks.wide$Diff)) + 
            geom_histogram(binwidth = 20, colour="black", fill="gray") + 
            theme_bw()
gg.diff1

Plot 2 - histogram of densities, density estimate, and probability density function

gg.diff2 <- ggplot(data.allparks.wide, aes(x=Diff)) + 
            geom_histogram(aes(y=..density..), binwidth = 20, 
                           colour="black", fill="gray") + 
            geom_density(alpha=0.4, color= "coral2", size=.7, fill= "coral2") + 
            theme_bw() 

mean(data.allparks.wide$Diff)
## [1] -6.78478
sd(data.allparks.wide$Diff)
## [1] 60.77969
gg.diff3 <- gg.diff2 + 
            stat_function(fun = dnorm, colour="blue", size=1.2,
                    args = list(mean = -6.78478, sd = 60.77969))
gg.diff3

# normal distribution seems to fit okay

Plot 3 - boxplots of Calls/Night Differences by park and species

# Plot Calls Per Night Differences (y axis) by park (x axis) for each species. 
diff.box <- ggplot(subset(data.allparks.wide), 
            aes(x=Park, y=Diff, color=Park)) + 
            geom_boxplot() + 
            scale_color_brewer(palette = "Dark2") +
            theme_bw() +
            facet_grid(cols = vars(Species), scales = "free_x") + 
            theme(axis.text.x.bottom = element_text(angle=90)) +
            theme(legend.position = "bottom")
diff.box

Test models with “Park” and “Species” as fixed effects vs. a null model

# Use linear mixed models with "Site" as a random effect in all models. 
# Compare a null model, a model with only "Park" as a fixed effect, a model with only "Species" as a fixed effect, and models with "Park" and "Species" as fixed effects (with or without the interaction).  
# Set REML = FALSE so that log-likelihood is used, so that we can compare the models with AIC. 
mod.null <- lmer(data.allparks.wide$Diff ~ 1 + (1|Site), 
                 data= data.allparks.wide, REML = FALSE)
mod.park <- lmer(data.allparks.wide$Diff ~ Park + (1|Site), 
                 data= data.allparks.wide, REML = FALSE)
mod.species <- lmer(data.allparks.wide$Diff ~ Species + (1|Site), 
                 data= data.allparks.wide, REML = FALSE)
mod.parkspp <- lmer(data.allparks.wide$Diff ~ Park + Species + (1|Site), 
                 data= data.allparks.wide, REML = FALSE)
mod.parksppint <- lmer(data.allparks.wide$Diff ~ Park*Species + (1|Site), 
                 data= data.allparks.wide, REML = FALSE)  
## fixed-effect model matrix is rank deficient so dropping 19 columns / coefficients
# The rank-deficient warning is not an issue. Not all park-species combinations exist.

# Use small sample AICc (using AICcmodavg pkg) to compare these models.
# (Note: aictab function automatically selected AICc instead of AIC, even when I did not include the argument "second.ord = TRUE".)
mod.list <- list(mod.null, mod.park, mod.species, mod.parkspp, mod.parksppint)
mod.names <- c("mod.null", "mod.park", "mod.species", "mod.parkspp", "mod.parksppint")
aictab(cand.set = mod.list, modnames = mod.names, sort = TRUE, second.ord = TRUE)
## 
## Model selection based on AICc:
## 
##                 K     AICc Delta_AICc AICcWt Cum.Wt       LL
## mod.parkspp    18 13520.71       0.00   0.84   0.84 -6742.07
## mod.species    11 13524.11       3.39   0.15   1.00 -6750.94
## mod.parksppint 55 13535.34      14.63   0.00   1.00 -6710.05
## mod.park       10 13593.94      73.23   0.00   1.00 -6786.88
## mod.null        3 13597.15      76.43   0.00   1.00 -6795.56

Conclusions: The Park+Species and Species alone models have the lowest AIC scores and do not differ too much from each other. The Park alone model is only slightly different from the null model. This suggests that “Park” as a fixed effect does not contribute much to the model, but “Species” does.

Look at differences among parks

# Continue examining the effects of the "Park" variable by looking at model outputs. 
summary(mod.park)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: data.allparks.wide$Diff ~ Park + (1 | Site)
##    Data: data.allparks.wide
## 
##      AIC      BIC   logLik deviance df.resid 
##  13593.8  13644.9  -6786.9  13573.8     1220 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -10.7864  -0.0844   0.1029   0.2235   7.3730 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Site     (Intercept)   33.88   5.821  
##  Residual             3599.59  59.997  
## Number of obs: 1230, groups:  Site, 185
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   -9.760      5.139  -1.899
## ParkGRPO     -10.238      7.981  -1.283
## ParkINDU      15.087      6.565   2.298
## ParkISRO      -4.656      7.268  -0.641
## ParkMISS       3.922      7.018   0.559
## ParkSACN      -1.325      7.541  -0.176
## ParkSLBE       7.571      6.712   1.128
## ParkVOYA      -1.778      7.850  -0.227
## 
## Correlation of Fixed Effects:
##          (Intr) PrGRPO PrINDU PrISRO PrMISS PrSACN PrSLBE
## ParkGRPO -0.644                                          
## ParkINDU -0.783  0.504                                   
## ParkISRO -0.707  0.455  0.554                            
## ParkMISS -0.732  0.472  0.573  0.518                     
## ParkSACN -0.682  0.439  0.534  0.482  0.499              
## ParkSLBE -0.766  0.493  0.599  0.541  0.561  0.522       
## ParkVOYA -0.655  0.422  0.512  0.463  0.479  0.446  0.501
confint(mod.park)
## Computing profile confidence intervals ...
##                  2.5 %     97.5 %
## .sig01        0.000000 13.3638887
## .sigma       57.507426 62.6455775
## (Intercept) -19.877219  0.3565804
## ParkGRPO    -25.949660  5.4731421
## ParkINDU      2.141496 28.0328604
## ParkISRO    -18.963658  9.6512555
## ParkMISS     -9.902512 17.7462342
## ParkSACN    -16.181441 13.5306828
## ParkSLBE     -5.642512 20.7847389
## ParkVOYA    -17.232278 13.6753614
car::Anova(mod.park)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: data.allparks.wide$Diff
##       Chisq Df Pr(>Chisq)  
## Park 18.436  7    0.01015 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# There is a significant difference among at least two of the parks, but we still don't know which ones are different from each other. So, next run a Tukey pairwise comparisons test to compare each park to all other parks, to identify where the significant differences are. This tests the hypothesis that Park1 = Park2 for every combination of parks. 
hyptest.mod.park <- glht(mod.park, linfct= mcp(Park = "Tukey"))
summary(hyptest.mod.park)
## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: lmer(formula = data.allparks.wide$Diff ~ Park + (1 | Site), data = data.allparks.wide, 
##     REML = FALSE)
## 
## Linear Hypotheses:
##                  Estimate Std. Error z value Pr(>|z|)  
## GRPO - APIS == 0 -10.2383     7.9807  -1.283   0.9039  
## INDU - APIS == 0  15.0872     6.5647   2.298   0.2908  
## ISRO - APIS == 0  -4.6562     7.2676  -0.641   0.9983  
## MISS - APIS == 0   3.9219     7.0176   0.559   0.9993  
## SACN - APIS == 0  -1.3254     7.5405  -0.176   1.0000  
## SLBE - APIS == 0   7.5711     6.7120   1.128   0.9500  
## VOYA - APIS == 0  -1.7785     7.8499  -0.227   1.0000  
## INDU - GRPO == 0  25.3254     7.3464   3.447   0.0130 *
## ISRO - GRPO == 0   5.5821     7.9807   0.699   0.9970  
## MISS - GRPO == 0  14.1601     7.7538   1.826   0.5982  
## SACN - GRPO == 0   8.9129     8.2300   1.083   0.9598  
## SLBE - GRPO == 0  17.8094     7.4783   2.381   0.2468  
## VOYA - GRPO == 0   8.4598     8.5144   0.994   0.9750  
## ISRO - INDU == 0 -19.7434     6.5647  -3.008   0.0524 .
## MISS - INDU == 0 -11.1653     6.2868  -1.776   0.6327  
## SACN - INDU == 0 -16.4126     6.8656  -2.391   0.2424  
## SLBE - INDU == 0  -7.5161     5.9437  -1.265   0.9104  
## VOYA - INDU == 0 -16.8656     7.2040  -2.341   0.2681  
## MISS - ISRO == 0   8.5781     7.0176   1.222   0.9242  
## SACN - ISRO == 0   3.3308     7.5405   0.442   0.9998  
## SLBE - ISRO == 0  12.2273     6.7120   1.822   0.6014  
## VOYA - ISRO == 0   2.8777     7.8499   0.367   1.0000  
## SACN - MISS == 0  -5.2472     7.2999  -0.719   0.9964  
## SLBE - MISS == 0   3.6493     6.4405   0.567   0.9992  
## VOYA - MISS == 0  -5.7003     7.6190  -0.748   0.9954  
## SLBE - SACN == 0   8.8965     7.0066   1.270   0.9086  
## VOYA - SACN == 0  -0.4531     8.1032  -0.056   1.0000  
## VOYA - SLBE == 0  -9.3496     7.3385  -1.274   0.9071  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

Hypothesis test/Tukey gives a warning message: “Completion with error > abseps”

*** Is this a problem?

Conclusions: There is only one pair of parks that are significantly different from each other (INDU vs. GRPO).

Since we now know that “Park” did not contribute significantly to the model as a fixed effect, and only one pair of parks differed significantly from each other, we can try a nested random effect with “Site” nested within “Park”.

Test a new version of the “Species” vs. null models, now with Park/Site as nested random effects

# Use a linear mixed model with (1|Park/Site) as a random effect in both models. 
# Species is a fixed effect in the second model. Compare the two models. 
# Set REML = FALSE so that log-likelihood is used, so that we can compare the models with AIC. 

mod.null2 <- lmer(data.allparks.wide$Diff ~ 1 + (1|Park/Site), 
                   data= data.allparks.wide, REML = FALSE)
mod.species2 <- lmer(data.allparks.wide$Diff ~ Species + (1|Park/Site), 
                   data= data.allparks.wide, REML = FALSE)

# Use small sample AICc (using AICcmodavg pkg) to compare the 2 models.
# (Note: aictab function automatically selected AICc instead of AIC, even when I did not include the argument "second.ord = TRUE".)
mod.list2 <- list(mod.null2, mod.species2)
mod.names2 <- c("mod.null2", "mod.species2")
aictab(cand.set = mod.list2, modnames = mod.names2, sort = TRUE, second.ord = TRUE)
## 
## Model selection based on AICc:
## 
##               K     AICc Delta_AICc AICcWt Cum.Wt       LL
## mod.species2 12 13522.96       0.00      1      1 -6749.35
## mod.null2     4 13595.71      72.75      0      1 -6793.84

Conclusions: The two models are different and the AIC score is much lower when we include the “Species” term. “Species” therefore still seems to be contributing importantly to this model.

The final version of the model will be this: Diff ~ 0 + Species + (1|Park/Site)

*** Is this correct that now I can use the “0 +” since I no longer need the intercept term if I’m no longer comparing to null models?

Run the model and check residual plots

# Run the newest model.
mod.final <- lmer(data.allparks.wide$Diff ~ 0 + Species + (1|Park/Site), 
                   data= data.allparks.wide, REML = FALSE)

# Plot the residuals. 
plot(fitted(mod.final),residuals(mod.final))

hist(residuals(mod.final))

qqnorm(residuals(mod.final))

The residual plots don’t look great…. consider some kind of transformation? Dividing difference values by 10 or by 100 to reduce the spread did not change the outcomes. Log-transform and square root won’t work with negative numbers and zeros.

Look at differences among species

# Look at the model outputs.
summary(mod.final)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: data.allparks.wide$Diff ~ 0 + Species + (1 | Park/Site)
##    Data: data.allparks.wide
## 
##      AIC      BIC   logLik deviance df.resid 
##  13522.7  13584.1  -6749.3  13498.7     1218 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.3014  -0.1110   0.0229   0.1971   8.1057 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  Site:Park (Intercept)  106.65  10.327  
##  Park      (Intercept)   32.66   5.715  
##  Residual              3301.96  57.463  
## Number of obs: 1230, groups:  Site:Park, 185; Park, 8
## 
## Fixed effects:
##              Estimate Std. Error t value
## SpeciesEPFU   8.29959    4.75386   1.746
## SpeciesLABO -12.30168    4.75386  -2.588
## SpeciesLACI   0.05076    4.75386   0.011
## SpeciesLANO   3.32668    4.75386   0.700
## SpeciesMYLU -42.11837    4.75386  -8.860
## SpeciesMYSE  -5.07552    4.75386  -1.068
## SpeciesMYSO -10.12359   11.94660  -0.847
## SpeciesNYHU -13.80982   11.94660  -1.156
## SpeciesPESU  -4.78630    7.50598  -0.638
## 
## Correlation of Fixed Effects:
##             SpEPFU SpLABO SpLACI SpLANO SpMYLU SpMYSE SpMYSO SpNYHU
## SpeciesLABO 0.210                                                  
## SpeciesLACI 0.210  0.210                                           
## SpeciesLANO 0.210  0.210  0.210                                    
## SpeciesMYLU 0.210  0.210  0.210  0.210                             
## SpeciesMYSE 0.210  0.210  0.210  0.210  0.210                      
## SpeciesMYSO 0.086  0.086  0.086  0.086  0.086  0.086               
## SpeciesNYHU 0.086  0.086  0.086  0.086  0.086  0.086  0.110        
## SpeciesPESU 0.132  0.132  0.132  0.132  0.132  0.132  0.090  0.090
confint(mod.final)
## Computing profile confidence intervals ...
##                  2.5 %     97.5 %
## .sig01        0.000000  16.137754
## .sig02        0.000000  12.874075
## .sigma       55.078637  60.023864
## SpeciesEPFU  -1.268021  17.691051
## SpeciesLABO -21.869291  -2.910220
## SpeciesLACI  -9.516848   9.442223
## SpeciesLANO  -6.240929  12.718142
## SpeciesMYLU -51.685978 -32.726906
## SpeciesMYSE -14.643135   4.315937
## SpeciesMYSO -34.401746  13.903374
## SpeciesNYHU -38.087977  10.217143
## SpeciesPESU -19.915921   9.997615
car::Anova(mod.final)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: data.allparks.wide$Diff
##          Chisq Df Pr(>Chisq)    
## Species 101.35  9  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# There is a significant difference among at least two of the species. 

# But what we really want to know is which species have Differences in Calls/Night that are significantly different than zero. Null hypothesis is that the Difference = Zero, i.e. there is no difference between years.

# Run a hypothesis test to check this. 
hyptest.species <- glht(mod.final, linfct= (Species = c("SpeciesEPFU = 0",  "SpeciesLABO = 0", "SpeciesLACI = 0", "SpeciesLANO = 0", "SpeciesMYLU = 0", "SpeciesMYSE = 0", "SpeciesMYSO = 0", "SpeciesNYHU = 0", "SpeciesPESU = 0")))
summary(hyptest.species)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lmer(formula = data.allparks.wide$Diff ~ 0 + Species + (1 | Park/Site), 
##     data = data.allparks.wide, REML = FALSE)
## 
## Linear Hypotheses:
##                   Estimate Std. Error z value Pr(>|z|)    
## SpeciesEPFU == 0   8.29959    4.75386   1.746   0.5132    
## SpeciesLABO == 0 -12.30168    4.75386  -2.588   0.0816 .  
## SpeciesLACI == 0   0.05076    4.75386   0.011   1.0000    
## SpeciesLANO == 0   3.32668    4.75386   0.700   0.9968    
## SpeciesMYLU == 0 -42.11837    4.75386  -8.860   <1e-04 ***
## SpeciesMYSE == 0  -5.07552    4.75386  -1.068   0.9441    
## SpeciesMYSO == 0 -10.12359   11.94660  -0.847   0.9873    
## SpeciesNYHU == 0 -13.80982   11.94660  -1.156   0.9125    
## SpeciesPESU == 0  -4.78630    7.50598  -0.638   0.9984    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

Conclusions: There is one species that is significantly different than zero (MYLU).

Overall Conclusions: Differences in calls per night between 2017 and 2016 depend on species, when we account for inherent variation among the parks and sites. For most species we did not observe a significant difference in calls per night between 2017 and 2016. However, for MYLU there was a significant difference, specifically there was a decline from 2016 to 2017.