#Poisson regression
  #used when predicting responses that are counts during a specified period of time
  #poisson regression models counts per some unit of time or space, so its minimum value is 0
    #to avoid negative values for lambda, use log(lam) to take on all values
  #lambda is the average number of occurences per unit of time
  #assumptions: respons must be a count per unit of time and described by a poisson distribution, all observations are independent, the mean must equal the variance

data(gala)

pmod1 = glm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, family = poisson, data = gala)
summary(pmod1)
## 
## Call:
## glm(formula = Species ~ Area + Elevation + Nearest + Scruz + 
##     Adjacent, family = poisson, data = gala)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.2752  -4.4966  -0.9443   1.9168  10.1849  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.155e+00  5.175e-02  60.963  < 2e-16 ***
## Area        -5.799e-04  2.627e-05 -22.074  < 2e-16 ***
## Elevation    3.541e-03  8.741e-05  40.507  < 2e-16 ***
## Nearest      8.826e-03  1.821e-03   4.846 1.26e-06 ***
## Scruz       -5.709e-03  6.256e-04  -9.126  < 2e-16 ***
## Adjacent    -6.630e-04  2.933e-05 -22.608  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: 889.68
## 
## Number of Fisher Scoring iterations: 5
#dispersion parameter set = 1 to force variance to = mean
#deviance/g-statistic (similar to RSS)
  #want deviance to be small, which means model is fitting well. Large deviance means predictions are far from actual values

#Poisson goodness of fit test
  #does the model fit? Looking at deviance in test stat (residual deviance in summary of model)
    #null deviance is for intercept only model
  #df calculated based on n
    #df = n - (# of betas) = n - (k+1)
    #in our example, df = 30 - 6 = 24

pchisq(716.85, 24, lower.tail = FALSE)
## [1] 7.058684e-136
#small p-value indicates lack of fit

#check for outliers with half normal plot
  #takes absolute value of residuals, creating half of a normal distribution
  #makes it easier to identify trends with a half normal plot
halfnorm(residuals(pmod1))

#are there points not following the trend?

#check poisson assumptions
#mean = variance
dp = sum(residuals(pmod1, type="pearson")^2 / pmod1$df.res) #calculate deviance/dispersion parameter
#our variance is about 32x larger than our mean, which explains the lack of fit

summary(pmod1, dispersion = dp) #calculate actual standard errors
## 
## Call:
## glm(formula = Species ~ Area + Elevation + Nearest + Scruz + 
##     Adjacent, family = poisson, data = gala)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.2752  -4.4966  -0.9443   1.9168  10.1849  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.1548079  0.2915897  10.819  < 2e-16 ***
## Area        -0.0005799  0.0001480  -3.918 8.95e-05 ***
## Elevation    0.0035406  0.0004925   7.189 6.53e-13 ***
## Nearest      0.0088256  0.0102621   0.860    0.390    
## Scruz       -0.0057094  0.0035251  -1.620    0.105    
## Adjacent    -0.0006630  0.0001653  -4.012 6.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 31.74914)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: 889.68
## 
## Number of Fisher Scoring iterations: 5
  #SE increased, so test statistics and p-values change for Wald tests

#regression equation
  #let lambda-i be the mean number of species for the i-th island
  #log(lambda-i) = 3.1548079 - 0.0005799(Area) + 0.00035406(Elevation) + 0.0088256(Nearest) - 0.0057094(SCruz) - 0.0006630(Adjacent)
  #interpretation of area coefficient: An increase of 1 km^2 in an island's area is associated with a decrease of 0.0005799 in the log mean number of species on that island, HOLDING ALL OTHER VARIABLES CONSTANT
    #An increase of 1 km^2 in an island's area is associated with multiplicative change of 0.9994212 in the mean number of species on that island, HOLDING ALL OTHER VARIABLES CONSTANT
      #e^-0.0005799 

  #interpretation of elevation coefficient: an increase of 1 m in the island's highest point is associated with an increase of 0.0035406 in the log mean number of species on that island, HOLDING ALL OTHER VARIABLES CONSTANT
    #An increase of 1 m of the island's highest point is associated with multiplicative change of 1.0003547 in the mean number of species on that island, HOLDING ALL OTHER VARIABLES CONSTANT


#Confidence intervals for poisson
  #beta estimator +- z quantile * SE(beta)
confint(pmod1, level = .95)
## Waiting for profiling to be done...
##                     2.5 %        97.5 %
## (Intercept)  3.0523321651  3.2552085036
## Area        -0.0006315162 -0.0005285142
## Elevation    0.0033697410  0.0037124153
## Nearest      0.0052272898  0.0123679395
## Scruz       -0.0069547767 -0.0045020142
## Adjacent    -0.0007213699 -0.0006063186
#calculate CI using SE built under assumption that mean = variance
  #CIs are too narrow because dispersion is much bigger than this calculation assumes (SEs increase when using actual dispersion value, which would widen the CIs)
  #multiply SE by sqrt of dispersion parameter
myz<-qnorm(.975)
myci.dp = (myci<- 0.0088256 +c(-1,1)* myz * 0.0102621 * sqrt(dp)) #adjusted for actual dispersion
myci.dp
## [1] -0.1045058  0.1221570
#we interpreted beta but wanted to interpret exp(beta) because that tells us the impact on the mean. Similarly, we want a CI for exp(beta) rather than for beta itself
  #just exponentiate the endpoints of the CI for beta to get the CI for exp(beta) 
  #stat theory says beta predictions are approximately normal, so use this property to create CI for beta, and then exponentiate the CI to get the CI for exp(beta)
exp(myci.dp) 
## [1] 0.9007696 1.1299315
#For every additional km of distance between an island and its nearest neighboring island, we are 95% confident that the mean number of species on the island experiences a multiplicative change between 0.90077 and 1.1299.  (In other words, adding distance could be related to a decreasing mean number of species or an increasing mean number of species.)
#Wald test
  #H0: beta = 0
  #HA: beta =/ 0
  #test stat: beta / SE(beta)
  #in summary, look at p-value/z-value for the variables
    #if p < 0.05, we should not drop that predictor
  #easy for quantitative variables and dropping one beta at a time
  #forgetting the dispersion parameter makes SE smaller
    #test stat farther from 0
    #p-values smaller
    #increase probability of type I error


#DROP IN DEVIANCE TEST
  #similar to anova: fit both the larger and the smaller model, then compare the drop in deviance
  #how much does deviance drop as we go to the more complicated model?
  #large drop in deviance means that larger model is better than the simple model
  #similar deviance means that simpler model is better

#setting 1: mean = variance
  #chi square distribution, df = df1 - df2
  #test drop in deviance by hand by fitting both models, getting the deviance, and calculating the p-value
  #pretending that our problem doesn't have overdispersion, test whether Nearest should be dropped from the model
pmod2 = glm(Species ~ Area + Elevation + Scruz + Adjacent, family = poisson, data = gala)
teststat = deviance(pmod2) - deviance(pmod1)
pchisq(teststat, df = 1, lower.tail = FALSE)
## [1] 2.031481e-06
#since the p-value is very small, we would not want to drop Nearest
#you can also use the anova command to obtain the p-value
anova(pmod1, pmod2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: Species ~ Area + Elevation + Nearest + Scruz + Adjacent
## Model 2: Species ~ Area + Elevation + Scruz + Adjacent
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        24     716.85                          
## 2        25     739.41 -1  -22.565 2.031e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#setting 2: mean =/ variance
  #if our dispersion parameter is far from 1, we should incorporate this overdispersion into the drop in deviance test
  #test stat = drop in dev / (dispersion parameter * df)
    #f distribution
dropinD = deviance(pmod2) - deviance(pmod1)
teststat = dropinD / (dp*1)
pf(teststat, df1=1, 24, lower.tail = FALSE)
## [1] 0.4075257
#after adjusting for overdispersion, our p-value is > 0.05, which means the bigger model isn't better than the smaller model
  #we can safely drop Nearest from the model

#if we want to test each term one at a time:
drop1(pmod1, test = "F")
## Warning in drop1.glm(pmod1, test = "F"): F test assumes 'quasipoisson' family
## Single term deletions
## 
## Model:
## Species ~ Area + Elevation + Nearest + Scruz + Adjacent
##           Df Deviance     AIC F value    Pr(>F)    
## <none>         716.85  889.68                      
## Area       1  1204.35 1375.18 16.3217 0.0004762 ***
## Elevation  1  2389.57 2560.40 56.0028 1.007e-07 ***
## Nearest    1   739.41  910.24  0.7555 0.3933572    
## Scruz      1   813.62  984.45  3.2400 0.0844448 .  
## Adjacent   1  1341.45 1512.29 20.9119 0.0001230 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1