#Poisson Regression HW

#Ben Spann, Bryce Sellner, and Nick Tourtillott

crabdat <- read.csv("http://www.cknudson.com/data/crabs.csv")
attach(crabdat)
#Exploratory data analysis
hist(satell)

#The histogram shows that the number of satellite crabs sorrounding female crabs is a decreasing function with the majority of our data between 0 and 2.
boxplot(satell~color)

#As our horshoe crab becomes darker, the number of sorrounding sattelite crabs decreases.

boxplot(satell~spine)
#Horseshoe crabs with middle spines tend ot have very few sattle crabs around them. Horseshoe crabs with good spines tend to have the most satell crabs and bad spine coming in a close second.

#Does not appear to be a correlation between width and log(sattel)
#e. Because our response is a count it would be smart to use poisson regression
#f. Based on our plots we can assume that the color and spine will be influential predictors.


#2
modq <- glm(satell~color, family = "poisson")
#Colors light and medium are both significant predictors on number of satelll crabs
modw <- glm(satell~spine, family = "poisson")
#Both bad and good signs are predictors but no the middle
mode <- glm(satell~width, family = "poisson")
#Width is an extremely significant factor

#3
#Use AIC to compare models
AIC(modq)
## [1] 972.4368
AIC(modw)
## [1] 982.4582
AIC(mode)
## [1] 927.1762
#Module E with width appears to be the best model we have due to the lowest AIC. This does line up with our previous problem because it has the lowest p value as well.

#4
library(faraway)

plot(mode$residuals~mode$fitted.values, xlab = "fitted", ylab = "residuals")

halfnorm(residuals(mode))

#Our halfnorm plot shows 2 outliers in our model that may be causing overdispersion.

dp <- sum(residuals(mode, type = "pearson")^2 / mode$df.residual)

#We have a bit of over dispersion with our variance being about 3 times larger than the mean.


#5
modP <- glm(satell~weight, family = poisson)
AIC(modP)
## [1] 920.1641
mod1 <- glm(satell~width+color, family = "poisson")
mod2 <- glm(satell~width+spine, family = "poisson")
mod3 <- glm(satell~width+weight, family = "poisson")

pvals <- c(anova(mode,mod1,test = "Chisq")[5], + anova(mode,mod2,test = "Chisq")[5], + anova(mode,mod3,test = "Chisq")[5])
pvals
## $`Pr(>Chi)`
## [1]         NA 0.03617671
## 
## $`Pr(>Chi)`
## [1]        NA 0.5289668
## 
## $`Pr(>Chi)`
## [1]          NA 0.004734998
#Both model 1 and model 3 appear to be significantly better than our model e. However, model 3 is by far the better model.

mod11 <- glm(satell~width+weight+color, family = "poisson")
mod12 <- glm(satell~width+weight+spine, family = "poisson")

pvals1 <- c(anova(mod3,mod11,test = "Chisq")[5], + anova(mod3,mod12,test = "Chisq")[5])
pvals1
## $`Pr(>Chi)`
## [1]         NA 0.03639796
## 
## $`Pr(>Chi)`
## [1]        NA 0.5843623
#It appears that we should add color as a predictor as well. We could keep going but it is doubtful that Spine will be a good predictor.

#Drop in Deviance
teststat <- deviance(mode)-deviance(mod1)
teststat1 <- deviance(mode)-deviance(mod2)
teststat2 <- deviance(mode)-deviance(mod3)

#Note: df is the difference in predictors

DevPvals <- c(pchisq(teststat,df = 1, lower.tail = FALSE), + pchisq(teststat1,df = 1, lower.tail = FALSE), + pchisq(teststat2,df = 1, lower.tail = FALSE))
DevPvals
## [1] 0.003486135 0.259081355 0.004734998
#Since our p values are low for model 1 and model 3, we do not want to drop the variables color and weight. Spine again appears to be useless.
#Since our p value is slightly lower for model 1 we will add color first

teststat11 <- deviance(mod1)-deviance(mod11)
teststat12 <- deviance(mod1)-deviance(mod12)

DevPvals2 <- c(pchisq(teststat11,df = 1, lower.tail = FALSE), + pchisq(teststat12,df = 1, lower.tail = FALSE))
DevPvals2
## [1] 0.004770432 0.471417348
#Model11 appears to be a predictor we should not drop, so we will make model 11 our new model.
#Width, weight, and color again our the predictors we will decide to use for our model.

summary(mod11)
## 
## Call:
## glm(formula = satell ~ width + weight + color, family = "poisson")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9806  -1.8987  -0.5392   0.9015   4.8274  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -1.0789430  0.9094063  -1.186  0.23545   
## width        0.0309119  0.0473407   0.653  0.51378   
## weight       0.0004516  0.0001602   2.818  0.00483 **
## colordarker  0.0001622  0.1805138   0.001  0.99928   
## colorlight   0.4422488  0.1761433   2.511  0.01205 * 
## colormedium  0.2376849  0.1182078   2.011  0.04435 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 632.79  on 172  degrees of freedom
## Residual deviance: 551.38  on 167  degrees of freedom
## AIC: 918.68
## 
## Number of Fisher Scoring iterations: 6
#log(satell) = -1.0789 + .0309(width) + .00045(weight) + .00016(colordarker) + .44225(colorlight) + .23768(colormedium)
#All other things held constant, an increase of 1 in width will result in a multiplicative change of 1.031395 in the mean of satell crabs
#Finish all other variables

#6
halfnorm(residuals(mod11))

#It appears that we only have two outliers in row 15 and 149. 15 remained a outlier in both.

#7
dp2 <- sum(residuals(mod11, type = "pearson")^2 / mode$df.residual)

#The dispersion parameter went down slightly from our module e, but not much.

#Elephant Problem
eleph <- read.csv("http://cknudson.com/data/elephant.csv")
attach(eleph)

#a
hist(MATINGS)

AgeSq <- AGE^2

#Yes, we can conclude that Poisson would be preferred because our data ranges from 0-10. A linear model would give us negative matings

#b
lm1 <- lm(MATINGS~AGE)
plot(MATINGS~AGE)
abline(lm1)

plot(residuals(lm1))

#A linear model appears to fit well, but our model residuals appear to fan out as we go through our elephants.

#c
agemat<-c(mean(subset(eleph, AGE==27)$MATINGS),mean(subset(eleph, AGE==28)$MATINGS),mean(subset(eleph, AGE==29)$MATINGS),mean(subset(eleph, AGE==30)$MATINGS),mean(subset(eleph, AGE==32)$MATINGS),mean(subset(eleph, AGE==33)$MATINGS),mean(subset(eleph, AGE==34)$MATINGS),mean(subset(eleph, AGE==36)$MATINGS),mean(subset(eleph, AGE==37)$MATINGS),mean(subset(eleph, AGE==38)$MATINGS),mean(subset(eleph, AGE==39)$MATINGS),mean(subset(eleph, AGE==41)$MATINGS),mean(subset(eleph, AGE==42)$MATINGS),mean(subset(eleph, AGE==43)$MATINGS),mean(subset(eleph, AGE==44)$MATINGS),mean(subset(eleph, AGE==45)$MATINGS),mean(subset(eleph, AGE==47)$MATINGS),mean(subset(eleph, AGE==48)$MATINGS),mean(subset(eleph, AGE==52)$MATINGS))
logAge <- log(agemat)

ages <- c(27,28,29,30,32,33,34,36,37,38,39,41,42,43,44,45,47,48,52)
plot(logAge~ages, ylab = "logMates")

#Assumptions: Response variabel is a count per some unit of time, The log of the mean rate, log(λ), must be a linear function of x.
#The residuals look very normal and our plot appears to be extremely linear, so no I do not believe there is a quadratic plot.

#d
pmod <- glm(MATINGS~AGE, family = "poisson")

pmod2 <- glm(MATINGS~AgeSq, family = "poisson")
summary(pmod2)
## 
## Call:
## glm(formula = MATINGS ~ AgeSq, family = "poisson")
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.76066  -0.84290  -0.06587   0.58710   2.25632  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.2588755  0.2847181  -0.909    0.363    
## AgeSq        0.0008635  0.0001711   5.047  4.5e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 75.372  on 40  degrees of freedom
## Residual deviance: 51.590  on 39  degrees of freedom
## AIC: 157.04
## 
## Number of Fisher Scoring iterations: 5
sqrt(0.0008635)
## [1] 0.02938537
exp(sqrt(0.0008635))
## [1] 1.029821
#then take log to find the increase in matings after 1 year of age

#An increase of 1 year in age, will result in an increase of 1.03 matings

#e.
confint(pmod2,level = .95)
## Waiting for profiling to be done...
##                     2.5 %      97.5 %
## (Intercept) -0.8289211654 0.288472089
## AgeSq        0.0005241956 0.001195922
exp(.0005241956)
## [1] 1.000524
exp(.001195922)
## [1] 1.001197
#We are 95% confident that an increase of 1 year of age will result in a multiplicative increase of 1.000524 and 1.001197.

#f.
summary(pmod)
## 
## Call:
## glm(formula = MATINGS ~ AGE, family = "poisson")
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.80798  -0.86137  -0.08629   0.60087   2.17777  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.58201    0.54462  -2.905  0.00368 ** 
## AGE          0.06869    0.01375   4.997 5.81e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 75.372  on 40  degrees of freedom
## Residual deviance: 51.012  on 39  degrees of freedom
## AIC: 156.46
## 
## Number of Fisher Scoring iterations: 5
#Based on our summary and ages p value, we can assume that the number of mates significantly relates to age
pmod0 <- glm(MATINGS~1, family = "poisson")

#Drop in deviance
testy <- deviance(pmod0) - deviance(pmod)
DevP <- pchisq(testy, df= 1, lower.tail = FALSE)
DevP
## [1] 7.99062e-07
#Age does appear to be significant when compared to the grand mean model using drop in deviance.
#Using both test we can conclude that age is a singificant predictor of age.

#g. 
#i. Wald
pmod3 <- glm(MATINGS~AgeSq+AGE, family = "poisson")
summary(pmod3)
## 
## Call:
## glm(formula = MATINGS ~ AgeSq + AGE, family = "poisson")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8470  -0.8848  -0.1122   0.6580   2.1134  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.8574060  3.0356383  -0.941    0.347
## AgeSq       -0.0008595  0.0020124  -0.427    0.669
## AGE          0.1359544  0.1580095   0.860    0.390
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 75.372  on 40  degrees of freedom
## Residual deviance: 50.826  on 38  degrees of freedom
## AIC: 158.27
## 
## Number of Fisher Scoring iterations: 5
#Age squared appears to hurt the initial model

#ii.Drop in Deviance
testy2 <- deviance(pmod) - deviance(pmod3)
DevP2 <- pchisq(testy2, df= 1, lower.tail = FALSE)
DevP2 
## [1] 0.6667354
plot(residuals(pmod))

#Because our deviance for the smaller model is only slightly better than that which includes age squared, we can conclude that the linear model is better than the quadratic.

#Now we must account for our dispersion in our model

dispersion <- sum(residuals(pmod, type = "pearson")^2 / pmod$df.residual)
dispersion
## [1] 1.157334
#Our dispersion appears to be slightly greater than one