#RMD 3
#Nick Tourtillott
`2019` <- read.csv("~/Downloads/data_v2 2/yearly/2019.csv")
RBs <- subset(`2019`, Pos == "RB")
attach(RBs)

#Poisson Regression
#We want o use Poisson Regression when our target variable is a count per some unit of time.
#In the dataset I am using, my target variables is number of touchdowns over the course of a season for Running Backs in the National Football League

pmod1 <- glm(RushingTD~Rushing.Attempts+RushingYds+Rec+ReceivingYds+Age, family = poisson)
summary(pmod1)
## 
## Call:
## glm(formula = RushingTD ~ Rushing.Attempts + RushingYds + Rec + 
##     ReceivingYds + Age, family = poisson)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0903  -1.2024  -0.9219   0.4877   2.5187  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.4936838  0.5196775  -0.950 0.342122    
## Rushing.Attempts  0.0091693  0.0027615   3.320 0.000899 ***
## RushingYds        0.0002326  0.0005762   0.404 0.686429    
## Rec              -0.0240025  0.0091307  -2.629 0.008570 ** 
## ReceivingYds      0.0029897  0.0010254   2.916 0.003549 ** 
## Age               0.0035162  0.0197060   0.178 0.858381    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 633.68  on 152  degrees of freedom
## Residual deviance: 227.66  on 147  degrees of freedom
## AIC: 493.86
## 
## Number of Fisher Scoring iterations: 5
#Important things to note about the summary: The smaller the deviance, the better fit the model is. Z value is for wald test stat. Null Deviance is just the intercept only model while residual deviance is for the model we fit.

#We can use deviance to test the goodness-of-fit test for the model. The deviance follows chisquared with a test stat of n-(k+1)
pchisq(71.228, 58, lower.tail = FALSE)
## [1] 0.1138294
#Since our p value is not significant we conclude that the data is consistent with the distribution

#Dispersion
#One assumption of the Poisson Regression Model is that mean is approximately equal to mean. We are able to see if this is true by finding the dispersion parameter Φ.
#If Φ is equal to 1 then the Mean and Variance are relatively equal, If Φ is greater than 1, then variance is larger

dp <- sum(residuals(pmod1, type = "pearson")^2 / pmod1$df.residual)
dp
## [1] 1.488819
#Our model is slightly larger than one and likely explains the relatively good fit of the model
summary(pmod1, dispersion = dp)
## 
## Call:
## glm(formula = RushingTD ~ Rushing.Attempts + RushingYds + Rec + 
##     ReceivingYds + Age, family = poisson)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0903  -1.2024  -0.9219   0.4877   2.5187  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)   
## (Intercept)      -0.4936838  0.6340958  -0.779   0.4362   
## Rushing.Attempts  0.0091693  0.0033695   2.721   0.0065 **
## RushingYds        0.0002326  0.0007030   0.331   0.7408   
## Rec              -0.0240025  0.0111410  -2.154   0.0312 * 
## ReceivingYds      0.0029897  0.0012512   2.390   0.0169 * 
## Age               0.0035162  0.0240447   0.146   0.8837   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1.488819)
## 
##     Null deviance: 633.68  on 152  degrees of freedom
## Residual deviance: 227.66  on 147  degrees of freedom
## AIC: 493.86
## 
## Number of Fisher Scoring iterations: 5
#Pvalues, z values, and standard error all decrease

#Confidence Intervals
#Say we want a confidence interval on β1, Rushing Attempts
myz <- qnorm(.95)
disparm <- summary(pmod1, dispersion = dp)$dispersion
myci <- 0.0028243 + c(1,-1)*myz *0.0030905 * sqrt(dp)
myci
## [1]  0.009026945 -0.003378345
#because we are interpreting the log mean of our target variable we need to take the natural log of our CI to get interpretable results
exp(myci)
## [1] 1.0090678 0.9966274
#We can interpret these results like this:
#For every additional rushing attempt, we are 95% confident that the mean number of rushing touchdowns experienced by NFL players experiences a multiplicative change between .99627 and 1.00943

#Drop in Deviance Test
#This method is much like anova in that it compares nested models. We are looking for a singificant drop in variance with the complex model compared to the simpler model.

pmod2 <- glm(RushingTD~Rec+Rushing.Attempts+RushingYds+ReceivingTD, family = poisson)

#We are looking to see if we can drop Age as a predictor
deviance(pmod1)
## [1] 227.6561
deviance(pmod2)
## [1] 232.042
teststat <- deviance(pmod2)-deviance(pmod1)
pchisq(teststat,df=1,lower.tail = FALSE)
## [1] 0.03623704
#Since our pvalue is low, we would not want to get rid of the variable age. However, this is without accounting for dispersion.