#RMD 3
#Nick Tourtillott
`2019` <- read.csv("~/Downloads/data_v2 2/yearly/2019.csv")
RBs <- subset(`2019`, Pos == "RB")
attach(RBs)
#Poisson Regression
#We want o use Poisson Regression when our target variable is a count per some unit of time.
#In the dataset I am using, my target variables is number of touchdowns over the course of a season for Running Backs in the National Football League
pmod1 <- glm(RushingTD~Rushing.Attempts+RushingYds+Rec+ReceivingYds+Age, family = poisson)
summary(pmod1)
##
## Call:
## glm(formula = RushingTD ~ Rushing.Attempts + RushingYds + Rec +
## ReceivingYds + Age, family = poisson)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0903 -1.2024 -0.9219 0.4877 2.5187
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4936838 0.5196775 -0.950 0.342122
## Rushing.Attempts 0.0091693 0.0027615 3.320 0.000899 ***
## RushingYds 0.0002326 0.0005762 0.404 0.686429
## Rec -0.0240025 0.0091307 -2.629 0.008570 **
## ReceivingYds 0.0029897 0.0010254 2.916 0.003549 **
## Age 0.0035162 0.0197060 0.178 0.858381
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 633.68 on 152 degrees of freedom
## Residual deviance: 227.66 on 147 degrees of freedom
## AIC: 493.86
##
## Number of Fisher Scoring iterations: 5
#Important things to note about the summary: The smaller the deviance, the better fit the model is. Z value is for wald test stat. Null Deviance is just the intercept only model while residual deviance is for the model we fit.
#We can use deviance to test the goodness-of-fit test for the model. The deviance follows chisquared with a test stat of n-(k+1)
pchisq(71.228, 58, lower.tail = FALSE)
## [1] 0.1138294
#Since our p value is not significant we conclude that the data is consistent with the distribution
#Dispersion
#One assumption of the Poisson Regression Model is that mean is approximately equal to mean. We are able to see if this is true by finding the dispersion parameter Φ.
#If Φ is equal to 1 then the Mean and Variance are relatively equal, If Φ is greater than 1, then variance is larger
dp <- sum(residuals(pmod1, type = "pearson")^2 / pmod1$df.residual)
dp
## [1] 1.488819
#Our model is slightly larger than one and likely explains the relatively good fit of the model
summary(pmod1, dispersion = dp)
##
## Call:
## glm(formula = RushingTD ~ Rushing.Attempts + RushingYds + Rec +
## ReceivingYds + Age, family = poisson)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0903 -1.2024 -0.9219 0.4877 2.5187
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4936838 0.6340958 -0.779 0.4362
## Rushing.Attempts 0.0091693 0.0033695 2.721 0.0065 **
## RushingYds 0.0002326 0.0007030 0.331 0.7408
## Rec -0.0240025 0.0111410 -2.154 0.0312 *
## ReceivingYds 0.0029897 0.0012512 2.390 0.0169 *
## Age 0.0035162 0.0240447 0.146 0.8837
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1.488819)
##
## Null deviance: 633.68 on 152 degrees of freedom
## Residual deviance: 227.66 on 147 degrees of freedom
## AIC: 493.86
##
## Number of Fisher Scoring iterations: 5
#Pvalues, z values, and standard error all decrease
#Confidence Intervals
#Say we want a confidence interval on β1, Rushing Attempts
myz <- qnorm(.95)
disparm <- summary(pmod1, dispersion = dp)$dispersion
myci <- 0.0028243 + c(1,-1)*myz *0.0030905 * sqrt(dp)
myci
## [1] 0.009026945 -0.003378345
#because we are interpreting the log mean of our target variable we need to take the natural log of our CI to get interpretable results
exp(myci)
## [1] 1.0090678 0.9966274
#We can interpret these results like this:
#For every additional rushing attempt, we are 95% confident that the mean number of rushing touchdowns experienced by NFL players experiences a multiplicative change between .99627 and 1.00943
#Drop in Deviance Test
#This method is much like anova in that it compares nested models. We are looking for a singificant drop in variance with the complex model compared to the simpler model.
pmod2 <- glm(RushingTD~Rec+Rushing.Attempts+RushingYds+ReceivingTD, family = poisson)
#We are looking to see if we can drop Age as a predictor
deviance(pmod1)
## [1] 227.6561
deviance(pmod2)
## [1] 232.042
teststat <- deviance(pmod2)-deviance(pmod1)
pchisq(teststat,df=1,lower.tail = FALSE)
## [1] 0.03623704
#Since our pvalue is low, we would not want to get rid of the variable age. However, this is without accounting for dispersion.