library(car)
## Loading required package: carData
library(plyr)
library(ggplot2)
library(ggfortify)
library(emmeans)
library(kableExtra)

# This is largely taken from https://stats.idre.ucla.edu/r/dae/poisson-regression/

programmes <- read.csv("~/Dropbox/Teaching/second_year_stats/r_sessions/generalized/awards.csv", stringsAsFactors=TRUE)

programmes$prog<-as.factor(programmes$prog)
programmes$id<-as.factor(programmes$id)


Me <- tapply(programmes$num_awards, programmes$prog, mean)
Var <- tapply(programmes$num_awards, programmes$prog, var)
rbind(Me, Var) %>% kbl() %>%
  kable_classic(full_width = F, html_font = "Cambria", font_size = 12)
      Academic   General Vocational
Me    1.000000 0.2000000  0.2400000
Var   1.634615 0.1636364  0.2677551
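
## A Poisson model assumes the mean and the variance are roughly equal within
## each group (equidispersion). As a quick informal check, the variance-to-mean
## ratio per programme should be close to 1:
round(Var / Me, 2)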
ggplot(programmes, aes(num_awards, fill = prog)) +
  geom_histogram(binwidth=.5, position="dodge")

awards_model <- glm(num_awards ~ prog+math, family=poisson,data=programmes)
summary(awards_model)
## 
## Call:
## glm(formula = num_awards ~ prog + math, family = poisson, data = programmes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2043  -0.8436  -0.5106   0.2558   2.6796  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.16327    0.66288  -6.281 3.37e-10 ***
## progGeneral    -1.08386    0.35825  -3.025  0.00248 ** 
## progVocational -0.71405    0.32001  -2.231  0.02566 *  
## math            0.07015    0.01060   6.619 3.63e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 287.67  on 199  degrees of freedom
## Residual deviance: 189.45  on 196  degrees of freedom
## AIC: 373.5
## 
## Number of Fisher Scoring iterations: 6
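
## The Poisson coefficients are on the log scale, so exponentiating them gives
## incidence-rate ratios; a minimal sketch (the profile confidence intervals
## from confint() can take a moment to compute):
exp(cbind(rate_ratio = coef(awards_model), confint(awards_model)))
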
Anova(awards_model)
## Analysis of Deviance Table (Type II tests)
## 
## Response: num_awards
##      LR Chisq Df Pr(>Chisq)    
## prog   14.572  2  0.0006852 ***
## math   45.010  1   1.96e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## proportion of the null deviance explained (a rough pseudo-R-squared)
100*((awards_model$null.deviance-awards_model$deviance)/awards_model$null.deviance)
## [1] 34.14393
## residual deviance / residual df: values near 1 suggest no overdispersion
awards_model$deviance/awards_model$df.residual
## [1] 0.9665797
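
## The ratio of residual deviance to residual df is close to 1, so overdispersion
## does not appear to be a problem here. As an optional cross-check (a sketch,
## not part of the original analysis), a quasi-Poisson refit estimates the
## dispersion directly and should give very similar standard errors:
awards_quasi <- glm(num_awards ~ prog + math, family = quasipoisson, data = programmes)
summary(awards_quasi)$dispersion
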
autoplot(awards_model)

## base-R versions of the same diagnostic plots
plot(awards_model)

## estimated marginal means for each programme (on the log scale)
marginal <- emmeans(awards_model, ~ prog)

pairs(marginal)
##  contrast              estimate    SE  df z.ratio p.value
##  Academic - General       1.084 0.358 Inf  3.025  0.0070 
##  Academic - Vocational    0.714 0.320 Inf  2.231  0.0660 
##  General - Vocational    -0.370 0.441 Inf -0.838  0.6791 
## 
## Results are given on the log (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 3 estimates
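
## The contrasts above are on the log scale; asking emmeans for the response scale
## re-expresses them as rate ratios (a sketch reusing the `marginal` object):
pairs(marginal, type = "response")
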
## calculate and store predicted values
programmes$phat <- predict(awards_model, type="response")

## order by program and then by math
programmes <- programmes[with(programmes, order(prog, math)), ]

## create the plot
ggplot(programmes, aes(x = math, y = phat, colour = prog)) +
  geom_point(aes(y = num_awards), alpha=.5, position=position_jitter(h=.2)) +
  geom_line(size = 1) +
  labs(x = "Math Score", y = "Expected number of awards")