library('tidyverse') ; library("gridExtra")
events = read_csv('events copy.csv')%>%select(-text)
shots = drop_na(events, c(shot_place)) ##just select shots

shots$from_box = factor(ifelse(shots$shot_place%in%c(3, 9:15), 'box','not box'))
shots$situation = relevel(factor(shots$situation, 
                                  labels = c('Open play','Set piece','Corner',
                                             'Free kick')), ref = 'Open play')
shots$side = factor(shots$side, levels = 1:2, labels = c('Home', 'Away'))
shots$fast_break = factor(shots$fast_break, levels = 0:1, labels = c('no','yes'))

shots_final = select(shots, c(is_goal, from_box, situation, side, fast_break, time))%>%
  drop_na()
summary(shots_final)
##     is_goal          from_box          situation        side       
##  Min.   :0.0000   box    : 76957   Open play:192431   Home:126211  
##  1st Qu.:0.0000   not box:150495   Set piece: 11567   Away:101241  
##  Median :0.0000                    Corner   : 18046                
##  Mean   :0.1001                    Free kick:  5408                
##  3rd Qu.:0.0000                                                    
##  Max.   :1.0000                                                    
##  fast_break        time       
##  no :222926   Min.   :  0.00  
##  yes:  4526   1st Qu.: 27.00  
##               Median : 50.00  
##               Mean   : 49.21  
##               3rd Qu.: 72.00  
##               Max.   :100.00
# View(shots_final)

Q.1 - The most appropriate distribution is X ~ Bern(p).

Q.2 - Fit the model

model <- glm(formula = is_goal ~ from_box, family = "binomial", data = shots_final)

Q.3 - Assessing the model

summary(model)
## 
## Call:
## glm(formula = is_goal ~ from_box, family = "binomial", data = shots_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5661  -0.5661  -0.3965  -0.3965   2.2725  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.75013    0.01015 -172.42   <2e-16 ***
## from_boxnot box -0.75350    0.01408  -53.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 147990  on 227451  degrees of freedom
## Residual deviance: 145168  on 227450  degrees of freedom
## AIC: 145172
## 
## Number of Fisher Scoring iterations: 5
exp(-0.75350)
## [1] 0.4707162

The log of the odds of scoring outside of the box are -0.75350 or odds of exp(-0.75350) =0.4707162. Odds here are total number of goals over the total number of misses. (Probability is total number of goals over the total number of goals and misses). The negative log odds mean that the odds of scoring outside the box go down when compared to scoring inside the box.

Q.4 - CI.

CI = exp(confint(model)[2,])
## Waiting for profiling to be done...
CI
##     2.5 %    97.5 % 
## 0.4579063 0.4838809

The confidence interval interpretation: When shooting from outside the box, the odds of the shot being successful decrease by a multiplicative factor of 0.47 where a 95% CI for this reduction is (0.46, 0.48). Since this interval does not contain 1, we can say that there is a statistically significant reduction in the odds of scoring a goal for a shot coming from outside the box vs. inside the box.

Q.5 - Updated GLM

model1 <- glm(formula = is_goal ~ from_box + situation + fast_break + side, family = "binomial", data = shots_final)
summary(model1)
## 
## Call:
## glm(formula = is_goal ~ from_box + situation + fast_break + side, 
##     family = "binomial", data = shots_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0723  -0.4737  -0.3633  -0.3493   2.3779  
## 
## Coefficients:
##                    Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)        -2.04985    0.01354 -151.396  < 2e-16 ***
## from_boxnot box    -0.63540    0.01451  -43.783  < 2e-16 ***
## situationSet piece  1.55346    0.02231   69.644  < 2e-16 ***
## situationCorner     0.56743    0.02334   24.312  < 2e-16 ***
## situationFree kick  0.50145    0.04120   12.171  < 2e-16 ***
## fast_breakyes       1.79745    0.03274   54.896  < 2e-16 ***
## sideAway           -0.08093    0.01451   -5.576 2.46e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 147990  on 227451  degrees of freedom
## Residual deviance: 138818  on 227445  degrees of freedom
## AIC: 138832
## 
## Number of Fisher Scoring iterations: 5
anova(model, model1, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: is_goal ~ from_box
## Model 2: is_goal ~ from_box + situation + fast_break + side
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1    227450     145168                          
## 2    227445     138818  5   6349.7 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The null hypothesis of the test is that the regression coefficients are all zero. Here we reject the null hypothesis, which means the updated model is superior to the model with just the single explanatory variable.

AIC(model)
## [1] 145171.6
AIC(model1)
## [1] 138831.9

The updated model has a lower AIC.

Q.6 - Using predict and residuals

shots_final$linpred <- predict(model1)
shots_final$resid <- residuals(model1, type = "deviance")

ggplot(data = shots_final, aes(x = linpred, y = resid)) + 
  geom_point()

Only a finite number of predicted values and a finite number of residuals. This is because all of our predictors are categorical and our outcomes are only 0 or 1.

Q.7 - Plot residuals

names(shots_final)
## [1] "is_goal"    "from_box"   "situation"  "side"       "fast_break"
## [6] "time"       "linpred"    "resid"
p1 = ggplot(data = shots_final, aes(x = fast_break, y = resid)) + geom_boxplot()
p2 = ggplot(data = shots_final, aes(x = from_box, y = resid)) + geom_boxplot()
p3 =ggplot(data = shots_final, aes(x = situation, y = resid)) + geom_boxplot()
p4 = ggplot(data = shots_final, aes(x = side, y = resid)) + geom_boxplot()

grid.arrange(p1,p2,p3,p4, nrow = 2)

We hope to see that the residuals have similar distributions across the levels of the predictors. This is mostly true, apart from fast breaks.

Q.8 - Role of time.

shots_final%>%group_by(time)%>% summarise(proportion = mean(is_goal))%>%
  ggplot(aes(x = time, y = proportion))+geom_point()

time_mod <- glm(formula = is_goal ~ time, family = "binomial", data = shots_final)
summary(time_mod)
## 
## Call:
## glm(formula = is_goal ~ time, family = "binomial", data = shots_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4804  -0.4673  -0.4574  -0.4464   2.1846  
## 
## Coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -2.2897603  0.0150513 -152.130  < 2e-16 ***
## time         0.0018848  0.0002653    7.106  1.2e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 147990  on 227451  degrees of freedom
## Residual deviance: 147940  on 227450  degrees of freedom
## AIC: 147944
## 
## Number of Fisher Scoring iterations: 4
exp(0.0018848)
## [1] 1.001887

For every extra minute of time played the odds of scoring is increased by a factor of 1.001887.