library(tidyverse)
exercise <- haven::read_dta("exercise.dta")
head(exercise)
## # A tibble: 6 × 5
##     alcohol policy  time  post  freq
##   <dbl+lbl>  <dbl> <dbl> <dbl> <dbl>
## 1    0 [no]      0     1     0  1126
## 2    0 [no]      0     2     0  1126
## 3    0 [no]      0     3     0  1201
## 4    0 [no]      0     4     0  1271
## 5    0 [no]      0     5     0  1178
## 6    0 [no]      0     6     0  1405
glimpse(exercise)
## Rows: 88
## Columns: 5
## $ alcohol <dbl+lbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ policy  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ time    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ post    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ freq    <dbl> 1126, 1126, 1201, 1271, 1178, 1405, 1287, 1478, 1326, 1293, 13…
###Dataset briefing : alcohol coded 0 mean no alcohol consumption and 1 mean alcohol consumption, policy coded 0 mean no policy period and 1 mean policy period.
###time mean individual time point, post mean time after intervention started.
###freq is frequency of accidents.

## Dataset must be transform into wide form first.
exercise <- pivot_wider(exercise, names_from = alcohol, 
                       names_prefix = "Alcohol_",
                       values_from = freq)
##Then calculate proportion of outcome of interest.
exercise <- exercise %>%
            mutate("sum_freq"= (Alcohol_0+Alcohol_1))%>%
            mutate("proportion"= Alcohol_1 /sum_freq)
head(exercise)      
## # A tibble: 6 × 7
##   policy  time  post Alcohol_0 Alcohol_1 sum_freq proportion
##    <dbl> <dbl> <dbl>     <dbl>     <dbl>    <dbl>      <dbl>
## 1      0     1     0      1126       578     1704      0.339
## 2      0     2     0      1126       465     1591      0.292
## 3      0     3     0      1201       480     1681      0.286
## 4      0     4     0      1271       702     1973      0.356
## 5      0     5     0      1178       541     1719      0.315
## 6      0     6     0      1405       526     1931      0.272
model <- lm(data = exercise, proportion ~ time + post + policy )
summary(model)
## 
## Call:
## lm(formula = proportion ~ time + post + policy, data = exercise)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.045879 -0.015031 -0.002569  0.019217  0.039405 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.3126418  0.0086000  36.354  < 2e-16 ***
## time         0.0009392  0.0005181   1.813  0.07740 .  
## post        -0.0020357  0.0013081  -1.556  0.12752    
## policy      -0.0450482  0.0141888  -3.175  0.00288 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02215 on 40 degrees of freedom
## Multiple R-squared:  0.5012, Adjusted R-squared:  0.4638 
## F-statistic:  13.4 on 3 and 40 DF,  p-value: 3.378e-06
##After we got model coefficients and intercept, We will predicted new variable and create counterfactual variables that will be use in plot.
exercise <- exercise %>% 
            mutate( "factual_accident_proportion" = (0.313+(0.00093*time)
                                                       -(0.002*post)
                                                       -(0.045*as.numeric(policy))))%>%
            mutate("factual_accident_number" = factual_accident_proportion*(sum_freq))%>%
            mutate("counterfact_accident_proportion" = (0.313+(0.00093*time)))%>%
            mutate("counterfact_accident_number" = 
                                           counterfact_accident_proportion*(sum_freq))
##Final table result
DT::datatable(exercise)
##ggplot for Interrupted time series analysis                                              
ggplot(data = exercise)+
      geom_line(aes(x=time,y=factual_accident_number, 
                    col="orange")) +
      geom_line(aes(x=time,y=counterfact_accident_number, 
                    col="blue")) +
      geom_vline(xintercept = 28, 
                 col ="red",
                 linetype = "twodash")+
      geom_segment(aes(x = 38, y = 600, xend = 38, yend = 737 ),
                   col = "cadetblue4")+
      annotate(geom = "label",
               x = 29, y = 900,
               label = "Intervention begin",
               col = "red")+
      annotate(geom = "label",
               x = 40, y = 700,
               label = "Intervention impact",
               col = "cadetblue4",
               size = 2.5)+
      xlab(label = "Time point") +
      ylab(label = "Number of accidents") +
      labs(title = "Relationship between number of accidents and time",
           color = "Intervention")+
      scale_color_discrete(name = "Intervention",
                           labels = c("without policy (counterfactual)","with policy"))+
      theme_bw()+
      theme(legend.position = c(0.2,0.85))+
      geom_smooth(method = lm,aes(x = time, y=counterfact_accident_number),
                  size = 1, se = FALSE)+
      geom_smooth(method = lm,aes(x = time,y=factual_accident_number),
                  size = 1, se = FALSE, color = "red")

As plot show that there is relationship of intervention and change in slope of the model, from blue line to red line. And the number of accidents decline after intervention start. We can consider that the gap between two plot is the impact of our intervention.