library(tidyverse)
exercise <- haven::read_dta("exercise.dta")
head(exercise)
## # A tibble: 6 × 5
## alcohol policy time post freq
## <dbl+lbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 [no] 0 1 0 1126
## 2 0 [no] 0 2 0 1126
## 3 0 [no] 0 3 0 1201
## 4 0 [no] 0 4 0 1271
## 5 0 [no] 0 5 0 1178
## 6 0 [no] 0 6 0 1405
glimpse(exercise)
## Rows: 88
## Columns: 5
## $ alcohol <dbl+lbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ policy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ time <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ post <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ freq <dbl> 1126, 1126, 1201, 1271, 1178, 1405, 1287, 1478, 1326, 1293, 13…
###Dataset briefing : alcohol coded 0 mean no alcohol consumption and 1 mean alcohol consumption, policy coded 0 mean no policy period and 1 mean policy period.
###time mean individual time point, post mean time after intervention started.
###freq is frequency of accidents.
## Dataset must be transform into wide form first.
exercise <- pivot_wider(exercise, names_from = alcohol,
names_prefix = "Alcohol_",
values_from = freq)
##Then calculate proportion of outcome of interest.
exercise <- exercise %>%
mutate("sum_freq"= (Alcohol_0+Alcohol_1))%>%
mutate("proportion"= Alcohol_1 /sum_freq)
head(exercise)
## # A tibble: 6 × 7
## policy time post Alcohol_0 Alcohol_1 sum_freq proportion
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 1126 578 1704 0.339
## 2 0 2 0 1126 465 1591 0.292
## 3 0 3 0 1201 480 1681 0.286
## 4 0 4 0 1271 702 1973 0.356
## 5 0 5 0 1178 541 1719 0.315
## 6 0 6 0 1405 526 1931 0.272
model <- lm(data = exercise, proportion ~ time + post + policy )
summary(model)
##
## Call:
## lm(formula = proportion ~ time + post + policy, data = exercise)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.045879 -0.015031 -0.002569 0.019217 0.039405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3126418 0.0086000 36.354 < 2e-16 ***
## time 0.0009392 0.0005181 1.813 0.07740 .
## post -0.0020357 0.0013081 -1.556 0.12752
## policy -0.0450482 0.0141888 -3.175 0.00288 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02215 on 40 degrees of freedom
## Multiple R-squared: 0.5012, Adjusted R-squared: 0.4638
## F-statistic: 13.4 on 3 and 40 DF, p-value: 3.378e-06
##After we got model coefficients and intercept, We will predicted new variable and create counterfactual variables that will be use in plot.
exercise <- exercise %>%
mutate( "factual_accident_proportion" = (0.313+(0.00093*time)
-(0.002*post)
-(0.045*as.numeric(policy))))%>%
mutate("factual_accident_number" = factual_accident_proportion*(sum_freq))%>%
mutate("counterfact_accident_proportion" = (0.313+(0.00093*time)))%>%
mutate("counterfact_accident_number" =
counterfact_accident_proportion*(sum_freq))
##Final table result
DT::datatable(exercise)
##ggplot for Interrupted time series analysis
ggplot(data = exercise)+
geom_line(aes(x=time,y=factual_accident_number,
col="orange")) +
geom_line(aes(x=time,y=counterfact_accident_number,
col="blue")) +
geom_vline(xintercept = 28,
col ="red",
linetype = "twodash")+
geom_segment(aes(x = 38, y = 600, xend = 38, yend = 737 ),
col = "cadetblue4")+
annotate(geom = "label",
x = 29, y = 900,
label = "Intervention begin",
col = "red")+
annotate(geom = "label",
x = 40, y = 700,
label = "Intervention impact",
col = "cadetblue4",
size = 2.5)+
xlab(label = "Time point") +
ylab(label = "Number of accidents") +
labs(title = "Relationship between number of accidents and time",
color = "Intervention")+
scale_color_discrete(name = "Intervention",
labels = c("without policy (counterfactual)","with policy"))+
theme_bw()+
theme(legend.position = c(0.2,0.85))+
geom_smooth(method = lm,aes(x = time, y=counterfact_accident_number),
size = 1, se = FALSE)+
geom_smooth(method = lm,aes(x = time,y=factual_accident_number),
size = 1, se = FALSE, color = "red")
As plot show that there is relationship of intervention and change in slope of the model, from blue line to red line. And the number of accidents decline after intervention start. We can consider that the gap between two plot is the impact of our intervention.