# import packages
library(tidyverse)
library(broom)
library(rdrobust)
library(rddensity)
library(ggsci)
# data
dat <- read_csv("tutoring_program.csv")
head(dat)
## # A tibble: 6 x 4
## id entrance_exam exit_exam tutoring
## <dbl> <dbl> <dbl> <lgl>
## 1 1 92.4 78.1 FALSE
## 2 2 72.8 58.2 FALSE
## 3 3 53.7 62 TRUE
## 4 4 98.3 67.5 FALSE
## 5 5 69.7 54.1 TRUE
## 6 6 68.1 60.1 TRUE
No statistical test can establish that assignment really follows a rule; researchers must draw on the specific research context to explain why a discontinuity should exist.
Next, check whether the discontinuity is fuzzy.
Here the discontinuity is clearly sharp (non-fuzzy).
ggplot(dat, aes(x = entrance_exam, y = tutoring, color = tutoring)) +
geom_point(size = .5, alpha = .6,
position = position_jitter(width = 0,
height = .2)) +
geom_vline(xintercept = 70, alpha = .5) +
theme_bw()
Fig.1 Fuzzy-discontinuity check
dat %>%
group_by(tutoring, entrance_exam > 70) %>%
summarise(count = n())
## # A tibble: 2 x 3
## # Groups: tutoring [2]
## tutoring `entrance_exam > 70` count
## <lgl> <lgl> <int>
## 1 FALSE TRUE 759
## 2 TRUE FALSE 241
We also need to show that the running variable is not manipulated, i.e. that it remains smooth at the threshold. For example, if teachers routinely bump students who scored 55-59 up to 60 in order to "rescue" them, the comparison no longer captures only the effect of passing: the students just above 60 are now much more able than those just below it (by more than 5 points), because everyone who originally scored 55-59 has been moved above the cutoff and only students scoring 54 or less remain below it.
We can plot it ourselves to check:
ggplot(dat, aes(x = entrance_exam, fill = tutoring)) +
geom_histogram(binwidth = 4, color = "white") +
geom_vline(xintercept = 70, alpha = .8) +
theme_bw()
Fig.2 Continuity check of the running variable
Of course, we can also use the functions built into the rddensity package. The plot shows that the difference at the cutoff is very small and the confidence intervals on the two sides overlap, which suggests that the density of the running variable does not jump at the cutoff.
check_result <- rdplotdensity(
rdd = rddensity(dat$entrance_exam, c = 70),
X = dat$entrance_exam,
type = "both")
Fig.3 Continuity check of the running variable (rdplotdensity)
This is the more formal McCrary density test. It finds no significant jump in the density of the running variable at the cutoff.
test_density <- rddensity(dat$entrance_exam, c = 70)
summary(test_density)
##
## Manipulation testing using local polynomial density estimation.
##
## Number of obs = 1000
## Model = unrestricted
## Kernel = triangular
## BW method = estimated
## VCE method = jackknife
##
## c = 70 Left of c Right of c
## Number of obs 237 763
## Eff. Number of obs 208 577
## Order est. (p) 2 2
## Order bias (q) 3 3
## BW est. (h) 22.444 19.966
##
## Method T P > |T|
## Robust -0.5521 0.5809
## Warning in summary.CJMrddensity(test_density): There are repeated observations.
## Point estimates and standard errors have been adjusted. Use option
## massPoints=FALSE to suppress this feature.
##
## P-values of binomial tests (H0: p=0.5).
##
## Window Length / 2 <c >=c P>|T|
## 0.300 9 11 0.8238
## 0.600 15 16 1.0000
## 0.900 17 19 0.8679
## 1.200 22 22 1.0000
## 1.500 26 29 0.7877
## 1.800 34 35 1.0000
## 2.100 41 44 0.8284
## 2.400 43 51 0.4705
## 2.700 46 56 0.3729
## 3.000 51 61 0.3952
Next, check whether the outcome variable jumps on either side of the cutoff. It looks like individuals scoring \(\leq 70\) score somewhat higher on the exit exam because they receive tutoring.
ggplot(dat, aes(x = entrance_exam, y = exit_exam, color = tutoring)) +
geom_point(alpha = .6) +
geom_vline(xintercept = 70, alpha = .8) +
geom_smooth(data = filter(dat, entrance_exam <= 70), method = "lm") +
geom_smooth(data = filter(dat, entrance_exam > 70), method = "lm") +
theme_bw()
Fig.4 Jump in the exit exam score at the cutoff
First, subtract the threshold from the running variable. This is not strictly necessary, but it makes the model coefficients easier to interpret.
dat_centered <- dat %>%
mutate(entrance_centered = entrance_exam - 70)
Then put the centered running variable and the treatment into a regression model.
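Written out, the specification fitted in the lm() call below is:

\[
\text{exit\_exam}_i = \beta_0 + \beta_1\,(\text{entrance\_exam}_i - 70) + \beta_2\,\text{tutoring}_i + \varepsilon_i
\]

where \(\beta_2\) is the jump in the outcome at the cutoff, i.e. the treatment effect.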
The estimated treatment effect is 10.8 points.
model_simple <- lm(exit_exam ~ entrance_centered + tutoring,
data = dat_centered)
tidy(model_simple)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 59.4 0.442 134. 0.
## 2 entrance_centered 0.510 0.0269 18.9 1.40e-68
## 3 tutoringTRUE 10.8 0.800 13.5 3.12e-38
The regression above uses all of the data, but in an RDD we mainly care about observations near the threshold; people with especially high or especially low scores should not drive the fit. We therefore also need to choose a bandwidth, i.e. how far on either side of the threshold observations are included when estimating the (local) causal effect.
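For example, with a bandwidth of 10 we would refit the same model using only observations within 10 points of the cutoff. A quick sketch of the idea (the helper function further below wraps exactly this step):

model_bw10 <- lm(exit_exam ~ entrance_centered + tutoring,
                 data = filter(dat_centered, abs(entrance_centered) <= 10))
tidy(model_bw10)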
We can also vary the bandwidth and visualize the result:
bandwidth <- 10 # bandwidth for the local estimate
ggplot(dat, aes(x = entrance_exam, y = exit_exam, color = tutoring)) +
geom_point(alpha = .6) +
geom_vline(xintercept = 70, alpha = .8) +
# fit the regressions only on points within the bandwidth
geom_smooth(data = filter(dat,
entrance_exam <= 70,
entrance_exam >= 70 - bandwidth), method = "lm") +
geom_smooth(data = filter(dat,
entrance_exam > 70,
entrance_exam <= 70 + bandwidth), method = "lm") +
theme_bw()
Fig.5 Local linear fits within a bandwidth of 10 around the cutoff
Here we write a function that returns the estimate for any given bandwidth. The figure below shows how the regression-based estimate of the treatment effect changes with the bandwidth.
cal_effect <- function(bw){
bandwidth = bw
model_bw <- lm(exit_exam ~ entrance_centered + tutoring,
data = filter(dat_centered,
entrance_exam >= 70 - bandwidth,
entrance_exam <= 70 + bandwidth))
return(list(bw = bw, effect = model_bw$coefficients[3]))
}
map_dfr(5:30, cal_effect) %>%
ggplot(aes(x = bw, y = effect)) +
geom_point() +
geom_line() +
theme_bw()
Fig.6 Estimated treatment effect as a function of the bandwidth
Unlike linear regression, nonparametric methods do not fit a functional form that we specify in advance; instead they look for a curve that fits the data as closely as possible. Here we use the rdrobust() function:
In the output below:
BW est. (h) is the bandwidth actually used, 9.969, meaning the fit only considers observations within roughly 10 points on either side of the threshold; we can also set the parameter h ourselves.
Kernel is the weighting scheme applied to observations around the threshold when estimating the curves; we can likewise choose its form, see the figure below:
knitr::include_graphics("https://tva1.sinaimg.cn/large/e6c9d24egy1gzxn8qko6ij20re0jm76m.jpg")
Fig.7 Kernel weights
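If we want to override the defaults, rdrobust() accepts the bandwidth and kernel directly. A hypothetical variation (the call below sticks with the defaults):

rdrobust(y = dat$exit_exam, x = dat$entrance_exam, c = 70,
         h = 10,                  # bandwidth of 10 on each side of the cutoff
         kernel = "uniform") %>%  # equal weights instead of the default triangular kernel
  summary()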
rdrobust(y = dat$exit_exam, x = dat$entrance_exam, c = 70) %>%
summary()
## [1] "Mass points detected in the running variable."
## Call: rdrobust
##
## Number of Obs. 1000
## BW type mserd
## Kernel Triangular
## VCE method NN
##
## Number of Obs. 237 763
## Eff. Number of Obs. 144 256
## Order est. (p) 1 1
## Order bias (q) 2 2
## BW est. (h) 9.969 9.969
## BW bias (b) 14.661 14.661
## rho (h/b) 0.680 0.680
## Unique Obs. 155 262
##
## =============================================================================
## Method Coef. Std. Err. z P>|z| [ 95% C.I. ]
## =============================================================================
## Conventional -8.578 1.601 -5.359 0.000 [-11.715 , -5.441]
## Robust - - -4.352 0.000 [-12.101 , -4.587]
## =============================================================================
Note that the estimate is negative because rdrobust reports the jump in the outcome going from just below to just above the cutoff; since tutoring is received below the cutoff here, the implied effect of tutoring is about 8.6 points. We can plot the nonparametric fit (the points show local means):
rdplot(y = dat$exit_exam, x = dat$entrance_exam, c = 70)
## [1] "Mass points detected in the running variable."
Fuzzy regression discontinuity estimation is essentially an instrumental variables (IV) approach.
df <- read_csv("tutoring_program_fuzzy.csv")
head(df)
## # A tibble: 6 x 5
## id entrance_exam tutoring tutoring_text exit_exam
## <dbl> <dbl> <lgl> <chr> <dbl>
## 1 1 92.4 FALSE No tutor 78.1
## 2 2 72.8 FALSE No tutor 58.2
## 3 3 53.7 TRUE Tutor 62.0
## 4 4 98.3 FALSE No tutor 67.5
## 5 5 69.7 TRUE Tutor 54.1
## 6 6 68.1 TRUE Tutor 60.1
We can judge this from a plot: there is no clear-cut boundary that determines who receives the treatment.
ggplot(df, aes(x = entrance_exam, y = tutoring_text, color = entrance_exam <= 70)) +
# Make points small and semi-transparent since there are lots of them
geom_point(size = 1.5, alpha = 0.5,
position = position_jitter(width = 0, height = 0.25, seed = 1234)) +
# Add vertical line
geom_vline(xintercept = 70) +
# Add labels
labs(x = "Entrance exam score", y = "Participated in tutoring program")
People scoring 70 or below are much more likely to receive tutoring.
df %>%
group_by(tutoring, entrance_exam <= 70) %>%
summarize(count = n()) %>%
group_by(tutoring) %>%
mutate(prop = count / sum(count))
## # A tibble: 4 x 4
## # Groups: tutoring [2]
## tutoring `entrance_exam <= 70` count prop
## <lgl> <lgl> <int> <dbl>
## 1 FALSE FALSE 646 0.947
## 2 FALSE TRUE 36 0.0528
## 3 TRUE FALSE 116 0.365
## 4 TRUE TRUE 202 0.635
The instrument Z used here is whether an individual's entrance score falls at or below the threshold. This clearly satisfies the requirements of an instrument: crossing the threshold strongly shifts the probability of receiving tutoring (relevance), and near the cutoff it affects the exit exam only through tutoring (exclusion). As before, we first construct the relevant variables: the entrance score centered at the threshold, and the instrument itself.
tutoring_centered <- df %>%
mutate(entrance_centered = entrance_exam - 70,
below_cutoff = entrance_exam <= 70)
head(tutoring_centered)
## # A tibble: 6 x 7
## id entrance_exam tutoring tutoring_text exit_exam entrance_centered
## <dbl> <dbl> <lgl> <chr> <dbl> <dbl>
## 1 1 92.4 FALSE No tutor 78.1 22.4
## 2 2 72.8 FALSE No tutor 58.2 2.77
## 3 3 53.7 TRUE Tutor 62.0 -16.3
## 4 4 98.3 FALSE No tutor 67.5 28.3
## 5 5 69.7 TRUE Tutor 54.1 -0.288
## 6 6 68.1 TRUE Tutor 60.1 -1.93
## # … with 1 more variable: below_cutoff <lgl>
Then estimate with 2SLS; the model is as follows.
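A standard way to write the fuzzy-RD 2SLS model, sketched here with centered score \(\tilde{X}_i = \text{entrance\_exam}_i - 70\) and instrument \(Z_i = \mathbb{1}[\text{entrance\_exam}_i \le 70]\), is:

\[
\text{first stage: } \quad \text{tutoring}_i = \gamma_0 + \gamma_1 \tilde{X}_i + \gamma_2 Z_i + \nu_i
\]

\[
\text{second stage: } \quad \text{exit\_exam}_i = \beta_0 + \beta_1 \tilde{X}_i + \beta_2\,\widehat{\text{tutoring}}_i + \varepsilon_i
\]

where \(\widehat{\text{tutoring}}_i\) is the fitted value from the first stage and \(\beta_2\) is the effect for compliers at the cutoff.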
Note that we again need to set a bandwidth for estimation. For the compliers, the estimated causal effect is 9.74 points.
library(estimatr)
bandwidth <- 10
model_fuzzy <- iv_robust(
exit_exam ~ entrance_centered + tutoring | entrance_centered + below_cutoff,
data = filter(tutoring_centered,
entrance_centered >= -1*bandwidth & entrance_centered <= bandwidth)
)
tidy(model_fuzzy)
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 60.1413558 1.01765573 59.097939 9.746624e-200 58.1407338
## 2 entrance_centered 0.4366281 0.09929619 4.397229 1.407213e-05 0.2414205
## 3 tutoringTRUE 9.7410444 1.91184891 5.095091 5.384163e-07 5.9825170
## conf.high df outcome
## 1 62.1419778 400 exit_exam
## 2 0.6318357 400 exit_exam
## 3 13.4995718 400 exit_exam
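To make the IV logic explicit, the two stages could also be run by hand. This is only a sketch; the manually computed second-stage standard errors are not valid, which is why iv_robust is used above:

dat_bw <- tutoring_centered %>%
  filter(abs(entrance_centered) <= bandwidth) %>%
  mutate(tutoring_num = as.numeric(tutoring))
# first stage: predict treatment from the centered score and the cutoff indicator
first_stage <- lm(tutoring_num ~ entrance_centered + below_cutoff, data = dat_bw)
# second stage: replace tutoring with its fitted probability from the first stage
dat_bw <- mutate(dat_bw, tutoring_hat = fitted(first_stage))
second_stage <- lm(exit_exam ~ entrance_centered + tutoring_hat, data = dat_bw)
tidy(second_stage)

The point estimate on tutoring_hat should match the tutoringTRUE estimate from iv_robust above.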
As with the sharp discontinuity, we use the rdrobust command; the only change is that the treatment variable is passed to the fuzzy argument.
rdrobust(y = df$exit_exam, x = df$entrance_exam,
c = 70,
fuzzy = df$tutoring) %>% # note: pass the treatment to the fuzzy argument
summary()
## Call: rdrobust
##
## Number of Obs. 1000
## BW type mserd
## Kernel Triangular
## VCE method NN
##
## Number of Obs. 238 762
## Eff. Number of Obs. 170 347
## Order est. (p) 1 1
## Order bias (q) 2 2
## BW est. (h) 12.985 12.985
## BW bias (b) 19.733 19.733
## rho (h/b) 0.658 0.658
## Unique Obs. 238 762
##
## =============================================================================
## Method Coef. Std. Err. z P>|z| [ 95% C.I. ]
## =============================================================================
## Conventional 9.683 1.893 5.116 0.000 [5.973 , 13.393]
## Robust - - 4.258 0.000 [5.210 , 14.095]
## =============================================================================