模拟数据生成

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

第一步:模拟参数设定


# Fix the RNG seed so every run reproduces the same simulated sample.
set.seed(42)

# --- Sample size and running variable (GPA) ---
# Number of simulated students.
n_students <- 2000
# Mean and standard deviation of the simulated GPA distribution.
mean_gpa <- 3.2
sd_gpa <- 0.4
# GPA cutoff for receiving the scholarship.
gpa_cutoff <- 3.5

# --- Outcome (annual income) ---
# Baseline annual income.
base_income <- 50000
# Change in annual income per one-point increase in GPA.
gpa_effect_on_income <- 8000
# Extra annual income from receiving the scholarship: the true treatment
# effect built into the simulation.
scholarship_effect <- 5000
# Standard deviation of the random noise added to income.
noise_sd <- 4000

第二步:生成核心变量


# Running variable: draw each student's GPA from a normal distribution, then
# clamp the draws to the [0, 4.0] range.
gpa <- rnorm(n_students, mean_gpa, sd_gpa)
gpa <- pmin(pmax(gpa, 0), 4.0)

# Treatment indicator: 1 when GPA is at or above the cutoff, 0 otherwise.
scholarship <- as.numeric(gpa >= gpa_cutoff)

第三步:生成结果变量


# Outcome variable: future income is the baseline income plus a linear GPA
# term, a jump for scholarship recipients, and zero-mean normal noise.
future_income <- base_income +
  gpa * gpa_effect_on_income +
  scholarship * scholarship_effect +
  rnorm(n_students, mean = 0, sd = noise_sd)

第四步:整合为数据框


# Assemble the simulated variables into the final data frame.
rdd_data <- data.frame(
  # seq_len() yields an empty id vector when n_students is 0, whereas
  # 1:n_students would yield c(1, 0) and break the column-length check.
  student_id = seq_len(n_students),
  gpa = gpa,
  # Store scholarship status as a factor so it maps to discrete plot colors.
  scholarship = as.factor(scholarship),
  future_income = future_income
)

# Show the first few rows of the data.
print(head(rdd_data))
  student_id      gpa scholarship future_income
1          1 3.748383           1      85989.38
2          2 2.974121           0      72681.27
3          3 3.345251           0      69863.07
4          4 3.453145           0      69598.34
5          5 3.361707           0      71726.43
6          6 3.157550           0      76723.75
# 查看数据的基本摘要
# summary(rdd_data)

第五步:数据可视化


# Scatter plot of future income against GPA, colored by scholarship status,
# to show the discontinuity at the cutoff.
rdd_plot <- ggplot(
  rdd_data,
  aes(x = gpa, y = future_income, color = scholarship)
) +
  # Semi-transparent points.
  geom_point(alpha = 0.6) +
  # Dashed vertical line at the cutoff. `linewidth` replaces the `size`
  # argument, which was deprecated for lines in ggplot2 3.4.0.
  geom_vline(
    xintercept = gpa_cutoff,
    linetype = "dashed",
    color = "red",
    linewidth = 1
  ) +
  labs(
    title = "GPA 对未来收入的影响 (RDD 模拟)",
    subtitle = paste("在 GPA =", gpa_cutoff, "处存在明显的断点"),
    x = "学生平均绩点 (GPA)",
    y = "毕业五年后年收入",
    color = "是否获得奖学金"
  ) +
  # Custom colors and legend labels for the two scholarship levels.
  scale_color_manual(
    values = c("0" = "blue", "1" = "orange"),
    labels = c("否", "是")
  ) +
  # NOTE(review): "Songti SC" is presumably chosen so the Chinese labels
  # render; confirm the font is installed on the rendering machine.
  theme(text = element_text(family = "Songti SC"))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
# Render the plot.
print(rdd_plot)
Figure 1