模拟数据生成

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)

第一步：模拟参数设定

# Fix the RNG seed so every run reproduces the same simulated sample.
set.seed(42)

# Sample size and GPA distribution.
n_students <- 2000  # number of simulated students
mean_gpa <- 3.2     # mean of the GPA distribution
sd_gpa <- 0.4       # standard deviation of the GPA distribution
gpa_cutoff <- 3.5   # GPA threshold at which the scholarship is awarded

# Income model.
base_income <- 50000          # baseline annual income
gpa_effect_on_income <- 8000  # change in annual income per 1 point of GPA
scholarship_effect <- 5000    # extra annual income from the scholarship
                              # (the true treatment effect)
noise_sd <- 4000              # standard deviation of the income noise

第二步：生成核心变量

# Running variable: draw one GPA per student from a normal distribution.
gpa <- rnorm(n = n_students, mean = mean_gpa, sd = sd_gpa)
# Clamp the draws so every GPA lies in the [0, 4.0] interval.
gpa <- pmin(pmax(gpa, 0), 4.0)

# Treatment indicator: 1 when the GPA is at or above the cutoff, otherwise 0.
scholarship <- as.numeric(gpa >= gpa_cutoff)

第三步：生成结果变量

# Outcome variable: future income is the baseline income plus the GPA
# contribution, plus the scholarship contribution, plus random noise.
future_income <- local({
  # One noise draw per student, centered on zero.
  income_noise <- rnorm(n_students, mean = 0, sd = noise_sd)
  # Deterministic part of the income model.
  expected_income <- base_income +
    gpa * gpa_effect_on_income +
    scholarship * scholarship_effect
  expected_income + income_noise
})

第四步：整合为数据框

# Assemble the simulated variables into one data frame, one row per student.
rdd_data <- data.frame(
  # seq_len() gives an empty id vector when n_students is 0, whereas
  # 1:n_students would count down and produce c(1, 0).
  student_id = seq_len(n_students),
  gpa = gpa,
  # Stored as a factor so it acts as a discrete group when plotting. The
  # levels are pinned to 0 and 1 so both groups exist even if one of them
  # happens to be empty in a given draw.
  scholarship = factor(scholarship, levels = c(0, 1)),
  future_income = future_income
)

# Show the first rows as a quick sanity check.
print(head(rdd_data))

  student_id      gpa scholarship future_income
1          1 3.748383           1      85989.38
2          2 2.974121           0      72681.27
3          3 3.345251           0      69863.07
4          4 3.453145           0      69598.34
5          5 3.361707           0      71726.43
6          6 3.157550           0      76723.75

# Inspect a basic summary of the data (call left disabled)
# summary(rdd_data)

第五步：数据可视化

# Scatter plot of simulated income against GPA, colored by scholarship
# status, with a dashed vertical line marking the GPA cutoff.
rdd_plot <- ggplot(
  rdd_data,
  aes(x = gpa, y = future_income, color = scholarship)
) +
  # Semi-transparent points so overlapping observations remain visible.
  geom_point(alpha = 0.6) +
  # Cutoff marker. `linewidth` replaces `size`, which ggplot2 deprecated for
  # line geoms in 3.4.0 and which triggers a warning when used here.
  geom_vline(
    xintercept = gpa_cutoff,
    linetype = "dashed",
    color = "red",
    linewidth = 1
  ) +
  labs(
    title = "GPA 对未来收入的影响 (RDD 模拟)",
    subtitle = paste("在 GPA =", gpa_cutoff, "处存在明显的断点"),
    x = "学生平均绩点 (GPA)",
    y = "毕业五年后年收入",
    color = "是否获得奖学金"
  ) +
  # Fixed colors per scholarship level, with custom legend labels.
  scale_color_manual(
    values = c("0" = "blue", "1" = "orange"),
    labels = c("否", "是")
  ) +
  # NOTE(review): presumably chosen so the CJK labels render; the font
  # family has to be available on the rendering machine - confirm.
  theme(text = element_text(family = "Songti SC"))

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# Render the plot
print(rdd_plot)