library(tidyverse)
Examine whether there is a significant difference in graduate admission rates between males and females.
datasets::UCBAdmissions
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 512 89
## Rejected 313 19
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 353 17
## Rejected 207 8
##
## , , Dept = C
##
## Gender
## Admit Male Female
## Admitted 120 202
## Rejected 205 391
##
## , , Dept = D
##
## Gender
## Admit Male Female
## Admitted 138 131
## Rejected 279 244
##
## , , Dept = E
##
## Gender
## Admit Male Female
## Admitted 53 94
## Rejected 138 299
##
## , , Dept = F
##
## Gender
## Admit Male Female
## Admitted 22 24
## Rejected 351 317
ucb <- UCBAdmissions %>%
as_tibble()
# Measure of association: Pr(Admitted | Male) vs Pr(Admitted | Female)
tab <- ucb %>%
reframe(sum = sum(n), .by = c(Gender, Admit)) %>%
pivot_wider(names_from = Gender, values_from = sum)
tab
## # A tibble: 2 × 3
## Admit Male Female
## <chr> <dbl> <dbl>
## 1 Admitted 1198 557
## 2 Rejected 1493 1278
# Collapse across departments
overall <- margin.table(UCBAdmissions, c(1, 2))
overall
## Gender
## Admit Male Female
## Admitted 1198 557
## Rejected 1493 1278
# Compute admission rates (Admit within Gender)
prop.table(overall, 2)
## Gender
## Admit Male Female
## Admitted 0.4451877 0.3035422
## Rejected 0.5548123 0.6964578
🔹 Interpretation: Overall, 44.5% of men vs. 30.4% of women were admitted, which looks like bias against women.
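As a quick check of this marginal association (a sketch, not part of the original code, using the collapsed table computed above):
# Chi-squared test of independence on the collapsed Admit x Gender table
chisq.test(overall)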
Examine the association between admission and sex within each department and draw a conclusion.
dept_tab <- ucb %>%
reframe(sum = sum(n), .by = c(Dept, Admit, Gender)) %>%
pivot_wider(names_from = Admit, values_from = sum) %>%
group_by(Dept)
dept_tab
## # A tibble: 12 × 4
## # Groups: Dept [6]
## Dept Gender Admitted Rejected
## <chr> <chr> <dbl> <dbl>
## 1 A Male 512 313
## 2 A Female 89 19
## 3 B Male 353 207
## 4 B Female 17 8
## 5 C Male 120 205
## 6 C Female 202 391
## 7 D Male 138 279
## 8 D Female 131 244
## 9 E Male 53 138
## 10 E Female 94 299
## 11 F Male 22 351
## 12 F Female 24 317
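It also helps to look at the admission rate within each department. This sketch is not in the original code; it uses prop.table() on the full three-way table:
# Admission rate by Gender within each Dept (rows: Gender, columns: Dept);
# the female rate is higher than the male rate in four of the six departments.
round(prop.table(UCBAdmissions, margin = c(2, 3))["Admitted", , ], 3)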
set.seed(1000)
n <- 1000
dta <- tibble::tibble(id = 1:n,
Y0 = rnorm(n = 1000),
tao = -0.5 + Y0,
Y1 = Y0 + tao)
Causal effect from the data:
CE <- mean(dta$Y1) - mean(dta$Y0)
Manually, using truncated-normal means:
EY1 <- (2*dnorm(x = 0.5)/pnorm(q = .5, lower.tail = F) - 0.5)  # E[Y1 | Y0 > 0.5]
EY2 <- -(dnorm(x = 0.5)/pnorm(q = .5, lower.tail = T))         # E[Y0 | Y0 <= 0.5]
tau_true <- EY1 - EY2
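A quick Monte Carlo check of tau_true (a sketch, not part of the original solution): tau_true is the expected value of the naive difference in means under self-selection on tao, i.e. E[Y1 | Y0 > 0.5] - E[Y0 | Y0 <= 0.5].
# With Y0 ~ N(0, 1) and Y1 = 2*Y0 - 0.5, approximate both truncated means by simulation
y0_mc <- rnorm(1e6)
mean(2 * y0_mc[y0_mc > 0.5] - 0.5) - mean(y0_mc[y0_mc <= 0.5])  # should be close to 2.29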
dta <- dta %>%
mutate(A1 = if_else(tao > 0, 1, 0),
obsY = A1*Y1 + (1-A1)*Y0)
trt_effect <- dta %>%
group_by(A1) %>%
summarize(mean = mean(obsY)) %>%
mutate(tau = mean[2] - mean[1])
trt_effect
## # A tibble: 2 × 3
## A1 mean tau
## <dbl> <dbl> <dbl>
## 1 0 -0.501 2.20
## 2 1 1.70 2.20
Determine the effect of a randomly assigned treatment.
set.seed(1000)
dta1 <- tibble::tibble(
id = 1:n,
Y0 = rnorm(n = 1000),
tao = -0.5 + Y0,
Y1 = Y0 + tao,
A2 = rbinom(n = 1000, size = 1, prob = 0.5),
Y = A2 * Y1 + (1 - A2) * Y0
)
trt_effect1 <- dta1 %>%
group_by(A2) %>%
summarize(mean = mean(Y)) %>%
mutate(tau = mean[2] - mean[1])
trt_effect1
## # A tibble: 2 × 3
## A2 mean tau
## <int> <dbl> <dbl>
## 1 0 -0.0128 -0.512
## 2 1 -0.524 -0.512
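Sanity check (a sketch, not in the original code): under randomization the difference in means targets the average treatment effect E[Y1 - Y0] = E[tao] = -0.5, which is what the -0.512 above is estimating.
mean(dta1$Y1 - dta1$Y0)  # sample average treatment effect, close to -0.5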
Below is the solution from the submitted PDF.
library(MASS)
library(Matching)
data("lalonde")
head(lalonde)
## age educ black hisp married nodegr re74 re75 re78 u74 u75 treat
## 1 37 11 1 0 1 1 0 0 9930.05 1 1 1
## 2 22 9 0 1 0 1 0 0 3595.89 1 1 1
## 3 30 12 1 0 0 0 0 0 24909.50 1 1 1
## 4 27 11 1 0 0 1 0 0 7506.15 1 1 1
## 5 33 8 1 0 0 1 0 0 289.79 1 1 1
## 6 22 9 1 0 0 1 0 0 4056.49 1 1 1
m1 <- lm(re78 ~ treat + age + educ +
black + hisp + married + nodegr +
re74 + re75 + u74 + u75, data = lalonde)
summary(m1)
##
## Call:
## lm(formula = re78 ~ treat + age + educ + black + hisp + married +
## nodegr + re74 + re75 + u74 + u75, data = lalonde)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9612 -4355 -1572 3054 53119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.567e+02 3.522e+03 0.073 0.94193
## treat 1.671e+03 6.411e+02 2.606 0.00948 **
## age 5.357e+01 4.581e+01 1.170 0.24284
## educ 4.008e+02 2.288e+02 1.751 0.08058 .
## black -2.037e+03 1.174e+03 -1.736 0.08331 .
## hisp 4.258e+02 1.565e+03 0.272 0.78562
## married -1.463e+02 8.823e+02 -0.166 0.86835
## nodegr -1.518e+01 1.006e+03 -0.015 0.98797
## re74 1.234e-01 8.784e-02 1.405 0.16079
## re75 1.974e-02 1.503e-01 0.131 0.89554
## u74 1.380e+03 1.188e+03 1.162 0.24590
## u75 -1.071e+03 1.025e+03 -1.045 0.29651
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6517 on 433 degrees of freedom
## Multiple R-squared: 0.05822, Adjusted R-squared: 0.0343
## F-statistic: 2.433 on 11 and 433 DF, p-value: 0.005974
coef(m1)["treat"]
## treat
## 1670.709
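A 95% confidence interval for the adjusted treatment effect can be read off the same fit (a sketch, not part of the submitted code):
confint(m1, parm = "treat", level = 0.95)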
dat1 <- UCBAdmissions %>%
as_tibble()
tab <- dat1 %>%
reframe(sum = sum(n), .by = c(Admit, Gender)) %>%
pivot_wider(names_from = Admit, values_from = sum)
a <- tab$Admitted[tab$Gender == "Male"]
b <- tab$Rejected[tab$Gender == "Male"]
c <- tab$Admitted[tab$Gender == "Female"]
d <- tab$Rejected[tab$Gender == "Female"]
n_m <- a + b
n_f <- c + d
p_m <- a / (a + b)
f_m <- c / (c + d)
# risk difference (RD)
RD <- p_m - f_m
RD
## [1] 0.1416454
SE_RD <- sqrt(p_m*(1-p_m)/n_m + f_m*(1-f_m)/n_f)
CI_RD <- RD + c(-1.96, 1.96)*SE_RD
CI_RD
## [1] 0.1134464 0.1698444
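As a cross-check (not in the original), prop.test() without continuity correction produces essentially the same Wald-type interval for the difference in admission rates:
prop.test(x = c(a, c), n = c(n_m, n_f), correct = FALSE)$conf.int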
# odds ratio (OR)
#OR <- (a/b)/(c/d)
OR <- (p_m / (1 - p_m))/ (f_m / (1 - f_m))
OR
## [1] 1.84108
SE_logOR <- sqrt(1/a + 1/b + 1/c + 1/d)
CI_OR <- exp(log(OR) + c(-1.96, 1.96)*SE_logOR)
CI_OR
## [1] 1.624373 2.086698
# risk ratio (RR)
RR <- p_m / f_m
RR
## [1] 1.466642
SE_logRR <- sqrt((1 - p_m)/(a) + (1 - f_m)/(c)) # delta method
CI_RR <- exp(log(RR) + c(-1.96, 1.96)*SE_logRR)
CI_RR
## [1] 1.352348 1.590595
dept_tab <- dat1 %>%
reframe(sum = sum(n), .by = c(Dept, Admit, Gender)) %>%
pivot_wider(names_from = Admit, values_from = sum) %>%
group_by(Dept)
a1 <- dept_tab$Admitted[dept_tab$Gender == "Male"]
b1 <- dept_tab$Rejected[dept_tab$Gender == "Male"]
c1 <- dept_tab$Admitted[dept_tab$Gender == "Female"]
d1 <- dept_tab$Rejected[dept_tab$Gender == "Female"]
n_m1 <- a1 + b1
n_f1 <- c1 + d1
p_m1 <- a1 / n_m1
p_f1 <- c1 / n_f1
# Risk difference
RD1 <- p_m1 - p_f1
SE_RD1 <- sqrt(p_m1*(1-p_m1)/n_m1 + p_f1*(1-p_f1)/n_f1)
L_RD1 <- RD1 - 1.96*SE_RD1
U_RD1 <- RD1 + 1.96*SE_RD1
# Odds ratio
OR1 <- (p_m1/(1-p_m1))/(p_f1/(1-p_f1))
SE_logOR1 <- sqrt(1/a1 + 1/b1 + 1/c1 + 1/d1)
L_OR1 <- exp(log(OR1) - 1.96*SE_logOR1)
U_OR1 <- exp(log(OR1) + 1.96*SE_logOR1)
# Risk ratio
RR1 <- p_m1/p_f1
SE_logRR1 <- sqrt((1-p_m1)/a1 + (1-p_f1)/c1)
L_RR1 <- exp(log(RR1) - 1.96*SE_logRR1)
U_RR1 <- exp(log(RR1) + 1.96*SE_logRR1)
result <- tibble(
Dept = unique(dept_tab$Dept),
RD = RD1,
RD_CI = sprintf("(%.3f, %.3f)", L_RD1, U_RD1),
OR = OR1,
OR_CI = sprintf("(%.3f, %.3f)", L_OR1, U_OR1),
RR = RR1,
RR_CI = sprintf("(%.3f, %.3f)", L_RR1, U_RR1)
)
result
## # A tibble: 6 × 7
## Dept RD RD_CI OR OR_CI RR RR_CI
## <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
## 1 A -0.203 (-0.283, -0.124) 0.349 (0.209, 0.584) 0.753 (0.680, 0.834)
## 2 B -0.0496 (-0.237, 0.138) 0.803 (0.340, 1.892) 0.927 (0.703, 1.222)
## 3 C 0.0286 (-0.036, 0.093) 1.13 (0.855, 1.502) 1.08 (0.905, 1.299)
## 4 D -0.0184 (-0.084, 0.048) 0.921 (0.686, 1.237) 0.947 (0.780, 1.150)
## 5 E 0.0383 (-0.038, 0.115) 1.22 (0.825, 1.809) 1.16 (0.869, 1.549)
## 6 F -0.0114 (-0.048, 0.025) 0.828 (0.455, 1.506) 0.838 (0.479, 1.466)
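A standard way to summarize the department-adjusted association is the Cochran-Mantel-Haenszel test, which pools the six 2 x 2 tables. This is a sketch, not part of the submitted solution:
# UCBAdmissions is already a 2 x 2 x 6 array (Admit x Gender x Dept), the layout mantelhaen.test() expects
mantelhaen.test(UCBAdmissions)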
set.seed(1000)
n <- 1000
dat2 <- tibble::tibble(id = 1:n,
Y0 = rnorm(n = 1000),
tao = -0.5 + Y0,
Y1 = Y0 + tao)
# Treatment effect (manual calculation)
EY1 <- ((2*dnorm(x = 0.5))/pnorm(q = .5, lower.tail = F)) - 0.5
EY2 <- -(dnorm(x = 0.5)/pnorm(q = .5, lower.tail = T))
tau_true <- EY1 - EY2
tau_true
## [1] 2.291316
dat3 <- dat2 %>%
mutate(A1 = if_else(tao > 0, 1, 0),
Y = A1*Y1 + (1-A1)*Y0)
trt_effect <- dat3 %>%
group_by(A1) %>%
summarize(mean = mean(Y)) %>%
mutate(tau = mean[2] - mean[1])
trt_effect
## # A tibble: 2 × 3
## A1 mean tau
## <dbl> <dbl> <dbl>
## 1 0 -0.501 2.20
## 2 1 1.70 2.20
dat4 <- dat2 %>%
mutate(
A2 = rbinom(n = 1000, size = 1, prob = 0.5),
Y_2 = A2 * Y1 + (1 - A2) * Y0
)
dat4 %>%
group_by(A2) %>%
summarize(mean_2 = mean(Y_2)) %>%
mutate(tau_2 = mean_2[2] - mean_2[1])
## # A tibble: 2 × 3
## A2 mean_2 tau_2
## <int> <dbl> <dbl>
## 1 0 -0.0128 -0.512
## 2 1 -0.524 -0.512
dat4
## # A tibble: 1,000 × 6
## id Y0 tao Y1 A2 Y_2
## <int> <dbl> <dbl> <dbl> <int> <dbl>
## 1 1 -0.446 -0.946 -1.39 1 -1.39
## 2 2 -1.21 -1.71 -2.91 0 -1.21
## 3 3 0.0411 -0.459 -0.418 0 0.0411
## 4 4 0.639 0.139 0.779 0 0.639
## 5 5 -0.787 -1.29 -2.07 1 -2.07
## 6 6 -0.385 -0.885 -1.27 1 -1.27
## 7 7 -0.476 -0.976 -1.45 1 -1.45
## 8 8 0.720 0.220 0.940 1 0.940
## 9 9 -0.0185 -0.519 -0.537 0 -0.0185
## 10 10 -1.37 -1.87 -3.25 1 -3.25
## # ℹ 990 more rows
f_tab <- function(a, b, c, d) {
n1 <- a + b
n2 <- c + d
p1 <- a / n1
p2 <- c / n2
# Risk difference
RD <- p1 - p2
SE_RD <- sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n2)
CI_RD <- c(RD - 1.96*SE_RD, RD + 1.96*SE_RD)
# Risk ratio
RR <- p1 / p2
SE_logRR <- sqrt((1-p1)/a + (1-p2)/c)
CI_RR <- exp(log(RR) + c(-1.96, 1.96) * SE_logRR)
# Odds ratio
OR <- (a/b)/(c/d)
SE_logOR <- sqrt(1/a + 1/b + 1/c + 1/d)
CI_OR <- exp(log(OR) + c(-1.96, 1.96) * SE_logOR)
return(list(
RD = RD, RD_CI = CI_RD,
RR = RR, RR_CI = CI_RR,
OR = OR, OR_CI = CI_OR
))
}
f_tab(1198, 1493, 557, 1278)
## $RD
## [1] 0.1416454
##
## $RD_CI
## [1] 0.1134464 0.1698444
##
## $RR
## [1] 1.466642
##
## $RR_CI
## [1] 1.352348 1.590595
##
## $OR
## [1] 1.84108
##
## $OR_CI
## [1] 1.624373 2.086698
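The same helper can be reused for the department-level tables via the counts a1, b1, c1, d1 computed earlier (a sketch, not shown in the original document):
dept_results <- purrr::pmap(list(a1, b1, c1, d1), f_tab)
names(dept_results) <- unique(dept_tab$Dept)
dept_results[["A"]]  # e.g. Department A, where the direction of the association reverses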
moa <- function(df){
# wrangling
trdf <- df %>%
data.frame() %>%
dplyr::select(Admitted, Rejected) %>%
as.matrix()
# removing column names
colnames(trdf) <- NULL
a <- trdf[1, 1]
b <- trdf[1, 2]
c <- trdf[2, 1]
d <- trdf[2, 2]
# RD
#rd <- a/(a+b) - c/(c+d)
# RR
#rr <- (a/(a+b))/(c/(c+d))
# OR
or <- (a*d)/(b*c)
# SE of log(OR) = sqrt(1/a + 1/b + 1/c + 1/d)
se_or <- sum(1/c(a, b, c, d)) %>% sqrt()
# confidence limits (built on the log-odds scale, then exponentiated)
ci <- exp(log(or) + c(-1.96, 1.96)*se_or)
# main output
#return(list(OR = or, Lower_bound = ci[1], Upper_bound = ci[2]))
#if(or == 1 || rr == 1 || rd == 0){
if(dplyr::between(x = 1, left = ci[1], right = ci[2])){
#print("There is no association between admission rates and gender within the department, at 5% level of significance.")
return(
list(
Conclusion = "There is no association between admission rates and gender within the department, at 5% level of significance.",
OR = or,
Lower_bound = ci[1],
Upper_bound = ci[2]
)
)
}else{
return(list(Conclusion = "There is significant association between admission rates and gender within the department, at 5% level of significance.", OR = or, Lower_bound = ci[1], Upper_bound = ci[2]))
#print("There is significant association between admission rates and gender within the department, at 5% level of significance.")
}
}
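The moa() helper is defined above but never called in this document. A minimal sketch of applying it department by department (assuming dept_tab from earlier, with the Male row listed before the Female row within each department):
split(as.data.frame(dept_tab), dept_tab$Dept) %>%
lapply(moa)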
🎯 Objective: test whether participation in a job training program (treat) affects post-program earnings (re78), using the lalonde data from the Matching package:
library(Matching)
data("lalonde")
str(lalonde)
## 'data.frame': 445 obs. of 12 variables:
## $ age : int 37 22 30 27 33 22 23 32 22 33 ...
## $ educ : int 11 9 12 11 8 9 12 11 16 12 ...
## $ black : int 1 0 1 1 1 1 1 1 1 0 ...
## $ hisp : int 0 1 0 0 0 0 0 0 0 0 ...
## $ married: int 1 0 0 0 0 0 0 0 0 1 ...
## $ nodegr : int 1 1 0 1 1 1 0 1 0 0 ...
## $ re74 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ re75 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ re78 : num 9930 3596 24910 7506 290 ...
## $ u74 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ u75 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ treat : int 1 1 1 1 1 1 1 1 1 1 ...
lalonde %>%
count(treat)
## treat n
## 1 0 260
## 2 1 185
t1 <- t.test(re78 ~ treat,
var.equal = T, data = lalonde)
##OR
t1a <- t.test(lalonde$re78[lalonde$treat == 1],
lalonde$re78[lalonde$treat == 0],
var.equal = T)
t1
##
## Two Sample t-test
##
## data: re78 by treat
## t = -2.8353, df = 443, p-value = 0.004788
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -3038.1113 -550.5749
## sample estimates:
## mean in group 0 mean in group 1
## 4554.802 6349.145
names(t1)
## [1] "statistic" "parameter" "p.value" "conf.int" "estimate"
## [6] "null.value" "stderr" "alternative" "method" "data.name"
The t-test above compares mean post-program earnings (re78) between the treated and control groups, assuming equal variances (var.equal = TRUE).
🔹 Null Hypothesis (H₀): there is no relationship between Y and A (the treatment has no effect).
set.seed(30)
fn_FR <- function(Y, A, nsim = 10000) {
res <- list() #store all simulated t-statistics from permuted samples
n <- length(A)
t_obs <- t.test(Y ~ A, var.equal = TRUE)$statistic # Observed t-statistic
for (i in 1:nsim) { # Permutation under the null
A_perm <- sample(A, size = n, replace = FALSE)
res[[i]] <- t.test(Y ~ A_perm, var.equal = TRUE)$statistic
}
# Converts the list of t-statistics into a tidy tibble.
out <- tibble(stat = unlist(res))
# Compute two-sided p-value
p <- out %>%
reframe(p = mean(abs(stat) > abs(t_obs)))
return(list(out, p))
}
# Run Fisher Randomization Test
res <- fn_FR(Y = lalonde$re78, A = lalonde$treat)
res[[2]]
## # A tibble: 1 × 1
## p
## <dbl>
## 1 0.0051
res[[1]] %>%
ggplot(aes(stat)) +
geom_histogram(aes(y = after_stat(density)), fill = "skyblue", color = "white") +
geom_density(aes(stat), color = "red", bw = 0.75) +
labs(title = "Permutation Distribution of t-statistics",
x = "t-statistic", y = "Density")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The red curve shows the estimated density of the permuted t-statistics.
If the observed t-statistic lies far in either tail of this distribution, the observed difference is unlikely under the null of no treatment effect.
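To make that comparison visible (a sketch, not in the original code), the observed t-statistic can be recomputed the same way as inside fn_FR() and marked on the permutation distribution:
t_obs <- t.test(re78 ~ treat, var.equal = TRUE, data = lalonde)$statistic
res[[1]] %>%
ggplot(aes(stat)) +
geom_histogram(aes(y = after_stat(density)), fill = "skyblue", color = "white") +
geom_vline(xintercept = c(-1, 1) * abs(t_obs), linetype = "dashed", color = "red") +
labs(title = "Permutation distribution with observed |t| marked",
x = "t-statistic", y = "Density")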
set.seed(30)
fn_FR_residual <- function(Y, X, A, nsim = 10000) {
res <- list() # store permuted t-statistics
n <- length(A)
mod_null <- lm(Y ~ ., data = X)
resid_null <- residuals(mod_null)
# Observed t-statistic: residuals ~ actual treatment
t_obs <- t.test(resid_null ~ A, var.equal = TRUE)$statistic
# Permutation loop
for (i in 1:nsim) {
A_perm <- sample(A, size = n, replace = FALSE)
res[[i]] <- t.test(resid_null ~ A_perm, var.equal = TRUE)$statistic
}
out <- tibble(stat = unlist(res))
p <- out %>% reframe(p = mean(abs(stat) >= abs(t_obs)))
return(list(out, p))
}
X <- lalonde %>% dplyr::select(-re78, -treat) # covariates only
res1 <- fn_FR_residual(Y = lalonde$re78, X = X, A = lalonde$treat, nsim = 10000)
# Empirical p-value from the residual-based randomization test
res1[[2]]
Three test statistics are used for the approximate and permutation tests below:
D = t-test assuming equal variance
S = t-test assuming unequal variance (Welch)
W = Wilcoxon rank-sum test (nonparametric)
FRT_f <- function(Y, A) {
y1 <- Y[A == 1]
y0 <- Y[A == 0]
selc <- c("statistic", "p.value")
m_d <- broom::tidy(t.test(x = y1, y = y0, var.equal = T))[selc]
m_t <- broom::tidy(t.test(x = y1, y = y0, var.equal = F))[selc]
m_W <- broom::tidy(wilcox.test(x = y1, y = y0))[selc]
return(tibble(method = c("D", "S", "W"), bind_rows(m_d, m_t, m_W)))
}
out_appx <- FRT_f(A = lalonde$treat, Y = lalonde$re78)
out_appx
## # A tibble: 3 × 3
## method statistic p.value
## <chr> <dbl> <dbl>
## 1 D 2.84 0.00479
## 2 S 2.67 0.00789
## 3 W 27402. 0.0109
res <- list()
for (i in 1:10^4) {
a_perm <- sample(lalonde$treat, replace = F)
res[[i]] <- FRT_f(Y = lalonde$re78, A = a_perm)
}
out <- bind_rows(res) %>%
left_join(out_appx %>%
dplyr::select(method, est = statistic))
## Joining with `by = join_by(method)`
#
#save(out, file = here::here("data/FRT-perm.Rdata"))
# p-values obtained from the permutation distributions
out %>%
reframe(p = mean(statistic >= est[1]), .by = method) %>% # one-sided permutation p-value within each method
mutate(p2 = 2 * p) # doubled to approximate a two-sided p-value
## # A tibble: 3 × 3
## method p p2
## <chr> <dbl> <dbl>
## 1 D 0.0023 0.0046
## 2 S 0.0024 0.0048
## 3 W 0.0054 0.0108
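🔹 Interpretation: for all three statistics the doubled permutation p-values (D: 0.0046, S: 0.0048, W: 0.0108) lead to the same conclusion as the asymptotic p-values above: the null of no treatment effect is rejected at the 5% level, consistent with a positive effect of the training program on re78.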