Create indicators for treatment assignment and female gender. N = 100.
set.seed(1)
treatment <- sample(c(0, 1), 100, prob = c(0.5, 0.5), replace = TRUE)
female <- sample(c(0, 1), 100, prob = c(0.6, 0.4), replace = TRUE)
dat <- data.frame(cbind(treatment, female))
head(dat)
## treatment female
## 1 1 1
## 2 1 0
## 3 0 0
## 4 0 1
## 5 1 1
## 6 0 0
Create a contingency table.
c.tbl <- table(dat)
c.tbl <- c.tbl[, c(2, 1)] # reverse column order to get prop female
c.tbl
## female
## treatment 1 0
## 0 21 27
## 1 19 33
Use the prop.test() function in R to test null hypothesis that group proportions are the same.
p <- prop.test(c.tbl)
p
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: c.tbl
## X-squared = 0.2821, df = 1, p-value = 0.5953
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.1398 0.2840
## sample estimates:
## prop 1 prop 2
## 0.4375 0.3654
The proportion of females in the control group is 0.4375. This compares to 0.3654 in the treatment group, a difference of -0.0721 (p=0.5953).
r <- lm(female ~ treatment, data = dat)
summary(r)
##
## Call:
## lm(formula = female ~ treatment, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.438 -0.438 -0.365 0.562 0.635
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4375 0.0712 6.14 1.8e-08 ***
## treatment -0.0721 0.0988 -0.73 0.47
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.494 on 98 degrees of freedom
## Multiple R-squared: 0.00541, Adjusted R-squared: -0.00474
## F-statistic: 0.533 on 1 and 98 DF, p-value: 0.467
The coefficient on treatment is -0.0721 (p=0.4671).
The difference in p-values between approaches is 0.1282. The p-value resulting from the t-test of proportions, p=0.5953, is 27.4443 percent greater than the p-value of the coefficient on treatment, p=0.4671.
p-values from prop.test are never smaller than p-values from regression. Histogram of 500 iterations of difference between p-values derived from prop.test and p-values derived from regression (prop.test - regression, so positive differences mean higher p-values for prop.test).
results <- as.data.frame(NULL)
for (i in 1:500) {
treatment <- sample(c(0, 1), 100, prob = c(0.5, 0.5), replace = TRUE)
female <- sample(c(0, 1), 100, prob = c(0.6, 0.4), replace = TRUE)
dat <- data.frame(cbind(treatment, female))
c.tbl <- table(dat)
c.tbl <- c.tbl[, c(2, 1)]
p <- prop.test(c.tbl)
r <- lm(female ~ treatment, data = dat)
results[i, 1] <- p$estimate[1] # control
results[i, 2] <- p$estimate[2] # treat
results[i, 3] <- results[i, 2] - results[i, 1] # difference proportions
results[i, 4] <- p$p.value
results[i, 5] <- coef(summary(r))[, 4][2]
results[i, 6] <- results[i, 4] - results[i, 5] # difference p-values
}
names(results) <- c("ctl.pro", "trt.pro", "diff.pro", "p.proptest", "p.lm",
"diff.p")
library(ggplot2)
ggplot(results, aes(x = diff.p)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
.