Algorithm - Benjamini-Hochberg(BH)FDR Control
library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
data("CASchools")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(estimatr)#(lm_robust)
str(CASchools)
## 'data.frame': 420 obs. of 14 variables:
## $ district : chr "75119" "61499" "61549" "61457" ...
## $ school : chr "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
## $ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
## $ grades : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
## $ students : num 195 240 1550 243 1335 ...
## $ teachers : num 10.9 11.1 82.9 14 71.5 ...
## $ calworks : num 0.51 15.42 55.03 36.48 33.11 ...
## $ lunch : num 2.04 47.92 76.32 77.05 78.43 ...
## $ computer : num 67 101 169 85 171 25 28 66 35 0 ...
## $ expenditure: num 6385 5099 5502 7102 5236 ...
## $ income : num 22.69 9.82 8.98 8.98 9.08 ...
## $ english : num 0 4.58 30 0 13.86 ...
## $ read : num 692 660 636 652 642 ...
## $ math : num 690 662 651 644 640 ...
CASchools %<>% mutate(score = (read+math)/2)
CASchools %<>% mutate(st.ratio = students/teachers)
lm.fit.7 <- CASchools %$% lm_robust(score ~ st.ratio + calworks + lunch + computer + expenditure + income + english, se_type = "HC2")
summary(lm.fit.7)$coef
## Estimate Std. Error t value Pr(>|t|) CI Lower
## (Intercept) 6.618848e+02 1.011465e+01 65.4382631 8.746942e-220 6.420021e+02
## st.ratio -2.782033e-01 3.131453e-01 -0.8884160 3.748354e-01 -8.937650e-01
## calworks -9.174277e-02 6.369565e-02 -1.4403303 1.505334e-01 -2.169518e-01
## lunch -3.709982e-01 4.470528e-02 -8.2987567 1.518469e-15 -4.588771e-01
## computer 4.100852e-04 8.207297e-04 0.4996593 6.175817e-01 -1.203255e-03
## expenditure 1.743849e-03 9.054684e-04 1.9259079 5.480367e-02 -3.606548e-05
## income 6.190124e-01 8.463125e-02 7.3142304 1.361586e-12 4.526495e-01
## english -2.113803e-01 3.889648e-02 -5.4344319 9.427272e-08 -2.878406e-01
## CI Upper DF
## (Intercept) 681.767588421 412
## st.ratio 0.337358497 412
## calworks 0.033466221 412
## lunch -0.283119321 412
## computer 0.002023425 412
## expenditure 0.003523763 412
## income 0.785375360 412
## english -0.134919948 412
pvals <- summary(lm.fit.7)$coef[-c(1,2), "Pr(>|t|)"]
pvals
## calworks lunch computer expenditure income english
## 1.505334e-01 1.518469e-15 6.175817e-01 5.480367e-02 1.361586e-12 9.427272e-08
hist(pvals, xlab="p-value", main="", col="lightblue")

BH_cutoff <- function(pvals, q=0.1){
pvals <- sort(pvals[!is.na(pvals)])
N <- length(pvals)
k <- rank(pvals, ties.method="min")
alpha <- max(pvals[ pvals<= (q*k/N) ])
plot(pvals, log="xy", xlab="order", main=sprintf("FDR of %g",q),
ylab="p-value", bty="n", col=c(8,2)[(pvals<=alpha) + 1], pch=20)
#log="xy": both axes are to be logarithmic
#bty="n"": no box
lines(1:N, q*(1:N)/(N+1))
return(alpha)
}
BH_cutoff(pvals) # 0.05480367

## [1] 0.05480367
signif <- which(pvals <= 0.05480367)
signif
## lunch income english
## 2 5 6
ols.fit.4 <- lm_robust(score ~.,
data=CASchools[,c("score", "st.ratio", names(signif))], se_type = "HC2")
summary(ols.fit.4)
##
## Call:
## lm_robust(formula = score ~ ., data = CASchools[, c("score",
## "st.ratio", names(signif))], se_type = "HC2")
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 675.6082 6.22466 108.537 9.114e-307 663.3724 687.84400 415
## st.ratio -0.5604 0.25603 -2.189 2.917e-02 -1.0637 -0.05711 415
## lunch -0.3964 0.03033 -13.066 6.385e-33 -0.4560 -0.33674 415
## income 0.6750 0.08448 7.989 1.352e-14 0.5089 0.84106 415
## english -0.1943 0.03336 -5.825 1.147e-08 -0.2599 -0.12875 415
##
## Multiple R-squared: 0.8053 , Adjusted R-squared: 0.8034
## F-statistic: 463.2 on 4 and 415 DF, p-value: < 2.2e-16