Algorithm - Benjamini-Hochberg(BH)FDR Control

library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
data("CASchools")

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
library(dplyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(estimatr)#(lm_robust)
str(CASchools)
## 'data.frame':    420 obs. of  14 variables:
##  $ district   : chr  "75119" "61499" "61549" "61457" ...
##  $ school     : chr  "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
##  $ county     : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ grades     : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ students   : num  195 240 1550 243 1335 ...
##  $ teachers   : num  10.9 11.1 82.9 14 71.5 ...
##  $ calworks   : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ lunch      : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer   : num  67 101 169 85 171 25 28 66 35 0 ...
##  $ expenditure: num  6385 5099 5502 7102 5236 ...
##  $ income     : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ english    : num  0 4.58 30 0 13.86 ...
##  $ read       : num  692 660 636 652 642 ...
##  $ math       : num  690 662 651 644 640 ...
CASchools %<>% mutate(score = (read+math)/2)
CASchools %<>% mutate(st.ratio = students/teachers)
lm.fit.7 <- CASchools %$% lm_robust(score ~ st.ratio + calworks + lunch + computer + expenditure + income + english, se_type = "HC2")
summary(lm.fit.7)$coef
##                  Estimate   Std. Error    t value      Pr(>|t|)      CI Lower
## (Intercept)  6.618848e+02 1.011465e+01 65.4382631 8.746942e-220  6.420021e+02
## st.ratio    -2.782033e-01 3.131453e-01 -0.8884160  3.748354e-01 -8.937650e-01
## calworks    -9.174277e-02 6.369565e-02 -1.4403303  1.505334e-01 -2.169518e-01
## lunch       -3.709982e-01 4.470528e-02 -8.2987567  1.518469e-15 -4.588771e-01
## computer     4.100852e-04 8.207297e-04  0.4996593  6.175817e-01 -1.203255e-03
## expenditure  1.743849e-03 9.054684e-04  1.9259079  5.480367e-02 -3.606548e-05
## income       6.190124e-01 8.463125e-02  7.3142304  1.361586e-12  4.526495e-01
## english     -2.113803e-01 3.889648e-02 -5.4344319  9.427272e-08 -2.878406e-01
##                  CI Upper  DF
## (Intercept) 681.767588421 412
## st.ratio      0.337358497 412
## calworks      0.033466221 412
## lunch        -0.283119321 412
## computer      0.002023425 412
## expenditure   0.003523763 412
## income        0.785375360 412
## english      -0.134919948 412
pvals <- summary(lm.fit.7)$coef[-c(1,2), "Pr(>|t|)"]  
pvals
##     calworks        lunch     computer  expenditure       income      english 
## 1.505334e-01 1.518469e-15 6.175817e-01 5.480367e-02 1.361586e-12 9.427272e-08
hist(pvals, xlab="p-value", main="", col="lightblue")

BH_cutoff <- function(pvals, q=0.1){
  pvals <- sort(pvals[!is.na(pvals)])
  N <- length(pvals)
  k <- rank(pvals, ties.method="min")
  alpha <- max(pvals[ pvals<= (q*k/N) ])
  
  plot(pvals, log="xy", xlab="order", main=sprintf("FDR of %g",q),
       ylab="p-value", bty="n", col=c(8,2)[(pvals<=alpha) + 1], pch=20)
  #log="xy": both axes are to be logarithmic
  #bty="n"": no box
  lines(1:N, q*(1:N)/(N+1))
  
  return(alpha)
}

BH_cutoff(pvals) # 0.05480367

## [1] 0.05480367
signif <- which(pvals <= 0.05480367) 
signif
##   lunch  income english 
##       2       5       6
ols.fit.4 <- lm_robust(score ~., 
            data=CASchools[,c("score", "st.ratio", names(signif))], se_type = "HC2")
summary(ols.fit.4)
## 
## Call:
## lm_robust(formula = score ~ ., data = CASchools[, c("score", 
##     "st.ratio", names(signif))], se_type = "HC2")
## 
## Standard error type:  HC2 
## 
## Coefficients:
##             Estimate Std. Error t value   Pr(>|t|) CI Lower  CI Upper  DF
## (Intercept) 675.6082    6.22466 108.537 9.114e-307 663.3724 687.84400 415
## st.ratio     -0.5604    0.25603  -2.189  2.917e-02  -1.0637  -0.05711 415
## lunch        -0.3964    0.03033 -13.066  6.385e-33  -0.4560  -0.33674 415
## income        0.6750    0.08448   7.989  1.352e-14   0.5089   0.84106 415
## english      -0.1943    0.03336  -5.825  1.147e-08  -0.2599  -0.12875 415
## 
## Multiple R-squared:  0.8053 ,    Adjusted R-squared:  0.8034 
## F-statistic: 463.2 on 4 and 415 DF,  p-value: < 2.2e-16