# packages for data munging
library(haven)
suppressMessages(library(dplyr))
library(reshape2)
library(broom)
# packages for heteroskedasticity consistent covariances
suppressMessages(library(lmtest))
library(sandwich) hmda <- read_dta("http://wps.aw.com/wps/media/objects/11422/11696965/datasets3e/datasets/hmda_sw.dta")
hmda <- hmda %>%
mutate(black = ifelse(s13==3, 1, 0),
deny = ifelse(s7==3, 1, 0),
PI = s46/100)
hmda %>%
group_by(black, deny) %>%
summarise(too=n()) ## Source: local data frame [4 x 3]
## Groups: black [?]
##
## black deny too
## (dbl) (dbl) (int)
## 1 0 0 1852
## 2 0 1 189
## 3 1 0 243
## 4 1 1 96
Variable Generation
hmda <- hmda %>%
mutate(hse_inc = s45/100,
loan_val = s6/s50,
ccred = s43,
mcred = s42,
pubrec = 1*(s44>0),
denpmi = 1*(s53==1),
selfemp = 1*(s27a==1),
married = 1*(s23a=="M"),
single = 1*(married==0),
hischl = 1*(school>=12),
probunmp = uria,
condo = 1*(s51 == 1))Drop the unused columns
hmda <- hmda %>%
select(PI, hse_inc, loan_val, ccred, mcred, pubrec, denpmi, selfemp,
single, hischl, probunmp, condo, black, deny)Summary stats (Table 11.1)
average <- hmda %>%
summarise_each(funs(mean))
melt(average)## No id variables; using all as measure variables
## variable value
## 1 PI 0.33081357
## 2 hse_inc 0.25534612
## 3 loan_val 0.73777591
## 4 ccred 2.11638655
## 5 mcred 1.72100840
## 6 pubrec 0.07352941
## 7 denpmi 0.02016807
## 8 selfemp 0.11638655
## 9 single 0.39327731
## 10 hischl 0.98361345
## 11 probunmp 3.77449585
## 12 condo 0.28823529
## 13 black 0.14243697
## 14 deny 0.11974790
Generation of loan-to-value ratio with 3 levels (high if loan_val>0.95, low if loav_val<0.8, medium otherwise )
# method 1: nested ifelse (not recommended)
hmda <- hmda %>%
mutate(lvrat = factor(ifelse(loan_val < 0.8, "low",
ifelse(loan_val >= 0.8 & loan_val <= 0.95, "medium", "high")),
levels = c("low", "medium", "high"))
)
# method 2: cut function
hmda <- hmda %>%
mutate(lvrat1= cut(loan_val,
breaks = c(-Inf, 0.8-0.0001, 0.95+0.0001, Inf),
labels = c("low", "medium", "high"))
)
DT::datatable(hmda)factor-iin tuhai
mode(hmda$lvrat)## [1] "numeric"
attributes(hmda$lvrat)## $levels
## [1] "low" "medium" "high"
##
## $class
## [1] "factor"
To avoid the repeated copy-pasting (which is prone to error), create an object that holds the equation used in the estimation.
eq <- deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec + denpmi + selfempsummary reports the values using homoskedastic variance. LPM is inherently heteroskedastic, hence the need for heteroskedasticity robust covariance estimates.
f1 <- lm(eq, data = hmda)
coeftest(f1, vcov = sandwich)##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1829933 0.0276089 -6.6281 4.197e-11 ***
## black 0.0836967 0.0225101 3.7182 0.0002053 ***
## PI 0.4487963 0.1133333 3.9600 7.717e-05 ***
## hse_inc -0.0480227 0.1093055 -0.4393 0.6604528
## lvratmedium 0.0314498 0.0127097 2.4745 0.0134125 *
## lvrathigh 0.1890511 0.0500520 3.7771 0.0001626 ***
## ccred 0.0307716 0.0045737 6.7279 2.150e-11 ***
## mcred 0.0209104 0.0112637 1.8565 0.0635134 .
## pubrec 0.1970876 0.0348005 5.6634 1.664e-08 ***
## denpmi 0.7018841 0.0450008 15.5972 < 2.2e-16 ***
## selfemp 0.0598438 0.0204759 2.9227 0.0035035 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
f2 <- glm(eq, data = hmda, family = binomial(link = "logit"))
coeftest(f2, vcov = sandwich)##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.70738 0.48333 -11.8084 < 2.2e-16 ***
## black 0.68842 0.18208 3.7808 0.0001563 ***
## PI 4.76442 1.32912 3.5846 0.0003375 ***
## hse_inc -0.10881 1.29471 -0.0840 0.9330224
## lvratmedium 0.46353 0.16004 2.8963 0.0037764 **
## lvrathigh 1.49476 0.32415 4.6113 4.001e-06 ***
## ccred 0.29030 0.03882 7.4781 7.542e-14 ***
## mcred 0.27902 0.13760 2.0278 0.0425844 *
## pubrec 1.22580 0.20301 6.0382 1.559e-09 ***
## denpmi 4.54817 0.57430 7.9195 2.384e-15 ***
## selfemp 0.66613 0.21331 3.1228 0.0017912 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Upon a closer look, you’ll notice the slight discrepancy in estimated SEs from the textbook (for instance, SE for PI ratio). This is because glm uses iterative-least squares method while STATA’s probit uses Newton-Raphson iteration. Check Stock-Watson’s do file.
f3 <- glm(eq, data = hmda, family = binomial(link = "probit"))
summary(f3)##
## Call:
## glm(formula = eq, family = binomial(link = "probit"), data = hmda)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5851 -0.4294 -0.3065 -0.2218 3.1866
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.04056 0.20784 -14.629 < 2e-16 ***
## black 0.38913 0.09687 4.017 5.90e-05 ***
## PI 2.44177 0.55096 4.432 9.34e-06 ***
## hse_inc -0.18467 0.65666 -0.281 0.77853
## lvratmedium 0.21397 0.08189 2.613 0.00898 **
## lvrathigh 0.79108 0.17554 4.507 6.59e-06 ***
## ccred 0.15462 0.02135 7.241 4.46e-13 ***
## mcred 0.14771 0.07330 2.015 0.04391 *
## pubrec 0.69745 0.11823 5.899 3.65e-09 ***
## denpmi 2.55680 0.28350 9.019 < 2e-16 ***
## selfemp 0.35862 0.11177 3.209 0.00133 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.2 on 2379 degrees of freedom
## Residual deviance: 1273.7 on 2369 degrees of freedom
## AIC: 1295.7
##
## Number of Fisher Scoring iterations: 6
Column 4 has three added regressors: single, high school diploma, unemployment rate (single + hischl + probunmp)
f4 <- update(f3, . ~ . + single + hischl + probunmp)
tidy(f4)## term estimate std.error statistic p.value
## 1 (Intercept) -2.57457150 0.32394267 -7.9476146 1.901370e-15
## 2 black 0.37106827 0.09824524 3.7769593 1.587547e-04
## 3 PI 2.46431161 0.55046391 4.4767905 7.577354e-06
## 4 hse_inc -0.30247265 0.65889967 -0.4590572 6.461931e-01
## 5 lvratmedium 0.21572868 0.08244360 2.6166821 8.878900e-03
## 6 lvrathigh 0.79454244 0.17668484 4.4969475 6.893602e-06
## 7 ccred 0.15759221 0.02150800 7.3271438 2.351095e-13
## 8 mcred 0.11041155 0.07543205 1.4637219 1.432699e-01
## 9 pubrec 0.70215888 0.11895679 5.9026378 3.577350e-09
## 10 denpmi 2.58517913 0.28665440 9.0184527 1.907646e-19
## 11 selfemp 0.34594497 0.11341817 3.0501724 2.287101e-03
## 12 single 0.22948004 0.07978535 2.8762179 4.024718e-03
## 13 hischl -0.61262394 0.23751830 -2.5792705 9.900923e-03
## 14 probunmp 0.03003044 0.01812467 1.6568827 9.754320e-02
f5 <- update(f4, . ~ . + factor(ccred) + factor(mcred) + condo)
tidy(f5)## term estimate std.error statistic p.value
## 1 (Intercept) -2.58760645 0.34239003 -7.5574819 4.109472e-14
## 2 black 0.36276913 0.10013843 3.6226766 2.915703e-04
## 3 PI 2.62215969 0.56443930 4.6456008 3.390881e-06
## 4 hse_inc -0.50238204 0.67804887 -0.7409231 4.587401e-01
## 5 lvratmedium 0.21508041 0.08368447 2.5701354 1.016588e-02
## 6 lvrathigh 0.83556684 0.17860044 4.6784143 2.891019e-06
## 7 ccred 0.16346179 0.02526793 6.4691417 9.856116e-11
## 8 mcred 0.03447050 0.12316337 0.2798762 7.795725e-01
## 9 pubrec 0.71719417 0.12070936 5.9414960 2.824326e-09
## 10 denpmi 2.58885409 0.28838058 8.9772138 2.777099e-19
## 11 selfemp 0.34165926 0.11476478 2.9770394 2.910466e-03
## 12 single 0.22989664 0.08332399 2.7590690 5.796629e-03
## 13 hischl -0.60374917 0.23972543 -2.5185028 1.178549e-02
## 14 probunmp 0.02795894 0.01832465 1.5257560 1.270706e-01
## 15 factor(ccred)2 0.18097183 0.10235924 1.7680067 7.705978e-02
## 16 factor(ccred)3 0.13573973 0.16040898 0.8462103 3.974354e-01
## 17 factor(ccred)4 0.29150967 0.18730512 1.5563358 1.196283e-01
## 18 factor(ccred)5 -0.06537325 0.14275011 -0.4579559 6.469842e-01
## 19 factor(mcred)2 0.12757162 0.14354872 0.8886991 3.741648e-01
## 20 factor(mcred)3 0.14858465 0.34601026 0.4294227 6.676157e-01
## 21 condo -0.05495853 0.09134682 -0.6016469 5.474092e-01
f6 <- update(f4, . ~ . + black*PI + black*hse_inc)
tidy(f6)## term estimate std.error statistic p.value
## 1 (Intercept) -2.5434114 0.33262389 -7.6465085 2.065097e-14
## 2 black 0.2462816 0.40508453 0.6079759 5.432035e-01
## 3 PI 2.5716995 0.60723109 4.2351250 2.284246e-05
## 4 hse_inc -0.5380253 0.72878917 -0.7382455 4.603653e-01
## 5 lvratmedium 0.2158859 0.08248471 2.6172837 8.863265e-03
## 6 lvrathigh 0.7881882 0.17728885 4.4457853 8.757137e-06
## 7 ccred 0.1578706 0.02153770 7.3299677 2.302081e-13
## 8 mcred 0.1114918 0.07546005 1.4774940 1.395433e-01
## 9 pubrec 0.7047679 0.11896644 5.9240904 3.140308e-09
## 10 denpmi 2.5900971 0.28684250 9.0296838 1.721686e-19
## 11 selfemp 0.3475271 0.11335639 3.0657916 2.170946e-03
## 12 single 0.2255940 0.08013016 2.8153446 4.872495e-03
## 13 hischl -0.6198274 0.23734762 -2.6114753 9.015251e-03
## 14 probunmp 0.0297126 0.01813846 1.6380993 1.014010e-01
## 15 black:PI -0.5792622 1.40523093 -0.4122185 6.801793e-01
## 16 black:hse_inc 1.2317343 1.66681267 0.7389758 4.599217e-01
waldtest(f4, update(f4, . ~ . -single - hischl - probunmp))## Wald test
##
## Model 1: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + single + hischl + probunmp
## Model 2: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp
## Res.Df Df F Pr(>F)
## 1 2366
## 2 2369 -3 5.6614 0.0007316 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
waldtest(f5, update(f5, . ~ . -single - hischl - probunmp))## Wald test
##
## Model 1: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + single + hischl + probunmp + factor(ccred) +
## factor(mcred) + condo
## Model 2: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + factor(ccred) + factor(mcred) + condo
## Res.Df Df F Pr(>F)
## 1 2359
## 2 2362 -3 5.2324 0.001338 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
waldtest(f6, update(f6, . ~ . -single - hischl - probunmp))## Wald test
##
## Model 1: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + single + hischl + probunmp + black:PI +
## black:hse_inc
## Model 2: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + black:PI + black:hse_inc
## Res.Df Df F Pr(>F)
## 1 2364
## 2 2367 -3 5.5999 0.0007979 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(f5, update(f4, .~. + condo))## Likelihood ratio test
##
## Model 1: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + single + hischl + probunmp + factor(ccred) +
## factor(mcred) + condo
## Model 2: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + single + hischl + probunmp + condo
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 21 -625.06
## 2 15 -628.47 -6 6.8153 0.3383
waldtest(f4, f6)## Wald test
##
## Model 1: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + single + hischl + probunmp
## Model 2: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + single + hischl + probunmp + black:PI +
## black:hse_inc
## Res.Df Df F Pr(>F)
## 1 2366
## 2 2364 2 0.2781 0.7572
waldtest(update(f4, .~. - black), f6)## Wald test
##
## Model 1: deny ~ PI + hse_inc + lvrat + ccred + mcred + pubrec + denpmi +
## selfemp + single + hischl + probunmp
## Model 2: deny ~ black + PI + hse_inc + lvrat + ccred + mcred + pubrec +
## denpmi + selfemp + single + hischl + probunmp + black:PI +
## black:hse_inc
## Res.Df Df F Pr(>F)
## 1 2367
## 2 2364 3 4.9547 0.001976 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As per “A conventional way to make this choice is to consider an”average" applicant who has the sample average values of all the regressors other than race" p. 403
average$lvrat <- "medium"
average <- rbind(average, average)
average$black <- c(0, 1)
models <- list(f1, f2, f3, f4, f6)
sapply(models, function(x)diff(predict(x, average, type="response")))## 2 2 2 2 2
## 0.08369674 0.07294764 0.08071661 0.07510299 0.07461927