1 Set Up

# Clear the workspace

  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory
##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  552231 29.5    1227675 65.6         NA   700254 37.4
## Vcells 1025959  7.9    8388608 64.0      49152  1963353 15.0
  cat("\f")       # Clear the console

The LST package provides software and datasets for Lessons in Statistical Thinking.

# Prepare needed libraries

  packages <- c("haven",
                 "stargazer", 
                 "visdat", 
                 "dplyr", 
                 "magrittr")    ### if multiple packages packages <- c("stringr", "psych")

  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
  rm(packages)

2 Data

2.1 Partial Data

Data from an experimental study in which researchers sent the resumes of fictitious job applicants to potential employers. The first names of the fictitious applicants was set randomly to sound either Black or white.

install.packages("devtools")
devtools::install_github("dtkaplan/LST")
library(LSTbook)

data("Callback")
head(Callback)
##      name    sex callback
## 1 Allison female        0
## 2 Kristen female        0
## 3 Lakisha female        0
## 4 Latonya female        0
## 5  Carrie female        0
## 6     Jay   male        0
head(Names_and_race)
## # A tibble: 6 × 2
## # Groups:   firstname [6]
##   firstname race 
##   <chr>     <chr>
## 1 Allison   white
## 2 Kristen   white
## 3 Lakisha   black
## 4 Latonya   black
## 5 Carrie    white
## 6 Jay       white

Callback: A data frame with 4870 rows and 4 variables. Each row is one fictitious applicant

  • name: first name of the fictitious job applicant

  • sex: sex of applicant (female or male)

  • callback: whether the potential employer called back to follow up. (1 = yes, 0 = no)

Another data frame -

Names_and_race: which first names are associated with which race.

vis_dat(Callback)

vis_dat(Names_and_race)

LST <- 
merge(x = Callback,
      y = Names_and_race, 
      by.x = c("name"), 
      by.y = c("firstname"), 
      all.x = T
      )

str(LST)
## 'data.frame':    4870 obs. of  4 variables:
##  $ name    : chr  "Aisha" "Aisha" "Aisha" "Aisha" ...
##  $ sex     : chr  "female" "female" "female" "female" ...
##  $ callback: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ race    : chr  "black" "black" "black" "black" ...

2.2 Full (original) Data

The data is only a subset of the original data. I go to AER website and download the original data as well. It is in .dta format (Stata file), but you ca import it into R using the haven package.

library(haven) 
df <- read_dta("lakisha_aer.dta")

3 Balance Table

  1. https://cran.r-project.org/web/packages/MatchIt/vignettes/assessing-balance.html

  2. https://ngreifer.github.io/cobalt/reference/bal.tab.html

  • The default balance statistic for mean differences for continuous variables is the standardized mean difference, which is the difference in the means divided by a measure of spread (i.e., a d-type effect size measure).
?mtcars

# install.packages("cobalt")
library("cobalt")
##  cobalt (Version 4.5.5, Build Date: 2024-04-02)
covs <- subset(x = mtcars,
               select = -c(am)
               )
?bal.tab
bal.tab(x = covs, 
        treat = mtcars$am)
## Note: `s.d.denom` not specified; assuming "pooled".
## Balance Measures
##         Type Diff.Un
## mpg  Contin.  1.4110
## cyl  Contin. -1.2078
## disp Contin. -1.4780
## hp   Contin. -0.4732
## drat Contin.  2.0180
## wt   Contin. -1.9349
## qsec Contin. -0.4645
## vs    Binary  0.1700
## gear Contin.  2.5267
## carb Contin.  0.1070
## 
## Sample sizes
##     Control Treated
## All      19      13
# install.packages("modelsummary")
library(modelsummary)

datasummary_balance(formula = ~ am, 
                    data=mtcars
                    )
tinytable_lt5ohkcsggrynbzj5gad
0 1
Mean Std. Dev. Mean Std. Dev. Diff. in Means Std. Error
mpg 17.1 3.8 24.4 6.2 7.2 1.9
cyl 6.9 1.5 5.1 1.6 -1.9 0.6
disp 290.4 110.2 143.5 87.2 -146.8 35.0
hp 160.3 53.9 126.8 84.1 -33.4 26.4
drat 3.3 0.4 4.0 0.4 0.8 0.1
wt 3.8 0.8 2.4 0.6 -1.4 0.2
qsec 18.2 1.8 17.4 1.8 -0.8 0.6
vs 0.4 0.5 0.5 0.5 0.2 0.2
gear 3.2 0.4 4.4 0.5 1.2 0.2
carb 2.7 1.1 2.9 2.2 0.2 0.7

4 Table 1

Lets try to recreate it/match the numbers.

4.0.1 All sent resumes

mean_call_by_race <- tapply(X     = df$call, 
                            INDEX = as.factor(df$race), 
                            FUN   = mean, 
                            na.rm = TRUE
                            )
# Display the result
mean_call_by_race
##          b          w 
## 0.06447639 0.09650924

Add percentage and round to 2 decimal places as well.

# Display the formatted result 
print(paste0(round(x = mean_call_by_race * 100, 
                   digits = 2),"%"
             )
      )
## [1] "6.45%" "9.65%"

Now, since the numbers match, lets us generalize the code (get other rows as well) -

4.0.2 Cities - Chicago and Boston

table(df$city)
## 
##    b    c 
## 2166 2704
df_chicago <- subset(df, city == "c")
df_boston <-  subset(df, city == "b")

mean_call_by_chicago <- tapply(X    = df_chicago$call, 
                              INDEX = as.factor(df_chicago$race), 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

mean_call_by_boston <- tapply(X    = df_boston$call, 
                              INDEX = as.factor(df_boston$race), 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

# Display the formatted result
print(paste0(round(x = mean_call_by_chicago * 100, digits = 2),"%"))
## [1] "5.4%"  "8.06%"
print(paste0(round(x = mean_call_by_boston  * 100, digits = 2),"%"))
## [1] "7.76%"  "11.63%"
# table(df_chicago$race)
# table(df_boston$race)

4.0.3 Gender

table(df$city)
## 
##    b    c 
## 2166 2704
df_female <- subset(df, sex == "f")
df_male   <- subset(df, sex == "m")

mean_call_by_female <- tapply(X    =  df_female$call, 
                              INDEX = df_female$race, 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

mean_call_by_male <- tapply(X       = df_male$call, 
                              INDEX = df_male$race, 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

# Display the formatted result
print(paste0(round(x = mean_call_by_female * 100, digits = 2),"%"))
## [1] "6.63%" "9.89%"
print(paste0(round(x = mean_call_by_male   * 100, digits = 2),"%"))
## [1] "5.83%" "8.87%"
# nrow(df_female)
# 1886+1860
# table(df_female$race) 
# table(df_male$race) 

4.0.4 Female - Occupation

df %>% ls(pattern = "sales")
## [1] "branch_sales" "parent_sales" "retailsales"  "salesrep"
df_female_admin <- subset(df, sex == "f" & secretary == "1" )
df_female_sales <- subset(df, sex == "f" & salesrep  == "1")

# table(df_female_admin$race) 

mean_call_by_female_admin <- tapply(X    =  df_female_admin$call, 
                                   INDEX = df_female_admin$race, 
                                   FUN   = mean, 
                                   na.rm = TRUE
                                  )

mean_call_by_female_sales <- tapply(X      = df_female_sales$call, 
                                     INDEX = df_female_sales$race, 
                                     FUN   = mean, 
                                     na.rm = TRUE
                                    )

# Display the formatted result
print(paste0(round(x = mean_call_by_female_admin * 100, digits = 2),"%"))
## [1] "6.16%"  "10.19%"
print(paste0(round(x = mean_call_by_female_sales * 100, digits = 2),"%"))
## [1] "7.94%" "7.82%"
# table(df_female_admin$race)
# table(df_female_sales$race)

These numbers are a bit off.

4.1 Final Table

# Assuming df1, df2, df3, etc. are the data frames you want to row-bind
table1 <- rbind(mean_call_by_race,
                mean_call_by_chicago,
                mean_call_by_boston,
                mean_call_by_female,
                mean_call_by_female_admin,
                mean_call_by_female_sales,
                mean_call_by_male)

table1 <- round(x = table1*100, digits = 2)

ratio <- round(x = table1[,2]/table1[,1],
               digits = 3
               ) # column 3

diff  <- round(x = table1[,2]-table1[,1],
               digits = 2
               ) # column 4
table1 <- cbind(table1, ratio, diff)



table1
##                              b     w ratio  diff
## mean_call_by_race         6.45  9.65 1.496  3.20
## mean_call_by_chicago      5.40  8.06 1.493  2.66
## mean_call_by_boston       7.76 11.63 1.499  3.87
## mean_call_by_female       6.63  9.89 1.492  3.26
## mean_call_by_female_admin 6.16 10.19 1.654  4.03
## mean_call_by_female_sales 7.94  7.82 0.985 -0.12
## mean_call_by_male         5.83  8.87 1.521  3.04

4.1.1 P value of the difference of All Sent Resumes by race (first row)

4.1.1.1 METHOD 1

User written command adds a continuity correction to calculate the p value for no difference in two means (null).

p-value of the observed difference is .0004, and we will reject the null of no difference between the two groups.

successes_group_white <- sum(as.integer(df$call==1 & df$race=="w"))
successes_group_black <- sum(as.integer(df$call==1 & df$race=="b"))
successes_group_white # 235
## [1] 235
successes_group_black # 157
## [1] 157
table(df$race)
## 
##    b    w 
## 2435 2435
trails_group_white <- 2435
trials_group_black <- 2435 


# Calculate proportions
prop_group_white <- successes_group_white / trails_group_white
prop_group_black <- successes_group_black / trials_group_black


# Create a 2x2 matrix (contingency table)
observed_data <-    matrix(c(successes_group_white, 
                             successes_group_black, 
                             trails_group_white - successes_group_white, 
                             trials_group_black - successes_group_black), 
                          ncol = 2)

observed_data
##      [,1] [,2]
## [1,]  235 2200
## [2,]  157 2278
# Perform a chi-squared test of proportions
?prop.test
prop.test(observed_data)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  observed_data
## X-squared = 16.449, df = 1, p-value = 4.998e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.01636705 0.04769866
## sample estimates:
##     prop 1     prop 2 
## 0.09650924 0.06447639

4.1.1.2 METHOD 2

p-value of the observed difference is .0004, and we will reject the null of no difference between the two groups.

# Calculate proportions
prop_group_white <- successes_group_white / trails_group_white
prop_group_black <- successes_group_black / trials_group_black


# Standard error of the difference between proportions
se_diff <- sqrt(prop_group_white * (1 - prop_group_white) / trails_group_white + prop_group_black * (1 - prop_group_black) / trials_group_black)

# Calculate the z statistic
z_stat <- (prop_group_white - prop_group_black) / se_diff

# Two-tailed p-value
p_value <- 2 * (1 - pnorm(q = abs(z_stat), 
                          mean = 0,
                          sd = 1))

# Display the result
cat("Z statistic:", z_stat, "\n")
## Z statistic: 4.11555
cat("P-value:", p_value, "\n")
## P-value: 3.862565e-05

4.1.2 P value of the difference of Males by race (last row)

successes_group_male_white <- sum(as.integer(df$call==1 & df$race=="w" & df$sex=="m"))
successes_group_male_black <- sum(as.integer(df$call==1 & df$race=="b" & df$sex=="m"))
successes_group_male_white # 51
## [1] 51
successes_group_male_black # 32
## [1] 32
table(df_male$race)
## 
##   b   w 
## 549 575
trails_group_male_white <- 575
trials_group_male_black <- 2549 


# Calculate proportions
prop_group_male_white <- successes_group_male_white / trails_group_male_white
prop_group_male_black <- successes_group_male_black / trials_group_male_black


# Create a 2x2 matrix (contingency table)
observed_data2 <- matrix(c(successes_group_male_white, 
                          successes_group_male_black, 
                          trails_group_male_white - successes_group_male_white, 
                          trials_group_male_black - successes_group_male_black), 
                        ncol = 2)

observed_data2
##      [,1] [,2]
## [1,]   51  524
## [2,]   32 2517
# Perform a chi-squared test of proportions
prop.test(observed_data2)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  observed_data2
## X-squared = 102.25, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.05143953 0.10084389
## sample estimates:
##     prop 1     prop 2 
## 0.08869565 0.01255394
# Standard error of the difference between proportions
se_diff2 <- sqrt(prop_group_male_white * (1 - prop_group_male_white) / trails_group_male_white + prop_group_male_black * (1 - prop_group_male_black) / trials_group_male_black)

# Calculate the z statistic
z_stat2 <- (prop_group_male_white - prop_group_male_black) / se_diff2

# Two-tailed p-value
p_value2 <- 2 * (1 - pnorm(q = abs(z_stat), 
                          mean = 0,
                          sd = 1))

# Display the result
cat("Z statistic:", z_stat2, "\n")
## Z statistic: 6.313763
cat("P-value:", p_value2, "\n")
## P-value: 3.862565e-05

5 References

  • Bertrand, Marianne, and Mullainathan, Sendhil. Replication data for: Are Emily and Greg More Employable Than Lakisha and Jamal? A Field Experiment on Labor Market Discrimination. Nashville, TN: American Economic Association [publisher], 2004. Ann Arbor, MI: Inter-university Consortium for Political and Social Research [distributor], 2019-12-06. https://doi.org/10.3886/E116023V1

  • Balance Tables https://lost-stats.github.io/Presentation/Tables/Balance_Tables.html

    # Import Dependency
    library("cobalt")
    
    # Load Data
    data(mtcars)
    
    # Create Balance Table
    bal.tab(am ~ mpg + hp, data = mtcars)
    ## Note: `s.d.denom` not specified; assuming "pooled".
    ## Balance Measures
    ##        Type Diff.Un
    ## mpg Contin.  1.4110
    ## hp  Contin. -0.4732
    ## 
    ## Sample sizes
    ##     Control Treated
    ## All      19      13