1 Set Up

# Clear the workspace

  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory

##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  552231 29.5    1227675 65.6         NA   700254 37.4
## Vcells 1025959  7.9    8388608 64.0      49152  1963353 15.0

  cat("\f")       # Clear the console

The LST package provides software and datasets for Lessons in Statistical Thinking.

# Prepare needed libraries

  packages <- c("haven",
                 "stargazer", 
                 "visdat", 
                 "dplyr", 
                 "magrittr")    ### if multiple packages packages <- c("stringr", "psych")

  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

  rm(packages)

2 Data

2.1 Partial Data

Data from an experimental study in which researchers sent the resumes of fictitious job applicants to potential employers. The first names of the fictitious applicants was set randomly to sound either Black or white.

install.packages("devtools")
devtools::install_github("dtkaplan/LST")

library(LSTbook)

data("Callback")
head(Callback)

##      name    sex callback
## 1 Allison female        0
## 2 Kristen female        0
## 3 Lakisha female        0
## 4 Latonya female        0
## 5  Carrie female        0
## 6     Jay   male        0

head(Names_and_race)

## # A tibble: 6 × 2
## # Groups:   firstname [6]
##   firstname race 
##   <chr>     <chr>
## 1 Allison   white
## 2 Kristen   white
## 3 Lakisha   black
## 4 Latonya   black
## 5 Carrie    white
## 6 Jay       white

Callback: A data frame with 4870 rows and 4 variables. Each row is one fictitious applicant

name: first name of the fictitious job applicant
sex: sex of applicant (female or male)
callback: whether the potential employer called back to follow up. (1 = yes, 0 = no)

Another data frame -

Names_and_race: which first names are associated with which race.

vis_dat(Callback)

vis_dat(Names_and_race)

LST <- 
merge(x = Callback,
      y = Names_and_race, 
      by.x = c("name"), 
      by.y = c("firstname"), 
      all.x = T
      )

str(LST)

## 'data.frame':    4870 obs. of  4 variables:
##  $ name    : chr  "Aisha" "Aisha" "Aisha" "Aisha" ...
##  $ sex     : chr  "female" "female" "female" "female" ...
##  $ callback: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ race    : chr  "black" "black" "black" "black" ...

2.2 Full (original) Data

The data is only a subset of the original data. I go to AER website and download the original data as well. It is in .dta format (Stata file), but you ca import it into R using the haven package.

library(haven) 
df <- read_dta("lakisha_aer.dta")

3 Balance Table

The default balance statistic for mean differences for continuous variables is the standardized mean difference, which is the difference in the means divided by a measure of spread (i.e., a d-type effect size measure).

?mtcars

# install.packages("cobalt")
library("cobalt")

##  cobalt (Version 4.5.5, Build Date: 2024-04-02)

covs <- subset(x = mtcars,
               select = -c(am)
               )
?bal.tab
bal.tab(x = covs, 
        treat = mtcars$am)

## Note: `s.d.denom` not specified; assuming "pooled".

## Balance Measures
##         Type Diff.Un
## mpg  Contin.  1.4110
## cyl  Contin. -1.2078
## disp Contin. -1.4780
## hp   Contin. -0.4732
## drat Contin.  2.0180
## wt   Contin. -1.9349
## qsec Contin. -0.4645
## vs    Binary  0.1700
## gear Contin.  2.5267
## carb Contin.  0.1070
## 
## Sample sizes
##     Control Treated
## All      19      13

# install.packages("modelsummary")
library(modelsummary)

datasummary_balance(formula = ~ am, 
                    data=mtcars
                    )

tinytable_lt5ohkcsggrynbzj5gad

	0		1
	Mean	Std. Dev.	Mean	Std. Dev.	Diff. in Means	Std. Error
mpg	17.1	3.8	24.4	6.2	7.2	1.9
cyl	6.9	1.5	5.1	1.6	-1.9	0.6
disp	290.4	110.2	143.5	87.2	-146.8	35.0
hp	160.3	53.9	126.8	84.1	-33.4	26.4
drat	3.3	0.4	4.0	0.4	0.8	0.1
wt	3.8	0.8	2.4	0.6	-1.4	0.2
qsec	18.2	1.8	17.4	1.8	-0.8	0.6
vs	0.4	0.5	0.5	0.5	0.2	0.2
gear	3.2	0.4	4.4	0.5	1.2	0.2
carb	2.7	1.1	2.9	2.2	0.2	0.7

4 Table 1

Lets try to recreate it/match the numbers.

4.0.1 All sent resumes

mean_call_by_race <- tapply(X     = df$call, 
                            INDEX = as.factor(df$race), 
                            FUN   = mean, 
                            na.rm = TRUE
                            )
# Display the result
mean_call_by_race

##          b          w 
## 0.06447639 0.09650924

Add percentage and round to 2 decimal places as well.

# Display the formatted result 
print(paste0(round(x = mean_call_by_race * 100, 
                   digits = 2),"%"
             )
      )

## [1] "6.45%" "9.65%"

Now, since the numbers match, lets us generalize the code (get other rows as well) -

4.0.2 Cities - Chicago and Boston

table(df$city)

## 
##    b    c 
## 2166 2704

df_chicago <- subset(df, city == "c")
df_boston <-  subset(df, city == "b")

mean_call_by_chicago <- tapply(X    = df_chicago$call, 
                              INDEX = as.factor(df_chicago$race), 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

mean_call_by_boston <- tapply(X    = df_boston$call, 
                              INDEX = as.factor(df_boston$race), 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

# Display the formatted result
print(paste0(round(x = mean_call_by_chicago * 100, digits = 2),"%"))

## [1] "5.4%"  "8.06%"

print(paste0(round(x = mean_call_by_boston  * 100, digits = 2),"%"))

## [1] "7.76%"  "11.63%"

# table(df_chicago$race)
# table(df_boston$race)

4.0.3 Gender

table(df$city)

## 
##    b    c 
## 2166 2704

df_female <- subset(df, sex == "f")
df_male   <- subset(df, sex == "m")

mean_call_by_female <- tapply(X    =  df_female$call, 
                              INDEX = df_female$race, 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

mean_call_by_male <- tapply(X       = df_male$call, 
                              INDEX = df_male$race, 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

# Display the formatted result
print(paste0(round(x = mean_call_by_female * 100, digits = 2),"%"))

## [1] "6.63%" "9.89%"

print(paste0(round(x = mean_call_by_male   * 100, digits = 2),"%"))

## [1] "5.83%" "8.87%"

# nrow(df_female)
# 1886+1860
# table(df_female$race) 
# table(df_male$race)

4.0.4 Female - Occupation

df %>% ls(pattern = "sales")

## [1] "branch_sales" "parent_sales" "retailsales"  "salesrep"

df_female_admin <- subset(df, sex == "f" & secretary == "1" )
df_female_sales <- subset(df, sex == "f" & salesrep  == "1")

# table(df_female_admin$race) 

mean_call_by_female_admin <- tapply(X    =  df_female_admin$call, 
                                   INDEX = df_female_admin$race, 
                                   FUN   = mean, 
                                   na.rm = TRUE
                                  )

mean_call_by_female_sales <- tapply(X      = df_female_sales$call, 
                                     INDEX = df_female_sales$race, 
                                     FUN   = mean, 
                                     na.rm = TRUE
                                    )

# Display the formatted result
print(paste0(round(x = mean_call_by_female_admin * 100, digits = 2),"%"))

## [1] "6.16%"  "10.19%"

print(paste0(round(x = mean_call_by_female_sales * 100, digits = 2),"%"))

## [1] "7.94%" "7.82%"

# table(df_female_admin$race)
# table(df_female_sales$race)

These numbers are a bit off.

4.1 Final Table

# Assuming df1, df2, df3, etc. are the data frames you want to row-bind
table1 <- rbind(mean_call_by_race,
                mean_call_by_chicago,
                mean_call_by_boston,
                mean_call_by_female,
                mean_call_by_female_admin,
                mean_call_by_female_sales,
                mean_call_by_male)

table1 <- round(x = table1*100, digits = 2)

ratio <- round(x = table1[,2]/table1[,1],
               digits = 3
               ) # column 3

diff  <- round(x = table1[,2]-table1[,1],
               digits = 2
               ) # column 4
table1 <- cbind(table1, ratio, diff)



table1

##                              b     w ratio  diff
## mean_call_by_race         6.45  9.65 1.496  3.20
## mean_call_by_chicago      5.40  8.06 1.493  2.66
## mean_call_by_boston       7.76 11.63 1.499  3.87
## mean_call_by_female       6.63  9.89 1.492  3.26
## mean_call_by_female_admin 6.16 10.19 1.654  4.03
## mean_call_by_female_sales 7.94  7.82 0.985 -0.12
## mean_call_by_male         5.83  8.87 1.521  3.04

4.1.1 P value of the difference of All Sent Resumes by race (first row)

4.1.1.1 METHOD 1

User written command adds a continuity correction to calculate the p value for no difference in two means (null).

p-value of the observed difference is .0004, and we will reject the null of no difference between the two groups.

successes_group_white <- sum(as.integer(df$call==1 & df$race=="w"))
successes_group_black <- sum(as.integer(df$call==1 & df$race=="b"))
successes_group_white # 235

## [1] 235

successes_group_black # 157

## [1] 157

table(df$race)

## 
##    b    w 
## 2435 2435

trails_group_white <- 2435
trials_group_black <- 2435 


# Calculate proportions
prop_group_white <- successes_group_white / trails_group_white
prop_group_black <- successes_group_black / trials_group_black


# Create a 2x2 matrix (contingency table)
observed_data <-    matrix(c(successes_group_white, 
                             successes_group_black, 
                             trails_group_white - successes_group_white, 
                             trials_group_black - successes_group_black), 
                          ncol = 2)

observed_data

##      [,1] [,2]
## [1,]  235 2200
## [2,]  157 2278

# Perform a chi-squared test of proportions
?prop.test
prop.test(observed_data)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  observed_data
## X-squared = 16.449, df = 1, p-value = 4.998e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.01636705 0.04769866
## sample estimates:
##     prop 1     prop 2 
## 0.09650924 0.06447639

4.1.1.2 METHOD 2