1 Set Up

# Clear the workspace

  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory

##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 527985 28.2    1173894 62.7         NA   669268 35.8
## Vcells 971693  7.5    8388608 64.0      32768  1840524 14.1

  cat("\f")       # Clear the console

The LST package provides software and datasets for Lessons in Statistical Thinking.

# Prepare needed libraries

  packages <- c("haven",
                 "stargazer", 
                 "visdat", 
                 "dplyr", 
                 "magrittr")    ### if multiple packages packages <- c("stringr", "psych")

  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

  rm(packages)

2 Data

2.1 Partial Data

Data from an experimental study in which researchers sent the resumes of fictitious job applicants to potential employers. The first names of the fictitious applicants was set randomly to sound either Black or white.

# install.packages("devtools")
# devtools::install_github("dtkaplan/LST")

library(LSTbook)

## 
## Attaching package: 'LSTbook'

## The following object is masked from 'package:base':
## 
##     sample

data("Callback")
head(Callback)

##      name    sex callback
## 1 Allison female        0
## 2 Kristen female        0
## 3 Lakisha female        0
## 4 Latonya female        0
## 5  Carrie female        0
## 6     Jay   male        0

head(Names_and_race)

## # A tibble: 6 × 2
## # Groups:   firstname [6]
##   firstname race 
##   <chr>     <chr>
## 1 Allison   white
## 2 Kristen   white
## 3 Lakisha   black
## 4 Latonya   black
## 5 Carrie    white
## 6 Jay       white

Callback: A data frame with 4870 rows and 4 variables. Each row is one fictitious applicant

name: first name of the fictitious job applicant
sex: sex of applicant (female or male)
callback: whether the potential employer called back to follow up. (1 = yes, 0 = no)

Another data frame -

Names_and_race: which first names are associated with which race.

vis_dat(Callback)

vis_dat(Names_and_race)

LST <- 
merge(x = Callback,
      y = Names_and_race, 
      by.x = c("name"), 
      by.y = c("firstname"), 
      all.x = T
      )

str(LST)

## 'data.frame':    4870 obs. of  4 variables:
##  $ name    : chr  "Aisha" "Aisha" "Aisha" "Aisha" ...
##  $ sex     : chr  "female" "female" "female" "female" ...
##  $ callback: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ race    : chr  "black" "black" "black" "black" ...

2.2 Full (original) Data

The data is only a subset of the original data. I go to AER website and download the original data as well. It is in .dta format (Stata file), but you ca import it into R using the haven package.

library(haven) 
df <- read_dta("lakisha_aer.dta")

library(stargazer)

# stargazer(...  = as.data.frame(df), 
#           type = "text"
#           )

3 Table 1

Lets try to recreate it/match the numbers.

3.0.1 All sent resumes

mean_call_by_race <- tapply(X     = df$call, 
                            INDEX = as.factor(df$race), 
                            FUN   = mean, 
                            na.rm = TRUE
                            )
# Display the result
mean_call_by_race

##          b          w 
## 0.06447639 0.09650924

Add percentage and round to 2 decimal places as well.

# Display the formatted result 
print(paste0(round(x = mean_call_by_race * 100, 
                   digits = 2),"%"
             )
      )

## [1] "6.45%" "9.65%"

Now, since the numbers match, lets us generalize the code (get other rows as well) -

3.0.2 Cities - Chicago and Boston

table(df$city)

## 
##    b    c 
## 2166 2704

df_chicago <- subset(df, city == "c")
df_boston <-  subset(df, city == "b")

mean_call_by_chicago <- tapply(X    = df_chicago$call, 
                              INDEX = as.factor(df_chicago$race), 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

mean_call_by_boston <- tapply(X    = df_boston$call, 
                              INDEX = as.factor(df_boston$race), 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

# Display the formatted result
print(paste0(round(x = mean_call_by_chicago * 100, digits = 2),"%"))

## [1] "5.4%"  "8.06%"

print(paste0(round(x = mean_call_by_boston  * 100, digits = 2),"%"))

## [1] "7.76%"  "11.63%"

# table(df_chicago$race)
# table(df_boston$race)

3.0.3 Gender

table(df$city)

## 
##    b    c 
## 2166 2704

df_female <- subset(df, sex == "f")
df_male   <- subset(df, sex == "m")

mean_call_by_female <- tapply(X    =  df_female$call, 
                              INDEX = df_female$race, 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

mean_call_by_male <- tapply(X       = df_male$call, 
                              INDEX = df_male$race, 
                              FUN   = mean, 
                              na.rm = TRUE
                              )

# Display the formatted result
print(paste0(round(x = mean_call_by_female * 100, digits = 2),"%"))

## [1] "6.63%" "9.89%"

print(paste0(round(x = mean_call_by_male   * 100, digits = 2),"%"))

## [1] "5.83%" "8.87%"

# nrow(df_female)
# 1886+1860
# table(df_female$race) 
# table(df_male$race)

3.0.4 Female - Occupation

df %>% ls(pattern = "sales")

## [1] "branch_sales" "parent_sales" "retailsales"  "salesrep"

df_female_admin <- subset(df, sex == "f" & secretary == "1" )
df_female_sales <- subset(df, sex == "f" & salesrep  == "1")

# table(df_female_admin$race) 

mean_call_by_female_admin <- tapply(X    =  df_female_admin$call, 
                                   INDEX = df_female_admin$race, 
                                   FUN   = mean, 
                                   na.rm = TRUE
                                  )

mean_call_by_female_sales <- tapply(X      = df_female_sales$call, 
                                     INDEX = df_female_sales$race, 
                                     FUN   = mean, 
                                     na.rm = TRUE
                                    )

# Display the formatted result
print(paste0(round(x = mean_call_by_female_admin * 100, digits = 2),"%"))

## [1] "6.16%"  "10.19%"

print(paste0(round(x = mean_call_by_female_sales * 100, digits = 2),"%"))

## [1] "7.94%" "7.82%"

# table(df_female_admin$race)
# table(df_female_sales$race)

These numbers are a bit off.

3.1 Final Table

# Assuming df1, df2, df3, etc. are the data frames you want to row-bind
table1 <- rbind(mean_call_by_race,
                mean_call_by_chicago,
                mean_call_by_boston,
                mean_call_by_female,
                mean_call_by_female_admin,
                mean_call_by_female_sales,
                mean_call_by_male)

table1 <- round(x = table1*100, digits = 2)

ratio <- round(x = table1[,2]/table1[,1],
               digits = 3
               ) # column 3

diff  <- round(x = table1[,2]-table1[,1],
               digits = 2
               ) # column 4
table1 <- cbind(table1, ratio, diff)



table1

##                              b     w ratio  diff
## mean_call_by_race         6.45  9.65 1.496  3.20
## mean_call_by_chicago      5.40  8.06 1.493  2.66
## mean_call_by_boston       7.76 11.63 1.499  3.87
## mean_call_by_female       6.63  9.89 1.492  3.26
## mean_call_by_female_admin 6.16 10.19 1.654  4.03
## mean_call_by_female_sales 7.94  7.82 0.985 -0.12
## mean_call_by_male         5.83  8.87 1.521  3.04

3.1.1 P value of the difference of All Sent Resumes by race (first row)

3.1.1.1 METHOD 1

User written command adds a continuity correction to calculate the p value for no difference in two means (null).

p-value of the observed difference is .0004, and we will reject the null of no difference between the two groups.

successes_group_white <- sum(as.integer(df$call==1 & df$race=="w"))
successes_group_black <- sum(as.integer(df$call==1 & df$race=="b"))
successes_group_white # 235

## [1] 235

successes_group_black # 157

## [1] 157

table(df$race)

## 
##    b    w 
## 2435 2435

trails_group_white <- 2435
trials_group_black <- 2435 


# Calculate proportions
prop_group_white <- successes_group_white / trails_group_white
prop_group_black <- successes_group_black / trials_group_black


# Create a 2x2 matrix (contingency table)
observed_data <-    matrix(c(successes_group_white, 
                             successes_group_black, 
                             trails_group_white - successes_group_white, 
                             trials_group_black - successes_group_black), 
                          ncol = 2)

observed_data

##      [,1] [,2]
## [1,]  235 2200
## [2,]  157 2278

# Perform a chi-squared test of proportions
?prop.test
prop.test(observed_data)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  observed_data
## X-squared = 16.449, df = 1, p-value = 4.998e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.01636705 0.04769866
## sample estimates:
##     prop 1     prop 2 
## 0.09650924 0.06447639

3.1.1.2 METHOD 2