# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 527985 28.2 1173894 62.7 NA 669268 35.8
## Vcells 971693 7.5 8388608 64.0 32768 1840524 14.1
cat("\f") # Clear the console
The LST package provides software and datasets for Lessons in Statistical Thinking.
# Prepare needed libraries
packages <- c("haven",
"stargazer",
"visdat",
"dplyr",
"magrittr") ### if multiple packages packages <- c("stringr", "psych")
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
rm(packages)
Data from an experimental study in which researchers sent the resumes of fictitious job applicants to potential employers. The first names of the fictitious applicants was set randomly to sound either Black or white.
# install.packages("devtools")
# devtools::install_github("dtkaplan/LST")
library(LSTbook)
##
## Attaching package: 'LSTbook'
## The following object is masked from 'package:base':
##
## sample
data("Callback")
head(Callback)
## name sex callback
## 1 Allison female 0
## 2 Kristen female 0
## 3 Lakisha female 0
## 4 Latonya female 0
## 5 Carrie female 0
## 6 Jay male 0
head(Names_and_race)
## # A tibble: 6 × 2
## # Groups: firstname [6]
## firstname race
## <chr> <chr>
## 1 Allison white
## 2 Kristen white
## 3 Lakisha black
## 4 Latonya black
## 5 Carrie white
## 6 Jay white
Callback: A data frame with 4870 rows and 4 variables.
Each row is one fictitious applicant
name: first name of the fictitious job applicant
sex: sex of applicant (female or male)
callback: whether the potential employer called back to follow up. (1 = yes, 0 = no)
Another data frame -
Names_and_race: which first names are associated with
which race.
vis_dat(Callback)
vis_dat(Names_and_race)
LST <-
merge(x = Callback,
y = Names_and_race,
by.x = c("name"),
by.y = c("firstname"),
all.x = T
)
str(LST)
## 'data.frame': 4870 obs. of 4 variables:
## $ name : chr "Aisha" "Aisha" "Aisha" "Aisha" ...
## $ sex : chr "female" "female" "female" "female" ...
## $ callback: int 0 0 0 0 0 0 0 0 0 0 ...
## $ race : chr "black" "black" "black" "black" ...
The data is only a subset of the original
data. I go to AER website and download the original data as well. It
is in .dta format (Stata file), but you ca import it into R using the
haven package.
library(haven)
df <- read_dta("lakisha_aer.dta")
library(stargazer)
# stargazer(... = as.data.frame(df),
# type = "text"
# )
Lets try to recreate it/match the numbers.
mean_call_by_race <- tapply(X = df$call,
INDEX = as.factor(df$race),
FUN = mean,
na.rm = TRUE
)
# Display the result
mean_call_by_race
## b w
## 0.06447639 0.09650924
Add percentage and round to 2 decimal places as well.
# Display the formatted result
print(paste0(round(x = mean_call_by_race * 100,
digits = 2),"%"
)
)
## [1] "6.45%" "9.65%"
Now, since the numbers match, lets us generalize the code (get other rows as well) -
table(df$city)
##
## b c
## 2166 2704
df_chicago <- subset(df, city == "c")
df_boston <- subset(df, city == "b")
mean_call_by_chicago <- tapply(X = df_chicago$call,
INDEX = as.factor(df_chicago$race),
FUN = mean,
na.rm = TRUE
)
mean_call_by_boston <- tapply(X = df_boston$call,
INDEX = as.factor(df_boston$race),
FUN = mean,
na.rm = TRUE
)
# Display the formatted result
print(paste0(round(x = mean_call_by_chicago * 100, digits = 2),"%"))
## [1] "5.4%" "8.06%"
print(paste0(round(x = mean_call_by_boston * 100, digits = 2),"%"))
## [1] "7.76%" "11.63%"
# table(df_chicago$race)
# table(df_boston$race)
table(df$city)
##
## b c
## 2166 2704
df_female <- subset(df, sex == "f")
df_male <- subset(df, sex == "m")
mean_call_by_female <- tapply(X = df_female$call,
INDEX = df_female$race,
FUN = mean,
na.rm = TRUE
)
mean_call_by_male <- tapply(X = df_male$call,
INDEX = df_male$race,
FUN = mean,
na.rm = TRUE
)
# Display the formatted result
print(paste0(round(x = mean_call_by_female * 100, digits = 2),"%"))
## [1] "6.63%" "9.89%"
print(paste0(round(x = mean_call_by_male * 100, digits = 2),"%"))
## [1] "5.83%" "8.87%"
# nrow(df_female)
# 1886+1860
# table(df_female$race)
# table(df_male$race)
df %>% ls(pattern = "sales")
## [1] "branch_sales" "parent_sales" "retailsales" "salesrep"
df_female_admin <- subset(df, sex == "f" & secretary == "1" )
df_female_sales <- subset(df, sex == "f" & salesrep == "1")
# table(df_female_admin$race)
mean_call_by_female_admin <- tapply(X = df_female_admin$call,
INDEX = df_female_admin$race,
FUN = mean,
na.rm = TRUE
)
mean_call_by_female_sales <- tapply(X = df_female_sales$call,
INDEX = df_female_sales$race,
FUN = mean,
na.rm = TRUE
)
# Display the formatted result
print(paste0(round(x = mean_call_by_female_admin * 100, digits = 2),"%"))
## [1] "6.16%" "10.19%"
print(paste0(round(x = mean_call_by_female_sales * 100, digits = 2),"%"))
## [1] "7.94%" "7.82%"
# table(df_female_admin$race)
# table(df_female_sales$race)
These numbers are a bit off.
# Assuming df1, df2, df3, etc. are the data frames you want to row-bind
table1 <- rbind(mean_call_by_race,
mean_call_by_chicago,
mean_call_by_boston,
mean_call_by_female,
mean_call_by_female_admin,
mean_call_by_female_sales,
mean_call_by_male)
table1 <- round(x = table1*100, digits = 2)
ratio <- round(x = table1[,2]/table1[,1],
digits = 3
) # column 3
diff <- round(x = table1[,2]-table1[,1],
digits = 2
) # column 4
table1 <- cbind(table1, ratio, diff)
table1
## b w ratio diff
## mean_call_by_race 6.45 9.65 1.496 3.20
## mean_call_by_chicago 5.40 8.06 1.493 2.66
## mean_call_by_boston 7.76 11.63 1.499 3.87
## mean_call_by_female 6.63 9.89 1.492 3.26
## mean_call_by_female_admin 6.16 10.19 1.654 4.03
## mean_call_by_female_sales 7.94 7.82 0.985 -0.12
## mean_call_by_male 5.83 8.87 1.521 3.04
User written command adds a continuity correction to calculate the p value for no difference in two means (null).
p-value of the observed difference is .0004, and we will reject the null of no difference between the two groups.
successes_group_white <- sum(as.integer(df$call==1 & df$race=="w"))
successes_group_black <- sum(as.integer(df$call==1 & df$race=="b"))
successes_group_white # 235
## [1] 235
successes_group_black # 157
## [1] 157
table(df$race)
##
## b w
## 2435 2435
trails_group_white <- 2435
trials_group_black <- 2435
# Calculate proportions
prop_group_white <- successes_group_white / trails_group_white
prop_group_black <- successes_group_black / trials_group_black
# Create a 2x2 matrix (contingency table)
observed_data <- matrix(c(successes_group_white,
successes_group_black,
trails_group_white - successes_group_white,
trials_group_black - successes_group_black),
ncol = 2)
observed_data
## [,1] [,2]
## [1,] 235 2200
## [2,] 157 2278
# Perform a chi-squared test of proportions
?prop.test
prop.test(observed_data)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: observed_data
## X-squared = 16.449, df = 1, p-value = 4.998e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.01636705 0.04769866
## sample estimates:
## prop 1 prop 2
## 0.09650924 0.06447639
p-value of the observed difference is .0004, and we will reject the null of no difference between the two groups.
# Calculate proportions
prop_group_white <- successes_group_white / trails_group_white
prop_group_black <- successes_group_black / trials_group_black
# Standard error of the difference between proportions
se_diff <- sqrt(prop_group_white * (1 - prop_group_white) / trails_group_white + prop_group_black * (1 - prop_group_black) / trials_group_black)
# Calculate the z statistic
z_stat <- (prop_group_white - prop_group_black) / se_diff
# Two-tailed p-value
p_value <- 2 * (1 - pnorm(q = abs(z_stat),
mean = 0,
sd = 1))
# Display the result
cat("Z statistic:", z_stat, "\n")
## Z statistic: 4.11555
cat("P-value:", p_value, "\n")
## P-value: 3.862565e-05
successes_group_male_white <- sum(as.integer(df$call==1 & df$race=="w" & df$sex=="m"))
successes_group_male_black <- sum(as.integer(df$call==1 & df$race=="b" & df$sex=="m"))
successes_group_male_white # 51
## [1] 51
successes_group_male_black # 32
## [1] 32
table(df_male$race)
##
## b w
## 549 575
trails_group_male_white <- 575
trials_group_male_black <- 2549
# Calculate proportions
prop_group_male_white <- successes_group_male_white / trails_group_male_white
prop_group_male_black <- successes_group_male_black / trials_group_male_black
# Create a 2x2 matrix (contingency table)
observed_data2 <- matrix(c(successes_group_male_white,
successes_group_male_black,
trails_group_male_white - successes_group_male_white,
trials_group_male_black - successes_group_male_black),
ncol = 2)
observed_data2
## [,1] [,2]
## [1,] 51 524
## [2,] 32 2517
# Perform a chi-squared test of proportions
prop.test(observed_data2)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: observed_data2
## X-squared = 102.25, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.05143953 0.10084389
## sample estimates:
## prop 1 prop 2
## 0.08869565 0.01255394
# Standard error of the difference between proportions
se_diff2 <- sqrt(prop_group_male_white * (1 - prop_group_male_white) / trails_group_male_white + prop_group_male_black * (1 - prop_group_male_black) / trials_group_male_black)
# Calculate the z statistic
z_stat2 <- (prop_group_male_white - prop_group_male_black) / se_diff2
# Two-tailed p-value
p_value2 <- 2 * (1 - pnorm(q = abs(z_stat),
mean = 0,
sd = 1))
# Display the result
cat("Z statistic:", z_stat2, "\n")
## Z statistic: 6.313763
cat("P-value:", p_value2, "\n")
## P-value: 3.862565e-05