# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 552231 29.5 1227675 65.6 NA 700254 37.4
## Vcells 1025959 7.9 8388608 64.0 49152 1963353 15.0
cat("\f") # Clear the console
The LST package provides software and datasets for Lessons in Statistical Thinking.
# Prepare needed libraries
packages <- c("haven",
"stargazer",
"visdat",
"dplyr",
"magrittr") ### if multiple packages packages <- c("stringr", "psych")
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
rm(packages)
Data from an experimental study in which researchers sent the resumes of fictitious job applicants to potential employers. The first names of the fictitious applicants was set randomly to sound either Black or white.
install.packages("devtools")
devtools::install_github("dtkaplan/LST")
library(LSTbook)
data("Callback")
head(Callback)
## name sex callback
## 1 Allison female 0
## 2 Kristen female 0
## 3 Lakisha female 0
## 4 Latonya female 0
## 5 Carrie female 0
## 6 Jay male 0
head(Names_and_race)
## # A tibble: 6 × 2
## # Groups: firstname [6]
## firstname race
## <chr> <chr>
## 1 Allison white
## 2 Kristen white
## 3 Lakisha black
## 4 Latonya black
## 5 Carrie white
## 6 Jay white
Callback
: A data frame with 4870 rows and 4 variables.
Each row is one fictitious applicant
name: first name of the fictitious job applicant
sex: sex of applicant (female or male)
callback: whether the potential employer called back to follow up. (1 = yes, 0 = no)
Another data frame -
Names_and_race
: which first names are associated with
which race.
vis_dat(Callback)
vis_dat(Names_and_race)
LST <-
merge(x = Callback,
y = Names_and_race,
by.x = c("name"),
by.y = c("firstname"),
all.x = T
)
str(LST)
## 'data.frame': 4870 obs. of 4 variables:
## $ name : chr "Aisha" "Aisha" "Aisha" "Aisha" ...
## $ sex : chr "female" "female" "female" "female" ...
## $ callback: int 0 0 0 0 0 0 0 0 0 0 ...
## $ race : chr "black" "black" "black" "black" ...
The data is only a subset of the original
data. I go to AER website and download the original data as well. It
is in .dta format (Stata file), but you ca import it into R using the
haven
package.
library(haven)
df <- read_dta("lakisha_aer.dta")
?mtcars
# install.packages("cobalt")
library("cobalt")
## cobalt (Version 4.5.5, Build Date: 2024-04-02)
covs <- subset(x = mtcars,
select = -c(am)
)
?bal.tab
bal.tab(x = covs,
treat = mtcars$am)
## Note: `s.d.denom` not specified; assuming "pooled".
## Balance Measures
## Type Diff.Un
## mpg Contin. 1.4110
## cyl Contin. -1.2078
## disp Contin. -1.4780
## hp Contin. -0.4732
## drat Contin. 2.0180
## wt Contin. -1.9349
## qsec Contin. -0.4645
## vs Binary 0.1700
## gear Contin. 2.5267
## carb Contin. 0.1070
##
## Sample sizes
## Control Treated
## All 19 13
# install.packages("modelsummary")
library(modelsummary)
datasummary_balance(formula = ~ am,
data=mtcars
)
0 | 1 | |||||
---|---|---|---|---|---|---|
Mean | Std. Dev. | Mean | Std. Dev. | Diff. in Means | Std. Error | |
mpg | 17.1 | 3.8 | 24.4 | 6.2 | 7.2 | 1.9 |
cyl | 6.9 | 1.5 | 5.1 | 1.6 | -1.9 | 0.6 |
disp | 290.4 | 110.2 | 143.5 | 87.2 | -146.8 | 35.0 |
hp | 160.3 | 53.9 | 126.8 | 84.1 | -33.4 | 26.4 |
drat | 3.3 | 0.4 | 4.0 | 0.4 | 0.8 | 0.1 |
wt | 3.8 | 0.8 | 2.4 | 0.6 | -1.4 | 0.2 |
qsec | 18.2 | 1.8 | 17.4 | 1.8 | -0.8 | 0.6 |
vs | 0.4 | 0.5 | 0.5 | 0.5 | 0.2 | 0.2 |
gear | 3.2 | 0.4 | 4.4 | 0.5 | 1.2 | 0.2 |
carb | 2.7 | 1.1 | 2.9 | 2.2 | 0.2 | 0.7 |
Lets try to recreate it/match the numbers.
mean_call_by_race <- tapply(X = df$call,
INDEX = as.factor(df$race),
FUN = mean,
na.rm = TRUE
)
# Display the result
mean_call_by_race
## b w
## 0.06447639 0.09650924
Add percentage and round to 2 decimal places as well.
# Display the formatted result
print(paste0(round(x = mean_call_by_race * 100,
digits = 2),"%"
)
)
## [1] "6.45%" "9.65%"
Now, since the numbers match, lets us generalize the code (get other rows as well) -
table(df$city)
##
## b c
## 2166 2704
df_chicago <- subset(df, city == "c")
df_boston <- subset(df, city == "b")
mean_call_by_chicago <- tapply(X = df_chicago$call,
INDEX = as.factor(df_chicago$race),
FUN = mean,
na.rm = TRUE
)
mean_call_by_boston <- tapply(X = df_boston$call,
INDEX = as.factor(df_boston$race),
FUN = mean,
na.rm = TRUE
)
# Display the formatted result
print(paste0(round(x = mean_call_by_chicago * 100, digits = 2),"%"))
## [1] "5.4%" "8.06%"
print(paste0(round(x = mean_call_by_boston * 100, digits = 2),"%"))
## [1] "7.76%" "11.63%"
# table(df_chicago$race)
# table(df_boston$race)
table(df$city)
##
## b c
## 2166 2704
df_female <- subset(df, sex == "f")
df_male <- subset(df, sex == "m")
mean_call_by_female <- tapply(X = df_female$call,
INDEX = df_female$race,
FUN = mean,
na.rm = TRUE
)
mean_call_by_male <- tapply(X = df_male$call,
INDEX = df_male$race,
FUN = mean,
na.rm = TRUE
)
# Display the formatted result
print(paste0(round(x = mean_call_by_female * 100, digits = 2),"%"))
## [1] "6.63%" "9.89%"
print(paste0(round(x = mean_call_by_male * 100, digits = 2),"%"))
## [1] "5.83%" "8.87%"
# nrow(df_female)
# 1886+1860
# table(df_female$race)
# table(df_male$race)
df %>% ls(pattern = "sales")
## [1] "branch_sales" "parent_sales" "retailsales" "salesrep"
df_female_admin <- subset(df, sex == "f" & secretary == "1" )
df_female_sales <- subset(df, sex == "f" & salesrep == "1")
# table(df_female_admin$race)
mean_call_by_female_admin <- tapply(X = df_female_admin$call,
INDEX = df_female_admin$race,
FUN = mean,
na.rm = TRUE
)
mean_call_by_female_sales <- tapply(X = df_female_sales$call,
INDEX = df_female_sales$race,
FUN = mean,
na.rm = TRUE
)
# Display the formatted result
print(paste0(round(x = mean_call_by_female_admin * 100, digits = 2),"%"))
## [1] "6.16%" "10.19%"
print(paste0(round(x = mean_call_by_female_sales * 100, digits = 2),"%"))
## [1] "7.94%" "7.82%"
# table(df_female_admin$race)
# table(df_female_sales$race)
These numbers are a bit off.
# Assuming df1, df2, df3, etc. are the data frames you want to row-bind
table1 <- rbind(mean_call_by_race,
mean_call_by_chicago,
mean_call_by_boston,
mean_call_by_female,
mean_call_by_female_admin,
mean_call_by_female_sales,
mean_call_by_male)
table1 <- round(x = table1*100, digits = 2)
ratio <- round(x = table1[,2]/table1[,1],
digits = 3
) # column 3
diff <- round(x = table1[,2]-table1[,1],
digits = 2
) # column 4
table1 <- cbind(table1, ratio, diff)
table1
## b w ratio diff
## mean_call_by_race 6.45 9.65 1.496 3.20
## mean_call_by_chicago 5.40 8.06 1.493 2.66
## mean_call_by_boston 7.76 11.63 1.499 3.87
## mean_call_by_female 6.63 9.89 1.492 3.26
## mean_call_by_female_admin 6.16 10.19 1.654 4.03
## mean_call_by_female_sales 7.94 7.82 0.985 -0.12
## mean_call_by_male 5.83 8.87 1.521 3.04
User written command adds a continuity correction to calculate the p value for no difference in two means (null).
p-value of the observed difference is .0004, and we will reject the null of no difference between the two groups.
successes_group_white <- sum(as.integer(df$call==1 & df$race=="w"))
successes_group_black <- sum(as.integer(df$call==1 & df$race=="b"))
successes_group_white # 235
## [1] 235
successes_group_black # 157
## [1] 157
table(df$race)
##
## b w
## 2435 2435
trails_group_white <- 2435
trials_group_black <- 2435
# Calculate proportions
prop_group_white <- successes_group_white / trails_group_white
prop_group_black <- successes_group_black / trials_group_black
# Create a 2x2 matrix (contingency table)
observed_data <- matrix(c(successes_group_white,
successes_group_black,
trails_group_white - successes_group_white,
trials_group_black - successes_group_black),
ncol = 2)
observed_data
## [,1] [,2]
## [1,] 235 2200
## [2,] 157 2278
# Perform a chi-squared test of proportions
?prop.test
prop.test(observed_data)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: observed_data
## X-squared = 16.449, df = 1, p-value = 4.998e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.01636705 0.04769866
## sample estimates:
## prop 1 prop 2
## 0.09650924 0.06447639
p-value of the observed difference is .0004, and we will reject the null of no difference between the two groups.
# Calculate proportions
prop_group_white <- successes_group_white / trails_group_white
prop_group_black <- successes_group_black / trials_group_black
# Standard error of the difference between proportions
se_diff <- sqrt(prop_group_white * (1 - prop_group_white) / trails_group_white + prop_group_black * (1 - prop_group_black) / trials_group_black)
# Calculate the z statistic
z_stat <- (prop_group_white - prop_group_black) / se_diff
# Two-tailed p-value
p_value <- 2 * (1 - pnorm(q = abs(z_stat),
mean = 0,
sd = 1))
# Display the result
cat("Z statistic:", z_stat, "\n")
## Z statistic: 4.11555
cat("P-value:", p_value, "\n")
## P-value: 3.862565e-05
successes_group_male_white <- sum(as.integer(df$call==1 & df$race=="w" & df$sex=="m"))
successes_group_male_black <- sum(as.integer(df$call==1 & df$race=="b" & df$sex=="m"))
successes_group_male_white # 51
## [1] 51
successes_group_male_black # 32
## [1] 32
table(df_male$race)
##
## b w
## 549 575
trails_group_male_white <- 575
trials_group_male_black <- 2549
# Calculate proportions
prop_group_male_white <- successes_group_male_white / trails_group_male_white
prop_group_male_black <- successes_group_male_black / trials_group_male_black
# Create a 2x2 matrix (contingency table)
observed_data2 <- matrix(c(successes_group_male_white,
successes_group_male_black,
trails_group_male_white - successes_group_male_white,
trials_group_male_black - successes_group_male_black),
ncol = 2)
observed_data2
## [,1] [,2]
## [1,] 51 524
## [2,] 32 2517
# Perform a chi-squared test of proportions
prop.test(observed_data2)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: observed_data2
## X-squared = 102.25, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.05143953 0.10084389
## sample estimates:
## prop 1 prop 2
## 0.08869565 0.01255394
# Standard error of the difference between proportions
se_diff2 <- sqrt(prop_group_male_white * (1 - prop_group_male_white) / trails_group_male_white + prop_group_male_black * (1 - prop_group_male_black) / trials_group_male_black)
# Calculate the z statistic
z_stat2 <- (prop_group_male_white - prop_group_male_black) / se_diff2
# Two-tailed p-value
p_value2 <- 2 * (1 - pnorm(q = abs(z_stat),
mean = 0,
sd = 1))
# Display the result
cat("Z statistic:", z_stat2, "\n")
## Z statistic: 6.313763
cat("P-value:", p_value2, "\n")
## P-value: 3.862565e-05
Bertrand, Marianne, and Mullainathan, Sendhil. Replication data for: Are Emily and Greg More Employable Than Lakisha and Jamal? A Field Experiment on Labor Market Discrimination. Nashville, TN: American Economic Association [publisher], 2004. Ann Arbor, MI: Inter-university Consortium for Political and Social Research [distributor], 2019-12-06. https://doi.org/10.3886/E116023V1
Balance Tables https://lost-stats.github.io/Presentation/Tables/Balance_Tables.html
# Import Dependency
library("cobalt")
# Load Data
data(mtcars)
# Create Balance Table
bal.tab(am ~ mpg + hp, data = mtcars)
## Note: `s.d.denom` not specified; assuming "pooled".
## Balance Measures
## Type Diff.Un
## mpg Contin. 1.4110
## hp Contin. -0.4732
##
## Sample sizes
## Control Treated
## All 19 13