library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

QUESTION 1-1

set.seed(0)
n <- 1000

X <- rnorm(n, mean = 1, sd = 2)

Simulating the trained and untrained errors along with the wage equations.

untrainederror <- rnorm(n, mean = 0, sd = 1)
untrainedwage <- (5 + 2*X + untrainederror)

trainederror <- rnorm(n, mean = 0, sd = 1)
trainedwage <- (8 + 6*X + trainederror)

Calculating the average treatment effect (ATE), using the difference.

ATE1 <- mean(trainedwage - untrainedwage)

Summary of all measures found.

summary(untrainedwage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -6.732   4.003   6.758   6.912   9.717  20.869
summary(trainedwage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -24.67    5.31   13.30   13.88   22.35   54.11
ATE1
## [1] 6.966293

QUESTION 1-2

Here, I separated the simulated X into that above the mean of 1 and that below. Then I recalculated the ATE, separated into these two blocks. Finally, I found the proportion of the simulated X that was in both categories.

abovemean <- X > 1
ATE1above <- mean(trainedwage[abovemean] - untrainedwage[abovemean])
belowmean <- X < 1
ATE1below <- mean(trainedwage[belowmean] - untrainedwage[belowmean])

proportionabove <- mean(abovemean)
proportionbelow <- mean(belowmean)

The new total can be found by multiplying these two new ATE’s by the proportion I found, then adding. The result that allowed me to do this calculation is the property of the linearity of expectation, or in other words, the ability to separate the expectation into blocks and multiply each by the proportion in the block.

ATE1newtotal <- ATE1above * proportionabove + ATE1below * proportionbelow

QUESTION 1-3

Here, I simulated epsilon and U for this altered simulation.

epsilon <- rnorm(n, mean = 0, sd = 1)
U <- rnorm(n, mean = 0, sd = 1)

This is the code we were given. Then, I found a new ATE observed from finding the difference, similar to before.

D <- ifelse(X + U > 1, 1, 0) # Treatment assignment
Y0 <- 5 + 2 * (X + U) + epsilon # Outcome without treatment
Y1 <- 8 + 6 * (X + U) + epsilon # Outcome with treatment
Y <- ifelse(D == 1, Y1, Y0) # Observed outcome

ATEobserved <- mean(Y[D == 1]) - mean(Y[D == 0])
ATEobserved
## [1] 21.75732

QUESTION 1-4

Based on the above results, I would propose the relevant estimator for selection bias to be the difference between the observed ATE and the true ATE which represents the true causal effect.

The sign of the selection bias is represented in our scenario around X+U. Workers receive training only if this greater than 1, therefore when this is higher, workers are more likely to be trained. The selection bias means that our observed ATE overestimates the true effect of training.

QUESTION 1-5

This is the code similar to the previous question, but with having Nsim as 5000.

set.seed(0)
n <- 1000
Nsim <- 5000

observedATE <- numeric(Nsim)
trueATE <- numeric(Nsim)
bias <- numeric(Nsim)

for (i in 1:Nsim) {
  X2 <- rnorm(n, mean = 1, sd = 2)
  U2 <- rnorm(n, mean = 0, sd = 1)
  
  D <- ifelse(X + U > 1, 1, 0)
  
  epsilon <- rnorm(n, mean = 0, sd = 1)
  
  # Potential outcomes
  Y0 <- 5 + 2 * (X2 + U2) + epsilon
  Y1 <- 8 + 6 * (X2 + U2) + epsilon
  
  # Observed outcome
  Y <- ifelse(D == 1, Y1, Y0)
  
  # Calculate ATEs
  observedATE[i] <- mean(Y[D == 1]) - mean(Y[D == 0])
  trueATE[i] <- mean(Y1 - Y0)
  bias[i] <- observedATE[i] - trueATE[i]
}

sim_results <- data.frame(
  ObservedATE = observedATE,
  TrueATE = trueATE,
  Bias = bias
)

Here are the plotted results of the data frame.

ggplot(sim_results, aes(x = ObservedATE)) +
  geom_histogram(fill = "navy", bins = 50, alpha = 0.6) +
  labs(title = "Distribution of Observed ATE", x = "Observed ATE", y = "Frequency") +
  theme_minimal()

# Histogram of True ATE
ggplot(sim_results, aes(x = TrueATE)) +
  geom_histogram(fill = "salmon", bins = 50, alpha = 0.6) +
  labs(title = "Distribution of True ATE", x = "True ATE", y = "Frequency") +
  theme_minimal()

# Histogram of Selection Bias
ggplot(sim_results, aes(x = Bias)) +
  geom_histogram(fill = "darkgreen", bins = 50, alpha = 0.6) +
  labs(title = "Distribution of Selection Bias", x = "Selection Bias", y = "Frequency") +
  theme_minimal()

Observations: The True ATE is clustered around the expected causal effect. The Observed ATE distribution is longer than the True ATE because allows for workers to have higher initial outcomes. The bias is fairly split, but majority positive, indicating the selection bias with the Observed ATE.

QUESTION 2-1

Load and rename data.

wage_data <- read.csv("/Users/rileystern/Downloads/WAGEDATA.csv", 
                      stringsAsFactors = FALSE)
head(wage_data)
##   sex age marst  incwage education
## 1   1  32     1 69568.85         1
## 2   2  39     2 30756.76         1
## 3   1  47     2 29292.15         1
## 4   1  34     1 73230.37         1
## 5   2  39     4 34418.28         3
## 6   1  32     4 29292.15         1

Filter to include just ages 30-39. Find mean wages for married and never married men and women.

wage30_39 <- wage_data %>%
  filter(age >= 30 & age <= 39)

mean_wages <- wage30_39 %>%
  filter(marst %in% c(1, 4)) %>%   
  group_by(marst, sex) %>%
  summarise(mean_incwage = mean(incwage, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'marst'. You can override using the
## `.groups` argument.
mean_wages
## # A tibble: 4 × 3
##   marst   sex mean_incwage
##   <int> <int>        <dbl>
## 1     1     1       50739.
## 2     1     2       44733.
## 3     4     1       40819.
## 4     4     2       38605.

Now, find the gaps for each.

married_gap <- mean_wages %>%
  filter(marst == 1) %>%
  summarise(wage_gap = mean_incwage[sex == 1] - mean_incwage[sex == 2]) %>%
  pull(wage_gap)


never_married_gap <- mean_wages %>%
  filter(marst == 4) %>%
  summarise(wage_gap = mean_incwage[sex == 1] - mean_incwage[sex == 2]) %>%
  pull(wage_gap)

married_gap
## [1] 6005.845
never_married_gap
## [1] 2213.474

The gap between married men and women is much larger, signifying that marital status may be associated with a higher gender wage gap.

QUESTION 2-2

This difference may suggest that underlying sources of the wage gap have to do with other income sources within a household or other familial responsibilities such as childcare.

QUESTION 2-3

Here, I isolated the four categories of married men, married women, single men, and single women.

married_men <- wage30_39 %>% filter(marst == 1, sex == 1) %>% pull(incwage)
married_women <- wage30_39 %>% filter(marst == 1, sex == 2) %>% pull(incwage)
single_men <- wage30_39 %>% filter(marst == 4, sex == 1) %>% pull(incwage)
single_women <- wage30_39 %>% filter(marst == 4, sex == 2) %>% pull(incwage)

Then, I created vectors that represent the differences between these corresponding differences.

married_gap_vector <- married_men - married_women[1:length(married_men)]
single_gap_vector <- single_men - single_women[1:length(single_men)]

Then, I conducted a T-Test to test the null hypothesis that the wage gaps are the same.

t_test_result <- t.test(married_gap_vector, single_gap_vector, alternative = "two.sided")
t_test_result
## 
##  Welch Two Sample t-test
## 
## data:  married_gap_vector and single_gap_vector
## t = 12.889, df = 45831, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2908.628 3951.933
## sample estimates:
## mean of x mean of y 
##  5924.990  2494.709

The results indicate the wage gap is statistically significantly higher for married men and women then single.

QUESTION 2-4 Here, I found new mean values for college educated men and women specifically. Also, the wage gap between the two.

college <- subset(wage_data, education > 3)
mean_wage_male_college <- mean(college$incwage[college$sex == 1])
mean_wage_female_college <- mean(college$incwage[college$sex == 2])
gap_college <- mean_wage_male_college - mean_wage_female_college

Then, I conducted a repeat t-test. This showed that on average college educated men earn about $5000 dollars more. These results are also highly statistically significant. Meaning, educating does not necessarily eliminate the wage gap.

t.test(college$incwage ~ college$sex)
## 
##  Welch Two Sample t-test
## 
## data:  college$incwage by college$sex
## t = 19.038, df = 28191, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  4496.111 5528.139
## sample estimates:
## mean in group 1 mean in group 2 
##        63602.24        58590.11

Here, I again found new mean values, but specifically for non-college educated men and women.

no_college <- subset(wage_data, education <= 3)
mean_wage_male_nocol <- mean(no_college$incwage[no_college$sex == 1])
mean_wage_female_nocol <- mean(no_college$incwage[no_college$sex == 0])
gap_nocollege <- mean_wage_male_nocol - mean_wage_female_nocol

I again conducted a repeat t-test. These results showed that on average, non college educated men earn about $7000 dollars more. These results are also extremely statistically significant. This indicates that a lack of college education does not even out the wage gap.

QUESTION 2-5 Here, I calculated the differences in gender wage gap amongst the group of never married, divorced, and widowed people. The results again show a similar statistically significant gap between men and women.

wd <- subset(wage_data, marst %in% c(2, 3, 4))

wd$incwage <- as.numeric(as.character(wd$incwage))

mean_wage_male_wd <- mean(wd$incwage[wd$sex == 1], na.rm = TRUE)
mean_wage_female_wd <- mean(wd$incwage[wd$sex == 2], na.rm = TRUE)
gap_wd <- mean_wage_male_wd - mean_wage_female_wd
gap_wd
## [1] 2947.26
t.test(wd$incwage ~ wd$sex)
## 
##  Welch Two Sample t-test
## 
## data:  wd$incwage by wd$sex
## t = 22.082, df = 92549, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  2685.663 3208.857
## sample estimates:
## mean in group 1 mean in group 2 
##        41617.19        38669.93

QUESTION 3-1 According to the paper, labor force survey data is fairly limited and does not include all the characteristics employers take into account when making hiring or salary choices. These can include appearance, name, etc. Hiring can be very subjective. Therefore, using just survey data makes it hard to compare.

QUESTION 3-2 No, even with the best dataset and no moral constraints we can not observe this difference. Why? Because there is no way to simulate a reality that may occur if a black person were white or vice versa. The opposite outcome is counterfactual.

QUESTION 3-3 Yes, there are also other unobserved factors. Race has been linked to other factors like educational attainment, income, human capital, access to proper healthcare, etc. As shown by the given table, race is highly correlated to college education status. Therefore, the difference between the two groups is likely not just race, but a slew of other factors as well. It is difficult to isolate the discriminatory affect of race alone.

QUESTION 3-4 Since we can not observe direct counterfactual outcomes from real data points- i.e.  if the same person were black and white applying for the same job, the randomization done by BM, allows for all factors to be randomly selected, except for the racially sounding names. That way, every white or black assigned resume only differs in name, not the other qualifications and factors. This allows us to directly compare callback rates for race in this simulated dataset.

QUESTION 3-5

Here, I downloaded and loaded the BM data set.

install.packages("haven")
## 
## The downloaded binary packages are in
##  /var/folders/rx/1zr50b_d1s78nny7x6_83j5m0000gn/T//RtmpdpnQ3p/downloaded_packages
library(haven)


BM_data <- read_dta("/Users/rileystern/Downloads/BM2004.dta")

str(BM_data)
## tibble [4,870 × 65] (S3: tbl_df/tbl/data.frame)
##  $ id                : chr [1:4870] "b" "b" "b" "b" ...
##   ..- attr(*, "label")= chr "ID"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ ad                : chr [1:4870] "1" "1" "1" "1" ...
##   ..- attr(*, "label")= chr "Ad #"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ education         : num [1:4870] 4 3 4 3 3 4 4 3 4 4 ...
##   ..- attr(*, "label")= chr "0=not reported; 1=HSD; 2=HSG; 3=some col; 4=col +"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ ofjobs            : num [1:4870] 2 3 1 4 3 2 2 4 3 2 ...
##   ..- attr(*, "label")= chr "number of jobs listed on resume"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ yearsexp          : num [1:4870] 6 6 6 6 22 6 5 21 3 6 ...
##   ..- attr(*, "label")= chr "number of years of work experience on the resume"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ honors            : num [1:4870] 0 0 0 0 0 1 0 0 0 0 ...
##   ..- attr(*, "label")= chr "1=resume mentions some honors"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ volunteer         : num [1:4870] 0 1 0 1 0 0 1 1 0 1 ...
##   ..- attr(*, "label")= chr "1=resume mentions some volunteering experience"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ military          : num [1:4870] 0 1 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "1=applicant has some military experience"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ empholes          : num [1:4870] 1 0 0 1 0 0 0 1 0 0 ...
##   ..- attr(*, "label")= chr "1=resume has some employment holes"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ occupspecific     : num [1:4870] 17 316 19 313 313 266 13 313 316 263 ...
##   ..- attr(*, "label")= chr "Occup. Specific"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ occupbroad        : num [1:4870] 1 6 1 5 5 4 1 5 6 4 ...
##   ..- attr(*, "label")= chr "Occup. Broad"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ workinschool      : num [1:4870] 0 1 1 0 1 0 1 0 0 1 ...
##   ..- attr(*, "label")= chr "1=resume mentions some work experience while at school"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ email             : num [1:4870] 0 1 0 1 1 0 1 1 0 1 ...
##   ..- attr(*, "label")= chr "1=email address on applicant's resume"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ computerskills    : num [1:4870] 1 1 1 1 1 0 1 1 1 0 ...
##   ..- attr(*, "label")= chr "1=resume mentions some computer skills"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ specialskills     : num [1:4870] 0 0 0 1 0 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "1=resume mentions some special skills"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ firstname         : chr [1:4870] "Allison" "Kristen" "Lakisha" "Latonya" ...
##   ..- attr(*, "label")= chr "applicant's first name"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ sex               : chr [1:4870] "f" "f" "f" "f" ...
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ race              : chr [1:4870] "w" "w" "b" "b" ...
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ h                 : num [1:4870] 0 1 0 1 1 0 1 1 0 1 ...
##   ..- attr(*, "label")= chr "1=high quality resume"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ l                 : num [1:4870] 1 0 1 0 0 1 0 0 1 0 ...
##   ..- attr(*, "label")= chr "1=low quality resume"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ call              : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "1=applicant was called back"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ city              : chr [1:4870] "c" "c" "c" "c" ...
##   ..- attr(*, "label")= chr "c=chicago; b=boston"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ kind              : chr [1:4870] "a" "a" "a" "a" ...
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ adid              : num [1:4870] 384 384 384 384 385 386 386 385 386 386 ...
##   ..- attr(*, "label")= chr "employment ad identifier"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fracblack         : num [1:4870] 0.9894 0.0807 0.1043 0.3362 0.3976 ...
##   ..- attr(*, "label")= chr "frac blacks in applicant's zip"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fracwhite         : num [1:4870] 0.0055 0.8884 0.8374 0.6374 0.1802 ...
##   ..- attr(*, "label")= chr "frac whites in applicant's zip"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ lmedhhinc         : num [1:4870] 9.53 10.41 10.47 10.43 9.88 ...
##   ..- attr(*, "label")= chr "log median HH income in applicant's zip"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fracdropout       : num [1:4870] 0.274 0.234 0.101 0.109 0.313 ...
##   ..- attr(*, "label")= chr "frac HS dropouts in applicant's zip"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fraccolp          : num [1:4870] 0.0377 0.0873 0.5917 0.4066 0.0308 ...
##   ..- attr(*, "label")= chr "frac college educ or more in applicant's zip"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ linc              : num [1:4870] 8.71 9.53 10.54 10.41 8.73 ...
##   ..- attr(*, "label")= chr "log per capita income in applicant's zip"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ col               : num [1:4870] 1 0 1 0 0 1 1 0 1 1 ...
##   ..- attr(*, "label")= chr "applicant has college degree or more"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ expminreq         : chr [1:4870] "5" "5" "5" "5" ...
##   ..- attr(*, "label")= chr "min experience required, if any"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ schoolreq         : chr [1:4870] "" "" "" "" ...
##   ..- attr(*, "label")= chr "specific education requirement, if any"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ eoe               : num [1:4870] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "1=ad mentions employer is EOE"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ parent_sales      : num [1:4870] NA NA NA NA 9.4 ...
##   ..- attr(*, "label")= chr "sales of parent company"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ parent_emp        : num [1:4870] NA NA NA NA 143 135 135 143 135 135 ...
##   ..- attr(*, "label")= chr "employment of parent company"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ branch_sales      : num [1:4870] NA NA NA NA 9.4 ...
##   ..- attr(*, "label")= chr "sales of branch"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ branch_emp        : num [1:4870] NA NA NA NA 143 135 135 143 135 135 ...
##   ..- attr(*, "label")= chr "emp of branch"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fed               : num [1:4870] NA NA NA NA 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "employer is federal contractor"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fracblack_empzip  : num [1:4870] NA NA NA NA 0.205 ...
##   ..- attr(*, "label")= chr "fraction blacks in employers's zipcode"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fracwhite_empzip  : num [1:4870] NA NA NA NA 0.727 ...
##   ..- attr(*, "label")= chr "fraction whites in employer's zipcode"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ lmedhhinc_empzip  : num [1:4870] NA NA NA NA 10.6 ...
##   ..- attr(*, "label")= chr "log median HH income in employer's zipcode"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fracdropout_empzip: num [1:4870] NA NA NA NA 0.0705 ...
##   ..- attr(*, "label")= chr "fraction HS dropouts in employer's zipcode"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ fraccolp_empzip   : num [1:4870] NA NA NA NA 0.37 ...
##   ..- attr(*, "label")= chr "fraction college or more in employer's  zipcode"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ linc_empzip       : num [1:4870] NA NA NA NA 10 ...
##   ..- attr(*, "label")= chr "log per capita income in employer's zipcode"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ manager           : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "manager wanted"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ supervisor        : num [1:4870] 1 1 1 1 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "supervisor wanted"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ secretary         : num [1:4870] 0 0 0 0 1 0 0 1 0 0 ...
##   ..- attr(*, "label")= chr "secretary wanted"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ offsupport        : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ salesrep          : num [1:4870] 0 0 0 0 0 1 1 0 1 1 ...
##   ..- attr(*, "label")= chr "sales representative wanted"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ retailsales       : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "retail sales worker wanted"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ req               : num [1:4870] 1 1 1 1 1 0 0 1 0 0 ...
##   ..- attr(*, "label")= chr "1=ad mentions any requirement for job"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ expreq            : num [1:4870] 1 1 1 1 1 0 0 1 0 0 ...
##   ..- attr(*, "label")= chr "1=ad mentions some experience requirement"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ comreq            : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "1=ad mentions some communication skills requirement"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ educreq           : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "1=ad mentions some educational requirement"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ compreq           : num [1:4870] 1 1 1 1 1 0 0 1 0 0 ...
##   ..- attr(*, "label")= chr "1=ad mentions some computer skill requirement"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ orgreq            : num [1:4870] 0 0 0 0 1 0 0 1 0 0 ...
##   ..- attr(*, "label")= chr "1=ad mentions some organizational skills requirement"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ manuf             : num [1:4870] 1 1 1 1 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "employer industry is manufacturing"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ transcom          : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "employer industry is transport/communication"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ bankreal          : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "employer industry is finance, insurance, real estate"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ trade             : num [1:4870] 0 0 0 0 0 1 1 0 1 1 ...
##   ..- attr(*, "label")= chr "employer industry is wholesale or retail trade"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ busservice        : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "employer industry is business and personal  services"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ othservice        : num [1:4870] 0 0 0 0 1 0 0 1 0 0 ...
##   ..- attr(*, "label")= chr "employer industry is health, educ. and social  services"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ missind           : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "employer industry is other/unknown"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ ownership         : chr [1:4870] "" "" "" "" ...
##   ..- attr(*, "label")= chr "ownership status of employer"
##   ..- attr(*, "format.stata")= chr "%10s"
head(BM_data)
## # A tibble: 6 × 65
##   id    ad    education ofjobs yearsexp honors volunteer military empholes
##   <chr> <chr>     <dbl>  <dbl>    <dbl>  <dbl>     <dbl>    <dbl>    <dbl>
## 1 b     1             4      2        6      0         0        0        1
## 2 b     1             3      3        6      0         1        1        0
## 3 b     1             4      1        6      0         0        0        0
## 4 b     1             3      4        6      0         1        0        1
## 5 b     1             3      3       22      0         0        0        0
## 6 b     1             4      2        6      1         0        0        0
## # ℹ 56 more variables: occupspecific <dbl>, occupbroad <dbl>,
## #   workinschool <dbl>, email <dbl>, computerskills <dbl>, specialskills <dbl>,
## #   firstname <chr>, sex <chr>, race <chr>, h <dbl>, l <dbl>, call <dbl>,
## #   city <chr>, kind <chr>, adid <dbl>, fracblack <dbl>, fracwhite <dbl>,
## #   lmedhhinc <dbl>, fracdropout <dbl>, fraccolp <dbl>, linc <dbl>, col <dbl>,
## #   expminreq <chr>, schoolreq <chr>, eoe <dbl>, parent_sales <dbl>,
## #   parent_emp <dbl>, branch_sales <dbl>, branch_emp <dbl>, fed <dbl>, …

Here, I produced the first two columns of the first row on page 997.These measure as a 6.45% call back rate for “black” resumes and a 9.65% for “white” ones.

BM_data %>%
  select(race, call) %>%          
  group_by(race) %>%              
  summarise(mean_call = mean(call, na.rm = TRUE))
## # A tibble: 2 × 2
##   race  mean_call
##   <chr>     <dbl>
## 1 b        0.0645
## 2 w        0.0965

QUESTION 3-6 Now, I created a T-Test measuring the null hypothesis that there is not a difference in call back rates. This test showed that there is in fact a statistically significant difference.

t.test(call ~ race, data = BM_data)
## 
##  Welch Two Sample t-test
## 
## data:  call by race
## t = -4.1147, df = 4711.6, p-value = 3.943e-05
## alternative hypothesis: true difference in means between group b and group w is not equal to 0
## 95 percent confidence interval:
##  -0.04729503 -0.01677067
## sample estimates:
## mean in group b mean in group w 
##      0.06447639      0.09650924