#
# Statistics Project 2
#
# You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.

# Parameter of interest:  difference in callback rates in the population at large between black and white applicants
# Point estimate:         difference in callback rates in the sample of black and white applicants

# Answer the following questions in this notebook below and submit to your Github account. 
# 1.What test is appropriate for this problem? Does CLT apply?
# 2.What are the null and alternate hypotheses?
# 3.Compute margin of error, confidence interval, and p-value.
# 4.Discuss statistical significance.
#
# Resources: 
# http://www.stat.columbia.edu/~martin/W2024/R2.pdf
# https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html
#
#

library(stats)
library(foreign)

# Read the resume/callback data set (Stata format) from the project folder.
setwd("~/sliderule/statistics_project2/statistics project 2")
data <- read.dta("data/us_job_market_discrimination.dta")

# Pull the inference() helper provided by DataCamp into the workspace.
load(url("http://assets.datacamp.com/course/dasi/inference.Rdata"))

# Recode the callback column as a logical (TRUE/FALSE) vector.
data$call <- as.logical(data$call)

# One data frame per race code; which() discards NA comparisons so the
# selected rows match what subset() would return.
black <- data[which(data$race == "b"), , drop = FALSE]
white <- data[which(data$race == "w"), , drop = FALSE]

# 1.What test is appropriate for this problem? Does CLT apply?
# check condition: samples are independent (assume yes)
# check condition: 10% population (total population of black/white applicants in country is >10x of sample)

# group sizes
nrow(black)
## [1] 2435
nrow(white)
## [1] 2435
# same number of rows: 2435

# check condition: success and failure cases >10 each (yes)
# Group size alone does not establish this condition, so count the callbacks
# (TRUE) and non-callbacks (FALSE) within each group.
table(black$call)
table(white$call)
# Expected counts, implied by the group means (0.0645 and 0.0965) with
# n = 2435 each: black 157 TRUE / 2278 FALSE, white 235 TRUE / 2200 FALSE.
# Every cell is far above 10.

# 2. What are the null and alternate hypotheses?
# H0: callback rates on resumes are the same for black and white applicants
# HA: callback rates differ between black and white applicants
#
# Conclusion: There is a significant difference in callback rates, reject null
#
# TODO: Contrast t.test() with inference()

# Student (pooled-variance) two-sample t-test on the callback indicators.
t.test(
  black$call, white$call,
  var.equal = TRUE,
  mu = 0,
  alternative = "two.sided"
)
## 
##  Two Sample t-test
## 
## data:  black$call and white$call
## t = -4.1147, df = 4868, p-value = 3.941e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04729491 -0.01677080
## sample estimates:
##  mean of x  mean of y 
## 0.06447639 0.09650924

# Welch variant of the same test (no equal-variance assumption).
t.test(
  black$call, white$call,
  var.equal = FALSE,
  mu = 0,
  alternative = "two.sided"
)
## 
##  Welch Two Sample t-test
## 
## data:  black$call and white$call
## t = -4.1147, df = 4711.6, p-value = 3.943e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04729503 -0.01677067
## sample estimates:
##  mean of x  mean of y 
## 0.06447639 0.09650924
# 3. Compute margin of error, confidence interval, and p-value.
#
# inference() expects the response as `y` and the grouping (explanatory)
# variable as `x`. Do NOT pass black$call and white$call as y and x: that
# makes the white applicants' callback flag the grouping variable and pairs
# rows of the two subsets arbitrarily, so the resulting "difference" is not
# the gap in callback rates between races. Use the callback indicator as the
# response and race as the explanatory variable.
#
# Conclusion: At 95% CI, the callback rate for black applicants is lower than
# for white applicants by about 1.7 to 4.7 percentage points.
# TODO: Can we do multiple tests and use p-adjust to deal with FWER and FDR?

inference(
  y = data$call, x = factor(data$race),
  est = "proportion", type = "ci", conflevel = 0.95,
  boot_method = "perc", method = "theoretical", success = TRUE
)
# NOTE(review): re-run and paste the actual console output here. The values
# below are derived by hand from the group counts (157 of 2435 callbacks for
# "b", 235 of 2435 for "w"), presuming the difference is reported as b - w
# (factor level order) -- confirm against the real output:
#   observed difference = 0.0645 - 0.0965            ~ -0.0320
#   standard error      = sqrt(p_b(1 - p_b) / n_b +
#                              p_w(1 - p_w) / n_w)   ~  0.0078
#   margin of error     = 1.96 * SE                  ~  0.0153
#   95% confidence interval                          ~ (-0.0473, -0.0168)
# 4.Discuss statistical significance.
#
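# The alternative hypothesis is supported: callback rates differ between black
# and white applicants at the 5% significance level (95% confidence).
#
# Conclusion: The two-sample t-tests give p ~ 3.9e-05, far below 0.05, so the
# difference in callback rates is statistically significant.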
#
# TODO: Can we do multiple tests and use p-adjust to deal with FWER and FDR?

# Normal Q-Q plot (with reference line) of the raw callback indicator for each
# group. The indicator takes only two values, so the points fall on two
# horizontal bands rather than along the line -- hence the "weird results".
invisible(lapply(list(black$call, white$call), function(calls) {
  qqnorm(calls)
  qqline(calls)
}))

# Two-sided z-test for a difference between two proportions (normal
# approximation), H0: p_b - p_w = 0 vs HA: p_b - p_w != 0.
#
# inference() expects the response as `y` and the grouping (explanatory)
# variable as `x`. Do NOT pass black$call and white$call as y and x: that
# groups the black applicants' outcomes by the white applicants' callback
# flag and yields a meaningless test statistic. Use the callback indicator as
# the response and race as the explanatory variable.
inference(
  y = data$call, x = factor(data$race),
  est = "proportion", type = "ht", null = 0, alternative = "twosided",
  conflevel = 0.95, boot_method = "perc", method = "theoretical",
  success = TRUE
)
# NOTE(review): re-run and paste the actual console output here. The values
# below are derived by hand from the group counts (157 of 2435 callbacks for
# "b", 235 of 2435 for "w"), presuming the difference is reported as b - w
# (factor level order) -- confirm against the real output:
#   observed difference = 0.0645 - 0.0965                 ~ -0.0320
#   pooled proportion   = (157 + 235) / (2435 + 2435)     ~  0.0805
#   standard error      = sqrt(p(1 - p) * (1/n_b + 1/n_w)) ~  0.0078
#   test statistic      Z                                 ~ -4.11
#   two-sided p-value                                     ~  4e-05
# This agrees with the t-test results (t = -4.1147, p ~ 3.9e-05): reject H0.