library(oaxaca)
## 
## Please cite as:
##  Hlavac, Marek (2018). oaxaca: Blinder-Oaxaca Decomposition in R.
##  R package version 0.1.4. https://CRAN.R-project.org/package=oaxaca
library(haven)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)

a) Data Import

# Read the NLSY Stata file and derive a few helper objects.
df.nlsy <- haven::read_dta("C:\\Users\\t420\\Desktop\\NLSY.dta")
# Fifth column of the data, kept as a one-column tibble
# (presumably the wage column -- TODO confirm), and its natural log.
wage <- df.nlsy[, 5]
lwage <- log(wage)
# Gender indicator as a factor labelled "men" / "women".
female.f <- factor(df.nlsy[["female"]], labels = c("men", "women"))

b) Create an overlapping density plot according to gender

# Log wage by gender
# Store gender as a labelled factor column of df.nlsy. Inside mutate() the
# new column must be given with `=`: with `<-` the column is named after the
# whole expression, no `female.f` column is created, and the plot silently
# falls back to a global object called `female.f`.
df.nlsy <- df.nlsy %>%
  mutate(female.f = factor(female, labels = c("men", "women")))
# Weighted density histogram of wage by gender; the x-axis is on a log10
# scale, which is why the axis is labelled "log wage".
ggplot(df.nlsy, aes(x = wage, y = ..density.., weight = wgt, fill = female.f)) +
  geom_histogram(bins = 30, position = position_dodge(width = 0), alpha = 1) +
  labs(x = "log wage", y = "density", title = "log(wage) by gender") +
  scale_x_log10() +
  theme_minimal()

# Weighted density histogram of years worked, filled by gender
ggplot(
  data = df.nlsy,
  mapping = aes(x = yrswrkd, y = ..density.., weight = wgt, fill = female.f)
) +
  geom_histogram(alpha = 0.6, bins = 30, position = position_dodge(width = 0)) +
  theme_minimal() +
  labs(title = "years of work by gender", x = "years of work", y = "density")

# Weighted density histogram of the part-time fraction, filled by gender
ggplot(
  data = df.nlsy,
  mapping = aes(x = fracpart, y = ..density.., weight = wgt, fill = female.f)
) +
  geom_histogram(alpha = 1, bins = 25, position = position_dodge(width = 0)) +
  theme_minimal() +
  labs(title = "part time work by gender", x = "part time", y = "density")

# Weighted density histogram of years of education, filled by gender
ggplot(
  data = df.nlsy,
  mapping = aes(x = yrseduc, y = ..density.., weight = wgt, fill = female.f)
) +
  geom_histogram(alpha = 1, bins = 20, position = position_dodge(width = 0)) +
  theme_minimal() +
  labs(
    title = "years of education by gender",
    x = "years of education",
    y = "density"
  )
## Warning: Removed 66 rows containing non-finite values (stat_bin).

C) Comment on differences according to gender

 # The figures above compare men and women in terms of wage, years of education, years of job market experience, and part-time work experience. Overall, men tend to earn more than women. Similarly, men have more education and more job market experience than women. In the case of part-time work, however, women have a higher share than men. This might be due to their responsibilities at home as mothers.

d) Create Density Plot Function in terms of race

# Race as a factor labelled "white" / "nwhite": once as a standalone object
# and once as a column of df.nlsy (the plots below use the column).
race <- factor(df.nlsy[["nwhite"]], labels = c("white", "nwhite"))
df.nlsy <- df.nlsy %>%
  mutate(race = factor(nwhite, labels = c("white", "nwhite")))
# Weighted density histogram of wage by race, x-axis on a log10 scale
ggplot(
  data = df.nlsy,
  mapping = aes(x = wage, y = ..density.., weight = wgt, fill = race)
) +
  geom_histogram(alpha = 1, bins = 30, position = position_dodge(width = 0)) +
  scale_x_log10() +
  theme_minimal() +
  labs(title = "log(wage) by race", x = "logwage", y = "density")

# Years of work by race: weighted density histogram of years worked, with
# one fill colour per race group. The title previously read
# "work experience by years", which did not describe the grouping shown.
ggplot(df.nlsy, aes(x = yrswrkd, y = ..density.., weight = wgt, fill = race)) +
  geom_histogram(bins = 30, position = position_dodge(width = 0), alpha = 1) +
  labs(x = "work experience", y = "density", title = "work experience by race") +
  theme_minimal()

# Weighted density histogram of the part-time fraction, filled by race
ggplot(
  data = df.nlsy,
  mapping = aes(x = fracpart, y = ..density.., weight = wgt, fill = race)
) +
  geom_histogram(alpha = 1, bins = 25, position = position_dodge(width = 0)) +
  theme_minimal() +
  labs(
    title = "part time by race",
    x = "part time work experience",
    y = "density"
  )

# Weighted density histogram of years of education, filled by race
ggplot(
  data = df.nlsy,
  mapping = aes(x = yrseduc, y = ..density.., weight = wgt, fill = race)
) +
  geom_histogram(alpha = 1, bins = 20, position = position_dodge(width = 0)) +
  theme_minimal() +
  labs(
    title = "years of education according to race",
    x = "years of education",
    y = "density"
  )
## Warning: Removed 66 rows containing non-finite values (stat_bin).

E) Comment on differences according to race

# The figures above show wage, years of job market experience, part-time work experience, and education according to race. On average, whites earn a higher wage than non-whites. Similarly, whites seem to have more years of work experience than non-whites. In terms of part-time work experience, whites seem to have somewhat more, but the difference is not large. Finally, non-whites seem to have less education than whites.

F) Calculate the average wages difference

# Weighted mean and weighted median of log wage, by the `female` indicator
df.nlsy <- df.nlsy %>%
  mutate(lw = log(wage))
gender <- df.nlsy %>%
  group_by(female) %>%
  summarise(
    mean.wage = wtd.mean(lw, weights = wgt),
    median.wage = wtd.quantile(lw, weights = wgt, probs = 0.5)
  )
gender
## # A tibble: 2 x 3
##   female mean.wage median.wage
##    <dbl>     <dbl>       <dbl>
## 1     0.      2.86        2.84
## 2     1.      2.58        2.54
# We can see that the average  log wage difference between male and female is 0.28. 
# Same summary by race: weighted mean and weighted median of log wage.
# Note that `race` is reassigned here to hold the summary table.
df.nlsy <- df.nlsy %>%
  mutate(lw = log(wage))
race <- df.nlsy %>%
  group_by(race) %>%
  summarise(
    mean.wage = wtd.mean(lw, weights = wgt),
    median.wage = wtd.quantile(lw, weights = wgt, probs = 0.5)
  )
race
## # A tibble: 2 x 3
##   race   mean.wage median.wage
##   <fct>      <dbl>       <dbl>
## 1 white       2.77        2.74
## 2 nwhite      2.54        2.49
# Above result shows that the log difference of wage between whites and non-whites is 0.23.

G) Standard deviation for Log Wages

# According to gender: weighted standard deviation of log wage
df.nlsy <- df.nlsy %>%
  mutate(lw = log(wage))
gender <- df.nlsy %>%
  group_by(female) %>%
  # wtd.var() returns the weighted variance; take the square root to obtain
  # the standard deviation that this section asks for.
  summarise(standard.wage = sqrt(wtd.var(lw, weights = wgt)))
gender
## NOTE(review): the output previously shown here (0.312 for female = 0 and
## 0.280 for female = 1) was the weighted variance. The corresponding
## standard deviations are approximately 0.56 and 0.53; re-run to confirm.
# We can conclude that the standard deviation of log wage is higher for men than for women.
# According to race: weighted standard deviation of log wage
df.nlsy <- df.nlsy %>%
  mutate(lw = log(wage))
race <- df.nlsy %>%
  group_by(race) %>%
  # wtd.var() returns the weighted variance; take the square root to obtain
  # the standard deviation that this section asks for.
  summarise(standard.wage = sqrt(wtd.var(lw, weights = wgt)))
race
## NOTE(review): the output previously shown here (0.319 for white and
## 0.261 for nwhite) was the weighted variance. The corresponding standard
## deviations are approximately 0.56 and 0.51; re-run to confirm.
# We can conclude that the standard deviation of log wage is higher for whites than for non-whites.

H) Pooled Weighted OLS model

# Pooled weighted OLS of log wage on demographics, human capital, job and
# sector covariates. The weights argument is spelled out in full (`weights`)
# instead of relying on partial matching of `weight`.
regress <- lm(
  log(wage) ~ age + yrseduc + yrswrkd + fracpart + female + nwhite + union +
    famresp + s_agric + s_minman + s_hltedusoc + hazard,
  data = df.nlsy,
  weights = wgt
)
summary(regress)
## 
## Call:
## lm(formula = log(wage) ~ age + yrseduc + yrswrkd + fracpart + 
##     female + nwhite + union + famresp + s_agric + s_minman + 
##     s_hltedusoc + hazard, data = df.nlsy, weights = wgt)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -1449.66  -155.26   -10.49   144.74  1651.44 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.763769   0.110280  15.994  < 2e-16 ***
## age         -0.019860   0.002983  -6.659 3.04e-11 ***
## yrseduc      0.098669   0.002552  38.667  < 2e-16 ***
## yrswrkd      0.030420   0.001752  17.365  < 2e-16 ***
## fracpart    -0.324998   0.043881  -7.406 1.50e-13 ***
## female      -0.137215   0.014420  -9.516  < 2e-16 ***
## nwhite      -0.107132   0.015457  -6.931 4.68e-12 ***
## union        0.073226   0.015198   4.818 1.49e-06 ***
## famresp     -0.090586   0.016579  -5.464 4.87e-08 ***
## s_agric     -0.223325   0.048717  -4.584 4.66e-06 ***
## s_minman     0.056648   0.015830   3.579 0.000348 ***
## s_hltedusoc -0.062747   0.016558  -3.790 0.000153 ***
## hazard       0.077705   0.028953   2.684 0.007300 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 284.2 on 5319 degrees of freedom
##   (66 observations deleted due to missingness)
## Multiple R-squared:  0.3837, Adjusted R-squared:  0.3823 
## F-statistic:   276 on 12 and 5319 DF,  p-value: < 2.2e-16

I) R squared calculation

# Extract the multiple R-squared from the fitted model's summary
summary(regress)[["r.squared"]]
## [1] 0.3837271
# The R-squared value is 0.38. This means that about 38 percent of the variation in log wage is explained by the independent variables in the model. A low R-squared value is not a big problem as long as the model variables are highly significant, as they are in this model.

J) Interpretation of estimated coefficients for gender and race

# The regression model shows that, holding the other covariates constant, the log wage of women is 0.137 lower than that of men. That means women earn roughly 13 percent less than comparable men. The coefficient is highly statistically significant (p < 0.001). Similarly, the model shows that the log wage of non-whites is 0.107 lower than that of whites (roughly 10 percent), which is also statistically significant.

K) Interpretation of estimated coefficients for other variables

# The coefficient on years of education is 0.099. That means one additional year of education is associated with a wage that is about 10 percent higher, holding the other variables constant. The relationship is statistically significant. Similarly, the coefficient on job market experience (years worked) is 0.030. That means one additional year of job market experience is associated with a wage that is about 3 percent higher. This relationship is also statistically significant.

L) Oaxaca Blinder Approach to Decompose Wage According to Gender

# Separate weighted log-wage regressions for the two gender groups
# (female == 0 and female == 1), then the Blinder-Oaxaca decomposition.
# `subset` and `weights` are named explicitly: the original passed the
# condition positionally and wrote `weight`, which only worked through
# R's argument-matching rules.
OLmodelM <- lm(
  log(wage) ~ yrseduc + age + yrswrkd + fracpart + nwhite + s_agric +
    s_minman + hazard + s_hltedusoc,
  data = df.nlsy,
  subset = female == 0,
  weights = wgt
)
OLmodelF <- lm(
  log(wage) ~ yrseduc + age + yrswrkd + fracpart + nwhite + s_agric +
    s_minman + hazard + s_hltedusoc,
  data = df.nlsy,
  subset = female == 1,
  weights = wgt
)
# oaxacablinder() is defined in this project-local script
source("oaxacablinder.R")
oaxacablinder(OLmodelM, OLmodelF)
## 
##  Blinder-Oaxaca decomposition
## 
##  Call:
## oaxacablinder(model.A = OLmodelM, model.B = OLmodelF)
## 
##  Group 1:  female == 0         N of obs 1:  2665
##  Group 2:  female == 1         N of obs 2:  2667 
## 
##  decomposition using weight =  1 
## 
## log(wage) 
##                Coef. Std.Err.         z Pr>|z|  2.5 %  97.5 %
## difference     0.277        0 12143.504      0  0.277   0.277
## explained      0.185        0  7649.002      0  0.185   0.185
##   yrseduc     -0.012        0 -1053.466      0 -0.012  -0.012
##   age          0.001        0   959.835      0  0.001   0.002
##   yrswrkd      0.049        0  5673.932      0  0.049   0.049
##   fracpart     0.109        0  6394.023      0  0.109   0.109
##   nwhite       0.003        0  1006.460      0  0.003   0.003
##   s_agric     -0.004        0 -1833.224      0 -0.004  -0.004
##   s_minman     0.004        0   904.886      0  0.004   0.004
##   hazard       0.003        0   920.161      0  0.003   0.003
##   s_hltedusoc  0.033        0  2847.236      0  0.033   0.033
## unexplained    0.092        0  3433.176      0  0.092   0.092
##   (Intercept) -0.477        0 -1402.802      0 -0.478  -0.476
##   yrseduc      0.245        0  2303.488      0  0.245   0.245
##   age          0.662        0  1874.871      0  0.661   0.663
##   yrswrkd     -0.124        0 -1507.895      0 -0.124  -0.124
##   fracpart    -0.136        0 -5179.722      0 -0.136  -0.136
##   nwhite      -0.023        0 -2328.472      0 -0.023  -0.023
##   s_agric     -0.004        0 -1923.316      0 -0.004  -0.004
##   s_minman    -0.007        0  -970.974      0 -0.007  -0.006
##   hazard      -0.004        0 -2138.045      0 -0.004  -0.004
##   s_hltedusoc -0.040        0 -2188.603      0 -0.040  -0.040
# The result shows that the log wage difference between men and women is 0.277. Of this total difference, 0.185 log points (about 67 percent of the gap) are explained by differences in the model's covariates, while 0.092 log points (about 33 percent) are unexplained. Among all the variables that explain the wage gap between men and women, the fraction of part-time work is dominant and alone contributes 0.109 log points (about 39 percent of the total gap). Of all the independent variables, age contributes least to explaining the wage gap between men and women.

M) Oaxaca Blinder Approach to Decompose Wage According to Race

# Separate weighted log-wage regressions for the two race groups, then the
# Blinder-Oaxaca decomposition.
# NOTE(review): the object names are easy to misread -- OLmodelN is fitted on
# nwhite == 0 and OLmodelW on nwhite == 1. The names are kept unchanged so
# the call matches the printed output.
# `subset` and `weights` are named explicitly: the original passed the
# condition positionally and wrote `weight`, which only worked through
# R's argument-matching rules.
OLmodelN <- lm(
  log(wage) ~ yrseduc + age + yrswrkd + fracpart + female + s_agric +
    s_minman + hazard + s_hltedusoc,
  data = df.nlsy,
  subset = nwhite == 0,
  weights = wgt
)
OLmodelW <- lm(
  log(wage) ~ yrseduc + age + yrswrkd + fracpart + female + s_agric +
    s_minman + hazard + s_hltedusoc,
  data = df.nlsy,
  subset = nwhite == 1,
  weights = wgt
)
# oaxacablinder() is defined in this project-local script
source("oaxacablinder.R")
oaxacablinder(OLmodelN, OLmodelW)
## 
##  Blinder-Oaxaca decomposition
## 
##  Call:
## oaxacablinder(model.A = OLmodelN, model.B = OLmodelW)
## 
##  Group 1:  nwhite == 0         N of obs 1:  2722
##  Group 2:  nwhite == 1         N of obs 2:  2610 
## 
##  decomposition using weight =  1 
## 
## log(wage) 
##                Coef. Std.Err.         z Pr>|z|  2.5 %  97.5 %
## difference     0.231        0  8415.583      0  0.231   0.231
## explained      0.134        0  6978.421      0  0.134   0.134
##   yrseduc      0.076        0  5689.080      0  0.076   0.076
##   age         -0.002        0  -574.867      0 -0.002  -0.002
##   yrswrkd      0.061        0  5901.844      0  0.061   0.061
##   fracpart    -0.010        0 -3197.757      0 -0.010  -0.010
##   female       0.005        0  1026.735      0  0.005   0.005
##   s_agric     -0.001        0  -681.800      0 -0.001  -0.001
##   s_minman     0.003        0  1652.676      0  0.003   0.003
##   hazard       0.000        0    48.067      0  0.000   0.000
##   s_hltedusoc  0.001        0   701.061      0  0.001   0.001
## unexplained    0.097        0  4125.125      0  0.097   0.097
##   (Intercept)  0.102        0   249.785      0  0.101   0.103
##   yrseduc      0.257        0  1986.357      0  0.257   0.258
##   age         -0.250        0  -608.488      0 -0.250  -0.249
##   yrswrkd      0.020        0   237.216      0  0.020   0.020
##   fracpart    -0.004        0  -276.321      0 -0.004  -0.004
##   female      -0.047        0 -1917.206      0 -0.047  -0.047
##   s_agric      0.000        0   -81.647      0  0.000   0.000
##   s_minman     0.009        0   903.074      0  0.009   0.009
##   hazard       0.003        0   490.444      0  0.003   0.003
##   s_hltedusoc  0.007        0   551.262      0  0.007   0.007
# The table above shows the Oaxaca-Blinder decomposition of the wage difference according to race. It shows that the log wage gap between whites and non-whites is 0.231. Of this total gap, 0.134 log points (about 58 percent) are explained by the model, while 0.097 log points (about 42 percent) are unexplained. Of all the independent variables in the model, years of education contributes most to the wage gap between whites and non-whites, accounting for 0.076 log points (about 33 percent of the gap) alone. By contrast, the result shows that hazard exposure contributes essentially nothing to explaining the wage gap between whites and non-whites.

n) Comment on Sabrina Schaffer’s explanation

# The paper basically explains the wage gap from two perspectives: gender and race. In doing so, the authors use years of education, age, part-time work experience, exposure to hazard, agriculture, etc. as independent variables. However, the paper doesn't consider important factors such as differences in bargaining behaviour by gender. So I think that if the paper had included such a factor, the unexplained wage gap between men and women would be lower, which is in agreement with Schaffer's explanation.

O) Comment on Elizabeth’s Explanation

#I don't agree with Elizabeth. If we revisit the Oaxaca-Blinder decompositions based on gender and race, we can conclude that years of education plays a major role in the wage gap between whites and non-whites. Similarly, the fraction of part-time work experience plays a major role in explaining the difference in wage between men and women. Considering these two things, we can assume that, due to the responsibility of women as mothers and caretakers of the home, they are more likely to work part time, which may pay less. Similarly, since non-whites have less education than whites, they tend to apply for jobs that require less specialized skills and pay less. So the wage difference is natural; it is not about race.

P) Other Factors Affecting Wage Gap

#Based on Oaxaca blinder decomposition, we find that fraction of part time job experience affects wage of male and female. In case of non-whites, years of education affects the wage gap. Therefore, in order to achieve equality between whites and non-whites in terms of wage, focus should be given to education.

Q) Goals of Executive Order Signed by Former President Obama

# The main goal of Obama's executive order was to increase the transparency of wage payments so that the wage gap can be addressed. Though Sabrina and Elizabeth differ in their explanations, they both accept that there is a wage gap between men and women. The difference basically arises because Sabrina seems to have engaged with the facts to some extent, while Elizabeth explains the gap mainly from an activist perspective.