library(oaxaca)
##
## Please cite as:
## Hlavac, Marek (2018). oaxaca: Blinder-Oaxaca Decomposition in R.
## R package version 0.1.4. https://CRAN.R-project.org/package=oaxaca
library(haven)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
a) Data Import
# Load the NLSY extract from its Stata file.
df.nlsy <- haven::read_dta(file = "C:\\Users\\t420\\Desktop\\NLSY.dta")

# Fifth column of the data, selected by position, and its natural log.
wage <- df.nlsy[, 5]
lwage <- log(wage)

# Gender as a two-level factor built from the `female` column.
female.f <- factor(
  df.nlsy[["female"]],
  labels = c("men", "women")
)
b) Create an overlapping density plot according to gender
# Log Wage by gender
# Add the gender factor as a column of df.nlsy. Inside mutate() a new column
# must be named with `=`; with `<-` the column is named after the deparsed
# expression and no `female.f` column is created.
df.nlsy %<>%
  mutate(female.f = factor(female, labels = c("men", "women")))
# Weighted density histogram of wages on a log10 x-axis, filled by gender.
ggplot(
  data = df.nlsy,
  mapping = aes(x = wage, y = ..density.., weight = wgt, fill = female.f)
) +
  geom_histogram(
    bins = 30,
    position = position_dodge(width = 0),
    alpha = 1
  ) +
  labs(x = "log wage", y = "density", title = "log(wage) by gender") +
  scale_x_log10() +
  theme_minimal()
# Weighted density histogram of years worked, filled by gender
# (semi-transparent bars).
ggplot(
  data = df.nlsy,
  mapping = aes(x = yrswrkd, y = ..density.., weight = wgt, fill = female.f)
) +
  geom_histogram(
    bins = 30,
    position = position_dodge(width = 0),
    alpha = 0.6
  ) +
  labs(
    x = "years of work",
    y = "density",
    title = "years of work by gender"
  ) +
  theme_minimal()
# Weighted density histogram of the part-time fraction, filled by gender.
ggplot(
  data = df.nlsy,
  mapping = aes(x = fracpart, y = ..density.., weight = wgt, fill = female.f)
) +
  geom_histogram(
    bins = 25,
    position = position_dodge(width = 0),
    alpha = 1
  ) +
  labs(
    x = "part time",
    y = "density",
    title = "part time work by gender"
  ) +
  theme_minimal()
# Weighted density histogram of years of education, filled by gender.
ggplot(
  data = df.nlsy,
  mapping = aes(x = yrseduc, y = ..density.., weight = wgt, fill = female.f)
) +
  geom_histogram(
    bins = 20,
    position = position_dodge(width = 0),
    alpha = 1
  ) +
  labs(
    x = "years of education",
    y = "density",
    title = "years of education by gender"
  ) +
  theme_minimal()
## Warning: Removed 66 rows containing non-finite values (stat_bin).
d) Create Density Plot Function in terms of race
# Race factor built from `nwhite`, both as a standalone vector and as a
# column of df.nlsy.
race <- factor(df.nlsy[["nwhite"]], labels = c("white", "nwhite"))
df.nlsy <- df.nlsy %>%
  mutate(race = factor(nwhite, labels = c("white", "nwhite")))
# Weighted density histogram of wages on a log10 x-axis, filled by race.
ggplot(
  data = df.nlsy,
  mapping = aes(x = wage, y = ..density.., weight = wgt, fill = race)
) +
  geom_histogram(
    bins = 30,
    position = position_dodge(width = 0),
    alpha = 1
  ) +
  labs(x = "logwage", y = "density", title = "log(wage) by race") +
  scale_x_log10() +
  theme_minimal()
# Weighted density histogram of years worked, filled by race.
# The title now names the grouping variable (race), matching the other
# by-race plots; it previously read "work experience by years".
ggplot(df.nlsy, aes(x = yrswrkd, y = ..density.., weight = wgt, fill = race)) +
  geom_histogram(bins = 30, position = position_dodge(width = 0), alpha = 1) +
  labs(
    x = "work experience",
    y = "density",
    title = "work experience by race"
  ) +
  theme_minimal()
# Weighted density histogram of the part-time fraction, filled by race.
ggplot(
  data = df.nlsy,
  mapping = aes(x = fracpart, y = ..density.., weight = wgt, fill = race)
) +
  geom_histogram(
    bins = 25,
    position = position_dodge(width = 0),
    alpha = 1
  ) +
  labs(
    x = "part time work experience",
    y = "density",
    title = "part time by race"
  ) +
  theme_minimal()
# Weighted density histogram of years of education, filled by race.
ggplot(
  data = df.nlsy,
  mapping = aes(x = yrseduc, y = ..density.., weight = wgt, fill = race)
) +
  geom_histogram(
    bins = 20,
    position = position_dodge(width = 0),
    alpha = 1
  ) +
  labs(
    x = "years of education",
    y = "density",
    title = "years of education according to race"
  ) +
  theme_minimal()
## Warning: Removed 66 rows containing non-finite values (stat_bin).
F) Calculate the average wages difference
# Weighted mean and weighted median of log wages for each value of `female`.
df.nlsy <- df.nlsy %>% mutate(lw = log(wage))
gender <- summarise(
  group_by(df.nlsy, female),
  mean.wage = wtd.mean(lw, weights = wgt),
  median.wage = wtd.quantile(lw, weights = wgt, probs = 0.5)
)
gender
## # A tibble: 2 x 3
## female mean.wage median.wage
## <dbl> <dbl> <dbl>
## 1 0. 2.86 2.84
## 2 1. 2.58 2.54
# We can see that the average log wage difference between male and female is 0.28.
# Weighted mean and weighted median of log wages for each level of `race`.
# Note: the summary table is stored under the name `race`.
df.nlsy <- df.nlsy %>% mutate(lw = log(wage))
race <- summarise(
  group_by(df.nlsy, race),
  mean.wage = wtd.mean(lw, weights = wgt),
  median.wage = wtd.quantile(lw, weights = wgt, probs = 0.5)
)
race
## # A tibble: 2 x 3
## race mean.wage median.wage
## <fct> <dbl> <dbl>
## 1 white 2.77 2.74
## 2 nwhite 2.54 2.49
# Above result shows that the log difference of wage between whites and non-whites is 0.23.
G) Standard deviation for Log Wages
# According to Gender
# Weighted standard deviation of log wages for each value of `female`.
# wtd.var() returns the weighted variance, so its square root is taken to
# obtain the standard deviation that this section reports.
# NOTE: any output recorded below from the earlier variance-based run is stale.
df.nlsy %<>% mutate(lw = log(wage))
gender <- df.nlsy %>%
  group_by(female) %>%
  summarise(standard.wage = sqrt(wtd.var(lw, weights = wgt)))
gender
## # A tibble: 2 x 2
## female standard.wage
## <dbl> <dbl>
## 1 0. 0.312
## 2 1. 0.280
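# Note that wtd.var() returns the weighted variance, so the values printed above (0.312 and 0.280) are variances of log wages; the corresponding standard deviations are their square roots, roughly 0.56 for men and 0.53 for women. Either way, we can conclude that the dispersion of log wages is higher for men than for women.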
# According to race
# Weighted standard deviation of log wages for each level of `race`.
# wtd.var() returns the weighted variance, so its square root is taken to
# obtain the standard deviation that this section reports.
# NOTE: any output recorded below from the earlier variance-based run is stale.
df.nlsy %<>% mutate(lw = log(wage))
race <- df.nlsy %>%
  group_by(race) %>%
  summarise(standard.wage = sqrt(wtd.var(lw, weights = wgt)))
race
## # A tibble: 2 x 2
## race standard.wage
## <fct> <dbl>
## 1 white 0.319
## 2 nwhite 0.261
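# Note that wtd.var() returns the weighted variance, so the values printed above (0.319 and 0.261) are variances of log wages; the corresponding standard deviations are their square roots, roughly 0.56 for whites and 0.51 for non-whites. Either way, we can conclude that the dispersion of log wages is higher for whites than for non-whites.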
H) Pooled Weighted OLS model
# Pooled weighted OLS of log wages on demographic, human-capital, job and
# sector covariates, using `wgt` as observation weights. The argument is
# spelled `weights` in full instead of relying on partial matching of
# `weight`.
regress <- lm(
  log(wage) ~ age + yrseduc + yrswrkd + fracpart + female + nwhite + union +
    famresp + s_agric + s_minman + s_hltedusoc + hazard,
  data = df.nlsy,
  weights = wgt
)
summary(regress)
##
## Call:
## lm(formula = log(wage) ~ age + yrseduc + yrswrkd + fracpart +
## female + nwhite + union + famresp + s_agric + s_minman +
## s_hltedusoc + hazard, data = df.nlsy, weights = wgt)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -1449.66 -155.26 -10.49 144.74 1651.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.763769 0.110280 15.994 < 2e-16 ***
## age -0.019860 0.002983 -6.659 3.04e-11 ***
## yrseduc 0.098669 0.002552 38.667 < 2e-16 ***
## yrswrkd 0.030420 0.001752 17.365 < 2e-16 ***
## fracpart -0.324998 0.043881 -7.406 1.50e-13 ***
## female -0.137215 0.014420 -9.516 < 2e-16 ***
## nwhite -0.107132 0.015457 -6.931 4.68e-12 ***
## union 0.073226 0.015198 4.818 1.49e-06 ***
## famresp -0.090586 0.016579 -5.464 4.87e-08 ***
## s_agric -0.223325 0.048717 -4.584 4.66e-06 ***
## s_minman 0.056648 0.015830 3.579 0.000348 ***
## s_hltedusoc -0.062747 0.016558 -3.790 0.000153 ***
## hazard 0.077705 0.028953 2.684 0.007300 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 284.2 on 5319 degrees of freedom
## (66 observations deleted due to missingness)
## Multiple R-squared: 0.3837, Adjusted R-squared: 0.3823
## F-statistic: 276 on 12 and 5319 DF, p-value: < 2.2e-16
I) R squared calculation
# Extract the multiple R-squared of the pooled model from its summary.
summary(regress)[["r.squared"]]
## [1] 0.3837271
# The R-squared value is 0.38. This means that about 38 percent of the variation in log wages is explained by the independent variables in the model. A low R-squared is not a big problem as long as the model's variables are highly significant, as they are in this model.
J) Interpretation of estimated coefficients for gender and race
# The regression estimates show that, holding the other covariates constant, the coefficient on female is -0.137: women earn about 0.14 log points (roughly 13 percent) less than men. The estimate is highly statistically significant (p < 2e-16). Similarly, the coefficient on nwhite is -0.107, so non-whites earn about 0.11 log points (roughly 10 percent) less than whites, which is also statistically significant.
K) Interpretation of estimated coefficients for other variables
# The coefficient on years of education is 0.099. That means that one additional year of education is associated with a wage increase of about 0.10 log points (roughly 10 percent), holding the other covariates constant. The relationship is statistically significant. Similarly, the coefficient on job market experience (yrswrkd) is 0.03. That means that one additional year of experience is associated with a wage increase of about 3 percent. This relationship is also statistically significant.
L) Oaxaca Blinder Approach to Decompose Wage According to Gender
# Group-specific weighted OLS models for the gender decomposition:
# OLmodelM is fitted on the female == 0 subset, OLmodelF on female == 1.
# `subset` and `weights` are passed by their full names instead of relying on
# positional matching and partial matching of `weight`.
OLmodelM <- lm(
  log(wage) ~ yrseduc + age + yrswrkd + fracpart + nwhite + s_agric +
    s_minman + hazard + s_hltedusoc,
  data = df.nlsy,
  subset = female == 0,
  weights = wgt
)
OLmodelF <- lm(
  log(wage) ~ yrseduc + age + yrswrkd + fracpart + nwhite + s_agric +
    s_minman + hazard + s_hltedusoc,
  data = df.nlsy,
  subset = female == 1,
  weights = wgt
)
# oaxacablinder() is defined in the sourced script, not in a loaded package.
source("oaxacablinder.R")
oaxacablinder(OLmodelM, OLmodelF)
##
## Blinder-Oaxaca decomposition
##
## Call:
## oaxacablinder(model.A = OLmodelM, model.B = OLmodelF)
##
## Group 1: female == 0 N of obs 1: 2665
## Group 2: female == 1 N of obs 2: 2667
##
## decomposition using weight = 1
##
## log(wage)
## Coef. Std.Err. z Pr>|z| 2.5 % 97.5 %
## difference 0.277 0 12143.504 0 0.277 0.277
## explained 0.185 0 7649.002 0 0.185 0.185
## yrseduc -0.012 0 -1053.466 0 -0.012 -0.012
## age 0.001 0 959.835 0 0.001 0.002
## yrswrkd 0.049 0 5673.932 0 0.049 0.049
## fracpart 0.109 0 6394.023 0 0.109 0.109
## nwhite 0.003 0 1006.460 0 0.003 0.003
## s_agric -0.004 0 -1833.224 0 -0.004 -0.004
## s_minman 0.004 0 904.886 0 0.004 0.004
## hazard 0.003 0 920.161 0 0.003 0.003
## s_hltedusoc 0.033 0 2847.236 0 0.033 0.033
## unexplained 0.092 0 3433.176 0 0.092 0.092
## (Intercept) -0.477 0 -1402.802 0 -0.478 -0.476
## yrseduc 0.245 0 2303.488 0 0.245 0.245
## age 0.662 0 1874.871 0 0.661 0.663
## yrswrkd -0.124 0 -1507.895 0 -0.124 -0.124
## fracpart -0.136 0 -5179.722 0 -0.136 -0.136
## nwhite -0.023 0 -2328.472 0 -0.023 -0.023
## s_agric -0.004 0 -1923.316 0 -0.004 -0.004
## s_minman -0.007 0 -970.974 0 -0.007 -0.006
## hazard -0.004 0 -2138.045 0 -0.004 -0.004
## s_hltedusoc -0.040 0 -2188.603 0 -0.040 -0.040
# The result shows that the log wage difference between men and women is 0.277 log points. Of this gap, 0.185 log points (about 67 percent of the gap) are explained by differences in the model's covariates, while 0.092 log points (about 33 percent) remain unexplained. Among the variables that explain the gender wage gap, the fraction of part-time work is dominant and alone contributes 0.109 log points. Of all the independent variables, age contributes the least to the explained gap (0.001 log points).
M) Oaxaca Blinder Approach to Decompose Wage According to Race
# Group-specific weighted OLS models for the race decomposition.
# Despite the names, OLmodelN is fitted on the nwhite == 0 subset and
# OLmodelW on the nwhite == 1 subset.
# `subset` and `weights` are passed by their full names instead of relying on
# positional matching and partial matching of `weight`.
OLmodelN <- lm(
  log(wage) ~ yrseduc + age + yrswrkd + fracpart + female + s_agric +
    s_minman + hazard + s_hltedusoc,
  data = df.nlsy,
  subset = nwhite == 0,
  weights = wgt
)
OLmodelW <- lm(
  log(wage) ~ yrseduc + age + yrswrkd + fracpart + female + s_agric +
    s_minman + hazard + s_hltedusoc,
  data = df.nlsy,
  subset = nwhite == 1,
  weights = wgt
)
# oaxacablinder() is defined in the sourced script, not in a loaded package.
source("oaxacablinder.R")
oaxacablinder(OLmodelN, OLmodelW)
##
## Blinder-Oaxaca decomposition
##
## Call:
## oaxacablinder(model.A = OLmodelN, model.B = OLmodelW)
##
## Group 1: nwhite == 0 N of obs 1: 2722
## Group 2: nwhite == 1 N of obs 2: 2610
##
## decomposition using weight = 1
##
## log(wage)
## Coef. Std.Err. z Pr>|z| 2.5 % 97.5 %
## difference 0.231 0 8415.583 0 0.231 0.231
## explained 0.134 0 6978.421 0 0.134 0.134
## yrseduc 0.076 0 5689.080 0 0.076 0.076
## age -0.002 0 -574.867 0 -0.002 -0.002
## yrswrkd 0.061 0 5901.844 0 0.061 0.061
## fracpart -0.010 0 -3197.757 0 -0.010 -0.010
## female 0.005 0 1026.735 0 0.005 0.005
## s_agric -0.001 0 -681.800 0 -0.001 -0.001
## s_minman 0.003 0 1652.676 0 0.003 0.003
## hazard 0.000 0 48.067 0 0.000 0.000
## s_hltedusoc 0.001 0 701.061 0 0.001 0.001
## unexplained 0.097 0 4125.125 0 0.097 0.097
## (Intercept) 0.102 0 249.785 0 0.101 0.103
## yrseduc 0.257 0 1986.357 0 0.257 0.258
## age -0.250 0 -608.488 0 -0.250 -0.249
## yrswrkd 0.020 0 237.216 0 0.020 0.020
## fracpart -0.004 0 -276.321 0 -0.004 -0.004
## female -0.047 0 -1917.206 0 -0.047 -0.047
## s_agric 0.000 0 -81.647 0 0.000 0.000
## s_minman 0.009 0 903.074 0 0.009 0.009
## hazard 0.003 0 490.444 0 0.003 0.003
## s_hltedusoc 0.007 0 551.262 0 0.007 0.007
# The table above shows the Oaxaca-Blinder decomposition of the wage difference by race. The wage gap between whites and non-whites is 0.231 log points. Of this gap, 0.134 log points (about 58 percent of the gap) are explained by differences in the model's covariates, while 0.097 log points (about 42 percent) are not explained. Of all the independent variables in the model, years of education contributes most to the explained gap, accounting for 0.076 log points on its own. By contrast, hazard exposure contributes essentially nothing (0.000) to explaining the wage gap between whites and non-whites.
P) Other Factors Affecting Wage Gap
# Based on the Oaxaca-Blinder decompositions, the fraction of part-time work experience is the largest explained contributor to the wage gap between men and women, while years of education is the largest explained contributor to the wage gap between whites and non-whites. Therefore, in order to achieve wage equality between whites and non-whites, the focus should be on education.