# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the data
# Reads the wage-gap data set from a CSV file into a base data.frame.
# NOTE(review): the path is an absolute, machine-specific Windows path, so the
# script only runs where that exact file exists — consider a project-relative
# path.
wagegap <- read.csv("c:/users/dell/downloads/wagegap.csv")
# Explore the data
# summary() prints per-column summaries. The recorded output below shows
# numeric summaries for wage, education and experience, and character columns
# for gender and sector (1517 values each).
summary(wagegap)
## wage gender sector education
## Min. : 5.09 Length:1517 Length:1517 Min. : 9.00
## 1st Qu.:23.42 Class :character Class :character 1st Qu.:13.00
## Median :32.22 Mode :character Mode :character Median :14.00
## Mean :34.65 Mean :13.83
## 3rd Qu.:43.29 3rd Qu.:15.00
## Max. :95.63 Max. :19.00
## experience
## Min. : 7.00
## 1st Qu.:13.00
## Median :15.00
## Mean :14.89
## 3rd Qu.:16.00
## Max. :22.00
# str() lists each column's storage type together with its first few values.
str(wagegap)
## 'data.frame': 1517 obs. of 5 variables:
## $ wage : num 36.2 28.3 30.6 29.7 74.8 ...
## $ gender : chr "male" "male" "male" "female" ...
## $ sector : chr "public" "private" "private" "public" ...
## $ education : int 12 14 15 12 12 13 16 15 15 18 ...
## $ experience: int 14 16 15 14 10 14 17 14 17 15 ...
# Data preprocessing
# Derived columns: natural log of the wage and the square of experience.
wagegap[["log_wage"]] <- log(wagegap[["wage"]])
wagegap[["experience_sq"]] <- wagegap[["experience"]]^2
# Model selection
# Candidate 1: full gender x sector x education interaction (all main effects,
# two-way and three-way terms) plus a quadratic in experience, using the
# precomputed log_wage and experience_sq columns.
model_1 <- lm(
  formula = log_wage ~ gender * sector * education + experience + experience_sq,
  data = wagegap
)
# Candidate 2: main effects, a quadratic in experience, and every pairwise
# interaction among gender, sector, education and experience. The response and
# the squared term are computed inside the formula.
model_2 <- lm(
  formula = log(wage) ~ gender + sector + education +
    experience + I(experience^2) +
    gender:sector + gender:education +
    gender:experience + sector:education +
    sector:experience + education:experience,
  data = wagegap
)
# Calculate AIC and BIC for each model
# One column per candidate fit, one row per criterion.
info_criteria <- vapply(
  list(model_1 = model_1, model_2 = model_2),
  function(fit) c(aic = AIC(fit), bic = BIC(fit)),
  numeric(2)
)
aic_1 <- info_criteria["aic", "model_1"]
bic_1 <- info_criteria["bic", "model_1"]
aic_2 <- info_criteria["aic", "model_2"]
bic_2 <- info_criteria["bic", "model_2"]
# Print AIC and BIC for each model
# NOTE(review): in the recorded output model_1 has the lower (better) AIC and
# BIC, yet model_2 is the fit analysed afterwards — confirm this is intended.
cat("AIC for model_1:", aic_1, "\n")
## AIC for model_1: -2671.203
cat("BIC for model_1:", bic_1, "\n")
## BIC for model_1: -2612.634
cat("AIC for model_2:", aic_2, "\n")
## AIC for model_2: -2668.017
cat("BIC for model_2:", bic_2, "\n")
## BIC for model_2: -2598.799
# Summary of the model
# Prints the call, residual quantiles, coefficient table and fit statistics of
# model_2. The only dummy columns are gendermale and sectorpublic, so the other
# level of each variable is the baseline (presumably "female" and "private" —
# confirm each variable has exactly two levels).
# Because gender and sector also interact with education and experience, the
# gendermale and sectorpublic rows are effects at education = 0 and
# experience = 0, not average effects.
summary(model_2)
##
## Call:
## lm(formula = log(wage) ~ gender + sector + education + experience +
## I(experience^2) + gender:sector + gender:education + gender:experience +
## sector:education + sector:experience + education:experience,
## data = wagegap)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34751 -0.06694 0.00199 0.06694 0.33845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.2551800 0.1643209 25.896 < 2e-16 ***
## gendermale 0.1107813 0.0561553 1.973 0.04870 *
## sectorpublic -0.0971310 0.0552612 -1.758 0.07901 .
## education 0.0297877 0.0111623 2.669 0.00770 **
## experience 0.0311892 0.0135416 2.303 0.02140 *
## I(experience^2) -0.0080345 0.0003453 -23.270 < 2e-16 ***
## gendermale:sectorpublic -0.0297462 0.0108879 -2.732 0.00637 **
## gendermale:education -0.0033575 0.0032604 -1.030 0.30329
## gendermale:experience -0.0017409 0.0023707 -0.734 0.46286
## sectorpublic:education -0.0009048 0.0032060 -0.282 0.77780
## sectorpublic:experience 0.0017712 0.0022765 0.778 0.43668
## education:experience 0.0008114 0.0007141 1.136 0.25605
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09997 on 1505 degrees of freedom
## Multiple R-squared: 0.9543, Adjusted R-squared: 0.954
## F-statistic: 2857 on 11 and 1505 DF, p-value: < 2.2e-16
# Interpretation of coefficients
# coef() returns the named numeric vector of model_2's 12 estimates, in the
# same order as the coefficient table above.
# NOTE: the variable name `coefficients` is also the name of a stats function.
coefficients <- coef(model_2)
print(coefficients)
## (Intercept) gendermale sectorpublic
## 4.2551800491 0.1107813240 -0.0971310178
## education experience I(experience^2)
## 0.0297876864 0.0311892014 -0.0080344722
## gendermale:sectorpublic gendermale:education gendermale:experience
## -0.0297462121 -0.0033574588 -0.0017408898
## sectorpublic:education sectorpublic:experience education:experience
## -0.0009048458 0.0017711608 0.0008113977
# Estimated regression equation
# Every estimate is paired with its own term name taken from the coefficient
# vector, so the labels always match the terms actually in the model (model_2
# has 12 coefficients) and no non-existent positions are printed.
slopes <- coefficients[-1]
slope_terms <- paste0(
  ifelse(slopes < 0, "- ", "+ "), # fold the sign into the joining operator
  abs(slopes), " * ", names(slopes)
)
equation <- paste(
  "log(wage) =", coefficients[1],
  paste(slope_terms, collapse = " ")
)
cat("Estimated Regression Equation:\n")
## Estimated Regression Equation:
cat(equation, "\n\n")
## log(wage) = 4.25518004914403 + 0.110781323980847 * gendermale - 0.0971310177783612 * sectorpublic + 0.0297876864234807 * education + 0.0311892014486484 * experience - 0.00803447221897204 * I(experience^2) - 0.029746212117146 * gendermale:sectorpublic - 0.00335745877732158 * gendermale:education - 0.00174088978505732 * gendermale:experience - 0.000904845772501819 * sectorpublic:education + 0.00177116078770278 * sectorpublic:experience + 0.000811397700512628 * education:experience
# Predicted log(wage) for a male worker in the private sector, evaluated at the
# sample means of education and experience (a single point prediction).
# model_2 computes log(wage) and I(experience^2) inside its formula, so it only
# reads gender, sector, education and experience from the new data.
male_private_worker <- data.frame(
  gender = "male",
  sector = "private",
  education = mean(wagegap$education),
  experience = mean(wagegap$experience),
  # Square of this worker's own experience (not the sample mean of squared
  # experience), so the row describes one consistent individual.
  experience_sq = mean(wagegap$experience)^2
)
predicted_log_wage <- predict(model_2, newdata = male_private_worker)
cat("Estimated log(wage) for a male worker in the private sector with average education and experience:", predicted_log_wage, "\n\n")
## Estimated log(wage) for a male worker in the private sector with average education and experience: 3.556109
# Additional analysis to check for significance of interaction terms
# anova() on a single lm fit tests the terms sequentially, in formula order, so
# each row is adjusted only for the rows above it. This is why only the last
# row (education:experience, p = 0.256) reproduces the p-value from the
# summary() coefficient table, while e.g. gender:sector differs (0.0044 here
# vs 0.0064 there).
anova(model_2, test = "F")
## Analysis of Variance Table
##
## Response: log(wage)
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 0.159 0.159 15.9506 6.816e-05 ***
## sector 1 3.073 3.073 307.5482 < 2.2e-16 ***
## education 1 3.637 3.637 363.9749 < 2.2e-16 ***
## experience 1 301.611 301.611 30180.6999 < 2.2e-16 ***
## I(experience^2) 1 5.462 5.462 546.5945 < 2.2e-16 ***
## gender:sector 1 0.081 0.081 8.1174 0.004444 **
## gender:education 1 0.011 0.011 1.1362 0.286632
## gender:experience 1 0.007 0.007 0.6760 0.411096
## sector:education 1 0.001 0.001 0.0546 0.815336
## sector:experience 1 0.008 0.008 0.8392 0.359762
## education:experience 1 0.013 0.013 1.2910 0.256051
## Residuals 1505 15.040 0.010
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Checking model assumptions
# plot() on the fitted lm object draws its residual diagnostic plots; no
# console output is recorded for this call.
plot(model_2)
Analysis:
Upon examining the model fit and assessing the significance of interaction terms, we can make inferences regarding gender-based wage disparities in both private and public sectors:
Importance of Interaction Terms: The statistical significance (p < 0.05) of the interaction term “gender:sector” indicates that the influence of gender on wages is not uniform across private and public sectors.
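Interpretation: The significant “gender:sector” interaction term shows that the gender wage gap is not the same in the two sectors. Whether a gap exists at all is indicated by the “gendermale” terms (the main effect together with its interactions), not by this interaction alone; the positive “gendermale” main effect (0.111, p = 0.049) points to a male wage premium in the baseline (private) sector.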
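More specifically, the coefficient on the “gender:sector” interaction term (“gendermale:sectorpublic” = −0.030, p = 0.006) is the amount by which the male–female difference in log(wage) in the public sector differs from that in the private sector, holding education and experience fixed: the male wage premium is about 0.03 log points (roughly 3%) smaller in the public sector than in the private sector.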
Conclusion: Our analysis points to the existence of a gender wage gap in both the private and public sectors. However, the substantial finding lies in the fact that the magnitude of this wage gap differs significantly between these two sectors, as evidenced by the noteworthy “gender:sector” interaction term.
To gain deeper insights into the nature and extent of the gender wage gap within each sector, further exploration, such as examining the coefficient linked to the “gender:sector” interaction term, is warranted.
In summary, our conclusion is that a gender wage gap is present in both private and public sectors, with discernible variations in its magnitude between the sectors. Additional analyses may be necessary to unravel the underlying factors contributing to these observed differences.