# Setup: attach the packages used below and load the data.
# Each package is attached once; the rendered report repeated this chunk and
# ran several statements together on one line.
library(readxl)
library(dplyr)
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> filter, lag
#> The following objects are masked from 'package:base':
#> intersect, setdiff, setequal, union
library(ggplot2)

# Read the wage/gender workbook into `df`.
df <- read_excel("~/Econ 465 R/data/Wage_GenderDS.xlsx")
# Structural check: row/column counts, column types and leading values.
glimpse(df)
#> Rows: 500
#> Columns: 6
#> $ Observation <dbl> 119, 2, 41, 65, 246, 254, 74, 12, 9, 237, 79, 294, 182, 25…
#> $ Wage <dbl> 32, 34, 37, 38, 38, 38, 39, 40, 42, 43, 44, 45, 46, 46, 47…
#> $ Female <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1…
#> $ Age <dbl> 31, 42, 31, 33, 21, 28, 31, 28, 25, 25, 44, 25, 31, 42, 38…
#> $ Educ <dbl> 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1…
#> $ Parttime <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1…

# Min, quartiles, median, mean and max of every column.
summary(df)
#> Observation Wage Female Age
#> Min. : 1.0 Min. : 32.0 Min. :0.000 Min. :20.00
#> 1st Qu.:125.8 1st Qu.: 72.0 1st Qu.:0.000 1st Qu.:32.00
#> Median :250.5 Median :100.0 Median :0.000 Median :39.00
#> Mean :250.5 Mean :114.9 Mean :0.368 Mean :40.01
#> 3rd Qu.:375.2 3rd Qu.:144.0 3rd Qu.:1.000 3rd Qu.:47.00
#> Max. :500.0 Max. :384.0 Max. :1.000 Max. :70.00
#> Educ Parttime
#> Min. :1.000 Min. :0.000
#> 1st Qu.:1.000 1st Qu.:0.000
#> Median :2.000 Median :0.000
#> Mean :2.078 Mean :0.348
#> 3rd Qu.:3.000 3rd Qu.:1.000
#> Max. :4.000 Max. :1.000
# WAGE HISTOGRAM
# Distribution of Wage, counted in bins of width 2.
df %>%
  ggplot(mapping = aes(x = Wage)) +
  geom_histogram(binwidth = 2, fill = "#2E86AB", color = "white") +
  labs(
    title = "Histogram of Hourly Wage",
    x = "Wage ($/hr)",
    y = "Count"
  ) +
  theme_minimal()

# BOXPLOT BY GENDER
# Turn the 0/1 Female indicator into a labelled factor (0 = Men, 1 = Women).
# Levels are pinned explicitly so the labels cannot shift if one of the two
# values is absent, and any unexpected code fails loudly instead of being
# silently mislabelled.
stopifnot(all(df$Female %in% c(0, 1, NA)))
df$Gender <- factor(df$Female, levels = c(0, 1), labels = c("Men", "Women"))

# Wage by gender. Fill colours are keyed by level name, not by position, so
# they stay attached to the right group if the level order ever changes.
ggplot(df, aes(x = Gender, y = Wage, fill = Gender)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_fill_manual(values = c(Men = "#2E86AB", Women = "#E84855")) +
  labs(title = "Wage Distribution by Gender",
       x = "", y = "Wage ($/hr)") +
  theme_minimal() +
  theme(legend.position = "none")

# Summary Statistics
# Per-gender wage mean, median, standard deviation, minimum and maximum;
# one row per gender, returned ungrouped.
stats <- df %>%
  group_by(Gender) %>%
  summarise(
    across(
      Wage,
      list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
      .names = "{.fn}"
    ),
    .groups = "drop"
  )
print(stats)
#> # A tibble: 2 × 6
#> Gender Mean Median SD Min Max
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Men 125. 111 57.3 38 384
#> 2 Women 97.3 83.5 46.3 32 364
# Raw wage gap (mean men – mean women)
# Difference in mean Wage between Female == 0 and Female == 1 rows, with no
# adjustment for any other characteristic.
mean_men <- mean(df$Wage[df$Female == 0])
mean_women <- mean(df$Wage[df$Female == 1])
raw_gap <- mean_men - mean_women
cat("Raw wage gap (mean men - mean women): $", round(raw_gap, 2), "/hr\n")
#> Raw wage gap (mean men - mean women): $ 27.81 /hr
PART 2: LOG TRANSFORMATION
# Histogram of l_wage
# Store the natural log of Wage as a new column, then plot its distribution
# in bins of width 0.15.
df <- df %>%
  mutate(l_wage = log(Wage))

df %>%
  ggplot(mapping = aes(x = l_wage)) +
  geom_histogram(binwidth = 0.15, fill = "#3BB273", color = "white") +
  labs(
    title = "Histogram of log(Wage)",
    x = "log(Wage)",
    y = "Count"
  ) +
  theme_minimal()

# Boxplot of l_wage by Gender
# log(Wage) by gender. Fill colours are keyed by level name rather than by
# position so each group keeps its colour regardless of factor level order.
ggplot(df, aes(x = Gender, y = l_wage, fill = Gender)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_fill_manual(values = c(Men = "#2E86AB", Women = "#E84855")) +
  labs(title = "log(Wage) Distribution by Gender",
       x = "", y = "log(Wage)") +
  theme_minimal() +
  theme(legend.position = "none")

# Approximate percentage gap
# Mean log wage for each group (Female == 0 is men, Female == 1 is women).
mean_l_men <- mean(df$l_wage[df$Female == 0])
mean_l_women <- mean(df$l_wage[df$Female == 1])

# 100 x the difference in mean logs approximates the percentage gap. With a
# difference of about 0.25 the approximation runs low: the ratio of geometric
# means, 100 * (exp(difference) - 1), is roughly 28.5% here.
pct_gap <- 100 * (mean_l_men - mean_l_women)

cat("Mean log(wage) – Men: ", round(mean_l_men, 4), "\n")
#> Mean log(wage) – Men: 4.7336
cat("Mean log(wage) – Women:", round(mean_l_women, 4), "\n")
#> Mean log(wage) – Women: 4.483
cat("Approximate % gap: ", round(pct_gap, 2), "%\n")
#> Approximate % gap: 25.06 %
PART 3: EXPLORING CONFOUNDERS
# Education levels by gender
# Count of each Educ level within gender, plus its share of that gender's
# total (rounded to 3 decimals). The result stays grouped by Gender.
educ_table <- df %>%
  group_by(Gender) %>%
  count(Educ) %>%
  mutate(Proportion = round(n / sum(n), 3))
print(educ_table)
#> # A tibble: 8 × 4
#> # Groups: Gender [2]
#> Gender Educ n Proportion
#> <fct> <dbl> <int> <dbl>
#> 1 Men 1 108 0.342
#> 2 Men 2 77 0.244
#> 3 Men 3 72 0.228
#> 4 Men 4 59 0.187
#> 5 Women 1 88 0.478
#> 6 Women 2 57 0.31
#> 7 Women 3 33 0.179
#> 8 Women 4 6 0.033
# Most common education level by gender
# Within each gender, keep the Educ level with the largest count.
df %>%
  count(Gender, Educ) %>%
  group_by(Gender) %>%
  slice_max(order_by = n, n = 1) %>%
  print()
#> # A tibble: 2 × 3
#> # Groups: Gender [2]
#> Gender Educ n
#> <fct> <dbl> <int>
#> 1 Men 1 108
#> 2 Women 1 88
# Part-time work by gender
# Parttime takes the values 0 and 1, so its group mean is the share of
# part-time workers in each gender.
parttime_props <- summarise(
  group_by(df, Gender),
  Parttime_prop = mean(Parttime),
  .groups = "drop"
)
print(parttime_props)
#> # A tibble: 2 × 2
#> Gender Parttime_prop
#> <fct> <dbl>
#> 1 Men 0.225
#> 2 Women 0.560
# Age distribution by gender
# Mean and median Age for each gender, returned ungrouped.
age_stats <- df %>%
  group_by(Gender) %>%
  summarise(
    across(Age, list(Mean = mean, Median = median), .names = "{.fn}_Age"),
    .groups = "drop"
  )
print(age_stats)
#> # A tibble: 2 × 3
#> Gender Mean_Age Median_Age
#> <fct> <dbl> <dbl>
#> 1 Men 40.1 39
#> 2 Women 39.9 39
Q1: Two reasons to use log(wage):
1. Raw wages are right-skewed; the log transformation yields a distribution closer to normal, which makes the normality assumption used for classical OLS inference more plausible.
2. Regression coefficients on log(wage) have a convenient interpretation as approximate percentage changes (semi-elasticities), making comparisons across individuals and datasets more meaningful.
Q2: Is raw wage gap = discrimination?
No. The raw gap conflates discrimination with compositional differences. If women have lower education on average, or are more likely to work part-time, some of the gap reflects those factors — not discrimination. Both patterns appear in this sample: 56% of women work part-time versus 22.5% of men, and only 3.3% of women are at education level 4 versus 18.7% of men. A proper decomposition (e.g., Oaxaca-Blinder) is needed to isolate the “unexplained” portion that could reflect discrimination.