quiz2_selhançil

Quarto

library(readxl)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(ggplot2)

# Read the wage/gender workbook into `df`.
# NOTE(review): the path is anchored at the user's home directory ("~"), so the
# document only renders on a machine with this exact folder layout -- consider
# a project-relative path.
df <- read_excel("~/Econ 465 R/data/Wage_GenderDS.xlsx")

# Show each column's type together with its first few values
glimpse(df)

Rows: 500
Columns: 6
$ Observation <dbl> 119, 2, 41, 65, 246, 254, 74, 12, 9, 237, 79, 294, 182, 25…
$ Wage        <dbl> 32, 34, 37, 38, 38, 38, 39, 40, 42, 43, 44, 45, 46, 46, 47…
$ Female      <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1…
$ Age         <dbl> 31, 42, 31, 33, 21, 28, 31, 28, 25, 25, 44, 25, 31, 42, 38…
$ Educ        <dbl> 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1…
$ Parttime    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1…

# Summary statistics (min, quartiles, mean, max) for every column
summary(df)

  Observation         Wage           Female           Age       
 Min.   :  1.0   Min.   : 32.0   Min.   :0.000   Min.   :20.00  
 1st Qu.:125.8   1st Qu.: 72.0   1st Qu.:0.000   1st Qu.:32.00  
 Median :250.5   Median :100.0   Median :0.000   Median :39.00  
 Mean   :250.5   Mean   :114.9   Mean   :0.368   Mean   :40.01  
 3rd Qu.:375.2   3rd Qu.:144.0   3rd Qu.:1.000   3rd Qu.:47.00  
 Max.   :500.0   Max.   :384.0   Max.   :1.000   Max.   :70.00  
      Educ          Parttime    
 Min.   :1.000   Min.   :0.000  
 1st Qu.:1.000   1st Qu.:0.000  
 Median :2.000   Median :0.000  
 Mean   :2.078   Mean   :0.348  
 3rd Qu.:3.000   3rd Qu.:1.000  
 Max.   :4.000   Max.   :1.000

# WAGE HISTOGRAM
# Distribution of the raw wage variable, drawn with 2-unit-wide bins.

ggplot(data = df, mapping = aes(x = Wage)) +
  geom_histogram(
    binwidth = 2,
    fill = "#2E86AB",
    colour = "white"
  ) +
  theme_minimal() +
  labs(
    title = "Histogram of Hourly Wage",
    x = "Wage ($/hr)",
    y = "Count"
  )

# BOXPLOT BY GENDER

# Build a labelled gender factor from the 0/1 Female dummy. The levels are
# given explicitly so that 0 always maps to "Men" and 1 to "Women", rather than
# depending on the sort order of whichever values happen to be present.
df$Gender <- factor(df$Female, levels = c(0, 1), labels = c("Men", "Women"))

# Side-by-side wage boxplots, one per gender. Fill colours are named so each
# colour stays tied to its gender regardless of level order; the legend is
# dropped because the x-axis already identifies the groups.
ggplot(df, aes(x = Gender, y = Wage, fill = Gender)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_fill_manual(values = c(Men = "#2E86AB", Women = "#E84855")) +
  labs(title = "Wage Distribution by Gender",
       x = "", y = "Wage ($/hr)") +
  theme_minimal() +
  theme(legend.position = "none")

# Summary statistics
# Wage by gender: centre (mean, median), spread (SD) and range (min, max),
# one row per gender.
stats <- summarise(
  group_by(df, Gender),
  Mean = mean(Wage),
  Median = median(Wage),
  SD = sd(Wage),
  Min = min(Wage),
  Max = max(Wage),
  .groups = "drop"
)
print(stats)

# A tibble: 2 × 6
  Gender  Mean Median    SD   Min   Max
  <fct>  <dbl>  <dbl> <dbl> <dbl> <dbl>
1 Men    125.   111    57.3    38   384
2 Women   97.3   83.5  46.3    32   364

# Raw wage gap: mean wage of men (Female == 0) minus mean wage of women
# (Female == 1), with no adjustment for any other characteristic.
mean_men <- with(df, mean(Wage[Female == 0]))
mean_women <- with(df, mean(Wage[Female == 1]))
raw_gap <- mean_men - mean_women
cat("Raw wage gap (mean men - mean women): $", round(raw_gap, digits = 2), "/hr\n")

Raw wage gap (mean men - mean women): $ 27.81 /hr

PART 2: LOG TRANSFORMATION

# Create the log-wage column, then plot its histogram

df$l_wage <- log(df$Wage)   # natural log
 
ggplot(df, aes(x = l_wage)) +
  geom_histogram(binwidth = 0.15, fill = "#3BB273", color = "white") +
  labs(title = "Histogram of log(Wage)",
       x = "log(Wage)", y = "Count") +
  theme_minimal()

# Boxplot of l_wage by Gender
# Same layout as the raw-wage boxplot, but on the log scale; the legend is
# hidden because the x-axis already names the two groups.
ggplot(data = df, mapping = aes(x = Gender, y = l_wage, fill = Gender)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_fill_manual(values = c("#2E86AB", "#E84855")) +
  labs(
    title = "log(Wage) Distribution by Gender",
    x = "",
    y = "log(Wage)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Approximate percentage gap: 100 x the difference in mean log wages
# (men minus women). The exact figure would be 100 * (exp(difference) - 1).
mean_l_men <- with(df, mean(l_wage[Female == 0]))
mean_l_women <- with(df, mean(l_wage[Female == 1]))
pct_gap <- (mean_l_men - mean_l_women) * 100

cat("Mean log(wage) – Men:  ", round(mean_l_men, digits = 4), "\n")

Mean log(wage) – Men:   4.7336

cat("Mean log(wage) – Women:", round(mean_l_women, 4), "\n")

Mean log(wage) – Women: 4.483

cat("Approximate % gap:     ", round(pct_gap, 2), "%\n")

Approximate % gap:      25.06 %

PART 3: EXPLORING CONFOUNDERS

# Education levels by gender: head-count of each Educ level and its share
# within that gender (rounded to three decimals).
educ_table <- df %>%
  group_by(Gender) %>%
  count(Educ) %>%
  mutate(Proportion = round(prop.table(n), digits = 3))
print(educ_table)

# A tibble: 8 × 4
# Groups:   Gender [2]
  Gender  Educ     n Proportion
  <fct>  <dbl> <int>      <dbl>
1 Men        1   108      0.342
2 Men        2    77      0.244
3 Men        3    72      0.228
4 Men        4    59      0.187
5 Women      1    88      0.478
6 Women      2    57      0.31 
7 Women      3    33      0.179
8 Women      4     6      0.033

# Most common education level by gender
# Keeps, for each gender, the Educ value with the largest count; levels tied
# for the top count would all be listed.

df %>%
  count(Gender, Educ) %>%
  group_by(Gender) %>%
  slice_max(order_by = n, n = 1) %>%
  print()

# A tibble: 2 × 3
# Groups:   Gender [2]
  Gender  Educ     n
  <fct>  <dbl> <int>
1 Men        1   108
2 Women      1    88

# Part-time work by gender
# Parttime is coded 0/1, so its mean within a gender is the share of that
# gender working part-time.
parttime_props <- summarise(
  group_by(df, Gender),
  Parttime_prop = mean(Parttime),
  .groups = "drop"
)
print(parttime_props)

# A tibble: 2 × 2
  Gender Parttime_prop
  <fct>          <dbl>
1 Men            0.225
2 Women          0.560

# Age distribution by gender
# Mean and median age for each gender, one row per gender.
age_stats <- summarise(
  group_by(df, Gender),
  Mean_Age = mean(Age),
  Median_Age = median(Age),
  .groups = "drop"
)
print(age_stats)

# A tibble: 2 × 3
  Gender Mean_Age Median_Age
  <fct>     <dbl>      <dbl>
1 Men        40.1         39
2 Women      39.9         39

Q1: Two reasons to use log(wage):

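Raw wages are right-skewed; the log transformation compresses the long right tail and yields a distribution closer to normal, which makes the normal-errors assumption behind classical OLS inference more plausible and reduces the influence of a few very high wages.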
Regression coefficients on log(wage) have a convenient interpretation as approximate percentage changes (semi-elasticity), making comparisons across individuals and datasets more meaningful.

Q2: Is raw wage gap = discrimination?

No. The raw gap conflates discrimination with compositional differences. If women have lower education on average, or are more likely to work part-time, some of the gap reflects those factors — not discrimination. In this sample, for example, 56% of women work part-time versus 22.5% of men (Part 3). A proper decomposition (e.g., Oaxaca-Blinder) is needed to isolate the “unexplained” portion that could reflect discrimination.