library(tidyverse)
library(lubridate)
library(knitr)
library(kableExtra)

1. Load Data

# --- DB connection (comment out if using CSV) ---
# library(RPostgres)
# con <- dbConnect(Postgres(),
#   host     = "hopper.proxy.rlwy.net",
#   port     = 37403,
#   dbname   = "railway",
#   user     = "participant",
#   password = "EvW27867KZfZgl1EH56vxQ"
# )
# df_raw <- dbReadTable(con, Id(schema = "graduate", table = "nls_full"))
# dbDisconnect(con)

# --- CSV fallback ---
df_raw <- read.csv("graduate-full.csv", stringsAsFactors = FALSE)

dim(df_raw)
## [1] 194545     26
glimpse(df_raw)
## Rows: 194,545
## Columns: 26
## $ PUBID_1997        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DOB               <chr> "09/15/1981", "09/15/1981", "09/15/1981", "09/15/198…
## $ SAMPLE_RACE_1997  <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ SAMPLE_SEX_1997   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ Interview_Date    <chr> "11/15/1998", "12/15/1999", "12/15/1999", "12/15/200…
## $ Year              <int> 1998, 1999, 1999, 2000, 2001, 2001, 2001, 2002, 2002…
## $ Employed          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ StartDate         <chr> "07/15/1997", "11/15/1998", "03/15/1999", "12/15/199…
## $ StopDate          <chr> "11/15/1998", "03/15/1999", "12/15/1999", "12/15/200…
## $ IND               <int> 610, 610, NA, 641, 641, 611, 630, 5180, 8680, 5370, …
## $ OCC               <int> 274, 274, NA, 435, 435, 274, 265, 4760, 4110, 4760, …
## $ TENURE            <int> 72, 89, 39, 91, 140, 30, 51, 77, NA, 65, 121, 172, 2…
## $ HRLY_WAGE         <dbl> 6.00, 6.25, 13.75, NA, 13.75, NA, 8.35, 9.00, 13.00,…
## $ HRLY_COMP         <dbl> 6.00, 6.25, NA, NA, 213.75, NA, 8.35, 9.00, 16.33, N…
## $ HRS_WRK           <int> 20, 30, 8, 20, 30, 30, 15, 15, 12, 20, 50, 70, 70, 5…
## $ UID               <int> 9701, 9701, 199902, 199902, 199902, 200103, 200102, …
## $ Code_1990         <int> 274, 274, NA, 435, 435, 274, 265, 275, 435, 275, 275…
## $ Occupation        <chr> "SALESPERSONS, N.E.C.", "SALESPERSONS, N.E.C.", "", …
## $ Occupation_Group  <chr> "SALES OCCUPATIONS", "SALES OCCUPATIONS", "", "SERVI…
## $ Occupation_Group2 <chr> "TECHNICAL, SALES, AND ADMINISTRATIVE SUPPORT OCCUPA…
## $ Industry          <chr> "RETAIL BAKERIES", "RETAIL BAKERIES", "", "EATING AN…
## $ Industry_Group    <chr> "ACCOMODATIONS AND FOOD SERVICES", "ACCOMODATIONS AN…
## $ marital_status    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ HGC               <int> 12, 12, 12, 13, 14, 14, 14, 15, 15, 16, 16, 16, 16, …
## $ Region            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ index_col         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…

2. Missing Value Audit

missing_summary <- df_raw %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Column", values_to = "Missing_Count") %>%
  mutate(
    Total       = nrow(df_raw),
    Missing_Pct = round(Missing_Count / Total * 100, 2)
  ) %>%
  arrange(desc(Missing_Count))

missing_summary %>%
  kable(caption = "Missing Values by Column") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Missing Values by Column
Column Missing_Count Total Missing_Pct
HRLY_COMP 17175 194545 8.83
HRLY_WAGE 10729 194545 5.51
HRS_WRK 6998 194545 3.60
TENURE 6204 194545 3.19
Code_1990 3932 194545 2.02
IND 3678 194545 1.89
OCC 3396 194545 1.75
HGC 1450 194545 0.75
Region 1223 194545 0.63
marital_status 324 194545 0.17
UID 4 194545 0.00
PUBID_1997 0 194545 0.00
DOB 0 194545 0.00
SAMPLE_RACE_1997 0 194545 0.00
SAMPLE_SEX_1997 0 194545 0.00
Interview_Date 0 194545 0.00
Year 0 194545 0.00
Employed 0 194545 0.00
StartDate 0 194545 0.00
StopDate 0 194545 0.00
Occupation 0 194545 0.00
Occupation_Group 0 194545 0.00
Occupation_Group2 0 194545 0.00
Industry 0 194545 0.00
Industry_Group 0 194545 0.00
index_col 0 194545 0.00
missing_summary %>%
  filter(Missing_Count > 0) %>%
  ggplot(aes(x = reorder(Column, Missing_Pct), y = Missing_Pct)) +
  geom_col(fill = "#2c7bb6") +
  coord_flip() +
  labs(
    title = "Missing Data by Column (%)",
    x     = NULL,
    y     = "% Missing"
  ) +
  theme_minimal()


3. Date Parsing

df_dates <- df_raw %>%
  mutate(
    DOB            = mdy(DOB),
    Interview_Date = mdy(Interview_Date),
    StartDate      = mdy(StartDate),
    StopDate       = mdy(StopDate),
    # Derived fields
    Age_at_Interview = as.numeric(interval(DOB, Interview_Date) / years(1)),
    Spell_Length_Days = as.numeric(StopDate - StartDate)
  )

# Sanity checks
cat("DOB range:           ", format(range(df_dates$DOB, na.rm = TRUE)), "\n")
## DOB range:            1980-01-15 1984-12-15
cat("Interview_Date range:", format(range(df_dates$Interview_Date, na.rm = TRUE)), "\n")
## Interview_Date range: 1997-01-15 2022-10-15
cat("StartDate range:     ", format(range(df_dates$StartDate, na.rm = TRUE)), "\n")
## StartDate range:      1900-01-15 2022-09-15
cat("StopDate range:      ", format(range(df_dates$StopDate, na.rm = TRUE)), "\n")
## StopDate range:       1990-11-15 2022-10-15
# Flag negative spell lengths
neg_spells <- sum(df_dates$Spell_Length_Days < 0, na.rm = TRUE)
cat("\nNegative spell lengths (StopDate < StartDate):", neg_spells, "\n")
## 
## Negative spell lengths (StopDate < StartDate): 0

4. Wage Filtering

cat("HRLY_WAGE — raw summary:\n")
## HRLY_WAGE — raw summary:
summary(df_dates$HRLY_WAGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.01    7.00   10.00   14.65   15.38 6000.00   10729
cat("\nTotal rows:               ", nrow(df_dates))
## 
## Total rows:                194545
cat("\nRows with NA HRLY_WAGE:   ", sum(is.na(df_dates$HRLY_WAGE)))
## 
## Rows with NA HRLY_WAGE:    10729
cat("\nRows below $5.15 (pre-1997 min wage): ", sum(df_dates$HRLY_WAGE < 5.15, na.rm = TRUE), "\n")
## 
## Rows below $5.15 (pre-1997 min wage):  12215
# Upper outlier bound — 99th percentile among valid wages (>= $5.15)
wage_valid <- df_dates$HRLY_WAGE[df_dates$HRLY_WAGE >= 5.15 & !is.na(df_dates$HRLY_WAGE)]
upper_bound <- quantile(wage_valid, 0.99)

cat("99th percentile upper bound: $", round(upper_bound, 2), "\n")
## 99th percentile upper bound: $ 78.95
cat("Rows above upper bound:      ", sum(wage_valid > upper_bound), "\n")
## Rows above upper bound:       1715
# Drop rows where HRLY_WAGE is NA or below $5.15 (1997 federal minimum wage)
# Cap extreme upper outliers at 99th percentile
rows_before <- nrow(df_dates)

df_wages <- df_dates %>%
  filter(!is.na(HRLY_WAGE), HRLY_WAGE >= 5.15) %>%
  mutate(
    HRLY_WAGE_clean = if_else(HRLY_WAGE > upper_bound, NA_real_, HRLY_WAGE),
    HRLY_COMP_clean = if_else(is.na(HRLY_COMP) | HRLY_COMP <= 0, NA_real_, HRLY_COMP)
  )

rows_after <- nrow(df_wages)
cat("Rows before wage filter:", rows_before, "\n")
## Rows before wage filter: 194545
cat("Rows after wage filter: ", rows_after, "\n")
## Rows after wage filter:  171601
cat("Rows removed:           ", rows_before - rows_after, "\n")
## Rows removed:            22944
cat("\nHRLY_WAGE_clean — cleaned summary:\n")
## 
## HRLY_WAGE_clean — cleaned summary:
summary(df_wages$HRLY_WAGE_clean)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.15    7.25   10.00   13.57   15.94   78.95    1715
df_wages %>%
  filter(!is.na(HRLY_WAGE_clean)) %>%
  ggplot(aes(x = HRLY_WAGE_clean)) +
  geom_histogram(bins = 60, fill = "#2c7bb6", color = "white") +
  scale_x_continuous(labels = scales::dollar_format()) +
  labs(
    title    = "Distribution of Hourly Wage (Cleaned)",
    subtitle = "Excludes wages below $5.15 (1997 federal minimum) and NA; caps at 99th percentile",
    x        = "Hourly Wage ($)",
    y        = "Count"
  ) +
  theme_minimal()


5. Factor Encoding

df_clean <- df_wages %>%
  mutate(
    # Sex
    SAMPLE_SEX_1997 = factor(SAMPLE_SEX_1997,
      levels = c(1, 2),
      labels = c("Male", "Female")
    ),

    # Race (NLS 1997 coding)
    SAMPLE_RACE_1997 = factor(SAMPLE_RACE_1997,
      levels = c(1, 2, 3, 4),
      labels = c("Black", "Hispanic", "Mixed Race (Non-Hispanic)", "Non-Black / Non-Hispanic")
    ),

    # Region
    Region = factor(Region,
      levels = c(1, 2, 3, 4),
      labels = c("Northeast", "North Central", "South", "West")
    ),

    # Marital status
    marital_status = factor(marital_status,
      levels = c(0, 1, 2, 3, 4, 5),
      labels = c("Never Married", "Married", "Separated", "Divorced", "Widowed", "Unknown")
    ),

    # Occupation and industry as ordered factors for plotting
    Occupation_Group  = factor(Occupation_Group),
    Occupation_Group2 = factor(Occupation_Group2),
    Industry_Group    = factor(Industry_Group),

    # Employed as logical
    Employed = as.logical(Employed),

    # Year as integer
    Year = as.integer(Year)
  )

# Quick check
df_clean %>%
  select(SAMPLE_SEX_1997, SAMPLE_RACE_1997, Region, marital_status) %>%
  summary()
##  SAMPLE_SEX_1997                  SAMPLE_RACE_1997           Region     
##  Male  :86937    Black                    :42907   Northeast    :27639  
##  Female:84664    Hispanic                 :34437   North Central:39410  
##                  Mixed Race (Non-Hispanic): 1553   South        :65435  
##                  Non-Black / Non-Hispanic :92704   West         :38311  
##                                                    NA's         :  806  
##                                                                         
##                                                                         
##        marital_status  
##  Never Married:121265  
##  Married      : 40042  
##  Separated    :  2703  
##  Divorced     :  7226  
##  Widowed      :   192  
##  Unknown      :     0  
##  NA's         :   173

6. Validation

cat("=== Final Dataset Dimensions ===\n")
## === Final Dataset Dimensions ===
cat("Rows:", nrow(df_clean), "\n")
## Rows: 171601
cat("Cols:", ncol(df_clean), "\n")
## Cols: 30
cat("\n=== Unique Individuals ===\n")
## 
## === Unique Individuals ===
cat(length(unique(df_clean$PUBID_1997)), "people\n")
## 8810 people
cat("\n=== Year Range ===\n")
## 
## === Year Range ===
print(table(df_clean$Year))
## 
##  1997  1998  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009 
##   922  5525  7400  9971 10297 10421 10100 10036  9651 10133  9711  9364  8379 
##  2010  2011  2013  2015  2017  2019  2021 
##  8183  7748  9393  8969  8531  8657  8210
cat("\n=== Employment Status ===\n")
## 
## === Employment Status ===
print(table(df_clean$Employed))
## 
##   TRUE 
## 171601
cat("\n=== Remaining Missing (key columns) ===\n")
## 
## === Remaining Missing (key columns) ===
df_clean %>%
  select(HRLY_WAGE_clean, HRLY_COMP_clean, Age_at_Interview, Spell_Length_Days, HGC, Region) %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Column", values_to = "Remaining_NA") %>%
  mutate(Pct = round(Remaining_NA / nrow(df_clean) * 100, 2)) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)
Column Remaining_NA Pct
HRLY_WAGE_clean 1715 1.00
HRLY_COMP_clean 6784 3.95
Age_at_Interview 0 0.00
Spell_Length_Days 0 0.00
HGC 1261 0.73
Region 806 0.47
cat("\nNote: HRLY_WAGE_clean NAs are upper outliers capped at 99th percentile only.\n")
## 
## Note: HRLY_WAGE_clean NAs are upper outliers capped at 99th percentile only.
cat("All rows with wage < $5.15 or missing wage were removed at the filter step.\n")
## All rows with wage < $5.15 or missing wage were removed at the filter step.

7. Save Cleaned Data

#write.csv(df_clean, "graduate_full_clean.csv", row.names = FALSE)
cat("Saved: graduate_full_clean.csv\n")
## Saved: graduate_full_clean.csv