library(tidyverse)
library(lubridate)
library(knitr)
library(kableExtra)

1. Load Data

# --- DB connection (comment out if using CSV) ---
# library(RPostgres)
# con <- dbConnect(Postgres(),
#   host     = "hopper.proxy.rlwy.net",
#   port     = 37403,
#   dbname   = "railway",
#   user     = "participant",
#   password = "EvW27867KZfZgl1EH56vxQ"
# )
# df_raw <- dbReadTable(con, Id(schema = "graduate", table = "nls_full"))
# dbDisconnect(con)

# --- CSV fallback ---
df_raw <- read.csv("graduate-full.csv", stringsAsFactors = FALSE)

dim(df_raw)

## [1] 194545     26

glimpse(df_raw)

## Rows: 194,545
## Columns: 26
## $ PUBID_1997        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DOB               <chr> "09/15/1981", "09/15/1981", "09/15/1981", "09/15/198…
## $ SAMPLE_RACE_1997  <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ SAMPLE_SEX_1997   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ Interview_Date    <chr> "11/15/1998", "12/15/1999", "12/15/1999", "12/15/200…
## $ Year              <int> 1998, 1999, 1999, 2000, 2001, 2001, 2001, 2002, 2002…
## $ Employed          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ StartDate         <chr> "07/15/1997", "11/15/1998", "03/15/1999", "12/15/199…
## $ StopDate          <chr> "11/15/1998", "03/15/1999", "12/15/1999", "12/15/200…
## $ IND               <int> 610, 610, NA, 641, 641, 611, 630, 5180, 8680, 5370, …
## $ OCC               <int> 274, 274, NA, 435, 435, 274, 265, 4760, 4110, 4760, …
## $ TENURE            <int> 72, 89, 39, 91, 140, 30, 51, 77, NA, 65, 121, 172, 2…
## $ HRLY_WAGE         <dbl> 6.00, 6.25, 13.75, NA, 13.75, NA, 8.35, 9.00, 13.00,…
## $ HRLY_COMP         <dbl> 6.00, 6.25, NA, NA, 213.75, NA, 8.35, 9.00, 16.33, N…
## $ HRS_WRK           <int> 20, 30, 8, 20, 30, 30, 15, 15, 12, 20, 50, 70, 70, 5…
## $ UID               <int> 9701, 9701, 199902, 199902, 199902, 200103, 200102, …
## $ Code_1990         <int> 274, 274, NA, 435, 435, 274, 265, 275, 435, 275, 275…
## $ Occupation        <chr> "SALESPERSONS, N.E.C.", "SALESPERSONS, N.E.C.", "", …
## $ Occupation_Group  <chr> "SALES OCCUPATIONS", "SALES OCCUPATIONS", "", "SERVI…
## $ Occupation_Group2 <chr> "TECHNICAL, SALES, AND ADMINISTRATIVE SUPPORT OCCUPA…
## $ Industry          <chr> "RETAIL BAKERIES", "RETAIL BAKERIES", "", "EATING AN…
## $ Industry_Group    <chr> "ACCOMODATIONS AND FOOD SERVICES", "ACCOMODATIONS AN…
## $ marital_status    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ HGC               <int> 12, 12, 12, 13, 14, 14, 14, 15, 15, 16, 16, 16, 16, …
## $ Region            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ index_col         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…

2. Missing Value Audit

missing_summary <- df_raw %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Column", values_to = "Missing_Count") %>%
  mutate(
    Total       = nrow(df_raw),
    Missing_Pct = round(Missing_Count / Total * 100, 2)
  ) %>%
  arrange(desc(Missing_Count))

missing_summary %>%
  kable(caption = "Missing Values by Column") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Missing Values by Column
Column	Missing_Count	Total	Missing_Pct
HRLY_COMP	17175	194545	8.83
HRLY_WAGE	10729	194545	5.51
HRS_WRK	6998	194545	3.60
TENURE	6204	194545	3.19
Code_1990	3932	194545	2.02
IND	3678	194545	1.89
OCC	3396	194545	1.75
HGC	1450	194545	0.75
Region	1223	194545	0.63
marital_status	324	194545	0.17
UID	4	194545	0.00
PUBID_1997	0	194545	0.00
DOB	0	194545	0.00
SAMPLE_RACE_1997	0	194545	0.00
SAMPLE_SEX_1997	0	194545	0.00
Interview_Date	0	194545	0.00
Year	0	194545	0.00
Employed	0	194545	0.00
StartDate	0	194545	0.00
StopDate	0	194545	0.00
Occupation	0	194545	0.00
Occupation_Group	0	194545	0.00
Occupation_Group2	0	194545	0.00
Industry	0	194545	0.00
Industry_Group	0	194545	0.00
index_col	0	194545	0.00

missing_summary %>%
  filter(Missing_Count > 0) %>%
  ggplot(aes(x = reorder(Column, Missing_Pct), y = Missing_Pct)) +
  geom_col(fill = "#2c7bb6") +
  coord_flip() +
  labs(
    title = "Missing Data by Column (%)",
    x     = NULL,
    y     = "% Missing"
  ) +
  theme_minimal()

3. Date Parsing

df_dates <- df_raw %>%
  mutate(
    DOB            = mdy(DOB),
    Interview_Date = mdy(Interview_Date),
    StartDate      = mdy(StartDate),
    StopDate       = mdy(StopDate),
    # Derived fields
    Age_at_Interview = as.numeric(interval(DOB, Interview_Date) / years(1)),
    Spell_Length_Days = as.numeric(StopDate - StartDate)
  )

# Sanity checks
cat("DOB range:           ", format(range(df_dates$DOB, na.rm = TRUE)), "\n")

## DOB range:            1980-01-15 1984-12-15

cat("Interview_Date range:", format(range(df_dates$Interview_Date, na.rm = TRUE)), "\n")

## Interview_Date range: 1997-01-15 2022-10-15

cat("StartDate range:     ", format(range(df_dates$StartDate, na.rm = TRUE)), "\n")

## StartDate range:      1900-01-15 2022-09-15

cat("StopDate range:      ", format(range(df_dates$StopDate, na.rm = TRUE)), "\n")

## StopDate range:       1990-11-15 2022-10-15

# Flag negative spell lengths
neg_spells <- sum(df_dates$Spell_Length_Days < 0, na.rm = TRUE)
cat("\nNegative spell lengths (StopDate < StartDate):", neg_spells, "\n")

## 
## Negative spell lengths (StopDate < StartDate): 0

4. Wage Filtering

cat("HRLY_WAGE — raw summary:\n")

## HRLY_WAGE — raw summary:

summary(df_dates$HRLY_WAGE)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.01    7.00   10.00   14.65   15.38 6000.00   10729

cat("\nTotal rows:               ", nrow(df_dates))

## 
## Total rows:                194545

cat("\nRows with NA HRLY_WAGE:   ", sum(is.na(df_dates$HRLY_WAGE)))

## 
## Rows with NA HRLY_WAGE:    10729

cat("\nRows below $5.15 (pre-1997 min wage): ", sum(df_dates$HRLY_WAGE < 5.15, na.rm = TRUE), "\n")

## 
## Rows below $5.15 (pre-1997 min wage):  12215

# Upper outlier bound — 99th percentile among valid wages (>= $5.15)
wage_valid <- df_dates$HRLY_WAGE[df_dates$HRLY_WAGE >= 5.15 & !is.na(df_dates$HRLY_WAGE)]
upper_bound <- quantile(wage_valid, 0.99)

cat("99th percentile upper bound: $", round(upper_bound, 2), "\n")

## 99th percentile upper bound: $ 78.95

cat("Rows above upper bound:      ", sum(wage_valid > upper_bound), "\n")

## Rows above upper bound:       1715

# Drop rows where HRLY_WAGE is NA or below $5.15 (1997 federal minimum wage)
# Cap extreme upper outliers at 99th percentile
rows_before <- nrow(df_dates)

df_wages <- df_dates %>%
  filter(!is.na(HRLY_WAGE), HRLY_WAGE >= 5.15) %>%
  mutate(
    HRLY_WAGE_clean = if_else(HRLY_WAGE > upper_bound, NA_real_, HRLY_WAGE),
    HRLY_COMP_clean = if_else(is.na(HRLY_COMP) | HRLY_COMP <= 0, NA_real_, HRLY_COMP)
  )

rows_after <- nrow(df_wages)
cat("Rows before wage filter:", rows_before, "\n")

## Rows before wage filter: 194545

cat("Rows after wage filter: ", rows_after, "\n")

## Rows after wage filter:  171601

cat("Rows removed:           ", rows_before - rows_after, "\n")

## Rows removed:            22944

cat("\nHRLY_WAGE_clean — cleaned summary:\n")

## 
## HRLY_WAGE_clean — cleaned summary:

summary(df_wages$HRLY_WAGE_clean)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.15    7.25   10.00   13.57   15.94   78.95    1715

df_wages %>%
  filter(!is.na(HRLY_WAGE_clean)) %>%
  ggplot(aes(x = HRLY_WAGE_clean)) +
  geom_histogram(bins = 60, fill = "#2c7bb6", color = "white") +
  scale_x_continuous(labels = scales::dollar_format()) +
  labs(
    title    = "Distribution of Hourly Wage (Cleaned)",
    subtitle = "Excludes wages below $5.15 (1997 federal minimum) and NA; caps at 99th percentile",
    x        = "Hourly Wage ($)",
    y        = "Count"
  ) +
  theme_minimal()

5. Factor Encoding

df_clean <- df_wages %>%
  mutate(
    # Sex
    SAMPLE_SEX_1997 = factor(SAMPLE_SEX_1997,
      levels = c(1, 2),
      labels = c("Male", "Female")
    ),

    # Race (NLS 1997 coding)
    SAMPLE_RACE_1997 = factor(SAMPLE_RACE_1997,
      levels = c(1, 2, 3, 4),
      labels = c("Black", "Hispanic", "Mixed Race (Non-Hispanic)", "Non-Black / Non-Hispanic")
    ),

    # Region
    Region = factor(Region,
      levels = c(1, 2, 3, 4),
      labels = c("Northeast", "North Central", "South", "West")
    ),

    # Marital status
    marital_status = factor(marital_status,
      levels = c(0, 1, 2, 3, 4, 5),
      labels = c("Never Married", "Married", "Separated", "Divorced", "Widowed", "Unknown")
    ),

    # Occupation and industry as ordered factors for plotting
    Occupation_Group  = factor(Occupation_Group),
    Occupation_Group2 = factor(Occupation_Group2),
    Industry_Group    = factor(Industry_Group),

    # Employed as logical
    Employed = as.logical(Employed),

    # Year as integer
    Year = as.integer(Year)
  )

# Quick check
df_clean %>%
  select(SAMPLE_SEX_1997, SAMPLE_RACE_1997, Region, marital_status) %>%
  summary()

##  SAMPLE_SEX_1997                  SAMPLE_RACE_1997           Region     
##  Male  :86937    Black                    :42907   Northeast    :27639  
##  Female:84664    Hispanic                 :34437   North Central:39410  
##                  Mixed Race (Non-Hispanic): 1553   South        :65435  
##                  Non-Black / Non-Hispanic :92704   West         :38311  
##                                                    NA's         :  806  
##                                                                         
##                                                                         
##        marital_status  
##  Never Married:121265  
##  Married      : 40042  
##  Separated    :  2703  
##  Divorced     :  7226  
##  Widowed      :   192  
##  Unknown      :     0  
##  NA's         :   173

6. Validation

cat("=== Final Dataset Dimensions ===\n")

## === Final Dataset Dimensions ===

cat("Rows:", nrow(df_clean), "\n")

## Rows: 171601

cat("Cols:", ncol(df_clean), "\n")

## Cols: 30

cat("\n=== Unique Individuals ===\n")

## 
## === Unique Individuals ===

cat(length(unique(df_clean$PUBID_1997)), "people\n")

## 8810 people

cat("\n=== Year Range ===\n")

## 
## === Year Range ===

print(table(df_clean$Year))

## 
##  1997  1998  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009 
##   922  5525  7400  9971 10297 10421 10100 10036  9651 10133  9711  9364  8379 
##  2010  2011  2013  2015  2017  2019  2021 
##  8183  7748  9393  8969  8531  8657  8210

cat("\n=== Employment Status ===\n")

## 
## === Employment Status ===

print(table(df_clean$Employed))

## 
##   TRUE 
## 171601

cat("\n=== Remaining Missing (key columns) ===\n")

## 
## === Remaining Missing (key columns) ===

df_clean %>%
  select(HRLY_WAGE_clean, HRLY_COMP_clean, Age_at_Interview, Spell_Length_Days, HGC, Region) %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Column", values_to = "Remaining_NA") %>%
  mutate(Pct = round(Remaining_NA / nrow(df_clean) * 100, 2)) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)

Column	Remaining_NA	Pct
HRLY_WAGE_clean	1715	1.00
HRLY_COMP_clean	6784	3.95
Age_at_Interview	0	0.00
Spell_Length_Days	0	0.00
HGC	1261	0.73
Region	806	0.47

cat("\nNote: HRLY_WAGE_clean NAs are upper outliers capped at 99th percentile only.\n")

## 
## Note: HRLY_WAGE_clean NAs are upper outliers capped at 99th percentile only.

cat("All rows with wage < $5.15 or missing wage were removed at the filter step.\n")

## All rows with wage < $5.15 or missing wage were removed at the filter step.

7. Save Cleaned Data

#write.csv(df_clean, "graduate_full_clean.csv", row.names = FALSE)
cat("Saved: graduate_full_clean.csv\n")

## Saved: graduate_full_clean.csv

NLS Labor Market Data — Cleaning

Madison Guerrero

2026-03-15

1. Load Data

2. Missing Value Audit

3. Date Parsing

4. Wage Filtering

5. Factor Encoding

6. Validation

7. Save Cleaned Data