library(tidyverse)
library(lubridate)
library(knitr)
library(kableExtra)
1. Load Data
# --- DB connection (comment out if using CSV) ---
# library(RPostgres)
# con <- dbConnect(Postgres(),
# host = "hopper.proxy.rlwy.net",
# port = 37403,
# dbname = "railway",
# user = "participant",
# password = "EvW27867KZfZgl1EH56vxQ"
# )
# df_raw <- dbReadTable(con, Id(schema = "graduate", table = "nls_full"))
# dbDisconnect(con)
# --- CSV fallback ---
df_raw <- read.csv("graduate-full.csv", stringsAsFactors = FALSE)
dim(df_raw)
## [1] 194545 26
glimpse(df_raw)
## Rows: 194,545
## Columns: 26
## $ PUBID_1997 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DOB <chr> "09/15/1981", "09/15/1981", "09/15/1981", "09/15/198…
## $ SAMPLE_RACE_1997 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ SAMPLE_SEX_1997 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ Interview_Date <chr> "11/15/1998", "12/15/1999", "12/15/1999", "12/15/200…
## $ Year <int> 1998, 1999, 1999, 2000, 2001, 2001, 2001, 2002, 2002…
## $ Employed <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ StartDate <chr> "07/15/1997", "11/15/1998", "03/15/1999", "12/15/199…
## $ StopDate <chr> "11/15/1998", "03/15/1999", "12/15/1999", "12/15/200…
## $ IND <int> 610, 610, NA, 641, 641, 611, 630, 5180, 8680, 5370, …
## $ OCC <int> 274, 274, NA, 435, 435, 274, 265, 4760, 4110, 4760, …
## $ TENURE <int> 72, 89, 39, 91, 140, 30, 51, 77, NA, 65, 121, 172, 2…
## $ HRLY_WAGE <dbl> 6.00, 6.25, 13.75, NA, 13.75, NA, 8.35, 9.00, 13.00,…
## $ HRLY_COMP <dbl> 6.00, 6.25, NA, NA, 213.75, NA, 8.35, 9.00, 16.33, N…
## $ HRS_WRK <int> 20, 30, 8, 20, 30, 30, 15, 15, 12, 20, 50, 70, 70, 5…
## $ UID <int> 9701, 9701, 199902, 199902, 199902, 200103, 200102, …
## $ Code_1990 <int> 274, 274, NA, 435, 435, 274, 265, 275, 435, 275, 275…
## $ Occupation <chr> "SALESPERSONS, N.E.C.", "SALESPERSONS, N.E.C.", "", …
## $ Occupation_Group <chr> "SALES OCCUPATIONS", "SALES OCCUPATIONS", "", "SERVI…
## $ Occupation_Group2 <chr> "TECHNICAL, SALES, AND ADMINISTRATIVE SUPPORT OCCUPA…
## $ Industry <chr> "RETAIL BAKERIES", "RETAIL BAKERIES", "", "EATING AN…
## $ Industry_Group <chr> "ACCOMODATIONS AND FOOD SERVICES", "ACCOMODATIONS AN…
## $ marital_status <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ HGC <int> 12, 12, 12, 13, 14, 14, 14, 15, 15, 16, 16, 16, 16, …
## $ Region <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ index_col <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
2. Missing Value Audit
missing_summary <- df_raw %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Column", values_to = "Missing_Count") %>%
mutate(
Total = nrow(df_raw),
Missing_Pct = round(Missing_Count / Total * 100, 2)
) %>%
arrange(desc(Missing_Count))
missing_summary %>%
kable(caption = "Missing Values by Column") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Missing Values by Column
|
Column
|
Missing_Count
|
Total
|
Missing_Pct
|
|
HRLY_COMP
|
17175
|
194545
|
8.83
|
|
HRLY_WAGE
|
10729
|
194545
|
5.51
|
|
HRS_WRK
|
6998
|
194545
|
3.60
|
|
TENURE
|
6204
|
194545
|
3.19
|
|
Code_1990
|
3932
|
194545
|
2.02
|
|
IND
|
3678
|
194545
|
1.89
|
|
OCC
|
3396
|
194545
|
1.75
|
|
HGC
|
1450
|
194545
|
0.75
|
|
Region
|
1223
|
194545
|
0.63
|
|
marital_status
|
324
|
194545
|
0.17
|
|
UID
|
4
|
194545
|
0.00
|
|
PUBID_1997
|
0
|
194545
|
0.00
|
|
DOB
|
0
|
194545
|
0.00
|
|
SAMPLE_RACE_1997
|
0
|
194545
|
0.00
|
|
SAMPLE_SEX_1997
|
0
|
194545
|
0.00
|
|
Interview_Date
|
0
|
194545
|
0.00
|
|
Year
|
0
|
194545
|
0.00
|
|
Employed
|
0
|
194545
|
0.00
|
|
StartDate
|
0
|
194545
|
0.00
|
|
StopDate
|
0
|
194545
|
0.00
|
|
Occupation
|
0
|
194545
|
0.00
|
|
Occupation_Group
|
0
|
194545
|
0.00
|
|
Occupation_Group2
|
0
|
194545
|
0.00
|
|
Industry
|
0
|
194545
|
0.00
|
|
Industry_Group
|
0
|
194545
|
0.00
|
|
index_col
|
0
|
194545
|
0.00
|
missing_summary %>%
filter(Missing_Count > 0) %>%
ggplot(aes(x = reorder(Column, Missing_Pct), y = Missing_Pct)) +
geom_col(fill = "#2c7bb6") +
coord_flip() +
labs(
title = "Missing Data by Column (%)",
x = NULL,
y = "% Missing"
) +
theme_minimal()

3. Date Parsing
df_dates <- df_raw %>%
mutate(
DOB = mdy(DOB),
Interview_Date = mdy(Interview_Date),
StartDate = mdy(StartDate),
StopDate = mdy(StopDate),
# Derived fields
Age_at_Interview = as.numeric(interval(DOB, Interview_Date) / years(1)),
Spell_Length_Days = as.numeric(StopDate - StartDate)
)
# Sanity checks
cat("DOB range: ", format(range(df_dates$DOB, na.rm = TRUE)), "\n")
## DOB range: 1980-01-15 1984-12-15
cat("Interview_Date range:", format(range(df_dates$Interview_Date, na.rm = TRUE)), "\n")
## Interview_Date range: 1997-01-15 2022-10-15
cat("StartDate range: ", format(range(df_dates$StartDate, na.rm = TRUE)), "\n")
## StartDate range: 1900-01-15 2022-09-15
cat("StopDate range: ", format(range(df_dates$StopDate, na.rm = TRUE)), "\n")
## StopDate range: 1990-11-15 2022-10-15
# Flag negative spell lengths
neg_spells <- sum(df_dates$Spell_Length_Days < 0, na.rm = TRUE)
cat("\nNegative spell lengths (StopDate < StartDate):", neg_spells, "\n")
##
## Negative spell lengths (StopDate < StartDate): 0
4. Wage Filtering
cat("HRLY_WAGE — raw summary:\n")
## HRLY_WAGE — raw summary:
summary(df_dates$HRLY_WAGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.01 7.00 10.00 14.65 15.38 6000.00 10729
cat("\nTotal rows: ", nrow(df_dates))
##
## Total rows: 194545
cat("\nRows with NA HRLY_WAGE: ", sum(is.na(df_dates$HRLY_WAGE)))
##
## Rows with NA HRLY_WAGE: 10729
cat("\nRows below $5.15 (pre-1997 min wage): ", sum(df_dates$HRLY_WAGE < 5.15, na.rm = TRUE), "\n")
##
## Rows below $5.15 (pre-1997 min wage): 12215
# Upper outlier bound — 99th percentile among valid wages (>= $5.15)
wage_valid <- df_dates$HRLY_WAGE[df_dates$HRLY_WAGE >= 5.15 & !is.na(df_dates$HRLY_WAGE)]
upper_bound <- quantile(wage_valid, 0.99)
cat("99th percentile upper bound: $", round(upper_bound, 2), "\n")
## 99th percentile upper bound: $ 78.95
cat("Rows above upper bound: ", sum(wage_valid > upper_bound), "\n")
## Rows above upper bound: 1715
# Drop rows where HRLY_WAGE is NA or below $5.15 (1997 federal minimum wage)
# Cap extreme upper outliers at 99th percentile
rows_before <- nrow(df_dates)
df_wages <- df_dates %>%
filter(!is.na(HRLY_WAGE), HRLY_WAGE >= 5.15) %>%
mutate(
HRLY_WAGE_clean = if_else(HRLY_WAGE > upper_bound, NA_real_, HRLY_WAGE),
HRLY_COMP_clean = if_else(is.na(HRLY_COMP) | HRLY_COMP <= 0, NA_real_, HRLY_COMP)
)
rows_after <- nrow(df_wages)
cat("Rows before wage filter:", rows_before, "\n")
## Rows before wage filter: 194545
cat("Rows after wage filter: ", rows_after, "\n")
## Rows after wage filter: 171601
cat("Rows removed: ", rows_before - rows_after, "\n")
## Rows removed: 22944
cat("\nHRLY_WAGE_clean — cleaned summary:\n")
##
## HRLY_WAGE_clean — cleaned summary:
summary(df_wages$HRLY_WAGE_clean)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.15 7.25 10.00 13.57 15.94 78.95 1715
df_wages %>%
filter(!is.na(HRLY_WAGE_clean)) %>%
ggplot(aes(x = HRLY_WAGE_clean)) +
geom_histogram(bins = 60, fill = "#2c7bb6", color = "white") +
scale_x_continuous(labels = scales::dollar_format()) +
labs(
title = "Distribution of Hourly Wage (Cleaned)",
subtitle = "Excludes wages below $5.15 (1997 federal minimum) and NA; caps at 99th percentile",
x = "Hourly Wage ($)",
y = "Count"
) +
theme_minimal()

5. Factor Encoding
df_clean <- df_wages %>%
mutate(
# Sex
SAMPLE_SEX_1997 = factor(SAMPLE_SEX_1997,
levels = c(1, 2),
labels = c("Male", "Female")
),
# Race (NLS 1997 coding)
SAMPLE_RACE_1997 = factor(SAMPLE_RACE_1997,
levels = c(1, 2, 3, 4),
labels = c("Black", "Hispanic", "Mixed Race (Non-Hispanic)", "Non-Black / Non-Hispanic")
),
# Region
Region = factor(Region,
levels = c(1, 2, 3, 4),
labels = c("Northeast", "North Central", "South", "West")
),
# Marital status
marital_status = factor(marital_status,
levels = c(0, 1, 2, 3, 4, 5),
labels = c("Never Married", "Married", "Separated", "Divorced", "Widowed", "Unknown")
),
# Occupation and industry as ordered factors for plotting
Occupation_Group = factor(Occupation_Group),
Occupation_Group2 = factor(Occupation_Group2),
Industry_Group = factor(Industry_Group),
# Employed as logical
Employed = as.logical(Employed),
# Year as integer
Year = as.integer(Year)
)
# Quick check
df_clean %>%
select(SAMPLE_SEX_1997, SAMPLE_RACE_1997, Region, marital_status) %>%
summary()
## SAMPLE_SEX_1997 SAMPLE_RACE_1997 Region
## Male :86937 Black :42907 Northeast :27639
## Female:84664 Hispanic :34437 North Central:39410
## Mixed Race (Non-Hispanic): 1553 South :65435
## Non-Black / Non-Hispanic :92704 West :38311
## NA's : 806
##
##
## marital_status
## Never Married:121265
## Married : 40042
## Separated : 2703
## Divorced : 7226
## Widowed : 192
## Unknown : 0
## NA's : 173
6. Validation
cat("=== Final Dataset Dimensions ===\n")
## === Final Dataset Dimensions ===
cat("Rows:", nrow(df_clean), "\n")
## Rows: 171601
cat("Cols:", ncol(df_clean), "\n")
## Cols: 30
cat("\n=== Unique Individuals ===\n")
##
## === Unique Individuals ===
cat(length(unique(df_clean$PUBID_1997)), "people\n")
## 8810 people
cat("\n=== Year Range ===\n")
##
## === Year Range ===
print(table(df_clean$Year))
##
## 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
## 922 5525 7400 9971 10297 10421 10100 10036 9651 10133 9711 9364 8379
## 2010 2011 2013 2015 2017 2019 2021
## 8183 7748 9393 8969 8531 8657 8210
cat("\n=== Employment Status ===\n")
##
## === Employment Status ===
print(table(df_clean$Employed))
##
## TRUE
## 171601
cat("\n=== Remaining Missing (key columns) ===\n")
##
## === Remaining Missing (key columns) ===
df_clean %>%
select(HRLY_WAGE_clean, HRLY_COMP_clean, Age_at_Interview, Spell_Length_Days, HGC, Region) %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Column", values_to = "Remaining_NA") %>%
mutate(Pct = round(Remaining_NA / nrow(df_clean) * 100, 2)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)
|
Column
|
Remaining_NA
|
Pct
|
|
HRLY_WAGE_clean
|
1715
|
1.00
|
|
HRLY_COMP_clean
|
6784
|
3.95
|
|
Age_at_Interview
|
0
|
0.00
|
|
Spell_Length_Days
|
0
|
0.00
|
|
HGC
|
1261
|
0.73
|
|
Region
|
806
|
0.47
|
cat("\nNote: HRLY_WAGE_clean NAs are upper outliers capped at 99th percentile only.\n")
##
## Note: HRLY_WAGE_clean NAs are upper outliers capped at 99th percentile only.
cat("All rows with wage < $5.15 or missing wage were removed at the filter step.\n")
## All rows with wage < $5.15 or missing wage were removed at the filter step.
7. Save Cleaned Data
#write.csv(df_clean, "graduate_full_clean.csv", row.names = FALSE)
cat("Saved: graduate_full_clean.csv\n")
## Saved: graduate_full_clean.csv