Requirements

library(lattice)
library(ggplot2)
library(car)
library(stargazer, quietly = T)
library(plm)
library(sampleSelection) 
library(sandwich)
library(lmtest)
library(readr)

Source data from github repository

Read in raw data

deg.creds_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_edu.csv")

health_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_health.csv")

spouse_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_spouse.csv")

marital_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_marital.csv")

indust_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_indust.csv")

tenure_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_tenure.csv")

occu_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_occu.csv")

ex_occ_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_ex_occu.csv")

occudesire_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_occdesire.csv")

hrs_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tageset_hrs.csv")

wage_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tageset_hrwage.csv")

income_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_income.csv")

age_dat <- read_csv("https://raw.githubusercontent.com/cjmarraro/wageGapData/master/data/csv_dir/tagset_age.csv")

preprocess data

age_dat <- age_dat[,c("R0000100", "T3966200")]
income_dat = income_dat[,c("R0214700", "R0214800", "R0328000", 
                           "R0482600", "R0782101","R1024001", 
                           "R1410701", "R1778501", "R2141601",
                           "R2350301","R2722501", "R2971401", 
                           "R3279401","R3559001","R3897101",
                           "R4295101", "R5626201", "R6364601",
                           "R6909701" ,"R7607800", "R8316300",
                           "T0912400", "T3045300" ,"T3977400")]  
wage_dat = wage_dat[,c("R0047010", "R0263710", "R0446810", 
                       "R0702510" ,"R0945610", "R1256010", 
                       "R1650810", "R1923410","R2318210", "R2526010",
                       "R2925010" ,"R3127800" ,"R3523500", "R3728500",
                       "R4416800", "R5079800" ,"R5165200" ,"R6478000", 
                       "R7005700" ,"R7702900","R8495200" ,"T0986800")]
  
hrs_dat = hrs_dat[,c("R0070500", "R0337800", "R0545600", "R0839900", 
                     "R1087300","R1463000" ,"R1809800" ,"R2171500", 
                     "R2376300", "R2770900", "R3012700", "R3340100", 
                     "R3604400", "R3954600", "R4582900", "R5267100", 
                     "R5912600", "R6578200", "R7192900" ,"R7879600",
                     "T0121900" )]
hrs_dat$HRS.NA <- NA
occu_dat = occu_dat[,c("R0263400", "R0828000", "R0945300", "R1255700",
                       "R1650500","R1923100", "R2317900", "R2525700", 
                       "R2924700", "R3127400", "R3523100" ,"R3728100",
                       "R4193700", "R4587901", "R5270900","R6473700",
                       "R6592900", "R7209600", "R7898000", "T0138400")]
occu_dat$OCC.NA <- NA
occu_dat$OCC.NA1 <- NA
exp_occu_dat = ex_occ_dat[,-1]
occ_desr = occudesire_dat[,-1]
indust_dat = indust_dat[,c("R0046300", "R0263300", "R0446300", 
                           "R0702000", "R0944900","R1255300", 
                           "R1650100", "R1922700","R2317500" ,
                           "R2525300", "R2924300", "R3127000", 
                           "R3522700", "R3727700", "R4182000",
                           "R4587901", "R5270500", "R6472100", 
                           "R6591300" ,"R7209100", "R7897500", 
                           "T0137900")]
tenure_dat = tenure_dat[,c("R0068710", "R0333221", "R0539410", 
                           "R0833810", "R1081010","R1456710" ,
                           "R1803510", "R2165110", "R2372510",
                           "R2763410", "R3005210", "R3332610",
                           "R3597610", "R3947800", "R4416300",
                           "R5079300", "R5164600" ,"R6477500", 
                           "R7005200", "R7702400","R8494700", 
                           "T0986300")]
marital_dat = marital_dat[,c("R0217501", "R0405601", "R0618601", 
                             "R0898401","R1144901", "R1520101", 
                             "R1890801", "R2257901", "R2445301", 
                             "R2871000","R3074700", "R3401400",
                             "R3656800", "R4007300", "R4418400", 
                             "R5081400", "R5166700", "R6479300", 
                             "R7007000", "R7704300","R8496700", 
                             "T0988500")]
spouse_dat = spouse_dat[,c("R0155500", "R0312710", "R0482910", 
                           "R0784300", "R1024001","R1410701", 
                           "R1780701", "R2143801", "R2352501", 
                           "R2724701", "R2973600", "R3281600", 
                           "R3571801", "R3899300", "R4314401",
                           "R4996001", "R5650801", "R6374901", 
                           "R6917801", "R8325800", "T0920800" )]
spouse_dat$SPO.NA <-NA
health_dat = health_dat[,-1]
deg.creds_dat = deg.creds_dat[,-1]

generate column names for time-series

mylist = list()
iterColNames <- function(mylist){
    appendYear = c(as.character(79:94),'96','98','00',
                   '02','04','06')
    myColNames = list()  
    for(i in mylist){
          myColNames[[i]] = paste(i, appendYear, sep = ".")
          myColNames = as.list(myColNames)
          myColNames = myColNames[myColNames= as.character(mylist)]
          }
    return(myColNames)
    }

Calculate the variables tenure and “total experience” in 2006

calc_experience <- (rowSums(tenure_dat))
tot_exp06 <-as.data.frame(calc_experience) 
tenure <- as.data.frame(rowMeans(tenure_dat))
dat_all <- data.frame(age_dat, deg.creds_dat, exp_occu_dat, 
                      tot_exp06,  occ_desr, income_dat,  occu_dat, 
                      indust_dat, tenure_dat, marital_dat, spouse_dat, 
                      wage_dat, hrs_dat, health_dat, tenure)
  dat_all[dat_all == -1] = NA  # Refused 
  dat_all[dat_all == -2] = NA  # Dont know 
  dat_all[dat_all == -3] = NA  # Invalid missing 
  dat_all[dat_all == -4] = NA  # Valid missing 
  dat_all[dat_all == -5] = NA  # Non-interview 
myColNames <-iterColNames(list("INC", "OCC", "IND", 
                               "TEN", "MAR", "SPO", 
                               "WAG", "HRS"))
colnames(dat_all) <- c("CaseID","Age", "CREDITS", "EXPT.OCC", 
                       "TOT.EXP06", "DES.OCC", "Race", "Sex", 
                       as.character(as.list(unlist(myColNames))), 
                       "PHYS", "WGHT", "tenure")

Subset individuals with bachelors degree or higher

dat_ed = (dat_all$CREDITS > 100)
dat_use <-subset(dat_all, dat_ed)

compare percentage change of income in 2006 from previous years income

years_stagger <- dat_use[,c(9,12,15,20)]
year06 <- dat_use[,30]
wage <- as.matrix((log(year06/years_stagger))/15 )
                   wage[is.infinite(as.numeric(wage))] <- NA
summary(wage)

##      INC.79           INC.82            INC.85            INC.90        
##  Min.   :0.1030   Min.   :-0.1023   Min.   :-0.1284   Min.   :-0.24795  
##  1st Qu.:0.2679   1st Qu.: 0.1821   1st Qu.: 0.1315   1st Qu.: 0.04621  
##  Median :0.3447   Median : 0.2396   Median : 0.1832   Median : 0.07079  
##  Mean   :0.3427   Mean   : 0.2361   Mean   : 0.1827   Mean   : 0.07182  
##  3rd Qu.:0.4027   3rd Qu.: 0.2907   3rd Qu.: 0.2364   3rd Qu.: 0.09714  
##  Max.   :0.6454   Max.   : 0.5593   Max.   : 0.4131   Max.   : 0.39387  
##  NA's   :367      NA's   :242       NA's   :223       NA's   :221

transform to panel data

years <- c(1979:1994,1996,1998,2000,2002,2004,2006)
dat_reshape <-reshape(dat_use, direction = "long",
                      varying = list(myColNames$INC, myColNames$OCC,
                      myColNames$IND,myColNames$TEN, myColNames$MAR, 
                      myColNames$SPO, myColNames$WAG, myColNames$HRS),
                      v.names=c("INC", "OCC", "IND", "TEN", "MAR", 
                      "SPO", "WAG", "HRS"), idvar="CaseID", 
                      times=rep(years))

Additional features

dat_panel = dat_reshape
dat_hrs = (dat_panel[, "HRS"] >= 35)  # full-time workers only
data_NLSY <- subset(dat_panel, dat_hrs)
attach(data_NLSY)

# occupation dummies
mngmt = as.numeric((OCC >= 1) & (OCC <= 43))
professional = as.numeric((OCC >= 50) & (OCC <= 95))
hard_science = as.numeric((OCC >= 100) & (OCC <= 156))
technical <- as.numeric((OCC >= 203) & (OCC <= 235))
sales <- as.numeric((OCC >= 243) & (OCC <= 285))
clerical <- as.numeric((OCC >= 303) & (OCC <= 395))
service <- as.numeric((OCC >= 403) & (OCC <= 469))
operatives <- as.numeric((OCC >= 473) & (OCC <= 889))

occ_indx <- as.factor(0 * mngmt + 1 * professional + 2 * hard_science + 3 * 
    technical + 4 * sales + 5 * clerical + 6 * service + 7 * operatives)

levels(occ_indx)[1] <- "mngmt"
levels(occ_indx)[2] <- "prof"
levels(occ_indx)[3] <- "hard science"
levels(occ_indx)[4] <- "technical"
levels(occ_indx)[5] <- "sales"
levels(occ_indx)[6] <- "clerical"
levels(occ_indx)[7] <- "services"
levels(occ_indx)[8] <- "operatives"

occ_indx = relevel(occ_indx, ref = "mngmt")
levels(occ_indx)

## [1] "mngmt"        "prof"         "hard science" "technical"   
## [5] "sales"        "clerical"     "services"     "operatives"

female <- as.numeric(Sex == 2)
hispanic <- as.numeric(Race == 1)
AfAm <- as.numeric(Race == 2)
race_other <- as.numeric(Race == 3)

# Physical abilities proxy
phys <- as.factor(PHYS)
levels(phys) <- c("Excellent", "Very good", "Good", "Fair", "Poor")
phys = relevel(phys, ref = "Good")

# Physical health proxy
hlth <- as.factor(WGHT)
levels(hlth) <- c("overweight", "underweight", "normal", "notascertained")
hlth = relevel(hlth, ref = "normal")

marital <- as.factor(MAR)
levels(marital) <- c("never married", "married", "separated", "divorced", "remarried", 
    "widowed")
marital = relevel(marital, ref = "married")

# 5 yr career expectation parameter <-- career expectations in 1979, those
# who expected to be married (i.e. didn't plan on having a job) := 0,
# otherwise := 1
exp_career <- as.numeric(EXPT.OCC == c(1, 2, 4))
tenure = round(data_NLSY$tenure)
logHours <- log(HRS)
logHours[is.infinite(logHours)] <- NA

logWage <- log(WAG)
logWage[is.infinite((logWage))] <- NA

# occupational expectations to be compared with actual occupation
desr_mngmt <- as.numeric((DES.OCC >= 1) & (DES.OCC <= 43))
desr_professional <- as.numeric((DES.OCC >= 50) & (DES.OCC <= 95))
desr_hard_science <- as.numeric((DES.OCC >= 100) & (DES.OCC <= 156))
desr_technical <- as.numeric((DES.OCC >= 203) & (DES.OCC <= 235))
desr_sales <- as.numeric((DES.OCC >= 243) & (DES.OCC <= 285))
desr_clerical <- as.numeric((DES.OCC >= 303) & (DES.OCC <= 395))
desr_service <- as.numeric((DES.OCC >= 403) & (DES.OCC <= 469))
desr_operatives <- as.numeric((DES.OCC >= 473) & (DES.OCC <= 889))

desr.occ_indx <- as.factor(0 * desr_mngmt + 1 * desr_professional + 2 * desr_hard_science + 
    3 * desr_technical + 4 * desr_sales + 5 * desr_clerical + 6 * desr_service + 
    7 * desr_operatives)

levels(desr.occ_indx)[1] <- "desr.mngmt"
levels(desr.occ_indx)[2] <- "desr.prof"
levels(desr.occ_indx)[3] <- "desr.hard science"
levels(desr.occ_indx)[4] <- "desr.technical"
levels(desr.occ_indx)[5] <- "desr.sales"
levels(desr.occ_indx)[6] <- "desr.clerical"
levels(desr.occ_indx)[7] <- "desr.services"
levels(desr.occ_indx)[8] <- "desr.operatives"

print(by(data_NLSY$INC, occ_indx, summary))  # income, occupation comparison

## occ_indx: mngmt
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   35000   50000   64161   75000  279816      43 
## -------------------------------------------------------- 
## occ_indx: prof
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   42000   56000   67061   75000  236000      10 
## -------------------------------------------------------- 
## occ_indx: hard science
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   32000   47000   61558   78000  279816       3 
## -------------------------------------------------------- 
## occ_indx: technical
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1200   27000   42000   46225   62300  236000       6 
## -------------------------------------------------------- 
## occ_indx: sales
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     130   21000   34575   42943   60000  216200      13 
## -------------------------------------------------------- 
## occ_indx: clerical
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   10300   28000   47512   58000  236000      25 
## -------------------------------------------------------- 
## occ_indx: services
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0    2500   18000   40679   55000  279816      15 
## -------------------------------------------------------- 
## occ_indx: operatives
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   15000   31000   37176   48000  236000      21

careers <- as.data.frame(cbind(as.character(occ_indx), as.character(desr.occ_indx)))
colnames(careers) <- c("Actual", "Desired")
careers_tab <- xtabs(~Actual + Desired, careers, addNA = T)
dActual <- rowSums(careers_tab)
dDesired <- colSums(careers_tab)
career_result <- rbind(dActual, dDesired)
barplot(career_result[, -9], col = c("black", "grey"), beside = T, main = "Actual vs. Desired Occupation", 
    legend.text = c("Actual", "Desired"), ylim = c(0, 960))

data_NLSY = cbind.data.frame(data_NLSY[, c("CaseID", "time", "Age", "Race", 
    "Sex", "PHYS", "WGHT", "CREDITS", "EXPT.OCC", "TOT.EXP06", "DES.OCC", "tenure", 
    "INC", "OCC", "IND", "MAR", "SPO", "WAG", "HRS")], occ_indx, desr.occ_indx, 
    marital, female, hispanic, AfAm, race_other, logHours, logWage, hlth, exp_career)

detach(data_NLSY)
# Tests
coplot(logWage ~ time | CaseID * female, type = "l", col = c("blue", " pink"), 
    data = data_NLSY, panel = function(x, y, ...) panel.smooth(x, y, span = 0.6, 
        ...))

## 
##  Missing rows: 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 154, 180, 195, 207, 229, 287, 288, 290, 292, 293, 294, 295, 296, 297, 298, 300, 301, 302, 304, 306, 329, 337, 367, 435, 483, 516, 518, 520, 523, 543, 549, 556, 568, 599, 614, 628, 637, 678, 765, 770, 812, 853, 885, 909, 910, 913, 932, 941, 953, 955, 958, 978, 990, 991, 997, 1001, 1020, 1031, 1103, 1104, 1127, 1128, 1130, 1144, 1160, 1195, 1211, 1230, 1251, 1267, 1276, 1299, 1322, 1330, 1342, 1344, 1349, 1393, 1469, 1471, 1474, 1488, 1493, 1511, 1523, 1531, 1535, 1564, 1603, 1648, 1660, 1664, 1679, 1688, 1715, 1765, 1809, 1811, 1813, 1832, 1870, 1880, 1887, 1892, 1930, 1945, 1964, 1965, 1974, 1978, 1999, 2068, 2076, 2100, 2115, 2126, 2158, 2181, 2185, 2191, 2200, 2218, 2248, 2261, 2267, 2270, 2295, 2297, 2299, 2313, 2334, 2337, 2339, 2351, 2356, 2373, 2390, 2437, 2439, 2441, 2445, 2505, 2531, 2532, 2534, 2544, 2547, 2555, 2558, 2579, 2586

xyplot(logWage ~ time, data = data_NLSY, groups = factor(female, labels = c("M", 
    "F")), pch = 20, auto.key = T, type = c("p", "g"))

Tests

m.ols_null <- lm(logWage~occ_indx+female)
m.ols_full <- lm(logWage~female+AfAm+hispanic+
                 phys+hlth + marital+occ_indx+
                 tenure+I(tenure^2)+data_NLSY$TOT.EXP06+
                 I(data_NLSY$TOT.EXP06^2)+exp_career+IND+
                 SPO, data=data_NLSY[,c(1:11,13:19)])
mfixed <- plm(logWage~occ_indx+female, data=data_NLSY,
              index=c("CaseID", "time"), 
              model="within") #F stat < .05, use log wage only
pFtest(mfixed, m.ols_null) # F test p-val < .05; use fixed

## 
##  F test for individual effects
## 
## data:  logWage ~ occ_indx + female
## F = 3.179, df1 = 418, df2 = 1515, p-value < 2.2e-16
## alternative hypothesis: significant effects

mrandom <- plm(logWage~occ_indx+female,
               data=data_NLSY, index=c("CaseID", "time"),
               model="random")
phtest(mfixed, mrandom) # < .05, use fixed

## 
##  Hausman Test
## 
## data:  logWage ~ occ_indx + female
## chisq = 94.571, df = 7, p-value < 2.2e-16
## alternative hypothesis: one model is inconsistent

mfixed.2 <-plm(logWage~female+AfAm+hispanic+phys+ 
               hlth+marital+occ_indx+tenure+I(tenure^2)+ 
               TOT.EXP06+I(TOT.EXP06^2)+
               exp_career + IND+SPO, 
               index=c("CaseID", "time"),
               model="within", data=data_NLSY)
coeftest(mfixed.2)

## 
## t test of coefficients:
## 
##                         Estimate  Std. Error t value  Pr(>|t|)    
## physExcellent        -3.5780e-02  8.0394e-02 -0.4451 0.6564429    
## physVery good         1.4246e-02  4.5706e-02  0.3117 0.7553872    
## physFair             -2.2353e-02  4.7956e-02 -0.4661 0.6413073    
## physPoor              3.7020e-02  5.6022e-02  0.6608 0.5089956    
## maritalnever married  1.1848e-01  1.5945e-01  0.7431 0.4577542    
## maritalseparated      4.4580e-02  5.2263e-01  0.0853 0.9320522    
## maritaldivorced       2.0855e-01  2.1427e-01  0.9733 0.3308023    
## occ_indxprof         -3.9529e-01  6.1773e-02 -6.3990 3.253e-10 ***
## occ_indxhard science -1.2213e-01  7.1596e-02 -1.7058 0.0885878 .  
## occ_indxtechnical    -7.2264e-02  7.3738e-02 -0.9800 0.3274958    
## occ_indxsales        -2.7211e-01  1.0646e-01 -2.5561 0.0108427 *  
## occ_indxclerical     -1.0887e-01  8.4019e-02 -1.2958 0.1955605    
## occ_indxservices      2.3526e-02  9.2537e-02  0.2542 0.7994097    
## occ_indxoperatives   -1.4798e-01  8.3864e-02 -1.7646 0.0781702 .  
## exp_career            2.7865e-03  3.8991e-02  0.0715 0.9430524    
## IND                   2.8894e-04  6.8037e-05  4.2469 2.529e-05 ***
## SPO                   3.7542e-06  1.0297e-06  3.6459 0.0002907 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

coeftest(mfixed.2, vcovHC(mfixed.2, method = "arellano"))

## 
## t test of coefficients:
## 
##                         Estimate  Std. Error t value  Pr(>|t|)    
## physExcellent        -3.5780e-02  6.7786e-02 -0.5278 0.5978114    
## physVery good         1.4246e-02  5.0757e-02  0.2807 0.7790635    
## physFair             -2.2353e-02  5.4844e-02 -0.4076 0.6837363    
## physPoor              3.7020e-02  5.1190e-02  0.7232 0.4698574    
## maritalnever married  1.1848e-01  2.0705e-01  0.5722 0.5674023    
## maritalseparated      4.4580e-02  9.1088e-02  0.4894 0.6247293    
## maritaldivorced       2.0855e-01  2.9624e-01  0.7040 0.4817145    
## occ_indxprof         -3.9529e-01  7.6712e-02 -5.1529 3.535e-07 ***
## occ_indxhard science -1.2213e-01  7.7791e-02 -1.5700 0.1169785    
## occ_indxtechnical    -7.2264e-02  6.9018e-02 -1.0470 0.2955236    
## occ_indxsales        -2.7211e-01  1.0795e-01 -2.5207 0.0119825 *  
## occ_indxclerical     -1.0887e-01  9.7751e-02 -1.1138 0.2658395    
## occ_indxservices      2.3526e-02  1.3743e-01  0.1712 0.8641358    
## occ_indxoperatives   -1.4798e-01  8.5465e-02 -1.7315 0.0838986 .  
## exp_career            2.7865e-03  4.2321e-02  0.0658 0.9475268    
## IND                   2.8894e-04  7.6577e-05  3.7733 0.0001779 ***
## SPO                   3.7542e-06  1.4136e-06  2.6558 0.0081327 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Selection bias

exp_career_female <- exp_career*female
selection.1 <- selection(selection=exp_career_female~occ_indx + marital, 
                         outcome=logWage~tenure+I(tenure^2)+
                         data_NLSY$SPO+data_NLSY$TOT.EXP06,
                         data=data_NLSY[,c(1:11,13:19)], 
                         method="2step" )
probit_sel <- glm(exp_career_female~occ_indx+AfAm+hispanic+
                    logHours+tenure+I(tenure^2)+data_NLSY$TOT.EXP06+
                    I(data_NLSY$TOT.EXP06^2)+
                    data_NLSY$SPO+marital, family="binomial" 
                    (link="probit"), data = data_NLSY[,c(1:11,13:19)])
imr_var <- dnorm(probit_sel$linear.predictors)/pnorm(probit_sel$linear.predictors)
IMR <-cbind(imr_var, exp_career_female) #inverse mill's ratio
outcome1 <- lm(logWage ~ tenure + I(tenure^2)+
               data_NLSY$SPO+data_NLSY$TOT.EXP06+
               IMR[,1], subset=(exp_career_female==1),
               data=data_NLSY[,c(1:11,13:19)])
outcome2 <- lm(logWage~logHours+AfAm+
               hispanic+phys+hlth+marital+
               occ_indx+tenure+I(tenure^2)+
               data_NLSY$TOT.EXP06+
               I(data_NLSY$TOT.EXP06^2)+
               data_NLSY$SPO+IMR[,1],
               subset=(exp_career_female==1), 
               data=data_NLSY[,c(1:11,13:19)])

Tables

star_probit <- stargazer(probit_sel, type="html", 
                         out="tableprobit.html", align=T, 
                         title="Table. 1 Probit Model of Occupational Expectations for Women", 
                         intercept.bottom=F, flip=T)

**Table. 1 Probit Model of Occupational Expectations for Women**

	Dependent variable:

	exp_career_female

Constant	3.145^**
	(1.529)

occ_indxprof	0.442^***
	(0.168)

occ_indxhard science	0.609^***
	(0.186)

occ_indxtechnical	0.339
	(0.217)

occ_indxsales	0.137
	(0.280)

occ_indxclerical	0.210
	(0.258)

occ_indxservices	-0.114
	(0.319)

occ_indxoperatives	-0.092
	(0.259)

AfAm	-0.071
	(0.153)

hispanic	0.213
	(0.178)

logHours	-1.194^***
	(0.395)

tenure	0.061
	(0.375)

I(tenure2)	0.001
	(0.001)

TOT.EXP06	-0.003
	(0.017)

TOT.EXP062)	-0.00000
	(0.00000)

SPO	0.00000^*
	(0.00000)

maritalnever married	1.153^***
	(0.245)

maritalseparated	-3.734
	(103.654)

maritaldivorced	-0.044
	(0.588)


Observations	951
Log Likelihood	-306.237
Akaike Inf. Crit.	650.473

Note:	p<0.1; p<0.05; p<0.01

star_selection <- stargazer(m.ols_null, m.ols_full, 
                            outcome1, outcome2, selection.1, 
                            type="html", out="tableselection.html",
                            title="Table. 2 Heckman Model Selection", 
                            column.labels=c("uncorrected", "corrected"), 
                            column.separate=c(2,2), selection.equation=T,                             single.row=T, dep.var.labels.include=F,
                            model.names=T, align=T, multicolumn=F)

**Table. 2 Heckman Model Selection**

	Dependent variable:

	OLS	OLS	OLS	OLS	selection
	uncorrected		corrected
	(1)	(2)	(3)	(4)	(5)

logHours				-0.439 (0.366)
occ_indxprof	0.003 (0.051)	-0.071 (0.060)		0.065 (0.167)	0.295^** (0.148)
occ_indxhard science	-0.053 (0.062)	-0.118 (0.073)		-0.357^* (0.188)	0.501^*** (0.164)
occ_indxtechnical	-0.187^*** (0.068)	-0.098 (0.076)		-0.060 (0.226)	0.320^* (0.186)
occ_indxsales	-0.433^*** (0.068)	-0.215^** (0.102)		-0.589^** (0.287)	0.035 (0.228)
occ_indxclerical	-0.393^*** (0.062)	-0.148^* (0.089)		-0.206 (0.280)	-0.046 (0.222)
occ_indxservices	-0.517^*** (0.065)	0.079 (0.092)		-0.284 (0.354)	-0.228 (0.260)
occ_indxoperatives	-0.400^*** (0.059)	-0.195^** (0.078)		-0.205 (0.273)	-0.165 (0.214)
tenure		0.022 (0.126)	0.117 (0.316)	1.072^** (0.446)
I(tenure2)		-0.0004 (0.0003)	-0.00001 (0.00000)	-0.004^*** (0.001)
SPO			0.00000 (0.00000)	0.00000 (0.00000)
TOT.EXP06		-0.001 (0.006)	-0.005 (0.014)	-0.049^** (0.020)
TOT.EXP062)		0.00000 (0.00000)		0.00001^*** (0.00000)
exp_career		-0.028 (0.041)
IND		0.00000 (0.0001)
SPO		0.00000^*** (0.00000)
female	-0.235^*** (0.032)	-0.223^*** (0.043)
AfAm		-0.084^* (0.049)		-0.172 (0.183)
hispanic		-0.016 (0.068)		-0.028 (0.232)
physExcellent		-0.008 (0.101)		-0.177 (0.306)
physVery good		-0.107^** (0.052)		-0.075 (0.191)
physFair		-0.006 (0.052)		-0.047 (0.170)
physPoor		0.065 (0.061)		0.045 (0.196)
hlthoverweight		-0.083^* (0.046)		-0.184 (0.184)
hlthunderweight		-0.159 (0.124)		-0.687 (0.468)
hlthnotascertained		0.013 (0.059)		0.061 (0.268)
maritalnever married		-0.022 (0.105)		0.273 (0.195)	-0.562^*** (0.132)
maritalseparated		0.112 (0.401)			-4.042 (135.430)
maritaldivorced		-0.025 (0.200)		0.824 (0.615)	-0.965^*** (0.373)
IMR[, 1]			-0.155 (0.357)	0.084 (0.226)
maritalremarried					-3.979 (318.613)
Constant	7.613^*** (0.026)	7.411^*** (0.088)	6.843^*** (0.688)	8.635^*** (1.464)	-1.498^*** (0.076)

Observations	1,942	903	130	102	1,871
R²	0.094	0.114	0.056	0.400
Adjusted R²	0.091	0.086	0.018	0.202
rho					0.626
Inverse Mills Ratio					0.422^* (0.233)
Residual Std. Error	0.687 (df = 1933)	0.556 (df = 875)	1.048 (df = 124)	0.544 (df = 76)
F Statistic	25.163^*** (df = 8; 1933)	4.151^*** (df = 27; 875)	1.479 (df = 5; 124)	2.025^** (df = 25; 76)

Note:	p<0.1; p<0.05; p<0.01

star_FE <- stargazer(m.ols_null, m.ols_full, mfixed, mfixed.2, 
                     type="html", out="tableFE.html",
                     title="Table. 3 OLS and Fixed Effects Comparison",
                     column.labels=c("OLS", "FE"), 
                     column.separate=c(2,2), 
                     single.row=T, dep.var.labels.include=F, 
                     model.names=F, align=T, multicolumn=F)

**Table. 3 OLS and Fixed Effects Comparison**

	Dependent variable:

	OLS		FE
	(1)	(2)	(3)	(4)

occ_indxprof	0.003 (0.051)	-0.071 (0.060)	-0.260^*** (0.053)	-0.395^*** (0.062)
occ_indxhard science	-0.053 (0.062)	-0.118 (0.073)	-0.032 (0.062)	-0.122^* (0.072)
occ_indxtechnical	-0.187^*** (0.068)	-0.098 (0.076)	-0.092 (0.067)	-0.072 (0.074)
occ_indxsales	-0.433^*** (0.068)	-0.215^** (0.102)	-0.464^*** (0.072)	-0.272^** (0.106)
occ_indxclerical	-0.393^*** (0.062)	-0.148^* (0.089)	-0.354^*** (0.061)	-0.109 (0.084)
occ_indxservices	-0.517^*** (0.065)	0.079 (0.092)	-0.401^*** (0.064)	0.024 (0.093)
occ_indxoperatives	-0.400^*** (0.059)	-0.195^** (0.078)	-0.221^*** (0.061)	-0.148^* (0.084)
tenure		0.022 (0.126)
I(tenure2)		-0.0004 (0.0003)
TOT.EXP06		-0.001 (0.006)
TOT.EXP062)		0.00000 (0.00000)
exp_career		-0.028 (0.041)		0.003 (0.039)
IND		0.00000 (0.0001)		0.0003^*** (0.0001)
SPO		0.00000^*** (0.00000)		0.00000^*** (0.00000)
female	-0.235^*** (0.032)	-0.223^*** (0.043)
AfAm		-0.084^* (0.049)
hispanic		-0.016 (0.068)
physExcellent		-0.008 (0.101)		-0.036 (0.080)
physVery good		-0.107^** (0.052)		0.014 (0.046)
physFair		-0.006 (0.052)		-0.022 (0.048)
physPoor		0.065 (0.061)		0.037 (0.056)
hlthoverweight		-0.083^* (0.046)
hlthunderweight		-0.159 (0.124)
hlthnotascertained		0.013 (0.059)
maritalnever married		-0.022 (0.105)		0.118 (0.159)
maritalseparated		0.112 (0.401)		0.045 (0.523)
maritaldivorced		-0.025 (0.200)		0.209 (0.214)
Constant	7.613^*** (0.026)	7.411^*** (0.088)

Observations	1,942	903	1,942	853
R²	0.094	0.114	0.065	0.140
Adjusted R²	0.091	0.086	-0.198	-0.279
Residual Std. Error	0.687 (df = 1933)	0.556 (df = 875)
F Statistic	25.163^*** (df = 8; 1933)	4.151^*** (df = 27; 875)	15.104^*** (df = 7; 1515)	5.472^*** (df = 17; 573)

Note:	p<0.1; p<0.05; p<0.01

#robustness 
cov1 <- vcovHC(mfixed.2)
robust.se <- sqrt(diag(cov1))
star_Robust <- stargazer(mfixed.2,title="Table.4 Robust Standard Errors",
                         type="html", out="robust.html", 
                          single.row=T, se=list(robust.se, NULL))

**Table.4 Robust Standard Errors**

	Dependent variable:

	logWage

physExcellent	-0.036 (0.068)
physVery good	0.014 (0.051)
physFair	-0.022 (0.055)
physPoor	0.037 (0.051)
maritalnever married	0.118 (0.207)
maritalseparated	0.045 (0.091)
maritaldivorced	0.209 (0.296)
occ_indxprof	-0.395^*** (0.077)
occ_indxhard science	-0.122 (0.078)
occ_indxtechnical	-0.072 (0.069)
occ_indxsales	-0.272^** (0.108)
occ_indxclerical	-0.109 (0.098)
occ_indxservices	0.024 (0.137)
occ_indxoperatives	-0.148^* (0.085)
exp_career	0.003 (0.042)
IND	0.0003^*** (0.0001)
SPO	0.00000^*** (0.00000)

Observations	853
R²	0.140
Adjusted R²	-0.279
F Statistic	5.472^*** (df = 17; 573)

Note:	p<0.1; p<0.05; p<0.01

coeftest(mfixed.2)

## 
## t test of coefficients:
## 
##                         Estimate  Std. Error t value  Pr(>|t|)    
## physExcellent        -3.5780e-02  8.0394e-02 -0.4451 0.6564429    
## physVery good         1.4246e-02  4.5706e-02  0.3117 0.7553872    
## physFair             -2.2353e-02  4.7956e-02 -0.4661 0.6413073    
## physPoor              3.7020e-02  5.6022e-02  0.6608 0.5089956    
## maritalnever married  1.1848e-01  1.5945e-01  0.7431 0.4577542    
## maritalseparated      4.4580e-02  5.2263e-01  0.0853 0.9320522    
## maritaldivorced       2.0855e-01  2.1427e-01  0.9733 0.3308023    
## occ_indxprof         -3.9529e-01  6.1773e-02 -6.3990 3.253e-10 ***
## occ_indxhard science -1.2213e-01  7.1596e-02 -1.7058 0.0885878 .  
## occ_indxtechnical    -7.2264e-02  7.3738e-02 -0.9800 0.3274958    
## occ_indxsales        -2.7211e-01  1.0646e-01 -2.5561 0.0108427 *  
## occ_indxclerical     -1.0887e-01  8.4019e-02 -1.2958 0.1955605    
## occ_indxservices      2.3526e-02  9.2537e-02  0.2542 0.7994097    
## occ_indxoperatives   -1.4798e-01  8.3864e-02 -1.7646 0.0781702 .  
## exp_career            2.7865e-03  3.8991e-02  0.0715 0.9430524    
## IND                   2.8894e-04  6.8037e-05  4.2469 2.529e-05 ***
## SPO                   3.7542e-06  1.0297e-06  3.6459 0.0002907 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Gender Wage Gap

Celia Marraro

6 August, 2018

Source data from github repository

Selection bias

Tables