Load Libraries

library(psych)
## Warning: package 'psych' was built under R version 4.3.3
library(visdat)
## Warning: package 'visdat' was built under R version 4.3.3
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(sjlabelled)
## Warning: package 'sjlabelled' was built under R version 4.3.3
## 
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:dplyr':
## 
##     as_label
library(sampleSelection)
## Warning: package 'sampleSelection' was built under R version 4.3.3
## Loading required package: maxLik
## Warning: package 'maxLik' was built under R version 4.3.3
## Loading required package: miscTools
## Warning: package 'miscTools' was built under R version 4.3.3
## 
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
## 
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/

Load Data and Clean

migra <- read.csv('migra.csv')
#View(migra)

migra <- migra %>%
  filter(INCWAGE >= 0 & INCWAGE < 999998) %>%
  rename(MIGRATE = MIGRATE5) %>%
  mutate(MIGRATE = ifelse(MIGRATE == 2 | MIGRATE == 3, 1, 0)) %>%
  filter(AGE >= 18 & AGE <= 45) %>%
  filter(EDUC >= 4 & EDUC <= 11) %>%
  mutate(EDUC = case_when(
    EDUC == 6 ~ 12,
    EDUC == 7 ~ 13,
    EDUC == 8 ~ 14,
    EDUC == 9 ~ 15,
    EDUC == 10 ~ 16,
    EDUC == 11 ~ 17
  )) %>%
  select(-c(IND, MIGRATE5D, EDUCD))
 
#factor migration into 1s and 0s
migra <- migra %>%
  mutate(MIGRATE = as.factor(MIGRATE))

# Define a small constant
epsilon <- 0.001

#create variables: exp and lwage
migra <- migra %>%
  mutate(exp = AGE - EDUC) %>%
  mutate(lwage = log(INCWAGE + epsilon))

#omit rows with missing values
migra <- na.omit(migra)

summary(migra)
##       YEAR           AGE             EDUC          INCWAGE       MIGRATE    
##  Min.   :2000   Min.   :18.00   Min.   :12.00   Min.   :     0   0:2024049  
##  1st Qu.:2000   1st Qu.:26.00   1st Qu.:12.00   1st Qu.:  4600   1:2665455  
##  Median :2000   Median :33.00   Median :13.00   Median : 19000              
##  Mean   :2000   Mean   :32.56   Mean   :13.39   Mean   : 25206              
##  3rd Qu.:2000   3rd Qu.:39.00   3rd Qu.:14.00   3rd Qu.: 35000              
##  Max.   :2000   Max.   :45.00   Max.   :17.00   Max.   :385000              
##       exp            lwage       
##  Min.   : 2.00   Min.   :-6.908  
##  1st Qu.:12.00   1st Qu.: 8.434  
##  Median :20.00   Median : 9.852  
##  Mean   :19.17   Mean   : 7.044  
##  3rd Qu.:26.00   3rd Qu.:10.463  
##  Max.   :33.00   Max.   :12.861

Probit and Heckit Selection Model

#generalized linear model using probit
myProbit <- glm(MIGRATE ~ AGE + EDUC + exp + I(exp^2), family = binomial(link = 'probit'), data = migra)
summary(myProbit)
## 
## Call:
## glm(formula = MIGRATE ~ AGE + EDUC + exp + I(exp^2), family = binomial(link = "probit"), 
##     data = migra)
## 
## Coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -2.463e-01  5.424e-03  -45.419   <2e-16 ***
## AGE         -1.486e+06  1.508e+06   -0.985    0.325    
## EDUC         1.486e+06  1.508e+06    0.985    0.325    
## exp          1.486e+06  1.508e+06    0.985    0.325    
## I(exp^2)    -2.406e-03  1.094e-05 -219.872   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6413029  on 4689503  degrees of freedom
## Residual deviance: 6120967  on 4689499  degrees of freedom
## AIC: 6120977
## 
## Number of Fisher Scoring iterations: 21
migra$IMR <- invMillsRatio(myProbit)$IMR1

myHeckit <- lm(lwage ~ EDUC + exp + I(exp^2) + IMR, data = migra[migra$MIGRATE == 1,])
summary(myHeckit)
## 
## Call:
## lm(formula = lwage ~ EDUC + exp + I(exp^2) + IMR, data = migra[migra$MIGRATE == 
##     1, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.343   1.216   2.356   3.155   6.998 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -26.695105   0.425823  -62.69   <2e-16 ***
## EDUC          0.884922   0.007418  119.30   <2e-16 ***
## exp           1.250729   0.016942   73.82   <2e-16 ***
## I(exp^2)     -0.048825   0.000697  -70.05   <2e-16 ***
## IMR          27.269928   0.437599   62.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.008 on 2665450 degrees of freedom
## Multiple R-squared:  0.02284,    Adjusted R-squared:  0.02284 
## F-statistic: 1.558e+04 on 4 and 2665450 DF,  p-value: < 2.2e-16

Alternative Estimation

# two-step estimation
summary(heckit(MIGRATE ~ AGE + EDUC + exp + I(exp^2),
                 lwage ~ EDUC + exp + I(exp^2),method = "2step",data = migra))
## --------------------------------------------
## Tobit 2 model (sample selection model)
## 2-step Heckman / heckit estimation
## 4689504 observations (2024049 censored and 2665455 observed)
## 12 free parameters (df = 4689493)
## Probit selection equation:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2468463        Inf       0        1
## AGE          0.0281919        Inf       0        1
## EDUC         0.0006445        Inf       0        1
## exp          0.0277223        Inf       0        1
## I(exp^2)    -0.0024060        Inf       0        1
## Outcome equation:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.71765        NaN     NaN      NaN
## EDUC          0.88511        NaN     NaN      NaN
## exp           1.25140        NaN     NaN      NaN
## I(exp^2)     -0.04885        NaN     NaN      NaN
## Multiple R-Squared:0.0228,   Adjusted R-Squared:0.0228
##    Error terms:
##               Estimate Std. Error t value Pr(>|t|)
## invMillsRatio   27.283        NaN     NaN      NaN
## sigma           21.541         NA      NA       NA
## rho              1.267         NA      NA       NA
## --------------------------------------------
# ML estimation
summary(selection(MIGRATE ~ AGE + EDUC + exp + I(exp^2),
                  lwage ~ EDUC + exp + I(exp^2), migra))
## --------------------------------------------
## Tobit 2 model (sample selection model)
## Maximum Likelihood estimation
## Newton-Raphson maximisation, 10 iterations
## Return code 3: Last step could not find a value above the current.
## Boundary of parameter space?  
## Consider switching to a more robust optimisation method temporarily.
## Log-Likelihood: -12676675 
## 4689504 observations (2024049 censored and 2665455 observed)
## 11 free parameters (df = 4689493)
## Probit selection equation:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.359737        Inf       0        1
## AGE          0.029919        Inf       0        1
## EDUC         0.011024        Inf       0        1
## exp          0.019071        Inf       0        1
## I(exp^2)    -0.002266        Inf       0        1
## Outcome equation:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.51641        Inf       0        1
## EDUC          0.96210        Inf       0        1
## exp           1.47930        Inf       0        1
## I(exp^2)     -0.05013        Inf       0        1
##    Error terms:
##       Estimate Std. Error t value Pr(>|t|)
## sigma    18.36        Inf       0        1
## rho       1.00        Inf       0        1
## --------------------------------------------