Load Libraries
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
library(visdat)
## Warning: package 'visdat' was built under R version 4.3.3
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(sjlabelled)
## Warning: package 'sjlabelled' was built under R version 4.3.3
##
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:dplyr':
##
## as_label
library(sampleSelection)
## Warning: package 'sampleSelection' was built under R version 4.3.3
## Loading required package: maxLik
## Warning: package 'maxLik' was built under R version 4.3.3
## Loading required package: miscTools
## Warning: package 'miscTools' was built under R version 4.3.3
##
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
##
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/
Load Data and Clean
migra <- read.csv('migra.csv')
#View(migra)
migra <- migra %>%
filter(INCWAGE >= 0 & INCWAGE < 999998) %>%
rename(MIGRATE = MIGRATE5) %>%
mutate(MIGRATE = ifelse(MIGRATE == 2 | MIGRATE == 3, 1, 0)) %>%
filter(AGE >= 18 & AGE <= 45) %>%
filter(EDUC >= 4 & EDUC <= 11) %>%
mutate(EDUC = case_when(
EDUC == 6 ~ 12,
EDUC == 7 ~ 13,
EDUC == 8 ~ 14,
EDUC == 9 ~ 15,
EDUC == 10 ~ 16,
EDUC == 11 ~ 17
)) %>%
select(-c(IND, MIGRATE5D, EDUCD))
#factor migration into 1s and 0s
migra <- migra %>%
mutate(MIGRATE = as.factor(MIGRATE))
# Define a small constant
epsilon <- 0.001
#create variables: exp and lwage
migra <- migra %>%
mutate(exp = AGE - EDUC) %>%
mutate(lwage = log(INCWAGE + epsilon))
#omit rows with missing values
migra <- na.omit(migra)
summary(migra)
## YEAR AGE EDUC INCWAGE MIGRATE
## Min. :2000 Min. :18.00 Min. :12.00 Min. : 0 0:2024049
## 1st Qu.:2000 1st Qu.:26.00 1st Qu.:12.00 1st Qu.: 4600 1:2665455
## Median :2000 Median :33.00 Median :13.00 Median : 19000
## Mean :2000 Mean :32.56 Mean :13.39 Mean : 25206
## 3rd Qu.:2000 3rd Qu.:39.00 3rd Qu.:14.00 3rd Qu.: 35000
## Max. :2000 Max. :45.00 Max. :17.00 Max. :385000
## exp lwage
## Min. : 2.00 Min. :-6.908
## 1st Qu.:12.00 1st Qu.: 8.434
## Median :20.00 Median : 9.852
## Mean :19.17 Mean : 7.044
## 3rd Qu.:26.00 3rd Qu.:10.463
## Max. :33.00 Max. :12.861
Probit and Heckit Selection Model
#generalized linear model using probit
myProbit <- glm(MIGRATE ~ AGE + EDUC + exp + I(exp^2), family = binomial(link = 'probit'), data = migra)
summary(myProbit)
##
## Call:
## glm(formula = MIGRATE ~ AGE + EDUC + exp + I(exp^2), family = binomial(link = "probit"),
## data = migra)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.463e-01 5.424e-03 -45.419 <2e-16 ***
## AGE -1.486e+06 1.508e+06 -0.985 0.325
## EDUC 1.486e+06 1.508e+06 0.985 0.325
## exp 1.486e+06 1.508e+06 0.985 0.325
## I(exp^2) -2.406e-03 1.094e-05 -219.872 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6413029 on 4689503 degrees of freedom
## Residual deviance: 6120967 on 4689499 degrees of freedom
## AIC: 6120977
##
## Number of Fisher Scoring iterations: 21
migra$IMR <- invMillsRatio(myProbit)$IMR1
myHeckit <- lm(lwage ~ EDUC + exp + I(exp^2) + IMR, data = migra[migra$MIGRATE == 1,])
summary(myHeckit)
##
## Call:
## lm(formula = lwage ~ EDUC + exp + I(exp^2) + IMR, data = migra[migra$MIGRATE ==
## 1, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.343 1.216 2.356 3.155 6.998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.695105 0.425823 -62.69 <2e-16 ***
## EDUC 0.884922 0.007418 119.30 <2e-16 ***
## exp 1.250729 0.016942 73.82 <2e-16 ***
## I(exp^2) -0.048825 0.000697 -70.05 <2e-16 ***
## IMR 27.269928 0.437599 62.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.008 on 2665450 degrees of freedom
## Multiple R-squared: 0.02284, Adjusted R-squared: 0.02284
## F-statistic: 1.558e+04 on 4 and 2665450 DF, p-value: < 2.2e-16
Alternative Estimation
# two-step estimation
summary(heckit(MIGRATE ~ AGE + EDUC + exp + I(exp^2),
lwage ~ EDUC + exp + I(exp^2),method = "2step",data = migra))
## --------------------------------------------
## Tobit 2 model (sample selection model)
## 2-step Heckman / heckit estimation
## 4689504 observations (2024049 censored and 2665455 observed)
## 12 free parameters (df = 4689493)
## Probit selection equation:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2468463 Inf 0 1
## AGE 0.0281919 Inf 0 1
## EDUC 0.0006445 Inf 0 1
## exp 0.0277223 Inf 0 1
## I(exp^2) -0.0024060 Inf 0 1
## Outcome equation:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.71765 NaN NaN NaN
## EDUC 0.88511 NaN NaN NaN
## exp 1.25140 NaN NaN NaN
## I(exp^2) -0.04885 NaN NaN NaN
## Multiple R-Squared:0.0228, Adjusted R-Squared:0.0228
## Error terms:
## Estimate Std. Error t value Pr(>|t|)
## invMillsRatio 27.283 NaN NaN NaN
## sigma 21.541 NA NA NA
## rho 1.267 NA NA NA
## --------------------------------------------
# ML estimation
summary(selection(MIGRATE ~ AGE + EDUC + exp + I(exp^2),
lwage ~ EDUC + exp + I(exp^2), migra))
## --------------------------------------------
## Tobit 2 model (sample selection model)
## Maximum Likelihood estimation
## Newton-Raphson maximisation, 10 iterations
## Return code 3: Last step could not find a value above the current.
## Boundary of parameter space?
## Consider switching to a more robust optimisation method temporarily.
## Log-Likelihood: -12676675
## 4689504 observations (2024049 censored and 2665455 observed)
## 11 free parameters (df = 4689493)
## Probit selection equation:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.359737 Inf 0 1
## AGE 0.029919 Inf 0 1
## EDUC 0.011024 Inf 0 1
## exp 0.019071 Inf 0 1
## I(exp^2) -0.002266 Inf 0 1
## Outcome equation:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.51641 Inf 0 1
## EDUC 0.96210 Inf 0 1
## exp 1.47930 Inf 0 1
## I(exp^2) -0.05013 Inf 0 1
## Error terms:
## Estimate Std. Error t value Pr(>|t|)
## sigma 18.36 Inf 0 1
## rho 1.00 Inf 0 1
## --------------------------------------------