## Loading required package: nnls
## Loading required package: gam
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
## Super Learner
## Version: 2.0-28
## Package created on 2021-05-04
library(estimatr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::when() masks foreach::when()
# Read the analysis data set; the path is relative to the working directory.
# read_csv() guesses the column types and reports them in a message.
data <- read_csv(file = "data(1).csv")
## Rows: 2129 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): region, training, health, education, firm_size, marriage, gender
## dbl (9): income, weights, industry, occupation, num_child, working_hour, age...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Outcome: the income column.
Y <- data$income
# Treatment indicator: 1 where gender is "Female", 0 otherwise.
D <- if_else(
  data$gender == "Female",
  true = 1,
  false = 0
)
# Covariates used by both nuisance models: age and exp.
X <- data %>%
  select(age, exp)
# Prediction ----
# Outcome nuisance model: 10-fold cross-validated Super Learner fit of Y on X.
# The library combines a linear model (SL.lm), a random forest (SL.ranger)
# and a penalised regression (SL.glmnet).
#
# Fold assignment and the random-forest learner draw from R's RNG, so the
# seed is fixed to make the reported risks and predictions reproducible.
set.seed(20210504)
fit.Y <- CV.SuperLearner(
  Y = Y,
  X = X,
  SL.library = c("SL.lm", "SL.ranger", "SL.glmnet"),
  V = 10
)
# Print the cross-validated risk table, mirroring summary(fit.D) further down.
summary(fit.Y)
## Loading required namespace: glmnet
## Loading required namespace: ranger
##
## Call:
## CV.SuperLearner(Y = Y, X = X, V = 10, SL.library = c("SL.lm", "SL.ranger",
## "SL.glmnet"))
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 307.51 15.081 256.88 346.72
## Discrete SL 324.55 15.702 274.80 370.66
## SL.lm_All 322.99 15.183 262.12 370.66
## SL.ranger_All 319.26 15.905 274.80 356.93
## SL.glmnet_All 323.00 15.188 262.22 370.61
# Treatment nuisance model: 10-fold cross-validated Super Learner fit of the
# 0/1 indicator D on X, using the same learner library as the outcome model.
#
# Fold assignment and the random-forest learner draw from R's RNG, so the
# seed is fixed to make the reported risks and predictions reproducible.
#
# NOTE(review): D is binary but no `family` is passed, so the fit uses the
# default squared-error setup (the summary reports Mean Squared Error).
# Consider whether family = binomial() is wanted here -- confirm.
set.seed(20210504)
fit.D <- CV.SuperLearner(
  Y = D,
  X = X,
  SL.library = c("SL.lm", "SL.ranger", "SL.glmnet"),
  V = 10
)
# Print the cross-validated risk table for the treatment model.
summary(fit.D)
##
## Call:
## CV.SuperLearner(Y = D, X = X, V = 10, SL.library = c("SL.lm", "SL.ranger",
## "SL.glmnet"))
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 0.21278 0.0035722 0.19145 0.23190
## Discrete SL 0.21574 0.0035098 0.19315 0.23332
## SL.lm_All 0.21575 0.0035302 0.19300 0.23356
## SL.ranger_All 0.22187 0.0044585 0.19311 0.24508
## SL.glmnet_All 0.21574 0.0035098 0.19315 0.23332
# Estimation ----
# Partial the covariates out of both the outcome and the treatment by
# subtracting the cross-validated Super Learner predictions.
data[["Y.oht"]] <- Y - fit.Y$SL.predict
data[["D.oht"]] <- D - fit.D$SL.predict
# Regress the outcome residual on the treatment residual without an
# intercept; the D.oht coefficient is the covariate-adjusted gap in income.
lm_robust(
  formula = Y.oht ~ 0 + D.oht,
  data = data
)
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## D.oht -11.16381 0.705999 -15.81279 2.44952e-53 -12.54833 -9.779293 2128
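## NOTE(review): the table below has an intercept and a raw `D` term, so it appears to come from a separate, unadjusted regression (presumably `lm_robust(Y ~ D)`); the call that produced it is not shown -- confirm.
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper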
## (Intercept) 42.68846 0.5525704 77.25434 0.000000e+00 41.60482 43.77209
## D -14.06358 0.7496812 -18.75941 8.792142e-73 -15.53376 -12.59339
## DF
## (Intercept) 2127
## D 2127