library(SuperLearner)
## Loading required package: nnls
## Loading required package: gam
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
## Super Learner
## Version: 2.0-28
## Package created on 2021-05-04
library(estimatr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
## x purrr::when()       masks foreach::when()
# Load the survey extract from the working directory; column types are
# guessed by readr (see the column specification printed below).
data <- read_csv(file = "data(1).csv")
## Rows: 2129 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): region, training, health, education, firm_size, marriage, gender
## dbl (9): income, weights, industry, occupation, num_child, working_hour, age...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Outcome: income.
Y <- data$income

# Treatment indicator: 1 when gender is "Female", 0 otherwise
# (a missing gender stays NA).
D <- as.numeric(data$gender == "Female")

# Covariates used by both nuisance models.
X <- data %>%
  select(age, exp)

# Prediction ----
# Cross-validated Super Learner for the outcome Y given the covariates X,
# ensembling OLS (SL.lm), a random forest (SL.ranger) and a penalised
# regression (SL.glmnet) with V = 10 outer folds.
#
# Fix the RNG state first: the fold assignment and the random forest are
# stochastic, so without a seed every run produces different risk estimates
# and different cross-validated predictions for the later residualisation.
# NOTE(review): the `##` output recorded in this file came from a run
# without a seed, so a re-run will not reproduce those exact numbers.
set.seed(1234)
fit.Y <- CV.SuperLearner(Y = Y,
                         X = X,
                         SL.library = c("SL.lm",
                                        "SL.ranger",
                                        "SL.glmnet"),
                         V = 10
)
## Loading required namespace: glmnet
## Loading required namespace: ranger
# Cross-validated risk of the ensemble and of each candidate learner.
summary(fit.Y)
## 
## Call:  
## CV.SuperLearner(Y = Y, X = X, V = 10, SL.library = c("SL.lm", "SL.ranger",  
##     "SL.glmnet")) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  10 
## 
##      Algorithm    Ave     se    Min    Max
##  Super Learner 307.51 15.081 256.88 346.72
##    Discrete SL 324.55 15.702 274.80 370.66
##      SL.lm_All 322.99 15.183 262.12 370.66
##  SL.ranger_All 319.26 15.905 274.80 356.93
##  SL.glmnet_All 323.00 15.188 262.22 370.61
# Cross-validated Super Learner for the treatment indicator D given X,
# using the same learner library and V = 10 outer folds.
# `family` is left at its default, so the 0/1 indicator is fitted under
# squared-error loss (the summary reports risk as mean squared error).
#
# The seed is set here so this fit is reproducible no matter which random
# draws precede it in the script; fold assignment and the random forest
# both consume random numbers.
set.seed(1234)
fit.D <- CV.SuperLearner(Y = D,
                         X = X,
                         SL.library = c("SL.lm",
                                        "SL.ranger",
                                        "SL.glmnet"),
                         V = 10
)

# Cross-validated risk of the ensemble and of each candidate learner.
summary(fit.D)
## 
## Call:  
## CV.SuperLearner(Y = D, X = X, V = 10, SL.library = c("SL.lm", "SL.ranger",  
##     "SL.glmnet")) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  10 
## 
##      Algorithm     Ave        se     Min     Max
##  Super Learner 0.21278 0.0035722 0.19145 0.23190
##    Discrete SL 0.21574 0.0035098 0.19315 0.23332
##      SL.lm_All 0.21575 0.0035302 0.19300 0.23356
##  SL.ranger_All 0.22187 0.0044585 0.19311 0.24508
##  SL.glmnet_All 0.21574 0.0035098 0.19315 0.23332
# Estimation ----

# Residualise outcome and treatment on the covariates by subtracting the
# cross-validated Super Learner predictions.
data[["Y.oht"]] <- Y - fit.Y[["SL.predict"]]
data[["D.oht"]] <- D - fit.D[["SL.predict"]]

# Residual-on-residual regression without an intercept, with
# heteroskedasticity-robust standard errors.
lm_robust(formula = Y.oht ~ 0 + D.oht, data = data)
##        Estimate Std. Error   t value    Pr(>|t|)  CI Lower  CI Upper   DF
## D.oht -11.16381   0.705999 -15.81279 2.44952e-53 -12.54833 -9.779293 2128
# Benchmark: raw difference in Y by D with no covariate adjustment.
lm_robust(formula = Y ~ D, data = data)
##              Estimate Std. Error   t value     Pr(>|t|)  CI Lower  CI Upper
## (Intercept)  42.68846  0.5525704  77.25434 0.000000e+00  41.60482  43.77209
## D           -14.06358  0.7496812 -18.75941 8.792142e-73 -15.53376 -12.59339
##               DF
## (Intercept) 2127
## D           2127