Taking Y = working_hour, D = gender, and X = age, exp, estimate the coefficient on D. Split the data 10 times, and use OLS, Random Forest, and LASSO as the algorithms.
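The code below implements the partialling-out (double machine learning) recipe for the partially linear model

\[ Y = \tau D + g(X) + U, \qquad D = m(X) + V, \]

where \(\tau\) is the coefficient of interest. Subtracting conditional expectations given X yields \(Y - E[Y \mid X] = \tau \,(D - E[D \mid X]) + U\), so \(\tau\) can be recovered by regressing the cross-fitted Y-residuals on the cross-fitted D-residuals without an intercept, which is exactly what the estimation step at the end does.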
# Load library ------------------------------------------------------------
pacman::p_load("tidyverse", "SuperLearner", "grf", "estimatr", "huxtable")

# Import data -------------------------------------------------------------
data_raw <- read_csv("/cloud/project/Data/data.csv")

# tidy
data <- data_raw %>%
  mutate(gender   = if_else(gender == "Female", 1, 0),
         training = if_else(training == "Yes", 1, 0),
         marriage = if_else(marriage == "Yes", 1, 0)
         ) %>%
  select(-weights)

# set variables -----------------------------------------------------------
Y <- data$income
D <- data$gender
X <- select(data, age, exp) # other covariates

# Prediction (Model Specification) --------------------------------------------------------------
# number of cross-validation folds (data splits)
v <- 10
# candidate learners
SL.libraries <- c("SL.lm",     # OLS
                  "SL.ranger", # random forest
                  "SL.glmnet") # LASSO
# Fitting
fits <- list()
fits$fit.Y <- CV.SuperLearner(Y = Y,
                              X = X,
                              SL.library = SL.libraries,
                              V = v # number of folds
                              )
fits$fit.D <- CV.SuperLearner(Y = D,
                              X = X,
                              SL.library = SL.libraries,
                              V = v # number of folds
                              )
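One note on the propensity model: gender is binary, but the call above uses CV.SuperLearner's default gaussian family, so fit.D predicts D on the mean-squared-error scale. A possible alternative, sketched here only as an assumption about what one might prefer (it would change the numbers reported below), is to pass a binomial family for probability-scale predictions:

fits$fit.D <- CV.SuperLearner(Y = D,
                              X = X,
                              family = binomial(),
                              SL.library = SL.libraries,
                              V = v)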
# save the result
results <- lapply(fits, summary)
# show
results

## $fit.Y
##
## Call: CV.SuperLearner(Y = Y, X = X, V = v, SL.library = SL.libraries)
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 309.60 15.332 246.31 393.70
## Discrete SL 330.32 16.628 254.61 416.50
## SL.lm_All 323.15 15.187 263.83 401.21
## SL.ranger_All 322.12 16.215 254.61 416.50
## SL.glmnet_All 323.15 15.192 263.83 401.28
##
## $fit.D
##
## Call: CV.SuperLearner(Y = D, X = X, V = v, SL.library = SL.libraries)
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 0.21354 0.0035853 0.19328 0.24169
## Discrete SL 0.21598 0.0035143 0.19776 0.24497
## SL.lm_All 0.21599 0.0035360 0.19732 0.24478
## SL.ranger_All 0.22271 0.0044903 0.19528 0.25042
## SL.glmnet_All 0.21593 0.0035084 0.19776 0.24497
As a result, the Super Learner achieved the smallest MSE \((=309.5973958)\). This indicates that, in terms of minimizing prediction error, the Super Learner is the algorithm that best predicts the dependent variable, income, in this exercise.
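To extract that conclusion programmatically instead of reading it off the printout, the cross-validated risk table is available as a data frame on the summary object (a minimal sketch, assuming the fits list from above; risk_table is an illustrative name):

# pick the algorithm with the lowest cross-validated MSE for the Y model
risk_table <- summary(fits$fit.Y)$Table
risk_table[which.min(risk_table$Ave), ]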
# Estimation (Parameter Estimation) --------------------------------------------------------------
data_result <- data
# prediction errors (residuals)
data_result$Y.oht <- Y - fits$fit.Y$SL.predict
data_result$D.oht <- D - fits$fit.D$SL.predict
# ML (partialling-out: residual-on-residual regression without an intercept)
mod1 <- lm_robust(Y.oht ~ 0 + D.oht,
                  data = data_result)
# OLS
mod2 <- lm_robust(Y ~ D,
                  data = data_result)
# OLS + own chosen variables
mod3 <- lm_robust(Y ~ D + marriage + training,
                  data = data_result)
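For reference, because mod1 regresses residual on residual with no intercept, \(\hat{\tau}\) has the familiar closed form

\[ \hat{\tau} = \frac{\sum_i \hat{V}_i \hat{U}_i}{\sum_i \hat{V}_i^2}, \qquad \hat{U}_i = Y_i - \widehat{E}[Y_i \mid X_i], \quad \hat{V}_i = D_i - \widehat{E}[D_i \mid X_i], \]

where the conditional expectations are the cross-fitted Super Learner predictions (SL.predict).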
huxreg("SuperLeaner" = mod1,
"OLS" = mod2,
"OLS +" = mod3)| SuperLeaner | OLS | OLS + | |
|---|---|---|---|
| D.oht | -11.244 *** | ||
| (0.709) | |||
| (Intercept) | 42.688 *** | 33.684 *** | |
| (0.553) | (0.610) | ||
| D | -14.064 *** | -12.256 *** | |
| (0.750) | (0.736) | ||
| marriageYes | 11.910 *** | ||
| (0.714) | |||
| training | 4.289 *** | ||
| (0.962) | |||
| N | 2129 | 2129 | 2129 |
| R2 | 0.087 | 0.112 | 0.203 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | |||
In conclusion, \(\tau\) was estimated as shown in the table above for each specification. Given the MSE results, which favor the Super Learner's predictions, both OLS and OLS + underestimate \(\tau\) relative to the Super Learner-based estimate: their point estimates (-14.064 and -12.256) lie below the partialling-out estimate of -11.244.
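To put the three estimates side by side programmatically (a minimal sketch, assuming mod1, mod2, and mod3 from above; tau_hat is an illustrative name):

# collect the three estimates of tau and their robust standard errors
tau_hat <- tibble(
  model    = c("SuperLearner", "OLS", "OLS +"),
  estimate = c(coef(mod1)["D.oht"], coef(mod2)["D"], coef(mod3)["D"]),
  se       = c(mod1$std.error["D.oht"], mod2$std.error["D"], mod3$std.error["D"])
)
tau_hat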