自動車保有の離散選択問題を例に,conditional logit モデルの推定をRで行います.データは乱数で与えているため各変数の値や推定値は現実ではありえないようなものになっていますが,目を瞑ってください.
世帯数100・選択肢5つ(outside optionを含む)のデータセットを乱数で生成する.
# setwd(dir = "c://hoge")
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.5.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
N <- 100 # number of household
J <- 5 # number of choice (size of choice set)
set.seed(89) # 推定値の符号が「それっぽく」なるように設定
data <- tibble(
id = rep(1:N, each = J),
model = rep(1:J, times = N),
choice = c(rep(c(1, 0, 0, 0, 0), 30), rep(c(0, 1, 0, 0, 0), 25), rep(c(0, 0, 1, 0, 0), 20),
rep(c(0, 0, 0, 1, 0), 15), rep(c(0, 0, 0, 0, 1), 10)),
income = rep(rnorm(N, sd = 50, mean = 500), each = J),
rental_price = rep(c(0, rnorm(J-1, sd = 10, mean = 100)), times = N),
driving_cost = rep(c(0, rnorm(J-1, sd = 1, mean = 10)), times = N),
)
data
## # A tibble: 500 × 6
## id model choice income rental_price driving_cost
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 427. 0 0
## 2 1 2 0 427. 107. 10.0
## 3 1 3 0 427. 92.3 11.8
## 4 1 4 0 427. 105. 9.16
## 5 1 5 0 427. 104. 11.3
## 6 2 1 1 535. 0 0
## 7 2 2 0 535. 107. 10.0
## 8 2 3 0 535. 92.3 11.8
## 9 2 4 0 535. 105. 9.16
## 10 2 5 0 535. 104. 11.3
## # … with 490 more rows
table(data$model[data$choice == 1]) # choice frequency
##
## 1 2 3 4 5
## 30 25 20 15 10
summary(data)
## id model choice income rental_price
## Min. : 1.00 Min. :1 Min. :0.0 Min. :381.9 Min. : 0.00
## 1st Qu.: 25.75 1st Qu.:2 1st Qu.:0.0 1st Qu.:451.1 1st Qu.: 92.30
## Median : 50.50 Median :3 Median :0.0 Median :493.5 Median :103.95
## Mean : 50.50 Mean :3 Mean :0.2 Mean :490.8 Mean : 81.54
## 3rd Qu.: 75.25 3rd Qu.:4 3rd Qu.:0.0 3rd Qu.:535.8 3rd Qu.:104.89
## Max. :100.00 Max. :5 Max. :1.0 Max. :596.5 Max. :106.57
## driving_cost
## Min. : 0.000
## 1st Qu.: 9.159
## Median :10.035
## Mean : 8.471
## 3rd Qu.:11.328
## Max. :11.830
# write.csv(data, "data.csv", row.names = F) # export to CSV
間接効用関数
\[ u_{ij} = v_{ij} + \epsilon_{ij} = \beta_1 \log(income_i -rentalprice_j) + \beta_2 drivingcost_j + \epsilon_{ij} \]
誤差項 \(\epsilon_{ij}\) が第一種極値分布に従うと仮定することで,選択確率 \(p_{ij}\) がロジット形式で表現できる.
\[ p_{ij} = \frac{\exp(v_{ij})}{\sum_{k} \exp(v_{ik})} \]
パッケージを使って推定する.
# install.packages("survival")
library(survival)
cl0 <- survival::clogit(choice ~ log(income - rental_price) + driving_cost + strata(id), data)
cl0
## Call:
## survival::clogit(choice ~ log(income - rental_price) + driving_cost +
## strata(id), data)
##
## coef exp(coef) se(coef) z p
## log(income - rental_price) 0.21144 1.23546 3.07815 0.069 0.945
## driving_cost -0.04611 0.95494 0.06906 -0.668 0.504
##
## Likelihood ratio test=5.74 on 2 df, p=0.05659
## n= 500, number of events= 100
logLik(cl0) # 最大化された対数尤度関数の値
## 'log Lik.' -158.0718 (df=2)
選択確率
pred0 <- predict(cl0, newdata = data, type = "expected")
pred0_mat <- matrix(pred0, nrow = N, ncol = J, byrow = T)
summary(pred0_mat) # 車のモデルごとの要約統計量
## V1 V2 V3 V4
## Min. :0.2974 Min. :0.1780 Min. :0.1656 Min. :0.1855
## 1st Qu.:0.2985 1st Qu.:0.1787 1st Qu.:0.1659 1st Qu.:0.1863
## Median :0.2994 Median :0.1790 Median :0.1661 Median :0.1866
## Mean :0.2996 Mean :0.1790 Mean :0.1660 Mean :0.1865
## 3rd Qu.:0.3005 3rd Qu.:0.1793 3rd Qu.:0.1662 3rd Qu.:0.1869
## Max. :0.3029 Max. :0.1796 Max. :0.1664 Max. :0.1872
## V5
## Min. :0.1680
## 1st Qu.:0.1686
## Median :0.1689
## Mean :0.1689
## 3rd Qu.:0.1691
## Max. :0.1694
V1がmodel 1の選択確率,V2がmodel 2の選択確率,… に対応.
generate netinc = log(income - rental_price) // log of net income
asclogit choice netinc driving_cost, case(id) alternatives(model) noconstant
Iteration 0: log likelihood = -158.35754
Iteration 1: log likelihood = -158.07203
Iteration 2: log likelihood = -158.07183
Iteration 3: log likelihood = -158.07183
Alternative-specific conditional logit Number of obs = 500
Case ID variable: id Number of cases = 100
Alternatives variable: model Alts per case: min = 5
avg = 5.0
max = 5
Wald chi2(2) = 6.18
Log likelihood = -158.07183 Prob > chi2 = 0.0454
------------------------------------------------------------------------------
choice | Coefficient Std. err. z P>|z| [95% conf. interval]
-------------+----------------------------------------------------------------
model |
netinc | .2114433 3.078147 0.07 0.945 -5.821613 6.2445
driving_cost | -.0461082 .0690643 -0.67 0.504 -.1814718 .0892554
------------------------------------------------------------------------------
尤度関数を自作して推定する.
尤度関数(\(1_{ij}\): 家計 \(i\) がモデル \(j\) を選択しているときに1をとるindicator)
\[ L = \prod_{i=1}^N \prod_{j=1}^J p_{ij}^{1_{ij}} \]
\[ \log(L) = \sum_{i=1}^N \sum_{j=1}^J \left[ 1_{ij} \times \log(p_{ij}) \right] \]
v_ij <- function (a, d, i, j) { # 間接効用関数
yi <- d$income[d$id == i][1]
pj <- d$rental_price[d$model == j][1]
dcj <- d$driving_cost[d$model == j][1]
a[1] * log(yi - pj) + a[2] * dcj # v_ij
}
choice_prob_ij <- function (a, d, i, j) { # 選択確率
# denominator
exp_v_i_temp <- 0
for (j2 in 1:J) {
exp_v_i_temp <- exp_v_i_temp + exp(v_ij(a, d, i, j = j2))
}
exp(v_ij(a, d, i, j)) / exp_v_i_temp # p_ij
}
obj_func <- function (a, d) { # 対数尤度関数(最適化の目的関数)
sum_of_obj <- 0
for (i in 1:N) {
for (j in 1:J) {
dij <- d$choice[d$id == i & d$model == j]
p_ij <- choice_prob_ij(a, d, i, j)
sum_of_obj <- sum_of_obj + dij * log(p_ij)
}
}
sum_of_obj # log(L)
}
上で定義した尤度関数を optim 関数で最大化する.
opt_logit <- optim(par = c(0, 0), fn = obj_func, d = data,
control = list(fnscale = -1), method = "BFGS", hessian = TRUE)
opt_logit
## $par
## [1] 0.21182426 -0.04609918
##
## $value
## [1] -158.0718
##
## $counts
## function gradient
## 17 6
##
## $convergence
## [1] 0
##
## $message
## NULL
##
## $hessian
## [,1] [,2]
## [1,] -1.212786 51.6473
## [2,] 51.647303 -2409.0835
cbind(estimate = opt_logit$par,
standard_error = diag(sqrt(abs(solve(opt_logit$hessian)))) )
## estimate standard_error
## [1,] 0.21182426 3.07814176
## [2,] -0.04609918 0.06906448
survival::clogit 関数で計算した結果と概ね等しい推定値($par)が得られている.「$value」は最大化された対数尤度関数の値.
なお,for 文で繰り返し計算すると計算時間が長くなるので,sapply関数などを利用して並列計算すると計算時間が短縮できる.たとえば,対数尤度関数(目的関数)は次のように書き直すことができる.
obj_func_modified <- function (a, d) {
sum( sapply(X = 1:N, FUN = function (i) {
log(choice_prob_ij(a, d, i, j = d$model[d$id == i & d$choice == 1]))
}) )
}
反実仮想シミュレーション(政策シミュレーション)を行う.
例:ガソリン税を上げて driving cost が10%増加した場合に保有台数はどのように変化するか? (ただし,供給サイドの反応は考えない)
data2 <- data %>% mutate(driving_cost = 1.1 * driving_cost) # 増税後のデータセット
最初に,各モデルの選択確率がどのように変化するかを計算.
prob_before <- matrix(0, nrow = N, ncol = J) # 増税前の選択確率
prob_after <- matrix(0, nrow = N, ncol = J) # 増税後
for (i in 1:N) {
for (j in 1:J) {
prob_before[i, j] <- choice_prob_ij(a = opt_logit$par, d = data, i, j)
prob_after[i, j] <- choice_prob_ij(a = opt_logit$par, d = data2, i, j)
}
}
summary(prob_after - prob_before)
## V1 V2 V3 V4
## Min. :0.01025 Min. :-0.002237 Min. :-0.003429 Min. :-0.001591
## 1st Qu.:0.01027 1st Qu.:-0.002226 1st Qu.:-0.003417 1st Qu.:-0.001576
## Median :0.01029 Median :-0.002220 Median :-0.003412 Median :-0.001568
## Mean :0.01029 Mean :-0.002221 Mean :-0.003413 Mean :-0.001570
## 3rd Qu.:0.01031 3rd Qu.:-0.002216 3rd Qu.:-0.003407 3rd Qu.:-0.001563
## Max. :0.01035 Max. :-0.002211 Max. :-0.003402 Max. :-0.001556
## V5
## Min. :-0.003098
## 1st Qu.:-0.003090
## Median :-0.003086
## Mean :-0.003087
## 3rd Qu.:-0.003083
## Max. :-0.003080
モデル1(車を全く保有しない選択肢)の選択確率が1%ポイント程度増加し,モデル2-5の選択確率が0.1-0.3%ポイント程度ずつ減少する.
世帯保有台数:
summary(apply(t(prob_before) * c(0, 1, 1, 1, 1), 2, sum)) # 増税前
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6971 0.6995 0.7006 0.7004 0.7015 0.7026
summary(apply(t(prob_after) * c(0, 1, 1, 1, 1), 2, sum)) # 増税後
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6867 0.6892 0.6903 0.6901 0.6913 0.6923
100世帯の総推計保有台数は,増税前70台から増税後69台に減少.
補償変分 (compensating variation, cv) は次のように定義される(CF: counterfactual; 左辺は増税前,右辺は増税後に対応; 詳しくは Herriges and Kling (REStat 1999) を参照).
\[ \max_{j} u(income-rentalprice,X,\epsilon) = \max_{j} u(income-rentalprice - cv,X^{CF},\epsilon) \]
なお,純所得が間接効用関数に対数で入っているため,
\[ \Delta E[welfare_n] = \frac{1}{\alpha_n} \left[ \log \left( \sum_{j=1}^J \exp(v_{ij}^{CF}) \right) - \log \left( \sum_{j=1}^J \exp(v_{ij}) \right) \right] \]
というよくある計算式は利用できない.
関数の定義:
v_ij_cf <- function (cv, a, d, i, j) { # 間接効用
yi <- d$income[d$id == i][1]
pj <- d$rental_price[d$model == j][1]
dcj <- d$driving_cost[d$model == j][1]
a[1] * log(yi - pj - cv) + a[2] * dcj # v_ij under counterfactual
}
welfare_obj_fun <- function (cv, a, d, d2, i, rg) { # compensating variation
v0 <- v1 <- NULL
for (j in 1:J) {
v0 <- c(v0, v_ij(a, d = d, i, j) + rg[j])
v1 <- c(v1, v_ij_cf(cv, a, d = d2, i, j) + rg[j])
}
(max(v0) - max(v1))^2 # この値を(0になるまで)最小化すれば,上の式で右辺=左辺になる
}
計算:家計ごとに,「\(\epsilon_{ij}\) を第一種極値分布からランダムにドローして,上の式で右辺=左辺になるような cv を求める」という手続きを何度か繰り返して平均値を求める.
# install.packages("evd")
library(evd)
cv_list <- NULL
start_time <- Sys.time()
for (i in 1:N) {
cv_list_i <- NULL
for (k in 1:10) {
# set.seed(i*k)
rg_k <- evd::rgumbel(J) # Gumbel分布(第一種極値分布)から乱数を生成
cv_i <- optim(par = 0, fn = welfare_obj_fun, a = opt_logit$par, d = data, d2 = data2,
i = i, rg = rg_k, method = "L-BFGS-B", lower = -200, upper = 0,
control = list(factr = 1e6)) # 数値計算の収束判定に用いる tolerance
cv_list_i <- c(cv_list_i, cv_i$par)
}
cv_list <- c(cv_list, mean(cv_list_i))
}
Sys.time() - start_time # calculation time
## Time difference of 1.879241 secs
summary(cv_list)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -123.96 -80.31 -65.29 -68.58 -55.17 -27.92
一世帯当たり年間70万円(注:所得・rental priceの単位が万円の場合)ほどのlossが生じる.
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Japanese_Japan.utf8 LC_CTYPE=Japanese_Japan.utf8
## [3] LC_MONETARY=Japanese_Japan.utf8 LC_NUMERIC=C
## [5] LC_TIME=Japanese_Japan.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] evd_2.3-6.1 survival_3.4-0 forcats_0.5.1 stringr_1.5.0
## [5] dplyr_1.0.9 purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
## [9] tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.2 xfun_0.35 bslib_0.3.1 lattice_0.20-45
## [5] splines_4.2.2 haven_2.5.0 colorspace_2.0-3 vctrs_0.5.1
## [9] generics_0.1.2 htmltools_0.5.2 yaml_2.3.6 utf8_1.2.2
## [13] rlang_1.0.6 jquerylib_0.1.4 pillar_1.8.1 withr_2.5.0
## [17] glue_1.6.2 DBI_1.1.2 dbplyr_2.2.0 modelr_0.1.8
## [21] readxl_1.4.0 lifecycle_1.0.3 munsell_0.5.0 gtable_0.3.0
## [25] cellranger_1.1.0 rvest_1.0.2 evaluate_0.18 knitr_1.41
## [29] tzdb_0.3.0 fastmap_1.1.0 fansi_1.0.3 broom_0.8.0
## [33] backports_1.4.1 scales_1.2.0 jsonlite_1.8.4 fs_1.5.2
## [37] hms_1.1.1 digest_0.6.29 stringi_1.7.6 grid_4.2.2
## [41] cli_3.4.1 tools_4.2.2 magrittr_2.0.3 sass_0.4.1
## [45] crayon_1.5.2 pkgconfig_2.0.3 Matrix_1.5-1 ellipsis_0.3.2
## [49] xml2_1.3.3 reprex_2.0.1 lubridate_1.8.0 assertthat_0.2.1
## [53] rmarkdown_2.14 httr_1.4.4 rstudioapi_0.13 R6_2.5.1
## [57] compiler_4.2.2