1 Random variables in R / 確率変数

Example: the normal distribution.

rnorm(n = 10, mean = 0, sd = 1)  # generate random numbers
##  [1] -1.4601938 -0.7174822  0.7204504  1.0997536 -0.2141771 -0.5713745
##  [7]  0.4457926  0.7510519  0.2426628 -1.4602665
rnorm(10, 0, 1)  # argument names such as "n =" can be omitted as long as the order is kept
##  [1]  0.11525756  0.21259274  0.40400300 -0.37884009  2.56881206  0.11747327
##  [7] -0.04137816 -0.27445893 -0.27891363 -2.33215684
set.seed(0); rnorm(3)  # set.seed fixes the seed of the random number generator (recommended for reproducibility)
## [1]  1.2629543 -0.3262334  1.3297993
set.seed(1); rnorm(3)
## [1] -0.6264538  0.1836433 -0.8356286
set.seed(0); rnorm(3)
## [1]  1.2629543 -0.3262334  1.3297993
dnorm(x = 1.96, mean = 0, sd = 1)  # probability density
## [1] 0.05844094
curve(dnorm(x, mean = 0, sd = 1), xlim = c(-5, 5))  # can also be used for plotting

integrate(dnorm, -1.96, 1.96)  # integration: approximately 0.95 = 95%
## 0.9500042 with absolute error < 1e-11
qnorm(p = 0.975, mean = 0, sd = 1)  # the x value (quantile) corresponding to a probability
## [1] 1.959964
pnorm(q = 1.96, mean = 0, sd = 1)  # the cumulative probability corresponding to x
## [1] 0.9750021
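
As a quick check, qnorm and pnorm are inverse functions of each other, so composing them should return the original value.

pnorm(qnorm(0.975))  # should return 0.975
qnorm(pnorm(1.96))  # should return 1.96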

2 Basic of programming in R / プログラミングの基礎

Preparation for the sections that follow.

2.1 for loop / for文

for (i in 1:3) {
  print(i)
}
## [1] 1
## [1] 2
## [1] 3

\[ \sum_{i \in \{1, 3, 5\}} (2i-3) = (2-3) + (6-3) + (10-3) = 9 \]

temp <- NULL  # create empty object
for (i in c(1, 3, 5)) {
  temp <- c(temp, 2 * i - 3)
}
temp; sum(temp)
## [1] -1  3  7
## [1] 9
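
Because arithmetic in R is vectorized, the same sum can also be computed without a loop.

i_values <- c(1, 3, 5)
sum(2 * i_values - 3)  # should again be 9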

2.2 cbind and rbind

cbind(1:2, 4:5, 10:11)  # combine by columns
##      [,1] [,2] [,3]
## [1,]    1    4   10
## [2,]    2    5   11
rbind(1:2, 4:5, 10:11)  # combine by rows
##      [,1] [,2]
## [1,]    1    2
## [2,]    4    5
## [3,]   10   11

3 Expectation and variance of sample mean / 標本平均の期待値と分散

\[ X_i = \mu + u_i, \quad u_i \sim N(0, \sigma^2) \] \[ \bar{X} = \mu + \frac{1}{n} \sum_i u_i , \quad E(\bar{X}) = \mu, \quad V(\bar{X}) = \frac{1}{n} \sigma^2 \]

mu <- 1
sigma2 <- 2^2
N <- 2
result_mean <- NULL
for (j in 1:10000) {
  sample_j <- rnorm(n = N, mean = mu, sd = sqrt(sigma2))
  result_mean <- c(result_mean, mean(sample_j))
}
mean(result_mean)  # theoretical value: mu = 1
## [1] 1.005537
var(result_mean)  # theoretical value: sigma^2 / n = 4 / 2 = 2
## [1] 1.968904

The same simulation can be written more concisely with the sapply function.

N <- 2
result_mean <- sapply(X = 1:10000, FUN = function (x) {
  mean(rnorm(n = N, mean = mu, sd = sqrt(sigma2)))
  })
mean(result_mean); var(result_mean)
## [1] 1.008541
## [1] 2.051369
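
Another option (a minimal sketch) is the replicate function, a wrapper around sapply for repeating an expression; the results should again be close to mu = 1 and sigma^2 / N = 2.

result_mean <- replicate(10000, mean(rnorm(n = N, mean = mu, sd = sqrt(sigma2))))
mean(result_mean); var(result_mean)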

Repeat the calculation for different sample sizes N.

result_mean_mat <- NULL
for (k in c(2, 5, 10, 100, 1000)) {
  result_mean <- sapply(X = 1:10000, FUN = function (x) {
    mean(rnorm(n = k, mean = mu, sd = sqrt(sigma2)))
    })
  result_mean_mat <- rbind(result_mean_mat, c(k, mean(result_mean), var(result_mean), sigma2/k))
}
colnames(result_mean_mat) <- c("N", "mean (obs)", "var (obs)", "var (theo)")
result_mean_mat
##         N mean (obs)   var (obs) var (theo)
## [1,]    2  0.9816762 1.911070114      2.000
## [2,]    5  1.0084608 0.804719913      0.800
## [3,]   10  0.9913869 0.403245757      0.400
## [4,]  100  1.0005279 0.039613990      0.040
## [5,] 1000  0.9989577 0.003872632      0.004
plot(x = result_mean_mat[, 1], y = result_mean_mat[, 3], log = "x", type = "b", 
     main = "X ~ N(1, 2^2)", xlab = "N (sample size)", ylab = "variance of sample mean")

4 Test / 検定

Critical values of the t distribution.

qnorm(p = 0.975, mean = 0, sd = 1)  # cf. standard normal distribution
## [1] 1.959964
qt(p = 0.975, df = 10)  # t distribution
## [1] 2.228139
qt(p = 0.975, df = 1000)
## [1] 1.962339
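
Conversely, pt converts an observed t statistic into a probability; for example, a two-sided p-value for a hypothetical t statistic of 2.1 with 10 degrees of freedom can be computed as follows.

2 * pt(q = 2.1, df = 10, lower.tail = FALSE)  # two-sided p-value (t = 2.1 is a made-up value)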

5 Regression / 回帰分析

Consider the following (simple) regression model.

\[ Fertility = \alpha + \beta Examination + u \]

str(swiss)  # built-in dataset
## 'data.frame':    47 obs. of  6 variables:
##  $ Fertility       : num  80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
##  $ Agriculture     : num  17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
##  $ Examination     : int  15 6 5 12 17 9 16 14 12 16 ...
##  $ Education       : int  12 9 5 7 15 7 7 8 7 13 ...
##  $ Catholic        : num  9.96 84.84 93.4 33.77 5.16 ...
##  $ Infant.Mortality: num  22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
summary(lm(formula = Fertility ~ Examination, data = swiss))
## 
## Call:
## lm(formula = Fertility ~ Examination, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.9375  -6.0044  -0.3393   7.9239  19.7399 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  86.8185     3.2576  26.651  < 2e-16 ***
## Examination  -1.0113     0.1782  -5.675 9.45e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.642 on 45 degrees of freedom
## Multiple R-squared:  0.4172, Adjusted R-squared:  0.4042 
## F-statistic: 32.21 on 1 and 45 DF,  p-value: 9.45e-07

5.1 Estimation by hand

The OLS coefficient estimates are given by:

\[ \hat{\alpha} = \bar{Y} - \hat{\beta} \bar{X}, \quad \hat{\beta} = \frac{S_{XY}}{S_{XX}} \]

y_bar <- mean(swiss$Fertility)  # Y bar
x_bar <- mean(swiss$Examination)  # X bar
s_xx <- sum((swiss$Examination - x_bar)^2)  # S_XX
s_xy <- sum((swiss$Examination - x_bar) * (swiss$Fertility - y_bar))  # S_XY
y_bar; x_bar; s_xx; s_xy
## [1] 70.14255
## [1] 16.48936
## [1] 2927.745
## [1] -2960.879
beta_hat <- s_xy / s_xx
alpha_hat <- y_bar - beta_hat * x_bar
alpha_hat; beta_hat
## [1] 86.81853
## [1] -1.011317
y_pred <- alpha_hat + beta_hat * swiss$Examination  # y hat
mean(y_pred); y_bar  # should match
## [1] 70.14255
## [1] 70.14255
resid_swiss <- swiss$Fertility - y_pred  # residuals
sum(resid_swiss)  # should be zero
## [1] 1.065814e-13
sum(resid_swiss * swiss$Examination)  # eX ... should be zero
## [1] 1.58451e-12
sum((swiss$Examination - x_bar) * (resid_swiss - mean(resid_swiss)))  # S_Xe ... should be zero
## [1] -1.501022e-13
sum((y_pred - y_bar) * (resid_swiss - mean(resid_swiss)))  # S_Ye ... should be zero
## [1] -8.354428e-14

5.2 Coefficient of determination / 決定係数

\[ R^2 = \frac{S_{\hat{Y}\hat{Y}}}{S_{YY}} = 1 - \frac{\sum \hat{u}_i^2}{S_{YY}} \]

s_yy <- sum((swiss$Fertility - y_bar)^2)
s_yhyh <- sum((y_pred - y_bar)^2)
s_yhyh / s_yy  # R^2
## [1] 0.4171645
1 - sum(resid_swiss^2) / s_yy  # R^2 ... should be the same as above
## [1] 0.4171645
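
In a simple regression with an intercept, R^2 also equals the squared correlation between the outcome and the fitted values, which gives a third way to compute it.

cor(swiss$Fertility, y_pred)^2  # R^2 ... should be the same as above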

5.3 Standard error / 係数の分散(標準誤差)

\[ V(\hat{\beta}) = \frac{\sigma^2}{S_{XX}}, \quad \hat{\sigma^2} = s^2 = \frac{1}{n - 2} \sum \hat{u}_i^2 \]

beta_se <- sqrt(sum(resid_swiss^2)/(nrow(swiss) - 2) / s_xx)  # standard error
beta_se
## [1] 0.1781971
beta_hat / beta_se  # t stat
## [1] -5.675275

What is the 2.5% critical value of the t distribution (corresponding to a two-sided 5% test)?

qt(p = 0.025, df = nrow(swiss) - 2)  # df: degrees of freedom
## [1] -2.014103
qt(p = c(0.005, 0.025, 0.975, 0.995), df = nrow(swiss) - 2)  # 1% and 5%
## [1] -2.689585 -2.014103  2.014103  2.689585
qt(p = 0.975, df = 10000)  # cf. with large df, almost the same as the standard normal
## [1] 1.960201
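
Combining the point estimate, its standard error, and the t critical values gives a 95% confidence interval for beta; this sketch reuses the beta_hat and beta_se computed above.

beta_hat + qt(p = c(0.025, 0.975), df = nrow(swiss) - 2) * beta_se  # 95% confidence interval
# cf. confint(lm(Fertility ~ Examination, data = swiss))  # should give the same interval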

5.4 Multiple regression / 重回帰分析

\[ \hat{\beta} = (X'X)^{-1}X'y, \quad V(\hat{\beta}) = \sigma^2 (X'X)^{-1} \]

summary(lm(Fertility ~ Examination + Education, data = swiss))
## 
## Call:
## lm(formula = Fertility ~ Examination + Education, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.9935  -6.8894  -0.3621   7.1640  19.2634 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  85.2533     3.0855  27.630   <2e-16 ***
## Examination  -0.5572     0.2319  -2.402   0.0206 *  
## Education    -0.5395     0.1924  -2.803   0.0075 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.982 on 44 degrees of freedom
## Multiple R-squared:  0.5055, Adjusted R-squared:  0.483 
## F-statistic: 22.49 on 2 and 44 DF,  p-value: 1.87e-07
Y <- as.matrix(swiss[, c("Fertility")])
X <- as.matrix(cbind(1, swiss[, c("Examination", "Education")]))
beta_multi <- solve(t(X) %*% X) %*% t(X) %*% Y  # beta = (X'X)^(-1)X'Y
beta_multi
##                   [,1]
## 1           85.2532753
## Examination -0.5572183
## Education   -0.5394570
resid_multi <- Y - X %*% beta_multi
s2 <- sum(resid_multi^2) / (nrow(swiss)-3)  # estimates of sigma^2
sqrt(diag(s2 * solve(t(X) %*% X)))  # standard errors
##           1 Examination   Education 
##   3.0854981   0.2319374   0.1924380
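
Dividing the coefficients by these standard errors should reproduce the t values reported by summary(lm(...)) above; se_multi is just a name introduced here.

se_multi <- sqrt(diag(s2 * solve(t(X) %*% X)))  # same standard errors as above
beta_multi / se_multi  # t values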

5.5 Adding quadratic or interaction terms / 二次・交差項

To create new variables and perform other data-frame operations, the functions included in the tidyverse (mutate, rename, group_by, etc.) are convenient.

# install.packages("tidyverse")  # 一度だけ実行
library(tidyverse)  # Rを立ち上げるたびに実行
swiss2 <- swiss %>%
  mutate(Edu2 = Education^2,
         ExamEdu = Examination * Education)
summary(lm(Fertility ~ Examination + Edu2, data = swiss2))
## 
## Call:
## lm(formula = Fertility ~ Examination + Edu2, data = swiss2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.2462  -6.1306  -0.6674   6.1323  19.0912 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 83.220719   3.274898  25.412  < 2e-16 ***
## Examination -0.660686   0.205804  -3.210  0.00248 ** 
## Edu2        -0.010349   0.003613  -2.864  0.00638 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.952 on 44 degrees of freedom
## Multiple R-squared:  0.5088, Adjusted R-squared:  0.4864 
## F-statistic: 22.79 on 2 and 44 DF,  p-value: 1.615e-07
summary(lm(Fertility ~ ExamEdu, data = swiss2))
## 
## Call:
## lm(formula = Fertility ~ ExamEdu, data = swiss2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.923  -6.512  -1.373   7.444  19.349 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 75.732110   1.715347  44.150  < 2e-16 ***
## ExamEdu     -0.023941   0.004212  -5.684 9.17e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.636 on 45 degrees of freedom
## Multiple R-squared:  0.4179, Adjusted R-squared:  0.405 
## F-statistic: 32.31 on 1 and 45 DF,  p-value: 9.167e-07

Instead of modifying the dataset, squared and interaction terms can also be expressed directly in the model formula using the I() function. "var1:var2" stands for var1 × var2, while "var1*var2" stands for var1 + var2 + (var1 × var2).

summary(lm(Fertility ~ Examination + I(Education^2), data = swiss))
## 
## Call:
## lm(formula = Fertility ~ Examination + I(Education^2), data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.2462  -6.1306  -0.6674   6.1323  19.0912 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    83.220719   3.274898  25.412  < 2e-16 ***
## Examination    -0.660686   0.205804  -3.210  0.00248 ** 
## I(Education^2) -0.010349   0.003613  -2.864  0.00638 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.952 on 44 degrees of freedom
## Multiple R-squared:  0.5088, Adjusted R-squared:  0.4864 
## F-statistic: 22.79 on 2 and 44 DF,  p-value: 1.615e-07
summary(lm(Fertility ~ Examination:Education, data = swiss))
## 
## Call:
## lm(formula = Fertility ~ Examination:Education, data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.923  -6.512  -1.373   7.444  19.349 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           75.732110   1.715347  44.150  < 2e-16 ***
## Examination:Education -0.023941   0.004212  -5.684 9.17e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.636 on 45 degrees of freedom
## Multiple R-squared:  0.4179, Adjusted R-squared:  0.405 
## F-statistic: 32.31 on 1 and 45 DF,  p-value: 9.167e-07
summary(lm(Fertility ~ Examination*Education, data = swiss))
## 
## Call:
## lm(formula = Fertility ~ Examination * Education, data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.522  -6.901  -1.169   7.383  19.412 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           87.178104   4.334560  20.112   <2e-16 ***
## Examination           -0.625731   0.257117  -2.434   0.0192 *  
## Education             -0.807552   0.463474  -1.742   0.0886 .  
## Examination:Education  0.009201   0.014451   0.637   0.5277    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.043 on 43 degrees of freedom
## Multiple R-squared:  0.5101, Adjusted R-squared:  0.4759 
## F-statistic: 14.92 on 3 and 43 DF,  p-value: 8.437e-07
summary(lm(log(Fertility) ~ Examination, data = swiss))
## 
## Call:
## lm(formula = log(Fertility) ~ Examination, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44076 -0.07248 -0.00161  0.11394  0.25392 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.492515   0.051457  87.306  < 2e-16 ***
## Examination -0.015736   0.002815  -5.591 1.26e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1523 on 45 degrees of freedom
## Multiple R-squared:  0.4099, Adjusted R-squared:  0.3968 
## F-statistic: 31.25 on 1 and 45 DF,  p-value: 1.261e-06

5.6 Dummy variables / ダミー変数

swiss2 <- swiss %>%
  mutate(exam_dummy = ifelse(Examination > mean(Examination), 1, 0))
swiss2 %>%
  group_by(exam_dummy) %>%
  summarise(mean_fertility = mean(Fertility))
## # A tibble: 2 × 2
##   exam_dummy mean_fertility
##        <dbl>          <dbl>
## 1          0           76.1
## 2          1           62.2
summary(lm(Fertility ~ exam_dummy, data = swiss2))
## 
## Call:
## lm(formula = Fertility ~ exam_dummy, data = swiss2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.359  -5.557   1.945   6.793  16.441 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   76.059      2.020  37.657  < 2e-16 ***
## exam_dummy   -13.904      3.096  -4.491 4.91e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.5 on 45 degrees of freedom
## Multiple R-squared:  0.3095, Adjusted R-squared:  0.2941 
## F-statistic: 20.17 on 1 and 45 DF,  p-value: 4.907e-05
summary(lm(Fertility ~ I(Examination > mean(Examination)), data = swiss2))
## 
## Call:
## lm(formula = Fertility ~ I(Examination > mean(Examination)), 
##     data = swiss2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.359  -5.557   1.945   6.793  16.441 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                              76.059      2.020  37.657  < 2e-16 ***
## I(Examination > mean(Examination))TRUE  -13.904      3.096  -4.491 4.91e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.5 on 45 degrees of freedom
## Multiple R-squared:  0.3095, Adjusted R-squared:  0.2941 
## F-statistic: 20.17 on 1 and 45 DF,  p-value: 4.907e-05

5.7 Joint hypothesis test / 結合有意性検定

\[ H_0: \beta_{Ed} = \beta_{Ex} = 0 \] \[ F = \frac{(Q_R - Q) / 2}{Q / (n-k-1)} \sim F(2, n-k-1) \] where \(Q\) and \(Q_R\) denote the residual sums of squares of the unrestricted and restricted models, respectively.

lm_swiss_unrestricted <- lm(Fertility ~ Examination + Education, data = swiss)
lm_swiss_restricted <- lm(Fertility ~ 1, data = swiss)
resid_unrestricted <- sum(lm_swiss_unrestricted$resid^2)
resid_restricted <- sum(lm_swiss_restricted$resid^2)
((resid_restricted - resid_unrestricted) / 2) / (resid_unrestricted / (nrow(swiss) - 2 - 1))  # F
## [1] 22.48799
qf(p = 0.05, df1 = 2, df2 = nrow(swiss) - 2 - 1, lower.tail = F)
## [1] 3.209278
# qf(p = 0.95, df1 = 2, df2 = nrow(swiss) - 2 - 1, lower.tail = T)  # same as above
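
The p-value of the F statistic can be obtained with pf; plugging in the statistic computed above should reproduce the p-value of about 1.87e-07 reported in the regression output below.

f_stat <- ((resid_restricted - resid_unrestricted) / 2) / (resid_unrestricted / (nrow(swiss) - 2 - 1))
pf(q = f_stat, df1 = 2, df2 = nrow(swiss) - 2 - 1, lower.tail = FALSE)  # p-value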

The test can also be carried out with the linearHypothesis function in the car package. Writing "car::linearHypothesis", i.e. joining the package name and the function name with "::", lets you use the function without loading the package.

lm_swiss <- lm(Fertility ~ Examination + Education, data = swiss)
summary(lm_swiss)
## 
## Call:
## lm(formula = Fertility ~ Examination + Education, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.9935  -6.8894  -0.3621   7.1640  19.2634 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  85.2533     3.0855  27.630   <2e-16 ***
## Examination  -0.5572     0.2319  -2.402   0.0206 *  
## Education    -0.5395     0.1924  -2.803   0.0075 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.982 on 44 degrees of freedom
## Multiple R-squared:  0.5055, Adjusted R-squared:  0.483 
## F-statistic: 22.49 on 2 and 44 DF,  p-value: 1.87e-07
car::linearHypothesis(lm_swiss, "Examination = 0")
## Linear hypothesis test
## 
## Hypothesis:
## Examination = 0
## 
## Model 1: restricted model
## Model 2: Fertility ~ Examination + Education
## 
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     45 4015.2                              
## 2     44 3549.6  1    465.63 5.7718 0.02057 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::linearHypothesis(lm_swiss, c("Examination = 0", "Education = 0"))
## Linear hypothesis test
## 
## Hypothesis:
## Examination = 0
## Education = 0
## 
## Model 1: restricted model
## Model 2: Fertility ~ Examination + Education
## 
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)    
## 1     46 7178.0                                 
## 2     44 3549.6  2    3628.3 22.488 1.87e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::linearHypothesis(lm_swiss, "Examination + Education = 1")
## Linear hypothesis test
## 
## Hypothesis:
## Examination  + Education = 1
## 
## Model 1: restricted model
## Model 2: Fertility ~ Examination + Education
## 
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     45 16001.1                                  
## 2     44  3549.6  1     12452 154.35 5.523e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

See also: https://www.econometrics-with-r.org/7.3-joint-hypothesis-testing-using-the-f-statistic.html

5.8 White standard error / ホワイトの標準誤差

\[ s.e.(\hat{\beta}) = \frac{1}{n s_X^2} \sqrt{\sum_i (X_i - \bar{X})^2 \hat{u}_i^2} \]

x <- swiss$Examination
lm_swiss <- lm(Fertility ~ Examination, data = swiss)
sqrt(sum((x - mean(x))^2 * lm_swiss$resid^2)) / (nrow(swiss) * var(x))  # White SE (original)
## [1] 0.173891
sqrt(sum((x - mean(x))^2 * lm_swiss$resid^2) / (sum((x - mean(x))^2))^2 * nrow(swiss) / (nrow(swiss) - 2)) # White SE (modified)
## [1] 0.1815766

Estimation using the fixest package:

# install.packages("fixest")
library(fixest)
feols(Fertility ~ Examination, data = swiss, vcov = "hetero")
## OLS estimation, Dep. Var.: Fertility
## Observations: 47 
## Standard-errors: Heteroskedasticity-robust 
##             Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept) 86.81853   3.175114 27.34344 < 2.2e-16 ***
## Examination -1.01132   0.181577 -5.56965 1.353e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 9.43462   Adj. R2: 0.404213
# sqrt(diag(car::hccm(model = lm_swiss, type = "hc1")))  # using "car" package
# lmtest::coeftest(lm_swiss, vcov. = sandwich::vcovHC(lm_swiss, type="HC1"))  # using "lmtest" and "sandwich" packages

5.9 Cluster robust SE / クラスターロバスト標準誤差

The following six regions are used as clusters (their accuracy and validity have not been verified).

A: Jura, B: Vaud, C: Fribourg, D: Neuchâtel, E: Geneva, F: Valais

library(fixest)
swiss2 <- swiss
swiss2$region <- 
  c("A", "A", "A", "A", "A", "A", "C", "C", "C", "C",
    "C", "B", "B", "B", "B", "B", "B", "B", "B", "B",
    "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
    "F", "F", "F", "F", "F", "F", "F", "F", "D", "D",
    "D", "D", "D", "D", "E", "E", "E")
table(swiss2$region)
## 
##  A  B  C  D  E  F 
##  6 19  5  6  3  8
feols(Fertility ~ Examination, data = swiss2, cluster = ~ region)
## OLS estimation, Dep. Var.: Fertility
## Observations: 47 
## Standard-errors: Clustered (region) 
##             Estimate Std. Error  t value   Pr(>|t|)    
## (Intercept) 86.81853   4.708134 18.44011 8.6278e-06 ***
## Examination -1.01132   0.281775 -3.58910 1.5723e-02 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 9.43462   Adj. R2: 0.404213

5.10 OLS vs. WLS Monte Carlo simulation / モンテカルロシミュレーション

OLS:

\[ Y_i = 1 + X_i + h_i \tilde{u}_i \] \[ X_i \sim N(0, 1) , \quad \tilde{u}_i \sim \mbox{Uni}(0, 4 \sqrt{12}), \quad h_i \sim \mbox{Uni}(0, 1) \]

WLS:

\[ \frac{Y_i}{\sqrt{h_i}} = \frac{1}{\sqrt{h_i}} + \beta \frac{X_i}{\sqrt{h_i}} + u_i \]

Note: because \(E(u_i) \ne 0\), do not force the intercept to zero when estimating the WLS regression.

ols_matrix <- wls_matrix <- NULL
for (i in 1:1000) {
  set.seed(i)
  n <- 50
  x <- rnorm(n, 0, 1)
  u_tilde <- runif(n, 0, 4*sqrt(12))
  h <- runif(n, 0, 1)
  u <- u_tilde * h
  # OLS
  y <- 1 + x + u
  ols_matrix <- rbind(ols_matrix, lm(y ~ x)$coef)
  # WLS
  y_tilde <- y / sqrt(h)
  Ii <- 1 / sqrt(h)  # transformed constant term (defined here but not used in the regression below)
  x_tilde <- x / sqrt(h)
  wls_matrix <- rbind(wls_matrix, lm(y_tilde ~ x_tilde)$coef)
}
summary(wls_matrix[, 2]); summary(ols_matrix[, 2])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.7191  0.8522  1.0012  1.0113  1.1636  2.5938
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.3889  0.7342  0.9998  1.0167  1.3030  2.3599
hist(wls_matrix[, 2], breaks = 30)
hist(ols_matrix[, 2], breaks = 20, add = T, col = rgb(1,0,0, alpha = .2))
legend(x = "topright", col = c(8, 2), legend = c("WLS", "OLS"), lwd = 5)

plot(wls_matrix[, 2], ols_matrix[, 2], xlab = "beta of WLS", ylab = "beta of OLS", pch = ".")
abline(a = 0, b = 1, col = 2)

6 IV / 操作変数

The data can be downloaded as an Excel file from the textbook's support page; convert it to CSV and place it in a suitable folder.

setwd("c://ws_stat")
emp <- read.csv("emp.csv")
y <- emp$temp
x <- emp$unemp
z <- emp$z_unemp05
sum((z - mean(z))*(y - mean(y))) / sum((z - mean(z))*(x - mean(x)))  # S_ZY / S_ZX
## [1] 1.246263
cov(z, y) / cov(z, x)
## [1] 1.246263
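
The same slope can also be obtained by literally running OLS twice (hence "two-stage least squares"): regress x on z, then regress y on the first-stage fitted values. The standard errors from this manual second stage are not valid, which is why a dedicated function such as feols is used below.

x_fitted <- lm(x ~ z)$fitted.values  # first stage
lm(y ~ x_fitted)$coef  # second stage: the slope should equal 1.246263
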
library(fixest)
feols(fml = temp ~ 1 | unemp ~ z_unemp05, data = emp, vcov = "hetero")
## TSLS estimation, Dep. Var.: temp, Endo.: unemp, Instr.: z_unemp05
## Second stage: Dep. Var.: temp
## Observations: 47 
## Standard-errors: Heteroskedasticity-robust 
##              Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept) -0.995268   0.350071 -2.84305 0.0066963 ** 
## fit_unemp    1.246263   0.432134  2.88397 0.0060050 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.24615   Adj. R2: 0.026772
## F-test (1st stage), unemp: stat = 336.1    , p < 2.2e-16, on 1 and 45 DoF.
##                Wu-Hausman: stat =   4.41028, p = 0.04149, on 1 and 44 DoF.

Table 12.2 on p. 218 of the textbook.

feols(fml = temp ~ income + unemp, data = emp, vcov = "hetero") # OLS
## OLS estimation, Dep. Var.: temp
## Observations: 47 
## Standard-errors: Heteroskedasticity-robust 
##             Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept) -8.05782   2.696037 -2.98876 0.0045702 ** 
## income       1.98484   0.742847  2.67194 0.0105347 *  
## unemp        1.58483   0.468323  3.38406 0.0015107 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.22566   Adj. R2: 0.163464
feols(fml = temp ~ income | unemp ~ z_unemp05, data = emp, vcov = "hetero") # single IV
## TSLS estimation, Dep. Var.: temp, Endo.: unemp, Instr.: z_unemp05
## Second stage: Dep. Var.: temp
## Observations: 47 
## Standard-errors: Heteroskedasticity-robust 
##             Estimate Std. Error  t value   Pr(>|t|)    
## (Intercept) -9.48101   3.237231 -2.92874 0.00537381 ** 
## fit_unemp    2.06933   0.519597  3.98256 0.00025222 ***
## income       2.28655   0.870654  2.62624 0.01183358 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.22768   Adj. R2: 0.148418
## F-test (1st stage), unemp: stat = 256.1    , p < 2.2e-16 , on 1 and 44 DoF.
##                Wu-Hausman: stat =   5.02769, p = 0.030156, on 1 and 43 DoF.
feols(fml = temp ~ income | unemp ~ z_unemp05 + z_unemp00, data = emp, vcov = "hetero") # IVs
## TSLS estimation, Dep. Var.: temp, Endo.: unemp, Instr.: z_unemp05, z_unemp00
## Second stage: Dep. Var.: temp
## Observations: 47 
## Standard-errors: Heteroskedasticity-robust 
##             Estimate Std. Error  t value   Pr(>|t|)    
## (Intercept) -9.29376   3.196733 -2.90727 0.00569204 ** 
## fit_unemp    2.00559   0.522776  3.83641 0.00039482 ***
## income       2.24685   0.859686  2.61357 0.01221875 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.227185   Adj. R2: 0.152117
## F-test (1st stage), unemp: stat = 127.9    , p < 2.2e-16 , on 2 and 43 DoF.
##                Wu-Hausman: stat =   3.77294, p = 0.058656, on 1 and 43 DoF.
##                    Sargan: stat =   3.95004, p = 0.04687 , on 1 DoF.

6.1 Market equilibrium model / 市場均衡モデル

Estimating the demand curve for mandarin oranges from transaction data aggregated over 9 markets and 12 months. (The estimates in Table 13.1 on p. 228 of the textbook could not be reproduced...)

library(fixest)
setwd("c://ws_stat")
orange <- read.csv("orange.csv")
feols(fml = log(qo) ~ as.factor(market.1) + log(po), data = orange, vcov = "hetero")  # OLS
## OLS estimation, Dep. Var.: log(qo)
## Observations: 108 
## Standard-errors: Heteroskedasticity-robust 
##                       Estimate Std. Error    t value   Pr(>|t|)    
## (Intercept)          18.737088   0.521598  35.922473  < 2.2e-16 ***
## as.factor(market.1)2  1.629559   0.362850   4.491002 1.9385e-05 ***
## as.factor(market.1)3 -0.568854   0.397785  -1.430057 1.5588e-01    
## as.factor(market.1)4 -1.064572   0.420401  -2.532276 1.2921e-02 *  
## as.factor(market.1)5 -1.213416   0.371531  -3.265991 1.5037e-03 ** 
## as.factor(market.1)6 -0.319987   0.387418  -0.825946 4.1084e-01    
## as.factor(market.1)7 -2.430138   0.373421  -6.507772 3.2536e-09 ***
## as.factor(market.1)8 -1.303571   0.377052  -3.457269 8.0865e-04 ***
## as.factor(market.1)9 -4.040535   0.480658  -8.406263 3.4207e-13 ***
## log(po)              -2.145434   0.079019 -27.151009  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.950804   Adj. R2: 0.815716
feols(fml = log(qo) ~ as.factor(market.1) | 
      log(po) ~ mon + mon2, data = orange, vcov = "hetero")  # IV
## TSLS estimation, Dep. Var.: log(qo), Endo.: log(po), Instr.: mon, mon2
## Second stage: Dep. Var.: log(qo)
## Observations: 108 
## Standard-errors: Heteroskedasticity-robust 
##                       Estimate Std. Error    t value   Pr(>|t|)    
## (Intercept)          21.714923   1.027108  21.141805  < 2.2e-16 ***
## fit_log(po)          -2.641150   0.165606 -15.948347  < 2.2e-16 ***
## as.factor(market.1)2  1.631248   0.393820   4.142113 7.3020e-05 ***
## as.factor(market.1)3 -0.608502   0.413686  -1.470926 1.4451e-01    
## as.factor(market.1)4 -1.103216   0.474688  -2.324086 2.2186e-02 *  
## as.factor(market.1)5 -1.222477   0.402320  -3.038568 3.0473e-03 ** 
## as.factor(market.1)6 -0.336839   0.412219  -0.817134 4.1583e-01    
## as.factor(market.1)7 -2.435371   0.395089  -6.164111 1.5805e-08 ***
## as.factor(market.1)8 -1.381284   0.403130  -3.426399 8.9521e-04 ***
## as.factor(market.1)9 -4.071735   0.489043  -8.325930 5.0899e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.01373   Adj. R2: 0.790515
## F-test (1st stage), log(po): stat = 98.6, p < 2.2e-16 , on 2 and 97 DoF.
##                  Wu-Hausman: stat = 37.4, p = 2.043e-8, on 1 and 97 DoF.
##                      Sargan: stat = 32.9, p = 9.6e-9  , on 1 DoF.

7 Maximum likelihood / 最尤法

7.1 Bernoulli distribution / ベルヌーイ分布

Suppose there were 3 successes (with success probability \(p\)) and 2 failures (with probability \(1-p\)) in 5 trials.

\[ L(p) = (1-p)^2 p^3, \quad \log L(p) = 2 \log (1-p) + 3 \log (p) \]

l_func <- function (p) (1-p)^2 * p^3
ll_func <- function (p) 2 * log(1-p) +3 * log(p)
plot(l_func, xlim = c(0, 1)); abline(v = .6, lty = 2, col = 2)

plot(ll_func, xlim = c(0, 1)); abline(v = .6, lty = 2, col = 2)
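
The analytical maximizer is p = 3/5 = 0.6; as a numerical check, the one-dimensional optimizer optimize should return roughly the same value.

optimize(f = l_func, interval = c(0, 1), maximum = TRUE)  # $maximum should be close to 0.6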

7.2 Normal distribution / 正規分布

\[ f = \frac{1}{\sqrt{2 \pi \sigma^2}} \exp \left[- \frac{(y_i - \mu)^2}{2 \sigma^2} \right] , \quad \log f = - \frac{1}{2} \log 2 \pi - \frac{1}{2} \log \sigma^2 - \frac{(y_i - \mu)^2}{2 \sigma^2} \]

\[ \log L = - \frac{n}{2} \log 2 \pi - \frac{n}{2} \log \sigma^2 - \frac{1}{2 \sigma^2} \sum_i (y_i - \mu)^2 \]

\[ \hat{\mu} = \frac{1}{n} \sum_i y_i , \quad \hat{\sigma}^2 = \frac{1}{n} \sum_i (y_i - \bar{y})^2 \]

y <- c(-3, -2, 0.5, 0.5, 1.5)  # treat this sample as drawn from a normal population
mean(y)  # sample mean (identical to the ML estimator of mu)
## [1] -0.5
var(y)  # sample variance (n - 1 denominator)
## [1] 3.625
sum((y - mean(y))^2) / length(y)  # variance with n denominator (identical to the ML estimator of sigma^2)
## [1] 2.9
ll_func_n <- function (param, y, n) {  # log-likelihood function
  mu = param[1]
  sigma2 = param[2]
  -n/2*log(2*pi) - n/2*log(sigma2) - 1/(2*sigma2)*sum((y-mu)^2)
}

Plot the log-likelihood as a function of \(\mu\) with \(\sigma^2\) fixed at 1.

mu_list <- seq(-2, 2, length = 100)
ll_mu_list <- sapply(X = mu_list, 
                     FUN = function(x) ll_func_n(param = c(x, 1), y = y, n = length(y)))
plot(x = mu_list, y = ll_mu_list, xlim = c(-2, 2), type = "l")

Now draw it in 3D. (The plan was to compute z with the outer function, but for some reason sum((y-mu)^2) could not be evaluated inside the likelihood function passed to outer, so the values are computed with nested for loops instead.)

grid_mu <- seq(-2, 2, length = 20)  # grid of values to search over
grid_sigma2 <- seq(1, 10, length = 20)
z <- matrix(0, nrow = length(grid_mu), ncol = length(grid_sigma2))
for (i in 1:length(grid_mu)) {  # compute the log-likelihood at each grid point
  for (j in 1:length(grid_sigma2)) {
    z[i, j] <- ll_func_n(param = c(grid_mu[i], grid_sigma2[j]), y = y, n = length(y))
  }
}
which_max_ml <- which(z == max(z), arr.ind = T)  # index of the grid point that maximizes the likelihood
grid_mu[which_max_ml[1]]; grid_sigma2[which_max_ml[2]]  # ML estimates (low precision because the grid is coarse)
## [1] -0.5263158
## [1] 2.894737
persp(grid_mu, grid_sigma2, z, theta = 50, phi = 20) -> pmat_persp
points(trans3d(x = grid_mu[which_max_ml[1]], y = grid_sigma2[which_max_ml[2]], 
               z = z[which_max_ml], pmat = pmat_persp), col = 2, pch = 19)
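
As a side note, one way to avoid the outer problem mentioned above is to wrap the log-likelihood in a function of (mu, sigma2) and vectorize it with Vectorize; this sketch should reproduce the same z matrix.

ll_grid <- Vectorize(function(m, s2) ll_func_n(param = c(m, s2), y = y, n = length(y)))
z2 <- outer(grid_mu, grid_sigma2, FUN = ll_grid)
all.equal(z, z2)  # should be TRUE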

Maximum likelihood estimation using the general-purpose optimization function optim.

optim(par = c(0, 1), fn = ll_func_n, y = y, n = length(y), control = list(fnscale = -1))
## $par
## [1] -0.5000029  2.8995297
## 
## $value
## [1] -9.75647
## 
## $counts
## function gradient 
##       65       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
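
To obtain approximate standard errors, a common approach (shown here as a minimal sketch) is to minimize the negative log-likelihood with hessian = TRUE; the inverse of the returned Hessian then approximates the variance matrix of the ML estimates.

negll <- function(param, y, n) -ll_func_n(param, y, n)  # negative log-likelihood
opt_ml <- optim(par = c(0, 1), fn = negll, y = y, n = length(y), hessian = TRUE)
opt_ml$par  # should be close to the estimates above
sqrt(diag(solve(opt_ml$hessian)))  # approximate standard errors of (mu, sigma2)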

7.3 Linear regression / 線形回帰モデル

See: "Rで推定する回帰モデル入門" (Introduction to regression models estimated with R) > MLE of linear model / 線形モデルの最尤推定

8 Discrete choice model / 離散選択モデル

8.1 Binary probit model / 二値プロビット・モデル

\[ p = \Pr (y = 1) = \Phi (\alpha + \beta x) \]

Data on playing time and post-season transfers for 528 players in the 2011-12 J.League (J1) season.

  • outcome: transfer (whether the player transferred)
  • x: attend (playing time), goal (goals scored), j2 (relegation dummy)

setwd("c://ws_stat")
j1 <- read.csv("j1.csv")
summary(lm(transfer ~ attend + goal + j2, data = j1))  # linear probability model
## 
## Call:
## lm(formula = transfer ~ attend + goal + j2, data = j1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7659 -0.4384 -0.1615  0.4300  0.9019 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.58076    0.03192  18.192  < 2e-16 ***
## attend      -0.49733    0.06838  -7.273 1.29e-12 ***
## goal         0.26143    0.25706   1.017 0.309617    
## j2           0.18514    0.05506   3.362 0.000829 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4678 on 524 degrees of freedom
## Multiple R-squared:  0.1175, Adjusted R-squared:  0.1125 
## F-statistic: 23.26 on 3 and 524 DF,  p-value: 3.797e-14
glm_j1 <- glm(formula = transfer ~  attend + goal + j2, data = j1, 
              family = binomial(link = "probit"))
summary(glm_j1)
## 
## Call:
## glm(formula = transfer ~ attend + goal + j2, family = binomial(link = "probit"), 
##     data = j1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7076  -1.0654  -0.6124   1.0542   2.0281  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.21598    0.08669   2.491 0.012721 *  
## attend      -1.39344    0.20347  -6.848 7.48e-12 ***
## goal         0.82744    0.73727   1.122 0.261732    
## j2           0.51399    0.15352   3.348 0.000814 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 723.69  on 527  degrees of freedom
## Residual deviance: 657.88  on 524  degrees of freedom
## AIC: 665.88
## 
## Number of Fisher Scoring iterations: 4
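
Predicted probabilities Phi(alpha + beta'x) for each player can be obtained with predict using type = "response" (p_hat is just a name used here); their mean should be similar to the sample share of transfers.

p_hat <- predict(glm_j1, type = "response")  # predicted probabilities
summary(p_hat)  # should all lie between 0 and 1
mean(p_hat); mean(j1$transfer)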

Marginal effects

\[ ME_i = \phi (\alpha + \beta x_i) \beta, \quad ME = \phi (\hat{\alpha} + \hat{\beta} \bar{x}) \hat{\beta} \mbox{ or } \frac{1}{n} \sum_i \phi (\hat{\alpha} + \hat{\beta} x_i) \hat{\beta} \]

# marginal effect at the mean
xb_mean <- glm_j1$coef %*% c(1, mean(j1$attend), mean(j1$goal), mean(j1$j2))
as.numeric(dnorm(x = xb_mean)) * glm_j1$coef
## (Intercept)      attend        goal          j2 
##  0.08475404 -0.54680256  0.32469830  0.20169653
# mean marginal effect
xb <- as.matrix(cbind(1, j1[, c("attend", "goal", "j2")])) %*% glm_j1$coef
c(mean(dnorm(x = xb)* glm_j1$coef[2]), mean(dnorm(x = xb)* glm_j1$coef[3]), 
  mean(dnorm(x = xb)* glm_j1$coef[4]))
## [1] -0.4956684  0.2943342  0.1828349

Using the mfx package:

# install.packages("mfx")
library(mfx)
probitmfx(formula = transfer ~ attend + goal + j2, data = j1)
## Call:
## probitmfx(formula = transfer ~ attend + goal + j2, data = j1)
## 
## Marginal Effects:
##            dF/dx Std. Err.       z     P>|z|    
## attend -0.546803  0.079510 -6.8772 6.106e-12 ***
## goal    0.324698  0.289228  1.1226 0.2615910    
## j2      0.202809  0.059348  3.4173 0.0006325 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## dF/dx is for discrete change for the following variables:
## 
## [1] "j2"
