Examples using the normal distribution
rnorm(n = 10, mean = 0, sd = 1) # generate random numbers
## [1] -1.4601938 -0.7174822 0.7204504 1.0997536 -0.2141771 -0.5713745
## [7] 0.4457926 0.7510519 0.2426628 -1.4602665
rnorm(10, 0, 1) # argument names such as "n =" can be omitted if the arguments are given in the usual order
## [1] 0.11525756 0.21259274 0.40400300 -0.37884009 2.56881206 0.11747327
## [7] -0.04137816 -0.27445893 -0.27891363 -2.33215684
set.seed(0); rnorm(3) # set.seed fixes the seed of the random number generator (set it to make results reproducible)
## [1] 1.2629543 -0.3262334 1.3297993
set.seed(1); rnorm(3)
## [1] -0.6264538 0.1836433 -0.8356286
set.seed(0); rnorm(3)
## [1] 1.2629543 -0.3262334 1.3297993
dnorm(x = 1.96, mean = 0, sd = 1) # probability density
## [1] 0.05844094
curve(dnorm(x, mean = 0, sd = 1), xlim = c(-5, 5)) # can also be used for plotting
integrate(dnorm, -1.96, 1.96) # integration: approximately 0.95 = 95%
## 0.9500042 with absolute error < 1e-11
qnorm(p = 0.975, mean = 0, sd = 1) # the value of x corresponding to a given probability (quantile)
## [1] 1.959964
pnorm(q = 1.96, mean = 0, sd = 1) # the probability corresponding to x
## [1] 0.9750021
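As a quick sanity check (added here, not part of the original notes), qnorm and pnorm are inverse functions:
pnorm(qnorm(p = 0.975)) # should return 0.975
qnorm(pnorm(q = 1.96)) # should return 1.96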
Preparation for the next section.
for (i in 1:3) {
print(i)
}
## [1] 1
## [1] 2
## [1] 3
\[ \sum_{i \in \{1, 3, 5\}} (2i-3) = (2-3) + (6-3) + (10-3) = 9 \]
temp <- NULL # create empty object
for (i in c(1, 3, 5)) {
temp <- c(temp, 2 * i - 3)
}
temp; sum(temp)
## [1] -1 3 7
## [1] 9
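Since the loop only accumulates 2 * i - 3 over a fixed vector, the same result can be obtained with a single vectorized expression (added here as an illustration):
sum(2 * c(1, 3, 5) - 3) # should also return 9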
cbind(1:2, 4:5, 10:11) # combine by columns
## [,1] [,2] [,3]
## [1,] 1 4 10
## [2,] 2 5 11
rbind(1:2, 4:5, 10:11) # combine by rows
## [,1] [,2]
## [1,] 1 2
## [2,] 4 5
## [3,] 10 11
\[ X_i = \mu + u_i, \quad u_i \sim N(0, \sigma^2) \] \[ \bar{X} = \mu + \frac{1}{n} \sum_i u_i , \quad E(\bar{X}) = \mu, \quad V(\bar{X}) = \frac{1}{n} \sigma^2 \]
mu <- 1
sigma2 <- 2^2
N <- 2
result_mean <- NULL
for (j in 1:10000) {
sample_j <- rnorm(n = N, mean = mu, sd = sqrt(sigma2))
result_mean <- c(result_mean, mean(sample_j))
}
mean(result_mean) # theoretical value: mu = 1
## [1] 1.005537
var(result_mean) # theoretical value: sigma^2 / N = 4 / 2 = 2
## [1] 1.968904
The sapply function gives a more concise way to write this.
N <- 2
result_mean <- sapply(X = 1:10000, FUN = function (x) {
mean(rnorm(n = N, mean = mu, sd = sqrt(sigma2)))
})
mean(result_mean); var(result_mean)
## [1] 1.008541
## [1] 2.051369
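An equivalent one-liner (an added alternative, not in the original notes) uses replicate, which evaluates an expression a given number of times:
result_mean <- replicate(n = 10000, mean(rnorm(n = N, mean = mu, sd = sqrt(sigma2))))
mean(result_mean); var(result_mean) # should again be close to 1 and 2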
Repeat the computation while varying the sample size N.
result_mean_mat <- NULL
for (k in c(2, 5, 10, 100, 1000)) {
result_mean <- sapply(X = 1:10000, FUN = function (x) {
mean(rnorm(n = k, mean = mu, sd = sqrt(sigma2)))
})
result_mean_mat <- rbind(result_mean_mat, c(k, mean(result_mean), var(result_mean), sigma2/k))
}
colnames(result_mean_mat) <- c("N", "mean (obs)", "var (obs)", "var (theo)")
result_mean_mat
## N mean (obs) var (obs) var (theo)
## [1,] 2 0.9816762 1.911070114 2.000
## [2,] 5 1.0084608 0.804719913 0.800
## [3,] 10 0.9913869 0.403245757 0.400
## [4,] 100 1.0005279 0.039613990 0.040
## [5,] 1000 0.9989577 0.003872632 0.004
plot(x = result_mean_mat[, 1], y = result_mean_mat[, 3], log = "x", type = "b",
main = "X ~ N(1, 2^2)", xlab = "N (sample size)", ylab = "variance of sample mean")
Critical values of the t distribution
qnorm(p = 0.975, mean = 0, sd = 1) # cf. the standard normal distribution
## [1] 1.959964
qt(p = 0.975, df = 10) # the t distribution
## [1] 2.228139
qt(p = 0.975, df = 1000)
## [1] 1.962339
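The larger critical values reflect the heavier tails of the t distribution. A quick visual comparison (an added illustration, not in the original notes) overlays the two densities:
curve(dnorm(x), from = -4, to = 4, ylab = "density")
curve(dt(x, df = 10), add = TRUE, col = 2) # t(10) has heavier tails than N(0, 1)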
Consider the following (simple) regression model.
\[ Fertility = \alpha + \beta Examination + u \]
str(swiss) # built-in dataset
## 'data.frame': 47 obs. of 6 variables:
## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
## $ Examination : int 15 6 5 12 17 9 16 14 12 16 ...
## $ Education : int 12 9 5 7 15 7 7 8 7 13 ...
## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ...
## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
summary(lm(formula = Fertility ~ Examination, data = swiss))
##
## Call:
## lm(formula = Fertility ~ Examination, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.9375 -6.0044 -0.3393 7.9239 19.7399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.8185 3.2576 26.651 < 2e-16 ***
## Examination -1.0113 0.1782 -5.675 9.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.642 on 45 degrees of freedom
## Multiple R-squared: 0.4172, Adjusted R-squared: 0.4042
## F-statistic: 32.21 on 1 and 45 DF, p-value: 9.45e-07
The OLS coefficient estimates are given by:
\[ \hat{\alpha} = \bar{Y} - \hat{\beta} \bar{X}, \quad \hat{\beta} = \frac{S_{XY}}{S_{XX}} \]
y_bar <- mean(swiss$Fertility) # Y bar
x_bar <- mean(swiss$Examination) # X bar
s_xx <- sum((swiss$Examination - x_bar)^2) # S_XX
s_xy <- sum((swiss$Examination - x_bar) * (swiss$Fertility - y_bar)) # S_XY
y_bar; x_bar; s_xx; s_xy
## [1] 70.14255
## [1] 16.48936
## [1] 2927.745
## [1] -2960.879
beta_hat <- s_xy / s_xx
alpha_hat <- y_bar - beta_hat * x_bar
alpha_hat; beta_hat
## [1] 86.81853
## [1] -1.011317
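As a check (added here), these hand-computed values should coincide with the coefficients returned by lm:
coef(lm(Fertility ~ Examination, data = swiss)) # should match alpha_hat and beta_hat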
y_pred <- alpha_hat + beta_hat * swiss$Examination # y hat
mean(y_pred); y_bar # should match
## [1] 70.14255
## [1] 70.14255
resid_swiss <- swiss$Fertility - y_pred # residuals
sum(resid_swiss) # should be zero
## [1] 1.065814e-13
sum(resid_swiss * swiss$Examination) # eX ... should be zero
## [1] 1.58451e-12
sum((swiss$Examination - x_bar) * (resid_swiss - mean(resid_swiss))) # S_Xe ... should be zero
## [1] -1.501022e-13
sum((y_pred - y_bar) * (resid_swiss - mean(resid_swiss))) # S_Ye ... should be zero
## [1] -8.354428e-14
\[ R^2 = \frac{S_{\hat{Y}\hat{Y}}}{S_{YY}} = 1 - \frac{\sum \hat{u}_i^2}{S_{YY}} \]
s_yy <- sum((swiss$Fertility - y_bar)^2)
s_yhyh <- sum((y_pred - y_bar)^2)
s_yhyh / s_yy # R^2
## [1] 0.4171645
1 - sum(resid_swiss^2) / s_yy # R^2 ... should be the same as above
## [1] 0.4171645
\[ V(\hat{\beta}) = \frac{\sigma^2}{S_{XX}}, \quad \hat{\sigma^2} = s^2 = \frac{1}{n - 2} \sum \hat{u}_i^2 \]
beta_se <- sqrt(sum(resid_swiss^2)/(nrow(swiss) - 2) / s_xx) # standard error
beta_se
## [1] 0.1781971
beta_hat / beta_se # t stat
## [1] -5.675275
What is the 2.5% critical value of the t distribution (corresponding to a two-sided 5% test)?
qt(p = 0.025, df = nrow(swiss) - 2) # df: degrees of freedom
## [1] -2.014103
qt(p = c(0.005, 0.025, 0.975, 0.995), df = nrow(swiss) - 2) # 1% and 5%
## [1] -2.689585 -2.014103 2.014103 2.689585
qt(p = 0.975, df = 10000) # cf. for a large df this is almost identical to the standard normal case
## [1] 1.960201
\[ \hat{\beta} = (X'X)^{-1}X'y, \quad V(\hat{\beta}) = \sigma^2 (X'X)^{-1} \]
summary(lm(Fertility ~ Examination + Education, data = swiss))
##
## Call:
## lm(formula = Fertility ~ Examination + Education, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.9935 -6.8894 -0.3621 7.1640 19.2634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.2533 3.0855 27.630 <2e-16 ***
## Examination -0.5572 0.2319 -2.402 0.0206 *
## Education -0.5395 0.1924 -2.803 0.0075 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.982 on 44 degrees of freedom
## Multiple R-squared: 0.5055, Adjusted R-squared: 0.483
## F-statistic: 22.49 on 2 and 44 DF, p-value: 1.87e-07
Y <- as.matrix(swiss[, c("Fertility")])
X <- as.matrix(cbind(1, swiss[, c("Examination", "Education")]))
beta_multi <- solve(t(X) %*% X) %*% t(X) %*% Y # beta = (X'X)^(-1)X'Y
beta_multi
## [,1]
## 1 85.2532753
## Examination -0.5572183
## Education -0.5394570
resid_multi <- Y - X %*% beta_multi
s2 <- sum(resid_multi^2) / (nrow(swiss) - 3) # estimate of sigma^2
sqrt(diag(s2 * solve(t(X) %*% X))) # standard errors
## 1 Examination Education
## 3.0854981 0.2319374 0.1924380
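These matrix-based standard errors can be verified (an added check) against the variance-covariance matrix that lm computes internally:
sqrt(diag(vcov(lm(Fertility ~ Examination + Education, data = swiss)))) # should match the values above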
Data-frame operations such as creating new variables are easiest with the functions included in the tidyverse (mutate, rename, group_by, etc.).
# install.packages("tidyverse") # run only once
library(tidyverse) # run in every new R session
swiss2 <- swiss %>%
mutate(Edu2 = Education^2,
ExamEdu = Examination * Education)
summary(lm(Fertility ~ Examination + Edu2, data = swiss2))
##
## Call:
## lm(formula = Fertility ~ Examination + Edu2, data = swiss2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.2462 -6.1306 -0.6674 6.1323 19.0912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.220719 3.274898 25.412 < 2e-16 ***
## Examination -0.660686 0.205804 -3.210 0.00248 **
## Edu2 -0.010349 0.003613 -2.864 0.00638 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.952 on 44 degrees of freedom
## Multiple R-squared: 0.5088, Adjusted R-squared: 0.4864
## F-statistic: 22.79 on 2 and 44 DF, p-value: 1.615e-07
summary(lm(Fertility ~ ExamEdu, data = swiss2))
##
## Call:
## lm(formula = Fertility ~ ExamEdu, data = swiss2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.923 -6.512 -1.373 7.444 19.349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.732110 1.715347 44.150 < 2e-16 ***
## ExamEdu -0.023941 0.004212 -5.684 9.17e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.636 on 45 degrees of freedom
## Multiple R-squared: 0.4179, Adjusted R-squared: 0.405
## F-statistic: 32.31 on 1 and 45 DF, p-value: 9.167e-07
Squared and interaction terms can also be specified in the model formula with the I function, without modifying the dataset itself. "var1:var2" denotes var1 × var2, and "var1 * var2" is shorthand for var1 + var2 + (var1 × var2).
summary(lm(Fertility ~ Examination + I(Education^2), data = swiss))
##
## Call:
## lm(formula = Fertility ~ Examination + I(Education^2), data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.2462 -6.1306 -0.6674 6.1323 19.0912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.220719 3.274898 25.412 < 2e-16 ***
## Examination -0.660686 0.205804 -3.210 0.00248 **
## I(Education^2) -0.010349 0.003613 -2.864 0.00638 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.952 on 44 degrees of freedom
## Multiple R-squared: 0.5088, Adjusted R-squared: 0.4864
## F-statistic: 22.79 on 2 and 44 DF, p-value: 1.615e-07
summary(lm(Fertility ~ Examination:Education, data = swiss))
##
## Call:
## lm(formula = Fertility ~ Examination:Education, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.923 -6.512 -1.373 7.444 19.349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.732110 1.715347 44.150 < 2e-16 ***
## Examination:Education -0.023941 0.004212 -5.684 9.17e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.636 on 45 degrees of freedom
## Multiple R-squared: 0.4179, Adjusted R-squared: 0.405
## F-statistic: 32.31 on 1 and 45 DF, p-value: 9.167e-07
summary(lm(Fertility ~ Examination*Education, data = swiss))
##
## Call:
## lm(formula = Fertility ~ Examination * Education, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.522 -6.901 -1.169 7.383 19.412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.178104 4.334560 20.112 <2e-16 ***
## Examination -0.625731 0.257117 -2.434 0.0192 *
## Education -0.807552 0.463474 -1.742 0.0886 .
## Examination:Education 0.009201 0.014451 0.637 0.5277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.043 on 43 degrees of freedom
## Multiple R-squared: 0.5101, Adjusted R-squared: 0.4759
## F-statistic: 14.92 on 3 and 43 DF, p-value: 8.437e-07
summary(lm(log(Fertility) ~ Examination, data = swiss))
##
## Call:
## lm(formula = log(Fertility) ~ Examination, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44076 -0.07248 -0.00161 0.11394 0.25392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.492515 0.051457 87.306 < 2e-16 ***
## Examination -0.015736 0.002815 -5.591 1.26e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1523 on 45 degrees of freedom
## Multiple R-squared: 0.4099, Adjusted R-squared: 0.3968
## F-statistic: 31.25 on 1 and 45 DF, p-value: 1.261e-06
swiss2 <- swiss %>%
mutate(exam_dummy = ifelse(Examination > mean(Examination), 1, 0))
swiss2 %>%
group_by(exam_dummy) %>%
summarise(mean_fertility = mean(Fertility))
## # A tibble: 2 × 2
## exam_dummy mean_fertility
## <dbl> <dbl>
## 1 0 76.1
## 2 1 62.2
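As a check (added here), the slope on the dummy in the regression below is simply the difference between these two group means:
with(swiss2, mean(Fertility[exam_dummy == 1]) - mean(Fertility[exam_dummy == 0])) # should equal the coefficient on exam_dummy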
summary(lm(Fertility ~ exam_dummy, data = swiss2))
##
## Call:
## lm(formula = Fertility ~ exam_dummy, data = swiss2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.359 -5.557 1.945 6.793 16.441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.059 2.020 37.657 < 2e-16 ***
## exam_dummy -13.904 3.096 -4.491 4.91e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.5 on 45 degrees of freedom
## Multiple R-squared: 0.3095, Adjusted R-squared: 0.2941
## F-statistic: 20.17 on 1 and 45 DF, p-value: 4.907e-05
summary(lm(Fertility ~ I(Examination > mean(Examination)), data = swiss2))
##
## Call:
## lm(formula = Fertility ~ I(Examination > mean(Examination)),
## data = swiss2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.359 -5.557 1.945 6.793 16.441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.059 2.020 37.657 < 2e-16 ***
## I(Examination > mean(Examination))TRUE -13.904 3.096 -4.491 4.91e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.5 on 45 degrees of freedom
## Multiple R-squared: 0.3095, Adjusted R-squared: 0.2941
## F-statistic: 20.17 on 1 and 45 DF, p-value: 4.907e-05
\[ H_0: \beta_{Ed} = \beta_{Ex} = 0 \] \[ F = \frac{(Q_R - Q) / 2}{Q / (n-k-1)} \sim F(2, n-k-1) \] where \(Q_R\) and \(Q\) are the residual sums of squares of the restricted and unrestricted models, respectively.
lm_swiss_unrestricted <- lm(Fertility ~ Examination + Education, data = swiss)
lm_swiss_restricted <- lm(Fertility ~ 1, data = swiss)
resid_unrestricted <- sum(lm_swiss_unrestricted$resid^2)
resid_restricted <- sum(lm_swiss_restricted$resid^2)
((resid_restricted - resid_unrestricted) / 2) / (resid_unrestricted / (nrow(swiss) - 2 - 1)) # F
## [1] 22.48799
qf(p = 0.05, df1 = 2, df2 = nrow(swiss) - 2 - 1, lower.tail = F)
## [1] 3.209278
# qf(p = 0.95, df1 = 2, df2 = nrow(swiss) - 2 - 1, lower.tail = T) # same as above
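The corresponding p-value can be obtained from the F distribution (an added step, for comparison with the p-value reported by summary):
f_stat <- ((resid_restricted - resid_unrestricted) / 2) / (resid_unrestricted / (nrow(swiss) - 2 - 1))
pf(q = f_stat, df1 = 2, df2 = nrow(swiss) - 2 - 1, lower.tail = FALSE) # cf. the F-test p-value reported by summary(lm(...))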
These tests can also be carried out with the linearHypothesis function in the car package. Writing car::linearHypothesis, i.e. joining the package name and the function name with ::, lets you call the function without loading the package.
lm_swiss <- lm(Fertility ~ Examination + Education, data = swiss)
summary(lm_swiss)
##
## Call:
## lm(formula = Fertility ~ Examination + Education, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.9935 -6.8894 -0.3621 7.1640 19.2634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.2533 3.0855 27.630 <2e-16 ***
## Examination -0.5572 0.2319 -2.402 0.0206 *
## Education -0.5395 0.1924 -2.803 0.0075 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.982 on 44 degrees of freedom
## Multiple R-squared: 0.5055, Adjusted R-squared: 0.483
## F-statistic: 22.49 on 2 and 44 DF, p-value: 1.87e-07
car::linearHypothesis(lm_swiss, "Examination = 0")
## Linear hypothesis test
##
## Hypothesis:
## Examination = 0
##
## Model 1: restricted model
## Model 2: Fertility ~ Examination + Education
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 45 4015.2
## 2 44 3549.6 1 465.63 5.7718 0.02057 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::linearHypothesis(lm_swiss, c("Examination = 0", "Education = 0"))
## Linear hypothesis test
##
## Hypothesis:
## Examination = 0
## Education = 0
##
## Model 1: restricted model
## Model 2: Fertility ~ Examination + Education
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 46 7178.0
## 2 44 3549.6 2 3628.3 22.488 1.87e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::linearHypothesis(lm_swiss, "Examination + Education = 1")
## Linear hypothesis test
##
## Hypothesis:
## Examination + Education = 1
##
## Model 1: restricted model
## Model 2: Fertility ~ Examination + Education
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 45 16001.1
## 2 44 3549.6 1 12452 154.35 5.523e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
See also: https://www.econometrics-with-r.org/7.3-joint-hypothesis-testing-using-the-f-statistic.html
\[ s.e.(\hat{\beta}) = \frac{1}{n s_X^2} \sqrt{\sum_i (X_i - \bar{X})^2 \hat{u}_i^2} \]
x <- swiss$Examination
lm_swiss <- lm(Fertility ~ Examination, data = swiss)
sqrt(sum((x - mean(x))^2 * lm_swiss$resid^2)) / (nrow(swiss) * var(x)) # White SE (original)
## [1] 0.173891
sqrt(sum((x - mean(x))^2 * lm_swiss$resid^2) / (sum((x - mean(x))^2))^2 * nrow(swiss) / (nrow(swiss) - 2)) # White SE (modified)
## [1] 0.1815766
Estimation using the fixest package.
# install.packages("fixest")
library(fixest)
feols(Fertility ~ Examination, data = swiss, vcov = "hetero")
## OLS estimation, Dep. Var.: Fertility
## Observations: 47
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.81853 3.175114 27.34344 < 2.2e-16 ***
## Examination -1.01132 0.181577 -5.56965 1.353e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 9.43462 Adj. R2: 0.404213
# sqrt(diag(car::hccm(model = lm_swiss, type = "hc1"))) # using "car" package
# lmtest::coeftest(lm_swiss, vcov. = sandwich::vcovHC(lm_swiss, type="HC1")) # using "lmtest" and "sandwich" packages
The following six regions are used as clusters (their accuracy and validity have not been verified).
A: Jura, B: Vaud, C: Fribourg, D: Neuchâtel, E: Geneva, F: Valais
library(fixest)
swiss2 <- swiss
swiss2$region <-
c("A", "A", "A", "A", "A", "A", "C", "C", "C", "C",
"C", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"F", "F", "F", "F", "F", "F", "F", "F", "D", "D",
"D", "D", "D", "D", "E", "E", "E")
table(swiss2$region)
##
## A B C D E F
## 6 19 5 6 3 8
feols(Fertility ~ Examination, data = swiss2, cluster = ~ region)
## OLS estimation, Dep. Var.: Fertility
## Observations: 47
## Standard-errors: Clustered (region)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.81853 4.708134 18.44011 8.6278e-06 ***
## Examination -1.01132 0.281775 -3.58910 1.5723e-02 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 9.43462 Adj. R2: 0.404213
OLS:
\[ Y_i = 1 + X_i + \sqrt{h_i} \tilde{u}_i \] \[ X_i \sim N(0, 1) , \quad \tilde{u}_i \sim \mbox{Uni}(0, 4 \sqrt{12}), \quad h_i \sim \mbox{Uni}(0, 1) \]
WLS:
\[ \frac{Y_i}{\sqrt{h_i}} = \frac{1}{\sqrt{h_i}} + \beta \frac{X_i}{\sqrt{h_i}} + u_i \]
Note: since \(E(u_i) \ne 0\), the WLS regression must not be estimated with the intercept restricted to zero.
ols_matrix <- wls_matrix <- NULL
for (i in 1:1000) {
set.seed(i)
n <- 50
x <- rnorm(n, 0, 1)
u_tilde <- runif(n, 0, 4*sqrt(12))
h <- runif(n, 0, 1)
u <- u_tilde * h
# OLS
y <- 1 + x + u
ols_matrix <- rbind(ols_matrix, lm(y ~ x)$coef)
# WLS
y_tilde <- y / sqrt(h)
Ii <- 1 / sqrt(h)
x_tilde <- x / sqrt(h)
wls_matrix <- rbind(wls_matrix, lm(y_tilde ~ x_tilde)$coef)
}
summary(wls_matrix[, 2]); summary(ols_matrix[, 2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7191 0.8522 1.0012 1.0113 1.1636 2.5938
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3889 0.7342 0.9998 1.0167 1.3030 2.3599
hist(wls_matrix[, 2], breaks = 30)
hist(ols_matrix[, 2], breaks = 20, add = T, col = rgb(1,0,0, alpha = .2))
legend(x = "topright", col = c(8, 2), legend = c("WLS", "OLS"), lwd = 5)
plot(wls_matrix[, 2], ols_matrix[, 2], xlab = "beta of WLS", ylab = "beta of OLS", pch = ".")
abline(a = 0, b = 1, col = 2)
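To compare the sampling variability of the two slope estimators directly (an added check, not in the original notes), compute the variance of the estimates across the 1000 replications:
var(wls_matrix[, 2]); var(ols_matrix[, 2]) # the WLS estimates are expected to be less dispersed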
The data can be downloaded as an Excel file from the textbook's support page; convert it to CSV and place it in an appropriate folder.
setwd("c://ws_stat")
emp <- read.csv("emp.csv")
y <- emp$temp
x <- emp$unemp
z <- emp$z_unemp05
sum((z - mean(z))*(y - mean(y))) / sum((z - mean(z))*(x - mean(x))) # S_ZY / S_ZX
## [1] 1.246263
cov(z, y) / cov(z, x)
## [1] 1.246263
library(fixest)
feols(fml = temp ~ 1 | unemp ~ z_unemp05, data = emp, vcov = "hetero")
## TSLS estimation, Dep. Var.: temp, Endo.: unemp, Instr.: z_unemp05
## Second stage: Dep. Var.: temp
## Observations: 47
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.995268 0.350071 -2.84305 0.0066963 **
## fit_unemp 1.246263 0.432134 2.88397 0.0060050 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.24615 Adj. R2: 0.026772
## F-test (1st stage), unemp: stat = 336.1 , p < 2.2e-16, on 1 and 45 DoF.
## Wu-Hausman: stat = 4.41028, p = 0.04149, on 1 and 44 DoF.
Table 12.2 on p. 218 of the textbook.
feols(fml = temp ~ income + unemp, data = emp, vcov = "hetero") # OLS
## OLS estimation, Dep. Var.: temp
## Observations: 47
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.05782 2.696037 -2.98876 0.0045702 **
## income 1.98484 0.742847 2.67194 0.0105347 *
## unemp 1.58483 0.468323 3.38406 0.0015107 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.22566 Adj. R2: 0.163464
feols(fml = temp ~ income | unemp ~ z_unemp05, data = emp, vcov = "hetero") # single IV
## TSLS estimation, Dep. Var.: temp, Endo.: unemp, Instr.: z_unemp05
## Second stage: Dep. Var.: temp
## Observations: 47
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.48101 3.237231 -2.92874 0.00537381 **
## fit_unemp 2.06933 0.519597 3.98256 0.00025222 ***
## income 2.28655 0.870654 2.62624 0.01183358 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.22768 Adj. R2: 0.148418
## F-test (1st stage), unemp: stat = 256.1 , p < 2.2e-16 , on 1 and 44 DoF.
## Wu-Hausman: stat = 5.02769, p = 0.030156, on 1 and 43 DoF.
feols(fml = temp ~ income | unemp ~ z_unemp05 + z_unemp00, data = emp, vcov = "hetero") # IVs
## TSLS estimation, Dep. Var.: temp, Endo.: unemp, Instr.: z_unemp05, z_unemp00
## Second stage: Dep. Var.: temp
## Observations: 47
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.29376 3.196733 -2.90727 0.00569204 **
## fit_unemp 2.00559 0.522776 3.83641 0.00039482 ***
## income 2.24685 0.859686 2.61357 0.01221875 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.227185 Adj. R2: 0.152117
## F-test (1st stage), unemp: stat = 127.9 , p < 2.2e-16 , on 2 and 43 DoF.
## Wu-Hausman: stat = 3.77294, p = 0.058656, on 1 and 43 DoF.
## Sargan: stat = 3.95004, p = 0.04687 , on 1 DoF.
Estimation of a demand curve for mandarin oranges using transaction data aggregated over 9 markets and 12 months. (The estimates in Table 13.1 on p. 228 of the textbook could not be reproduced exactly.)
library(fixest)
setwd("c://ws_stat")
orange <- read.csv("orange.csv")
feols(fml = log(qo) ~ as.factor(market.1) + log(po), data = orange, vcov = "hetero") # OLS
## OLS estimation, Dep. Var.: log(qo)
## Observations: 108
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.737088 0.521598 35.922473 < 2.2e-16 ***
## as.factor(market.1)2 1.629559 0.362850 4.491002 1.9385e-05 ***
## as.factor(market.1)3 -0.568854 0.397785 -1.430057 1.5588e-01
## as.factor(market.1)4 -1.064572 0.420401 -2.532276 1.2921e-02 *
## as.factor(market.1)5 -1.213416 0.371531 -3.265991 1.5037e-03 **
## as.factor(market.1)6 -0.319987 0.387418 -0.825946 4.1084e-01
## as.factor(market.1)7 -2.430138 0.373421 -6.507772 3.2536e-09 ***
## as.factor(market.1)8 -1.303571 0.377052 -3.457269 8.0865e-04 ***
## as.factor(market.1)9 -4.040535 0.480658 -8.406263 3.4207e-13 ***
## log(po) -2.145434 0.079019 -27.151009 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.950804 Adj. R2: 0.815716
feols(fml = log(qo) ~ as.factor(market.1) |
log(po) ~ mon + mon2, data = orange, vcov = "hetero") # IV
## TSLS estimation, Dep. Var.: log(qo), Endo.: log(po), Instr.: mon, mon2
## Second stage: Dep. Var.: log(qo)
## Observations: 108
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.714923 1.027108 21.141805 < 2.2e-16 ***
## fit_log(po) -2.641150 0.165606 -15.948347 < 2.2e-16 ***
## as.factor(market.1)2 1.631248 0.393820 4.142113 7.3020e-05 ***
## as.factor(market.1)3 -0.608502 0.413686 -1.470926 1.4451e-01
## as.factor(market.1)4 -1.103216 0.474688 -2.324086 2.2186e-02 *
## as.factor(market.1)5 -1.222477 0.402320 -3.038568 3.0473e-03 **
## as.factor(market.1)6 -0.336839 0.412219 -0.817134 4.1583e-01
## as.factor(market.1)7 -2.435371 0.395089 -6.164111 1.5805e-08 ***
## as.factor(market.1)8 -1.381284 0.403130 -3.426399 8.9521e-04 ***
## as.factor(market.1)9 -4.071735 0.489043 -8.325930 5.0899e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.01373 Adj. R2: 0.790515
## F-test (1st stage), log(po): stat = 98.6, p < 2.2e-16 , on 2 and 97 DoF.
## Wu-Hausman: stat = 37.4, p = 2.043e-8, on 1 and 97 DoF.
## Sargan: stat = 32.9, p = 9.6e-9 , on 1 DoF.
Suppose we observe 3 successes (each with probability \(p\)) and 2 failures (each with probability \(1-p\)) in 5 trials.
\[ L(p) = (1-p)^2 p^3, \quad \log L(p) = 2 \log (1-p) + 3 \log (p) \]
l_func <- function (p) (1-p)^2 * p^3
ll_func <- function (p) 2 * log(1 - p) + 3 * log(p)
plot(l_func, xlim = c(0, 1)); abline(v = .6, lty = 2, col = 2)
plot(ll_func, xlim = c(0, 1)); abline(v = .6, lty = 2, col = 2)
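Both functions are maximized at p = 3/5 = 0.6 (the dashed vertical line). This can be confirmed numerically (an added check) with optimize:
optimize(f = ll_func, interval = c(0, 1), maximum = TRUE) # the maximum should be near p = 0.6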
\[ f = \frac{1}{\sqrt{2 \pi \sigma^2}} \exp \left[- \frac{(y_i - \mu)^2}{2 \sigma^2} \right] , \quad \log f = - \frac{1}{2} \log 2 \pi - \frac{1}{2} \log \sigma^2 - \frac{(y_i - \mu)^2}{2 \sigma^2} \]
\[ \log L = - \frac{n}{2} \log 2 \pi - \frac{n}{2} \log \sigma^2 - \frac{1}{2 \sigma^2} \sum_i (y_i - \mu)^2 \]
\[ \hat{\mu} = \frac{1}{n} \sum_i y_i , \quad \hat{\sigma}^2 = \frac{1}{n} \sum_i (y_i - \bar{y})^2 \]
y <- c(-3, -2, 0.5, 0.5, 1.5) # treat this sample as a draw from a normal population
mean(y) # sample mean (identical to the ML estimator of mu)
## [1] -0.5
var(y) # sample variance (denominator n - 1)
## [1] 3.625
sum((y - mean(y))^2) / length(y) # variance with denominator n (identical to the ML estimator of sigma^2)
## [1] 2.9
ll_func_n <- function (param, y, n) { # log-likelihood function
mu = param[1]
sigma2 = param[2]
-n/2*log(2*pi) - n/2*log(sigma2) - 1/(2*sigma2)*sum((y-mu)^2)
}
Plot the log-likelihood as a function of \(\mu\) with \(\sigma^2\) fixed at 1.
mu_list <- seq(-2, 2, length = 100)
ll_mu_list <- sapply(X = mu_list,
FUN = function(x) ll_func_n(param = c(x, 1), y = y, n = length(y)))
plot(x = mu_list, y = ll_mu_list, xlim = c(-2, 2), type = "l")
Next, draw the log-likelihood surface in 3D. (The plan was to compute z with the outer function, but the log-likelihood defined above is not vectorized: sum((y - mu)^2) collapses to a single number instead of being evaluated element-wise over the grid, so nested for loops are used instead; a vectorized alternative is sketched after the 3D plot.)
grid_mu <- seq(-2, 2, length = 20) # grid of values to search over
grid_sigma2 <- seq(1, 10, length = 20)
z <- matrix(0, nrow = length(grid_mu), ncol = length(grid_sigma2))
for (i in 1:length(grid_mu)) { # compute the log-likelihood at each grid point
for (j in 1:length(grid_sigma2)) {
z[i, j] <- ll_func_n(param = c(grid_mu[i], grid_sigma2[j]), y = y, n = length(y))
}
}
which_max_ml <- which(z == max(z), arr.ind = T) # index of the grid point that maximizes the likelihood
grid_mu[which_max_ml[1]]; grid_sigma2[which_max_ml[2]] # ML estimates (low precision because the grid is coarse)
## [1] -0.5263158
## [1] 2.894737
persp(grid_mu, grid_sigma2, z, theta = 50, phi = 20) -> pmat_persp
points(trans3d(x = grid_mu[which_max_ml[1]], y = grid_sigma2[which_max_ml[2]],
z = z[which_max_ml], pmat = pmat_persp), col = 2, pch = 19)
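For reference (an added sketch), outer can be used after all if the log-likelihood is wrapped with Vectorize so that it is evaluated element-wise over the grid:
ll_vec <- Vectorize(function(m, s2) ll_func_n(param = c(m, s2), y = y, n = length(y)))
z_outer <- outer(X = grid_mu, Y = grid_sigma2, FUN = ll_vec)
all.equal(z, z_outer) # should be TRUE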
Maximum likelihood estimation with the general-purpose optimizer optim.
optim(par = c(0, 1), fn = ll_func_n, y = y, n = length(y), control = list(fnscale = -1))
## $par
## [1] -0.5000029 2.8995297
##
## $value
## [1] -9.75647
##
## $counts
## function gradient
## 65 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
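Approximate standard errors can be obtained from the numerically differentiated Hessian (an added sketch; the negative log-likelihood is minimized here so that the returned Hessian is the observed information matrix):
nll_func_n <- function(param, y, n) -ll_func_n(param, y, n) # negative log-likelihood
opt <- optim(par = c(0, 1), fn = nll_func_n, y = y, n = length(y), hessian = TRUE)
opt$par # ML estimates of (mu, sigma^2)
sqrt(diag(solve(opt$hessian))) # approximate standard errors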
See Rで推定する回帰モデル入門 > MLE of linear model / 線形モデルの最尤推定
\[ p = \Pr (y = 1) = \Phi (\alpha + \beta x) \]
Data on match appearances and post-season transfers of 528 players in the 2011-12 J.League (J1) season.
setwd("c://ws_stat")
j1 <- read.csv("j1.csv")
summary(lm(transfer ~ attend + goal + j2, data = j1)) # linear probability model
##
## Call:
## lm(formula = transfer ~ attend + goal + j2, data = j1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7659 -0.4384 -0.1615 0.4300 0.9019
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.58076 0.03192 18.192 < 2e-16 ***
## attend -0.49733 0.06838 -7.273 1.29e-12 ***
## goal 0.26143 0.25706 1.017 0.309617
## j2 0.18514 0.05506 3.362 0.000829 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4678 on 524 degrees of freedom
## Multiple R-squared: 0.1175, Adjusted R-squared: 0.1125
## F-statistic: 23.26 on 3 and 524 DF, p-value: 3.797e-14
glm_j1 <- glm(formula = transfer ~ attend + goal + j2, data = j1,
family = binomial(link = "probit"))
summary(glm_j1)
##
## Call:
## glm(formula = transfer ~ attend + goal + j2, family = binomial(link = "probit"),
## data = j1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7076 -1.0654 -0.6124 1.0542 2.0281
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.21598 0.08669 2.491 0.012721 *
## attend -1.39344 0.20347 -6.848 7.48e-12 ***
## goal 0.82744 0.73727 1.122 0.261732
## j2 0.51399 0.15352 3.348 0.000814 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 723.69 on 527 degrees of freedom
## Residual deviance: 657.88 on 524 degrees of freedom
## AIC: 665.88
##
## Number of Fisher Scoring iterations: 4
Marginal effects
\[ ME_i = \phi (\alpha + \beta x_i) \beta, \quad ME = \phi (\hat{\alpha} + \hat{\beta} \bar{x}) \hat{\beta} \mbox{ or } \frac{1}{n} \sum_i \phi (\hat{\alpha} + \hat{\beta} x_i) \hat{\beta} \]
# marginal effect at the mean
xb_mean <- glm_j1$coef %*% c(1, mean(j1$attend), mean(j1$goal), mean(j1$j2))
as.numeric(dnorm(x = xb_mean)) * glm_j1$coef
## (Intercept) attend goal j2
## 0.08475404 -0.54680256 0.32469830 0.20169653
# mean marginal effect
xb <- as.matrix(cbind(1, j1[, c("attend", "goal", "j2")])) %*% glm_j1$coef
c(mean(dnorm(x = xb)* glm_j1$coef[2]), mean(dnorm(x = xb)* glm_j1$coef[3]),
mean(dnorm(x = xb)* glm_j1$coef[4]))
## [1] -0.4956684 0.2943342 0.1828349
When using the mfx package:
# install.packages("mfx")
library(mfx)
probitmfx(formula = transfer ~ attend + goal + j2, data = j1)
## Call:
## probitmfx(formula = transfer ~ attend + goal + j2, data = j1)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## attend -0.546803 0.079510 -6.8772 6.106e-12 ***
## goal 0.324698 0.289228 1.1226 0.2615910
## j2 0.202809 0.059348 3.4173 0.0006325 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "j2"