Оценка популярных моделей с помощью MCMC

library(MCMCpack)
library(ggplot2)
library(GGally)

h <- swiss
str(h)

## 'data.frame':    47 obs. of  6 variables:
##  $ Fertility       : num  80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
##  $ Agriculture     : num  17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
##  $ Examination     : int  15 6 5 12 17 9 16 14 12 16 ...
##  $ Education       : int  12 9 5 7 15 7 7 8 7 13 ...
##  $ Catholic        : num  9.96 84.84 93.4 33.77 5.16 ...
##  $ Infant.Mortality: num  22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...

theme_set(theme_bw())
# plotmatrix(h)
ggpairs(h)

plot of chunk unnamed-chunk-3

formula <- Fertility ~ Agriculture + Examination + Catholic + Education

Обычный МНК:

m1 <- lm(formula, data = h)
summary(m1)

## 
## Call:
## lm(formula = formula, data = h)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.781  -6.331   0.811   5.721  15.557 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  91.0554     6.9488   13.10  < 2e-16 ***
## Agriculture  -0.2206     0.0736   -3.00   0.0046 ** 
## Examination  -0.2606     0.2741   -0.95   0.3472    
## Catholic      0.1244     0.0373    3.34   0.0018 ** 
## Education    -0.9616     0.1945   -4.94  1.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.74 on 42 degrees of freedom
## Multiple R-squared:  0.65,   Adjusted R-squared:  0.616 
## F-statistic: 19.5 on 4 and 42 DF,  p-value: 3.95e-09

priors <- c(2, 0.8, 0.5, 0.3, -5)

m1.mcmc <- MCMCregress(formula, data = h, b0 = priors, B0 = 1e-04)
summary(m1.mcmc)

## 
## Iterations = 1001:11000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##               Mean      SD Naive SE Time-series SE
## (Intercept) 90.593  7.0427 0.070427       0.070427
## Agriculture -0.216  0.0744 0.000744       0.000724
## Examination -0.248  0.2814 0.002814       0.002861
## Catholic     0.125  0.0378 0.000378       0.000378
## Education   -0.963  0.2003 0.002003       0.002003
## sigma2      62.784 14.4123 0.144123       0.162921
## 
## 2. Quantiles for each variable:
## 
##                2.5%    25%    50%     75%    97.5%
## (Intercept) 76.8020 85.932 90.597 95.1560 104.5796
## Agriculture -0.3634 -0.265 -0.216 -0.1673  -0.0721
## Examination -0.8027 -0.434 -0.248 -0.0611   0.2991
## Catholic     0.0499  0.100  0.125  0.1505   0.1983
## Education   -1.3583 -1.095 -0.964 -0.8290  -0.5647
## sigma2      40.5154 52.492 60.745 70.6544  96.9017

Что можно вытащить из результатов оценивания?

str(m1.mcmc)

##  mcmc [1:10000, 1:6] 101.4 91 80.8 100.3 100.3 ...
##  - attr(*, "mcpar")= num [1:3] 1001 11000 1
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:6] "(Intercept)" "Agriculture" "Examination" "Catholic" ...
##  - attr(*, "title")= chr "MCMCregress Posterior Sample"
##  - attr(*, "y")= num [1:47, 1] 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:47] "Courtelary" "Delemont" "Franches-Mnt" "Moutier" ...
##   .. ..$ : NULL
##  - attr(*, "call")= language MCMCregress(formula = formula, data = h, b0 = priors, B0 = 1e-04)

beta.sample <- as.data.frame(m1.mcmc)
beta.sample$t <- 1:nrow(beta.sample)  # добавим номер элемента последовательности
str(beta.sample)

## 'data.frame':    10000 obs. of  7 variables:
##  $ (Intercept): num  101.4 91 80.8 100.3 100.3 ...
##  $ Agriculture: num  -0.36 -0.237 -0.108 -0.25 -0.353 ...
##  $ Examination: num  -0.377 -0.212 0.04 -0.628 -0.268 ...
##  $ Catholic   : num  0.122 0.151 0.122 0.117 0.149 ...
##  $ Education  : num  -1.015 -0.951 -1.064 -0.992 -1.232 ...
##  $ sigma2     : num  86.8 60.3 40.4 42 45.5 ...
##  $ t          : int  1 2 3 4 5 6 7 8 9 10 ...

Зададим удобные имена переменным

colnames(beta.sample)[1] <- "const"

Построим апостериорное распределение свободного члена и визуально оценим сходимость:

ggplot(beta.sample, aes(x = const)) + geom_density()

plot of chunk unnamed-chunk-11

qplot(t, Agriculture, data = beta.sample)

plot of chunk unnamed-chunk-11

Оценим модель с другим априорным мнением о коэффициентах:

priors <- c(0.2, 0.1, 0.2, -0.3, -0.1)

m1.mcmc <- MCMCregress(formula, data = h, b0 = priors, B0 = 1e-04)
summary(m1.mcmc)

## 
## Iterations = 1001:11000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##               Mean      SD Naive SE Time-series SE
## (Intercept) 90.584  7.0428 0.070428       0.070428
## Agriculture -0.216  0.0744 0.000744       0.000724
## Examination -0.248  0.2814 0.002814       0.002861
## Catholic     0.125  0.0378 0.000378       0.000378
## Education   -0.963  0.2003 0.002003       0.002003
## sigma2      62.784 14.4124 0.144124       0.162922
## 
## 2. Quantiles for each variable:
## 
##                2.5%    25%    50%     75%   97.5%
## (Intercept) 76.7932 85.922 90.588 95.1461 104.572
## Agriculture -0.3634 -0.265 -0.216 -0.1672  -0.072
## Examination -0.8025 -0.434 -0.247 -0.0609   0.299
## Catholic     0.0499  0.100  0.125  0.1505   0.198
## Education   -1.3583 -1.095 -0.964 -0.8289  -0.565
## sigma2      40.5140 52.490 60.744 70.6568  96.899