#まずデータがあるディレクトリを指定
setwd("~/Desktop/Statistics_Study/StatisticalModeling/kubobook_2012/03poisson")
#データの読み込み
library(readr)
d<- read_csv("data3a.csv")
##
## ─ Column specification ────────────────────────────
## cols(
## y = col_double(),
## x = col_double(),
## f = col_character()
## )
#各変数を確認
str(d)
## tibble [100 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ y: num [1:100] 6 6 6 12 10 4 9 9 9 11 ...
## $ x: num [1:100] 8.31 9.44 9.5 9.07 10.16 ...
## $ f: chr [1:100] "C" "C" "C" "C" ...
## - attr(*, "spec")=
## .. cols(
## .. y = col_double(),
## .. x = col_double(),
## .. f = col_character()
## .. )
#要約統計量
summary(d)
## y x f
## Min. : 2.00 Min. : 7.190 Length:100
## 1st Qu.: 6.00 1st Qu.: 9.428 Class :character
## Median : 8.00 Median :10.155 Mode :character
## Mean : 7.83 Mean :10.089
## 3rd Qu.:10.00 3rd Qu.:10.685
## Max. :15.00 Max. :12.400
#型の確認
class(d)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
#CとTの順番についてはアルファベット順
class(d$f)
## [1] "character"
class(d$x)
## [1] "numeric"
class(d$y)
## [1] "numeric"
#超余談
#gtパッケージでめちゃきれいなテーブルを作れる
library(gt)
library(webshot)
#install_phantomjs()
d_table <-d %>%
gt()
#テーブルの保存
#gtsave(d, "table_name.pdf")
#pchで記号を指定
#1つ目がCで2つ目がT
plot(d$x, d$y, pch = c(21, 19),xlab = "body_size", ylab = "mean_seed")
legend("topleft", legend = c("C", "T"), pch=c(21, 19))
#箱ひげ図は書籍のやり方ではできなかったのでggplot2を利用
library(ggplot2)
qplot(d$f, d$y, geom = "boxplot")
-対数尤度\(logL(\beta1, \beta2)\)が最大となる\(\beta1, \beta2\)を探す
#familyで使う分布を指定(linkはなくてもいい)
fit <- glm(y ~ x, data = d, family = poisson(link = "log"))
#結果の確認
library(jtools)
summ(fit, confint = TRUE)
## MODEL INFO:
## Observations: 100
## Dependent Variable: y
## Type: Generalized linear model
## Family: poisson
## Link function: log
##
## MODEL FIT:
## χ²(1) = 4.51, p = 0.03
## Pseudo-R² (Cragg-Uhler) = 0.04
## Pseudo-R² (McFadden) = 0.01
## AIC = 474.77, BIC = 479.98
##
## Standard errors: MLE
## -------------------------------------------------------
## Est. 2.5% 97.5% z val. p
## ----------------- ------ ------ ------- -------- ------
## (Intercept) 1.29 0.58 2.00 3.55 0.00
## x 0.08 0.01 0.15 2.13 0.03
## -------------------------------------------------------
#対数尤度もまとめて出すなら
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
stargazer(fit, type = "text", style = "all", ci = TRUE,
star.cutoffs = NA, omit.table.layout = 'n',
align = TRUE)
##
## =====================================
## Dependent variable:
## -------------------
## y
## -------------------------------------
## x 0.076
## (0.006, 0.145)
## t = 2.125
## p = 0.034
## Constant 1.292
## (0.579, 2.005)
## t = 3.552
## p = 0.0004
## -------------------------------------
## Observations 100
## Log Likelihood -235.386
## Akaike Inf. Crit. 474.773
## Residual Deviance 84.993 (df = 98)
## Null Deviance 89.507 (df = 99)
## =====================================
#最大対数尤度
logLik(fit)
## 'log Lik.' -235.3863 (df=2)
plot(d$x, d$y, pch = c(21, 19), xlab = "body_size", ylab = "mean_seed")
legend("topleft", legend = c("C", "T"), pch=c(21, 19))
#xの最小値から最大値まで100このデータ
xx <- seq(min(d$x), max(d$x), length = 100)
#平均種子数の予測結果のデータフレームを作成
yy <- predict(fit, newdata = data.frame(x = xx), type ="response")
lines(xx, yy, lwd=2)
#fは因子になってる(はずなのに文字列だったのでなおす)
d$f <- factor(d$f)
str(d$f)
## Factor w/ 2 levels "C","T": 1 1 1 1 1 1 1 1 1 1 ...
#そのままでもOKだが, もしダミー変数に変換したかったら
#install.packages("devtools")
library(makedummies)
d_dummy <- makedummies::makedummies(d)
head(d_dummy)
## # A tibble: 6 x 3
## y x f
## <dbl> <dbl> <int>
## 1 6 8.31 0
## 2 6 9.44 0
## 3 6 9.5 0
## 4 12 9.07 0
## 5 10 10.2 0
## 6 4 8.32 0
fit.f <- glm(y ~ f, data = d, family = poisson)
stargazer(fit.f, type = "text", style = "all", ci = TRUE,
star.cutoffs = NA, omit.table.layout = 'n',
align = TRUE)
##
## =====================================
## Dependent variable:
## -------------------
## y
## -------------------------------------
## fT 0.013
## (-0.127, 0.153)
## t = 0.179
## p = 0.859
## Constant 2.052
## (1.952, 2.151)
## t = 40.463
## p = 0.000
## -------------------------------------
## Observations 100
## Log Likelihood -237.627
## Akaike Inf. Crit. 479.255
## Residual Deviance 89.475 (df = 98)
## Null Deviance 89.507 (df = 99)
## =====================================
fit.all <- glm(y ~ x + f, data = d, family = poisson)
#3つのモデルを並べてみる
stargazer(fit, fit.f, fit.all, type = "text", style = "all", ci = TRUE,
star.cutoffs = NA, omit.table.layout = 'n',
align = TRUE)
##
## ==========================================================================
## Dependent variable:
## --------------------------------------------------
## y
## (1) (2) (3)
## --------------------------------------------------------------------------
## x 0.076 0.080
## (0.006, 0.145) (0.007, 0.153)
## t = 2.125 t = 2.162
## p = 0.034 p = 0.031
## fT 0.013 -0.032
## (-0.127, 0.153) (-0.178, 0.114)
## t = 0.179 t = -0.430
## p = 0.859 p = 0.668
## Constant 1.292 2.052 1.263
## (0.579, 2.005) (1.952, 2.151) (0.539, 1.988)
## t = 3.552 t = 40.463 t = 3.417
## p = 0.0004 p = 0.000 p = 0.001
## --------------------------------------------------------------------------
## Observations 100 100 100
## Log Likelihood -235.386 -237.627 -235.294
## Akaike Inf. Crit. 474.773 479.255 476.587
## Residual Deviance 84.993 (df = 98) 89.475 (df = 98) 84.808 (df = 97)
## Null Deviance (df = 99) 89.507 89.507 89.507
## ==========================================================================
#係数の図示
library(coefplot)
coefplot::multiplot(fit,fit.f,fit.all)
- わかること - \(log\lambda = \beta1 + \beta2x + \beta3d\) - 肥料の効果がマイナスになってしまっている - そもそも肥料の値は有意ではなさそう - 対数尤度はわずかの差で1位
#恒等リンク関数でやる場合
fit.all2 <- glm(y ~ x + f, data = d, family = poisson(identity))
summary(fit.all2)
##
## Call:
## glm(formula = y ~ x + f, family = poisson(identity), data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3892 -0.7460 -0.1988 0.6752 2.4222
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2670 2.8433 0.446 0.6559
## x 0.6607 0.2897 2.281 0.0226 *
## fT -0.2047 0.5823 -0.352 0.7251
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 89.507 on 99 degrees of freedom
## Residual deviance: 84.538 on 97 degrees of freedom
## AIC: 476.32
##
## Number of Fisher Scoring iterations: 5
4章・5章でGLMのあてはまりの良さ, 予測の良さについて検討