
6.5 A linear predictor containing an interaction term

library("reshape2")
library("ggplot2")
library("magrittr")
library("dplyr")

ws_dir <- "./"

data4a <- read.csv(paste0(ws_dir,"data4a.csv"))
summary(data4a)
##        N           y              x          f     
##  Min.   :8   Min.   :0.00   Min.   : 7.660   C:50  
##  1st Qu.:8   1st Qu.:3.00   1st Qu.: 9.338   T:50  
##  Median :8   Median :6.00   Median : 9.965         
##  Mean   :8   Mean   :5.08   Mean   : 9.967         
##  3rd Qu.:8   3rd Qu.:8.00   3rd Qu.:10.770         
##  Max.   :8   Max.   :8.00   Max.   :12.440
head(data4a)
##   N y     x f
## 1 8 1  9.76 C
## 2 8 6 10.48 C
## 3 8 5 10.83 C
## 4 8 6 10.94 C
## 5 8 1  9.37 C
## 6 8 1  8.81 C
# p.117, fig. 6.2
qplot(data=data4a,x=x,y=y,col=f,size=I(5))

logistic <- function(z){
  1/(1+exp(-z))
}
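
# Quick sanity check (added here, not in the book): R's built-in plogis() is the
# CDF of the standard logistic distribution, i.e. the same inverse logit as
# logistic() above, so the two should agree exactly.
stopifnot(all.equal(logistic(c(-2, 0, 3)), plogis(c(-2, 0, 3))))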

logit_q <- function(b1,b2,b3,b4,x,f){
  # linear predictor with an interaction term; f is 0 (C: control) or 1 (T: treated)
  b1 + b2*x + b3*f + b4*x*f
}

x <- seq(7,13,0.01)
b1<- -18
b2<- 1.8

z_fC <- logit_q(b1,b2, -0.06, -2,x,f=0) # interaction coefficient made large (in magnitude); no treatment
z_fT <- logit_q(b1,b2, -0.06, -2,x,f=1) # interaction coefficient made large (in magnitude); fertilizer treatment
qplot(x,y=logistic(z_fC),geom="line")

qplot(x,y=logistic(z_fT),geom="line")

# The model used so far: -19.536 + 1.952*x + 2.022*fT
(model.x_f <- glm(cbind(y,N-y)~x+f,family=binomial,data=data4a))
## 
## Call:  glm(formula = cbind(y, N - y) ~ x + f, family = binomial, data = data4a)
## 
## Coefficients:
## (Intercept)            x           fT  
##     -19.536        1.952        2.022  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  97 Residual
## Null Deviance:       499.2 
## Residual Deviance: 123   AIC: 272.2
# Model with an interaction term: -18.52332 + 1.85251*x - 0.06376*fT + 0.21634*x*fT
(model.xf  <- glm(cbind(y,N-y)~x*f,family=binomial,data=data4a))
## 
## Call:  glm(formula = cbind(y, N - y) ~ x * f, family = binomial, data = data4a)
## 
## Coefficients:
## (Intercept)            x           fT         x:fT  
##   -18.52332      1.85251     -0.06376      0.21634  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  96 Residual
## Null Deviance:       499.2 
## Residual Deviance: 122.4     AIC: 273.6
test.d <- data.frame(N=8,x=seq(8, 13 , 0.1),f="T")

fittData <- function(test.d, model){ # predict for both treatment levels and reshape to long format
  pred.t <- predict(model, test.d, type="response") # type="response": return predicted probabilities
  test.d$f <- "C"
  pred.c <- predict(model, test.d, type="response")
  test.d$"T"<-pred.t
  test.d$"C"<-pred.c

  result.d <- melt(test.d,id.vars = c("x","N"),measure.vars = c("T","C"))
  names(result.d) <- c("x","N","f","y")
  result.d
}

result.x_f <- fittData(test.d,model.x_f)
result.x_f$model="x + f"
result.xf <- fittData(test.d,model.xf)
result.xf$model="x*f"

# p.129, fig. 6.9
qplot(data=rbind(result.x_f,result.xf),x,y=N*y,col=factor(f),geom="line")+facet_grid(~model)

# With or without the interaction term, the predictions barely differ, and the
# term can become impossible to interpret.
# --> It is also reasonable to decide against including it at all.
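
# To attach numbers to the comparison above: AIC of the two fits
# (should match the printouts earlier, 272.2 vs 273.6).
AIC(model.x_f, model.xf)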

# Supplementary note 23 (p.129)
library("MASS")
stepAIC(model.xf,direction="both")
## Start:  AIC=273.61
## cbind(y, N - y) ~ x * f
## 
##        Df Deviance    AIC
## - x:f   1   123.03 272.21
## <none>      122.43 273.61
## 
## Step:  AIC=272.21
## cbind(y, N - y) ~ x + f
## 
##        Df Deviance    AIC
## <none>      123.03 272.21
## + x:f   1   122.43 273.61
## - f     1   217.17 364.35
## - x     1   490.58 637.76
## 
## Call:  glm(formula = cbind(y, N - y) ~ x + f, family = binomial, data = data4a)
## 
## Coefficients:
## (Intercept)            x           fT  
##     -19.536        1.952        2.022  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  97 Residual
## Null Deviance:       499.2 
## Residual Deviance: 123   AIC: 272.2
# The book says the model with (only?) x:f has the best AIC, but stepAIC never proposes it...
glm(cbind(y,N-y)~x:f,family=binomial,data=data4a) # its AIC does look smaller than the others'
## 
## Call:  glm(formula = cbind(y, N - y) ~ x:f, family = binomial, data = data4a)
## 
## Coefficients:
## (Intercept)         x:fC         x:fT  
##     -18.554        1.856        2.065  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  97 Residual
## Null Deviance:       499.2 
## Residual Deviance: 122.4     AIC: 271.6
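
# Putting the three candidates side by side; model.xf.only is just a name
# introduced here for re-fitting the x:f model above.
model.xf.only <- glm(cbind(y,N-y)~x:f,family=binomial,data=data4a)
sapply(list("x + f"=model.x_f, "x * f"=model.xf, "x : f"=model.xf.only), AIC)
# expect roughly 272.2, 273.6, 271.6 -- x:f lowest, as the book claims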

6.6 Stop statistical modeling of ratios (divided values)

data4b <- read.csv(paste0(ws_dir,"data4b.csv"))
dim(data4b)
## [1] 100   3
head(data4b)
##    y    x    A
## 1 57 0.68 10.3
## 2 64 0.27 15.6
## 3 49 0.46 10.0
## 4 64 0.45 14.9
## 5 82 0.74 14.0
## 6 29 0.15  9.6
plot(data4b) # take a look at the scatterplot matrix

data4b$x_fact <- cut(data4b$x,seq(0,1,.25))
ggplot(data=data4b,aes(x=A,y=y))+geom_point(aes(col=x_fact),size=I(5)) # color: x, the brightness of the survey site
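
# What the section title warns against would be regressing the ratio directly,
# e.g. something like lm(y/A ~ x, data=data4b); the offset model below keeps y
# as a count and brings A in as an offset instead.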

# How to write a model with an offset term in R (p.133)
(model.offset <- glm(y~x, offset=log(A), family=poisson, data=data4b))
## 
## Call:  glm(formula = y ~ x, family = poisson, data = data4b, offset = log(A))
## 
## Coefficients:
## (Intercept)            x  
##      0.9731       1.0383  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  98 Residual
## Null Deviance:       261.5 
## Residual Deviance: 81.61     AIC: 650.3
glm(y~x+log(A), family=poisson, data=data4b) # I expected this to match the offset model, but it differs: here log(A) gets its own estimated coefficient (0.9525, close to 1), whereas the offset fixes that coefficient at exactly 1
## 
## Call:  glm(formula = y ~ x + log(A), family = poisson, data = data4b)
## 
## Coefficients:
## (Intercept)            x       log(A)  
##      1.0874       1.0365       0.9525  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  97 Residual
## Null Deviance:       606 
## Residual Deviance: 80.8  AIC: 651.5
# glm(y~x+A, family=poisson, data=data4b)
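
# Equivalently, the offset can be written inside the formula; this should
# reproduce model.offset exactly, since offset(log(A)) enters with its
# coefficient fixed at 1:
glm(y ~ x + offset(log(A)), family=poisson, data=data4b)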

# test data
test.4b <-
  data.frame(
    A=seq(1,18,1),
    x_1=0.1,
    x_2=0.3,
    x_3=0.5,
    x_4=0.7,
    x_5=0.9
  ) %>% melt(id.vars = "A") %>% dplyr::select(A, value) # dplyr:: prefix needed: MASS (loaded above) masks select()
names(test.4b) <- c("A","x")
head(test.4b)
##   A   x
## 1 1 0.1
## 2 2 0.1
## 3 3 0.1
## 4 4 0.1
## 5 5 0.1
## 6 6 0.1
summary(test.4b)
##        A              x      
##  Min.   : 1.0   Min.   :0.1  
##  1st Qu.: 5.0   1st Qu.:0.3  
##  Median : 9.5   Median :0.5  
##  Mean   : 9.5   Mean   :0.5  
##  3rd Qu.:14.0   3rd Qu.:0.7  
##  Max.   :18.0   Max.   :0.9
pred.offset <- predict(model.offset,test.4b,type="response")
test.4b$pred <- pred.offset

cut.d <- c(0,.1,.3,.5,.7,.9,1)
qplot(data=data4b,x=A,y,col=cut(x,cut.d),size=I(5))+
  geom_line(data=test.4b,aes(x=A,y=pred,group=x)) # group=x: one line per x level, not a single zigzag through all rows
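
# Hand-rolled check of what predict() does here: with the log link and the
# offset, the fitted mean is lambda = A * exp(b1 + b2*x).
cf <- coef(model.offset)
all.equal(unname(test.4b$A * exp(cf[1] + cf[2]*test.4b$x)), unname(pred.offset)) # should be TRUE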

# I thought multiple regression could handle this too, but that assumes a normal distribution, so this book would presumably say no.

6.7 The normal distribution and its likelihood

# p.137
# norm dist.

mu <- 0
sigm <- 1
y <- seq(-5,5,0.1)
qplot(y,dnorm(y,mean=mu,sd=sigm),geom="line")+
  geom_line(aes(x=y,y=dnorm(y,mean=mu,sd=3)),col="blue")+
  geom_line(aes(x=y,y=dnorm(y,mean=2, sd=1)),col="red")+
  geom_vline(xintercept=c(1.2,1.8),col="grey") # geom_vline takes xintercept, not x
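
# The grey lines mark the interval 1.2 <= y <= 1.8: its probability is the
# area under the density, which a density-times-width rectangle approximates.
pnorm(1.8, mean=0, sd=1) - pnorm(1.2, mean=0, sd=1) # exact area, about 0.079
dnorm(1.5, mean=0, sd=1) * 0.6                      # rectangle approximation, about 0.078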

# Aside: a GLM with the Gaussian family
glm(y~x,family = "gaussian",data=data4b)
## 
## Call:  glm(formula = y ~ x, family = "gaussian", data = data4b)
## 
## Coefficients:
## (Intercept)            x  
##       23.34        48.15  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  98 Residual
## Null Deviance:       29070 
## Residual Deviance: 20910     AIC: 824.1
# Aside: linear regression
(model.lm <- lm(y~x,data=data4b))
## 
## Call:
## lm(formula = y ~ x, data = data4b)
## 
## Coefficients:
## (Intercept)            x  
##       23.34        48.15
AIC(model.lm)
## [1] 824.0872
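
# As expected, the Gaussian GLM and lm() agree: identical coefficients and (up
# to rounding in the printout) the same AIC, since least squares is maximum
# likelihood under a normal error model.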
# Aside: multiple regression
(model.lm <- lm(y~x+A,data=data4b))
## 
## Call:
## lm(formula = y ~ x + A, data = data4b)
## 
## Coefficients:
## (Intercept)            x            A  
##     -23.464       49.317        4.413
AIC(model.lm)
## [1] 670.0779

6.8 GLMs with a gamma distribution

# p.138
# gamma dist.
x <- seq(0,5,0.01)
dgamma(x,1,1)     %T>% plot(x,.,type="l",main="(A) r=s=1") %>% mean

## [1] 0.1992623
dgamma(x,5,5)     %T>% plot(x,.,type="l",main="(B) r=s=5") %>% mean

## [1] 0.1996007
dgamma(x,0.1,0.1) %T>% plot(x,.,type="l",main="(C) r=s=0.1") %>% mean

## [1] Inf
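
# Note on the mean() calls above: they average the density values on the grid
# (hence Inf where the density diverges at x=0). The distributions themselves
# all have mean r/s = 1, e.g.:
integrate(function(z) z * dgamma(z, shape=5, rate=5), 0, Inf) # ~ 1
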
load(paste0(ws_dir,"d.rdata"))
dim(d)
## [1] 50  2
head(d)
##            x            y
## 1 0.00100000 0.0008873584
## 2 0.01730612 0.0234652087
## 3 0.03361224 0.0698755633
## 4 0.04991837 0.0343402528
## 5 0.06622449 0.0265204047
## 6 0.08253061 0.1592148027
qplot(data=d,x,y,size=I(5))

(model.glmGamma <- glm(y~log(x),family=Gamma(link="log"), data=d))
## 
## Call:  glm(formula = y ~ log(x), family = Gamma(link = "log"), data = d)
## 
## Coefficients:
## (Intercept)       log(x)  
##     -1.0403       0.6833  
## 
## Degrees of Freedom: 49 Total (i.e. Null);  48 Residual
## Null Deviance:       35.37 
## Residual Deviance: 17.25     AIC: -110.9
pred <- predict(model.glmGamma,d,type="response")
d$pred <- pred
qplot(data=d,x,y=y,size=I(5))+geom_line(aes(y=pred))
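
# With the log link the fitted mean is mu = exp(-1.0403 + 0.6833*log(x)), i.e.
# a power law exp(-1.0403) * x^0.6833. Quick check against predict():
cf <- coef(model.glmGamma)
all.equal(unname(exp(cf[1]) * d$x^cf[2]), unname(pred)) # should be TRUE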

Thoughts on Chapter 6