二つのランダムウォークを生成

df<-data.frame(a=cumsum(rnorm(n = 100,mean = 0,sd = 1)),b=cumsum(rnorm(n = 100,mean = 0,sd = 1))) 
plot(df$a,type="l")

plot(df$b,type="l")

b はポアソン分布のλ にするので、>=0

df$b <- ifelse(df$b<0,0,df$b)

ランダムウォークなのにbはa に強い相関がある。

res_1 <- lm(formula = b~a,data = df)
summary(res_1)
## 
## Call:
## lm(formula = b ~ a, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5352 -0.8150 -0.2839  0.8532  2.6118 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.36341    0.14268   9.556 1.12e-15 ***
## a           -0.11078    0.02719  -4.074 9.39e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.104 on 98 degrees of freedom
## Multiple R-squared:  0.1448, Adjusted R-squared:  0.1361 
## F-statistic:  16.6 on 1 and 98 DF,  p-value: 9.392e-05

ランダムウォークにそってポアソン分布に沿ったカウントデータを生成

rpois_r<- function(x){
  return(rpois(1,x))
}

df$c <- sapply(df$b, rpois_r)

ポアソン回帰

あたりまえだけどcはaに強い相関がある。

res_2 <- glm(formula = c~a,data = df,family=poisson(link="identity"))
summary(res_2)
## 
## Call:
## glm(formula = c ~ a, family = poisson(link = "identity"), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8317  -1.5062  -0.9487   0.4556   3.5778  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.32752    0.14724   9.016  < 2e-16 ***
## a           -0.10450    0.02111  -4.951 7.38e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 238.22  on 99  degrees of freedom
## Residual deviance: 215.67  on 98  degrees of freedom
## AIC: 317.51
## 
## Number of Fisher Scoring iterations: 4

ランダムウォーク二つ作って回帰したらどれくらいの頻度で有意な関連があると判断するか?

library(broom)
## Warning: package 'broom' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
fun_sim <- function(){
  df<-data.frame(a=cumsum(rnorm(n = 100,mean = 0,sd = 1)),b=cumsum(rnorm(n = 100,mean = 0,sd = 1))) 
#  df$b <- ifelse(df$b<0,0,df$b)
  res_1 <- lm(formula = b~a,data = df)
  res_1 <- broom::tidy(res_1) %>%filter(term=="a")
  return(res_1)
}

res_tab <- fun_sim()
for(i in 1:10000){
  res_tab <- dplyr::bind_rows(res_tab,fun_sim())
}

res_tab %>% pull(estimate) %>% hist()

res_tab %>% pull(p.value) %>%  hist()

res_tab %>% mutate(significant = p.value <= 0.05) %>% group_by(significant) %>% summarise(n = n())
## # A tibble: 2 x 2
##   significant     n
##   <lgl>       <int>
## 1 FALSE        2431
## 2 TRUE         7570

以上から言いたい点は