처음 써봄

lavaan 패키지가 너무 느려서 직접 bootstrapping 함수를 짜서 해보았다. 결과는 단순 매개 모델도 시간이 너무 걸렸다. 아마도 repeat함수를 통해 loop를 사용했기 때문인것 같다. 그런데 복원 추출후 반복 계산하는 bootstrapping이 어떻게 루프를 안쓰고 할 수 있을지 고민 해야겠다.

#단순 선형회귀 일단 선형회귀를 계산하였다.

##boot
library(stats)

x<-rnorm(100,0,1)
y<-x+rnorm(100,0,1)
da<-data.frame(x,x+rnorm(100,0,.00001))
colnames(da)<-c("x","y")
summary(lm(y~x,da))
## 
## Call:
## lm(formula = y ~ x, data = da)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.798e-05 -7.571e-06  2.321e-07  8.025e-06  2.291e-05 
## 
## Coefficients:
##              Estimate Std. Error   t value Pr(>|t|)    
## (Intercept) 1.437e-07  1.070e-06 1.340e-01    0.893    
## x           1.000e+00  1.322e-06 7.565e+05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.044e-05 on 98 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 5.723e+11 on 1 and 98 DF,  p-value: < 2.2e-16

다음으로 복원추출을 통해 data를 만든다음 각각의 data에서 회귀분석을 하였다. 이때 나온 회귀 계수는 당연히 ’bootnum’개이다.

### no covariance
boot2<-function(xxx,yyy,d,bootnum){
  boot1<-function(xxx,yyy,d){
  n<-sample(1:nrow(d),nrow(d),replace = T)
  nnk<-d[n,]
  nnk<-as.data.frame(nnk)
  k<-lm(nnk[,yyy]~ nnk[,xxx], data=nnk)
  s<-summary(k)
  coe<-c(s$coefficients[2,1])
  coe}
  k<-1
  l<-c(1:bootnum)
  repeat{
    l[k]<-boot1(xxx,yyy,d)
    k<-k+1
    if(k>=bootnum+1) break
  }
  estimates<-list(l)
  ci<-quantile(l,probs = c(0.05,0.1,0.9,0.95))
  ci
}
boot2(1,2,da,5000)
   5%       10%       90%       95% 

0.9999983 0.9999987 1.0000019 1.0000024

아래는 Hayes의 4번 모델인 단순 회귀를 bootstrapping으로 계산 하였다. 아래 결과는 간접경로의 결과값을 나타낸다.

#boot_model_4

x<-rnorm(1000,0,1)
m<-x+rnorm(1000,0,1)
y<-x+m+rnorm(1000,0,1)
da<-data.frame(x,m,y)
summary(lm(m~x,da))
## 
## Call:
## lm(formula = m ~ x, data = da)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0314 -0.7040  0.0079  0.7004  3.1597 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.01382    0.03276   0.422    0.673    
## x            0.99612    0.03295  30.232   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.034 on 998 degrees of freedom
## Multiple R-squared:  0.478,  Adjusted R-squared:  0.4775 
## F-statistic:   914 on 1 and 998 DF,  p-value: < 2.2e-16
summary(lm(y~x+m,da))
## 
## Call:
## lm(formula = y ~ x + m, data = da)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.86648 -0.70017 -0.01461  0.64208  2.73841 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.03660    0.03194   1.146    0.252    
## x            1.06327    0.04447  23.911   <2e-16 ***
## m            0.94807    0.03086  30.717   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.009 on 997 degrees of freedom
## Multiple R-squared:  0.8295, Adjusted R-squared:  0.8291 
## F-statistic:  2424 on 2 and 997 DF,  p-value: < 2.2e-16
##indirect_result_only
boot2<-function(xxx,mmm,yyy,d,bootnum){
  ###estimate a*m
  boot1<-function(xxx,mmm,yyy,d){
    n<-sample(1:nrow(d),nrow(d),replace = T)
    nnk<-d[n,]
    nnk<-as.data.frame(nnk)
    k1<-lm(nnk[,mmm]~ nnk[,xxx], data=nnk)
    k2<-lm(nnk[,yyy]~ nnk[,xxx] + nnk[,mmm], data=nnk)
    s1<-summary(k1)
    s2<-summary(k2)
    coea<-c(s1$coefficients[2,1])
    coem<-c(s2$coefficients[3,1])
    dididi<-c(s2$coefficients[2,1])
    coea*coem}
  k<-1
  l<-c(1:bootnum)
  repeat{
    l[k]<-boot1(xxx,mmm,yyy,d)
    k<-k+1
    if(k>=bootnum+1) break
  }
  estimates<-list(l)
  ci<-quantile(l,probs = c(0.05,0.1,0.9,0.95))
  kmkmkmkm<-list(c(mean(l),sd(l)),  ci)
  names(kmkmkmkm)<-c("effect_bootSE", "CI")
  kmkmkmkm
}

boot2(1,2,3,da,5000)
## $effect_bootSE
## [1] 0.94466955 0.04402422
## 
## $CI
##        5%       10%       90%       95% 
## 0.8729001 0.8885134 1.0015652 1.0193090

아래는 Hayes의 4번 모델에서 나올 수 있는 3가지 CI인 간접효과, 직접효과, 총효과 모두를 보이게 했다.

##total_result
boot2<-function(xxx,mmm,yyy,d,bootnum){
  ###estimate a*m
  boot1<-function(xxx,mmm,yyy,d){
    n<-sample(1:nrow(d),nrow(d),replace = T)
    nnk<-d[n,]
    nnk<-as.data.frame(nnk)
    k1<-lm(nnk[,mmm]~ nnk[,xxx], data=nnk)
    k2<-lm(nnk[,yyy]~ nnk[,xxx] + nnk[,mmm], data=nnk)
    s1<-summary(k1)
    s2<-summary(k2)
    coea<-c(s1$coefficients[2,1])
    coem<-c(s2$coefficients[3,1])
    dididi<-c(s2$coefficients[2,1])
    tototo<-c(coea*coem + dididi)
    eff<-data.frame(coea*coem,dididi, tototo)
    colnames(eff)<-c("indirect", "direct", "total")
    eff}
  k<-1
  l<-matrix(rep(NA,3*bootnum),ncol = 3)
  l<-as.data.frame(l)
  repeat{
    l[k,]<-boot1(xxx,mmm,yyy,d)
    k<-k+1
    if(k>=bootnum+1) break
  }
  estimates<-list(l)
  ci1<-quantile(l[,1],probs = c(0.01,0.05,0.10,0.90,0.95,0.99))
  ci2<-quantile(l[,2],probs = c(0.01,0.05,0.10,0.90,0.95,0.99))
  ci3<-quantile(l[,3],probs = c(0.01,0.05,0.10,0.90,0.95,0.99))
  kmkmkmkm<-list(c(mean(l[,1]),sd(l[,1])),ci1,c(mean(l[,2]),sd(l[,2])),ci2, c(mean(l[,3]),sd(l[,3])),ci3)
  names(kmkmkmkm)<-c("indirecteffect_mean_BootSE", "indirecteffect_CI","directeffect_mean_BootSE", "directeffect_CI",
                     "totaleffect_mean_BootSE", "totaleffect_CI")
  kmkmkmkm
}

boot2(1,2,3,da,5000)
## $indirecteffect_mean_BootSE
## [1] 0.94438464 0.04379965
## 
## $indirecteffect_CI
##        1%        5%       10%       90%       95%       99% 
## 0.8468481 0.8734130 0.8896360 1.0003082 1.0169890 1.0507298 
## 
## $directeffect_mean_BootSE
## [1] 1.06406672 0.04490338
## 
## $directeffect_CI
##        1%        5%       10%       90%       95%       99% 
## 0.9611508 0.9899341 1.0058839 1.1212710 1.1377543 1.1667712 
## 
## $totaleffect_mean_BootSE
## [1] 2.00845137 0.04809432
## 
## $totaleffect_CI
##       1%       5%      10%      90%      95%      99% 
## 1.899069 1.929182 1.945998 2.069318 2.087176 2.119893

하지만 애초에 이 스크립트를 쓴 이유가 lavaan패키지가 너무 느려서인데 속도는 거의 변하지 않은 것 같다.

#개느리다
system.time(boot2(3,2,1,da,5000))
##    user  system elapsed 
##   18.10    0.18   18.42