lavaan 패키지가 너무 느려서 직접 bootstrapping 함수를 짜서 해보았다. 결과는 단순 매개 모델도 시간이 너무 걸렸다. 아마도 repeat함수를 통해 loop를 사용했기 때문인것 같다. 그런데 복원 추출후 반복 계산하는 bootstrapping이 어떻게 루프를 안쓰고 할 수 있을지 고민 해야겠다.
#단순 선형회귀 일단 선형회귀를 계산하였다.
##boot
library(stats)
x<-rnorm(100,0,1)
y<-x+rnorm(100,0,1)
da<-data.frame(x,x+rnorm(100,0,.00001))
colnames(da)<-c("x","y")
summary(lm(y~x,da))
##
## Call:
## lm(formula = y ~ x, data = da)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.798e-05 -7.571e-06 2.321e-07 8.025e-06 2.291e-05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.437e-07 1.070e-06 1.340e-01 0.893
## x 1.000e+00 1.322e-06 7.565e+05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.044e-05 on 98 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 5.723e+11 on 1 and 98 DF, p-value: < 2.2e-16
다음으로 복원추출을 통해 data를 만든다음 각각의 data에서 회귀분석을 하였다. 이때 나온 회귀 계수는 당연히 ’bootnum’개이다.
### no covariance
boot2<-function(xxx,yyy,d,bootnum){
boot1<-function(xxx,yyy,d){
n<-sample(1:nrow(d),nrow(d),replace = T)
nnk<-d[n,]
nnk<-as.data.frame(nnk)
k<-lm(nnk[,yyy]~ nnk[,xxx], data=nnk)
s<-summary(k)
coe<-c(s$coefficients[2,1])
coe}
k<-1
l<-c(1:bootnum)
repeat{
l[k]<-boot1(xxx,yyy,d)
k<-k+1
if(k>=bootnum+1) break
}
estimates<-list(l)
ci<-quantile(l,probs = c(0.05,0.1,0.9,0.95))
ci
}
boot2(1,2,da,5000)
5% 10% 90% 95%
0.9999983 0.9999987 1.0000019 1.0000024
아래는 Hayes의 4번 모델인 단순 회귀를 bootstrapping으로 계산 하였다. 아래 결과는 간접경로의 결과값을 나타낸다.
#boot_model_4
x<-rnorm(1000,0,1)
m<-x+rnorm(1000,0,1)
y<-x+m+rnorm(1000,0,1)
da<-data.frame(x,m,y)
summary(lm(m~x,da))
##
## Call:
## lm(formula = m ~ x, data = da)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0314 -0.7040 0.0079 0.7004 3.1597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.01382 0.03276 0.422 0.673
## x 0.99612 0.03295 30.232 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.034 on 998 degrees of freedom
## Multiple R-squared: 0.478, Adjusted R-squared: 0.4775
## F-statistic: 914 on 1 and 998 DF, p-value: < 2.2e-16
summary(lm(y~x+m,da))
##
## Call:
## lm(formula = y ~ x + m, data = da)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.86648 -0.70017 -0.01461 0.64208 2.73841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03660 0.03194 1.146 0.252
## x 1.06327 0.04447 23.911 <2e-16 ***
## m 0.94807 0.03086 30.717 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.009 on 997 degrees of freedom
## Multiple R-squared: 0.8295, Adjusted R-squared: 0.8291
## F-statistic: 2424 on 2 and 997 DF, p-value: < 2.2e-16
##indirect_result_only
boot2<-function(xxx,mmm,yyy,d,bootnum){
###estimate a*m
boot1<-function(xxx,mmm,yyy,d){
n<-sample(1:nrow(d),nrow(d),replace = T)
nnk<-d[n,]
nnk<-as.data.frame(nnk)
k1<-lm(nnk[,mmm]~ nnk[,xxx], data=nnk)
k2<-lm(nnk[,yyy]~ nnk[,xxx] + nnk[,mmm], data=nnk)
s1<-summary(k1)
s2<-summary(k2)
coea<-c(s1$coefficients[2,1])
coem<-c(s2$coefficients[3,1])
dididi<-c(s2$coefficients[2,1])
coea*coem}
k<-1
l<-c(1:bootnum)
repeat{
l[k]<-boot1(xxx,mmm,yyy,d)
k<-k+1
if(k>=bootnum+1) break
}
estimates<-list(l)
ci<-quantile(l,probs = c(0.05,0.1,0.9,0.95))
kmkmkmkm<-list(c(mean(l),sd(l)), ci)
names(kmkmkmkm)<-c("effect_bootSE", "CI")
kmkmkmkm
}
boot2(1,2,3,da,5000)
## $effect_bootSE
## [1] 0.94466955 0.04402422
##
## $CI
## 5% 10% 90% 95%
## 0.8729001 0.8885134 1.0015652 1.0193090
아래는 Hayes의 4번 모델에서 나올 수 있는 3가지 CI인 간접효과, 직접효과, 총효과 모두를 보이게 했다.
##total_result
boot2<-function(xxx,mmm,yyy,d,bootnum){
###estimate a*m
boot1<-function(xxx,mmm,yyy,d){
n<-sample(1:nrow(d),nrow(d),replace = T)
nnk<-d[n,]
nnk<-as.data.frame(nnk)
k1<-lm(nnk[,mmm]~ nnk[,xxx], data=nnk)
k2<-lm(nnk[,yyy]~ nnk[,xxx] + nnk[,mmm], data=nnk)
s1<-summary(k1)
s2<-summary(k2)
coea<-c(s1$coefficients[2,1])
coem<-c(s2$coefficients[3,1])
dididi<-c(s2$coefficients[2,1])
tototo<-c(coea*coem + dididi)
eff<-data.frame(coea*coem,dididi, tototo)
colnames(eff)<-c("indirect", "direct", "total")
eff}
k<-1
l<-matrix(rep(NA,3*bootnum),ncol = 3)
l<-as.data.frame(l)
repeat{
l[k,]<-boot1(xxx,mmm,yyy,d)
k<-k+1
if(k>=bootnum+1) break
}
estimates<-list(l)
ci1<-quantile(l[,1],probs = c(0.01,0.05,0.10,0.90,0.95,0.99))
ci2<-quantile(l[,2],probs = c(0.01,0.05,0.10,0.90,0.95,0.99))
ci3<-quantile(l[,3],probs = c(0.01,0.05,0.10,0.90,0.95,0.99))
kmkmkmkm<-list(c(mean(l[,1]),sd(l[,1])),ci1,c(mean(l[,2]),sd(l[,2])),ci2, c(mean(l[,3]),sd(l[,3])),ci3)
names(kmkmkmkm)<-c("indirecteffect_mean_BootSE", "indirecteffect_CI","directeffect_mean_BootSE", "directeffect_CI",
"totaleffect_mean_BootSE", "totaleffect_CI")
kmkmkmkm
}
boot2(1,2,3,da,5000)
## $indirecteffect_mean_BootSE
## [1] 0.94438464 0.04379965
##
## $indirecteffect_CI
## 1% 5% 10% 90% 95% 99%
## 0.8468481 0.8734130 0.8896360 1.0003082 1.0169890 1.0507298
##
## $directeffect_mean_BootSE
## [1] 1.06406672 0.04490338
##
## $directeffect_CI
## 1% 5% 10% 90% 95% 99%
## 0.9611508 0.9899341 1.0058839 1.1212710 1.1377543 1.1667712
##
## $totaleffect_mean_BootSE
## [1] 2.00845137 0.04809432
##
## $totaleffect_CI
## 1% 5% 10% 90% 95% 99%
## 1.899069 1.929182 1.945998 2.069318 2.087176 2.119893
하지만 애초에 이 스크립트를 쓴 이유가 lavaan패키지가 너무 느려서인데 속도는 거의 변하지 않은 것 같다.
#개느리다
system.time(boot2(3,2,1,da,5000))
## user system elapsed
## 18.10 0.18 18.42