dta2 <- read.table("physical_activity.txt", h=T)

head(dta2)
##   bmi count
## 1  22     0
## 2  31     0
## 3  29     0
## 4  33     0
## 5  18     0
## 6  25     0
str(dta2)
## 'data.frame':    357 obs. of  2 variables:
##  $ bmi  : int  22 31 29 33 18 25 28 30 29 16 ...
##  $ count: int  0 0 0 0 0 0 0 0 0 0 ...

1 numerical summaries

knitr::kable(cbind(aggregate(count ~ bmi, data=dta2, FUN=mean),
                   var=aggregate(count ~ bmi, data=dta2, FUN=var)[,2],
                   zeros=aggregate(I(count < 1)~bmi, data=dta2, FUN=sum)[,2]))
bmi count var zeros
16 0.0000000 0.0000000 2
17 2.2500000 10.9166667 2
18 1.0000000 3.0000000 2
19 1.4285714 5.9523810 5
20 0.3333333 1.0000000 8
21 0.4166667 0.9924242 10
22 1.1428571 3.1285714 14
23 0.8400000 3.0566667 19
24 0.6666667 2.9230769 22
25 1.1034483 4.6674877 21
26 0.5925926 2.1737892 20
27 0.5952381 2.2955865 35
28 0.5454545 2.0681818 28
29 0.3428571 0.7613445 29
30 0.1379310 0.5517241 28
31 0.1666667 0.6666667 23
32 0.2941176 1.4705882 16
33 0.6000000 1.8000000 4
34 0.0000000 0.0000000 2
35 0.0000000 0.0000000 2
37 0.0000000 NA 1
39 0.0000000 NA 1

表呈現多數bmi的sd遠大於mean,所以不適合poisson,且count為0的數量不少,要考慮zero-inflated的效應

2 histogram

library(ggplot2)
ggplot(dta2, aes(x=count, y=bmi))+
  geom_bar(stat="identity")+
  labs(x="count", y="bmi")+
  theme_minimal()+
  theme(legend.position = c(.1,.9))

呈現多數的bmi的count都是0

3 model

library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(MASS)
#negbin
summary(m0_dta2 <- glm.nb(count~bmi, data=dta2))
## 
## Call:
## glm.nb(formula = count ~ bmi, data = dta2, init.theta = 0.1174501232, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8301  -0.6626  -0.6030  -0.5424   1.7063  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  2.83126    1.18474   2.390  0.01686 * 
## bmi         -0.13087    0.04492  -2.914  0.00357 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1175) family taken to be 1)
## 
##     Null deviance: 165.11  on 356  degrees of freedom
## Residual deviance: 157.59  on 355  degrees of freedom
## AIC: 608.86
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.1175 
##           Std. Err.:  0.0215 
## 
##  2 x log-likelihood:  -602.8550
#zeroinfl
summary(m1_dta2 <- zeroinfl(count ~ bmi, data=dta2))
## 
## Call:
## zeroinfl(formula = count ~ bmi, data = dta2)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7046 -0.4250 -0.3568 -0.2989  4.8343 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  1.73817    0.53913   3.224  0.00126 **
## bmi         -0.02276    0.02165  -1.051  0.29321   
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -1.51385    0.95813  -1.580  0.11411   
## bmi          0.11588    0.03736   3.101  0.00193 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 7 
## Log-likelihood: -281.4 on 4 Df
#model comparison
vuong(m0_dta2, m1_dta2)
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -3.301334 model2 > model1 0.00048113
## AIC-corrected         -2.971763 model2 > model1 0.00148047
## BIC-corrected         -2.332769 model2 > model1 0.00983014
show(m0_est <- cbind(Estimate=coef(m0_dta2), confint(m0_dta2)))
## Waiting for profiling to be done...
##               Estimate      2.5 %      97.5 %
## (Intercept)  2.8312578  0.3797748  5.48739166
## bmi         -0.1308724 -0.2303162 -0.03665307

in m1_dta2,bmi增加1單位,fake pysical activity count減少exp(0.02276);real pysical activity count 增加exp(0.11588)