The data were collected during face-to-face home-based interviews with 357 female participants. Each participant’s height, weight as well as her answer to the following question were recorded. “Vigorous physical activities usually make you breathe hard or feel tired most of the time. Examples of vigorous activities include: jogging, fast dancing, soccer, fast swimming, fast biking, and Stairmaster. How many days in a typical week do you do vigorous physical activities for 20 minutes or more?”
BMI was calculated using the Quetelet index (kg/m2), which is considered both a convenient and reliable indicator of overweight and obesity.

Column 1: Participant’s BMI index
Column 2: Number of times of vigorous activities per week

1 load package

pacman::p_load(tidyverse, MASS, pscl)

2 Input data

dta2 <- read.table("C:/Users/USER/Desktop/homework/physical_activity.txt", h=T)

# first 6 lines
head(dta2)
##   bmi count
## 1  22     0
## 2  31     0
## 3  29     0
## 4  33     0
## 5  18     0
## 6  25     0
# check data structure
str(dta2)
## 'data.frame':    357 obs. of  2 variables:
##  $ bmi  : int  22 31 29 33 18 25 28 30 29 16 ...
##  $ count: int  0 0 0 0 0 0 0 0 0 0 ...
t<-rbind(colMeans(dta2), apply(dta2, 2, FUN=var))
rownames(t)<-c("ave", "var")
t
##          bmi     count
## ave 26.44538 0.5938375
## var 14.88816 2.2194001

3 plot

ggplot(dta2, aes(x=bmi, y=count))+
  geom_bar(stat="identity")+
  labs(x="BMI", y="Count")+
  theme_minimal()+
  theme(legend.position = c(.1,.9))

#Poisson fit
ggplot(dta2, aes(bmi, count))+
  geom_jitter(alpha=.2)+
  stat_smooth(method = "glm", method.args = list(family=poisson))+
  labs(y="Count of BMI",
       x="BMI")
## `geom_smooth()` using formula 'y ~ x'

圖表顯示,受訪者每週的劇烈運動次數多為“0”,但趨勢是“BMI低者,劇烈運動次數愈多次”

summary(m0_dta2<-glm(count~bmi, data=dta2, family="poisson"))
## 
## Call:
## glm(formula = count ~ bmi, family = "poisson", data = dta2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8404  -1.1270  -0.9570  -0.8127   4.8654  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.27048    0.43383   5.234 1.66e-07 ***
## bmi         -0.10898    0.01729  -6.302 2.94e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 795.57  on 356  degrees of freedom
## Residual deviance: 756.44  on 355  degrees of freedom
## AIC: 946.99
## 
## Number of Fisher Scoring iterations: 6
summary(m1_dta2<-zeroinfl(count~bmi,data=dta2))
## 
## Call:
## zeroinfl(formula = count ~ bmi, data = dta2)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7046 -0.4250 -0.3568 -0.2989  4.8343 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  1.73817    0.53913   3.224  0.00126 **
## bmi         -0.02276    0.02165  -1.051  0.29321   
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -1.51385    0.95813  -1.580  0.11411   
## bmi          0.11588    0.03736   3.101  0.00193 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 7 
## Log-likelihood: -281.4 on 4 Df
vuong(m0_dta2, m1_dta2)
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -7.822786 model2 > model1 2.5834e-15
## AIC-corrected         -7.740484 model2 > model1 4.9520e-15
## BIC-corrected         -7.580912 model2 > model1 1.7157e-14
#parameter estimates
show(m0_est<-cbind(Estimate=coef(m0_dta2), confint(m0_dta2)))
## Waiting for profiling to be done...
##               Estimate      2.5 %      97.5 %
## (Intercept)  2.2704806  1.4099338  3.11111748
## bmi         -0.1089812 -0.1428471 -0.07502992
show(m1_est<-cbind(Estimate=coef(m1_dta2), confint(m1_dta2)))
##                      Estimate       2.5 %     97.5 %
## count_(Intercept)  1.73816504  0.68148156 2.79484853
## count_bmi         -0.02275706 -0.06519126 0.01967714
## zero_(Intercept)  -1.51384962 -3.39175256 0.36405331
## zero_bmi           0.11588088  0.04264803 0.18911372
exp(m0_est)
##              Estimate     2.5 %     97.5 %
## (Intercept) 9.6840538 4.0956844 22.4461134
## bmi         0.8967472 0.8668866  0.9277157

當BMI增加一單位時,感到呼吸困難或疲倦的次數會增加0.89。

exp(m1_est)
##                    Estimate      2.5 %    97.5 %
## count_(Intercept) 5.6868986 1.97680431 16.360151
## count_bmi         0.9774999 0.93688825  1.019872
## zero_(Intercept)  0.2200612 0.03364965  1.439151
## zero_bmi          1.1228621 1.04357053  1.208178
anova(m0_dta2, m1_dta2)
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: count
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev
## NULL                   356     795.57
## bmi   1    39.13       355     756.44
dta2_m<- data.frame(dta2, 
                     fit0=predict(m0_dta2, type="response"),
                     fit1=predict(m1_dta2, type="response"),
                     r0=resid(m0_dta2, type="pearson"),
                     r1=resid(m1_dta2, type="pearson"))
ggplot(dta2_m, aes(fit0, r0))+
  geom_point(pch=4)+
  geom_point(aes(fit1, r1),pch=1)+
  geom_hline(yintercept = 0)+
  labs(x="Fitted values", y="Residuals")

“x”為m0_dta2
“o”為m1_dta2
圖顯示m0_dta2、m1_dta2兩模型之 fitted values and residuals的關係,m0_dta2的點分布比m1_dta2更接近0,故m1_dta2模型較佳。

4 The end