dta2 <- read.table("physical_activity.txt", h=T)

head(dta2)

##   bmi count
## 1  22     0
## 2  31     0
## 3  29     0
## 4  33     0
## 5  18     0
## 6  25     0

str(dta2)

## 'data.frame':    357 obs. of  2 variables:
##  $ bmi  : int  22 31 29 33 18 25 28 30 29 16 ...
##  $ count: int  0 0 0 0 0 0 0 0 0 0 ...

1 numerical summaries

knitr::kable(cbind(aggregate(count ~ bmi, data=dta2, FUN=mean),
                   var=aggregate(count ~ bmi, data=dta2, FUN=var)[,2],
                   zeros=aggregate(I(count < 1)~bmi, data=dta2, FUN=sum)[,2]))

bmi	count	var	zeros
16	0.0000000	0.0000000	2
17	2.2500000	10.9166667	2
18	1.0000000	3.0000000	2
19	1.4285714	5.9523810	5
20	0.3333333	1.0000000	8
21	0.4166667	0.9924242	10
22	1.1428571	3.1285714	14
23	0.8400000	3.0566667	19
24	0.6666667	2.9230769	22
25	1.1034483	4.6674877	21
26	0.5925926	2.1737892	20
27	0.5952381	2.2955865	35
28	0.5454545	2.0681818	28
29	0.3428571	0.7613445	29
30	0.1379310	0.5517241	28
31	0.1666667	0.6666667	23
32	0.2941176	1.4705882	16
33	0.6000000	1.8000000	4
34	0.0000000	0.0000000	2
35	0.0000000	0.0000000	2
37	0.0000000	NA	1
39	0.0000000	NA	1

表呈現多數bmi的sd遠大於mean，所以不適合poisson，且count為0的數量不少，要考慮zero-inflated的效應

2 histogram

library(ggplot2)
ggplot(dta2, aes(x=count, y=bmi))+
  geom_bar(stat="identity")+
  labs(x="count", y="bmi")+
  theme_minimal()+
  theme(legend.position = c(.1,.9))

呈現多數的bmi的count都是0

3 model

library(pscl)

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

library(MASS)
#negbin
summary(m0_dta2 <- glm.nb(count~bmi, data=dta2))

## 
## Call:
## glm.nb(formula = count ~ bmi, data = dta2, init.theta = 0.1174501232, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8301  -0.6626  -0.6030  -0.5424   1.7063  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  2.83126    1.18474   2.390  0.01686 * 
## bmi         -0.13087    0.04492  -2.914  0.00357 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1175) family taken to be 1)
## 
##     Null deviance: 165.11  on 356  degrees of freedom
## Residual deviance: 157.59  on 355  degrees of freedom
## AIC: 608.86
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.1175 
##           Std. Err.:  0.0215 
## 
##  2 x log-likelihood:  -602.8550

#zeroinfl
summary(m1_dta2 <- zeroinfl(count ~ bmi, data=dta2))

## 
## Call:
## zeroinfl(formula = count ~ bmi, data = dta2)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7046 -0.4250 -0.3568 -0.2989  4.8343 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  1.73817    0.53913   3.224  0.00126 **
## bmi         -0.02276    0.02165  -1.051  0.29321   
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -1.51385    0.95813  -1.580  0.11411   
## bmi          0.11588    0.03736   3.101  0.00193 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 7 
## Log-likelihood: -281.4 on 4 Df

#model comparison
vuong(m0_dta2, m1_dta2)

## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -3.301334 model2 > model1 0.00048113
## AIC-corrected         -2.971763 model2 > model1 0.00148047
## BIC-corrected         -2.332769 model2 > model1 0.00983014

show(m0_est <- cbind(Estimate=coef(m0_dta2), confint(m0_dta2)))

## Waiting for profiling to be done...

##               Estimate      2.5 %      97.5 %
## (Intercept)  2.8312578  0.3797748  5.48739166
## bmi         -0.1308724 -0.2303162 -0.03665307

in m1_dta2,bmi增加1單位，fake pysical activity count減少exp(0.02276);real pysical activity count 增加exp(0.11588)

1102hw2

1 numerical summaries

2 histogram

3 model