############################# Count data modeling: Zero-inflated data #############################
################################# Data from Long and Freese (2001) ################################
####################################Example 1: Bio chemistry Phd ##################################
## Load packages
require(ggplot2)
## Loading required package: ggplot2
require(pscl)
## Loading required package: pscl
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
require(MASS)
## Loading required package: MASS
require(boot)
## Loading required package: boot
## Load data
data(bioChemists) # in pscl
## Look a few lines
head(bioChemists)
## art fem mar kid5 phd ment
## 1 0 Men Married 0 2.52 7
## 2 0 Women Single 0 2.05 6
## 3 0 Women Single 0 3.75 6
## 4 0 Men Married 1 1.18 3
## 5 0 Women Single 0 3.75 26
## 6 0 Women Married 2 3.59 2
#Variables in the datasets (915 obs):
#art: articles in last three years of Ph.D.
#fem: coded one for females
#mar: coded one if married
#kid5: number of children under age five
#phd: prestige of Ph.D. program
#ment: articles by mentor in last three years
# Factor variables
bioChemists <- within(bioChemists, {
fem <- factor(fem)
mar <- factor(mar)
})
## Structure of data and check if fem and mar are factor variables
str(bioChemists)
## 'data.frame': 915 obs. of 6 variables:
## $ art : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fem : Factor w/ 2 levels "Men","Women": 1 2 2 1 2 2 2 1 1 2 ...
## $ mar : Factor w/ 2 levels "Single","Married": 2 1 1 2 1 2 1 2 1 2 ...
## $ kid5: num 0 0 0 1 0 2 0 2 0 0 ...
## $ phd : num 2.52 2.05 3.75 1.18 3.75 ...
## $ ment: int 7 6 6 3 26 2 3 4 6 0 ...
## - attr(*, "datalabel")= chr "Academic Biochemists / S Long"
## - attr(*, "time.stamp")= chr "30 Jan 2001 10:49"
## - attr(*, "formats")= chr [1:6] "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int [1:6] 98 98 98 98 102 98
## - attr(*, "val.labels")= chr [1:6] "" "sexlbl" "marlbl" "" ...
## - attr(*, "var.labels")= chr [1:6] "Articles in last 3 yrs of PhD" "Gender: 1=female 0=male" "Married: 1=yes 0=no" "Number of children < 6" ...
## - attr(*, "version")= int 6
## - attr(*, "label.table")=List of 6
## ..$ marlbl: Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Single" "Married"
## ..$ sexlbl: Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Men" "Women"
## ..$ : NULL
## ..$ : NULL
## ..$ : NULL
## ..$ : NULL
# Summarize data
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
summary(bioChemists)
## art fem mar kid5 phd
## Min. : 0.000 Men :494 Single :309 Min. :0.0000 Min. :0.755
## 1st Qu.: 0.000 Women:421 Married:606 1st Qu.:0.0000 1st Qu.:2.260
## Median : 1.000 Median :0.0000 Median :3.150
## Mean : 1.693 Mean :0.4951 Mean :3.103
## 3rd Qu.: 2.000 3rd Qu.:1.0000 3rd Qu.:3.920
## Max. :19.000 Max. :3.0000 Max. :4.620
## ment
## Min. : 0.000
## 1st Qu.: 3.000
## Median : 6.000
## Mean : 8.767
## 3rd Qu.:12.000
## Max. :77.000
# Summarize DV
describe(bioChemists$art) #variance(3.72) > mean(1.69)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 915 1.69 1.93 1 1.39 1.48 0 19 19 2.51 12.63 0.06
## Histogram with x axis in log10 scale
ggplot(bioChemists, aes(art, fill = fem)) +
geom_histogram() +
scale_x_log10() +
facet_grid(fem ~ ., margins=TRUE, scales="free_y")
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 550 rows containing non-finite values (stat_bin).

## The models you may consider
# 1) OLS Regression: You can try to analyze these data using OLS regression. However, count data are highly non-normal
# 2) Zero-inflated Poisson regression or Hurdle Poisson does better when the data is not overdispersed
# 3) Poisson or negative binomial models might appropriate if there are not excess zeros
# 4) Zero-inflated negative binomial regression or Hurdle NBD when the data are overdispersed and contain excess zeros
## Models ##################################################
# 1) ZIP
mzip <- zeroinfl(art~fem+mar+kid5+phd+ment, data=bioChemists)
summary(mzip)
##
## Call:
## zeroinfl(formula = art ~ fem + mar + kid5 + phd + ment, data = bioChemists)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.3253 -0.8652 -0.2826 0.5404 7.2976
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.640839 0.121307 5.283 1.27e-07 ***
## femWomen -0.209144 0.063405 -3.299 0.000972 ***
## marMarried 0.103750 0.071111 1.459 0.144567
## kid5 -0.143320 0.047429 -3.022 0.002513 **
## phd -0.006166 0.031008 -0.199 0.842376
## ment 0.018098 0.002294 7.888 3.07e-15 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.577060 0.509386 -1.133 0.25728
## femWomen 0.109752 0.280082 0.392 0.69517
## marMarried -0.354018 0.317611 -1.115 0.26501
## kid5 0.217095 0.196483 1.105 0.26920
## phd 0.001275 0.145263 0.009 0.99300
## ment -0.134114 0.045243 -2.964 0.00303 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 19
## Log-likelihood: -1605 on 12 Df
# 2) ZINB
mzinb <- zeroinfl(art~fem+mar+kid5+phd+ment, data=bioChemists, dist = "negbin")
summary(mzinb)
##
## Call:
## zeroinfl(formula = art ~ fem + mar + kid5 + phd + ment, data = bioChemists,
## dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.2942 -0.7601 -0.2909 0.4448 6.4155
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4167466 0.1435964 2.902 0.00371 **
## femWomen -0.1955076 0.0755926 -2.586 0.00970 **
## marMarried 0.0975826 0.0844520 1.155 0.24789
## kid5 -0.1517321 0.0542061 -2.799 0.00512 **
## phd -0.0006998 0.0362697 -0.019 0.98461
## ment 0.0247862 0.0034927 7.097 1.28e-12 ***
## Log(theta) 0.9763577 0.1354696 7.207 5.71e-13 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.19161 1.32280 -0.145 0.88483
## femWomen 0.63587 0.84890 0.749 0.45382
## marMarried -1.49944 0.93866 -1.597 0.11017
## kid5 0.62841 0.44277 1.419 0.15583
## phd -0.03773 0.30801 -0.123 0.90250
## ment -0.88227 0.31622 -2.790 0.00527 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 2.6548
## Number of iterations in BFGS optimization: 27
## Log-likelihood: -1550 on 13 Df
### Test model performance
mzinb0 <- update(mzinb, . ~ 1)
pchisq(2 * (logLik(mzinb) - logLik(mzinb0)), df = 3, lower.tail=FALSE)
## 'log Lik.' 8.138862e-26 (df=13)
# 3) Poisson
mp <- glm(art~fem+mar+kid5+phd+ment, family=poisson, data=bioChemists)
summary(mp)
##
## Call:
## glm(formula = art ~ fem + mar + kid5 + phd + ment, family = poisson,
## data = bioChemists)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5672 -1.5398 -0.3660 0.5722 5.4467
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.304617 0.102981 2.958 0.0031 **
## femWomen -0.224594 0.054613 -4.112 3.92e-05 ***
## marMarried 0.155243 0.061374 2.529 0.0114 *
## kid5 -0.184883 0.040127 -4.607 4.08e-06 ***
## phd 0.012823 0.026397 0.486 0.6271
## ment 0.025543 0.002006 12.733 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1817.4 on 914 degrees of freedom
## Residual deviance: 1634.4 on 909 degrees of freedom
## AIC: 3314.1
##
## Number of Fisher Scoring iterations: 5
# 4) Negative binomial
mnb <- glm.nb(art~fem+mar+kid5+phd+ment, data=bioChemists)
summary(mnb)
##
## Call:
## glm.nb(formula = art ~ fem + mar + kid5 + phd + ment, data = bioChemists,
## init.theta = 2.264387695, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1678 -1.3617 -0.2806 0.4476 3.4524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.256144 0.137348 1.865 0.062191 .
## femWomen -0.216418 0.072636 -2.979 0.002887 **
## marMarried 0.150489 0.082097 1.833 0.066791 .
## kid5 -0.176415 0.052813 -3.340 0.000837 ***
## phd 0.015271 0.035873 0.426 0.670326
## ment 0.029082 0.003214 9.048 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(2.2644) family taken to be 1)
##
## Null deviance: 1109.0 on 914 degrees of freedom
## Residual deviance: 1004.3 on 909 degrees of freedom
## AIC: 3135.9
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 2.264
## Std. Err.: 0.271
##
## 2 x log-likelihood: -3121.917
## Compare models and model fits: 1) Vuong test (Vuong (1989))
#vuong(mzip, mzinb)
vuong(mp, mzip)
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -4.180495 model2 > model1 1.4544e-05
## AIC-corrected -3.638552 model2 > model1 0.00013709
## BIC-corrected -2.332763 model2 > model1 0.00983030
vuong(mnb, mzinb)
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -2.241826 model2 > model1 0.012486
## AIC-corrected -1.015383 model2 > model1 0.154962
## BIC-corrected 1.939686 model1 > model2 0.026209
## Compare models and model fits: 2) Predicting zero publications
# Real % vs Prediction by Poisson model
sum(bioChemists$art == 0)/nrow(bioChemists)
## [1] 0.3005464
## Prediction by Poisson
# predict expected mean count
mp_mu <- mp$fitted.values
# sum the probabilities of a 0 count for each mean
prozero_mp <- dpois(x = 0, lambda = mp_mu)
mean(prozero_mp)
## [1] 0.2092071
## Prediction by negative binomial
# predict expected mean count
mnb_mu <- mnb$fitted.values
# sum the probabilities of a 0 count for each mean
prozero_mnb <- dpois(x = 0, lambda = mnb_mu)
mean(prozero_mnb)
## [1] 0.2126391
## Prediction by ZIP
# predict expected mean count
mzip_mu <- mzip$fitted.values
# sum the probabilities of a 0 count for each mean
prozero_mzip <- dpois(x = 0, lambda = mzip_mu)
mean(prozero_mzip)
## [1] 0.2131843
# Prediction by ZINB
# predict expected mean count
mzinb_mu <- mzinb$fitted.values
# sum the probabilities of a 0 count for each mean
prozero_mzinb <- dpois(x = 0, lambda = mzinb_mu)
mean(prozero_mzinb)
## [1] 0.216411
######################################## Example 2: Shopping data ####################################
## Load data
shopping <- read.csv(file="C:/Users/ehk994/Desktop/Teaching/Fall 2021/IMC465_Marketing Model II/Week 2 - Advanced count data model/shopping.csv", sep=",", header=TRUE, stringsAsFactors=FALSE)
## Check variable names
names(shopping)
## [1] "creditCard" "car" "persons" "male" "count"
## Check data structure and variable formats
str(shopping)
## 'data.frame': 250 obs. of 5 variables:
## $ creditCard: int 0 1 1 1 1 1 1 1 0 1 ...
## $ car : int 0 1 0 1 0 1 0 0 1 1 ...
## $ persons : int 1 1 1 2 1 4 3 4 3 1 ...
## $ male : int 0 0 0 1 0 2 1 3 2 0 ...
## $ count : int 0 0 0 0 1 0 0 0 0 1 ...
shopping$creditCard <- as.factor(shopping$creditCard)
shopping$car <- as.factor(shopping$car)
## Descriptive statistics
library(psych)
describe(shopping)
## vars n mean sd median trimmed mad min max range skew
## creditCard* 1 250 1.86 0.34 2 1.96 0.00 1 2 1 -2.11
## car* 2 250 1.59 0.49 2 1.61 0.00 1 2 1 -0.36
## persons 3 250 2.53 1.11 2 2.54 1.48 1 4 3 0.01
## male 4 250 0.68 0.85 0 0.56 0.00 0 3 3 1.04
## count 5 250 3.30 11.64 0 1.02 0.00 0 149 149 8.88
## kurtosis se
## creditCard* 2.47 0.02
## car* -1.88 0.03
## persons -1.36 0.07
## male 0.19 0.05
## count 99.58 0.74
## Frequency table and bar plot
#one-way
CreditCard <- table(shopping$creditCard)
barplot(CreditCard, main="Have a credit card",
xlab="Number of consumers")

Car <- table(shopping$car)
barplot(Car, main="Has a car",
xlab="Number of Consumers")

## Distribution of an outcome variable
hist(shopping$count)

plot(table(shopping$count))

sum(shopping$count < 1)
## [1] 142
sum(shopping$count > 50)
## [1] 2
## Mean and variance
r <- c(mean(shopping$count), var(shopping$count))
c(mean=r[1], var=r[2], ratio=r[2]/r[1]) # very over-dispersed data
## mean var ratio
## 3.29600 135.37388 41.07217
## 1) Poisson model
sp <- glm(count~., family=poisson, data=shopping)
summary(sp)
##
## Call:
## glm(formula = count ~ ., family = poisson, data = shopping)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.8792 -1.4955 -0.8784 -0.1345 15.6765
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.39007 0.26490 -12.797 < 2e-16 ***
## creditCard1 1.66967 0.23318 7.160 8.05e-13 ***
## car1 0.80009 0.08947 8.943 < 2e-16 ***
## persons 1.07396 0.03900 27.534 < 2e-16 ***
## male -1.70993 0.08140 -21.006 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2958.4 on 249 degrees of freedom
## Residual deviance: 1246.9 on 245 degrees of freedom
## AIC: 1594
##
## Number of Fisher Scoring iterations: 6
## Model fit
with(sp, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE))) # the goodness-of-fit chi-squared test is highly significant; data doesn't fit the model well.
## res.deviance df p
## [1,] 1246.89 245 8.959359e-134
## 2) Negative binomial model
library(MASS)
snb <- glm.nb(count~., data=shopping)
summary(snb)
##
## Call:
## glm.nb(formula = count ~ ., data = shopping, init.theta = 0.5155059824,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7528 -0.9342 -0.6516 -0.0719 4.7688
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.9760 0.4849 -6.138 8.37e-10 ***
## creditCard1 1.5379 0.4035 3.812 0.000138 ***
## car1 0.5179 0.2306 2.245 0.024752 *
## persons 1.0618 0.1108 9.581 < 2e-16 ***
## male -1.8009 0.1839 -9.791 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.5155) family taken to be 1)
##
## Null deviance: 423.64 on 249 degrees of freedom
## Residual deviance: 209.88 on 245 degrees of freedom
## AIC: 809.16
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.5155
## Std. Err.: 0.0811
##
## 2 x log-likelihood: -797.1610
with(snb, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE))) # the goodness-of-fit chi-squared test is not significant; the model does fit the data well.
## res.deviance df p
## [1,] 209.8808 245 0.9493809
# Over-dispersion test: H0: Poisson
library(pscl)
odTest(snb)
## Likelihood ratio test of H0: Poisson, as restricted NB model:
## n.b., the distribution of the test-statistic under H0 is non-standard
## e.g., see help(odTest) for details/references
##
## Critical value of test statistic at the alpha= 0.05 level: 2.7055
## Chi-Square Test Statistic = 786.7939 p-value = < 2.2e-16
# the result is highly significant and thus rejects the H0. Poisson doesn't fit the data.
## 3) Poisson Hurdle model
library(pscl)
hurdlePS <- hurdle(count~., data=shopping)
summary(hurdlePS)
##
## Call:
## hurdle(formula = count ~ ., data = shopping)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -3.1630 -0.7653 -0.4278 -0.1110 24.9101
##
## Count model coefficients (truncated poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.48336 0.34613 -7.175 7.25e-13 ***
## creditCard1 1.83082 0.30899 5.925 3.12e-09 ***
## car1 0.61171 0.09433 6.485 8.90e-11 ***
## persons 0.83838 0.04368 19.194 < 2e-16 ***
## male -1.18301 0.09328 -12.682 < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1728 0.6482 -4.895 9.84e-07 ***
## creditCard1 0.9768 0.4868 2.006 0.04482 *
## car1 0.9530 0.3275 2.910 0.00361 **
## persons 1.1529 0.1970 5.851 4.88e-09 ***
## male -2.2203 0.3231 -6.871 6.36e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 11
## Log-likelihood: -712 on 10 Df
hurdlePS2 <- hurdle(count~., data=shopping, dist = "poisson",
zero.dist = "binomial")
summary(hurdlePS2)
##
## Call:
## hurdle(formula = count ~ ., data = shopping, dist = "poisson", zero.dist = "binomial")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -3.1630 -0.7653 -0.4278 -0.1110 24.9101
##
## Count model coefficients (truncated poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.48336 0.34613 -7.175 7.25e-13 ***
## creditCard1 1.83082 0.30899 5.925 3.12e-09 ***
## car1 0.61171 0.09433 6.485 8.90e-11 ***
## persons 0.83838 0.04368 19.194 < 2e-16 ***
## male -1.18301 0.09328 -12.682 < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1728 0.6482 -4.895 9.84e-07 ***
## creditCard1 0.9768 0.4868 2.006 0.04482 *
## car1 0.9530 0.3275 2.910 0.00361 **
## persons 1.1529 0.1970 5.851 4.88e-09 ***
## male -2.2203 0.3231 -6.871 6.36e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 11
## Log-likelihood: -712 on 10 Df
#Compare the current model to a null model without predictors using chi-sqaured test on the difference of log likelihoods
hurdlePSNull <- update (hurdlePS, . ~1)
pchisq(2 * (logLik(hurdlePS) - logLik(hurdlePSNull)), df = 4, lower.tail = FALSE) #since we have four predictors in the full model
## 'log Lik.' 2.274465e-178 (df=10)
## 4) Negative binomial hurdle model
hurdleNBS <- hurdle(count~., data=shopping, dist = "negbin")
summary(hurdleNBS)
##
## Call:
## hurdle(formula = count ~ ., data = shopping, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.78396 -0.52471 -0.30573 -0.08437 12.97201
##
## Count model coefficients (truncated negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1527 0.7474 -4.218 2.46e-05 ***
## creditCard1 1.9626 0.5946 3.300 0.000965 ***
## car1 0.2364 0.3190 0.741 0.458679
## persons 0.9764 0.1443 6.766 1.33e-11 ***
## male -1.1428 0.2967 -3.852 0.000117 ***
## Log(theta) -0.7310 0.4089 -1.788 0.073811 .
## Zero hurdle model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1728 0.6482 -4.895 9.84e-07 ***
## creditCard1 0.9768 0.4868 2.006 0.04482 *
## car1 0.9530 0.3275 2.910 0.00361 **
## persons 1.1529 0.1970 5.851 4.88e-09 ***
## male -2.2203 0.3231 -6.871 6.36e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta: count = 0.4814
## Number of iterations in BFGS optimization: 13
## Log-likelihood: -388.6 on 11 Df
#Compare the current model to a null model without predictors using chi-sqaured test on the difference of log likelihoods
hurdleNBSNull <- update (hurdleNBS, . ~1)
pchisq(2 * (logLik(hurdleNBS) - logLik(hurdleNBSNull)), df = 4, lower.tail = FALSE) #since we have four predictors in the full model
## 'log Lik.' 2.468039e-30 (df=11)
## Which model should we choose?
vuong(sp,hurdlePS)
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -2.352712 model2 > model1 0.0093185
## AIC-corrected -2.205712 model2 > model1 0.0137021
## BIC-corrected -1.946883 model2 > model1 0.0257744
vuong(snb, hurdleNBS)
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -1.5845131 model2 > model1 0.056539
## AIC-corrected -0.7893751 model2 > model1 0.214946
## BIC-corrected 0.6106487 model1 > model2 0.270716
## Compare models and model fits: 2) Predicting zero publications
# Real % vs Prediction by Poisson model
sum(shopping$count == 0)/nrow(shopping)
## [1] 0.568
## Prediction by Poisson
# predict expected mean count
sp_mu <- sp$fitted.values
# sum the probabilities of a 0 count for each mean
prozero_sp <- dpois(x = 0, lambda = sp_mu)
mean(prozero_sp)
## [1] 0.3972047
## Prediction by negative binomial
# predict expected mean count
snb_mu <- snb$fitted.values
# sum the probabilities of a 0 count for each mean
prozero_snb <- dpois(x = 0, lambda = snb_mu)
mean(prozero_snb)
## [1] 0.3851591
## Prediction by Poisson hurdle
# predict expected mean count
hurdlePS_mu <- hurdlePS$fitted.values
# sum the probabilities of a 0 count for each mean
prozero_hurdlePS <- dpois(x = 0, lambda = hurdlePS_mu)
mean(prozero_hurdlePS)
## [1] 0.4261927
# Prediction by ZINB
# predict expected mean count
hurdleNBS_mu <- hurdleNBS$fitted.values
# sum the probabilities of a 0 count for each mean
prozero_hurdleNBS <- dpois(x = 0, lambda = hurdleNBS_mu)
mean(prozero_hurdleNBS)
## [1] 0.3953767