Zero inflated model

############################# Count data modeling: Zero-inflated data #############################
################################# Data from Long and Freese (2001) ################################
####################################Example 1: Bio chemistry Phd ##################################
## Load packages
require(ggplot2)

## Loading required package: ggplot2

require(pscl)

## Loading required package: pscl

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

require(MASS)

## Loading required package: MASS

require(boot)

## Loading required package: boot

## Load data
data(bioChemists) # in pscl

## Look a few lines
head(bioChemists)

##   art   fem     mar kid5  phd ment
## 1   0   Men Married    0 2.52    7
## 2   0 Women  Single    0 2.05    6
## 3   0 Women  Single    0 3.75    6
## 4   0   Men Married    1 1.18    3
## 5   0 Women  Single    0 3.75   26
## 6   0 Women Married    2 3.59    2

#Variables in the datasets (915 obs):

#art: articles in last three years of Ph.D.
#fem: coded one for females
#mar: coded one if married
#kid5: number of children under age five
#phd: prestige of Ph.D. program
#ment: articles by mentor in last three years

# Factor variables
bioChemists <- within(bioChemists, {
  fem <- factor(fem)
  mar <- factor(mar)
})

## Structure of data and check if fem and mar are factor variables
str(bioChemists)

## 'data.frame':    915 obs. of  6 variables:
##  $ art : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fem : Factor w/ 2 levels "Men","Women": 1 2 2 1 2 2 2 1 1 2 ...
##  $ mar : Factor w/ 2 levels "Single","Married": 2 1 1 2 1 2 1 2 1 2 ...
##  $ kid5: num  0 0 0 1 0 2 0 2 0 0 ...
##  $ phd : num  2.52 2.05 3.75 1.18 3.75 ...
##  $ ment: int  7 6 6 3 26 2 3 4 6 0 ...
##  - attr(*, "datalabel")= chr "Academic Biochemists / S Long"
##  - attr(*, "time.stamp")= chr "30 Jan 2001 10:49"
##  - attr(*, "formats")= chr [1:6] "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
##  - attr(*, "types")= int [1:6] 98 98 98 98 102 98
##  - attr(*, "val.labels")= chr [1:6] "" "sexlbl" "marlbl" "" ...
##  - attr(*, "var.labels")= chr [1:6] "Articles in last 3 yrs of PhD" "Gender: 1=female 0=male" "Married: 1=yes 0=no" "Number of children < 6" ...
##  - attr(*, "version")= int 6
##  - attr(*, "label.table")=List of 6
##   ..$ marlbl: Named num [1:2] 0 1
##   .. ..- attr(*, "names")= chr [1:2] "Single" "Married"
##   ..$ sexlbl: Named num [1:2] 0 1
##   .. ..- attr(*, "names")= chr [1:2] "Men" "Women"
##   ..$       : NULL
##   ..$       : NULL
##   ..$       : NULL
##   ..$       : NULL

# Summarize data
library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:boot':
## 
##     logit

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

summary(bioChemists)

##       art            fem           mar           kid5             phd       
##  Min.   : 0.000   Men  :494   Single :309   Min.   :0.0000   Min.   :0.755  
##  1st Qu.: 0.000   Women:421   Married:606   1st Qu.:0.0000   1st Qu.:2.260  
##  Median : 1.000                             Median :0.0000   Median :3.150  
##  Mean   : 1.693                             Mean   :0.4951   Mean   :3.103  
##  3rd Qu.: 2.000                             3rd Qu.:1.0000   3rd Qu.:3.920  
##  Max.   :19.000                             Max.   :3.0000   Max.   :4.620  
##       ment       
##  Min.   : 0.000  
##  1st Qu.: 3.000  
##  Median : 6.000  
##  Mean   : 8.767  
##  3rd Qu.:12.000  
##  Max.   :77.000

# Summarize DV
describe(bioChemists$art) #variance(3.72) > mean(1.69)

##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 915 1.69 1.93      1    1.39 1.48   0  19    19 2.51    12.63 0.06

## Histogram with x axis in log10 scale

ggplot(bioChemists, aes(art, fill = fem)) +
  geom_histogram() +
  scale_x_log10() +
  facet_grid(fem ~ ., margins=TRUE, scales="free_y")

## Warning: Transformation introduced infinite values in continuous x-axis

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 550 rows containing non-finite values (stat_bin).

## The models you may consider

# 1) OLS Regression: You can try to analyze these data using OLS regression. However, count data are highly non-normal

# 2) Zero-inflated Poisson regression or Hurdle Poisson does better when the data is not overdispersed

# 3) Poisson or negative binomial models might appropriate if there are not excess zeros

# 4) Zero-inflated negative binomial regression or Hurdle NBD when the data are overdispersed and contain excess zeros

## Models ##################################################
# 1) ZIP
mzip <- zeroinfl(art~fem+mar+kid5+phd+ment, data=bioChemists)
summary(mzip)

## 
## Call:
## zeroinfl(formula = art ~ fem + mar + kid5 + phd + ment, data = bioChemists)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3253 -0.8652 -0.2826  0.5404  7.2976 
## 
## Count model coefficients (poisson with log link):
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.640839   0.121307   5.283 1.27e-07 ***
## femWomen    -0.209144   0.063405  -3.299 0.000972 ***
## marMarried   0.103750   0.071111   1.459 0.144567    
## kid5        -0.143320   0.047429  -3.022 0.002513 ** 
## phd         -0.006166   0.031008  -0.199 0.842376    
## ment         0.018098   0.002294   7.888 3.07e-15 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.577060   0.509386  -1.133  0.25728   
## femWomen     0.109752   0.280082   0.392  0.69517   
## marMarried  -0.354018   0.317611  -1.115  0.26501   
## kid5         0.217095   0.196483   1.105  0.26920   
## phd          0.001275   0.145263   0.009  0.99300   
## ment        -0.134114   0.045243  -2.964  0.00303 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 19 
## Log-likelihood: -1605 on 12 Df

# 2) ZINB
mzinb <- zeroinfl(art~fem+mar+kid5+phd+ment, data=bioChemists, dist = "negbin")
summary(mzinb)

## 
## Call:
## zeroinfl(formula = art ~ fem + mar + kid5 + phd + ment, data = bioChemists, 
##     dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2942 -0.7601 -0.2909  0.4448  6.4155 
## 
## Count model coefficients (negbin with log link):
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.4167466  0.1435964   2.902  0.00371 ** 
## femWomen    -0.1955076  0.0755926  -2.586  0.00970 ** 
## marMarried   0.0975826  0.0844520   1.155  0.24789    
## kid5        -0.1517321  0.0542061  -2.799  0.00512 ** 
## phd         -0.0006998  0.0362697  -0.019  0.98461    
## ment         0.0247862  0.0034927   7.097 1.28e-12 ***
## Log(theta)   0.9763577  0.1354696   7.207 5.71e-13 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.19161    1.32280  -0.145  0.88483   
## femWomen     0.63587    0.84890   0.749  0.45382   
## marMarried  -1.49944    0.93866  -1.597  0.11017   
## kid5         0.62841    0.44277   1.419  0.15583   
## phd         -0.03773    0.30801  -0.123  0.90250   
## ment        -0.88227    0.31622  -2.790  0.00527 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 2.6548 
## Number of iterations in BFGS optimization: 27 
## Log-likelihood: -1550 on 13 Df

### Test model performance
mzinb0 <- update(mzinb, . ~ 1)

pchisq(2 * (logLik(mzinb) - logLik(mzinb0)), df = 3, lower.tail=FALSE)

## 'log Lik.' 8.138862e-26 (df=13)

# 3) Poisson
mp <- glm(art~fem+mar+kid5+phd+ment, family=poisson, data=bioChemists)
summary(mp)

## 
## Call:
## glm(formula = art ~ fem + mar + kid5 + phd + ment, family = poisson, 
##     data = bioChemists)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5672  -1.5398  -0.3660   0.5722   5.4467  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.304617   0.102981   2.958   0.0031 ** 
## femWomen    -0.224594   0.054613  -4.112 3.92e-05 ***
## marMarried   0.155243   0.061374   2.529   0.0114 *  
## kid5        -0.184883   0.040127  -4.607 4.08e-06 ***
## phd          0.012823   0.026397   0.486   0.6271    
## ment         0.025543   0.002006  12.733  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1817.4  on 914  degrees of freedom
## Residual deviance: 1634.4  on 909  degrees of freedom
## AIC: 3314.1
## 
## Number of Fisher Scoring iterations: 5

# 4) Negative binomial
mnb <- glm.nb(art~fem+mar+kid5+phd+ment, data=bioChemists)
summary(mnb)

## 
## Call:
## glm.nb(formula = art ~ fem + mar + kid5 + phd + ment, data = bioChemists, 
##     init.theta = 2.264387695, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1678  -1.3617  -0.2806   0.4476   3.4524  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.256144   0.137348   1.865 0.062191 .  
## femWomen    -0.216418   0.072636  -2.979 0.002887 ** 
## marMarried   0.150489   0.082097   1.833 0.066791 .  
## kid5        -0.176415   0.052813  -3.340 0.000837 ***
## phd          0.015271   0.035873   0.426 0.670326    
## ment         0.029082   0.003214   9.048  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(2.2644) family taken to be 1)
## 
##     Null deviance: 1109.0  on 914  degrees of freedom
## Residual deviance: 1004.3  on 909  degrees of freedom
## AIC: 3135.9
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  2.264 
##           Std. Err.:  0.271 
## 
##  2 x log-likelihood:  -3121.917

## Compare models and model fits: 1) Vuong test (Vuong (1989))  

#vuong(mzip, mzinb)
vuong(mp, mzip)

## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A    p-value
## Raw                   -4.180495 model2 > model1 1.4544e-05
## AIC-corrected         -3.638552 model2 > model1 0.00013709
## BIC-corrected         -2.332763 model2 > model1 0.00983030

vuong(mnb, mzinb)

## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A  p-value
## Raw                   -2.241826 model2 > model1 0.012486
## AIC-corrected         -1.015383 model2 > model1 0.154962
## BIC-corrected          1.939686 model1 > model2 0.026209

## Compare models and model fits: 2) Predicting zero publications
# Real % vs Prediction by Poisson model

sum(bioChemists$art == 0)/nrow(bioChemists)

## [1] 0.3005464

## Prediction by Poisson
# predict expected mean count
mp_mu <- mp$fitted.values

# sum the probabilities of a 0 count for each mean
prozero_mp <- dpois(x = 0, lambda = mp_mu)

mean(prozero_mp)

## [1] 0.2092071

## Prediction by negative binomial
# predict expected mean count
mnb_mu <- mnb$fitted.values

# sum the probabilities of a 0 count for each mean
prozero_mnb <- dpois(x = 0, lambda = mnb_mu)

mean(prozero_mnb)

## [1] 0.2126391

## Prediction by ZIP
# predict expected mean count
mzip_mu <- mzip$fitted.values

# sum the probabilities of a 0 count for each mean
prozero_mzip <- dpois(x = 0, lambda = mzip_mu)

mean(prozero_mzip)

## [1] 0.2131843

# Prediction by ZINB
# predict expected mean count
mzinb_mu <- mzinb$fitted.values

# sum the probabilities of a 0 count for each mean
prozero_mzinb <- dpois(x = 0, lambda = mzinb_mu)

mean(prozero_mzinb)

## [1] 0.216411

######################################## Example 2: Shopping data ####################################
## Load data
shopping <- read.csv(file="C:/Users/ehk994/Desktop/Teaching/Fall 2021/IMC465_Marketing Model II/Week 2 - Advanced count data model/shopping.csv", sep=",", header=TRUE, stringsAsFactors=FALSE)

## Check variable names
names(shopping)

## [1] "creditCard" "car"        "persons"    "male"       "count"

## Check data structure and variable formats
str(shopping)

## 'data.frame':    250 obs. of  5 variables:
##  $ creditCard: int  0 1 1 1 1 1 1 1 0 1 ...
##  $ car       : int  0 1 0 1 0 1 0 0 1 1 ...
##  $ persons   : int  1 1 1 2 1 4 3 4 3 1 ...
##  $ male      : int  0 0 0 1 0 2 1 3 2 0 ...
##  $ count     : int  0 0 0 0 1 0 0 0 0 1 ...

shopping$creditCard <- as.factor(shopping$creditCard)
shopping$car <- as.factor(shopping$car)

## Descriptive statistics
library(psych)
describe(shopping)

##             vars   n mean    sd median trimmed  mad min max range  skew
## creditCard*    1 250 1.86  0.34      2    1.96 0.00   1   2     1 -2.11
## car*           2 250 1.59  0.49      2    1.61 0.00   1   2     1 -0.36
## persons        3 250 2.53  1.11      2    2.54 1.48   1   4     3  0.01
## male           4 250 0.68  0.85      0    0.56 0.00   0   3     3  1.04
## count          5 250 3.30 11.64      0    1.02 0.00   0 149   149  8.88
##             kurtosis   se
## creditCard*     2.47 0.02
## car*           -1.88 0.03
## persons        -1.36 0.07
## male            0.19 0.05
## count          99.58 0.74

## Frequency table and bar plot
#one-way
CreditCard <- table(shopping$creditCard)

barplot(CreditCard, main="Have a credit card",
   xlab="Number of consumers")

Car <- table(shopping$car)

barplot(Car, main="Has a car",
   xlab="Number of Consumers")

## Distribution of an outcome variable
hist(shopping$count)

plot(table(shopping$count))

sum(shopping$count < 1)

## [1] 142

sum(shopping$count > 50)

## [1] 2

## Mean and variance
r <- c(mean(shopping$count), var(shopping$count))
c(mean=r[1], var=r[2], ratio=r[2]/r[1]) # very over-dispersed data

##      mean       var     ratio 
##   3.29600 135.37388  41.07217

## 1) Poisson model
sp <- glm(count~., family=poisson, data=shopping)
summary(sp)

## 
## Call:
## glm(formula = count ~ ., family = poisson, data = shopping)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.8792  -1.4955  -0.8784  -0.1345  15.6765  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.39007    0.26490 -12.797  < 2e-16 ***
## creditCard1  1.66967    0.23318   7.160 8.05e-13 ***
## car1         0.80009    0.08947   8.943  < 2e-16 ***
## persons      1.07396    0.03900  27.534  < 2e-16 ***
## male        -1.70993    0.08140 -21.006  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2958.4  on 249  degrees of freedom
## Residual deviance: 1246.9  on 245  degrees of freedom
## AIC: 1594
## 
## Number of Fisher Scoring iterations: 6

## Model fit
with(sp, cbind(res.deviance = deviance, df = df.residual,
  p = pchisq(deviance, df.residual, lower.tail=FALSE))) # the goodness-of-fit chi-squared test is highly significant;   data doesn't fit the model well.

##      res.deviance  df             p
## [1,]      1246.89 245 8.959359e-134

## 2) Negative binomial model
library(MASS)

snb <- glm.nb(count~., data=shopping)
summary(snb)

## 
## Call:
## glm.nb(formula = count ~ ., data = shopping, init.theta = 0.5155059824, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7528  -0.9342  -0.6516  -0.0719   4.7688  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.9760     0.4849  -6.138 8.37e-10 ***
## creditCard1   1.5379     0.4035   3.812 0.000138 ***
## car1          0.5179     0.2306   2.245 0.024752 *  
## persons       1.0618     0.1108   9.581  < 2e-16 ***
## male         -1.8009     0.1839  -9.791  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.5155) family taken to be 1)
## 
##     Null deviance: 423.64  on 249  degrees of freedom
## Residual deviance: 209.88  on 245  degrees of freedom
## AIC: 809.16
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.5155 
##           Std. Err.:  0.0811 
## 
##  2 x log-likelihood:  -797.1610

with(snb, cbind(res.deviance = deviance, df = df.residual,
  p = pchisq(deviance, df.residual, lower.tail=FALSE))) # the goodness-of-fit chi-squared test is not significant; the model does fit the data well.

##      res.deviance  df         p
## [1,]     209.8808 245 0.9493809

# Over-dispersion test: H0: Poisson
library(pscl)
odTest(snb)

## Likelihood ratio test of H0: Poisson, as restricted NB model:
## n.b., the distribution of the test-statistic under H0 is non-standard
## e.g., see help(odTest) for details/references
## 
## Critical value of test statistic at the alpha= 0.05 level: 2.7055 
## Chi-Square Test Statistic =  786.7939 p-value = < 2.2e-16

# the result is highly significant and thus rejects the H0. Poisson doesn't fit the data.

## 3) Poisson Hurdle model
library(pscl)

hurdlePS <- hurdle(count~., data=shopping)
summary(hurdlePS)

## 
## Call:
## hurdle(formula = count ~ ., data = shopping)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1630 -0.7653 -0.4278 -0.1110 24.9101 
## 
## Count model coefficients (truncated poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.48336    0.34613  -7.175 7.25e-13 ***
## creditCard1  1.83082    0.30899   5.925 3.12e-09 ***
## car1         0.61171    0.09433   6.485 8.90e-11 ***
## persons      0.83838    0.04368  19.194  < 2e-16 ***
## male        -1.18301    0.09328 -12.682  < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.1728     0.6482  -4.895 9.84e-07 ***
## creditCard1   0.9768     0.4868   2.006  0.04482 *  
## car1          0.9530     0.3275   2.910  0.00361 ** 
## persons       1.1529     0.1970   5.851 4.88e-09 ***
## male         -2.2203     0.3231  -6.871 6.36e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 11 
## Log-likelihood:  -712 on 10 Df

hurdlePS2 <- hurdle(count~., data=shopping, dist = "poisson", 
                          zero.dist = "binomial") 
summary(hurdlePS2)

## 
## Call:
## hurdle(formula = count ~ ., data = shopping, dist = "poisson", zero.dist = "binomial")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1630 -0.7653 -0.4278 -0.1110 24.9101 
## 
## Count model coefficients (truncated poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.48336    0.34613  -7.175 7.25e-13 ***
## creditCard1  1.83082    0.30899   5.925 3.12e-09 ***
## car1         0.61171    0.09433   6.485 8.90e-11 ***
## persons      0.83838    0.04368  19.194  < 2e-16 ***
## male        -1.18301    0.09328 -12.682  < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.1728     0.6482  -4.895 9.84e-07 ***
## creditCard1   0.9768     0.4868   2.006  0.04482 *  
## car1          0.9530     0.3275   2.910  0.00361 ** 
## persons       1.1529     0.1970   5.851 4.88e-09 ***
## male         -2.2203     0.3231  -6.871 6.36e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 11 
## Log-likelihood:  -712 on 10 Df

#Compare the current model to a null model without predictors using chi-sqaured test on the difference of log likelihoods
hurdlePSNull <- update (hurdlePS, . ~1)
pchisq(2 * (logLik(hurdlePS) - logLik(hurdlePSNull)), df = 4, lower.tail = FALSE) #since we have four predictors in the full model

## 'log Lik.' 2.274465e-178 (df=10)

## 4) Negative binomial hurdle model
hurdleNBS <- hurdle(count~., data=shopping, dist = "negbin")
summary(hurdleNBS)

## 
## Call:
## hurdle(formula = count ~ ., data = shopping, dist = "negbin")
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78396 -0.52471 -0.30573 -0.08437 12.97201 
## 
## Count model coefficients (truncated negbin with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.1527     0.7474  -4.218 2.46e-05 ***
## creditCard1   1.9626     0.5946   3.300 0.000965 ***
## car1          0.2364     0.3190   0.741 0.458679    
## persons       0.9764     0.1443   6.766 1.33e-11 ***
## male         -1.1428     0.2967  -3.852 0.000117 ***
## Log(theta)   -0.7310     0.4089  -1.788 0.073811 .  
## Zero hurdle model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.1728     0.6482  -4.895 9.84e-07 ***
## creditCard1   0.9768     0.4868   2.006  0.04482 *  
## car1          0.9530     0.3275   2.910  0.00361 ** 
## persons       1.1529     0.1970   5.851 4.88e-09 ***
## male         -2.2203     0.3231  -6.871 6.36e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta: count = 0.4814
## Number of iterations in BFGS optimization: 13 
## Log-likelihood: -388.6 on 11 Df

#Compare the current model to a null model without predictors using chi-sqaured test on the difference of log likelihoods
hurdleNBSNull <- update (hurdleNBS, . ~1)
pchisq(2 * (logLik(hurdleNBS) - logLik(hurdleNBSNull)), df = 4, lower.tail = FALSE) #since we have four predictors in the full model

## 'log Lik.' 2.468039e-30 (df=11)

## Which model should we choose?
vuong(sp,hurdlePS)

## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A   p-value
## Raw                   -2.352712 model2 > model1 0.0093185
## AIC-corrected         -2.205712 model2 > model1 0.0137021
## BIC-corrected         -1.946883 model2 > model1 0.0257744

vuong(snb, hurdleNBS)

## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A  p-value
## Raw                  -1.5845131 model2 > model1 0.056539
## AIC-corrected        -0.7893751 model2 > model1 0.214946
## BIC-corrected         0.6106487 model1 > model2 0.270716

## Compare models and model fits: 2) Predicting zero publications
# Real % vs Prediction by Poisson model

sum(shopping$count == 0)/nrow(shopping)

## [1] 0.568

## Prediction by Poisson
# predict expected mean count
sp_mu <- sp$fitted.values

# sum the probabilities of a 0 count for each mean
prozero_sp <- dpois(x = 0, lambda = sp_mu)

mean(prozero_sp)

## [1] 0.3972047

## Prediction by negative binomial
# predict expected mean count
snb_mu <- snb$fitted.values

# sum the probabilities of a 0 count for each mean
prozero_snb <- dpois(x = 0, lambda = snb_mu)

mean(prozero_snb)

## [1] 0.3851591

## Prediction by Poisson hurdle
# predict expected mean count
hurdlePS_mu <- hurdlePS$fitted.values

# sum the probabilities of a 0 count for each mean
prozero_hurdlePS <- dpois(x = 0, lambda = hurdlePS_mu)

mean(prozero_hurdlePS)

## [1] 0.4261927

# Prediction by ZINB
# predict expected mean count
hurdleNBS_mu <- hurdleNBS$fitted.values

# sum the probabilities of a 0 count for each mean
prozero_hurdleNBS <- dpois(x = 0, lambda = hurdleNBS_mu)

mean(prozero_hurdleNBS)

## [1] 0.3953767

Zero inflated model

Emily Ko

2021 Fall