knitr::opts_chunk$set(warning=FALSE, message=FALSE)

# if not installed, install package pacman
# install.packages("pacman")

pacman::p_load(janitor,  # for data cleaning
               DataExplorer, # for data summary
               performance, # for regression diagnostic check
               lmtest , # for coeftest
               pscl, #  zero-inflated regression models,  functions hurdle() and zeroinfl()
               tidyverse) # several packages for datascience

df <- read_csv("BryanUKB.csv")

glimpse(df)

## Rows: 1,000
## Columns: 51
## $ ...1      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ IID       <dbl> 3928228, 1011965, 4142719, 3905944, 3796835, 3791122, 388540…
## $ HARD      <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ HG        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ ARRAY     <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ BIRTHYEAR <dbl> 1949, 1954, 1955, 1956, 1958, 1952, 1950, 1967, 1955, 1950, …
## $ SEX       <dbl> 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, …
## $ PC1       <dbl> -9.36831, 87.12410, -13.30580, -10.17860, -12.59400, 4.97751…
## $ PC2       <dbl> 4.856510, -128.922000, 3.795190, 3.461760, 6.783790, -4.9432…
## $ PC3       <dbl> -4.618970, 91.693800, -2.318730, -1.057470, 1.201080, 15.227…
## $ PC4       <dbl> -1.0861200, 26.9482000, 2.0924100, -1.6033300, 0.6135840, -4…
## $ PC5       <dbl> -8.859650, -5.616630, -2.314550, -7.527250, -1.998470, 5.980…
## $ A10398G   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, …
## $ A10550G   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ A11251G   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ A11467G   <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ A11812G   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ A12612G   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ A15924G   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ A3480G    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ A4917G    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ C12705T   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ C14167T   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ C150T     <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ C15452A   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ C16270T   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ C295T     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ G12372A   <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G13708A   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ G14905A   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ G15043A   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, …
## $ G15928A   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G1719A    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ G228A     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ G2706A    <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, …
## $ G3010A    <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, …
## $ G5147A    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G709A     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G73A      <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, …
## $ G8697A    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G9055A    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T10463C   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ T11299C   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T13617C   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T14798C   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ T16362C   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ T3197C    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T4216C    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, …
## $ T6776C    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T7028C    <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, …
## $ T9698C    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Question

So the question Im trying to answer for the project is if there is any correlation between variants in mitochondrial genes and hard cases of CAD. The main issue here is when I ran my code in python there were a lot of 0s indicating no cases of CAD and that ended up skewing my results which was not good thats why we talked about zero inflated distribution and how that might help us because as we discussed earlier we cant just throw out or put in data to allow for a more balanced picture.

names(df)

##  [1] "...1"      "IID"       "HARD"      "HG"        "ARRAY"     "BIRTHYEAR"
##  [7] "SEX"       "PC1"       "PC2"       "PC3"       "PC4"       "PC5"      
## [13] "A10398G"   "A10550G"   "A11251G"   "A11467G"   "A11812G"   "A12612G"  
## [19] "A15924G"   "A3480G"    "A4917G"    "C12705T"   "C14167T"   "C150T"    
## [25] "C15452A"   "C16270T"   "C295T"     "G12372A"   "G13708A"   "G14905A"  
## [31] "G15043A"   "G15928A"   "G1719A"    "G228A"     "G2706A"    "G3010A"   
## [37] "G5147A"    "G709A"     "G73A"      "G8697A"    "G9055A"    "T10463C"  
## [43] "T11299C"   "T13617C"   "T14798C"   "T16362C"   "T3197C"    "T4216C"   
## [49] "T6776C"    "T7028C"    "T9698C"

Change names

df <- df %>% 
  janitor::clean_names()

Uncomment the following lines to obtain an EDA report

# df %>% 
#  select(-c(x1, iid, pc1, pc2, pc3, pc4, pc5)) %>% 
#  DataExplorer::create_report()

outcome HARD

table(df$hard)

## 
##   0   1 
## 947  53

control by

HG: mitocrondiral hapgroup

PCs

check https://stats.idre.ucla.edu/r/dae/zip/

from A10398G to T9698C are explanatory variables

first try: Poisson model

pois1 <- glm(hard ~ . + (1|iid), 
             data = df, 
             family = poisson)
summary(pois1)

## 
## Call:
## glm(formula = hard ~ . + (1 | iid), family = poisson, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6226  -0.3209  -0.2260  -0.1441   3.1103  
## 
## Coefficients: (3 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  1.043e+02  1.008e+03   0.103  0.91757   
## x1          -3.865e-04  5.189e-04  -0.745  0.45637   
## iid         -9.758e-08  1.020e-07  -0.957  0.33872   
## hg           1.667e+01  1.007e+03   0.017  0.98679   
## array               NA         NA      NA       NA   
## birthyear   -6.255e-02  2.102e-02  -2.976  0.00292 **
## sex         -1.053e+00  3.207e-01  -3.284  0.00102 **
## pc1         -1.290e-01  8.161e-02  -1.581  0.11385   
## pc2         -9.261e-02  5.550e-02  -1.669  0.09517 . 
## pc3          2.206e-03  2.260e-02   0.098  0.92225   
## pc4          8.404e-03  4.869e-02   0.173  0.86297   
## pc5         -1.210e-02  2.246e-02  -0.538  0.59024   
## a10398g     -7.076e-01  9.964e-01  -0.710  0.47763   
## a10550g      3.509e+01  1.333e+04   0.003  0.99790   
## a11251g     -4.219e+01  1.341e+04  -0.003  0.99749   
## a11467g      1.528e+01  9.427e+03   0.002  0.99871   
## a11812g      1.622e+01  1.997e+03   0.008  0.99352   
## a12612g      4.157e+01  1.341e+04   0.003  0.99753   
## a15924g     -1.406e-01  9.233e-01  -0.152  0.87897   
## a3480g      -3.184e+01  1.333e+04  -0.002  0.99809   
## a4917g       1.606e+01  6.568e+03   0.002  0.99805   
## c12705t     -2.565e+00  1.675e+00  -1.531  0.12573   
## c14167t      1.656e+01  9.427e+03   0.002  0.99860   
## c150t       -9.442e-01  8.597e-01  -1.098  0.27206   
## c15452a             NA         NA      NA       NA   
## c16270t     -9.019e-02  1.171e+00  -0.077  0.93860   
## c295t        7.601e-01  3.687e+00   0.206  0.83668   
## g12372a     -1.722e+01  9.427e+03  -0.002  0.99854   
## g13708a      1.156e-02  1.167e+00   0.010  0.99209   
## g14905a      5.805e+01  3.277e+03   0.018  0.98587   
## g15043a      1.438e+00  8.841e-01   1.626  0.10388   
## g15928a     -1.594e+01  9.427e+03  -0.002  0.99865   
## g1719a       1.106e-01  1.085e+00   0.102  0.91883   
## g228a        4.606e-01  1.671e+00   0.276  0.78288   
## g2706a       1.702e+00  4.407e+00   0.386  0.69939   
## g3010a       1.027e-02  4.399e-01   0.023  0.98137   
## g5147a       6.220e-01  8.396e-01   0.741  0.45878   
## g709a       -8.583e-02  1.040e+00  -0.083  0.93422   
## g73a        -7.588e-01  5.855e-01  -1.296  0.19494   
## g8697a      -1.637e+01  5.191e+03  -0.003  0.99748   
## g9055a       4.548e-01  1.164e+00   0.391  0.69603   
## t10463c     -1.633e+01  2.472e+03  -0.007  0.99473   
## t11299c     -1.729e+01  9.427e+03  -0.002  0.99854   
## t13617c      4.579e-01  3.455e+00   0.133  0.89456   
## t14798c     -1.587e+00  1.994e+00  -0.796  0.42613   
## t16362c      1.613e-01  5.712e-01   0.282  0.77763   
## t3197c       2.540e-01  3.275e+00   0.078  0.93818   
## t4216c      -3.547e-01  3.284e+00  -0.108  0.91398   
## t6776c       4.488e-01  6.366e-01   0.705  0.48088   
## t7028c      -2.104e+00  4.449e+00  -0.473  0.63620   
## t9698c       7.329e-02  1.560e+00   0.047  0.96253   
## 1 | iidTRUE         NA         NA      NA       NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 311.37  on 999  degrees of freedom
## Residual deviance: 242.67  on 951  degrees of freedom
## AIC: 446.67
## 
## Number of Fisher Scoring iterations: 17

ZIPR

zi1 <- zeroinfl(
  hard ~
    a10398g + a10550g + a11251g +
    a11467g + a11812g + a12612g + a15924g + a3480g + a4917g +
    c12705t + c14167t + c150t + c15452a + c16270t + c295t +
    g12372a + g13708a + g14905a + g15043a + g15928a + g1719a +
    g228a + g2706a + g3010a + g5147a + g709a + g73a + g8697a +
    g9055a + t10463c + t11299c + t13617c + t14798c + t16362c +
    t3197c + t4216c + t6776c + t7028c + t9698c | iid,
  data = df
)

summary(zi1)

## 
## Call:
## zeroinfl(formula = hard ~ a10398g + a10550g + a11251g + a11467g + a11812g + 
##     a12612g + a15924g + a3480g + a4917g + c12705t + c14167t + c150t + 
##     c15452a + c16270t + c295t + g12372a + g13708a + g14905a + g15043a + 
##     g15928a + g1719a + g228a + g2706a + g3010a + g5147a + g709a + g73a + 
##     g8697a + g9055a + t10463c + t11299c + t13617c + t14798c + t16362c + 
##     t3197c + t4216c + t6776c + t7028c + t9698c | iid, data = df)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6781 -0.2300 -0.2168 -0.1419  9.0445 
## 
## Count model coefficients (poisson with log link):
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -1.81108         NA      NA       NA
## a10398g       -0.91963         NA      NA       NA
## a10550g      348.50072         NA      NA       NA
## a11251g        2.33188         NA      NA       NA
## a11467g       99.54020         NA      NA       NA
## a11812g      345.92524         NA      NA       NA
## a12612g       -4.76839         NA      NA       NA
## a15924g       -0.07419         NA      NA       NA
## a3480g      -216.51958         NA      NA       NA
## a4917g       314.69516         NA      NA       NA
## c12705t       -3.30661         NA      NA       NA
## c14167t       63.86228         NA      NA       NA
## c150t         -0.96643         NA      NA       NA
## c15452a        2.33188         NA      NA       NA
## c16270t        0.23342         NA      NA       NA
## c295t         -0.35637         NA      NA       NA
## g12372a     -101.28441         NA      NA       NA
## g13708a        0.07627         NA      NA       NA
## g14905a      -11.39876         NA      NA       NA
## g15043a        1.60273         NA      NA       NA
## g15928a     -115.48141         NA      NA       NA
## g1719a         0.35354         NA      NA       NA
## g228a          0.97811         NA      NA       NA
## g2706a         0.07517         NA      NA       NA
## g3010a         0.09341         NA      NA       NA
## g5147a         0.31280         NA      NA       NA
## g709a          0.29383         NA      NA       NA
## g73a          -0.72500         NA      NA       NA
## g8697a      -264.14315         NA      NA       NA
## g9055a         0.69473         NA      NA       NA
## t10463c     -275.12977         NA      NA       NA
## t11299c     -193.97790         NA      NA       NA
## t13617c       -0.32508         NA      NA       NA
## t14798c       -1.61436         NA      NA       NA
## t16362c        0.05653         NA      NA       NA
## t3197c         0.62553         NA      NA       NA
## t4216c         0.00643         NA      NA       NA
## t6776c         0.61764         NA      NA       NA
## t7028c        -0.59689         NA      NA       NA
## t9698c         0.58967         NA      NA       NA
## 
## Zero-inflation model coefficients (binomial with logit link):
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  2.513331         NA      NA       NA
## iid         -0.005465         NA      NA       NA
## 
## Number of iterations in BFGS optimization: 77 
## Log-likelihood: -193.7 on 42 Df

Compare both models

vuong(pois1, zi1)

## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A   p-value
## Raw                    3.546630 model1 > model2 0.0001951
## AIC-corrected          1.718360 model1 > model2 0.0428655
## BIC-corrected         -2.767991 model2 > model1 0.0028202

The Vuong test compares the zero-inflated model with an ordinary Poisson regression model. In this example, we can see that our test statistic is significant, indicating that the zero-inflated model is superior to the standard Poisson model. Desmarais Bruce A., Harden Jeffrey J., 2013, “Testing for Zero Inflation in Count Models: Bias Correction for the Vuong Test” Stata Journal, 13, 4, 810-835 check https://stats.idre.ucla.edu/r/dae/zip/

Check assumptions and models performance

check_overdispersion(pois1)

## # Overdispersion test
## 
##        dispersion ratio =   0.986
##   Pearson's Chi-Squared = 935.002
##                 p-value =   0.612

check_zeroinflation(pois1)

## # Check for zero-inflation
## 
##    Observed zeros: 947
##   Predicted zeros: 951
##             Ratio: 1.00

check_zeroinflation(zi1)

## # Check for zero-inflation
## 
##    Observed zeros: 947
##   Predicted zeros: 949
##             Ratio: 1.00

Check this with the full dataset

check_heteroscedasticity(pois1)

## Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.010).

check_heteroscedasticity(zi1)

## Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.000).

# check_model(pois1)

# check_model(zi1)

model_performance(pois1)

## # Indices of model performance
## 
## AIC     |     BIC | Nagelkerke's R2 |  RMSE | Sigma | Score_log | Score_spherical
## ---------------------------------------------------------------------------------
## 446.674 | 687.154 |           0.248 | 0.218 | 0.505 |    -0.174 |           0.031

model_performance(zi1)

## # Indices of model performance
## 
## AIC     |     BIC |    R2 | R2 (adj.) |  RMSE | Sigma | Score_log | Score_spherical
## -----------------------------------------------------------------------------------
## 471.471 | 677.597 | 0.002 |    -0.039 | 0.220 | 0.224 |    -0.194 |           0.031

compare_performance(pois1, zi1, rank = TRUE)

## # Comparison of Model Performance Indices
## 
## Name  |    Model |     AIC |     BIC |  RMSE | Sigma | Score_log | Score_spherical | Performance-Score
## ------------------------------------------------------------------------------------------------------
## pois1 |      glm | 446.674 | 687.154 | 0.218 | 0.505 |    -0.174 |           0.031 |            66.67%
## zi1   | zeroinfl | 471.471 | 677.597 | 0.220 | 0.224 |    -0.194 |           0.031 |            33.33%

plot(compare_performance(pois1, zi1, rank = TRUE)) # package see required

test_performance(pois1, zi1)

## Name  |    Model |     BF
## -------------------------
## pois1 |      glm |   1.00
## zi1   | zeroinfl | 118.90
## Each model is compared to pois1.

So far, the Zero inflated model seems perform worst than a poisson regression model.

Next step, run this script with the full dataset

LS0tCnRpdGxlOiAiWmVybyBpbmZsYXRlZCByZWdyZXNzaW9uIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDogCiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OgogICAgICBjb2xsYXBzZWQ6IG5vCiAgICAgIHNtb290aF9zY3JvbGw6IG5vCiAgICBjb2RlX2ZvbGRpbmc6IGhpZGUKLS0tCgoKYGBge3J9CmtuaXRyOjpvcHRzX2NodW5rJHNldCh3YXJuaW5nPUZBTFNFLCBtZXNzYWdlPUZBTFNFKQpgYGAKCgpgYGB7cn0KIyBpZiBub3QgaW5zdGFsbGVkLCBpbnN0YWxsIHBhY2thZ2UgcGFjbWFuCiMgaW5zdGFsbC5wYWNrYWdlcygicGFjbWFuIikKCnBhY21hbjo6cF9sb2FkKGphbml0b3IsICAjIGZvciBkYXRhIGNsZWFuaW5nCiAgICAgICAgICAgICAgIERhdGFFeHBsb3JlciwgIyBmb3IgZGF0YSBzdW1tYXJ5CiAgICAgICAgICAgICAgIHBlcmZvcm1hbmNlLCAjIGZvciByZWdyZXNzaW9uIGRpYWdub3N0aWMgY2hlY2sKICAgICAgICAgICAgICAgbG10ZXN0ICwgIyBmb3IgY29lZnRlc3QKICAgICAgICAgICAgICAgcHNjbCwgIyAgemVyby1pbmZsYXRlZCByZWdyZXNzaW9uIG1vZGVscywgIGZ1bmN0aW9ucyBodXJkbGUoKSBhbmQgemVyb2luZmwoKQogICAgICAgICAgICAgICB0aWR5dmVyc2UpICMgc2V2ZXJhbCBwYWNrYWdlcyBmb3IgZGF0YXNjaWVuY2UKYGBgCgpgYGB7cn0KZGYgPC0gcmVhZF9jc3YoIkJyeWFuVUtCLmNzdiIpCmBgYAoKYGBge3J9CmdsaW1wc2UoZGYpCmBgYAoKCiMgUXVlc3Rpb24KClNvIHRoZSBxdWVzdGlvbiBJbSB0cnlpbmcgdG8gYW5zd2VyIGZvciB0aGUgcHJvamVjdCBpcyBpZiB0aGVyZSBpcyBhbnkgY29ycmVsYXRpb24gYmV0d2VlbiB2YXJpYW50cyBpbiBtaXRvY2hvbmRyaWFsIGdlbmVzIGFuZCBoYXJkIGNhc2VzIG9mIENBRC4gVGhlIG1haW4gaXNzdWUgaGVyZSBpcyB3aGVuIEkgcmFuIG15IGNvZGUgaW4gcHl0aG9uIHRoZXJlIHdlcmUgYSBsb3Qgb2YgMHMgaW5kaWNhdGluZyBubyBjYXNlcyBvZiBDQUQgYW5kIHRoYXQgZW5kZWQgdXAgc2tld2luZyBteSByZXN1bHRzIHdoaWNoIHdhcyBub3QgZ29vZCB0aGF0cyB3aHkgd2UgdGFsa2VkIGFib3V0IHplcm8gaW5mbGF0ZWQgZGlzdHJpYnV0aW9uIGFuZCBob3cgdGhhdCBtaWdodCBoZWxwIHVzIGJlY2F1c2UgYXMgd2UgZGlzY3Vzc2VkIGVhcmxpZXIgd2UgY2FudMKganVzdCB0aHJvdyBvdXQgb3IgcHV0IGluIGRhdGEgdG8gYWxsb3cgZm9yIGEgbW9yZSBiYWxhbmNlZCBwaWN0dXJlLsKgCgpgYGB7cn0KbmFtZXMoZGYpCmBgYAoKCkNoYW5nZSBuYW1lcwpgYGB7cn0KZGYgPC0gZGYgJT4lIAogIGphbml0b3I6OmNsZWFuX25hbWVzKCkKYGBgCgoKClVuY29tbWVudCB0aGUgZm9sbG93aW5nIGxpbmVzIHRvIG9idGFpbiBhbiBFREEgcmVwb3J0CgpgYGB7cn0KIyBkZiAlPiUgCiMgIHNlbGVjdCgtYyh4MSwgaWlkLCBwYzEsIHBjMiwgcGMzLCBwYzQsIHBjNSkpICU+JSAKIyAgRGF0YUV4cGxvcmVyOjpjcmVhdGVfcmVwb3J0KCkKYGBgCm91dGNvbWUgSEFSRAoKYGBge3J9CnRhYmxlKGRmJGhhcmQpCmBgYAoKY29udHJvbCBieQoKSEc6IG1pdG9jcm9uZGlyYWwgaGFwZ3JvdXAKClBDcwoKY2hlY2sgaHR0cHM6Ly9zdGF0cy5pZHJlLnVjbGEuZWR1L3IvZGFlL3ppcC8KCmZyb20gQTEwMzk4RyB0byBUOTY5OEMgYXJlIGV4cGxhbmF0b3J5IHZhcmlhYmxlcwoKIyBmaXJzdCB0cnk6IFBvaXNzb24gbW9kZWwKCmBgYHtyfQpwb2lzMSA8LSBnbG0oaGFyZCB+IC4gKyAoMXxpaWQpLCAKICAgICAgICAgICAgIGRhdGEgPSBkZiwgCiAgICAgICAgICAgICBmYW1pbHkgPSBwb2lzc29uKQpzdW1tYXJ5KHBvaXMxKQpgYGAKCgojIFpJUFIKYGBge3J9Cgp6aTEgPC0gemVyb2luZmwoCiAgaGFyZCB+CiAgICBhMTAzOThnICsgYTEwNTUwZyArIGExMTI1MWcgKwogICAgYTExNDY3ZyArIGExMTgxMmcgKyBhMTI2MTJnICsgYTE1OTI0ZyArIGEzNDgwZyArIGE0OTE3ZyArCiAgICBjMTI3MDV0ICsgYzE0MTY3dCArIGMxNTB0ICsgYzE1NDUyYSArIGMxNjI3MHQgKyBjMjk1dCArCiAgICBnMTIzNzJhICsgZzEzNzA4YSArIGcxNDkwNWEgKyBnMTUwNDNhICsgZzE1OTI4YSArIGcxNzE5YSArCiAgICBnMjI4YSArIGcyNzA2YSArIGczMDEwYSArIGc1MTQ3YSArIGc3MDlhICsgZzczYSArIGc4Njk3YSArCiAgICBnOTA1NWEgKyB0MTA0NjNjICsgdDExMjk5YyArIHQxMzYxN2MgKyB0MTQ3OThjICsgdDE2MzYyYyArCiAgICB0MzE5N2MgKyB0NDIxNmMgKyB0Njc3NmMgKyB0NzAyOGMgKyB0OTY5OGMgfCBpaWQsCiAgZGF0YSA9IGRmCikKYGBgCmBgYHtyfQpzdW1tYXJ5KHppMSkKYGBgCgojIyBDb21wYXJlIGJvdGggbW9kZWxzCgpgYGB7cn0KdnVvbmcocG9pczEsIHppMSkKYGBgCgoKVGhlIFZ1b25nIHRlc3QgY29tcGFyZXMgdGhlIHplcm8taW5mbGF0ZWQgbW9kZWwgd2l0aCBhbiBvcmRpbmFyeSBQb2lzc29uIHJlZ3Jlc3Npb24gbW9kZWwuIEluIHRoaXMgZXhhbXBsZSwgd2UgY2FuIHNlZSB0aGF0IG91ciB0ZXN0IHN0YXRpc3RpYyBpcyBzaWduaWZpY2FudCwgaW5kaWNhdGluZyB0aGF0IHRoZSB6ZXJvLWluZmxhdGVkIG1vZGVsIGlzIHN1cGVyaW9yIHRvIHRoZSBzdGFuZGFyZCBQb2lzc29uIG1vZGVsLgpEZXNtYXJhaXMgQnJ1Y2UgQS4sIEhhcmRlbiBKZWZmcmV5IEouLCAyMDEzLCDigJxUZXN0aW5nIGZvciBaZXJvIEluZmxhdGlvbiBpbiBDb3VudCBNb2RlbHM6IEJpYXMgQ29ycmVjdGlvbiBmb3IgdGhlIFZ1b25nIFRlc3TigJ0gU3RhdGEgSm91cm5hbCwgMTMsIDQsIDgxMC04MzUKIGNoZWNrIGh0dHBzOi8vc3RhdHMuaWRyZS51Y2xhLmVkdS9yL2RhZS96aXAvCgojIyBDaGVjayBhc3N1bXB0aW9ucyBhbmQgbW9kZWxzIHBlcmZvcm1hbmNlCgpgYGB7cn0KY2hlY2tfb3ZlcmRpc3BlcnNpb24ocG9pczEpCmBgYApgYGB7cn0KY2hlY2tfemVyb2luZmxhdGlvbihwb2lzMSkKYGBgCmBgYHtyfQpjaGVja196ZXJvaW5mbGF0aW9uKHppMSkKYGBgCkNoZWNrIHRoaXMgd2l0aCB0aGUgZnVsbCBkYXRhc2V0CgpgYGB7cn0KY2hlY2tfaGV0ZXJvc2NlZGFzdGljaXR5KHBvaXMxKQpgYGAKYGBge3J9CmNoZWNrX2hldGVyb3NjZWRhc3RpY2l0eSh6aTEpCmBgYAoKYGBge3J9CiMgY2hlY2tfbW9kZWwocG9pczEpCmBgYAoKYGBge3J9CiMgY2hlY2tfbW9kZWwoemkxKQpgYGAKCmBgYHtyfQptb2RlbF9wZXJmb3JtYW5jZShwb2lzMSkKYGBgCmBgYHtyfQptb2RlbF9wZXJmb3JtYW5jZSh6aTEpCmBgYApgYGB7cn0KY29tcGFyZV9wZXJmb3JtYW5jZShwb2lzMSwgemkxLCByYW5rID0gVFJVRSkKYGBgCmBgYHtyfQpwbG90KGNvbXBhcmVfcGVyZm9ybWFuY2UocG9pczEsIHppMSwgcmFuayA9IFRSVUUpKSAjIHBhY2thZ2Ugc2VlIHJlcXVpcmVkCmBgYApgYGB7cn0KdGVzdF9wZXJmb3JtYW5jZShwb2lzMSwgemkxKQpgYGAKClNvIGZhciwgdGhlIFplcm8gaW5mbGF0ZWQgbW9kZWwgc2VlbXMgcGVyZm9ybSB3b3JzdCB0aGFuIGEgcG9pc3NvbiByZWdyZXNzaW9uIG1vZGVsLgoKTmV4dCBzdGVwLCBydW4gdGhpcyBzY3JpcHQgd2l0aCB0aGUgZnVsbCBkYXRhc2V0Cg==

Zero inflated regression

Question

first try: Poisson model

ZIPR

Compare both models

Check assumptions and models performance