knitr::opts_chunk$set(warning=FALSE, message=FALSE)
# if not installed, install package pacman
# install.packages("pacman")
pacman::p_load(janitor, # for data cleaning
DataExplorer, # for data summary
performance, # for regression diagnostic check
lmtest , # for coeftest
pscl, # zero-inflated regression models, functions hurdle() and zeroinfl()
tidyverse) # several packages for datascience
df <- read_csv("BryanUKB.csv")
glimpse(df)
## Rows: 1,000
## Columns: 51
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ IID <dbl> 3928228, 1011965, 4142719, 3905944, 3796835, 3791122, 388540…
## $ HARD <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ HG <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ ARRAY <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ BIRTHYEAR <dbl> 1949, 1954, 1955, 1956, 1958, 1952, 1950, 1967, 1955, 1950, …
## $ SEX <dbl> 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, …
## $ PC1 <dbl> -9.36831, 87.12410, -13.30580, -10.17860, -12.59400, 4.97751…
## $ PC2 <dbl> 4.856510, -128.922000, 3.795190, 3.461760, 6.783790, -4.9432…
## $ PC3 <dbl> -4.618970, 91.693800, -2.318730, -1.057470, 1.201080, 15.227…
## $ PC4 <dbl> -1.0861200, 26.9482000, 2.0924100, -1.6033300, 0.6135840, -4…
## $ PC5 <dbl> -8.859650, -5.616630, -2.314550, -7.527250, -1.998470, 5.980…
## $ A10398G <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, …
## $ A10550G <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ A11251G <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ A11467G <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ A11812G <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ A12612G <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ A15924G <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ A3480G <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ A4917G <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ C12705T <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ C14167T <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ C150T <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ C15452A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ C16270T <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ C295T <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ G12372A <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G13708A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ G14905A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ G15043A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, …
## $ G15928A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G1719A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ G228A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ G2706A <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, …
## $ G3010A <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, …
## $ G5147A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G709A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G73A <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, …
## $ G8697A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ G9055A <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T10463C <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ T11299C <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T13617C <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T14798C <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ T16362C <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ T3197C <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T4216C <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, …
## $ T6776C <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ T7028C <dbl> 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, …
## $ T9698C <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
So the question Im trying to answer for the project is if there is any correlation between variants in mitochondrial genes and hard cases of CAD. The main issue here is when I ran my code in python there were a lot of 0s indicating no cases of CAD and that ended up skewing my results which was not good thats why we talked about zero inflated distribution and how that might help us because as we discussed earlier we cant just throw out or put in data to allow for a more balanced picture.
names(df)
## [1] "...1" "IID" "HARD" "HG" "ARRAY" "BIRTHYEAR"
## [7] "SEX" "PC1" "PC2" "PC3" "PC4" "PC5"
## [13] "A10398G" "A10550G" "A11251G" "A11467G" "A11812G" "A12612G"
## [19] "A15924G" "A3480G" "A4917G" "C12705T" "C14167T" "C150T"
## [25] "C15452A" "C16270T" "C295T" "G12372A" "G13708A" "G14905A"
## [31] "G15043A" "G15928A" "G1719A" "G228A" "G2706A" "G3010A"
## [37] "G5147A" "G709A" "G73A" "G8697A" "G9055A" "T10463C"
## [43] "T11299C" "T13617C" "T14798C" "T16362C" "T3197C" "T4216C"
## [49] "T6776C" "T7028C" "T9698C"
Change names
df <- df %>%
janitor::clean_names()
Uncomment the following lines to obtain an EDA report
# df %>%
# select(-c(x1, iid, pc1, pc2, pc3, pc4, pc5)) %>%
# DataExplorer::create_report()
outcome HARD
table(df$hard)
##
## 0 1
## 947 53
control by
HG: mitocrondiral hapgroup
PCs
check https://stats.idre.ucla.edu/r/dae/zip/
from A10398G to T9698C are explanatory variables
pois1 <- glm(hard ~ . + (1|iid),
data = df,
family = poisson)
summary(pois1)
##
## Call:
## glm(formula = hard ~ . + (1 | iid), family = poisson, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6226 -0.3209 -0.2260 -0.1441 3.1103
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.043e+02 1.008e+03 0.103 0.91757
## x1 -3.865e-04 5.189e-04 -0.745 0.45637
## iid -9.758e-08 1.020e-07 -0.957 0.33872
## hg 1.667e+01 1.007e+03 0.017 0.98679
## array NA NA NA NA
## birthyear -6.255e-02 2.102e-02 -2.976 0.00292 **
## sex -1.053e+00 3.207e-01 -3.284 0.00102 **
## pc1 -1.290e-01 8.161e-02 -1.581 0.11385
## pc2 -9.261e-02 5.550e-02 -1.669 0.09517 .
## pc3 2.206e-03 2.260e-02 0.098 0.92225
## pc4 8.404e-03 4.869e-02 0.173 0.86297
## pc5 -1.210e-02 2.246e-02 -0.538 0.59024
## a10398g -7.076e-01 9.964e-01 -0.710 0.47763
## a10550g 3.509e+01 1.333e+04 0.003 0.99790
## a11251g -4.219e+01 1.341e+04 -0.003 0.99749
## a11467g 1.528e+01 9.427e+03 0.002 0.99871
## a11812g 1.622e+01 1.997e+03 0.008 0.99352
## a12612g 4.157e+01 1.341e+04 0.003 0.99753
## a15924g -1.406e-01 9.233e-01 -0.152 0.87897
## a3480g -3.184e+01 1.333e+04 -0.002 0.99809
## a4917g 1.606e+01 6.568e+03 0.002 0.99805
## c12705t -2.565e+00 1.675e+00 -1.531 0.12573
## c14167t 1.656e+01 9.427e+03 0.002 0.99860
## c150t -9.442e-01 8.597e-01 -1.098 0.27206
## c15452a NA NA NA NA
## c16270t -9.019e-02 1.171e+00 -0.077 0.93860
## c295t 7.601e-01 3.687e+00 0.206 0.83668
## g12372a -1.722e+01 9.427e+03 -0.002 0.99854
## g13708a 1.156e-02 1.167e+00 0.010 0.99209
## g14905a 5.805e+01 3.277e+03 0.018 0.98587
## g15043a 1.438e+00 8.841e-01 1.626 0.10388
## g15928a -1.594e+01 9.427e+03 -0.002 0.99865
## g1719a 1.106e-01 1.085e+00 0.102 0.91883
## g228a 4.606e-01 1.671e+00 0.276 0.78288
## g2706a 1.702e+00 4.407e+00 0.386 0.69939
## g3010a 1.027e-02 4.399e-01 0.023 0.98137
## g5147a 6.220e-01 8.396e-01 0.741 0.45878
## g709a -8.583e-02 1.040e+00 -0.083 0.93422
## g73a -7.588e-01 5.855e-01 -1.296 0.19494
## g8697a -1.637e+01 5.191e+03 -0.003 0.99748
## g9055a 4.548e-01 1.164e+00 0.391 0.69603
## t10463c -1.633e+01 2.472e+03 -0.007 0.99473
## t11299c -1.729e+01 9.427e+03 -0.002 0.99854
## t13617c 4.579e-01 3.455e+00 0.133 0.89456
## t14798c -1.587e+00 1.994e+00 -0.796 0.42613
## t16362c 1.613e-01 5.712e-01 0.282 0.77763
## t3197c 2.540e-01 3.275e+00 0.078 0.93818
## t4216c -3.547e-01 3.284e+00 -0.108 0.91398
## t6776c 4.488e-01 6.366e-01 0.705 0.48088
## t7028c -2.104e+00 4.449e+00 -0.473 0.63620
## t9698c 7.329e-02 1.560e+00 0.047 0.96253
## 1 | iidTRUE NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 311.37 on 999 degrees of freedom
## Residual deviance: 242.67 on 951 degrees of freedom
## AIC: 446.67
##
## Number of Fisher Scoring iterations: 17
zi1 <- zeroinfl(
hard ~
a10398g + a10550g + a11251g +
a11467g + a11812g + a12612g + a15924g + a3480g + a4917g +
c12705t + c14167t + c150t + c15452a + c16270t + c295t +
g12372a + g13708a + g14905a + g15043a + g15928a + g1719a +
g228a + g2706a + g3010a + g5147a + g709a + g73a + g8697a +
g9055a + t10463c + t11299c + t13617c + t14798c + t16362c +
t3197c + t4216c + t6776c + t7028c + t9698c | iid,
data = df
)
summary(zi1)
##
## Call:
## zeroinfl(formula = hard ~ a10398g + a10550g + a11251g + a11467g + a11812g +
## a12612g + a15924g + a3480g + a4917g + c12705t + c14167t + c150t +
## c15452a + c16270t + c295t + g12372a + g13708a + g14905a + g15043a +
## g15928a + g1719a + g228a + g2706a + g3010a + g5147a + g709a + g73a +
## g8697a + g9055a + t10463c + t11299c + t13617c + t14798c + t16362c +
## t3197c + t4216c + t6776c + t7028c + t9698c | iid, data = df)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.6781 -0.2300 -0.2168 -0.1419 9.0445
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.81108 NA NA NA
## a10398g -0.91963 NA NA NA
## a10550g 348.50072 NA NA NA
## a11251g 2.33188 NA NA NA
## a11467g 99.54020 NA NA NA
## a11812g 345.92524 NA NA NA
## a12612g -4.76839 NA NA NA
## a15924g -0.07419 NA NA NA
## a3480g -216.51958 NA NA NA
## a4917g 314.69516 NA NA NA
## c12705t -3.30661 NA NA NA
## c14167t 63.86228 NA NA NA
## c150t -0.96643 NA NA NA
## c15452a 2.33188 NA NA NA
## c16270t 0.23342 NA NA NA
## c295t -0.35637 NA NA NA
## g12372a -101.28441 NA NA NA
## g13708a 0.07627 NA NA NA
## g14905a -11.39876 NA NA NA
## g15043a 1.60273 NA NA NA
## g15928a -115.48141 NA NA NA
## g1719a 0.35354 NA NA NA
## g228a 0.97811 NA NA NA
## g2706a 0.07517 NA NA NA
## g3010a 0.09341 NA NA NA
## g5147a 0.31280 NA NA NA
## g709a 0.29383 NA NA NA
## g73a -0.72500 NA NA NA
## g8697a -264.14315 NA NA NA
## g9055a 0.69473 NA NA NA
## t10463c -275.12977 NA NA NA
## t11299c -193.97790 NA NA NA
## t13617c -0.32508 NA NA NA
## t14798c -1.61436 NA NA NA
## t16362c 0.05653 NA NA NA
## t3197c 0.62553 NA NA NA
## t4216c 0.00643 NA NA NA
## t6776c 0.61764 NA NA NA
## t7028c -0.59689 NA NA NA
## t9698c 0.58967 NA NA NA
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.513331 NA NA NA
## iid -0.005465 NA NA NA
##
## Number of iterations in BFGS optimization: 77
## Log-likelihood: -193.7 on 42 Df
vuong(pois1, zi1)
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw 3.546630 model1 > model2 0.0001951
## AIC-corrected 1.718360 model1 > model2 0.0428655
## BIC-corrected -2.767991 model2 > model1 0.0028202
The Vuong test compares the zero-inflated model with an ordinary Poisson regression model. In this example, we can see that our test statistic is significant, indicating that the zero-inflated model is superior to the standard Poisson model. Desmarais Bruce A., Harden Jeffrey J., 2013, “Testing for Zero Inflation in Count Models: Bias Correction for the Vuong Test” Stata Journal, 13, 4, 810-835 check https://stats.idre.ucla.edu/r/dae/zip/
check_overdispersion(pois1)
## # Overdispersion test
##
## dispersion ratio = 0.986
## Pearson's Chi-Squared = 935.002
## p-value = 0.612
check_zeroinflation(pois1)
## # Check for zero-inflation
##
## Observed zeros: 947
## Predicted zeros: 951
## Ratio: 1.00
check_zeroinflation(zi1)
## # Check for zero-inflation
##
## Observed zeros: 947
## Predicted zeros: 949
## Ratio: 1.00
Check this with the full dataset
check_heteroscedasticity(pois1)
## Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.010).
check_heteroscedasticity(zi1)
## Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.000).
# check_model(pois1)
# check_model(zi1)
model_performance(pois1)
## # Indices of model performance
##
## AIC | BIC | Nagelkerke's R2 | RMSE | Sigma | Score_log | Score_spherical
## ---------------------------------------------------------------------------------
## 446.674 | 687.154 | 0.248 | 0.218 | 0.505 | -0.174 | 0.031
model_performance(zi1)
## # Indices of model performance
##
## AIC | BIC | R2 | R2 (adj.) | RMSE | Sigma | Score_log | Score_spherical
## -----------------------------------------------------------------------------------
## 471.471 | 677.597 | 0.002 | -0.039 | 0.220 | 0.224 | -0.194 | 0.031
compare_performance(pois1, zi1, rank = TRUE)
## # Comparison of Model Performance Indices
##
## Name | Model | AIC | BIC | RMSE | Sigma | Score_log | Score_spherical | Performance-Score
## ------------------------------------------------------------------------------------------------------
## pois1 | glm | 446.674 | 687.154 | 0.218 | 0.505 | -0.174 | 0.031 | 66.67%
## zi1 | zeroinfl | 471.471 | 677.597 | 0.220 | 0.224 | -0.194 | 0.031 | 33.33%
plot(compare_performance(pois1, zi1, rank = TRUE)) # package see required
test_performance(pois1, zi1)
## Name | Model | BF
## -------------------------
## pois1 | glm | 1.00
## zi1 | zeroinfl | 118.90
## Each model is compared to pois1.
So far, the Zero inflated model seems perform worst than a poisson regression model.
Next step, run this script with the full dataset