Model diagnostics and results

A. Scaling and centering of the relevant predictors

B1. Negative Binomial (with the variables bcg_ind, bcg_cov, and hdi)

#knitr::opts_chunk$set(echo = FALSE)

modelNB_1 <- MASS::glm.nb(pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + hdi_s, data = data, init.theta = 1.0, link = log)
summary(modelNB_1)

## 
## Call:
## MASS::glm.nb(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s + 
##     bcg_ind_s + hdi_s, data = data, init.theta = 5.924279644, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8234  -0.4890   0.1592   0.3829   2.0288  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.87385    0.07818 -11.178  < 2e-16 ***
## bcg_cov_s    0.15522    0.10387   1.494    0.135    
## bcg_ind_s    0.05720    0.09907   0.577    0.564    
## hdi_s       -0.36321    0.09158  -3.966  7.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(5.9243) family taken to be 1)
## 
##     Null deviance: 63.456  on 30  degrees of freedom
## Residual deviance: 31.109  on 27  degrees of freedom
## AIC: 351.38
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  5.92 
##           Std. Err.:  1.61 
## 
##  2 x log-likelihood:  -341.382

modelN1 <- stepAIC(modelNB_1, trace=TRUE)

## Start:  AIC=349.38
## pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + hdi_s
## 
##             Df    AIC
## - bcg_ind_s  1 347.80
## <none>         349.38
## - bcg_cov_s  1 349.93
## - hdi_s      1 361.22
## 
## Step:  AIC=347.8
## pa_num ~ bcg_cov_s + hdi_s + offset(log(inf_num))
## 
##             Df    AIC
## <none>         347.80
## - bcg_cov_s  1 350.34
## - hdi_s      1 359.42

summary(modelN1)

## 
## Call:
## MASS::glm.nb(formula = pa_num ~ bcg_cov_s + hdi_s + offset(log(inf_num)), 
##     data = data, init.theta = 5.861041227, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8079  -0.5103   0.1122   0.4252   1.7346  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.87461    0.07856 -11.132  < 2e-16 ***
## bcg_cov_s    0.18637    0.08657   2.153   0.0313 *  
## hdi_s       -0.34675    0.08581  -4.041 5.32e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(5.861) family taken to be 1)
## 
##     Null deviance: 62.841  on 30  degrees of freedom
## Residual deviance: 31.230  on 28  degrees of freedom
## AIC: 349.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  5.86 
##           Std. Err.:  1.60 
## 
##  2 x log-likelihood:  -341.804

hist(residuals(modelNB_1))

hist(residuals(modelN1))

# plot_coefs(modelN1, legend.title = "Asymptomatic percentage")
# p1  <- plot_coefs(modelN1, legend.title = "Asymptomatic percentage")

resN1 <- residuals(modelN1, type="deviance") 
plot(predict(modelN1), resN1)

plot(log(predict(modelN1)), resN1) 
abline(h=0, lty=2)

res_standardised <- rstandard(modelN1)
plot(predict(modelN1), res_standardised)

plot(log(predict(modelN1)), res_standardised) 
abline(h=0, lty=2)

resP <- residuals(modelN1, type="pearson") 
plot(predict(modelN1), resP)

plot(log(predict(modelN1)), resP) 
abline(h=0, lty=2)

influencePlot(modelN1, id=TRUE) # https://stat.ethz.ch/R-manual/R-devel/library/stats/html/influence.measures.html

##       StudRes        Hat       CookD
## 2  -3.0864108 0.05389941 0.068255719
## 7   0.1221218 0.26346250 0.002102628
## 23  1.8148132 0.07864656 0.145596934
## 26 -2.4712724 0.18286758 0.238159288

Observation 23 (Oman) and 26 (Senegal) have high Cook’s distance. Let’s check, if they are above the cutoff of 4/n.

# Finding outliers
N <- nrow(data)
cooksD <- cooks.distance(modelN1)
influential <- cooksD[(cooksD > 4/N)]  # Outliers: Cook's distance > 4/n
influential

##        23        26 
## 0.1455969 0.2381593

4/N

## [1] 0.1290323

Yes, Oman and Senegal are the outliers.
Let’s redo the regression by removing these outliers

names_of_influential <- names(influential)
outliers <- data[names_of_influential,]
data_without_outliers <- data %>% anti_join(outliers)

modelNB_12 <- MASS::glm.nb(pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s
                          + hdi_s, data = data_without_outliers, 
                          init.theta = 1.0, link = log)
summary(modelNB_12)

## 
## Call:
## MASS::glm.nb(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s + 
##     bcg_ind_s + hdi_s, data = data_without_outliers, init.theta = 9.521768616, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4384  -0.4471   0.1544   0.5230   1.3851  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.90507    0.06608 -13.697  < 2e-16 ***
## bcg_cov_s    0.02103    0.09164   0.229   0.8185    
## bcg_ind_s    0.17216    0.08819   1.952   0.0509 .  
## hdi_s       -0.49056    0.08085  -6.068  1.3e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(9.5218) family taken to be 1)
## 
##     Null deviance: 89.858  on 28  degrees of freedom
## Residual deviance: 29.517  on 25  degrees of freedom
## AIC: 311.5
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  9.52 
##           Std. Err.:  2.90 
## 
##  2 x log-likelihood:  -301.502

modelN12 <- stepAIC(modelNB_12, trace=TRUE)

## Start:  AIC=309.5
## pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + hdi_s
## 
##             Df    AIC
## - bcg_cov_s  1 307.56
## <none>         309.50
## - bcg_ind_s  1 311.49
## - hdi_s      1 332.36
## 
## Step:  AIC=307.56
## pa_num ~ bcg_ind_s + hdi_s + offset(log(inf_num))
## 
##             Df    AIC
## <none>         307.56
## - bcg_ind_s  1 312.72
## - hdi_s      1 336.90

summary(modelN12)

## 
## Call:
## MASS::glm.nb(formula = pa_num ~ bcg_ind_s + hdi_s + offset(log(inf_num)), 
##     data = data_without_outliers, init.theta = 9.50791347, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4157  -0.3984   0.1243   0.5551   1.4033  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.90681    0.06568 -13.807  < 2e-16 ***
## bcg_ind_s    0.18424    0.06888   2.675  0.00748 ** 
## hdi_s       -0.50130    0.06830  -7.340 2.13e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(9.5079) family taken to be 1)
## 
##     Null deviance: 89.745  on 28  degrees of freedom
## Residual deviance: 29.537  on 26  degrees of freedom
## AIC: 309.56
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  9.51 
##           Std. Err.:  2.89 
## 
##  2 x log-likelihood:  -301.557

The AIC of the model is substantially reduced after removing Oman and Senegal.

hist(residuals(modelNB_12))

hist(residuals(modelN12))

plot_coefs(modelN12, legend.title = "Asymptomatic percentage")

p12  <- plot_coefs(modelN12, legend.title = "Asymptomatic percentage")

resN12 <- residuals(modelN12, type="deviance") 
plot(predict(modelN12), resN12)

plot(log(predict(modelN12)), resN12) 
abline(h=0, lty=2)

influencePlot(modelN12, id=TRUE)

##       StudRes        Hat       CookD
## 2  -4.1523545 0.04720736 0.090787865
## 7  -0.7959162 0.26291309 0.076572680
## 13  0.1289597 0.20442391 0.001718031
## 19  1.4582645 0.11760708 0.133441315
## 22 -2.1442234 0.08889786 0.093640162

Again checking if the Cook’s distance is above the cutoff

cooksD2 <- cooks.distance(modelN12)
influential2 <- cooksD2[(cooksD2 > 4/(N-2))]
influential2

## named numeric(0)

No outlier now!
Also, we checked by removing observation 19 (Lithuania), but the AIC was increased substantialy. So not removing any more observations.

Checking other models…

B2. Poisson (with the variables bcg_ind, bcg_cov, and hdi)

knitr::opts_chunk$set(echo = FALSE) 

modelP1 <- glm(pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s
               + hdi_s, data = data, family=poisson)
summary(modelP1)

## 
## Call:
## glm(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + 
##     hdi_s, family = poisson, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -23.562   -4.038   -1.508    0.338   19.960  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.666915   0.008111 -82.223   <2e-16 ***
## bcg_cov_s    0.249676   0.009054  27.578   <2e-16 ***
## bcg_ind_s   -0.015118   0.007660  -1.974   0.0484 *  
## hdi_s       -0.267646   0.010432 -25.657   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4444.6  on 30  degrees of freedom
## Residual deviance: 2013.2  on 27  degrees of freedom
## AIC: 2229.7
## 
## Number of Fisher Scoring iterations: 4

modelN2 <- stepAIC(modelP1, trace=TRUE)

## Start:  AIC=2229.73
## pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + hdi_s
## 
##             Df Deviance    AIC
## <none>           2013.2 2229.7
## - bcg_ind_s  1   2017.1 2231.6
## - hdi_s      1   2638.1 2852.6
## - bcg_cov_s  1   2791.1 3005.6

summary(modelN2)

## 
## Call:
## glm(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + 
##     hdi_s, family = poisson, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -23.562   -4.038   -1.508    0.338   19.960  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.666915   0.008111 -82.223   <2e-16 ***
## bcg_cov_s    0.249676   0.009054  27.578   <2e-16 ***
## bcg_ind_s   -0.015118   0.007660  -1.974   0.0484 *  
## hdi_s       -0.267646   0.010432 -25.657   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4444.6  on 30  degrees of freedom
## Residual deviance: 2013.2  on 27  degrees of freedom
## AIC: 2229.7
## 
## Number of Fisher Scoring iterations: 4

hist(residuals(modelP1))

hist(residuals(modelN2))

influencePlot(modelN2, id=TRUE) # https://stat.ethz.ch/R-manual/R-devel/library/stats/html/influence.measures.html

##      StudRes        Hat      CookD
## 2  -23.95616 0.04851161   4.916529
## 13  21.93675 0.56886736 165.034980
## 23  31.57194 0.56997027 347.877711

resN1 <- residuals(modelN2, type="deviance") 
plot(predict(modelN2), resN1)

plot(log(predict(modelN2)), resN1) 
abline(h=0, lty=2)

res_standardised <- rstandard(modelN2)
plot(predict(modelN2), res_standardised)

plot(log(predict(modelN2)), res_standardised) 
abline(h=0, lty=2)

resP <- residuals(modelN2, type="pearson") 
plot(predict(modelN2), resP)

plot(log(predict(modelN2)), resP) 
abline(h=0, lty=2)

The AIC and deviance are much higher than the NB model

B3. Quasi-Poisson (with the variables: bcg_ind, bcg_cov, and hdi)

## 
## Call:
## glm(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + 
##     hdi_s, family = quasipoisson, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -23.562   -4.038   -1.508    0.338   19.960  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.66692    0.06496 -10.266 8.06e-11 ***
## bcg_cov_s    0.24968    0.07251   3.443  0.00189 ** 
## bcg_ind_s   -0.01512    0.06135  -0.246  0.80722    
## hdi_s       -0.26765    0.08355  -3.203  0.00347 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 64.15056)
## 
##     Null deviance: 4444.6  on 30  degrees of freedom
## Residual deviance: 2013.2  on 27  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

##      StudRes        Hat      CookD
## 2  -3.230589 0.04851161 0.07664047
## 13  2.834612 0.56886736 2.57261930
## 23  4.883409 0.56997027 5.42283167

The deviance is much higher than the NB model. Thus, we will choose NB model.

(Scaled) Factors affecting the SARS-CoV-2 asymptomatic infection

Ananthu James

08/06/2021

1) Descriptive statistics

Model diagnostics and results

A. Scaling and centering of the relevant predictors

B1. Negative Binomial (with the variables bcg_ind, bcg_cov, and hdi)

B2. Poisson (with the variables bcg_ind, bcg_cov, and hdi)

B3. Quasi-Poisson (with the variables: bcg_ind, bcg_cov, and hdi)