Data collected by: Akshay and Shreya
Outcome: The percentage of asymptomatic patients (PA) among all the SARS-Cov-2 infected individuals
#knitr::opts_chunk$set(echo = FALSE)
modelNB_1 <- MASS::glm.nb(pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + hdi_s, data = data, init.theta = 1.0, link = log)
summary(modelNB_1)
##
## Call:
## MASS::glm.nb(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s +
## bcg_ind_s + hdi_s, data = data, init.theta = 5.924279644,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8234 -0.4890 0.1592 0.3829 2.0288
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.87385 0.07818 -11.178 < 2e-16 ***
## bcg_cov_s 0.15522 0.10387 1.494 0.135
## bcg_ind_s 0.05720 0.09907 0.577 0.564
## hdi_s -0.36321 0.09158 -3.966 7.3e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(5.9243) family taken to be 1)
##
## Null deviance: 63.456 on 30 degrees of freedom
## Residual deviance: 31.109 on 27 degrees of freedom
## AIC: 351.38
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 5.92
## Std. Err.: 1.61
##
## 2 x log-likelihood: -341.382
modelN1 <- stepAIC(modelNB_1, trace=TRUE)
## Start: AIC=349.38
## pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + hdi_s
##
## Df AIC
## - bcg_ind_s 1 347.80
## <none> 349.38
## - bcg_cov_s 1 349.93
## - hdi_s 1 361.22
##
## Step: AIC=347.8
## pa_num ~ bcg_cov_s + hdi_s + offset(log(inf_num))
##
## Df AIC
## <none> 347.80
## - bcg_cov_s 1 350.34
## - hdi_s 1 359.42
summary(modelN1)
##
## Call:
## MASS::glm.nb(formula = pa_num ~ bcg_cov_s + hdi_s + offset(log(inf_num)),
## data = data, init.theta = 5.861041227, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8079 -0.5103 0.1122 0.4252 1.7346
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.87461 0.07856 -11.132 < 2e-16 ***
## bcg_cov_s 0.18637 0.08657 2.153 0.0313 *
## hdi_s -0.34675 0.08581 -4.041 5.32e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(5.861) family taken to be 1)
##
## Null deviance: 62.841 on 30 degrees of freedom
## Residual deviance: 31.230 on 28 degrees of freedom
## AIC: 349.8
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 5.86
## Std. Err.: 1.60
##
## 2 x log-likelihood: -341.804
hist(residuals(modelNB_1))
hist(residuals(modelN1))
# plot_coefs(modelN1, legend.title = "Asymptomatic percentage")
# p1 <- plot_coefs(modelN1, legend.title = "Asymptomatic percentage")
resN1 <- residuals(modelN1, type="deviance")
plot(predict(modelN1), resN1)
plot(log(predict(modelN1)), resN1)
abline(h=0, lty=2)
res_standardised <- rstandard(modelN1)
plot(predict(modelN1), res_standardised)
plot(log(predict(modelN1)), res_standardised)
abline(h=0, lty=2)
resP <- residuals(modelN1, type="pearson")
plot(predict(modelN1), resP)
plot(log(predict(modelN1)), resP)
abline(h=0, lty=2)
influencePlot(modelN1, id=TRUE) # https://stat.ethz.ch/R-manual/R-devel/library/stats/html/influence.measures.html
## StudRes Hat CookD
## 2 -3.0864108 0.05389941 0.068255719
## 7 0.1221218 0.26346250 0.002102628
## 23 1.8148132 0.07864656 0.145596934
## 26 -2.4712724 0.18286758 0.238159288
Observation 23 (Oman) and 26 (Senegal) have high Cook’s distance. Let’s check, if they are above the cutoff of 4/n.
# Finding outliers
N <- nrow(data)
cooksD <- cooks.distance(modelN1)
influential <- cooksD[(cooksD > 4/N)] # Outliers: Cook's distance > 4/n
influential
## 23 26
## 0.1455969 0.2381593
4/N
## [1] 0.1290323
Yes, Oman and Senegal are the outliers.
Let’s redo the regression by removing these outliers
names_of_influential <- names(influential)
outliers <- data[names_of_influential,]
data_without_outliers <- data %>% anti_join(outliers)
modelNB_12 <- MASS::glm.nb(pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s
+ hdi_s, data = data_without_outliers,
init.theta = 1.0, link = log)
summary(modelNB_12)
##
## Call:
## MASS::glm.nb(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s +
## bcg_ind_s + hdi_s, data = data_without_outliers, init.theta = 9.521768616,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4384 -0.4471 0.1544 0.5230 1.3851
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.90507 0.06608 -13.697 < 2e-16 ***
## bcg_cov_s 0.02103 0.09164 0.229 0.8185
## bcg_ind_s 0.17216 0.08819 1.952 0.0509 .
## hdi_s -0.49056 0.08085 -6.068 1.3e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(9.5218) family taken to be 1)
##
## Null deviance: 89.858 on 28 degrees of freedom
## Residual deviance: 29.517 on 25 degrees of freedom
## AIC: 311.5
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 9.52
## Std. Err.: 2.90
##
## 2 x log-likelihood: -301.502
modelN12 <- stepAIC(modelNB_12, trace=TRUE)
## Start: AIC=309.5
## pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + hdi_s
##
## Df AIC
## - bcg_cov_s 1 307.56
## <none> 309.50
## - bcg_ind_s 1 311.49
## - hdi_s 1 332.36
##
## Step: AIC=307.56
## pa_num ~ bcg_ind_s + hdi_s + offset(log(inf_num))
##
## Df AIC
## <none> 307.56
## - bcg_ind_s 1 312.72
## - hdi_s 1 336.90
summary(modelN12)
##
## Call:
## MASS::glm.nb(formula = pa_num ~ bcg_ind_s + hdi_s + offset(log(inf_num)),
## data = data_without_outliers, init.theta = 9.50791347, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4157 -0.3984 0.1243 0.5551 1.4033
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.90681 0.06568 -13.807 < 2e-16 ***
## bcg_ind_s 0.18424 0.06888 2.675 0.00748 **
## hdi_s -0.50130 0.06830 -7.340 2.13e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(9.5079) family taken to be 1)
##
## Null deviance: 89.745 on 28 degrees of freedom
## Residual deviance: 29.537 on 26 degrees of freedom
## AIC: 309.56
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 9.51
## Std. Err.: 2.89
##
## 2 x log-likelihood: -301.557
The AIC of the model is substantially reduced after removing Oman and Senegal.
hist(residuals(modelNB_12))
hist(residuals(modelN12))
plot_coefs(modelN12, legend.title = "Asymptomatic percentage")
p12 <- plot_coefs(modelN12, legend.title = "Asymptomatic percentage")
resN12 <- residuals(modelN12, type="deviance")
plot(predict(modelN12), resN12)
plot(log(predict(modelN12)), resN12)
abline(h=0, lty=2)
influencePlot(modelN12, id=TRUE)
## StudRes Hat CookD
## 2 -4.1523545 0.04720736 0.090787865
## 7 -0.7959162 0.26291309 0.076572680
## 13 0.1289597 0.20442391 0.001718031
## 19 1.4582645 0.11760708 0.133441315
## 22 -2.1442234 0.08889786 0.093640162
Again checking if the Cook’s distance is above the cutoff
cooksD2 <- cooks.distance(modelN12)
influential2 <- cooksD2[(cooksD2 > 4/(N-2))]
influential2
## named numeric(0)
No outlier now!
Also, we checked by removing observation 19 (Lithuania), but the AIC was
increased substantialy. So not removing any more observations.
Checking other models…
knitr::opts_chunk$set(echo = FALSE)
modelP1 <- glm(pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s
+ hdi_s, data = data, family=poisson)
summary(modelP1)
##
## Call:
## glm(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s +
## hdi_s, family = poisson, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -23.562 -4.038 -1.508 0.338 19.960
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.666915 0.008111 -82.223 <2e-16 ***
## bcg_cov_s 0.249676 0.009054 27.578 <2e-16 ***
## bcg_ind_s -0.015118 0.007660 -1.974 0.0484 *
## hdi_s -0.267646 0.010432 -25.657 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4444.6 on 30 degrees of freedom
## Residual deviance: 2013.2 on 27 degrees of freedom
## AIC: 2229.7
##
## Number of Fisher Scoring iterations: 4
modelN2 <- stepAIC(modelP1, trace=TRUE)
## Start: AIC=2229.73
## pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s + hdi_s
##
## Df Deviance AIC
## <none> 2013.2 2229.7
## - bcg_ind_s 1 2017.1 2231.6
## - hdi_s 1 2638.1 2852.6
## - bcg_cov_s 1 2791.1 3005.6
summary(modelN2)
##
## Call:
## glm(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s +
## hdi_s, family = poisson, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -23.562 -4.038 -1.508 0.338 19.960
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.666915 0.008111 -82.223 <2e-16 ***
## bcg_cov_s 0.249676 0.009054 27.578 <2e-16 ***
## bcg_ind_s -0.015118 0.007660 -1.974 0.0484 *
## hdi_s -0.267646 0.010432 -25.657 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4444.6 on 30 degrees of freedom
## Residual deviance: 2013.2 on 27 degrees of freedom
## AIC: 2229.7
##
## Number of Fisher Scoring iterations: 4
hist(residuals(modelP1))
hist(residuals(modelN2))
influencePlot(modelN2, id=TRUE) # https://stat.ethz.ch/R-manual/R-devel/library/stats/html/influence.measures.html
## StudRes Hat CookD
## 2 -23.95616 0.04851161 4.916529
## 13 21.93675 0.56886736 165.034980
## 23 31.57194 0.56997027 347.877711
resN1 <- residuals(modelN2, type="deviance")
plot(predict(modelN2), resN1)
plot(log(predict(modelN2)), resN1)
abline(h=0, lty=2)
res_standardised <- rstandard(modelN2)
plot(predict(modelN2), res_standardised)
plot(log(predict(modelN2)), res_standardised)
abline(h=0, lty=2)
resP <- residuals(modelN2, type="pearson")
plot(predict(modelN2), resP)
plot(log(predict(modelN2)), resP)
abline(h=0, lty=2)
The AIC and deviance are much higher than the NB model
##
## Call:
## glm(formula = pa_num ~ offset(log(inf_num)) + bcg_cov_s + bcg_ind_s +
## hdi_s, family = quasipoisson, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -23.562 -4.038 -1.508 0.338 19.960
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.66692 0.06496 -10.266 8.06e-11 ***
## bcg_cov_s 0.24968 0.07251 3.443 0.00189 **
## bcg_ind_s -0.01512 0.06135 -0.246 0.80722
## hdi_s -0.26765 0.08355 -3.203 0.00347 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 64.15056)
##
## Null deviance: 4444.6 on 30 degrees of freedom
## Residual deviance: 2013.2 on 27 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
## StudRes Hat CookD
## 2 -3.230589 0.04851161 0.07664047
## 13 2.834612 0.56886736 2.57261930
## 23 4.883409 0.56997027 5.42283167
The deviance is much higher than the NB model. Thus, we will choose NB model.