# Show the compact structure (column names, types, first values) of the raw data.
utils::str(object = covbmi)
## tibble [4,805 × 10] (S3: tbl_df/tbl/data.frame)
## $ Colon_C : num [1:4805] 218 259 153 293 222 ...
## $ Lung_C : num [1:4805] 420 335 230 508 284 ...
## $ Breast_C : num [1:4805] 399 504 479 215 405 ...
## $ Prostate_C : num [1:4805] 259 500 442 303 323 ...
## $ Urinary_C : num [1:4805] 259 227 223 371 211 ...
## $ All_Cancer : num [1:4805] 2703 3249 2923 3084 2582 ...
## $ Covid_Case_Weekly: num [1:4805] 1 1 1 1 1 1 1 1 1 1 ...
## $ Body_Mass_Index : Factor w/ 13 levels "18.5-24.9","25.0andup",..: 1 1 1 1 8 8 8 8 8 8 ...
## $ Age_Group : Factor w/ 8 levels "0-17","18-29",..: 1 2 3 4 5 6 7 8 1 2 ...
## $ COVID-19_Deaths : num [1:4805] 0 0 0 0 0 0 0 0 0 0 ...
The variables I am interested in are:
Colon_C
Prostate_C
Lung_C
Breast_C
Urinary_C
All_Cancer
Body_Mass_Index
Age_Group
COVID-19_Deaths
Descriptive statistics numeric
# Build the long-format analysis table `covbmiB`:
#   1. stack the six cancer-rate columns into All_Types_C (name) / C_Rates
#      (value), so every original row is repeated once per cancer type;
#   2. keep rows with more than five COVID-19 deaths;
#   3. drop six of the '35.0-39.x' BMI levels;
#   4. keep the five analysis columns and make All_Types_C a factor.
# NOTE(review): '35.0-39.12' is not in the exclusion list and stays in the
# data -- confirm that this is intended.
# NOTE: unused factor levels are not dropped here, so the excluded BMI levels
# still show up with zero counts in frequency tables built from covbmiB.
covbmiB <- covbmi %>%
  pivot_longer(
    cols = -c(`COVID-19_Deaths`, Age_Group, Body_Mass_Index, Covid_Case_Weekly),
    names_to = "All_Types_C",
    values_to = "C_Rates"
  ) %>%
  filter(
    `COVID-19_Deaths` > 5,
    # `%in%` never returns NA, so guard NA explicitly to keep the same rows
    # that the original chain of `!=` comparisons kept.
    !is.na(Body_Mass_Index),
    !Body_Mass_Index %in% c(
      "35.0-39.10", "35.0-39.11", "35.0-39.13",
      "35.0-39.14", "35.0-39.7", "35.0-39.8"
    )
  ) %>%
  select(All_Types_C, C_Rates, Age_Group, Body_Mass_Index, `COVID-19_Deaths`) %>%
  mutate(All_Types_C = as.factor(All_Types_C))
# Summarise every column except the fifth one (`COVID-19_Deaths`).
summary(covbmiB[-5])
## All_Types_C C_Rates Age_Group Body_Mass_Index
## All_Cancer:1987 Min. :-49216 85andolder:2268 40.0andup:2916
## Breast_C :1987 1st Qu.: 3694 75-84 :2256 30.0andup:2520
## Colon_C :1987 Median : 22501 65-74 :2220 30.0-34.9:2124
## Lung_C :1987 Mean : 56400 50-64 :2202 35.0-39.9:1944
## Prostate_C:1987 3rd Qu.: 68285 40-49 :1320 25.0andup:1890
## Urinary_C :1987 Max. :424048 30-39 : 978 18.5-24.9: 378
## (Other) : 678 (Other) : 150
# Percentage of rows in each Body_Mass_Index level.
table(covbmiB[["Body_Mass_Index"]]) %>% prop.table() * 100
##
## 18.5-24.9 25.0andup 30.0-34.9 30.0andup 35.0-39.10 35.0-39.11 35.0-39.12
## 3.170609 15.853045 17.815803 21.137393 0.000000 0.000000 1.258178
## 35.0-39.13 35.0-39.14 35.0-39.7 35.0-39.8 35.0-39.9 40.0andup
## 0.000000 0.000000 0.000000 0.000000 16.305989 24.458983
# Number of fully duplicated rows.
covbmiB %>% duplicated() %>% sum()
## [1] 0
# Number of missing cells across the whole table.
covbmiB %>% is.na() %>% sum()
## [1] 0
# Print the distinct rows of the analysis table.
covbmiB %>% unique()
## # A tibble: 11,922 × 5
## All_Types_C C_Rates Age_Group Body_Mass_Index `COVID-19_Deaths`
## <fct> <dbl> <fct> <fct> <dbl>
## 1 Colon_C -4267. 65-74 25.0andup 7
## 2 Lung_C 827. 65-74 25.0andup 7
## 3 Breast_C 11926. 65-74 25.0andup 7
## 4 Prostate_C 5847. 65-74 25.0andup 7
## 5 Urinary_C 3067. 65-74 25.0andup 7
## 6 All_Cancer 40744. 65-74 25.0andup 7
## 7 Colon_C -4289. 85andolder 25.0andup 9
## 8 Lung_C 829. 85andolder 25.0andup 9
## 9 Breast_C 11981. 85andolder 25.0andup 9
## 10 Prostate_C 5874. 85andolder 25.0andup 9
## # ℹ 11,912 more rows
# Render the analysis table as an interactive widget
# (presumably DT::datatable -- TODO confirm which package is attached).
covbmiB %>% datatable()
descriptive statistics categorical
Frequency Tables
# Percentage of rows in each Age_Group level.
table(covbmiB[["Age_Group"]]) %>% prop.table() * 100
##
## 0-17 18-29 30-39 40-49 50-64 65-74 75-84
## 0.4529441 5.2340211 8.2033216 11.0719678 18.4700554 18.6210367 18.9229995
## 85andolder
## 19.0236537
# Percentage of rows in each Body_Mass_Index level.
table(covbmiB[["Body_Mass_Index"]]) %>% prop.table() * 100
##
## 18.5-24.9 25.0andup 30.0-34.9 30.0andup 35.0-39.10 35.0-39.11 35.0-39.12
## 3.170609 15.853045 17.815803 21.137393 0.000000 0.000000 1.258178
## 35.0-39.13 35.0-39.14 35.0-39.7 35.0-39.8 35.0-39.9 40.0andup
## 0.000000 0.000000 0.000000 0.000000 16.305989 24.458983
# Percentage of rows for each cancer type.
table(covbmiB[["All_Types_C"]]) %>% prop.table() * 100
##
## All_Cancer Breast_C Colon_C Lung_C Prostate_C Urinary_C
## 16.66667 16.66667 16.66667 16.66667 16.66667 16.66667
# Joint percentages: Body_Mass_Index (rows) by cancer type (columns).
table(covbmiB[["Body_Mass_Index"]], covbmiB[["All_Types_C"]]) %>%
  prop.table() * 100
##
## All_Cancer Breast_C Colon_C Lung_C Prostate_C Urinary_C
## 18.5-24.9 0.5284348 0.5284348 0.5284348 0.5284348 0.5284348 0.5284348
## 25.0andup 2.6421741 2.6421741 2.6421741 2.6421741 2.6421741 2.6421741
## 30.0-34.9 2.9693005 2.9693005 2.9693005 2.9693005 2.9693005 2.9693005
## 30.0andup 3.5228988 3.5228988 3.5228988 3.5228988 3.5228988 3.5228988
## 35.0-39.10 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 35.0-39.11 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 35.0-39.12 0.2096964 0.2096964 0.2096964 0.2096964 0.2096964 0.2096964
## 35.0-39.13 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 35.0-39.14 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 35.0-39.7 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 35.0-39.8 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 35.0-39.9 2.7176648 2.7176648 2.7176648 2.7176648 2.7176648 2.7176648
## 40.0andup 4.0764972 4.0764972 4.0764972 4.0764972 4.0764972 4.0764972
# Joint percentages: Age_Group (rows) by cancer type (columns).
table(covbmiB[["Age_Group"]], covbmiB[["All_Types_C"]]) %>%
  prop.table() * 100
##
## All_Cancer Breast_C Colon_C Lung_C Prostate_C Urinary_C
## 0-17 0.07549069 0.07549069 0.07549069 0.07549069 0.07549069 0.07549069
## 18-29 0.87233686 0.87233686 0.87233686 0.87233686 0.87233686 0.87233686
## 30-39 1.36722027 1.36722027 1.36722027 1.36722027 1.36722027 1.36722027
## 40-49 1.84532797 1.84532797 1.84532797 1.84532797 1.84532797 1.84532797
## 50-64 3.07834256 3.07834256 3.07834256 3.07834256 3.07834256 3.07834256
## 65-74 3.10350612 3.10350612 3.10350612 3.10350612 3.10350612 3.10350612
## 75-84 3.15383325 3.15383325 3.15383325 3.15383325 3.15383325 3.15383325
## 85andolder 3.17060896 3.17060896 3.17060896 3.17060896 3.17060896 3.17060896
# Joint percentages: Age_Group (rows) by Body_Mass_Index (columns).
table(covbmiB[["Age_Group"]], covbmiB[["Body_Mass_Index"]]) %>%
  prop.table() * 100
##
## 18.5-24.9 25.0andup 30.0-34.9 30.0andup 35.0-39.10 35.0-39.11
## 0-17 0.00000000 0.00000000 0.10065425 0.05032713 0.00000000 0.00000000
## 18-29 0.10065425 1.15752391 0.90588827 0.85556115 0.00000000 0.00000000
## 30-39 0.25163563 1.56014092 1.35883241 1.66079517 0.00000000 0.00000000
## 40-49 0.30196276 1.86210367 2.06341218 2.21439356 0.00000000 0.00000000
## 50-64 0.60392552 2.61701057 3.37191746 4.07649723 0.00000000 0.00000000
## 65-74 0.60392552 2.76799195 3.37191746 4.12682436 0.00000000 0.00000000
## 75-84 0.65425264 2.96930045 3.32159034 4.07649723 0.00000000 0.00000000
## 85andolder 0.65425264 2.91897333 3.32159034 4.07649723 0.00000000 0.00000000
##
## 35.0-39.12 35.0-39.13 35.0-39.14 35.0-39.7 35.0-39.8 35.0-39.9
## 0-17 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.05032713
## 18-29 0.05032713 0.00000000 0.00000000 0.00000000 0.00000000 0.60392552
## 30-39 0.05032713 0.00000000 0.00000000 0.00000000 0.00000000 1.05686965
## 40-49 0.15098138 0.00000000 0.00000000 0.00000000 0.00000000 1.45948666
## 50-64 0.25163563 0.00000000 0.00000000 0.00000000 0.00000000 3.12028183
## 65-74 0.25163563 0.00000000 0.00000000 0.00000000 0.00000000 3.27126321
## 75-84 0.30196276 0.00000000 0.00000000 0.00000000 0.00000000 3.37191746
## 85andolder 0.20130851 0.00000000 0.00000000 0.00000000 0.00000000 3.37191746
##
## 40.0andup
## 0-17 0.25163563
## 18-29 1.56014092
## 30-39 2.26472068
## 40-49 3.01962758
## 50-64 4.42878712
## 65-74 4.22747861
## 75-84 4.22747861
## 85andolder 4.47911424
Univariate Visualizations - Quantitative Variables
# Histogram of the COVID-19 death counts (default binning).
ggplot(covbmiB, aes(x = `COVID-19_Deaths`)) +
  geom_histogram(colour = "black", fill = "yellow") +
  ggtitle("Distribution of Covid") +
  xlab("Covid_death_Count")
A graph of the number of Covid-19 deaths reveals a pattern often found in distributions of counts of rare events: many medical centers reported no Covid-19 deaths or very few, while a few reports show an extremely large number of deaths, so the distribution is strongly right-skewed and far from normal. Poisson regression is therefore a natural starting point for modeling these data, since Poisson random variables are often used to represent counts (e.g., number of Covid-19 deaths) per unit of time or space (e.g., weekly).
Let's take a look at the four covariates of interest: All_Types_C, C_Rates, Body_Mass_Index and Age_Group
---------
# Histogram of the cancer rates, pooled over all cancer types (default binning).
ggplot(covbmiB, aes(x = C_Rates)) +
  geom_histogram(colour = "black", fill = "skyblue") +
  ggtitle("Distribution of ALL Cancer Rate Types") +
  xlab("Cancer")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
---
Bivariate visualization
# Box plot of log(COVID-19 deaths) per BMI category, outliers hidden.
# BMI levels are reordered by fct_reorder() on `COVID-19_Deaths` (its default
# summary function is the median), so boxes appear in increasing order.
# The legend is suppressed because the fill colour duplicates the x axis.
pltybox1 <- covbmiB %>%
  mutate(Body_Mass_Index = fct_reorder(Body_Mass_Index, `COVID-19_Deaths`)) %>%
  ggplot(aes(
    x = Body_Mass_Index,
    y = log(`COVID-19_Deaths`),
    fill = Body_Mass_Index
  )) +
  geom_boxplot(outliers = FALSE) +
  theme(
    text = element_text(),
    plot.title = element_text(
      size = 13, color = "darkred", vjust = 1, face = "italic"
    ),
    legend.position = "none",
    axis.text.x = element_text(angle = 45, vjust = .5)
  ) +
  ggtitle("B.M.I to COVID-19_Deaths")
# Interactive (plotly) version of the same plot.
ggplotly(pltybox1)
# Box plot of log(COVID-19 deaths) per age group (outliers shown), with age
# groups reordered by fct_reorder() on `COVID-19_Deaths`; legend suppressed.
pltybox2 <- covbmiB %>%
  mutate(Age_Group = fct_reorder(Age_Group, `COVID-19_Deaths`)) %>%
  ggplot(aes(
    x = Age_Group,
    y = log(`COVID-19_Deaths`),
    fill = Age_Group
  )) +
  geom_boxplot() +
  theme(
    text = element_text(),
    plot.title = element_text(
      size = 13, color = "darkred", vjust = 1, face = "italic"
    ),
    legend.position = "none",
    axis.text.x = element_text(angle = 45, vjust = .5)
  ) +
  ggtitle("Age_Group to COVID-19_Deaths")
# Interactive (plotly) version of the age-group plot.
ggplotly(pltybox2)
# Static display of the BMI and age-group box plots stacked in one figure.
grid.arrange(pltybox1, pltybox2)
Here we can see that, apart from ‘25.0andup’, the medians and IQRs of COVID-19 deaths tend to grow with the Body Mass Index (BMI) category.
The chart above also illustrates a clear trend in how one’s age plays a pivotal role in succumbing to Covid-19, especially from the age of fifty onward.
# Box plot of log(COVID-19 deaths) restricted to the two BMI categories used
# in the regression models ('25.0andup' and '40.0andup'), outliers hidden.
covbmiB %>%
  mutate(Body_Mass_Index = fct_reorder(Body_Mass_Index, `COVID-19_Deaths`)) %>%
  filter(Body_Mass_Index %in% c("25.0andup", "40.0andup")) %>%
  ggplot(aes(
    x = Body_Mass_Index,
    y = log(`COVID-19_Deaths`),
    fill = Body_Mass_Index
  )) +
  geom_boxplot(outliers = FALSE) +
  theme(
    text = element_text(),
    plot.title = element_text(
      size = 13, color = "darkred", vjust = 1, face = "italic"
    ),
    axis.text.x = element_text(angle = 45, vjust = .5)
  ) +
  ggtitle("B.M.I to COVID-19_Deaths")
# Box plot of log(COVID-19 deaths) per cancer type, outliers hidden.
# NOTE: covbmiB repeats each COVID-19_Deaths value once per cancer type (the
# cancer columns were stacked into long format), so every box is drawn from
# the same set of death counts.
pltybox3 <- covbmiB %>%
  mutate(All_Types_C = fct_reorder(All_Types_C, `COVID-19_Deaths`)) %>%
  ggplot(aes(
    x = All_Types_C,
    y = log(`COVID-19_Deaths`),
    fill = All_Types_C
  )) +
  geom_boxplot(outliers = FALSE) +
  theme(
    text = element_text(),
    plot.title = element_text(
      size = 13, color = "darkred", vjust = 1, face = "italic"
    ),
    axis.text.x = element_text(angle = 45, vjust = .5)
  ) +
  ggtitle("Cancer to COVID-19_Deaths")
# Interactive (plotly) version of the cancer-type plot.
ggplotly(pltybox3)
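When we look at the distribution of log Covid-19 deaths across cancer types, there appears to be no trend: no cancer type is associated with more Covid-19 deaths than any other. This is expected, because the long-format reshaping repeats each COVID-19_Deaths value once for every cancer type, so each box is drawn from the same death counts.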
# Scatter plot (jittered) of log(COVID-19 deaths) against the all-cancer rate,
# with a loess smoother and its confidence band. Only the 'All_Cancer' rows
# are kept, so the facet shows a single panel.
pltypoint <- covbmiB %>%
  filter(All_Types_C == "All_Cancer") %>%
  ggplot(aes(
    x = C_Rates,
    y = log(`COVID-19_Deaths`),
    fill = All_Types_C
  )) +
  geom_jitter() +
  geom_smooth(method = "loess", se = TRUE) +
  facet_wrap(~All_Types_C, scales = "free_y", ncol = 2) +
  theme(
    text = element_text(),
    plot.title = element_text(
      size = 13, color = "darkred", vjust = 1, face = "italic"
    ),
    axis.text.x = element_text(angle = 20),
    axis.title.x = element_text(vjust = 1)
  ) +
  ggtitle("Cancer Rate to COVID-19_Deaths")
# Interactive (plotly) version of the scatter plot.
ggplotly(pltypoint)
## `geom_smooth()` using formula = 'y ~ x'
# Poisson GLM of COVID-19 death counts on BMI category, all-cancer rate, their
# interaction, and age group. The model is fitted to the 'All_Cancer' rows of
# the two BMI categories '25.0andup' and '40.0andup' only.
# The glm() arguments are kept exactly as before so the printed Call matches.
covfatality <- glm(
  `COVID-19_Deaths` ~ Body_Mass_Index * C_Rates + Age_Group,
  family = "poisson",
  data = filter(
    covbmiB,
    Body_Mass_Index %in% c("25.0andup", "40.0andup") &
      All_Types_C == "All_Cancer"
  )
)
summary(covfatality)
##
## Call:
## glm(formula = `COVID-19_Deaths` ~ Body_Mass_Index * C_Rates +
## Age_Group, family = "poisson", data = filter(covbmiB, Body_Mass_Index %in%
## c("25.0andup", "40.0andup") & All_Types_C == "All_Cancer"))
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 1.65724159854 0.15819746445 10.476
## Body_Mass_Index40.0andup 1.56450208620 0.01721423210 90.884
## C_Rates 0.00000123599 0.00000002279 54.225
## Age_Group18-29 0.99585027494 0.16103647341 6.184
## Age_Group30-39 1.81071694282 0.15903211314 11.386
## Age_Group40-49 2.59411902248 0.15844964183 16.372
## Age_Group50-64 3.86639396550 0.15818191431 24.443
## Age_Group65-74 4.39208909631 0.15815666761 27.770
## Age_Group75-84 4.60904740974 0.15814899402 29.144
## Age_Group85andolder 4.64358747478 0.15814661710 29.363
## Body_Mass_Index40.0andup:C_Rates -0.00000514853 0.00000006102 -84.375
## Pr(>|z|)
## (Intercept) < 0.0000000000000002 ***
## Body_Mass_Index40.0andup < 0.0000000000000002 ***
## C_Rates < 0.0000000000000002 ***
## Age_Group18-29 0.000000000625 ***
## Age_Group30-39 < 0.0000000000000002 ***
## Age_Group40-49 < 0.0000000000000002 ***
## Age_Group50-64 < 0.0000000000000002 ***
## Age_Group65-74 < 0.0000000000000002 ***
## Age_Group75-84 < 0.0000000000000002 ***
## Age_Group85andolder < 0.0000000000000002 ***
## Body_Mass_Index40.0andup:C_Rates < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 855601 on 800 degrees of freedom
## Residual deviance: 658221 on 790 degrees of freedom
## AIC: 663412
##
## Number of Fisher Scoring iterations: 6
# Generate a plain-language description of the fitted Poisson model.
report::report(x = covfatality)
## We fitted a poisson model (estimated using ML) to predict COVID-19_Deaths with
## Body_Mass_Index, C_Rates and Age_Group (formula: `COVID-19_Deaths` ~
## Body_Mass_Index * C_Rates + Age_Group). The model's explanatory power is
## substantial (Nagelkerke's R2 = 1.00). The model's intercept, corresponding to
## Body_Mass_Index = 18.5-24.9, C_Rates = 0 and Age_Group = 0-17, is at 1.66 (95%
## CI [1.33, 1.95], p < .001). Within this model:
##
## - The effect of Body Mass Index [40.0andup] is statistically significant and
## positive (beta = 1.56, 95% CI [1.53, 1.60], p < .001; Std. beta = 0.34, 95% CI
## [0.33, 0.35])
## - The effect of C Rates is statistically significant and positive (beta =
## 1.24e-06, 95% CI [1.19e-06, 1.28e-06], p < .001; Std. beta = 0.13, 95% CI
## [0.13, 0.14])
## - The effect of Age Group [18-29] is statistically significant and positive
## (beta = 1.00, 95% CI [0.69, 1.33], p < .001; Std. beta = 1.00, 95% CI [0.69,
## 1.33])
## - The effect of Age Group [30-39] is statistically significant and positive
## (beta = 1.81, 95% CI [1.51, 2.14], p < .001; Std. beta = 1.81, 95% CI [1.51,
## 2.14])
## - The effect of Age Group [40-49] is statistically significant and positive
## (beta = 2.59, 95% CI [2.30, 2.92], p < .001; Std. beta = 2.59, 95% CI [2.30,
## 2.92])
## - The effect of Age Group [50-64] is statistically significant and positive
## (beta = 3.87, 95% CI [3.57, 4.19], p < .001; Std. beta = 3.87, 95% CI [3.57,
## 4.19])
## - The effect of Age Group [65-74] is statistically significant and positive
## (beta = 4.39, 95% CI [4.10, 4.72], p < .001; Std. beta = 4.39, 95% CI [4.10,
## 4.72])
## - The effect of Age Group [75-84] is statistically significant and positive
## (beta = 4.61, 95% CI [4.31, 4.94], p < .001; Std. beta = 4.61, 95% CI [4.31,
## 4.94])
## - The effect of Age Group [85andolder] is statistically significant and
## positive (beta = 4.64, 95% CI [4.35, 4.97], p < .001; Std. beta = 4.64, 95% CI
## [4.35, 4.97])
## - The effect of Body Mass Index [40.0andup] × C Rates is statistically
## significant and negative (beta = -5.15e-06, 95% CI [-5.27e-06, -5.03e-06], p <
## .001; Std. beta = -0.55, 95% CI [-0.56, -0.54])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
checking coefficients
# Extract the fitted Poisson coefficients (log scale) and display them.
# NOTE(review): the variable name `coef` shadows stats::coef(); it is kept
# unchanged here in case later code refers to it.
coef <- stats::coef(covfatality)
print(coef)
## (Intercept) Body_Mass_Index40.0andup
## 1.657241598541 1.564502086199
## C_Rates Age_Group18-29
## 0.000001235987 0.995850274941
## Age_Group30-39 Age_Group40-49
## 1.810716942821 2.594119022484
## Age_Group50-64 Age_Group65-74
## 3.866393965496 4.392089096309
## Age_Group75-84 Age_Group85andolder
## 4.609047409737 4.643587474781
## Body_Mass_Index40.0andup:C_Rates
## -0.000005148529
In Poisson regression, the log of the conditional mean of the dependent variable, $\log_e(\lambda)$, is modeled as a linear function of the predictors. The regression parameter for Body_Mass_Index40.0andup indicates that belonging to the ‘40.0andup’ category, rather than the reference category ‘25.0andup’ (the only other BMI level in the model data), is associated with a 1.56 increase in the log mean number of `COVID-19_Deaths`, holding other variables constant (and, because of the interaction term, evaluated at C_Rates = 0). The intercept is the log mean number of `COVID-19_Deaths` when each numeric predictor equals zero and each categorical predictor is at its reference level.
However, it is easier to interpret the regression coefficients on the original scale of the dependent variable (`COVID-19_Deaths`) rather than on the scale of its log mean.
--
Seeing Coefficients in actual scale
# Exponentiate the coefficients to express them as multiplicative effects on
# the expected death count.
covfatality %>% coef() %>% exp()
## (Intercept) Body_Mass_Index40.0andup
## 5.2448235 4.7802942
## C_Rates Age_Group18-29
## 1.0000012 2.7070251
## Age_Group30-39 Age_Group40-49
## 6.1148298 13.3847905
## Age_Group50-64 Age_Group65-74
## 47.7698155 80.8090607
## Age_Group75-84 Age_Group85andolder
## 100.3884750 103.9164774
## Body_Mass_Index40.0andup:C_Rates
## 0.9999949
From the findings, we can say that being in the ‘40.0andup’ BMI category rather than the reference category multiplies the expected number of ‘Covid-19_Deaths’ by 4.7802942 (at C_Rates = 0), being in ‘Age_Group18-29’ rather than the reference age group multiplies the expected number of ‘Covid-19_Deaths’ by 2.7070251, and a one-unit increase in C_Rates multiplies the expected number of Covid-19 deaths by 1.0000012. In the ‘40.0andup’ category, each one-unit increase in C_Rates additionally multiplies the expected number of ‘Covid-19_Deaths’ by 0.9999949 (the Body_Mass_Index40.0andup:C_Rates interaction).
Using the above steps, we obtained a Poisson regression model for predicting the number of ‘Covid-19_Deaths’. However, it is very important to check for overdispersion, because Poisson regression assumes that the variance of the response equals its mean.
-----
Overdispersion occurs when the observed variance of the response variable is larger than would be predicted by the Poisson distribution. Analyzing overdispersion becomes important as it is common with count data, and can negatively impact the final results. In R, one can test to see if overdispersion is a factor using the “qcc” package. The analysis is illustrated below
Testing for Overdispersion
# Overdispersion test of the raw death counts against a Poisson distribution.
# NOTE(review): this uses every row of covbmiB (all BMI levels, and each death
# count repeated once per cancer type), not the filtered subset the Poisson
# model was fitted to -- confirm that this is the intended input.
qcc.overdispersion.test(covbmiB[["COVID-19_Deaths"]], type = "poisson")
##
## Overdispersion test Obs.Var/Theor.Var Statistic p-value
## poisson data 1421.504 16945753 0
The test above yields a p-value below 0.05 (with an observed-to-theoretical variance ratio of about 1421), which strongly suggests the presence of overdispersion. We’ll therefore refit the model with the glm() function, replacing family = “poisson” with family = “quasipoisson”. This is illustrated below.
# Quasi-Poisson refit of the same model on the same subset: the coefficient
# estimates are unchanged, but standard errors are scaled by the estimated
# dispersion parameter.
# The glm() arguments are kept exactly as before so the printed Call matches.
covfatality3 <- glm(
  `COVID-19_Deaths` ~ Body_Mass_Index * C_Rates + Age_Group,
  family = "quasipoisson",
  data = filter(
    covbmiB,
    Body_Mass_Index %in% c("25.0andup", "40.0andup") &
      All_Types_C == "All_Cancer"
  )
)
summary(covfatality3)
##
## Call:
## glm(formula = `COVID-19_Deaths` ~ Body_Mass_Index * C_Rates +
## Age_Group, family = "quasipoisson", data = filter(covbmiB,
## Body_Mass_Index %in% c("25.0andup", "40.0andup") & All_Types_C ==
## "All_Cancer"))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6572415985 5.4616746599 0.303 0.76164
## Body_Mass_Index40.0andup 1.5645020862 0.5943112652 2.632 0.00864
## C_Rates 0.0000012360 0.0000007869 1.571 0.11667
## Age_Group18-29 0.9958502749 5.5596897790 0.179 0.85789
## Age_Group30-39 1.8107169428 5.4904904166 0.330 0.74164
## Age_Group40-49 2.5941190225 5.4703809362 0.474 0.63548
## Age_Group50-64 3.8663939655 5.4611378004 0.708 0.47916
## Age_Group65-74 4.3920890963 5.4602661732 0.804 0.42142
## Age_Group75-84 4.6090474097 5.4600012470 0.844 0.39884
## Age_Group85andolder 4.6435874748 5.4599191850 0.850 0.39531
## Body_Mass_Index40.0andup:C_Rates -0.0000051485 0.0000021067 -2.444 0.01475
##
## (Intercept)
## Body_Mass_Index40.0andup **
## C_Rates
## Age_Group18-29
## Age_Group30-39
## Age_Group40-49
## Age_Group50-64
## Age_Group65-74
## Age_Group75-84
## Age_Group85andolder
## Body_Mass_Index40.0andup:C_Rates *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 1191.935)
##
## Null deviance: 855601 on 800 degrees of freedom
## Residual deviance: 658221 on 790 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 6
Upon running the glm() model with family = ‘quasipoisson’,
we can see that the parameter estimates in the quasi-Poisson approach are identical to those produced by the Poisson approach, though the standard errors differ between the two approaches.
We now have a situation where C_Rates and every Age_Group level (‘18-29’, ‘30-39’, ‘40-49’, ‘50-64’, ‘65-74’, ‘75-84’ and ‘85andolder’) are no longer statistically significant in the model, but this is only because the standard errors are now much larger.
## We fitted a poisson model (estimated using ML) to predict COVID-19_Deaths with
## Body_Mass_Index, C_Rates and Age_Group (formula: `COVID-19_Deaths` ~
## Body_Mass_Index * C_Rates + Age_Group). The model's explanatory power is
## substantial (Nagelkerke's R2 = 1.00). The model's intercept, corresponding to
## Body_Mass_Index = 18.5-24.9, C_Rates = 0 and Age_Group = 0-17, is at 1.66 (95%
## CI [, 5.72], t(790) = 0.30, p = 0.762). Within this model:
##
## - The effect of Body Mass Index [40.0andup] is statistically significant and
## positive (beta = 1.56, 95% CI [0.43, 2.76], t(790) = 2.63, p = 0.008; Std. beta
## = 0.34, 95% CI [0.02, 0.68])
## - The effect of C Rates is statistically non-significant and positive (beta =
## 1.24e-06, 95% CI [-3.41e-07, 2.75e-06], t(790) = 1.57, p = 0.116; Std. beta =
## 0.13, 95% CI [-0.04, 0.29])
## - The effect of Age Group [18-29] is statistically non-significant and positive
## (beta = 1.00, 95% CI [-4.49, ], t(790) = 0.18, p = 0.858; Std. beta = 1.00, 95%
## CI [-4.49, ])
## - The effect of Age Group [30-39] is statistically non-significant and positive
## (beta = 1.81, 95% CI [-2.78, ], t(790) = 0.33, p = 0.742; Std. beta = 1.81, 95%
## CI [-2.78, ])
## - The effect of Age Group [40-49] is statistically non-significant and positive
## (beta = 2.59, 95% CI [-1.65, ], t(790) = 0.47, p = 0.635; Std. beta = 2.59, 95%
## CI [-1.65, ])
## - The effect of Age Group [50-64] is statistically non-significant and positive
## (beta = 3.87, 95% CI [-0.19, ], t(790) = 0.71, p = 0.479; Std. beta = 3.87, 95%
## CI [-0.19, ])
## - The effect of Age Group [65-74] is statistically non-significant and positive
## (beta = 4.39, 95% CI [0.35, ], t(790) = 0.80, p = 0.421; Std. beta = 4.39, 95%
## CI [0.35, ])
## - The effect of Age Group [75-84] is statistically non-significant and positive
## (beta = 4.61, 95% CI [0.57, ], t(790) = 0.84, p = 0.399; Std. beta = 4.61, 95%
## CI [0.57, ])
## - The effect of Age Group [85andolder] is statistically non-significant and
## positive (beta = 4.64, 95% CI [0.61, ], t(790) = 0.85, p = 0.395; Std. beta =
## 4.64, 95% CI [0.61, ])
## - The effect of Body Mass Index [40.0andup] × C Rates is statistically
## significant and negative (beta = -5.15e-06, 95% CI [-9.40e-06, -1.13e-06],
## t(790) = -2.44, p = 0.015; Std. beta = -0.55, 95% CI [-1.01, -0.12])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
-----
Conclusion:
In this session, I have performed an Exploratory Data Analysis (EDA) on a data set of six levels of Body Mass Index, age groups and their possible role in contributing to COVID-19 deaths, using Poisson regression. The data set contains ‘All_Types_C’, ‘C_Rates’, ‘Body_Mass_Index’, ‘Age_Group’ and ‘COVID-19_Deaths’. The EDA consists of the following steps: · Loading and inspecting the data · Summarizing the data using descriptive statistics · Visualizing the data using charts and graphs · Checking whether my statistical model works or whether its assumptions have in fact been violated · Testing for overdispersion and refitting the model.
Poisson regression is based on the Poisson distribution. It belongs to the family of regression techniques that combines properties of both linear and logistic regression. However, unlike logistic regression, which models a binary outcome, it is used to predict a discrete count variable.
I am looking forward to repeating this same analysis sometime soon but with even more possible comorbidities and looking for possible interactions in order to further see how much of a significant impact they could have in contributing to Covid-19 deaths.