library(gtsummary)
library(readr) #Read Tabular Dat
library(tidyverse)
library(magrittr) #Pipe Operators
library(stringr) #Work with Strings
library(forcats) #Work with Factors
library(skimr) #Data Summary
library(patchwork) #ggplot grids
library(GGally) #Scatterplot Matrice
library(VIM)
library(naniar)
library(gt)

load the dataset

out_new<- vroom::vroom("bloodpressure.csv")
sapply(out_new[1,],class)
#>                Patient_Number    Blood_Pressure_Abnormality 
#>                     "numeric"                     "numeric" 
#>           Level_of_Hemoglobin  Genetic_Pedigree_Coefficient 
#>                     "numeric"                     "numeric" 
#>                           Age                           BMI 
#>                     "numeric"                     "numeric" 
#>                           Sex                     Pregnancy 
#>                     "numeric"                     "numeric" 
#>                       Smoking             Physical_activity 
#>                     "numeric"                     "numeric" 
#>      salt_content_in_the_diet   alcohol_consumption_per_day 
#>                     "numeric"                     "numeric" 
#>               Level_of_Stress        Chronic_kidney_disease 
#>                     "numeric"                     "numeric" 
#> Adrenal_and_thyroid_disorders 
#>                     "numeric"

colSums(is.na(out_new))
#>                Patient_Number    Blood_Pressure_Abnormality 
#>                             0                             0 
#>           Level_of_Hemoglobin  Genetic_Pedigree_Coefficient 
#>                             0                            92 
#>                           Age                           BMI 
#>                             0                             0 
#>                           Sex                     Pregnancy 
#>                             0                          1558 
#>                       Smoking             Physical_activity 
#>                             0                             0 
#>      salt_content_in_the_diet   alcohol_consumption_per_day 
#>                             0                           242 
#>               Level_of_Stress        Chronic_kidney_disease 
#>                             0                             0 
#> Adrenal_and_thyroid_disorders 
#>                             0

Data dictionery

knitr::include_graphics("data_dictionary.png")

recode the dataset

data_new<- out_new |> 
  mutate(Blood_Pressure_Abnormality= ifelse(Blood_Pressure_Abnormality==0,"Normal","Abnormal")) |> 
  mutate(Sex = ifelse(Sex==0,"Male","Female")) |> 
  mutate(Pregnancy=ifelse(Pregnancy==0,"No","Yes")) |> 
  mutate(Smoking=ifelse(Smoking==0,"No","Yes")) |>
  mutate(Chronic_kidney_disease=ifelse(Chronic_kidney_disease==0,"No","Yes")) |> 
  mutate(Adrenal_and_thyroid_disorders=ifelse(Adrenal_and_thyroid_disorders==0,"No","Yes")) |> 
  mutate(Level_of_Stress=case_when(Level_of_Stress==1~"Less",
                                   Level_of_Stress==2~"Normal",
                                   Level_of_Stress==3~"High"))
data_new %>% 
  head() %>% 
  gt() %>%
  tab_header(title = md("**First 6 rows of bloodPressure data**"))  %>%
  tab_options(container.height = 400,
              container.overflow.y = TRUE,
              heading.background.color = "#21908CFF",
              table.width = "75%",
              column_labels.background.color = "black",
              table.font.color = "black") %>%
  tab_style(style = list(cell_fill(color = "#35B779FF")),
            locations = cells_body())
First 6 rows of bloodPressure data
Patient_Number Blood_Pressure_Abnormality Level_of_Hemoglobin Genetic_Pedigree_Coefficient Age BMI Sex Pregnancy Smoking Physical_activity salt_content_in_the_diet alcohol_consumption_per_day Level_of_Stress Chronic_kidney_disease Adrenal_and_thyroid_disorders
1 Abnormal 11.28 0.90 34 23 Female Yes No 45961 48071 NA Normal Yes Yes
2 Normal 9.75 0.23 54 33 Female NA No 26106 25333 205 High No No
3 Abnormal 10.79 0.91 70 49 Male NA No 9995 29465 67 Normal Yes No
4 Normal 11.00 0.43 71 50 Male NA No 10635 7439 242 Less No No
5 Abnormal 14.17 0.83 52 19 Male NA No 15619 49644 397 Normal No No
6 Normal 11.64 0.54 23 48 Male NA Yes 27042 7513 NA High No No

aggr(out_new, numbers = TRUE, prop = FALSE)

# get the proportion of missing data 
prop_miss(out_new)
#> [1] 0.06306667

# get the summary of missing data
miss_var_summary(out_new)

the variable pregrancy is going to be dropped due to high proportion of missing values

Explanatory data analysis

data_new %>% 
  group_by(Blood_Pressure_Abnormality, Smoking) %>% 
  summarize(count= n()) %>% 
  arrange(Blood_Pressure_Abnormality, Smoking)  %>% 
  ggplot(aes(x = Smoking, y = count, fill = Blood_Pressure_Abnormality)) +
  geom_col(position= "dodge", color="white",linewidth=0.1) +
  geom_text(aes(label = comma(count)), position=position_dodge(width=1),
            hjust = 2, vjust=.5, colour = "white", size=3.5) +
  coord_flip() +
  tvthemes::scale_fill_avatar()+
  labs(title="Difference in proportion by sex",fill = "Patients", 
       subtitle="Number of abnormalities per sex", 
       y="\nTotal Number of cases") +
  theme(plot.title = element_text(family="lato", face="bold", vjust = -.5),
        plot.subtitle = element_text(size = 9, family="lato",face="italic"),
        plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0, size= 8, family="lato",face="italic"),
        axis.title.x = element_text(size=9),
        axis.title.y = element_blank(),
        axis.text.y = element_text(family = "lato"),
        axis.line.y.left = element_line(color = "black"),
        axis.line.x.bottom = element_line(color = "gray"),
        ##axis.ticks.length = unit(0, "mm"),
        panel.background = element_rect(fill = "white"),
        panel.border = element_rect(fill = NA, color = "white"),
        panel.grid.major.x = element_line(color = "#A8BAC4", size = 0.3),
        panel.grid.minor.y = element_blank(),
        )  +
  scale_y_continuous(labels = scales::comma, expand=c(0,0)) +
  scale_x_discrete(expand=c(0,1)) +
  guides(fill = guide_legend(reverse=TRUE))

iv_rates <- data_new |>
  group_by(Blood_Pressure_Abnormality) |>
  summarize(count = n()) |> 
  mutate(prop = count/sum(count)) |>
  ungroup() 

plot<-iv_rates |>
  ggplot(aes(x=Blood_Pressure_Abnormality, y=prop, fill=Blood_Pressure_Abnormality)) + 
  geom_col(color="black",width = 0.5)+ 
  theme(legend.position="bottom") + 
  geom_label(aes(label=scales::percent(prop)), color="white") + 
  labs(
    title = "Abnormality ratio",
    y = "proportion(%)", 
    x = "",
    fill="status") + 
  scale_y_continuous(labels = scales::percent)+ 
   paletteer::scale_fill_paletteer_d("ggsci::default_nejm")+
  ggthemes::theme_fivethirtyeight() +
  theme(legend.position = 'right')
plot

Bar Plot

data_new %>% 
  count(Level_of_Stress,Blood_Pressure_Abnormality) %>% 
  ggplot(mapping = aes(x = Level_of_Stress, y = n,fill=Level_of_Stress)) +
  geom_col(show.legend = FALSE, alpha = 0.7, width = 0.5) +
  facet_wrap(~Blood_Pressure_Abnormality)+
  geom_text(aes(label = n, y = n)) +
  paletteer::scale_fill_paletteer_d("ggsci::default_nejm")

data_new |> 
  select_if(is.numeric) %>% 
  dplyr::select(-Patient_Number) |>
  gather() %>% 
  ggplot(aes(value, fill = key)) +
  geom_histogram(alpha = 0.7,bins=30) +
  facet_wrap(~key, scales = 'free') +
  labs(title = 'Count distributions for numeric variables') +
  theme(legend.position = 'none')

library(ggpubr)
library(ggridges)
a <- data_new %>% 
  add_count(Blood_Pressure_Abnormality) %>% 
  mutate(Blood_Pressure_Abnormality = fct_reorder(Blood_Pressure_Abnormality, n)) %>%
  group_by(Blood_Pressure_Abnormality) %>% 
  ggplot(aes(Genetic_Pedigree_Coefficient, Blood_Pressure_Abnormality, fill = Blood_Pressure_Abnormality)) +
  geom_density_ridges() +
  labs(title = 'Density of the GPC by Abnormality status',
       x= 'GPC',
       y= '') +
  theme(legend.position = 'none') +
  scale_fill_viridis_d(alpha = 0.8) +
  theme(panel.grid.minor = element_blank(), panel.border = element_blank())
b <- data_new %>% 
  add_count(Blood_Pressure_Abnormality) %>% 
  mutate(Blood_Pressure_Abnormality = fct_reorder(Blood_Pressure_Abnormality, n)) %>%
  group_by(Blood_Pressure_Abnormality) %>% 
  ggplot(aes(BMI, Blood_Pressure_Abnormality, fill = Blood_Pressure_Abnormality)) +
  geom_density_ridges() +
  # scale_y_discrete(expand = c(0.01, 0)) +
  # scale_x_continuous(expand = c(0.01, 0)) +
  #theme_ridges() + 
  labs(title = 'Density of the BMI by Blood pressure abnormality',
       x= 'BMI',
       y= '') +
  theme(legend.position = 'none') +
  scale_fill_viridis_d(alpha = 0.8) +
  theme(panel.grid.minor = element_blank(), panel.border = element_blank())

(a  / b)

## Picking joint bandwidth of 0.00625
## Picking joint bandwidth of 1.57

Statistical analysis

library(sjPlot)
sjt.xtab(data_new$Blood_Pressure_Abnormality,data_new$Smoking,                 # row var, column var 
         show.row.prc=TRUE,                                        # show row percents  
         show.summary=FALSE,                                       # omit summary statistics (for now) 
         title="Cross Tab Smoking x Blood pressure")       
Cross Tab Smoking x Blood pressure
Blood_Pressure_Abnormality Smoking Total
No Yes
Abnormal 478
48.4 %
509
51.6 %
987
100 %
Normal 503
49.7 %
510
50.3 %
1013
100 %
Total 981
49 %
1019
51 %
2000
100 %
library(compareGroups) 
results <- compareGroups(Blood_Pressure_Abnormality ~ Smoking, data = data_new)                   # do calculations 
readytable <- createTable(results, show.ratio=TRUE, show.p.overall = FALSE)       # produce table 
print(readytable, header.labels = c(p.ratio = "p-value"))   
#> 
#> --------Summary descriptives table by 'Blood_Pressure_Abnormality'---------
#> 
#> _________________________________________________________ 
#>           Abnormal     Normal           OR        p-value 
#>             N=987      N=1013                             
#> ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
#> Smoking:                                                  
#>     No   478 (48.4%) 503 (49.7%)       Ref.        Ref.   
#>     Yes  509 (51.6%) 510 (50.3%) 0.95 [0.80;1.13]  0.584  
#> ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Chi squared tests

data_new|> 
  dplyr::select(Blood_Pressure_Abnormality,Sex,Level_of_Stress,
         Chronic_kidney_disease,Adrenal_and_thyroid_disorders) |> 
  mutate(Chronic_kidney_disease=
            factor(Chronic_kidney_disease,levels=c("No","Yes"))) |> 
  tbl_summary(
    by = Blood_Pressure_Abnormality,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} / {N} ({p}%)"), 
    type=list(Chronic_kidney_disease~"categorical",
              Adrenal_and_thyroid_disorders~"categorical"),
    label = Level_of_Stress ~ "Level of Stress")|> 
  add_p(test = all_continuous() ~ "t.test",
        pvalue_fun = function(x) style_pvalue(x, digits = 2))|> 
   modify_header(statistic ~ "**Test Statistic**")|>
  bold_labels()|> 
  modify_fmt_fun(statistic ~ style_sigfig) 
Characteristic Abnormal, N = 9871 Normal, N = 1,0131 Test Statistic p-value2
Sex 6.0 0.014
    Female 517 / 987 (52%) 475 / 1,013 (47%)
    Male 470 / 987 (48%) 538 / 1,013 (53%)
Level of Stress 0.34 0.84
    High 347 / 987 (35%) 344 / 1,013 (34%)
    Less 327 / 987 (33%) 339 / 1,013 (33%)
    Normal 313 / 987 (32%) 330 / 1,013 (33%)
Chronic_kidney_disease 1,137 <0.001
    No 274 / 987 (28%) 1,013 / 1,013 (100%)
    Yes 713 / 987 (72%) 0 / 1,013 (0%)
Adrenal_and_thyroid_disorders 871 <0.001
    No 391 / 987 (40%) 1,013 / 1,013 (100%)
    Yes 596 / 987 (60%) 0 / 1,013 (0%)
1 n / N (%)
2 Pearson’s Chi-squared test

Comments

  • Sex ,Chronic kidney disease and Adrenal and thyroid disorders are all significantly associated with Abnormal blood pressure
#Correlation Matrix Plot
subset<- data_new |>
  dplyr::select(Age,BMI,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Blood_Pressure_Abnormality)# Bring in external file for visualisations
source('functions/visualisations.R')

# Use plot function
plot <- histoplotter(subset,Blood_Pressure_Abnormality, 
                     chart_x_axis_lbl = "Abnormality status", 
                     chart_y_axis_lbl = 'Measures',
                     boxplot_color = 'navy', 
                     boxplot_fill = '#89CFF0', 
                     box_fill_transparency = 0.2) 

# Add extras to plot
plot + 
  tvthemes::theme_theLastAirbender() +
 paletteer::scale_color_paletteer_d("ggsci::default_nejm")+
  theme(legend.position = 'top') 

T test

data_new|> 
  dplyr::select(Age,BMI,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Blood_Pressure_Abnormality) |> 
  tbl_summary(
    by = Blood_Pressure_Abnormality,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} / {N} ({p}%)"), 
    label = BMI ~ "BMI")|> 
  add_p(test = all_continuous() ~ "t.test",
        pvalue_fun = function(x) style_pvalue(x, digits = 2))|> 
   modify_header(statistic ~ "**Test Statistic**")|>
  bold_labels()|> 
  modify_fmt_fun(statistic ~ style_sigfig) 
Characteristic Abnormal, N = 9871 Normal, N = 1,0131 Test Statistic p-value2
Age 45 (17) 48 (17) -3.0 0.003
BMI 31 (12) 30 (12) 1.8 0.072
Physical_activity 25,793 (14,168) 24,730 (13,852) 1.7 0.090
salt_content_in_the_diet 25,130 (14,339) 24,727 (14,091) 0.63 0.53
alcohol_consumption_per_day 254 (144) 248 (143) 0.87 0.38
    Unknown 146 96
Level_of_Hemoglobin 12.02 (2.75) 11.41 (1.37) 6.2 <0.001
Genetic_Pedigree_Coefficient 0.49 (0.35) 0.50 (0.21) -1.5 0.14
    Unknown 30 62
1 Mean (SD)
2 Welch Two Sample t-test

Comments

  • there is significant difference in mean BMI and mean haemoglobin level implying that the differences in the averages is not merely an outcome of chance
  • lets explore more

model_data <- data_new |> 
  dplyr::select(-Patient_Number,-Pregnancy) |> 
  mutate(Blood_Pressure_Abnormality=ifelse(Blood_Pressure_Abnormality=="Normal",0,1),
         Blood_Pressure_Abnormality=as.numeric(Blood_Pressure_Abnormality))

Fitting a Binary logistic regression

Fit a full model

full_model<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
                data=model_data,family=binomial(link="logit"))
summary(full_model)
#> 
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ . - Genetic_Pedigree_Coefficient - 
#>     alcohol_consumption_per_day, family = binomial(link = "logit"), 
#>     data = model_data)
#> 
#> Coefficients:
#>                                    Estimate Std. Error z value Pr(>|z|)    
#> (Intercept)                      -7.031e+00  9.823e-01  -7.158 8.20e-13 ***
#> Level_of_Hemoglobin               3.584e-01  6.701e-02   5.349 8.85e-08 ***
#> Age                               4.031e-03  7.463e-03   0.540   0.5891    
#> BMI                               5.638e-04  9.528e-03   0.059   0.9528    
#> SexMale                          -4.655e-01  2.380e-01  -1.956   0.0504 .  
#> SmokingYes                        2.683e-01  2.248e-01   1.194   0.2326    
#> Physical_activity                 1.771e-05  8.226e-06   2.153   0.0314 *  
#> salt_content_in_the_diet         -4.453e-07  7.980e-06  -0.056   0.9555    
#> Level_of_StressLess              -1.930e-01  2.797e-01  -0.690   0.4902    
#> Level_of_StressNormal             2.420e-01  2.659e-01   0.910   0.3627    
#> Chronic_kidney_diseaseYes         2.265e+01  9.363e+02   0.024   0.9807    
#> Adrenal_and_thyroid_disordersYes  2.232e+01  9.809e+02   0.023   0.9818    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2323.50  on 1676  degrees of freedom
#> Residual deviance:  583.78  on 1665  degrees of freedom
#>   (323 observations deleted due to missingness)
#> AIC: 607.78
#> 
#> Number of Fisher Scoring iterations: 20

look at the odds

# create coefficient table with estimates, p-values and odds ratios
coefs<-summary(full_model)$coefficients

(full_coefs <- cbind(coefs[ ,c("Estimate", "Pr(>|z|)")], 
                     odds_ratio = exp(full_model$coefficients))) 
#>                                       Estimate     Pr(>|z|)   odds_ratio
#> (Intercept)                      -7.030851e+00 8.198296e-13 8.841791e-04
#> Level_of_Hemoglobin               3.584342e-01 8.849354e-08 1.431087e+00
#> Age                               4.030797e-03 5.891392e-01 1.004039e+00
#> BMI                               5.637909e-04 9.528153e-01 1.000564e+00
#> SexMale                          -4.655083e-01 5.044978e-02 6.278159e-01
#> SmokingYes                        2.682738e-01 2.326488e-01 1.307705e+00
#> Physical_activity                 1.770756e-05 3.135208e-02 1.000018e+00
#> salt_content_in_the_diet         -4.453453e-07 9.554923e-01 9.999996e-01
#> Level_of_StressLess              -1.929816e-01 4.901519e-01 8.244971e-01
#> Level_of_StressNormal             2.420288e-01 3.626761e-01 1.273831e+00
#> Chronic_kidney_diseaseYes         2.265090e+01 9.807002e-01 6.873214e+09
#> Adrenal_and_thyroid_disordersYes  2.231799e+01 9.818472e-01 4.926975e+09

Comments

  • as would be expected people with Chronic_kidney_disease and Adrenal and thyroid disorders have high odds of developing abnormal blood pressure
  • people with less stress have decreased odds of developing abnormal blood pressure as compared to their counterparts
  • Smoking increases the odds of developing abnormal blood pressure
  • Males have decreased chances of developing abnormal blood pressure as compared to women

Model diagnostics

  • McFadden’s \(R^2\) works by comparing the likelihood function of the fitted model with that of a random model and using this to estimate the explained variance in the outcome.
  • Cox and Snell’s \(R^2\) works by applying a ‘sum of squares’ analogy to the likelihood functions to align more closely with the precise methodology for calculating \(R^2\) in linear regression. However, this usually means that the maximum value is less than 1 and in certain circumstances substantially less than 1, which can be problematic and unintuitive for an \(R^2\).
  • Nagelkerke’s \(R^2\) resolves the issue with the upper bound for Cox and Snell by dividing Cox and Snell’s \(R^2\) by its upper bound. This restores an intuitive scale with a maximum of 1, but is considered somewhat arbitrary with limited theoretical foundation.
  • Tjur’s \(R^2\) is a more recent and simpler concept. It is defined as simply the absolute difference between the predicted probabilities of the positive observations and those of the negative observations.

Standard modeling functions generally do not offer the calculation of pseudo-\(R^2\) as standard, but numerous methods are available for their calculation. For example:

library(DescTools)
DescTools::PseudoR2(
  full_model, 
  which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
#>   McFadden   CoxSnell Nagelkerke       Tjur 
#>  0.7487511  0.6456256  0.8610597  0.8080701

We see that the Cox and Snell variant is notably lower than the other estimates, which is consistent with the known issues with its upper bound. However, the other estimates are reasonably aligned and suggest a strong fit.

Model Selection

fit the logistic null models

# define function to run stroke model and glance at results
blood_model_pvals <- function(form, df) {
  model <- glm(formula = form, data = df)
  broom::tidy(model)
}
# create model formula column
formula <- c(
  "Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders", 
  "Blood_Pressure_Abnormality ~ Level_of_Hemoglobin",
  "Blood_Pressure_Abnormality ~ Level_of_Stress",
  "Blood_Pressure_Abnormality ~ Age",
  "Blood_Pressure_Abnormality ~ Sex",
  "Blood_Pressure_Abnormality ~ Chronic_kidney_disease",
  "Blood_Pressure_Abnormality ~ Physical_activity",
  "Blood_Pressure_Abnormality ~ BMI",
  "Blood_Pressure_Abnormality ~ salt_content_in_the_diet",
  "Blood_Pressure_Abnormality ~ Smoking"
)
# create dataframe
models <- data.frame(formula)

view p-values

options(scipen=999)
# run models and glance at results
mods<-models |>
  dplyr::group_by(formula) |>
  dplyr::summarise(blood_model_pvals(formula, model_data)) |> 
  dplyr::select(formula,p.value,term) |>
  mutate(p.value=round(p.value,4)) |> 
  filter(term!="(Intercept)") |> 
  arrange(p.value)
mods

Based on P-values

lr_fitted_add <-mods |> 
  mutate(Significance = ifelse(p.value < 0.05, 
                               "Significant", "Insignificant"))|> 
  arrange(desc(p.value)) 
#Create a ggplot object to visualise significance
plot <- lr_fitted_add|> 
  ggplot(mapping = aes(x=fct_reorder(term,p.value), y=p.value, fill=Significance)) +
  geom_col() + 
  ggthemes::scale_fill_tableau() +
  theme(axis.text.x = element_text(face="bold", 
                                   color="#0070BA",
                                   size=8, 
                                   angle=60)) +
  geom_hline(yintercept = 0.05, col = "black", lty = 2) +
  labs(y="P value", 
       x="Terms",
       title="P value significance chart",
       subtitle="significant variables in the model",
       caption="Produced by Bongani Ncube")

plotly::ggplotly(plot) 

explanation

  • the output shows p-values for the univariate models arranged from smallest to largest
  • Adrenal_and_thyroid_disordersYes and Chronic_kidney_diseaseYes has the least p-value followed by level of haemoglobin

Based on AIC

blood_model_information <- function(form, df) {
  model <- glm(formula = form, data = df)
  broom::glance(model)
}
# run models and glance at results
mods1<-  models |>
  dplyr::group_by(formula) |>
  dplyr::summarise(blood_model_information(formula, model_data)) |> 
  dplyr::select(formula,AIC,BIC,deviance,logLik) |> 
  separate(formula,c("response","term"),sep="~") |> 
  arrange(AIC)
mods1

Based on Akaike Information Criterion

lr_fitted_add <-mods1
#Create a ggplot object to visualise significance
plot <- lr_fitted_add|> 
  ggplot(mapping = aes(x=fct_reorder(term,AIC), y=AIC)) +
  geom_col() + 
  ggthemes::scale_fill_canva() +
  theme(axis.text.x = element_text(face="bold", 
                                   color="#0070BA",
                                   size=8, 
                                   angle=40)) +
  geom_hline(yintercept = 0.05, col = "black", lty = 2) +
  labs(y="Akaike Information Criteria", 
       x="term",
       title="AIC chart",
       subtitle="significant variables in the model",
       caption="Produced by Bongani Ncube")

plotly::ggplotly(plot) 

Comments

  • variables with the least p.value and AIC are more likely to be included first in model building

Optimal model building

add chronic kidney disease

model0<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders,data=model_data,family=binomial)
model1<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease,data=model_data,family=binomial)
anova(model0,model1,test="LRT")
  • chronic kidney disease should be included because the model performs better than without it

add haemoglobin level

model2<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin,data=model_data,family=binomial)
anova(model1,model2,test="LRT")
  • level of haemoglobin is a significant variable and should be included

add sex

model3<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex,data=model_data,family=binomial)
anova(model2,model3,test="LRT")
  • Pr(>Chi) is greater than 0.05 so we drop sex

lets try age

model4<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Age,data=model_data,family=binomial)
anova(model2,model4,test="LRT")
  • age is not signifinant so we drop it

try Physical_activity

model5<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Physical_activity,data=model_data,family=binomial)
anova(model2,model5,test="LRT")

this would go on and on but we can use stepwise algorithms to make it fast

library(MASS)
full_model<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
                data=model_data,family=binomial(link="logit"))

full_model1<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
                data=full_model$model,family=binomial(link="logit"))

step<-stepAIC(full_model1, direction = "both")
#> Start:  AIC=607.78
#> Blood_Pressure_Abnormality ~ (Level_of_Hemoglobin + Genetic_Pedigree_Coefficient + 
#>     Age + BMI + Sex + Smoking + Physical_activity + salt_content_in_the_diet + 
#>     alcohol_consumption_per_day + Level_of_Stress + Chronic_kidney_disease + 
#>     Adrenal_and_thyroid_disorders) - Genetic_Pedigree_Coefficient - 
#>     alcohol_consumption_per_day
#> 
#>                                 Df Deviance     AIC
#> - salt_content_in_the_diet       1   583.78  605.78
#> - BMI                            1   583.78  605.78
#> - Age                            1   584.07  606.07
#> - Level_of_Stress                2   586.29  606.29
#> - Smoking                        1   585.21  607.21
#> <none>                               583.78  607.78
#> - Sex                            1   587.62  609.62
#> - Physical_activity              1   588.48  610.48
#> - Level_of_Hemoglobin            1   611.85  633.85
#> - Adrenal_and_thyroid_disorders  1  1097.92 1119.92
#> - Chronic_kidney_disease         1  1354.30 1376.30
#> 
#> Step:  AIC=605.78
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Age + BMI + 
#>     Sex + Smoking + Physical_activity + Level_of_Stress + Chronic_kidney_disease + 
#>     Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> - BMI                            1   583.78  603.78
#> - Age                            1   584.07  604.07
#> - Level_of_Stress                2   586.29  604.29
#> - Smoking                        1   585.22  605.22
#> <none>                               583.78  605.78
#> - Sex                            1   587.62  607.62
#> + salt_content_in_the_diet       1   583.78  607.78
#> - Physical_activity              1   588.50  608.50
#> - Level_of_Hemoglobin            1   611.85  631.85
#> - Adrenal_and_thyroid_disorders  1  1099.79 1119.79
#> - Chronic_kidney_disease         1  1354.36 1374.36
#> 
#> Step:  AIC=603.78
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Age + Sex + 
#>     Smoking + Physical_activity + Level_of_Stress + Chronic_kidney_disease + 
#>     Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> - Age                            1   584.08  602.08
#> - Level_of_Stress                2   586.30  602.30
#> - Smoking                        1   585.22  603.22
#> <none>                               583.78  603.78
#> - Sex                            1   587.66  605.66
#> + BMI                            1   583.78  605.78
#> + salt_content_in_the_diet       1   583.78  605.78
#> - Physical_activity              1   588.50  606.50
#> - Level_of_Hemoglobin            1   612.31  630.31
#> - Adrenal_and_thyroid_disorders  1  1099.83 1117.83
#> - Chronic_kidney_disease         1  1354.49 1372.49
#> 
#> Step:  AIC=602.08
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Smoking + 
#>     Physical_activity + Level_of_Stress + Chronic_kidney_disease + 
#>     Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> - Level_of_Stress                2   586.69  600.69
#> - Smoking                        1   585.57  601.57
#> <none>                               584.08  602.08
#> + Age                            1   583.78  603.78
#> - Sex                            1   587.95  603.95
#> + BMI                            1   584.07  604.07
#> + salt_content_in_the_diet       1   584.08  604.08
#> - Physical_activity              1   588.81  604.81
#> - Level_of_Hemoglobin            1   612.31  628.31
#> - Adrenal_and_thyroid_disorders  1  1099.87 1115.87
#> - Chronic_kidney_disease         1  1354.49 1370.49
#> 
#> Step:  AIC=600.69
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Smoking + 
#>     Physical_activity + Chronic_kidney_disease + Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> - Smoking                        1   587.95  599.95
#> <none>                               586.69  600.69
#> + Level_of_Stress                2   584.08  602.08
#> + Age                            1   586.30  602.30
#> - Sex                            1   590.41  602.41
#> + BMI                            1   586.68  602.68
#> + salt_content_in_the_diet       1   586.69  602.69
#> - Physical_activity              1   591.09  603.09
#> - Level_of_Hemoglobin            1   613.48  625.48
#> - Adrenal_and_thyroid_disorders  1  1101.12 1113.12
#> - Chronic_kidney_disease         1  1357.03 1369.03
#> 
#> Step:  AIC=599.95
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Physical_activity + 
#>     Chronic_kidney_disease + Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> <none>                               587.95  599.95
#> + Smoking                        1   586.69  600.69
#> + Age                            1   587.50  601.50
#> + Level_of_Stress                2   585.57  601.57
#> - Sex                            1   591.87  601.87
#> + BMI                            1   587.93  601.93
#> + salt_content_in_the_diet       1   587.94  601.94
#> - Physical_activity              1   592.17  602.17
#> - Level_of_Hemoglobin            1   615.30  625.30
#> - Adrenal_and_thyroid_disorders  1  1103.15 1113.15
#> - Chronic_kidney_disease         1  1357.04 1367.04
summary(step)
#> 
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + 
#>     Sex + Physical_activity + Chronic_kidney_disease + Adrenal_and_thyroid_disorders, 
#>     family = binomial(link = "logit"), data = full_model$model)
#> 
#> Coefficients:
#>                                       Estimate    Std. Error z value
#> (Intercept)                       -6.491095081   0.800187793  -8.112
#> Level_of_Hemoglobin                0.347319258   0.065145381   5.331
#> SexMale                           -0.467045092   0.236393033  -1.976
#> Physical_activity                  0.000016613   0.000008149   2.039
#> Chronic_kidney_diseaseYes         22.575771733 940.745458348   0.024
#> Adrenal_and_thyroid_disordersYes  22.263364686 984.433896005   0.023
#>                                              Pr(>|z|)    
#> (Intercept)                      0.000000000000000498 ***
#> Level_of_Hemoglobin              0.000000097432321196 ***
#> SexMale                                        0.0482 *  
#> Physical_activity                              0.0415 *  
#> Chronic_kidney_diseaseYes                      0.9809    
#> Adrenal_and_thyroid_disordersYes               0.9820    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2323.50  on 1676  degrees of freedom
#> Residual deviance:  587.95  on 1671  degrees of freedom
#> AIC: 599.95
#> 
#> Number of Fisher Scoring iterations: 20

Final optimal model

model_final<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex+Level_of_Hemoglobin,data=model_data,family=binomial)
summary(model_final)
#> 
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders + 
#>     Chronic_kidney_disease + Level_of_Hemoglobin + Sex + Level_of_Hemoglobin, 
#>     family = binomial, data = model_data)
#> 
#> Coefficients:
#>                                   Estimate Std. Error z value
#> (Intercept)                       -6.31188    0.71911  -8.777
#> Adrenal_and_thyroid_disordersYes  22.29857  887.41087   0.025
#> Chronic_kidney_diseaseYes         22.63412  847.14289   0.027
#> Level_of_Hemoglobin                0.35914    0.06085   5.902
#> SexMale                           -0.41252    0.22277  -1.852
#>                                              Pr(>|z|)    
#> (Intercept)                      < 0.0000000000000002 ***
#> Adrenal_and_thyroid_disordersYes               0.9800    
#> Chronic_kidney_diseaseYes                      0.9787    
#> Level_of_Hemoglobin                      0.0000000036 ***
#> SexMale                                        0.0641 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2772.25  on 1999  degrees of freedom
#> Residual deviance:  658.41  on 1995  degrees of freedom
#> AIC: 668.41
#> 
#> Number of Fisher Scoring iterations: 20

Model diagnostics for the null model

DescTools::PseudoR2(
  model_final, 
  which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
#>   McFadden   CoxSnell Nagelkerke       Tjur 
#>  0.7624989  0.6524751  0.8700158  0.8215934
data_final<-DescTools::PseudoR2(
  model_final, 
  which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)

data_full<-DescTools::PseudoR2(
  model_final, 
  which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)

data_final<-as.data.frame(data_final)
data_full<-as.data.frame(data_full)

out_new<-cbind(data_final,data_full)
out_new
(out_new<-rownames_to_column(out_new))

Based on AIC

blood_model_information <- function(form, df) {
  model <- glm(formula = form, data = df)
  broom::glance(model)
}
# define function to run stroke model and glance at results
blood_model_pvals <- function(form, df) {
  model <- glm(formula = form, data = df)
  broom::tidy(model)
}
# create model formula column
formula <- c(
  "Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex+Level_of_Hemoglobin",
  "Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day"
)
# create dataframe
models <- data.frame(formula)
# run models and glance at results
mods_compare<-  models |>
  dplyr::group_by(formula) |>
  dplyr::summarise(blood_model_information(formula, model_data)) |> 
  dplyr::select(formula,AIC,BIC,deviance,logLik) |> 
  separate(formula,c("response","term"),sep="~") |> 
  arrange(AIC)
mods_compare
results<-mods_compare |> 
  mutate(term=ifelse(term==".-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day","data_full","data_final")) |> 
  dplyr::select(-response) |> 
  t() |> 
  as.data.frame() |> 
  rownames_to_column() |> 
  relocate(rowname,V2) |> 
  filter(rowname!="term")
colnames(results)<-c("rowname","data_final","data_full")
results
(my_results<-rbind(results,out_new) |> 
   rename(`full model` =data_full,
          `optimal model` = data_final))

Visualise

my_results |> 
  gather("key","value",-rowname) |> 
  mutate(value=as.numeric(value)) |> 
  ggplot(aes(x=key,y=value,fill=rowname)) +
  geom_col(position="dodge")+
  ggthemes::scale_fill_tableau()+
  ggthemes::theme_fivethirtyeight()+
  labs(fill="measure")