Abnormal Blood Pressure Analysis

library(gtsummary)
library(readr) #Read Tabular Dat
library(tidyverse)
library(magrittr) #Pipe Operators
library(stringr) #Work with Strings
library(forcats) #Work with Factors
library(skimr) #Data Summary
library(patchwork) #ggplot grids
library(GGally) #Scatterplot Matrice
library(VIM)
library(naniar)
library(gt)

load the dataset

out_new<- vroom::vroom("bloodpressure.csv")

sapply(out_new[1,],class)
#>                Patient_Number    Blood_Pressure_Abnormality 
#>                     "numeric"                     "numeric" 
#>           Level_of_Hemoglobin  Genetic_Pedigree_Coefficient 
#>                     "numeric"                     "numeric" 
#>                           Age                           BMI 
#>                     "numeric"                     "numeric" 
#>                           Sex                     Pregnancy 
#>                     "numeric"                     "numeric" 
#>                       Smoking             Physical_activity 
#>                     "numeric"                     "numeric" 
#>      salt_content_in_the_diet   alcohol_consumption_per_day 
#>                     "numeric"                     "numeric" 
#>               Level_of_Stress        Chronic_kidney_disease 
#>                     "numeric"                     "numeric" 
#> Adrenal_and_thyroid_disorders 
#>                     "numeric"

colSums(is.na(out_new))
#>                Patient_Number    Blood_Pressure_Abnormality 
#>                             0                             0 
#>           Level_of_Hemoglobin  Genetic_Pedigree_Coefficient 
#>                             0                            92 
#>                           Age                           BMI 
#>                             0                             0 
#>                           Sex                     Pregnancy 
#>                             0                          1558 
#>                       Smoking             Physical_activity 
#>                             0                             0 
#>      salt_content_in_the_diet   alcohol_consumption_per_day 
#>                             0                           242 
#>               Level_of_Stress        Chronic_kidney_disease 
#>                             0                             0 
#> Adrenal_and_thyroid_disorders 
#>                             0

Data dictionery

knitr::include_graphics("data_dictionary.png")

recode the dataset

data_new<- out_new |> 
  mutate(Blood_Pressure_Abnormality= ifelse(Blood_Pressure_Abnormality==0,"Normal","Abnormal")) |> 
  mutate(Sex = ifelse(Sex==0,"Male","Female")) |> 
  mutate(Pregnancy=ifelse(Pregnancy==0,"No","Yes")) |> 
  mutate(Smoking=ifelse(Smoking==0,"No","Yes")) |>
  mutate(Chronic_kidney_disease=ifelse(Chronic_kidney_disease==0,"No","Yes")) |> 
  mutate(Adrenal_and_thyroid_disorders=ifelse(Adrenal_and_thyroid_disorders==0,"No","Yes")) |> 
  mutate(Level_of_Stress=case_when(Level_of_Stress==1~"Less",
                                   Level_of_Stress==2~"Normal",
                                   Level_of_Stress==3~"High"))

data_new %>% 
  head() %>% 
  gt() %>%
  tab_header(title = md("**First 6 rows of bloodPressure data**"))  %>%
  tab_options(container.height = 400,
              container.overflow.y = TRUE,
              heading.background.color = "#21908CFF",
              table.width = "75%",
              column_labels.background.color = "black",
              table.font.color = "black") %>%
  tab_style(style = list(cell_fill(color = "#35B779FF")),
            locations = cells_body())

Patient_Number	Blood_Pressure_Abnormality	Level_of_Hemoglobin	Genetic_Pedigree_Coefficient	Age	BMI	Sex	Pregnancy	Smoking	Physical_activity	salt_content_in_the_diet	alcohol_consumption_per_day	Level_of_Stress	Chronic_kidney_disease	Adrenal_and_thyroid_disorders
First 6 rows of bloodPressure data
1	Abnormal	11.28	0.90	34	23	Female	Yes	No	45961	48071	NA	Normal	Yes	Yes
2	Normal	9.75	0.23	54	33	Female	NA	No	26106	25333	205	High	No	No
3	Abnormal	10.79	0.91	70	49	Male	NA	No	9995	29465	67	Normal	Yes	No
4	Normal	11.00	0.43	71	50	Male	NA	No	10635	7439	242	Less	No	No
5	Abnormal	14.17	0.83	52	19	Male	NA	No	15619	49644	397	Normal	No	No
6	Normal	11.64	0.54	23	48	Male	NA	Yes	27042	7513	NA	High	No	No


aggr(out_new, numbers = TRUE, prop = FALSE)

# get the proportion of missing data 
prop_miss(out_new)
#> [1] 0.06306667

# get the summary of missing data
miss_var_summary(out_new)

the variable pregrancy is going to be dropped due to high proportion of missing values

Explanatory data analysis

data_new %>% 
  group_by(Blood_Pressure_Abnormality, Smoking) %>% 
  summarize(count= n()) %>% 
  arrange(Blood_Pressure_Abnormality, Smoking)  %>% 
  ggplot(aes(x = Smoking, y = count, fill = Blood_Pressure_Abnormality)) +
  geom_col(position= "dodge", color="white",linewidth=0.1) +
  geom_text(aes(label = comma(count)), position=position_dodge(width=1),
            hjust = 2, vjust=.5, colour = "white", size=3.5) +
  coord_flip() +
  tvthemes::scale_fill_avatar()+
  labs(title="Difference in proportion by sex",fill = "Patients", 
       subtitle="Number of abnormalities per sex", 
       y="\nTotal Number of cases") +
  theme(plot.title = element_text(family="lato", face="bold", vjust = -.5),
        plot.subtitle = element_text(size = 9, family="lato",face="italic"),
        plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0, size= 8, family="lato",face="italic"),
        axis.title.x = element_text(size=9),
        axis.title.y = element_blank(),
        axis.text.y = element_text(family = "lato"),
        axis.line.y.left = element_line(color = "black"),
        axis.line.x.bottom = element_line(color = "gray"),
        ##axis.ticks.length = unit(0, "mm"),
        panel.background = element_rect(fill = "white"),
        panel.border = element_rect(fill = NA, color = "white"),
        panel.grid.major.x = element_line(color = "#A8BAC4", size = 0.3),
        panel.grid.minor.y = element_blank(),
        )  +
  scale_y_continuous(labels = scales::comma, expand=c(0,0)) +
  scale_x_discrete(expand=c(0,1)) +
  guides(fill = guide_legend(reverse=TRUE))

iv_rates <- data_new |>
  group_by(Blood_Pressure_Abnormality) |>
  summarize(count = n()) |> 
  mutate(prop = count/sum(count)) |>
  ungroup() 

plot<-iv_rates |>
  ggplot(aes(x=Blood_Pressure_Abnormality, y=prop, fill=Blood_Pressure_Abnormality)) + 
  geom_col(color="black",width = 0.5)+ 
  theme(legend.position="bottom") + 
  geom_label(aes(label=scales::percent(prop)), color="white") + 
  labs(
    title = "Abnormality ratio",
    y = "proportion(%)", 
    x = "",
    fill="status") + 
  scale_y_continuous(labels = scales::percent)+ 
   paletteer::scale_fill_paletteer_d("ggsci::default_nejm")+
  ggthemes::theme_fivethirtyeight() +
  theme(legend.position = 'right')
plot

Bar Plot

data_new %>% 
  count(Level_of_Stress,Blood_Pressure_Abnormality) %>% 
  ggplot(mapping = aes(x = Level_of_Stress, y = n,fill=Level_of_Stress)) +
  geom_col(show.legend = FALSE, alpha = 0.7, width = 0.5) +
  facet_wrap(~Blood_Pressure_Abnormality)+
  geom_text(aes(label = n, y = n)) +
  paletteer::scale_fill_paletteer_d("ggsci::default_nejm")

data_new |> 
  select_if(is.numeric) %>% 
  dplyr::select(-Patient_Number) |>
  gather() %>% 
  ggplot(aes(value, fill = key)) +
  geom_histogram(alpha = 0.7,bins=30) +
  facet_wrap(~key, scales = 'free') +
  labs(title = 'Count distributions for numeric variables') +
  theme(legend.position = 'none')

library(ggpubr)
library(ggridges)
a <- data_new %>% 
  add_count(Blood_Pressure_Abnormality) %>% 
  mutate(Blood_Pressure_Abnormality = fct_reorder(Blood_Pressure_Abnormality, n)) %>%
  group_by(Blood_Pressure_Abnormality) %>% 
  ggplot(aes(Genetic_Pedigree_Coefficient, Blood_Pressure_Abnormality, fill = Blood_Pressure_Abnormality)) +
  geom_density_ridges() +
  labs(title = 'Density of the GPC by Abnormality status',
       x= 'GPC',
       y= '') +
  theme(legend.position = 'none') +
  scale_fill_viridis_d(alpha = 0.8) +
  theme(panel.grid.minor = element_blank(), panel.border = element_blank())

b <- data_new %>% 
  add_count(Blood_Pressure_Abnormality) %>% 
  mutate(Blood_Pressure_Abnormality = fct_reorder(Blood_Pressure_Abnormality, n)) %>%
  group_by(Blood_Pressure_Abnormality) %>% 
  ggplot(aes(BMI, Blood_Pressure_Abnormality, fill = Blood_Pressure_Abnormality)) +
  geom_density_ridges() +
  # scale_y_discrete(expand = c(0.01, 0)) +
  # scale_x_continuous(expand = c(0.01, 0)) +
  #theme_ridges() + 
  labs(title = 'Density of the BMI by Blood pressure abnormality',
       x= 'BMI',
       y= '') +
  theme(legend.position = 'none') +
  scale_fill_viridis_d(alpha = 0.8) +
  theme(panel.grid.minor = element_blank(), panel.border = element_blank())

(a  / b)

## Picking joint bandwidth of 0.00625
## Picking joint bandwidth of 1.57

Statistical analysis

library(sjPlot)
sjt.xtab(data_new$Blood_Pressure_Abnormality,data_new$Smoking,                 # row var, column var 
         show.row.prc=TRUE,                                        # show row percents  
         show.summary=FALSE,                                       # omit summary statistics (for now) 
         title="Cross Tab Smoking x Blood pressure")

Cross Tab Smoking x Blood pressure
Blood_Pressure_Abnormality	Smoking		Total
Blood_Pressure_Abnormality	No	Yes	Total
Abnormal	478 48.4 %	509 51.6 %	987 100 %
Normal	503 49.7 %	510 50.3 %	1013 100 %
Total	981 49 %	1019 51 %	2000 100 %

library(compareGroups) 
results <- compareGroups(Blood_Pressure_Abnormality ~ Smoking, data = data_new)                   # do calculations 
readytable <- createTable(results, show.ratio=TRUE, show.p.overall = FALSE)       # produce table 
print(readytable, header.labels = c(p.ratio = "p-value"))   
#> 
#> --------Summary descriptives table by 'Blood_Pressure_Abnormality'---------
#> 
#> _________________________________________________________ 
#>           Abnormal     Normal           OR        p-value 
#>             N=987      N=1013                             
#> ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
#> Smoking:                                                  
#>     No   478 (48.4%) 503 (49.7%)       Ref.        Ref.   
#>     Yes  509 (51.6%) 510 (50.3%) 0.95 [0.80;1.13]  0.584  
#> ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Chi squared tests

data_new|> 
  dplyr::select(Blood_Pressure_Abnormality,Sex,Level_of_Stress,
         Chronic_kidney_disease,Adrenal_and_thyroid_disorders) |> 
  mutate(Chronic_kidney_disease=
            factor(Chronic_kidney_disease,levels=c("No","Yes"))) |> 
  tbl_summary(
    by = Blood_Pressure_Abnormality,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} / {N} ({p}%)"), 
    type=list(Chronic_kidney_disease~"categorical",
              Adrenal_and_thyroid_disorders~"categorical"),
    label = Level_of_Stress ~ "Level of Stress")|> 
  add_p(test = all_continuous() ~ "t.test",
        pvalue_fun = function(x) style_pvalue(x, digits = 2))|> 
   modify_header(statistic ~ "**Test Statistic**")|>
  bold_labels()|> 
  modify_fmt_fun(statistic ~ style_sigfig)

Characteristic	Abnormal, N = 987¹	Normal, N = 1,013¹	Test Statistic	p-value²
Sex			6.0	0.014
Female	517 / 987 (52%)	475 / 1,013 (47%)
Male	470 / 987 (48%)	538 / 1,013 (53%)
Level of Stress			0.34	0.84
High	347 / 987 (35%)	344 / 1,013 (34%)
Less	327 / 987 (33%)	339 / 1,013 (33%)
Normal	313 / 987 (32%)	330 / 1,013 (33%)
Chronic_kidney_disease			1,137	<0.001
No	274 / 987 (28%)	1,013 / 1,013 (100%)
Yes	713 / 987 (72%)	0 / 1,013 (0%)
Adrenal_and_thyroid_disorders			871	<0.001
No	391 / 987 (40%)	1,013 / 1,013 (100%)
Yes	596 / 987 (60%)	0 / 1,013 (0%)
¹ n / N (%)
² Pearson’s Chi-squared test

Comments

Sex ,Chronic kidney disease and Adrenal and thyroid disorders are all significantly associated with Abnormal blood pressure

#Correlation Matrix Plot
subset<- data_new |>
  dplyr::select(Age,BMI,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Blood_Pressure_Abnormality)# Bring in external file for visualisations
source('functions/visualisations.R')

# Use plot function
plot <- histoplotter(subset,Blood_Pressure_Abnormality, 
                     chart_x_axis_lbl = "Abnormality status", 
                     chart_y_axis_lbl = 'Measures',
                     boxplot_color = 'navy', 
                     boxplot_fill = '#89CFF0', 
                     box_fill_transparency = 0.2) 

# Add extras to plot
plot + 
  tvthemes::theme_theLastAirbender() +
 paletteer::scale_color_paletteer_d("ggsci::default_nejm")+
  theme(legend.position = 'top')

T test

data_new|> 
  dplyr::select(Age,BMI,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Blood_Pressure_Abnormality) |> 
  tbl_summary(
    by = Blood_Pressure_Abnormality,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} / {N} ({p}%)"), 
    label = BMI ~ "BMI")|> 
  add_p(test = all_continuous() ~ "t.test",
        pvalue_fun = function(x) style_pvalue(x, digits = 2))|> 
   modify_header(statistic ~ "**Test Statistic**")|>
  bold_labels()|> 
  modify_fmt_fun(statistic ~ style_sigfig)

Characteristic	Abnormal, N = 987¹	Normal, N = 1,013¹	Test Statistic	p-value²
Age	45 (17)	48 (17)	-3.0	0.003
BMI	31 (12)	30 (12)	1.8	0.072
Physical_activity	25,793 (14,168)	24,730 (13,852)	1.7	0.090
salt_content_in_the_diet	25,130 (14,339)	24,727 (14,091)	0.63	0.53
alcohol_consumption_per_day	254 (144)	248 (143)	0.87	0.38
Unknown	146	96
Level_of_Hemoglobin	12.02 (2.75)	11.41 (1.37)	6.2	<0.001
Genetic_Pedigree_Coefficient	0.49 (0.35)	0.50 (0.21)	-1.5	0.14
Unknown	30	62
¹ Mean (SD)
² Welch Two Sample t-test

Comments

there is significant difference in mean BMI and mean haemoglobin level implying that the differences in the averages is not merely an outcome of chance
lets explore more


model_data <- data_new |> 
  dplyr::select(-Patient_Number,-Pregnancy) |> 
  mutate(Blood_Pressure_Abnormality=ifelse(Blood_Pressure_Abnormality=="Normal",0,1),
         Blood_Pressure_Abnormality=as.numeric(Blood_Pressure_Abnormality))

Fitting a Binary logistic regression

Fit a full model

full_model<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
                data=model_data,family=binomial(link="logit"))
summary(full_model)
#> 
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ . - Genetic_Pedigree_Coefficient - 
#>     alcohol_consumption_per_day, family = binomial(link = "logit"), 
#>     data = model_data)
#> 
#> Coefficients:
#>                                    Estimate Std. Error z value Pr(>|z|)    
#> (Intercept)                      -7.031e+00  9.823e-01  -7.158 8.20e-13 ***
#> Level_of_Hemoglobin               3.584e-01  6.701e-02   5.349 8.85e-08 ***
#> Age                               4.031e-03  7.463e-03   0.540   0.5891    
#> BMI                               5.638e-04  9.528e-03   0.059   0.9528    
#> SexMale                          -4.655e-01  2.380e-01  -1.956   0.0504 .  
#> SmokingYes                        2.683e-01  2.248e-01   1.194   0.2326    
#> Physical_activity                 1.771e-05  8.226e-06   2.153   0.0314 *  
#> salt_content_in_the_diet         -4.453e-07  7.980e-06  -0.056   0.9555    
#> Level_of_StressLess              -1.930e-01  2.797e-01  -0.690   0.4902    
#> Level_of_StressNormal             2.420e-01  2.659e-01   0.910   0.3627    
#> Chronic_kidney_diseaseYes         2.265e+01  9.363e+02   0.024   0.9807    
#> Adrenal_and_thyroid_disordersYes  2.232e+01  9.809e+02   0.023   0.9818    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2323.50  on 1676  degrees of freedom
#> Residual deviance:  583.78  on 1665  degrees of freedom
#>   (323 observations deleted due to missingness)
#> AIC: 607.78
#> 
#> Number of Fisher Scoring iterations: 20

look at the odds

# create coefficient table with estimates, p-values and odds ratios
coefs<-summary(full_model)$coefficients

(full_coefs <- cbind(coefs[ ,c("Estimate", "Pr(>|z|)")], 
                     odds_ratio = exp(full_model$coefficients))) 
#>                                       Estimate     Pr(>|z|)   odds_ratio
#> (Intercept)                      -7.030851e+00 8.198296e-13 8.841791e-04
#> Level_of_Hemoglobin               3.584342e-01 8.849354e-08 1.431087e+00
#> Age                               4.030797e-03 5.891392e-01 1.004039e+00
#> BMI                               5.637909e-04 9.528153e-01 1.000564e+00
#> SexMale                          -4.655083e-01 5.044978e-02 6.278159e-01
#> SmokingYes                        2.682738e-01 2.326488e-01 1.307705e+00
#> Physical_activity                 1.770756e-05 3.135208e-02 1.000018e+00
#> salt_content_in_the_diet         -4.453453e-07 9.554923e-01 9.999996e-01
#> Level_of_StressLess              -1.929816e-01 4.901519e-01 8.244971e-01
#> Level_of_StressNormal             2.420288e-01 3.626761e-01 1.273831e+00
#> Chronic_kidney_diseaseYes         2.265090e+01 9.807002e-01 6.873214e+09
#> Adrenal_and_thyroid_disordersYes  2.231799e+01 9.818472e-01 4.926975e+09

Comments

as would be expected people with Chronic_kidney_disease and Adrenal and thyroid disorders have high odds of developing abnormal blood pressure
people with less stress have decreased odds of developing abnormal blood pressure as compared to their counterparts
Smoking increases the odds of developing abnormal blood pressure
Males have decreased chances of developing abnormal blood pressure as compared to women

Model diagnostics

McFadden’s \(R^2\) works by comparing the likelihood function of the fitted model with that of a random model and using this to estimate the explained variance in the outcome.
Cox and Snell’s \(R^2\) works by applying a ‘sum of squares’ analogy to the likelihood functions to align more closely with the precise methodology for calculating \(R^2\) in linear regression. However, this usually means that the maximum value is less than 1 and in certain circumstances substantially less than 1, which can be problematic and unintuitive for an \(R^2\).
Nagelkerke’s \(R^2\) resolves the issue with the upper bound for Cox and Snell by dividing Cox and Snell’s \(R^2\) by its upper bound. This restores an intuitive scale with a maximum of 1, but is considered somewhat arbitrary with limited theoretical foundation.
Tjur’s \(R^2\) is a more recent and simpler concept. It is defined as simply the absolute difference between the predicted probabilities of the positive observations and those of the negative observations.

Standard modeling functions generally do not offer the calculation of pseudo-\(R^2\) as standard, but numerous methods are available for their calculation. For example:

library(DescTools)
DescTools::PseudoR2(
  full_model, 
  which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
#>   McFadden   CoxSnell Nagelkerke       Tjur 
#>  0.7487511  0.6456256  0.8610597  0.8080701

We see that the Cox and Snell variant is notably lower than the other estimates, which is consistent with the known issues with its upper bound. However, the other estimates are reasonably aligned and suggest a strong fit.

Model Selection

fit the logistic null models

# define function to run stroke model and glance at results
blood_model_pvals <- function(form, df) {
  model <- glm(formula = form, data = df)
  broom::tidy(model)
}
# create model formula column
formula <- c(
  "Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders", 
  "Blood_Pressure_Abnormality ~ Level_of_Hemoglobin",
  "Blood_Pressure_Abnormality ~ Level_of_Stress",
  "Blood_Pressure_Abnormality ~ Age",
  "Blood_Pressure_Abnormality ~ Sex",
  "Blood_Pressure_Abnormality ~ Chronic_kidney_disease",
  "Blood_Pressure_Abnormality ~ Physical_activity",
  "Blood_Pressure_Abnormality ~ BMI",
  "Blood_Pressure_Abnormality ~ salt_content_in_the_diet",
  "Blood_Pressure_Abnormality ~ Smoking"
)
# create dataframe
models <- data.frame(formula)

view p-values

options(scipen=999)
# run models and glance at results
mods<-models |>
  dplyr::group_by(formula) |>
  dplyr::summarise(blood_model_pvals(formula, model_data)) |> 
  dplyr::select(formula,p.value,term) |>
  mutate(p.value=round(p.value,4)) |> 
  filter(term!="(Intercept)") |> 
  arrange(p.value)
mods

Based on P-values

lr_fitted_add <-mods |> 
  mutate(Significance = ifelse(p.value < 0.05, 
                               "Significant", "Insignificant"))|> 
  arrange(desc(p.value)) 
#Create a ggplot object to visualise significance
plot <- lr_fitted_add|> 
  ggplot(mapping = aes(x=fct_reorder(term,p.value), y=p.value, fill=Significance)) +
  geom_col() + 
  ggthemes::scale_fill_tableau() +
  theme(axis.text.x = element_text(face="bold", 
                                   color="#0070BA",
                                   size=8, 
                                   angle=60)) +
  geom_hline(yintercept = 0.05, col = "black", lty = 2) +
  labs(y="P value", 
       x="Terms",
       title="P value significance chart",
       subtitle="significant variables in the model",
       caption="Produced by Bongani Ncube")

plotly::ggplotly(plot)

explanation

the output shows p-values for the univariate models arranged from smallest to largest
Adrenal_and_thyroid_disordersYes and Chronic_kidney_diseaseYes has the least p-value followed by level of haemoglobin

Based on AIC

blood_model_information <- function(form, df) {
  model <- glm(formula = form, data = df)
  broom::glance(model)
}

# run models and glance at results
mods1<-  models |>
  dplyr::group_by(formula) |>
  dplyr::summarise(blood_model_information(formula, model_data)) |> 
  dplyr::select(formula,AIC,BIC,deviance,logLik) |> 
  separate(formula,c("response","term"),sep="~") |> 
  arrange(AIC)
mods1

Based on Akaike Information Criterion

lr_fitted_add <-mods1
#Create a ggplot object to visualise significance
plot <- lr_fitted_add|> 
  ggplot(mapping = aes(x=fct_reorder(term,AIC), y=AIC)) +
  geom_col() + 
  ggthemes::scale_fill_canva() +
  theme(axis.text.x = element_text(face="bold", 
                                   color="#0070BA",
                                   size=8, 
                                   angle=40)) +
  geom_hline(yintercept = 0.05, col = "black", lty = 2) +
  labs(y="Akaike Information Criteria", 
       x="term",
       title="AIC chart",
       subtitle="significant variables in the model",
       caption="Produced by Bongani Ncube")

plotly::ggplotly(plot)

Comments

variables with the least p.value and AIC are more likely to be included first in model building

Optimal model building

add chronic kidney disease

model0<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders,data=model_data,family=binomial)
model1<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease,data=model_data,family=binomial)

anova(model0,model1,test="LRT")

chronic kidney disease should be included because the model performs better than without it

add haemoglobin level

model2<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin,data=model_data,family=binomial)

anova(model1,model2,test="LRT")

level of haemoglobin is a significant variable and should be included

add sex

model3<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex,data=model_data,family=binomial)

anova(model2,model3,test="LRT")

Pr(>Chi) is greater than 0.05 so we drop sex

lets try age

model4<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Age,data=model_data,family=binomial)

anova(model2,model4,test="LRT")

age is not signifinant so we drop it

try Physical_activity

model5<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Physical_activity,data=model_data,family=binomial)

anova(model2,model5,test="LRT")

this would go on and on but we can use stepwise algorithms to make it fast

library(MASS)
full_model<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
                data=model_data,family=binomial(link="logit"))

full_model1<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
                data=full_model$model,family=binomial(link="logit"))

step<-stepAIC(full_model1, direction = "both")
#> Start:  AIC=607.78
#> Blood_Pressure_Abnormality ~ (Level_of_Hemoglobin + Genetic_Pedigree_Coefficient + 
#>     Age + BMI + Sex + Smoking + Physical_activity + salt_content_in_the_diet + 
#>     alcohol_consumption_per_day + Level_of_Stress + Chronic_kidney_disease + 
#>     Adrenal_and_thyroid_disorders) - Genetic_Pedigree_Coefficient - 
#>     alcohol_consumption_per_day
#> 
#>                                 Df Deviance     AIC
#> - salt_content_in_the_diet       1   583.78  605.78
#> - BMI                            1   583.78  605.78
#> - Age                            1   584.07  606.07
#> - Level_of_Stress                2   586.29  606.29
#> - Smoking                        1   585.21  607.21
#> <none>                               583.78  607.78
#> - Sex                            1   587.62  609.62
#> - Physical_activity              1   588.48  610.48
#> - Level_of_Hemoglobin            1   611.85  633.85
#> - Adrenal_and_thyroid_disorders  1  1097.92 1119.92
#> - Chronic_kidney_disease         1  1354.30 1376.30
#> 
#> Step:  AIC=605.78
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Age + BMI + 
#>     Sex + Smoking + Physical_activity + Level_of_Stress + Chronic_kidney_disease + 
#>     Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> - BMI                            1   583.78  603.78
#> - Age                            1   584.07  604.07
#> - Level_of_Stress                2   586.29  604.29
#> - Smoking                        1   585.22  605.22
#> <none>                               583.78  605.78
#> - Sex                            1   587.62  607.62
#> + salt_content_in_the_diet       1   583.78  607.78
#> - Physical_activity              1   588.50  608.50
#> - Level_of_Hemoglobin            1   611.85  631.85
#> - Adrenal_and_thyroid_disorders  1  1099.79 1119.79
#> - Chronic_kidney_disease         1  1354.36 1374.36
#> 
#> Step:  AIC=603.78
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Age + Sex + 
#>     Smoking + Physical_activity + Level_of_Stress + Chronic_kidney_disease + 
#>     Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> - Age                            1   584.08  602.08
#> - Level_of_Stress                2   586.30  602.30
#> - Smoking                        1   585.22  603.22
#> <none>                               583.78  603.78
#> - Sex                            1   587.66  605.66
#> + BMI                            1   583.78  605.78
#> + salt_content_in_the_diet       1   583.78  605.78
#> - Physical_activity              1   588.50  606.50
#> - Level_of_Hemoglobin            1   612.31  630.31
#> - Adrenal_and_thyroid_disorders  1  1099.83 1117.83
#> - Chronic_kidney_disease         1  1354.49 1372.49
#> 
#> Step:  AIC=602.08
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Smoking + 
#>     Physical_activity + Level_of_Stress + Chronic_kidney_disease + 
#>     Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> - Level_of_Stress                2   586.69  600.69
#> - Smoking                        1   585.57  601.57
#> <none>                               584.08  602.08
#> + Age                            1   583.78  603.78
#> - Sex                            1   587.95  603.95
#> + BMI                            1   584.07  604.07
#> + salt_content_in_the_diet       1   584.08  604.08
#> - Physical_activity              1   588.81  604.81
#> - Level_of_Hemoglobin            1   612.31  628.31
#> - Adrenal_and_thyroid_disorders  1  1099.87 1115.87
#> - Chronic_kidney_disease         1  1354.49 1370.49
#> 
#> Step:  AIC=600.69
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Smoking + 
#>     Physical_activity + Chronic_kidney_disease + Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> - Smoking                        1   587.95  599.95
#> <none>                               586.69  600.69
#> + Level_of_Stress                2   584.08  602.08
#> + Age                            1   586.30  602.30
#> - Sex                            1   590.41  602.41
#> + BMI                            1   586.68  602.68
#> + salt_content_in_the_diet       1   586.69  602.69
#> - Physical_activity              1   591.09  603.09
#> - Level_of_Hemoglobin            1   613.48  625.48
#> - Adrenal_and_thyroid_disorders  1  1101.12 1113.12
#> - Chronic_kidney_disease         1  1357.03 1369.03
#> 
#> Step:  AIC=599.95
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Physical_activity + 
#>     Chronic_kidney_disease + Adrenal_and_thyroid_disorders
#> 
#>                                 Df Deviance     AIC
#> <none>                               587.95  599.95
#> + Smoking                        1   586.69  600.69
#> + Age                            1   587.50  601.50
#> + Level_of_Stress                2   585.57  601.57
#> - Sex                            1   591.87  601.87
#> + BMI                            1   587.93  601.93
#> + salt_content_in_the_diet       1   587.94  601.94
#> - Physical_activity              1   592.17  602.17
#> - Level_of_Hemoglobin            1   615.30  625.30
#> - Adrenal_and_thyroid_disorders  1  1103.15 1113.15
#> - Chronic_kidney_disease         1  1357.04 1367.04
summary(step)
#> 
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + 
#>     Sex + Physical_activity + Chronic_kidney_disease + Adrenal_and_thyroid_disorders, 
#>     family = binomial(link = "logit"), data = full_model$model)
#> 
#> Coefficients:
#>                                       Estimate    Std. Error z value
#> (Intercept)                       -6.491095081   0.800187793  -8.112
#> Level_of_Hemoglobin                0.347319258   0.065145381   5.331
#> SexMale                           -0.467045092   0.236393033  -1.976
#> Physical_activity                  0.000016613   0.000008149   2.039
#> Chronic_kidney_diseaseYes         22.575771733 940.745458348   0.024
#> Adrenal_and_thyroid_disordersYes  22.263364686 984.433896005   0.023
#>                                              Pr(>|z|)    
#> (Intercept)                      0.000000000000000498 ***
#> Level_of_Hemoglobin              0.000000097432321196 ***
#> SexMale                                        0.0482 *  
#> Physical_activity                              0.0415 *  
#> Chronic_kidney_diseaseYes                      0.9809    
#> Adrenal_and_thyroid_disordersYes               0.9820    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2323.50  on 1676  degrees of freedom
#> Residual deviance:  587.95  on 1671  degrees of freedom
#> AIC: 599.95
#> 
#> Number of Fisher Scoring iterations: 20

Final optimal model

model_final<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex+Level_of_Hemoglobin,data=model_data,family=binomial)
summary(model_final)
#> 
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders + 
#>     Chronic_kidney_disease + Level_of_Hemoglobin + Sex + Level_of_Hemoglobin, 
#>     family = binomial, data = model_data)
#> 
#> Coefficients:
#>                                   Estimate Std. Error z value
#> (Intercept)                       -6.31188    0.71911  -8.777
#> Adrenal_and_thyroid_disordersYes  22.29857  887.41087   0.025
#> Chronic_kidney_diseaseYes         22.63412  847.14289   0.027
#> Level_of_Hemoglobin                0.35914    0.06085   5.902
#> SexMale                           -0.41252    0.22277  -1.852
#>                                              Pr(>|z|)    
#> (Intercept)                      < 0.0000000000000002 ***
#> Adrenal_and_thyroid_disordersYes               0.9800    
#> Chronic_kidney_diseaseYes                      0.9787    
#> Level_of_Hemoglobin                      0.0000000036 ***
#> SexMale                                        0.0641 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2772.25  on 1999  degrees of freedom
#> Residual deviance:  658.41  on 1995  degrees of freedom
#> AIC: 668.41
#> 
#> Number of Fisher Scoring iterations: 20

Model diagnostics for the null model

DescTools::PseudoR2(
  model_final, 
  which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
#>   McFadden   CoxSnell Nagelkerke       Tjur 
#>  0.7624989  0.6524751  0.8700158  0.8215934

data_final<-DescTools::PseudoR2(
  model_final, 
  which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)

data_full<-DescTools::PseudoR2(
  model_final, 
  which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)

data_final<-as.data.frame(data_final)
data_full<-as.data.frame(data_full)

out_new<-cbind(data_final,data_full)
out_new

(out_new<-rownames_to_column(out_new))

Based on AIC

blood_model_information <- function(form, df) {
  model <- glm(formula = form, data = df)
  broom::glance(model)
}

# define function to run stroke model and glance at results
blood_model_pvals <- function(form, df) {
  model <- glm(formula = form, data = df)
  broom::tidy(model)
}
# create model formula column
formula <- c(
  "Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex+Level_of_Hemoglobin",
  "Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day"
)
# create dataframe
models <- data.frame(formula)

# run models and glance at results
mods_compare<-  models |>
  dplyr::group_by(formula) |>
  dplyr::summarise(blood_model_information(formula, model_data)) |> 
  dplyr::select(formula,AIC,BIC,deviance,logLik) |> 
  separate(formula,c("response","term"),sep="~") |> 
  arrange(AIC)
mods_compare

results<-mods_compare |> 
  mutate(term=ifelse(term==".-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day","data_full","data_final")) |> 
  dplyr::select(-response) |> 
  t() |> 
  as.data.frame() |> 
  rownames_to_column() |> 
  relocate(rowname,V2) |> 
  filter(rowname!="term")

colnames(results)<-c("rowname","data_final","data_full")
results

(my_results<-rbind(results,out_new) |> 
   rename(`full model` =data_full,
          `optimal model` = data_final))

Visualise

my_results |> 
  gather("key","value",-rowname) |> 
  mutate(value=as.numeric(value)) |> 
  ggplot(aes(x=key,y=value,fill=rowname)) +
  geom_col(position="dodge")+
  ggthemes::scale_fill_tableau()+
  ggthemes::theme_fivethirtyeight()+
  labs(fill="measure")

Abnormal Blood Pressure Analysis

Data Science in Healthcare

Bongani Ncube

load the dataset

Data dictionery

recode the dataset

Explanatory data analysis

Bar Plot

Statistical analysis

Chi squared tests

T test

Fitting a Binary logistic regression

Fit a full model

look at the odds

Model diagnostics

Model Selection

fit the logistic null models

view p-values

Based on P-values

explanation

Based on AIC

Based on Akaike Information Criterion

Optimal model building

Final optimal model

Model diagnostics for the null model

Based on AIC

Visualise