library(gtsummary)
library(readr) #Read Tabular Dat
library(tidyverse)
library(magrittr) #Pipe Operators
library(stringr) #Work with Strings
library(forcats) #Work with Factors
library(skimr) #Data Summary
library(patchwork) #ggplot grids
library(GGally) #Scatterplot Matrice
library(VIM)
library(naniar)
library(gt)out_new<- vroom::vroom("bloodpressure.csv")sapply(out_new[1,],class)
#> Patient_Number Blood_Pressure_Abnormality
#> "numeric" "numeric"
#> Level_of_Hemoglobin Genetic_Pedigree_Coefficient
#> "numeric" "numeric"
#> Age BMI
#> "numeric" "numeric"
#> Sex Pregnancy
#> "numeric" "numeric"
#> Smoking Physical_activity
#> "numeric" "numeric"
#> salt_content_in_the_diet alcohol_consumption_per_day
#> "numeric" "numeric"
#> Level_of_Stress Chronic_kidney_disease
#> "numeric" "numeric"
#> Adrenal_and_thyroid_disorders
#> "numeric"
colSums(is.na(out_new))
#> Patient_Number Blood_Pressure_Abnormality
#> 0 0
#> Level_of_Hemoglobin Genetic_Pedigree_Coefficient
#> 0 92
#> Age BMI
#> 0 0
#> Sex Pregnancy
#> 0 1558
#> Smoking Physical_activity
#> 0 0
#> salt_content_in_the_diet alcohol_consumption_per_day
#> 0 242
#> Level_of_Stress Chronic_kidney_disease
#> 0 0
#> Adrenal_and_thyroid_disorders
#> 0knitr::include_graphics("data_dictionary.png")data_new<- out_new |>
mutate(Blood_Pressure_Abnormality= ifelse(Blood_Pressure_Abnormality==0,"Normal","Abnormal")) |>
mutate(Sex = ifelse(Sex==0,"Male","Female")) |>
mutate(Pregnancy=ifelse(Pregnancy==0,"No","Yes")) |>
mutate(Smoking=ifelse(Smoking==0,"No","Yes")) |>
mutate(Chronic_kidney_disease=ifelse(Chronic_kidney_disease==0,"No","Yes")) |>
mutate(Adrenal_and_thyroid_disorders=ifelse(Adrenal_and_thyroid_disorders==0,"No","Yes")) |>
mutate(Level_of_Stress=case_when(Level_of_Stress==1~"Less",
Level_of_Stress==2~"Normal",
Level_of_Stress==3~"High"))data_new %>%
head() %>%
gt() %>%
tab_header(title = md("**First 6 rows of bloodPressure data**")) %>%
tab_options(container.height = 400,
container.overflow.y = TRUE,
heading.background.color = "#21908CFF",
table.width = "75%",
column_labels.background.color = "black",
table.font.color = "black") %>%
tab_style(style = list(cell_fill(color = "#35B779FF")),
locations = cells_body())| First 6 rows of bloodPressure data | ||||||||||||||
| Patient_Number | Blood_Pressure_Abnormality | Level_of_Hemoglobin | Genetic_Pedigree_Coefficient | Age | BMI | Sex | Pregnancy | Smoking | Physical_activity | salt_content_in_the_diet | alcohol_consumption_per_day | Level_of_Stress | Chronic_kidney_disease | Adrenal_and_thyroid_disorders |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Abnormal | 11.28 | 0.90 | 34 | 23 | Female | Yes | No | 45961 | 48071 | NA | Normal | Yes | Yes |
| 2 | Normal | 9.75 | 0.23 | 54 | 33 | Female | NA | No | 26106 | 25333 | 205 | High | No | No |
| 3 | Abnormal | 10.79 | 0.91 | 70 | 49 | Male | NA | No | 9995 | 29465 | 67 | Normal | Yes | No |
| 4 | Normal | 11.00 | 0.43 | 71 | 50 | Male | NA | No | 10635 | 7439 | 242 | Less | No | No |
| 5 | Abnormal | 14.17 | 0.83 | 52 | 19 | Male | NA | No | 15619 | 49644 | 397 | Normal | No | No |
| 6 | Normal | 11.64 | 0.54 | 23 | 48 | Male | NA | Yes | 27042 | 7513 | NA | High | No | No |
aggr(out_new, numbers = TRUE, prop = FALSE)# get the proportion of missing data
prop_miss(out_new)
#> [1] 0.06306667
# get the summary of missing data
miss_var_summary(out_new)the variable pregrancy is going to be dropped due to high proportion of missing values
data_new %>%
group_by(Blood_Pressure_Abnormality, Smoking) %>%
summarize(count= n()) %>%
arrange(Blood_Pressure_Abnormality, Smoking) %>%
ggplot(aes(x = Smoking, y = count, fill = Blood_Pressure_Abnormality)) +
geom_col(position= "dodge", color="white",linewidth=0.1) +
geom_text(aes(label = comma(count)), position=position_dodge(width=1),
hjust = 2, vjust=.5, colour = "white", size=3.5) +
coord_flip() +
tvthemes::scale_fill_avatar()+
labs(title="Difference in proportion by sex",fill = "Patients",
subtitle="Number of abnormalities per sex",
y="\nTotal Number of cases") +
theme(plot.title = element_text(family="lato", face="bold", vjust = -.5),
plot.subtitle = element_text(size = 9, family="lato",face="italic"),
plot.caption.position = "plot",
plot.caption = element_text(hjust = 0, size= 8, family="lato",face="italic"),
axis.title.x = element_text(size=9),
axis.title.y = element_blank(),
axis.text.y = element_text(family = "lato"),
axis.line.y.left = element_line(color = "black"),
axis.line.x.bottom = element_line(color = "gray"),
##axis.ticks.length = unit(0, "mm"),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(fill = NA, color = "white"),
panel.grid.major.x = element_line(color = "#A8BAC4", size = 0.3),
panel.grid.minor.y = element_blank(),
) +
scale_y_continuous(labels = scales::comma, expand=c(0,0)) +
scale_x_discrete(expand=c(0,1)) +
guides(fill = guide_legend(reverse=TRUE))iv_rates <- data_new |>
group_by(Blood_Pressure_Abnormality) |>
summarize(count = n()) |>
mutate(prop = count/sum(count)) |>
ungroup()
plot<-iv_rates |>
ggplot(aes(x=Blood_Pressure_Abnormality, y=prop, fill=Blood_Pressure_Abnormality)) +
geom_col(color="black",width = 0.5)+
theme(legend.position="bottom") +
geom_label(aes(label=scales::percent(prop)), color="white") +
labs(
title = "Abnormality ratio",
y = "proportion(%)",
x = "",
fill="status") +
scale_y_continuous(labels = scales::percent)+
paletteer::scale_fill_paletteer_d("ggsci::default_nejm")+
ggthemes::theme_fivethirtyeight() +
theme(legend.position = 'right')
plotdata_new %>%
count(Level_of_Stress,Blood_Pressure_Abnormality) %>%
ggplot(mapping = aes(x = Level_of_Stress, y = n,fill=Level_of_Stress)) +
geom_col(show.legend = FALSE, alpha = 0.7, width = 0.5) +
facet_wrap(~Blood_Pressure_Abnormality)+
geom_text(aes(label = n, y = n)) +
paletteer::scale_fill_paletteer_d("ggsci::default_nejm")data_new |>
select_if(is.numeric) %>%
dplyr::select(-Patient_Number) |>
gather() %>%
ggplot(aes(value, fill = key)) +
geom_histogram(alpha = 0.7,bins=30) +
facet_wrap(~key, scales = 'free') +
labs(title = 'Count distributions for numeric variables') +
theme(legend.position = 'none')library(ggpubr)
library(ggridges)
a <- data_new %>%
add_count(Blood_Pressure_Abnormality) %>%
mutate(Blood_Pressure_Abnormality = fct_reorder(Blood_Pressure_Abnormality, n)) %>%
group_by(Blood_Pressure_Abnormality) %>%
ggplot(aes(Genetic_Pedigree_Coefficient, Blood_Pressure_Abnormality, fill = Blood_Pressure_Abnormality)) +
geom_density_ridges() +
labs(title = 'Density of the GPC by Abnormality status',
x= 'GPC',
y= '') +
theme(legend.position = 'none') +
scale_fill_viridis_d(alpha = 0.8) +
theme(panel.grid.minor = element_blank(), panel.border = element_blank())b <- data_new %>%
add_count(Blood_Pressure_Abnormality) %>%
mutate(Blood_Pressure_Abnormality = fct_reorder(Blood_Pressure_Abnormality, n)) %>%
group_by(Blood_Pressure_Abnormality) %>%
ggplot(aes(BMI, Blood_Pressure_Abnormality, fill = Blood_Pressure_Abnormality)) +
geom_density_ridges() +
# scale_y_discrete(expand = c(0.01, 0)) +
# scale_x_continuous(expand = c(0.01, 0)) +
#theme_ridges() +
labs(title = 'Density of the BMI by Blood pressure abnormality',
x= 'BMI',
y= '') +
theme(legend.position = 'none') +
scale_fill_viridis_d(alpha = 0.8) +
theme(panel.grid.minor = element_blank(), panel.border = element_blank())
(a / b)## Picking joint bandwidth of 0.00625
## Picking joint bandwidth of 1.57library(sjPlot)
sjt.xtab(data_new$Blood_Pressure_Abnormality,data_new$Smoking, # row var, column var
show.row.prc=TRUE, # show row percents
show.summary=FALSE, # omit summary statistics (for now)
title="Cross Tab Smoking x Blood pressure") | Blood_Pressure_Abnormality | Smoking | Total | |
|---|---|---|---|
| No | Yes | ||
| Abnormal |
478 48.4Â % |
509 51.6Â % |
987 100Â % |
| Normal |
503 49.7Â % |
510 50.3Â % |
1013 100Â % |
| Total |
981 49Â % |
1019 51Â % |
2000 100Â % |
library(compareGroups)
results <- compareGroups(Blood_Pressure_Abnormality ~ Smoking, data = data_new) # do calculations
readytable <- createTable(results, show.ratio=TRUE, show.p.overall = FALSE) # produce table
print(readytable, header.labels = c(p.ratio = "p-value"))
#>
#> --------Summary descriptives table by 'Blood_Pressure_Abnormality'---------
#>
#> _________________________________________________________
#> Abnormal Normal OR p-value
#> N=987 N=1013
#> ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
#> Smoking:
#> No 478 (48.4%) 503 (49.7%) Ref. Ref.
#> Yes 509 (51.6%) 510 (50.3%) 0.95 [0.80;1.13] 0.584
#> ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯data_new|>
dplyr::select(Blood_Pressure_Abnormality,Sex,Level_of_Stress,
Chronic_kidney_disease,Adrenal_and_thyroid_disorders) |>
mutate(Chronic_kidney_disease=
factor(Chronic_kidney_disease,levels=c("No","Yes"))) |>
tbl_summary(
by = Blood_Pressure_Abnormality,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} / {N} ({p}%)"),
type=list(Chronic_kidney_disease~"categorical",
Adrenal_and_thyroid_disorders~"categorical"),
label = Level_of_Stress ~ "Level of Stress")|>
add_p(test = all_continuous() ~ "t.test",
pvalue_fun = function(x) style_pvalue(x, digits = 2))|>
modify_header(statistic ~ "**Test Statistic**")|>
bold_labels()|>
modify_fmt_fun(statistic ~ style_sigfig) | Characteristic | Abnormal, N = 9871 | Normal, N = 1,0131 | Test Statistic | p-value2 |
|---|---|---|---|---|
| Sex | 6.0 | 0.014 | ||
| Â Â Â Â Female | 517 / 987 (52%) | 475 / 1,013 (47%) | ||
| Â Â Â Â Male | 470 / 987 (48%) | 538 / 1,013 (53%) | ||
| Level of Stress | 0.34 | 0.84 | ||
| Â Â Â Â High | 347 / 987 (35%) | 344 / 1,013 (34%) | ||
| Â Â Â Â Less | 327 / 987 (33%) | 339 / 1,013 (33%) | ||
| Â Â Â Â Normal | 313 / 987 (32%) | 330 / 1,013 (33%) | ||
| Chronic_kidney_disease | 1,137 | <0.001 | ||
| Â Â Â Â No | 274 / 987 (28%) | 1,013 / 1,013 (100%) | ||
| Â Â Â Â Yes | 713 / 987 (72%) | 0 / 1,013 (0%) | ||
| Adrenal_and_thyroid_disorders | 871 | <0.001 | ||
| Â Â Â Â No | 391 / 987 (40%) | 1,013 / 1,013 (100%) | ||
| Â Â Â Â Yes | 596 / 987 (60%) | 0 / 1,013 (0%) | ||
| 1 n / N (%) | ||||
| 2 Pearson’s Chi-squared test | ||||
Comments
#Correlation Matrix Plot
subset<- data_new |>
dplyr::select(Age,BMI,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Blood_Pressure_Abnormality)# Bring in external file for visualisations
source('functions/visualisations.R')
# Use plot function
plot <- histoplotter(subset,Blood_Pressure_Abnormality,
chart_x_axis_lbl = "Abnormality status",
chart_y_axis_lbl = 'Measures',
boxplot_color = 'navy',
boxplot_fill = '#89CFF0',
box_fill_transparency = 0.2)
# Add extras to plot
plot +
tvthemes::theme_theLastAirbender() +
paletteer::scale_color_paletteer_d("ggsci::default_nejm")+
theme(legend.position = 'top') data_new|>
dplyr::select(Age,BMI,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Blood_Pressure_Abnormality) |>
tbl_summary(
by = Blood_Pressure_Abnormality,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} / {N} ({p}%)"),
label = BMI ~ "BMI")|>
add_p(test = all_continuous() ~ "t.test",
pvalue_fun = function(x) style_pvalue(x, digits = 2))|>
modify_header(statistic ~ "**Test Statistic**")|>
bold_labels()|>
modify_fmt_fun(statistic ~ style_sigfig) | Characteristic | Abnormal, N = 9871 | Normal, N = 1,0131 | Test Statistic | p-value2 |
|---|---|---|---|---|
| Age | 45 (17) | 48 (17) | -3.0 | 0.003 |
| BMI | 31 (12) | 30 (12) | 1.8 | 0.072 |
| Physical_activity | 25,793 (14,168) | 24,730 (13,852) | 1.7 | 0.090 |
| salt_content_in_the_diet | 25,130 (14,339) | 24,727 (14,091) | 0.63 | 0.53 |
| alcohol_consumption_per_day | 254 (144) | 248 (143) | 0.87 | 0.38 |
| Â Â Â Â Unknown | 146 | 96 | ||
| Level_of_Hemoglobin | 12.02 (2.75) | 11.41 (1.37) | 6.2 | <0.001 |
| Genetic_Pedigree_Coefficient | 0.49 (0.35) | 0.50 (0.21) | -1.5 | 0.14 |
| Â Â Â Â Unknown | 30 | 62 | ||
| 1 Mean (SD) | ||||
| 2 Welch Two Sample t-test | ||||
Comments
model_data <- data_new |>
dplyr::select(-Patient_Number,-Pregnancy) |>
mutate(Blood_Pressure_Abnormality=ifelse(Blood_Pressure_Abnormality=="Normal",0,1),
Blood_Pressure_Abnormality=as.numeric(Blood_Pressure_Abnormality))full_model<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
data=model_data,family=binomial(link="logit"))
summary(full_model)
#>
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ . - Genetic_Pedigree_Coefficient -
#> alcohol_consumption_per_day, family = binomial(link = "logit"),
#> data = model_data)
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -7.031e+00 9.823e-01 -7.158 8.20e-13 ***
#> Level_of_Hemoglobin 3.584e-01 6.701e-02 5.349 8.85e-08 ***
#> Age 4.031e-03 7.463e-03 0.540 0.5891
#> BMI 5.638e-04 9.528e-03 0.059 0.9528
#> SexMale -4.655e-01 2.380e-01 -1.956 0.0504 .
#> SmokingYes 2.683e-01 2.248e-01 1.194 0.2326
#> Physical_activity 1.771e-05 8.226e-06 2.153 0.0314 *
#> salt_content_in_the_diet -4.453e-07 7.980e-06 -0.056 0.9555
#> Level_of_StressLess -1.930e-01 2.797e-01 -0.690 0.4902
#> Level_of_StressNormal 2.420e-01 2.659e-01 0.910 0.3627
#> Chronic_kidney_diseaseYes 2.265e+01 9.363e+02 0.024 0.9807
#> Adrenal_and_thyroid_disordersYes 2.232e+01 9.809e+02 0.023 0.9818
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2323.50 on 1676 degrees of freedom
#> Residual deviance: 583.78 on 1665 degrees of freedom
#> (323 observations deleted due to missingness)
#> AIC: 607.78
#>
#> Number of Fisher Scoring iterations: 20# create coefficient table with estimates, p-values and odds ratios
coefs<-summary(full_model)$coefficients
(full_coefs <- cbind(coefs[ ,c("Estimate", "Pr(>|z|)")],
odds_ratio = exp(full_model$coefficients)))
#> Estimate Pr(>|z|) odds_ratio
#> (Intercept) -7.030851e+00 8.198296e-13 8.841791e-04
#> Level_of_Hemoglobin 3.584342e-01 8.849354e-08 1.431087e+00
#> Age 4.030797e-03 5.891392e-01 1.004039e+00
#> BMI 5.637909e-04 9.528153e-01 1.000564e+00
#> SexMale -4.655083e-01 5.044978e-02 6.278159e-01
#> SmokingYes 2.682738e-01 2.326488e-01 1.307705e+00
#> Physical_activity 1.770756e-05 3.135208e-02 1.000018e+00
#> salt_content_in_the_diet -4.453453e-07 9.554923e-01 9.999996e-01
#> Level_of_StressLess -1.929816e-01 4.901519e-01 8.244971e-01
#> Level_of_StressNormal 2.420288e-01 3.626761e-01 1.273831e+00
#> Chronic_kidney_diseaseYes 2.265090e+01 9.807002e-01 6.873214e+09
#> Adrenal_and_thyroid_disordersYes 2.231799e+01 9.818472e-01 4.926975e+09Comments
Standard modeling functions generally do not offer the calculation of pseudo-\(R^2\) as standard, but numerous methods are available for their calculation. For example:
library(DescTools)
DescTools::PseudoR2(
full_model,
which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
#> McFadden CoxSnell Nagelkerke Tjur
#> 0.7487511 0.6456256 0.8610597 0.8080701We see that the Cox and Snell variant is notably lower than the other estimates, which is consistent with the known issues with its upper bound. However, the other estimates are reasonably aligned and suggest a strong fit.
# define function to run stroke model and glance at results
blood_model_pvals <- function(form, df) {
model <- glm(formula = form, data = df)
broom::tidy(model)
}
# create model formula column
formula <- c(
"Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders",
"Blood_Pressure_Abnormality ~ Level_of_Hemoglobin",
"Blood_Pressure_Abnormality ~ Level_of_Stress",
"Blood_Pressure_Abnormality ~ Age",
"Blood_Pressure_Abnormality ~ Sex",
"Blood_Pressure_Abnormality ~ Chronic_kidney_disease",
"Blood_Pressure_Abnormality ~ Physical_activity",
"Blood_Pressure_Abnormality ~ BMI",
"Blood_Pressure_Abnormality ~ salt_content_in_the_diet",
"Blood_Pressure_Abnormality ~ Smoking"
)
# create dataframe
models <- data.frame(formula)options(scipen=999)
# run models and glance at results
mods<-models |>
dplyr::group_by(formula) |>
dplyr::summarise(blood_model_pvals(formula, model_data)) |>
dplyr::select(formula,p.value,term) |>
mutate(p.value=round(p.value,4)) |>
filter(term!="(Intercept)") |>
arrange(p.value)
modslr_fitted_add <-mods |>
mutate(Significance = ifelse(p.value < 0.05,
"Significant", "Insignificant"))|>
arrange(desc(p.value))
#Create a ggplot object to visualise significance
plot <- lr_fitted_add|>
ggplot(mapping = aes(x=fct_reorder(term,p.value), y=p.value, fill=Significance)) +
geom_col() +
ggthemes::scale_fill_tableau() +
theme(axis.text.x = element_text(face="bold",
color="#0070BA",
size=8,
angle=60)) +
geom_hline(yintercept = 0.05, col = "black", lty = 2) +
labs(y="P value",
x="Terms",
title="P value significance chart",
subtitle="significant variables in the model",
caption="Produced by Bongani Ncube")
plotly::ggplotly(plot) blood_model_information <- function(form, df) {
model <- glm(formula = form, data = df)
broom::glance(model)
}# run models and glance at results
mods1<- models |>
dplyr::group_by(formula) |>
dplyr::summarise(blood_model_information(formula, model_data)) |>
dplyr::select(formula,AIC,BIC,deviance,logLik) |>
separate(formula,c("response","term"),sep="~") |>
arrange(AIC)
mods1lr_fitted_add <-mods1
#Create a ggplot object to visualise significance
plot <- lr_fitted_add|>
ggplot(mapping = aes(x=fct_reorder(term,AIC), y=AIC)) +
geom_col() +
ggthemes::scale_fill_canva() +
theme(axis.text.x = element_text(face="bold",
color="#0070BA",
size=8,
angle=40)) +
geom_hline(yintercept = 0.05, col = "black", lty = 2) +
labs(y="Akaike Information Criteria",
x="term",
title="AIC chart",
subtitle="significant variables in the model",
caption="Produced by Bongani Ncube")
plotly::ggplotly(plot) Comments
add chronic kidney disease
model0<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders,data=model_data,family=binomial)
model1<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease,data=model_data,family=binomial)anova(model0,model1,test="LRT")add haemoglobin level
model2<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin,data=model_data,family=binomial)anova(model1,model2,test="LRT")level of haemoglobin is a significant variable and
should be includedadd sex
model3<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex,data=model_data,family=binomial)anova(model2,model3,test="LRT")Pr(>Chi) is greater than 0.05 so we
drop sexlets try age
model4<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Age,data=model_data,family=binomial)anova(model2,model4,test="LRT")try Physical_activity
model5<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Physical_activity,data=model_data,family=binomial)anova(model2,model5,test="LRT")this would go on and on but we can use
stepwisealgorithms to make it fast
library(MASS)
full_model<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
data=model_data,family=binomial(link="logit"))
full_model1<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
data=full_model$model,family=binomial(link="logit"))
step<-stepAIC(full_model1, direction = "both")
#> Start: AIC=607.78
#> Blood_Pressure_Abnormality ~ (Level_of_Hemoglobin + Genetic_Pedigree_Coefficient +
#> Age + BMI + Sex + Smoking + Physical_activity + salt_content_in_the_diet +
#> alcohol_consumption_per_day + Level_of_Stress + Chronic_kidney_disease +
#> Adrenal_and_thyroid_disorders) - Genetic_Pedigree_Coefficient -
#> alcohol_consumption_per_day
#>
#> Df Deviance AIC
#> - salt_content_in_the_diet 1 583.78 605.78
#> - BMI 1 583.78 605.78
#> - Age 1 584.07 606.07
#> - Level_of_Stress 2 586.29 606.29
#> - Smoking 1 585.21 607.21
#> <none> 583.78 607.78
#> - Sex 1 587.62 609.62
#> - Physical_activity 1 588.48 610.48
#> - Level_of_Hemoglobin 1 611.85 633.85
#> - Adrenal_and_thyroid_disorders 1 1097.92 1119.92
#> - Chronic_kidney_disease 1 1354.30 1376.30
#>
#> Step: AIC=605.78
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Age + BMI +
#> Sex + Smoking + Physical_activity + Level_of_Stress + Chronic_kidney_disease +
#> Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> - BMI 1 583.78 603.78
#> - Age 1 584.07 604.07
#> - Level_of_Stress 2 586.29 604.29
#> - Smoking 1 585.22 605.22
#> <none> 583.78 605.78
#> - Sex 1 587.62 607.62
#> + salt_content_in_the_diet 1 583.78 607.78
#> - Physical_activity 1 588.50 608.50
#> - Level_of_Hemoglobin 1 611.85 631.85
#> - Adrenal_and_thyroid_disorders 1 1099.79 1119.79
#> - Chronic_kidney_disease 1 1354.36 1374.36
#>
#> Step: AIC=603.78
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Age + Sex +
#> Smoking + Physical_activity + Level_of_Stress + Chronic_kidney_disease +
#> Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> - Age 1 584.08 602.08
#> - Level_of_Stress 2 586.30 602.30
#> - Smoking 1 585.22 603.22
#> <none> 583.78 603.78
#> - Sex 1 587.66 605.66
#> + BMI 1 583.78 605.78
#> + salt_content_in_the_diet 1 583.78 605.78
#> - Physical_activity 1 588.50 606.50
#> - Level_of_Hemoglobin 1 612.31 630.31
#> - Adrenal_and_thyroid_disorders 1 1099.83 1117.83
#> - Chronic_kidney_disease 1 1354.49 1372.49
#>
#> Step: AIC=602.08
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Smoking +
#> Physical_activity + Level_of_Stress + Chronic_kidney_disease +
#> Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> - Level_of_Stress 2 586.69 600.69
#> - Smoking 1 585.57 601.57
#> <none> 584.08 602.08
#> + Age 1 583.78 603.78
#> - Sex 1 587.95 603.95
#> + BMI 1 584.07 604.07
#> + salt_content_in_the_diet 1 584.08 604.08
#> - Physical_activity 1 588.81 604.81
#> - Level_of_Hemoglobin 1 612.31 628.31
#> - Adrenal_and_thyroid_disorders 1 1099.87 1115.87
#> - Chronic_kidney_disease 1 1354.49 1370.49
#>
#> Step: AIC=600.69
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Smoking +
#> Physical_activity + Chronic_kidney_disease + Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> - Smoking 1 587.95 599.95
#> <none> 586.69 600.69
#> + Level_of_Stress 2 584.08 602.08
#> + Age 1 586.30 602.30
#> - Sex 1 590.41 602.41
#> + BMI 1 586.68 602.68
#> + salt_content_in_the_diet 1 586.69 602.69
#> - Physical_activity 1 591.09 603.09
#> - Level_of_Hemoglobin 1 613.48 625.48
#> - Adrenal_and_thyroid_disorders 1 1101.12 1113.12
#> - Chronic_kidney_disease 1 1357.03 1369.03
#>
#> Step: AIC=599.95
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Physical_activity +
#> Chronic_kidney_disease + Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> <none> 587.95 599.95
#> + Smoking 1 586.69 600.69
#> + Age 1 587.50 601.50
#> + Level_of_Stress 2 585.57 601.57
#> - Sex 1 591.87 601.87
#> + BMI 1 587.93 601.93
#> + salt_content_in_the_diet 1 587.94 601.94
#> - Physical_activity 1 592.17 602.17
#> - Level_of_Hemoglobin 1 615.30 625.30
#> - Adrenal_and_thyroid_disorders 1 1103.15 1113.15
#> - Chronic_kidney_disease 1 1357.04 1367.04
summary(step)
#>
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ Level_of_Hemoglobin +
#> Sex + Physical_activity + Chronic_kidney_disease + Adrenal_and_thyroid_disorders,
#> family = binomial(link = "logit"), data = full_model$model)
#>
#> Coefficients:
#> Estimate Std. Error z value
#> (Intercept) -6.491095081 0.800187793 -8.112
#> Level_of_Hemoglobin 0.347319258 0.065145381 5.331
#> SexMale -0.467045092 0.236393033 -1.976
#> Physical_activity 0.000016613 0.000008149 2.039
#> Chronic_kidney_diseaseYes 22.575771733 940.745458348 0.024
#> Adrenal_and_thyroid_disordersYes 22.263364686 984.433896005 0.023
#> Pr(>|z|)
#> (Intercept) 0.000000000000000498 ***
#> Level_of_Hemoglobin 0.000000097432321196 ***
#> SexMale 0.0482 *
#> Physical_activity 0.0415 *
#> Chronic_kidney_diseaseYes 0.9809
#> Adrenal_and_thyroid_disordersYes 0.9820
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2323.50 on 1676 degrees of freedom
#> Residual deviance: 587.95 on 1671 degrees of freedom
#> AIC: 599.95
#>
#> Number of Fisher Scoring iterations: 20model_final<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex+Level_of_Hemoglobin,data=model_data,family=binomial)
summary(model_final)
#>
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders +
#> Chronic_kidney_disease + Level_of_Hemoglobin + Sex + Level_of_Hemoglobin,
#> family = binomial, data = model_data)
#>
#> Coefficients:
#> Estimate Std. Error z value
#> (Intercept) -6.31188 0.71911 -8.777
#> Adrenal_and_thyroid_disordersYes 22.29857 887.41087 0.025
#> Chronic_kidney_diseaseYes 22.63412 847.14289 0.027
#> Level_of_Hemoglobin 0.35914 0.06085 5.902
#> SexMale -0.41252 0.22277 -1.852
#> Pr(>|z|)
#> (Intercept) < 0.0000000000000002 ***
#> Adrenal_and_thyroid_disordersYes 0.9800
#> Chronic_kidney_diseaseYes 0.9787
#> Level_of_Hemoglobin 0.0000000036 ***
#> SexMale 0.0641 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2772.25 on 1999 degrees of freedom
#> Residual deviance: 658.41 on 1995 degrees of freedom
#> AIC: 668.41
#>
#> Number of Fisher Scoring iterations: 20DescTools::PseudoR2(
model_final,
which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
#> McFadden CoxSnell Nagelkerke Tjur
#> 0.7624989 0.6524751 0.8700158 0.8215934data_final<-DescTools::PseudoR2(
model_final,
which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
data_full<-DescTools::PseudoR2(
model_final,
which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
data_final<-as.data.frame(data_final)
data_full<-as.data.frame(data_full)
out_new<-cbind(data_final,data_full)
out_new(out_new<-rownames_to_column(out_new))blood_model_information <- function(form, df) {
model <- glm(formula = form, data = df)
broom::glance(model)
}# define function to run stroke model and glance at results
blood_model_pvals <- function(form, df) {
model <- glm(formula = form, data = df)
broom::tidy(model)
}
# create model formula column
formula <- c(
"Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex+Level_of_Hemoglobin",
"Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day"
)
# create dataframe
models <- data.frame(formula)# run models and glance at results
mods_compare<- models |>
dplyr::group_by(formula) |>
dplyr::summarise(blood_model_information(formula, model_data)) |>
dplyr::select(formula,AIC,BIC,deviance,logLik) |>
separate(formula,c("response","term"),sep="~") |>
arrange(AIC)
mods_compareresults<-mods_compare |>
mutate(term=ifelse(term==".-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day","data_full","data_final")) |>
dplyr::select(-response) |>
t() |>
as.data.frame() |>
rownames_to_column() |>
relocate(rowname,V2) |>
filter(rowname!="term")colnames(results)<-c("rowname","data_final","data_full")
results(my_results<-rbind(results,out_new) |>
rename(`full model` =data_full,
`optimal model` = data_final))my_results |>
gather("key","value",-rowname) |>
mutate(value=as.numeric(value)) |>
ggplot(aes(x=key,y=value,fill=rowname)) +
geom_col(position="dodge")+
ggthemes::scale_fill_tableau()+
ggthemes::theme_fivethirtyeight()+
labs(fill="measure")