library(gtsummary)
library(readr) #Read Tabular Dat
library(tidyverse)
library(magrittr) #Pipe Operators
library(stringr) #Work with Strings
library(forcats) #Work with Factors
library(skimr) #Data Summary
library(patchwork) #ggplot grids
library(GGally) #Scatterplot Matrice
library(VIM)
library(naniar)
library(gt)
<- vroom::vroom("bloodpressure.csv") out_new
sapply(out_new[1,],class)
#> Patient_Number Blood_Pressure_Abnormality
#> "numeric" "numeric"
#> Level_of_Hemoglobin Genetic_Pedigree_Coefficient
#> "numeric" "numeric"
#> Age BMI
#> "numeric" "numeric"
#> Sex Pregnancy
#> "numeric" "numeric"
#> Smoking Physical_activity
#> "numeric" "numeric"
#> salt_content_in_the_diet alcohol_consumption_per_day
#> "numeric" "numeric"
#> Level_of_Stress Chronic_kidney_disease
#> "numeric" "numeric"
#> Adrenal_and_thyroid_disorders
#> "numeric"
colSums(is.na(out_new))
#> Patient_Number Blood_Pressure_Abnormality
#> 0 0
#> Level_of_Hemoglobin Genetic_Pedigree_Coefficient
#> 0 92
#> Age BMI
#> 0 0
#> Sex Pregnancy
#> 0 1558
#> Smoking Physical_activity
#> 0 0
#> salt_content_in_the_diet alcohol_consumption_per_day
#> 0 242
#> Level_of_Stress Chronic_kidney_disease
#> 0 0
#> Adrenal_and_thyroid_disorders
#> 0
::include_graphics("data_dictionary.png") knitr
<- out_new |>
data_newmutate(Blood_Pressure_Abnormality= ifelse(Blood_Pressure_Abnormality==0,"Normal","Abnormal")) |>
mutate(Sex = ifelse(Sex==0,"Male","Female")) |>
mutate(Pregnancy=ifelse(Pregnancy==0,"No","Yes")) |>
mutate(Smoking=ifelse(Smoking==0,"No","Yes")) |>
mutate(Chronic_kidney_disease=ifelse(Chronic_kidney_disease==0,"No","Yes")) |>
mutate(Adrenal_and_thyroid_disorders=ifelse(Adrenal_and_thyroid_disorders==0,"No","Yes")) |>
mutate(Level_of_Stress=case_when(Level_of_Stress==1~"Less",
==2~"Normal",
Level_of_Stress==3~"High")) Level_of_Stress
%>%
data_new head() %>%
gt() %>%
tab_header(title = md("**First 6 rows of bloodPressure data**")) %>%
tab_options(container.height = 400,
container.overflow.y = TRUE,
heading.background.color = "#21908CFF",
table.width = "75%",
column_labels.background.color = "black",
table.font.color = "black") %>%
tab_style(style = list(cell_fill(color = "#35B779FF")),
locations = cells_body())
First 6 rows of bloodPressure data | ||||||||||||||
Patient_Number | Blood_Pressure_Abnormality | Level_of_Hemoglobin | Genetic_Pedigree_Coefficient | Age | BMI | Sex | Pregnancy | Smoking | Physical_activity | salt_content_in_the_diet | alcohol_consumption_per_day | Level_of_Stress | Chronic_kidney_disease | Adrenal_and_thyroid_disorders |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Abnormal | 11.28 | 0.90 | 34 | 23 | Female | Yes | No | 45961 | 48071 | NA | Normal | Yes | Yes |
2 | Normal | 9.75 | 0.23 | 54 | 33 | Female | NA | No | 26106 | 25333 | 205 | High | No | No |
3 | Abnormal | 10.79 | 0.91 | 70 | 49 | Male | NA | No | 9995 | 29465 | 67 | Normal | Yes | No |
4 | Normal | 11.00 | 0.43 | 71 | 50 | Male | NA | No | 10635 | 7439 | 242 | Less | No | No |
5 | Abnormal | 14.17 | 0.83 | 52 | 19 | Male | NA | No | 15619 | 49644 | 397 | Normal | No | No |
6 | Normal | 11.64 | 0.54 | 23 | 48 | Male | NA | Yes | 27042 | 7513 | NA | High | No | No |
aggr(out_new, numbers = TRUE, prop = FALSE)
# get the proportion of missing data
prop_miss(out_new)
#> [1] 0.06306667
# get the summary of missing data
miss_var_summary(out_new)
the variable pregrancy is going to be dropped due to high proportion of missing values
%>%
data_new group_by(Blood_Pressure_Abnormality, Smoking) %>%
summarize(count= n()) %>%
arrange(Blood_Pressure_Abnormality, Smoking) %>%
ggplot(aes(x = Smoking, y = count, fill = Blood_Pressure_Abnormality)) +
geom_col(position= "dodge", color="white",linewidth=0.1) +
geom_text(aes(label = comma(count)), position=position_dodge(width=1),
hjust = 2, vjust=.5, colour = "white", size=3.5) +
coord_flip() +
::scale_fill_avatar()+
tvthemeslabs(title="Difference in proportion by sex",fill = "Patients",
subtitle="Number of abnormalities per sex",
y="\nTotal Number of cases") +
theme(plot.title = element_text(family="lato", face="bold", vjust = -.5),
plot.subtitle = element_text(size = 9, family="lato",face="italic"),
plot.caption.position = "plot",
plot.caption = element_text(hjust = 0, size= 8, family="lato",face="italic"),
axis.title.x = element_text(size=9),
axis.title.y = element_blank(),
axis.text.y = element_text(family = "lato"),
axis.line.y.left = element_line(color = "black"),
axis.line.x.bottom = element_line(color = "gray"),
##axis.ticks.length = unit(0, "mm"),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(fill = NA, color = "white"),
panel.grid.major.x = element_line(color = "#A8BAC4", size = 0.3),
panel.grid.minor.y = element_blank(),
+
) scale_y_continuous(labels = scales::comma, expand=c(0,0)) +
scale_x_discrete(expand=c(0,1)) +
guides(fill = guide_legend(reverse=TRUE))
<- data_new |>
iv_rates group_by(Blood_Pressure_Abnormality) |>
summarize(count = n()) |>
mutate(prop = count/sum(count)) |>
ungroup()
<-iv_rates |>
plotggplot(aes(x=Blood_Pressure_Abnormality, y=prop, fill=Blood_Pressure_Abnormality)) +
geom_col(color="black",width = 0.5)+
theme(legend.position="bottom") +
geom_label(aes(label=scales::percent(prop)), color="white") +
labs(
title = "Abnormality ratio",
y = "proportion(%)",
x = "",
fill="status") +
scale_y_continuous(labels = scales::percent)+
::scale_fill_paletteer_d("ggsci::default_nejm")+
paletteer::theme_fivethirtyeight() +
ggthemestheme(legend.position = 'right')
plot
%>%
data_new count(Level_of_Stress,Blood_Pressure_Abnormality) %>%
ggplot(mapping = aes(x = Level_of_Stress, y = n,fill=Level_of_Stress)) +
geom_col(show.legend = FALSE, alpha = 0.7, width = 0.5) +
facet_wrap(~Blood_Pressure_Abnormality)+
geom_text(aes(label = n, y = n)) +
::scale_fill_paletteer_d("ggsci::default_nejm") paletteer
|>
data_new select_if(is.numeric) %>%
::select(-Patient_Number) |>
dplyrgather() %>%
ggplot(aes(value, fill = key)) +
geom_histogram(alpha = 0.7,bins=30) +
facet_wrap(~key, scales = 'free') +
labs(title = 'Count distributions for numeric variables') +
theme(legend.position = 'none')
library(ggpubr)
library(ggridges)
<- data_new %>%
a add_count(Blood_Pressure_Abnormality) %>%
mutate(Blood_Pressure_Abnormality = fct_reorder(Blood_Pressure_Abnormality, n)) %>%
group_by(Blood_Pressure_Abnormality) %>%
ggplot(aes(Genetic_Pedigree_Coefficient, Blood_Pressure_Abnormality, fill = Blood_Pressure_Abnormality)) +
geom_density_ridges() +
labs(title = 'Density of the GPC by Abnormality status',
x= 'GPC',
y= '') +
theme(legend.position = 'none') +
scale_fill_viridis_d(alpha = 0.8) +
theme(panel.grid.minor = element_blank(), panel.border = element_blank())
<- data_new %>%
b add_count(Blood_Pressure_Abnormality) %>%
mutate(Blood_Pressure_Abnormality = fct_reorder(Blood_Pressure_Abnormality, n)) %>%
group_by(Blood_Pressure_Abnormality) %>%
ggplot(aes(BMI, Blood_Pressure_Abnormality, fill = Blood_Pressure_Abnormality)) +
geom_density_ridges() +
# scale_y_discrete(expand = c(0.01, 0)) +
# scale_x_continuous(expand = c(0.01, 0)) +
#theme_ridges() +
labs(title = 'Density of the BMI by Blood pressure abnormality',
x= 'BMI',
y= '') +
theme(legend.position = 'none') +
scale_fill_viridis_d(alpha = 0.8) +
theme(panel.grid.minor = element_blank(), panel.border = element_blank())
/ b) (a
## Picking joint bandwidth of 0.00625
## Picking joint bandwidth of 1.57
library(sjPlot)
sjt.xtab(data_new$Blood_Pressure_Abnormality,data_new$Smoking, # row var, column var
show.row.prc=TRUE, # show row percents
show.summary=FALSE, # omit summary statistics (for now)
title="Cross Tab Smoking x Blood pressure")
Blood_Pressure_Abnormality | Smoking | Total | |
---|---|---|---|
No | Yes | ||
Abnormal |
478 48.4Â % |
509 51.6Â % |
987 100Â % |
Normal |
503 49.7Â % |
510 50.3Â % |
1013 100Â % |
Total |
981 49Â % |
1019 51Â % |
2000 100Â % |
library(compareGroups)
<- compareGroups(Blood_Pressure_Abnormality ~ Smoking, data = data_new) # do calculations
results <- createTable(results, show.ratio=TRUE, show.p.overall = FALSE) # produce table
readytable print(readytable, header.labels = c(p.ratio = "p-value"))
#>
#> --------Summary descriptives table by 'Blood_Pressure_Abnormality'---------
#>
#> _________________________________________________________
#> Abnormal Normal OR p-value
#> N=987 N=1013
#> ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
#> Smoking:
#> No 478 (48.4%) 503 (49.7%) Ref. Ref.
#> Yes 509 (51.6%) 510 (50.3%) 0.95 [0.80;1.13] 0.584
#> ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
|>
data_new::select(Blood_Pressure_Abnormality,Sex,Level_of_Stress,
dplyr|>
Chronic_kidney_disease,Adrenal_and_thyroid_disorders) mutate(Chronic_kidney_disease=
factor(Chronic_kidney_disease,levels=c("No","Yes"))) |>
tbl_summary(
by = Blood_Pressure_Abnormality,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} / {N} ({p}%)"),
type=list(Chronic_kidney_disease~"categorical",
~"categorical"),
Adrenal_and_thyroid_disorderslabel = Level_of_Stress ~ "Level of Stress")|>
add_p(test = all_continuous() ~ "t.test",
pvalue_fun = function(x) style_pvalue(x, digits = 2))|>
modify_header(statistic ~ "**Test Statistic**")|>
bold_labels()|>
modify_fmt_fun(statistic ~ style_sigfig)
Characteristic | Abnormal, N = 9871 | Normal, N = 1,0131 | Test Statistic | p-value2 |
---|---|---|---|---|
Sex | 6.0 | 0.014 | ||
    Female | 517 / 987 (52%) | 475 / 1,013 (47%) | ||
    Male | 470 / 987 (48%) | 538 / 1,013 (53%) | ||
Level of Stress | 0.34 | 0.84 | ||
    High | 347 / 987 (35%) | 344 / 1,013 (34%) | ||
    Less | 327 / 987 (33%) | 339 / 1,013 (33%) | ||
    Normal | 313 / 987 (32%) | 330 / 1,013 (33%) | ||
Chronic_kidney_disease | 1,137 | <0.001 | ||
    No | 274 / 987 (28%) | 1,013 / 1,013 (100%) | ||
    Yes | 713 / 987 (72%) | 0 / 1,013 (0%) | ||
Adrenal_and_thyroid_disorders | 871 | <0.001 | ||
    No | 391 / 987 (40%) | 1,013 / 1,013 (100%) | ||
    Yes | 596 / 987 (60%) | 0 / 1,013 (0%) | ||
1 n / N (%) | ||||
2 Pearson’s Chi-squared test |
Comments
#Correlation Matrix Plot
<- data_new |>
subset::select(Age,BMI,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Blood_Pressure_Abnormality)# Bring in external file for visualisations
dplyrsource('functions/visualisations.R')
# Use plot function
<- histoplotter(subset,Blood_Pressure_Abnormality,
plot chart_x_axis_lbl = "Abnormality status",
chart_y_axis_lbl = 'Measures',
boxplot_color = 'navy',
boxplot_fill = '#89CFF0',
box_fill_transparency = 0.2)
# Add extras to plot
+
plot ::theme_theLastAirbender() +
tvthemes::scale_color_paletteer_d("ggsci::default_nejm")+
paletteertheme(legend.position = 'top')
|>
data_new::select(Age,BMI,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Blood_Pressure_Abnormality) |>
dplyrtbl_summary(
by = Blood_Pressure_Abnormality,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} / {N} ({p}%)"),
label = BMI ~ "BMI")|>
add_p(test = all_continuous() ~ "t.test",
pvalue_fun = function(x) style_pvalue(x, digits = 2))|>
modify_header(statistic ~ "**Test Statistic**")|>
bold_labels()|>
modify_fmt_fun(statistic ~ style_sigfig)
Characteristic | Abnormal, N = 9871 | Normal, N = 1,0131 | Test Statistic | p-value2 |
---|---|---|---|---|
Age | 45 (17) | 48 (17) | -3.0 | 0.003 |
BMI | 31 (12) | 30 (12) | 1.8 | 0.072 |
Physical_activity | 25,793 (14,168) | 24,730 (13,852) | 1.7 | 0.090 |
salt_content_in_the_diet | 25,130 (14,339) | 24,727 (14,091) | 0.63 | 0.53 |
alcohol_consumption_per_day | 254 (144) | 248 (143) | 0.87 | 0.38 |
    Unknown | 146 | 96 | ||
Level_of_Hemoglobin | 12.02 (2.75) | 11.41 (1.37) | 6.2 | <0.001 |
Genetic_Pedigree_Coefficient | 0.49 (0.35) | 0.50 (0.21) | -1.5 | 0.14 |
    Unknown | 30 | 62 | ||
1 Mean (SD) | ||||
2 Welch Two Sample t-test |
Comments
<- data_new |>
model_data ::select(-Patient_Number,-Pregnancy) |>
dplyrmutate(Blood_Pressure_Abnormality=ifelse(Blood_Pressure_Abnormality=="Normal",0,1),
Blood_Pressure_Abnormality=as.numeric(Blood_Pressure_Abnormality))
<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
full_modeldata=model_data,family=binomial(link="logit"))
summary(full_model)
#>
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ . - Genetic_Pedigree_Coefficient -
#> alcohol_consumption_per_day, family = binomial(link = "logit"),
#> data = model_data)
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -7.031e+00 9.823e-01 -7.158 8.20e-13 ***
#> Level_of_Hemoglobin 3.584e-01 6.701e-02 5.349 8.85e-08 ***
#> Age 4.031e-03 7.463e-03 0.540 0.5891
#> BMI 5.638e-04 9.528e-03 0.059 0.9528
#> SexMale -4.655e-01 2.380e-01 -1.956 0.0504 .
#> SmokingYes 2.683e-01 2.248e-01 1.194 0.2326
#> Physical_activity 1.771e-05 8.226e-06 2.153 0.0314 *
#> salt_content_in_the_diet -4.453e-07 7.980e-06 -0.056 0.9555
#> Level_of_StressLess -1.930e-01 2.797e-01 -0.690 0.4902
#> Level_of_StressNormal 2.420e-01 2.659e-01 0.910 0.3627
#> Chronic_kidney_diseaseYes 2.265e+01 9.363e+02 0.024 0.9807
#> Adrenal_and_thyroid_disordersYes 2.232e+01 9.809e+02 0.023 0.9818
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2323.50 on 1676 degrees of freedom
#> Residual deviance: 583.78 on 1665 degrees of freedom
#> (323 observations deleted due to missingness)
#> AIC: 607.78
#>
#> Number of Fisher Scoring iterations: 20
# create coefficient table with estimates, p-values and odds ratios
<-summary(full_model)$coefficients
coefs
<- cbind(coefs[ ,c("Estimate", "Pr(>|z|)")],
(full_coefs odds_ratio = exp(full_model$coefficients)))
#> Estimate Pr(>|z|) odds_ratio
#> (Intercept) -7.030851e+00 8.198296e-13 8.841791e-04
#> Level_of_Hemoglobin 3.584342e-01 8.849354e-08 1.431087e+00
#> Age 4.030797e-03 5.891392e-01 1.004039e+00
#> BMI 5.637909e-04 9.528153e-01 1.000564e+00
#> SexMale -4.655083e-01 5.044978e-02 6.278159e-01
#> SmokingYes 2.682738e-01 2.326488e-01 1.307705e+00
#> Physical_activity 1.770756e-05 3.135208e-02 1.000018e+00
#> salt_content_in_the_diet -4.453453e-07 9.554923e-01 9.999996e-01
#> Level_of_StressLess -1.929816e-01 4.901519e-01 8.244971e-01
#> Level_of_StressNormal 2.420288e-01 3.626761e-01 1.273831e+00
#> Chronic_kidney_diseaseYes 2.265090e+01 9.807002e-01 6.873214e+09
#> Adrenal_and_thyroid_disordersYes 2.231799e+01 9.818472e-01 4.926975e+09
Comments
Standard modeling functions generally do not offer the calculation of pseudo-\(R^2\) as standard, but numerous methods are available for their calculation. For example:
library(DescTools)
::PseudoR2(
DescTools
full_model, which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)#> McFadden CoxSnell Nagelkerke Tjur
#> 0.7487511 0.6456256 0.8610597 0.8080701
We see that the Cox and Snell variant is notably lower than the other estimates, which is consistent with the known issues with its upper bound. However, the other estimates are reasonably aligned and suggest a strong fit.
# define function to run stroke model and glance at results
<- function(form, df) {
blood_model_pvals <- glm(formula = form, data = df)
model ::tidy(model)
broom
}# create model formula column
<- c(
formula "Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders",
"Blood_Pressure_Abnormality ~ Level_of_Hemoglobin",
"Blood_Pressure_Abnormality ~ Level_of_Stress",
"Blood_Pressure_Abnormality ~ Age",
"Blood_Pressure_Abnormality ~ Sex",
"Blood_Pressure_Abnormality ~ Chronic_kidney_disease",
"Blood_Pressure_Abnormality ~ Physical_activity",
"Blood_Pressure_Abnormality ~ BMI",
"Blood_Pressure_Abnormality ~ salt_content_in_the_diet",
"Blood_Pressure_Abnormality ~ Smoking"
)# create dataframe
<- data.frame(formula) models
options(scipen=999)
# run models and glance at results
<-models |>
mods::group_by(formula) |>
dplyr::summarise(blood_model_pvals(formula, model_data)) |>
dplyr::select(formula,p.value,term) |>
dplyrmutate(p.value=round(p.value,4)) |>
filter(term!="(Intercept)") |>
arrange(p.value)
mods
<-mods |>
lr_fitted_add mutate(Significance = ifelse(p.value < 0.05,
"Significant", "Insignificant"))|>
arrange(desc(p.value))
#Create a ggplot object to visualise significance
<- lr_fitted_add|>
plot ggplot(mapping = aes(x=fct_reorder(term,p.value), y=p.value, fill=Significance)) +
geom_col() +
::scale_fill_tableau() +
ggthemestheme(axis.text.x = element_text(face="bold",
color="#0070BA",
size=8,
angle=60)) +
geom_hline(yintercept = 0.05, col = "black", lty = 2) +
labs(y="P value",
x="Terms",
title="P value significance chart",
subtitle="significant variables in the model",
caption="Produced by Bongani Ncube")
::ggplotly(plot) plotly
<- function(form, df) {
blood_model_information <- glm(formula = form, data = df)
model ::glance(model)
broom }
# run models and glance at results
<- models |>
mods1::group_by(formula) |>
dplyr::summarise(blood_model_information(formula, model_data)) |>
dplyr::select(formula,AIC,BIC,deviance,logLik) |>
dplyrseparate(formula,c("response","term"),sep="~") |>
arrange(AIC)
mods1
<-mods1
lr_fitted_add #Create a ggplot object to visualise significance
<- lr_fitted_add|>
plot ggplot(mapping = aes(x=fct_reorder(term,AIC), y=AIC)) +
geom_col() +
::scale_fill_canva() +
ggthemestheme(axis.text.x = element_text(face="bold",
color="#0070BA",
size=8,
angle=40)) +
geom_hline(yintercept = 0.05, col = "black", lty = 2) +
labs(y="Akaike Information Criteria",
x="term",
title="AIC chart",
subtitle="significant variables in the model",
caption="Produced by Bongani Ncube")
::ggplotly(plot) plotly
Comments
add chronic kidney disease
<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders,data=model_data,family=binomial)
model0<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease,data=model_data,family=binomial) model1
anova(model0,model1,test="LRT")
add haemoglobin level
<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin,data=model_data,family=binomial) model2
anova(model1,model2,test="LRT")
level of haemoglobin
is a significant variable and
should be includedadd sex
<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex,data=model_data,family=binomial) model3
anova(model2,model3,test="LRT")
Pr(>Chi)
is greater than 0.05
so we
drop sexlets try age
<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Age,data=model_data,family=binomial) model4
anova(model2,model4,test="LRT")
try Physical_activity
<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Physical_activity,data=model_data,family=binomial) model5
anova(model2,model5,test="LRT")
this would go on and on but we can use
stepwise
algorithms to make it fast
library(MASS)
<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
full_modeldata=model_data,family=binomial(link="logit"))
<-glm(Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day ,
full_model1data=full_model$model,family=binomial(link="logit"))
<-stepAIC(full_model1, direction = "both")
step#> Start: AIC=607.78
#> Blood_Pressure_Abnormality ~ (Level_of_Hemoglobin + Genetic_Pedigree_Coefficient +
#> Age + BMI + Sex + Smoking + Physical_activity + salt_content_in_the_diet +
#> alcohol_consumption_per_day + Level_of_Stress + Chronic_kidney_disease +
#> Adrenal_and_thyroid_disorders) - Genetic_Pedigree_Coefficient -
#> alcohol_consumption_per_day
#>
#> Df Deviance AIC
#> - salt_content_in_the_diet 1 583.78 605.78
#> - BMI 1 583.78 605.78
#> - Age 1 584.07 606.07
#> - Level_of_Stress 2 586.29 606.29
#> - Smoking 1 585.21 607.21
#> <none> 583.78 607.78
#> - Sex 1 587.62 609.62
#> - Physical_activity 1 588.48 610.48
#> - Level_of_Hemoglobin 1 611.85 633.85
#> - Adrenal_and_thyroid_disorders 1 1097.92 1119.92
#> - Chronic_kidney_disease 1 1354.30 1376.30
#>
#> Step: AIC=605.78
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Age + BMI +
#> Sex + Smoking + Physical_activity + Level_of_Stress + Chronic_kidney_disease +
#> Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> - BMI 1 583.78 603.78
#> - Age 1 584.07 604.07
#> - Level_of_Stress 2 586.29 604.29
#> - Smoking 1 585.22 605.22
#> <none> 583.78 605.78
#> - Sex 1 587.62 607.62
#> + salt_content_in_the_diet 1 583.78 607.78
#> - Physical_activity 1 588.50 608.50
#> - Level_of_Hemoglobin 1 611.85 631.85
#> - Adrenal_and_thyroid_disorders 1 1099.79 1119.79
#> - Chronic_kidney_disease 1 1354.36 1374.36
#>
#> Step: AIC=603.78
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Age + Sex +
#> Smoking + Physical_activity + Level_of_Stress + Chronic_kidney_disease +
#> Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> - Age 1 584.08 602.08
#> - Level_of_Stress 2 586.30 602.30
#> - Smoking 1 585.22 603.22
#> <none> 583.78 603.78
#> - Sex 1 587.66 605.66
#> + BMI 1 583.78 605.78
#> + salt_content_in_the_diet 1 583.78 605.78
#> - Physical_activity 1 588.50 606.50
#> - Level_of_Hemoglobin 1 612.31 630.31
#> - Adrenal_and_thyroid_disorders 1 1099.83 1117.83
#> - Chronic_kidney_disease 1 1354.49 1372.49
#>
#> Step: AIC=602.08
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Smoking +
#> Physical_activity + Level_of_Stress + Chronic_kidney_disease +
#> Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> - Level_of_Stress 2 586.69 600.69
#> - Smoking 1 585.57 601.57
#> <none> 584.08 602.08
#> + Age 1 583.78 603.78
#> - Sex 1 587.95 603.95
#> + BMI 1 584.07 604.07
#> + salt_content_in_the_diet 1 584.08 604.08
#> - Physical_activity 1 588.81 604.81
#> - Level_of_Hemoglobin 1 612.31 628.31
#> - Adrenal_and_thyroid_disorders 1 1099.87 1115.87
#> - Chronic_kidney_disease 1 1354.49 1370.49
#>
#> Step: AIC=600.69
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Smoking +
#> Physical_activity + Chronic_kidney_disease + Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> - Smoking 1 587.95 599.95
#> <none> 586.69 600.69
#> + Level_of_Stress 2 584.08 602.08
#> + Age 1 586.30 602.30
#> - Sex 1 590.41 602.41
#> + BMI 1 586.68 602.68
#> + salt_content_in_the_diet 1 586.69 602.69
#> - Physical_activity 1 591.09 603.09
#> - Level_of_Hemoglobin 1 613.48 625.48
#> - Adrenal_and_thyroid_disorders 1 1101.12 1113.12
#> - Chronic_kidney_disease 1 1357.03 1369.03
#>
#> Step: AIC=599.95
#> Blood_Pressure_Abnormality ~ Level_of_Hemoglobin + Sex + Physical_activity +
#> Chronic_kidney_disease + Adrenal_and_thyroid_disorders
#>
#> Df Deviance AIC
#> <none> 587.95 599.95
#> + Smoking 1 586.69 600.69
#> + Age 1 587.50 601.50
#> + Level_of_Stress 2 585.57 601.57
#> - Sex 1 591.87 601.87
#> + BMI 1 587.93 601.93
#> + salt_content_in_the_diet 1 587.94 601.94
#> - Physical_activity 1 592.17 602.17
#> - Level_of_Hemoglobin 1 615.30 625.30
#> - Adrenal_and_thyroid_disorders 1 1103.15 1113.15
#> - Chronic_kidney_disease 1 1357.04 1367.04
summary(step)
#>
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ Level_of_Hemoglobin +
#> Sex + Physical_activity + Chronic_kidney_disease + Adrenal_and_thyroid_disorders,
#> family = binomial(link = "logit"), data = full_model$model)
#>
#> Coefficients:
#> Estimate Std. Error z value
#> (Intercept) -6.491095081 0.800187793 -8.112
#> Level_of_Hemoglobin 0.347319258 0.065145381 5.331
#> SexMale -0.467045092 0.236393033 -1.976
#> Physical_activity 0.000016613 0.000008149 2.039
#> Chronic_kidney_diseaseYes 22.575771733 940.745458348 0.024
#> Adrenal_and_thyroid_disordersYes 22.263364686 984.433896005 0.023
#> Pr(>|z|)
#> (Intercept) 0.000000000000000498 ***
#> Level_of_Hemoglobin 0.000000097432321196 ***
#> SexMale 0.0482 *
#> Physical_activity 0.0415 *
#> Chronic_kidney_diseaseYes 0.9809
#> Adrenal_and_thyroid_disordersYes 0.9820
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2323.50 on 1676 degrees of freedom
#> Residual deviance: 587.95 on 1671 degrees of freedom
#> AIC: 599.95
#>
#> Number of Fisher Scoring iterations: 20
<-glm(Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex+Level_of_Hemoglobin,data=model_data,family=binomial)
model_finalsummary(model_final)
#>
#> Call:
#> glm(formula = Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders +
#> Chronic_kidney_disease + Level_of_Hemoglobin + Sex + Level_of_Hemoglobin,
#> family = binomial, data = model_data)
#>
#> Coefficients:
#> Estimate Std. Error z value
#> (Intercept) -6.31188 0.71911 -8.777
#> Adrenal_and_thyroid_disordersYes 22.29857 887.41087 0.025
#> Chronic_kidney_diseaseYes 22.63412 847.14289 0.027
#> Level_of_Hemoglobin 0.35914 0.06085 5.902
#> SexMale -0.41252 0.22277 -1.852
#> Pr(>|z|)
#> (Intercept) < 0.0000000000000002 ***
#> Adrenal_and_thyroid_disordersYes 0.9800
#> Chronic_kidney_diseaseYes 0.9787
#> Level_of_Hemoglobin 0.0000000036 ***
#> SexMale 0.0641 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 2772.25 on 1999 degrees of freedom
#> Residual deviance: 658.41 on 1995 degrees of freedom
#> AIC: 668.41
#>
#> Number of Fisher Scoring iterations: 20
::PseudoR2(
DescTools
model_final, which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)#> McFadden CoxSnell Nagelkerke Tjur
#> 0.7624989 0.6524751 0.8700158 0.8215934
<-DescTools::PseudoR2(
data_final
model_final, which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
<-DescTools::PseudoR2(
data_full
model_final, which = c("McFadden", "CoxSnell", "Nagelkerke", "Tjur")
)
<-as.data.frame(data_final)
data_final<-as.data.frame(data_full)
data_full
<-cbind(data_final,data_full)
out_new out_new
<-rownames_to_column(out_new)) (out_new
<- function(form, df) {
blood_model_information <- glm(formula = form, data = df)
model ::glance(model)
broom }
# define function to run stroke model and glance at results
<- function(form, df) {
blood_model_pvals <- glm(formula = form, data = df)
model ::tidy(model)
broom
}# create model formula column
<- c(
formula "Blood_Pressure_Abnormality ~ Adrenal_and_thyroid_disorders+Chronic_kidney_disease+Level_of_Hemoglobin+Sex+Level_of_Hemoglobin",
"Blood_Pressure_Abnormality~.-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day"
)# create dataframe
<- data.frame(formula) models
# run models and glance at results
<- models |>
mods_compare::group_by(formula) |>
dplyr::summarise(blood_model_information(formula, model_data)) |>
dplyr::select(formula,AIC,BIC,deviance,logLik) |>
dplyrseparate(formula,c("response","term"),sep="~") |>
arrange(AIC)
mods_compare
<-mods_compare |>
resultsmutate(term=ifelse(term==".-Genetic_Pedigree_Coefficient - alcohol_consumption_per_day","data_full","data_final")) |>
::select(-response) |>
dplyrt() |>
as.data.frame() |>
rownames_to_column() |>
relocate(rowname,V2) |>
filter(rowname!="term")
colnames(results)<-c("rowname","data_final","data_full")
results
<-rbind(results,out_new) |>
(my_resultsrename(`full model` =data_full,
`optimal model` = data_final))
|>
my_results gather("key","value",-rowname) |>
mutate(value=as.numeric(value)) |>
ggplot(aes(x=key,y=value,fill=rowname)) +
geom_col(position="dodge")+
::scale_fill_tableau()+
ggthemes::theme_fivethirtyeight()+
ggthemeslabs(fill="measure")