library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(readr)
library(skimr)
library(gt)
library(ggthemes)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Intradialytic hypotension (IDH) is more than simply a clinical number; it is a lived experience. After witnessing a family member suffer with IDH on several occasions, frequently following major and fast fluid removal , I was compelled to investigate the patterns. This study uses real-world dialysis session data to identify risk variables and investigate if better fluid and weight tracking might help avoid IDH.
In this study, we’ll utilize the Dialysis Database (Gómez-Pulido, 2021), which includes approximately 98,000 hemodialysis sessions including patient vitals, clinical events, and machine characteristics. In addition, we use lessons from the Real-time prediction of IDH using machine learning (Zhang et al., 2023) to guide future research.
setwd("/cloud/project/IDH DATA")
# over-wright session date if exist
df_idh <- read_csv("dataset_11.csv") |> clean_names() |>
mutate(session_date = ymd(Sys.Date()))
## Rows: 12404 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): D_Hipotension;D_Sexo;D_Edad;D_Dializador;D_Bano;D_Tecnica;D_PesoSec...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(df_idh)
## tibble [12,404 × 2] (S3: tbl_df/tbl/data.frame)
## $ d_hipotension_d_sexo_d_edad_d_dializador_d_bano_d_tecnica_d_peso_seco_d_peso_pre_d_peso_post_d_ganancia_d_temperatura_bano_d_volumen_reposicion_d_kt_d_flujo_sangre_d_flujo_bano_d_conductividad_bano_d_conductividad_bicarbonato_d_presion_arterial_d_presion_venosa_d_ptm_d_tas_d_tad_d_pulso_d_uf_d_temperatura: chr [1:12404] "NO;1;69;5;1;1;73.000000;75.400000;74.000000;1.400000;35.500000;24.050000;75.300000;450.000000;798.000000;14.800"| __truncated__ "NO;1;69;5;1;1;73.000000;75.400000;73.800000;1.400000;35.500000;26.530000;74.500000;450.000000;800.000000;13.900"| __truncated__ "NO;1;69;5;1;1;73.000000;76.600000;74.600000;2.800000;35.500000;25.510000;73.600000;450.000000;806.000000;14.100"| __truncated__ "NO;1;69;5;1;1;73.000000;76.000000;74.400000;1.600000;35.500000;21.790000;54.000000;350.000000;793.000000;13.900"| __truncated__ ...
## $ session_date : Date[1:12404], format: "2025-05-14" "2025-05-14" ...
glimpse(df_idh)
## Rows: 12,404
## Columns: 2
## $ d_hipotension_d_sexo_d_edad_d_dializador_d_bano_d_tecnica_d_peso_seco_d_peso_pre_d_peso_post_d_ganancia_d_temperatura_bano_d_volumen_reposicion_d_kt_d_flujo_sangre_d_flujo_bano_d_conductividad_bano_d_conductividad_bicarbonato_d_presion_arterial_d_presion_venosa_d_ptm_d_tas_d_tad_d_pulso_d_uf_d_temperatura <chr> …
## $ session_date <date> …
df_idh_clean <- df_idh %>%
separate(
d_hipotension_d_sexo_d_edad_d_dializador_d_bano_d_tecnica_d_peso_seco_d_peso_pre_d_peso_post_d_ganancia_d_temperatura_bano_d_volumen_reposicion_d_kt_d_flujo_sangre_d_flujo_bano_d_conductividad_bano_d_conductividad_bicarbonato_d_presion_arterial_d_presion_venosa_d_ptm_d_tas_d_tad_d_pulso_d_uf_d_temperatura,
into = c("hipotension", "sexo", "edad", "dializador", "bano", "tecnica",
"peso_seco", "peso_pre", "peso_post", "ganancia", "temperatura_bano",
"volumen_reposicion", "kt", "flujo_sangre", "flujo_bano",
"conductividad_bano", "conductividad_bicarbonato", "presion_arterial",
"presion_venosa", "ptm", "tas", "tad", "pulso", "uf", "temperatura", "EXTRA"),
sep = ";",
remove = TRUE
)
This process help on assuming that d delimiter consistently seperates all values, which can be verify with str_split_fixed()
# Convert numeric columns
df_idh_clean <- df_idh_clean %>%
mutate(across(c(edad, peso_seco, peso_pre, peso_post, ganancia,
temperatura_bano, volumen_reposicion, kt,
flujo_sangre, flujo_bano, conductividad_bano,
conductividad_bicarbonato, presion_arterial,
presion_venosa, ptm, tas, tad, pulso, uf, temperatura),
~ as.numeric(.)))
No missing Value were detected during this process.
glimpse(df_idh_clean)
## Rows: 12,404
## Columns: 27
## $ hipotension <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "N…
## $ sexo <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ edad <dbl> 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, …
## $ dializador <chr> "5", "5", "5", "5", "5", "5", "5", "5", "5",…
## $ bano <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ tecnica <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ peso_seco <dbl> 73.0, 73.0, 73.0, 73.0, 73.0, 73.0, 74.0, 74…
## $ peso_pre <dbl> 75.4, 75.4, 76.6, 76.0, 77.4, 76.0, 76.6, 77…
## $ peso_post <dbl> 74.0, 73.8, 74.6, 74.4, 74.4, 73.8, 74.6, 74…
## $ ganancia <dbl> 1.4, 1.4, 2.8, 1.6, 3.0, 1.6, 2.8, 2.4, 2.2,…
## $ temperatura_bano <dbl> 35.5, 35.5, 35.5, 35.5, 35.5, 35.5, 35.5, 35…
## $ volumen_reposicion <dbl> 24.05, 26.53, 25.51, 21.79, 19.14, 24.11, 25…
## $ kt <dbl> 75.3, 74.5, 73.6, 54.0, 55.1, 65.0, 73.7, 61…
## $ flujo_sangre <dbl> 450, 450, 450, 350, 300, 400, 450, 350, 400,…
## $ flujo_bano <dbl> 798, 800, 806, 793, 365, 478, 820, 808, 797,…
## $ conductividad_bano <dbl> 14.8, 13.9, 14.1, 13.9, 13.9, 13.9, 14.7, 14…
## $ conductividad_bicarbonato <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ presion_arterial <dbl> -185, -200, -200, -140, -275, -245, -195, -1…
## $ presion_venosa <dbl> 170, 190, 185, 140, 200, 170, 190, 135, 170,…
## $ ptm <dbl> 135, 155, 190, 165, 210, 165, 180, 135, 190,…
## $ tas <dbl> 93, 92, 80, 82, 92, 87, 93, 83, 84, 81, 88, …
## $ tad <dbl> 40, 42, 35, 37, 36, 36, 35, 34, 37, 29, 37, …
## $ pulso <dbl> 70, 71, 83, 70, 70, 69, 71, 69, 71, 70, 69, …
## $ uf <dbl> 0.18, 0.17, 0.15, 0.15, 0.21, 0.14, 0.17, 0.…
## $ temperatura <dbl> 35.9, 36.7, 36.7, 36.8, 37.3, 36.7, 37.0, 37…
## $ EXTRA <chr> "", "", "", "", "", "", "", "", "", "", "", …
## $ session_date <date> 2025-05-14, 2025-05-14, 2025-05-14, 2025-05…
# Possibly Decode categorical variables
df_idh_clean <- df_idh_clean %>%
mutate(
hipotension = factor(hipotension),
sexo = factor(sexo, labels = c("Male", "Female"))
)
We’ll evaluate target variable, and numeric
table(df_idh_clean$hipotension)
##
## NO SI
## 9923 2481
df_idh_clean %>%
group_by(hipotension) %>%
summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
kable()
hipotension | edad | peso_seco | peso_pre | peso_post | ganancia | temperatura_bano | volumen_reposicion | kt | flujo_sangre | flujo_bano | conductividad_bano | conductividad_bicarbonato | presion_arterial | presion_venosa | ptm | tas | tad | pulso | uf | temperatura |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
NO | 65.95747 | 69.98322 | 71.77686 | 70.20388 | 1.608561 | 35.63635 | 27.04447 | 57.74907 | 381.6394 | 656.2002 | 13.98978 | 0.0455507 | -187.3566 | 163.4269 | 149.9773 | 126.8261 | 60.43626 | 69.85277 | 0.137145 | 36.45786 |
SI | 67.30996 | 73.21491 | 75.35308 | 73.53216 | 1.836477 | 35.60387 | 26.92231 | 55.55111 | 378.5324 | 634.8356 | 13.97070 | 0.0205562 | -191.4268 | 169.7239 | 137.7852 | 145.9440 | 64.54010 | 71.25756 | 0.156836 | 36.41080 |
**In this step data were group by hipotension status (NO,SI). Each mean were computed appropriately for each numeric variable group. Moreover Na were removed using na.rm=TRUE. Lastly Kable was implemented to display a decent table.
In this step we’ll create some visual to dive into the relationship between the variables.
ggplot(df_idh_clean, aes(x = peso_seco, fill = hipotension)) +
geom_histogram(binwidth = 0.5, position = "dodge") +
labs(title = "Distribution of Peso Seco (Dry Weight) by Hipotension Status",
x = "Peso Seco (kg)", y = "Frequency") +
theme_minimal()
ggplot(df_idh_clean, aes(x = flujo_sangre, y = conductividad_bano, color = hipotension)) +
geom_point() +
labs(title = "Blood Flow vs. Conductivity by Hipotension Status",
x = "Blood Flow (mL/min)", y = "Conductivity (mS/cm)") +
theme_minimal()
Running a t-test will help determine any significant differences in peso_seco bet. The hipotension groups (NO, SI)
# Perform a t-test on peso_seco
t_test_results <- t.test(peso_seco ~ hipotension, data = df_idh_clean)
t_test_results
##
## Welch Two Sample t-test
##
## data: peso_seco by hipotension
## t = -8.9681, df = 3500.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group NO and group SI is not equal to 0
## 95 percent confidence interval:
## -3.938214 -2.525171
## sample estimates:
## mean in group NO mean in group SI
## 69.98322 73.21491
Here will build a logistic regression model , the goal is to predict hipotension in regards to the available predictors
logit_model <- glm(hipotension ~ sexo + edad + peso_seco + flujo_sangre +
conductividad_bano + presion_arterial + presion_venosa,
data = df_idh_clean, family = "binomial")
# Summary of the model
summary(logit_model)
##
## Call:
## glm(formula = hipotension ~ sexo + edad + peso_seco + flujo_sangre +
## conductividad_bano + presion_arterial + presion_venosa, family = "binomial",
## data = df_idh_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9774523 1.1684173 -0.837 0.403
## sexoFemale 0.3828428 0.0556016 6.885 5.76e-12 ***
## edad 0.0114497 0.0016693 6.859 6.93e-12 ***
## peso_seco 0.0166766 0.0015804 10.552 < 2e-16 ***
## flujo_sangre -0.0060746 0.0006421 -9.460 < 2e-16 ***
## conductividad_bano -0.1348660 0.0826128 -1.633 0.103
## presion_arterial -0.0043707 0.0006970 -6.271 3.59e-10 ***
## presion_venosa 0.0054257 0.0006636 8.176 2.94e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12415 on 12403 degrees of freedom
## Residual deviance: 12111 on 12396 degrees of freedom
## AIC: 12127
##
## Number of Fisher Scoring iterations: 4
Let’s check the confusion matrix on ROC curve, to help evaluate model performance.
# Predicted probabilities
pred_probs <- predict(logit_model, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "SI", "NO")
# Confusion matrix
conf_matrix <- table(Predicted = pred_class, Actual = df_idh_clean$hipotension)
conf_matrix
## Actual
## Predicted NO SI
## NO 9917 2476
## SI 6 5
# ROC Curve
roc_curve <- roc(df_idh_clean$hipotension, pred_probs)
## Setting levels: control = NO, case = SI
## Setting direction: controls < cases
## Setting levels: control = NO, case = SI
## Setting direction: controls < cases
plot(roc_curve, main = paste("ROC Curve (AUC =", round(auc(roc_curve), 3), ")"))
## Futher Assessing Causation - Adjusted ratios(OR) to demonstrate the
most strongly associated variables with IDH
library(broom)
logit_or <- tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)
logit_or |>
filter(term != "(Intercept)") |>
arrange(desc(estimate)) |>
kable(caption = "Adjusted Odds Ratios for IDH") |>
kable_styling(full_width = FALSE)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
sexoFemale | 1.4664475 | 0.0556016 | 6.885470 | 0.0000000 | 1.3146544 | 1.6348602 |
peso_seco | 1.0168165 | 0.0015804 | 10.552463 | 0.0000000 | 1.0136708 | 1.0199705 |
edad | 1.0115155 | 0.0016693 | 6.859146 | 0.0000000 | 1.0082270 | 1.0148464 |
presion_venosa | 1.0054405 | 0.0006636 | 8.175640 | 0.0000000 | 1.0041319 | 1.0067480 |
presion_arterial | 0.9956389 | 0.0006970 | -6.270881 | 0.0000000 | 0.9942823 | 0.9970030 |
flujo_sangre | 0.9939439 | 0.0006421 | -9.459724 | 0.0000000 | 0.9926937 | 0.9951961 |
conductividad_bano | 0.8738330 | 0.0826128 | -1.632508 | 0.1025726 | 0.7424805 | 1.0264712 |
if ("fainting" %in% names(df_idh_clean)) {
df_idh_clean %>%
count(hipotension, fainting) %>%
kable(caption = "Frequency of Fainting by IDH Status") %>%
kable_styling()
}
# Approximate fainting risk using blood pressure drops (e.g., TAS < 90)
df_idh_clean <- df_idh_clean %>%
mutate(fainting_risk = ifelse(tas < 90 & pulso > 100, "High Risk", "Normal"))
# Summary of risk by IDH status
df_idh_clean %>%
count(hipotension, fainting_risk) %>%
kable(caption = "Estimated Fainting Risk by IDH Status") %>%
kable_styling()
hipotension | fainting_risk | n |
---|---|---|
NO | High Risk | 51 |
NO | Normal | 9872 |
SI | High Risk | 3 |
SI | Normal | 2478 |
ggplot(df_idh_clean, aes(x = temperatura_bano, fill = hipotension)) +
geom_histogram(position = "dodge", binwidth = 0.2) +
labs(title = "Dialysate Temperature Distribution by IDH Status",
x = "Temperature (°C)", y = "Count") +
theme_minimal()
ggplot(df_idh_clean, aes(x = uf, fill = hipotension)) +
geom_density(alpha = 0.6) +
labs(title = "Ultrafiltration Rate and IDH Risk", x = "UF (mL/min)", y = "Density") +
theme_minimal()
## Visual of Importance variable (Broom)
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
vip(logit_model, num_features = 10, geom = "col", aesthetics = list(fill = "skyblue"))
This study demonstrates that dialysis conductivity, blood flow rate, and patient dry weight are all potential predictors of intradialytic hypotension (IDH). AUC = r round(auc(roc(df_idh_clean$hipotension, pred_probs)), 2), the logistic regression model’s reasonable performance, suggests that real-time risk prediction is possible with the data at hand. Base on experience, when using proper intake for weight makes a big impact on the post dialysis pressure.
These results imply that improving fluid monitoring and machine parameter settings may lead to better patient outcomes and are consistent with the clinical modeling strategy suggested by Zhang et al. (2023).
References
Gómez-Pulido, J. A. (2021). Dialysis Dataset. Mendeley Data. https://doi.org/10.17632/wz3n4bb7vp.1
Zhang, X., Liu, M., et al. (2023). Real-time prediction of intradialytic hypotension using machine learning. Journal of Biomedical Informatics.