0.1 Loading Required Packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(readr)
library(skimr)
library(gt)
library(ggthemes)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

0.2 Motivation

Intradialytic hypotension (IDH) is more than simply a clinical number; it is a lived experience. After witnessing a family member suffer with IDH on several occasions, frequently following major and fast fluid removal , I was compelled to investigate the patterns. This study uses real-world dialysis session data to identify risk variables and investigate if better fluid and weight tracking might help avoid IDH.

0.3 Data Aquisition and loading

In this study, we’ll utilize the Dialysis Database (Gómez-Pulido, 2021), which includes approximately 98,000 hemodialysis sessions including patient vitals, clinical events, and machine characteristics. In addition, we use lessons from the Real-time prediction of IDH using machine learning (Zhang et al., 2023) to guide future research.

0.4 Load Data

setwd("/cloud/project/IDH DATA")
# over-wright session date if exist
df_idh <- read_csv("dataset_11.csv") |> clean_names() |>
  mutate(session_date = ymd(Sys.Date()))
## Rows: 12404 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): D_Hipotension;D_Sexo;D_Edad;D_Dializador;D_Bano;D_Tecnica;D_PesoSec...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(df_idh)
## tibble [12,404 × 2] (S3: tbl_df/tbl/data.frame)
##  $ d_hipotension_d_sexo_d_edad_d_dializador_d_bano_d_tecnica_d_peso_seco_d_peso_pre_d_peso_post_d_ganancia_d_temperatura_bano_d_volumen_reposicion_d_kt_d_flujo_sangre_d_flujo_bano_d_conductividad_bano_d_conductividad_bicarbonato_d_presion_arterial_d_presion_venosa_d_ptm_d_tas_d_tad_d_pulso_d_uf_d_temperatura: chr [1:12404] "NO;1;69;5;1;1;73.000000;75.400000;74.000000;1.400000;35.500000;24.050000;75.300000;450.000000;798.000000;14.800"| __truncated__ "NO;1;69;5;1;1;73.000000;75.400000;73.800000;1.400000;35.500000;26.530000;74.500000;450.000000;800.000000;13.900"| __truncated__ "NO;1;69;5;1;1;73.000000;76.600000;74.600000;2.800000;35.500000;25.510000;73.600000;450.000000;806.000000;14.100"| __truncated__ "NO;1;69;5;1;1;73.000000;76.000000;74.400000;1.600000;35.500000;21.790000;54.000000;350.000000;793.000000;13.900"| __truncated__ ...
##  $ session_date                                                                                                                                                                                                                                                                                                      : Date[1:12404], format: "2025-05-14" "2025-05-14" ...
glimpse(df_idh)
## Rows: 12,404
## Columns: 2
## $ d_hipotension_d_sexo_d_edad_d_dializador_d_bano_d_tecnica_d_peso_seco_d_peso_pre_d_peso_post_d_ganancia_d_temperatura_bano_d_volumen_reposicion_d_kt_d_flujo_sangre_d_flujo_bano_d_conductividad_bano_d_conductividad_bicarbonato_d_presion_arterial_d_presion_venosa_d_ptm_d_tas_d_tad_d_pulso_d_uf_d_temperatura <chr> …
## $ session_date                                                                                                                                                                                                                                                                                                       <date> …

0.5 Splitting Clinical Data into Columns

 df_idh_clean <- df_idh %>%
  separate(
    d_hipotension_d_sexo_d_edad_d_dializador_d_bano_d_tecnica_d_peso_seco_d_peso_pre_d_peso_post_d_ganancia_d_temperatura_bano_d_volumen_reposicion_d_kt_d_flujo_sangre_d_flujo_bano_d_conductividad_bano_d_conductividad_bicarbonato_d_presion_arterial_d_presion_venosa_d_ptm_d_tas_d_tad_d_pulso_d_uf_d_temperatura,
    into = c("hipotension", "sexo", "edad", "dializador", "bano", "tecnica", 
             "peso_seco", "peso_pre", "peso_post", "ganancia", "temperatura_bano",
             "volumen_reposicion", "kt", "flujo_sangre", "flujo_bano",
             "conductividad_bano", "conductividad_bicarbonato", "presion_arterial",
             "presion_venosa", "ptm", "tas", "tad", "pulso", "uf", "temperatura", "EXTRA"),
    sep = ";",
    remove = TRUE
  )

This process help on assuming that d delimiter consistently seperates all values, which can be verify with str_split_fixed()

# Convert numeric columns
df_idh_clean <- df_idh_clean %>%
  mutate(across(c(edad, peso_seco, peso_pre, peso_post, ganancia,
                  temperatura_bano, volumen_reposicion, kt,
                  flujo_sangre, flujo_bano, conductividad_bano,
                  conductividad_bicarbonato, presion_arterial,
                  presion_venosa, ptm, tas, tad, pulso, uf, temperatura),
                ~ as.numeric(.)))

No missing Value were detected during this process.

glimpse(df_idh_clean)
## Rows: 12,404
## Columns: 27
## $ hipotension               <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "N…
## $ sexo                      <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ edad                      <dbl> 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, …
## $ dializador                <chr> "5", "5", "5", "5", "5", "5", "5", "5", "5",…
## $ bano                      <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ tecnica                   <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ peso_seco                 <dbl> 73.0, 73.0, 73.0, 73.0, 73.0, 73.0, 74.0, 74…
## $ peso_pre                  <dbl> 75.4, 75.4, 76.6, 76.0, 77.4, 76.0, 76.6, 77…
## $ peso_post                 <dbl> 74.0, 73.8, 74.6, 74.4, 74.4, 73.8, 74.6, 74…
## $ ganancia                  <dbl> 1.4, 1.4, 2.8, 1.6, 3.0, 1.6, 2.8, 2.4, 2.2,…
## $ temperatura_bano          <dbl> 35.5, 35.5, 35.5, 35.5, 35.5, 35.5, 35.5, 35…
## $ volumen_reposicion        <dbl> 24.05, 26.53, 25.51, 21.79, 19.14, 24.11, 25…
## $ kt                        <dbl> 75.3, 74.5, 73.6, 54.0, 55.1, 65.0, 73.7, 61…
## $ flujo_sangre              <dbl> 450, 450, 450, 350, 300, 400, 450, 350, 400,…
## $ flujo_bano                <dbl> 798, 800, 806, 793, 365, 478, 820, 808, 797,…
## $ conductividad_bano        <dbl> 14.8, 13.9, 14.1, 13.9, 13.9, 13.9, 14.7, 14…
## $ conductividad_bicarbonato <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ presion_arterial          <dbl> -185, -200, -200, -140, -275, -245, -195, -1…
## $ presion_venosa            <dbl> 170, 190, 185, 140, 200, 170, 190, 135, 170,…
## $ ptm                       <dbl> 135, 155, 190, 165, 210, 165, 180, 135, 190,…
## $ tas                       <dbl> 93, 92, 80, 82, 92, 87, 93, 83, 84, 81, 88, …
## $ tad                       <dbl> 40, 42, 35, 37, 36, 36, 35, 34, 37, 29, 37, …
## $ pulso                     <dbl> 70, 71, 83, 70, 70, 69, 71, 69, 71, 70, 69, …
## $ uf                        <dbl> 0.18, 0.17, 0.15, 0.15, 0.21, 0.14, 0.17, 0.…
## $ temperatura               <dbl> 35.9, 36.7, 36.7, 36.8, 37.3, 36.7, 37.0, 37…
## $ EXTRA                     <chr> "", "", "", "", "", "", "", "", "", "", "", …
## $ session_date              <date> 2025-05-14, 2025-05-14, 2025-05-14, 2025-05…

0.6 Recoding Categorical Variables

# Possibly Decode categorical variables 
df_idh_clean <- df_idh_clean %>%
  mutate(
    hipotension = factor(hipotension),
    sexo = factor(sexo, labels = c("Male", "Female"))
  )

0.7 statistics Description

We’ll evaluate target variable, and numeric

table(df_idh_clean$hipotension)
## 
##   NO   SI 
## 9923 2481
df_idh_clean %>%
  group_by(hipotension) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
  kable()
hipotension edad peso_seco peso_pre peso_post ganancia temperatura_bano volumen_reposicion kt flujo_sangre flujo_bano conductividad_bano conductividad_bicarbonato presion_arterial presion_venosa ptm tas tad pulso uf temperatura
NO 65.95747 69.98322 71.77686 70.20388 1.608561 35.63635 27.04447 57.74907 381.6394 656.2002 13.98978 0.0455507 -187.3566 163.4269 149.9773 126.8261 60.43626 69.85277 0.137145 36.45786
SI 67.30996 73.21491 75.35308 73.53216 1.836477 35.60387 26.92231 55.55111 378.5324 634.8356 13.97070 0.0205562 -191.4268 169.7239 137.7852 145.9440 64.54010 71.25756 0.156836 36.41080

**In this step data were group by hipotension status (NO,SI). Each mean were computed appropriately for each numeric variable group. Moreover Na were removed using na.rm=TRUE. Lastly Kable was implemented to display a decent table.

0.8 Data Visualizations

In this step we’ll create some visual to dive into the relationship between the variables.

0.9 -1 Distrinution of Peso Seco (Dry Weight)

ggplot(df_idh_clean, aes(x = peso_seco, fill = hipotension)) +
  geom_histogram(binwidth = 0.5, position = "dodge") +
  labs(title = "Distribution of Peso Seco (Dry Weight) by Hipotension Status", 
       x = "Peso Seco (kg)", y = "Frequency") +
  theme_minimal()

0.10 Correlation Bet. Blood Flow and Conductivity

ggplot(df_idh_clean, aes(x = flujo_sangre, y = conductividad_bano, color = hipotension)) +
  geom_point() +
  labs(title = "Blood Flow vs. Conductivity by Hipotension Status", 
       x = "Blood Flow (mL/min)", y = "Conductivity (mS/cm)") +
  theme_minimal()

0.11 Hypothesis Testing

Running a t-test will help determine any significant differences in peso_seco bet. The hipotension groups (NO, SI)

# Perform a t-test on peso_seco

t_test_results <- t.test(peso_seco ~ hipotension, data = df_idh_clean)
t_test_results
## 
##  Welch Two Sample t-test
## 
## data:  peso_seco by hipotension
## t = -8.9681, df = 3500.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group NO and group SI is not equal to 0
## 95 percent confidence interval:
##  -3.938214 -2.525171
## sample estimates:
## mean in group NO mean in group SI 
##         69.98322         73.21491

0.12 Predictive Modeling

Here will build a logistic regression model , the goal is to predict hipotension in regards to the available predictors

0.13 Logistic Regression Model

logit_model <- glm(hipotension ~ sexo + edad + peso_seco + flujo_sangre + 
                     conductividad_bano + presion_arterial + presion_venosa, 
                   data = df_idh_clean, family = "binomial")

# Summary of the model
summary(logit_model)
## 
## Call:
## glm(formula = hipotension ~ sexo + edad + peso_seco + flujo_sangre + 
##     conductividad_bano + presion_arterial + presion_venosa, family = "binomial", 
##     data = df_idh_clean)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -0.9774523  1.1684173  -0.837    0.403    
## sexoFemale          0.3828428  0.0556016   6.885 5.76e-12 ***
## edad                0.0114497  0.0016693   6.859 6.93e-12 ***
## peso_seco           0.0166766  0.0015804  10.552  < 2e-16 ***
## flujo_sangre       -0.0060746  0.0006421  -9.460  < 2e-16 ***
## conductividad_bano -0.1348660  0.0826128  -1.633    0.103    
## presion_arterial   -0.0043707  0.0006970  -6.271 3.59e-10 ***
## presion_venosa      0.0054257  0.0006636   8.176 2.94e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12415  on 12403  degrees of freedom
## Residual deviance: 12111  on 12396  degrees of freedom
## AIC: 12127
## 
## Number of Fisher Scoring iterations: 4
  • We can gather that the most significant predictors showcase the p-values < 0.05.
  • The positive and negatives coeficient: negative shows a decrease were positives shows increase odds of IDH

0.14 Model Evaluation: ROC Curve & Confusion Matrix

Let’s check the confusion matrix on ROC curve, to help evaluate model performance.

# Predicted probabilities
pred_probs <- predict(logit_model, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "SI", "NO")
# Confusion matrix
conf_matrix <- table(Predicted = pred_class, Actual = df_idh_clean$hipotension)
conf_matrix
##          Actual
## Predicted   NO   SI
##        NO 9917 2476
##        SI    6    5
# ROC Curve
roc_curve <- roc(df_idh_clean$hipotension, pred_probs)
## Setting levels: control = NO, case = SI
## Setting direction: controls < cases
## Setting levels: control = NO, case = SI
## Setting direction: controls < cases
plot(roc_curve, main = paste("ROC Curve (AUC =", round(auc(roc_curve), 3), ")"))

## Futher Assessing Causation - Adjusted ratios(OR) to demonstrate the most strongly associated variables with IDH

library(broom)
logit_or <- tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)

logit_or |>
  filter(term != "(Intercept)") |>
  arrange(desc(estimate)) |>
  kable(caption = "Adjusted Odds Ratios for IDH") |>
  kable_styling(full_width = FALSE)
Adjusted Odds Ratios for IDH
term estimate std.error statistic p.value conf.low conf.high
sexoFemale 1.4664475 0.0556016 6.885470 0.0000000 1.3146544 1.6348602
peso_seco 1.0168165 0.0015804 10.552463 0.0000000 1.0136708 1.0199705
edad 1.0115155 0.0016693 6.859146 0.0000000 1.0082270 1.0148464
presion_venosa 1.0054405 0.0006636 8.175640 0.0000000 1.0041319 1.0067480
presion_arterial 0.9956389 0.0006970 -6.270881 0.0000000 0.9942823 0.9970030
flujo_sangre 0.9939439 0.0006421 -9.459724 0.0000000 0.9926937 0.9951961
conductividad_bano 0.8738330 0.0826128 -1.632508 0.1025726 0.7424805 1.0264712
if ("fainting" %in% names(df_idh_clean)) {
  df_idh_clean %>%
    count(hipotension, fainting) %>%
    kable(caption = "Frequency of Fainting by IDH Status") %>%
    kable_styling()
}
# Approximate fainting risk using blood pressure drops (e.g., TAS < 90)
df_idh_clean <- df_idh_clean %>%
  mutate(fainting_risk = ifelse(tas < 90 & pulso > 100, "High Risk", "Normal"))

# Summary of risk by IDH status
df_idh_clean %>%
  count(hipotension, fainting_risk) %>%
  kable(caption = "Estimated Fainting Risk by IDH Status") %>%
  kable_styling()
Estimated Fainting Risk by IDH Status
hipotension fainting_risk n
NO High Risk 51
NO Normal 9872
SI High Risk 3
SI Normal 2478
ggplot(df_idh_clean, aes(x = temperatura_bano, fill = hipotension)) +
  geom_histogram(position = "dodge", binwidth = 0.2) +
  labs(title = "Dialysate Temperature Distribution by IDH Status",
       x = "Temperature (°C)", y = "Count") +
  theme_minimal()

ggplot(df_idh_clean, aes(x = uf, fill = hipotension)) +
  geom_density(alpha = 0.6) +
  labs(title = "Ultrafiltration Rate and IDH Risk", x = "UF (mL/min)", y = "Density") +
  theme_minimal()

## Visual of Importance variable (Broom)

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
vip(logit_model, num_features = 10, geom = "col", aesthetics = list(fill = "skyblue"))

0.15 Conclusion

This study demonstrates that dialysis conductivity, blood flow rate, and patient dry weight are all potential predictors of intradialytic hypotension (IDH). AUC = r round(auc(roc(df_idh_clean$hipotension, pred_probs)), 2), the logistic regression model’s reasonable performance, suggests that real-time risk prediction is possible with the data at hand. Base on experience, when using proper intake for weight makes a big impact on the post dialysis pressure.

These results imply that improving fluid monitoring and machine parameter settings may lead to better patient outcomes and are consistent with the clinical modeling strategy suggested by Zhang et al. (2023).

References

Gómez-Pulido, J. A. (2021). Dialysis Dataset. Mendeley Data. https://doi.org/10.17632/wz3n4bb7vp.1

Zhang, X., Liu, M., et al. (2023). Real-time prediction of intradialytic hypotension using machine learning. Journal of Biomedical Informatics.