library(keras3)
library(tensorflow)
## 
## Attaching package: 'tensorflow'
## The following objects are masked from 'package:keras3':
## 
##     set_random_seed, shape
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## The following object is masked from 'package:tensorflow':
## 
##     train
library(tidyr) # already attached via tidyverse; kept for explicitness

NHANES <- readRDS('/Users/darrensummerlee/Library/CloudStorage/Dropbox/NHANES paper/data set/BP_stats_no_enhance.rds')
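# Extract the outcome (average systolic blood pressure) along with the
# functional predictor (1440-point, minute-level MIMS activity profiles)
# and the scalar covariates used in the model.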
n <- nrow(NHANES)
MIMS <- NHANES$MIMS
age <- NHANES$age
gender <- NHANES$gender
CHD <- NHANES$CHD
BMI <- NHANES$BMI
bpS <- NHANES$BPS_avg

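# Standardize the continuous predictors and one-hot encode gender and CHD.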
MIMS_scaled <- scale(MIMS)
age_scaled <- scale(age)
BMI_scaled <- scale(BMI)
gender_onehot <- model.matrix(~ gender - 1)
CHD_onehot <- model.matrix(~ CHD - 1)

scalars <- cbind(age_scaled, BMI_scaled, gender_onehot, CHD_onehot)
colnames(scalars)[1:2] <- c("Age", "BMI")
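# 80/20 train/test split (set.seed keeps the split reproducible).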
set.seed(1)

train_idx <- sample(1:n, size = floor(0.8 * n), replace = FALSE)
test_idx <- setdiff(1:n, train_idx)

scalar_train <- scalars[train_idx, ]
scalar_test <- scalars[test_idx, ]

bpS_train <- bpS[train_idx]
bpS_test <- bpS[test_idx]

MIMS_train <- MIMS_scaled[train_idx, ]
MIMS_test <- MIMS_scaled[test_idx, ]
MIMS_train <- array(MIMS_train, dim = c(nrow(MIMS_train), 1440, 1))
MIMS_test <- array(MIMS_test, dim = c(nrow(MIMS_test), 1440, 1))
set.seed(1)
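# Two-branch architecture: a GRU summarizes the 1440 x 1 MIMS curve while a
# dense layer handles the scalar covariates; the two representations are
# concatenated and mapped to a single linear output.
# Note: set.seed() seeds only R's RNG. TensorFlow's weight initialization is
# not controlled by it, so exact results can vary between runs; seeding
# TensorFlow as well (e.g. via set_random_seed()) helps make training reproducible.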
func_input <- layer_input(shape = c(1440, 1), name = "func_input")
func_branch <- func_input %>%
  layer_gru(units = 32, activation = "tanh")

scalar_input_layer <- layer_input(shape = ncol(scalar_train), name = "scalar_input")
scalar_branch <- scalar_input_layer %>%
  layer_dense(units = 32, activation = "relu")

combined <- layer_concatenate(list(func_branch, scalar_branch)) %>%
  layer_dense(units = 1)

model <- keras_model(inputs = list(func_input, scalar_input_layer), outputs = combined)

model %>% compile(
  optimizer = "adam",
  loss = "mse",
  metrics = list("mae")
)
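# Fit for up to 10 epochs (batch size 32) with a 20% validation split and
# early stopping on val_loss (patience 3, best weights restored).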
set.seed(1)
history <- model %>% fit(
  x = list(func_input = MIMS_train, scalar_input = scalar_train),
  y = bpS_train,
  epochs = 10,
  batch_size = 32,
  validation_split = 0.2,
  callbacks = list(
    callback_early_stopping(
      monitor = "val_loss",
      patience = 3, 
      restore_best_weights = TRUE
    )
  ),
  verbose = 2
)
## Epoch 1/10
## 158/158 - 32s - 201ms/step - loss: 14024.3418 - mae: 116.8675 - val_loss: 12204.8652 - val_mae: 109.0161
## Epoch 2/10
## 158/158 - 31s - 197ms/step - loss: 10877.1006 - mae: 102.5447 - val_loss: 8996.3320 - val_mae: 93.1568
## Epoch 3/10
## 158/158 - 31s - 193ms/step - loss: 7379.4907 - mae: 83.7094 - val_loss: 5449.1182 - val_mae: 71.7261
## Epoch 4/10
## 158/158 - 32s - 200ms/step - loss: 4027.9019 - mae: 60.5061 - val_loss: 2537.1602 - val_mae: 47.4276
## Epoch 5/10
## 158/158 - 34s - 213ms/step - loss: 1705.7799 - mae: 37.0831 - val_loss: 928.8557 - val_mae: 25.9777
## Epoch 6/10
## 158/158 - 34s - 218ms/step - loss: 640.4885 - mae: 20.0205 - val_loss: 376.8244 - val_mae: 14.4872
## Epoch 7/10
## 158/158 - 32s - 200ms/step - loss: 330.2272 - mae: 13.3580 - val_loss: 267.8032 - val_mae: 12.1342
## Epoch 8/10
## 158/158 - 32s - 200ms/step - loss: 275.8527 - mae: 12.3064 - val_loss: 258.1783 - val_mae: 12.0922
## Epoch 9/10
## 158/158 - 33s - 210ms/step - loss: 270.1825 - mae: 12.2882 - val_loss: 257.8130 - val_mae: 12.1358
## Epoch 10/10
## 158/158 - 32s - 202ms/step - loss: 269.1870 - mae: 12.2690 - val_loss: 257.4179 - val_mae: 12.1301
preds <- model %>% predict(list(func_input = MIMS_test, scalar_input = scalar_test))
## 50/50 - 3s - 70ms/step
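# Test-set metrics: RMSE, plus R^2 computed two ways -- directly from the
# residual and total sums of squares, and as the R^2 of regressing the
# observed values on the predictions.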
rmse <- sqrt(mean((preds - bpS_test)^2))

ss_res <- sum((bpS_test - preds)^2)
ss_tot <- sum((bpS_test - mean(bpS_test))^2)
r_squared <- 1 - (ss_res / ss_tot)
r_2 <- summary(lm(bpS_test ~ preds))$r.squared

# Empirical 95% coverage: form a normal interval (prediction +/- 1.96 * residual SD)
# and check how often it contains the observed value. The residual SD is estimated
# on the same test set, so this is an optimistic, in-sample check.
residuals <- bpS_test - preds
resid_sd <- sd(residuals)
lower_bound <- preds - 1.96 * resid_sd
upper_bound <- preds + 1.96 * resid_sd
covered <- bpS_test >= lower_bound & bpS_test <= upper_bound
coverage_rate <- mean(covered)

cat("95% Coverage Rate:", round(coverage_rate * 100, 2), "%\n")
## 95% Coverage Rate: 95.13 %
cat("Manual R^2:", round(r_squared, 4), "\n")
## Manual R^2: 0.2054
cat("Auto R^2:", round(r_2, 4), "\n")
## Auto R^2: 0.2064
cat("Test RMSE:", rmse, "\n")
## Test RMSE: 15.91577
plot(history)
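# Diagnostic plots: actual vs. predicted (dashed identity line) and
# residuals vs. predicted systolic BP on the test set.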

results <- data.frame(actual = bpS_test, predicted = as.vector(preds))

ggplot(results, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = "red", linetype = "dashed") + 
  labs(title = "Actual vs. Predicted Values",
       x = "Actual Blood Pressure",
       y = "Predicted Blood Pressure") +
  coord_fixed() +
  theme_minimal()

results$residuals <- results$predicted - results$actual

ggplot(results, aes(x = predicted, y = residuals)) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Residuals vs. Predicted Values",
       x = "Predicted Blood Pressure",
       y = "Residuals (Predicted - Actual)") +
  theme_minimal()

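# Risk-zone plot: partition the predicted/actual plane into clinical BP
# categories (Low / Normal / Elevated / Stage 1 / Stage 2) and color each cell
# by the riskier of the two categories; the dashed lines mark a +/- 20 mmHg
# error band around perfect agreement.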
zones <- data.frame(
  xmin = c(50, 100, 120, 130, 140),
  xmax = c(100, 120, 130, 140, Inf),
  fill = c("Low", "Normal", "Elevated", "ISH-S1", "S2"),
  color = c("lightblue", "green3", "yellow", "orange", "red")
)

zone_rects <- expand.grid(x = 1:nrow(zones), y = 1:nrow(zones))

max_risk_index <- pmax(zone_rects$x, zone_rects$y)
zone_rects <- cbind(
  zone_rects,
  xmin = zones$xmin[zone_rects$x],
  xmax = zones$xmax[zone_rects$x],
  ymin = zones$xmin[zone_rects$y],
  ymax = zones$xmax[zone_rects$y],
  fill = zones$color[max_risk_index]
)

ggplot() +
  geom_rect(data = zone_rects, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill), alpha = 0.4) +
  scale_fill_identity() +
  geom_point(data = results, aes(x = predicted, y = actual), color = "blue", size = 0.5) +
  geom_abline(intercept = -20, slope = 1, linetype = "dashed", color = "black", linewidth = 0.4) +
  geom_abline(intercept = 20, slope = 1, linetype = "dashed", color = "black", linewidth = 0.4) +
  scale_x_continuous(breaks = c(50, 80, 100, 120, 140, 150)) +
  scale_y_continuous(breaks = c(50, 80, 100, 120, 140, 150)) +
  coord_fixed(xlim = c(50, 200), ylim = c(50, 200), expand = FALSE) +
  labs(x = "Predicted BP", y = "True BP") +
  theme_minimal()

set.seed(1)
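# Permutation importance: shuffle each scalar predictor in the test set and
# record how much the test RMSE increases relative to the unshuffled baseline.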

scalar_variable_names <- colnames(scalar_test)
variable_importance <- data.frame(
  Variable = character(),
  Importance_RMSE_Increase = numeric(),
  stringsAsFactors = FALSE
)

for (i in 1:ncol(scalar_test)) {
  scalar_test_shuffled <- scalar_test
  scalar_test_shuffled[, i] <- sample(scalar_test_shuffled[, i])
  
  preds_shuffled <- model %>% predict(list(func_input = MIMS_test, scalar_input = scalar_test_shuffled))
  rmse_shuffled <- sqrt(mean((preds_shuffled - bpS_test)^2))
  importance <- rmse_shuffled - rmse
  
  variable_importance <- rbind(
    variable_importance, 
    data.frame(
      Variable = scalar_variable_names[i], 
      Importance_RMSE_Increase = importance
    )
  )
}
## 50/50 - 3s - 61ms/step
## 50/50 - 3s - 58ms/step
## 50/50 - 3s - 58ms/step
## 50/50 - 3s - 60ms/step
## 50/50 - 3s - 57ms/step
## 50/50 - 3s - 59ms/step
## 50/50 - 3s - 57ms/step
## 50/50 - 3s - 60ms/step
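# Repeat for the functional predictor: permute the MIMS values across subjects
# at every time point, breaking the curve-outcome association while keeping
# each time point's marginal distribution.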
MIMS_test_shuffled <- MIMS_test
for (t in 1:dim(MIMS_test)[2]) {
  MIMS_test_shuffled[, t, 1] <- sample(MIMS_test_shuffled[, t, 1])
}

preds_mims_shuffled <- model %>% predict(list(func_input = MIMS_test_shuffled, scalar_input = scalar_test))
## 50/50 - 3s - 58ms/step
rmse_mims_shuffled <- sqrt(mean((preds_mims_shuffled - bpS_test)^2))
mims_importance <- rmse_mims_shuffled - rmse

variable_importance <- rbind(
  variable_importance, 
  data.frame(
    Variable = "MIMS", 
    Importance_RMSE_Increase = mims_importance
  )
)

options(scipen = 999)
print("Variable Importance (Increase in RMSE):")
## [1] "Variable Importance (Increase in RMSE):"
print(variable_importance[order(-variable_importance$Importance_RMSE_Increase), ])
##        Variable Importance_RMSE_Increase
## 3    genderMale          12.613344661966
## 4  genderFemale          10.705532627400
## 1           Age           3.761359252388
## 5         CHDNo           2.241904219418
## 6        CHDYes           1.751069767620
## 2           BMI           0.149921321009
## 7    CHDRefused           0.000000000000
## 8 CHDDon't know           0.000000000000
## 9          MIMS          -0.000006617201
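
# Optional: a minimal sketch (not part of the original analysis) for visualizing
# the permutation importances as a bar chart, assuming the variable_importance
# data frame built above.
ggplot(variable_importance, aes(x = reorder(Variable, Importance_RMSE_Increase),
                                y = Importance_RMSE_Increase)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Increase in test RMSE after permutation",
       title = "Permutation variable importance") +
  theme_minimal()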
