library(dplyr)
## Warning: 套件 'dplyr' 是用 R 版本 4.4.3 來建造的
## 
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
## 
##     filter, lag
## 下列物件被遮斷自 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: 套件 'lubridate' 是用 R 版本 4.4.2 來建造的
## 
## 載入套件:'lubridate'
## 下列物件被遮斷自 'package:base':
## 
##     date, intersect, setdiff, union
library(readr)
## Warning: 套件 'readr' 是用 R 版本 4.4.3 來建造的
library(tidyr)

# =========================
# 1️⃣ 讀資料
# =========================
df <- read_csv("C:/Users/yuting/Downloads/test.csv") %>%
  mutate(
    Rdate = as.Date(Rdate),
    Birth = as.Date(Birth),
    Outcome = as.numeric(Outcome)
  )
## Rows: 21 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): ssid, ITEM, Rdate, Birth
## dbl (1): Outcome
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# =========================
# 2️⃣ abnormal + episode
# =========================
df2 <- df %>%
  group_by(ssid, ITEM) %>%
  arrange(Rdate) %>%
  mutate(
    abnormal = case_when(
      ITEM == "EGFRM" ~ Outcome < 60,
      ITEM == "UPCR"  ~ Outcome > 0.2,
      ITEM == "UACR"  ~ Outcome > 0.03,
      TRUE ~ FALSE
    ),
    episode = cumsum(!abnormal)
  ) %>%
  ungroup()

# =========================
# 3️⃣ index function
# =========================
find_index_date <- function(d, a) {
  i <- which(a)
  if (length(i) < 2) return(NA_Date_)
  
  for (k in seq_len(length(i) - 1)) {
    for (m in (k + 1):length(i)) {
      if ((d[i[m]] - d[i[k]]) >= 90) return(d[i[m]])
    }
  }
  NA_Date_
}

# =========================
# 4️⃣ index date
# =========================
index_wide <- df2 %>%
  group_by(ssid, ITEM, episode) %>%
  summarise(
    index_date = find_index_date(Rdate, abnormal),
    .groups = "drop"
  ) %>%
  filter(!is.na(index_date)) %>%
  group_by(ssid, ITEM) %>%
  summarise(index_date = max(index_date), .groups = "drop") %>%
  pivot_wider(names_from = ITEM, values_from = index_date, names_prefix = "index_") %>%
  mutate(across(starts_with("index_"), as.Date)) %>%
  rowwise() %>%
  mutate(index_min = min(c_across(starts_with("index_")), na.rm = TRUE)) %>%
  ungroup() %>%
  left_join(df %>% distinct(ssid, Birth), by = "ssid") %>%
  mutate(
    Age = floor(time_length(interval(Birth, Sys.Date()), "years")),
    Age_index_date = floor(time_length(interval(Birth, index_min), "years"))
  )

# =========================
# 5️⃣ baseline mean (2y + index day)
# =========================
baseline_wide <- df %>%
  left_join(index_wide %>% select(ssid, index_min), by = "ssid") %>%
  mutate(start_2y = index_min %m-% years(2)) %>%
  filter(Rdate >= start_2y, Rdate <= index_min) %>%
  group_by(ssid, ITEM) %>%
  summarise(mean_2y = mean(Outcome, na.rm = TRUE), .groups = "drop") %>%
  complete(ssid, ITEM = unique(df$ITEM)) %>%
  pivot_wider(names_from = ITEM, values_from = mean_2y, names_prefix = "mean_")

# =========================
# 6️⃣ final output
# =========================
final_df <- index_wide %>%
  left_join(baseline_wide, by = "ssid")

final_df
## # A tibble: 2 × 12
##   ssid  index_EGFRM index_UACR index_UPCR index_min  Birth        Age
##   <chr> <date>      <date>     <date>     <date>     <date>     <dbl>
## 1 A1    2021-05-19  2011-05-17 NA         2011-05-17 1970-01-01    56
## 2 A2    2015-01-09  NA         2025-01-03 2015-01-09 2000-12-01    25
## # ℹ 5 more variables: Age_index_date <dbl>, mean_EGFRM <dbl>, mean_LDLC <dbl>,
## #   mean_UACR <dbl>, mean_UPCR <dbl>