library(dplyr)
## Warning: 套件 'dplyr' 是用 R 版本 4.4.3 來建造的
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: 套件 'lubridate' 是用 R 版本 4.4.2 來建造的
##
## 載入套件:'lubridate'
## 下列物件被遮斷自 'package:base':
##
## date, intersect, setdiff, union
library(readr)
## Warning: 套件 'readr' 是用 R 版本 4.4.3 來建造的
library(tidyr)
# =========================
# 1️⃣ 讀資料
# =========================
df <- read_csv("C:/Users/yuting/Downloads/test.csv") %>%
mutate(
Rdate = as.Date(Rdate),
Birth = as.Date(Birth),
Outcome = as.numeric(Outcome)
)
## Rows: 21 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): ssid, ITEM, Rdate, Birth
## dbl (1): Outcome
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# =========================
# 2️⃣ abnormal + episode
# =========================
df2 <- df %>%
group_by(ssid, ITEM) %>%
arrange(Rdate) %>%
mutate(
abnormal = case_when(
ITEM == "EGFRM" ~ Outcome < 60,
ITEM == "UPCR" ~ Outcome > 0.2,
ITEM == "UACR" ~ Outcome > 0.03,
TRUE ~ FALSE
),
episode = cumsum(!abnormal)
) %>%
ungroup()
# =========================
# 3️⃣ index function
# =========================
find_index_date <- function(d, a) {
i <- which(a)
if (length(i) < 2) return(NA_Date_)
for (k in seq_len(length(i) - 1)) {
for (m in (k + 1):length(i)) {
if ((d[i[m]] - d[i[k]]) >= 90) return(d[i[m]])
}
}
NA_Date_
}
# =========================
# 4️⃣ index date
# =========================
index_wide <- df2 %>%
group_by(ssid, ITEM, episode) %>%
summarise(
index_date = find_index_date(Rdate, abnormal),
.groups = "drop"
) %>%
filter(!is.na(index_date)) %>%
group_by(ssid, ITEM) %>%
summarise(index_date = max(index_date), .groups = "drop") %>%
pivot_wider(names_from = ITEM, values_from = index_date, names_prefix = "index_") %>%
mutate(across(starts_with("index_"), as.Date)) %>%
rowwise() %>%
mutate(index_min = min(c_across(starts_with("index_")), na.rm = TRUE)) %>%
ungroup() %>%
left_join(df %>% distinct(ssid, Birth), by = "ssid") %>%
mutate(
Age = floor(time_length(interval(Birth, Sys.Date()), "years")),
Age_index_date = floor(time_length(interval(Birth, index_min), "years"))
)
# =========================
# 5️⃣ baseline mean (2y + index day)
# =========================
baseline_wide <- df %>%
left_join(index_wide %>% select(ssid, index_min), by = "ssid") %>%
mutate(start_2y = index_min %m-% years(2)) %>%
filter(Rdate >= start_2y, Rdate <= index_min) %>%
group_by(ssid, ITEM) %>%
summarise(mean_2y = mean(Outcome, na.rm = TRUE), .groups = "drop") %>%
complete(ssid, ITEM = unique(df$ITEM)) %>%
pivot_wider(names_from = ITEM, values_from = mean_2y, names_prefix = "mean_")
# =========================
# 6️⃣ final output
# =========================
final_df <- index_wide %>%
left_join(baseline_wide, by = "ssid")
final_df
## # A tibble: 2 × 12
## ssid index_EGFRM index_UACR index_UPCR index_min Birth Age
## <chr> <date> <date> <date> <date> <date> <dbl>
## 1 A1 2021-05-19 2011-05-17 NA 2011-05-17 1970-01-01 56
## 2 A2 2015-01-09 NA 2025-01-03 2015-01-09 2000-12-01 25
## # ℹ 5 more variables: Age_index_date <dbl>, mean_EGFRM <dbl>, mean_LDLC <dbl>,
## # mean_UACR <dbl>, mean_UPCR <dbl>