knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(haven)
library(dplyr)
# 從原始 SAS 檔案中讀取資料
d4_die <- read_sas("d4_die.sas7bdat")
cis_newid <- read_sas("cis_newid.sas7bdat")
bone_newid <- read_sas("bone_newid.sas7bdat")
篩選出符合條件的對象: bone_newid:_COL3 等於 0 (滿65歲)。 cis_newid:所有出現過的對象。 主群體:原本就在 d4_die 中的所有人。 我們將這三個來源的 ID 取「聯集 (Union)」並去除重複,確保每個人只會出現一次。
# 保留滿 65 歲的人,並轉化欄位名稱以利對接
bone65 <- bone_newid %>%
filter(`_COL3` == 0) %>%
select(newid = newidno2) %>%
distinct()
# 提取 CIS 資料的唯一 ID
cis_id <- cis_newid %>%
select(newid = newidno2) %>%
distinct()
# 建立總目標群體:將三個來源合併, distinct 確保唯一
all_target_ids <- bind_rows(
bone65,
cis_id,
d4_die %>% select(newid)) %>%
distinct(newid)
cat("篩選後總唯一 ID 人數:", nrow(all_target_ids), "\n")
## 篩選後總唯一 ID 人數: 29163
以 d4_die 為主體進行合併,並僅保留符合上述 ID 清單的人員。 篩選:僅保留屬於上述目標 ID 清單的人。
去重複:先按 newid 與 Screen_date 排序,保留每個人「最近一次(最晚)」的篩檢紀錄。
final_base <- d4_die %>%
filter(newid %in% all_target_ids$newid) %>%
arrange(newid, desc(Screen_date)) %>%
distinct(newid, .keep_all = TRUE) %>%
mutate(death = ifelse(!is.na(D_date) & nchar(as.character(D_date)) == 8, 1, 0)) #D_date 不為空值且字元數等於 8 為死亡 (1),其餘為存活 (0)
cat("d4_die 去重後總人數:", nrow(final_base), "\n")
## d4_die 去重後總人數: 8222
將 bone_newid (滿65歲) 與 cis_newid 的特定欄位併入主表。
# 以 final_base 為主體,進行左合併 (Left Join)
final_dataset <- final_base %>%
left_join(
bone_newid %>%
select(newid = newidno2, `_COL1`, `_COL2`, `_COL3`, `_COL4`, `_COL5`, `_COL6`, `_COL17`, `_COL19`) %>%
distinct(newid, .keep_all = TRUE),
by = "newid"
) %>%
left_join(
cis_newid %>%
select(newid = newidno2, bir_y) %>%
distinct(newid, .keep_all = TRUE),
by = "newid"
)
將 bone、cis、d4_die 所需的變項整理成分析資料集。
# 定義欲保留的變項清單
bone_vars <- c("_COL1","_COL2","_COL3","_COL4",
"_COL5","_COL6","_COL17","_COL19")
cis_vars <- c("bir_y")
d4_vars <- c(
"gender","residence","marriage","edu",
"go_out","visit","helpful","conversation","memory",
"depressed_mood","anxiety","stress",
paste0("PHQ_", 1:9), "PHQ_total",
paste0("AD_", 1:8), "AD_total",
"BH","BW","BW_loss_year","stand_up","Not_energetic",
"grip","walk_4s",
paste0("ADL_", 1:10), "ADL_total",
"e1","e2","e3","e4","e5","e8","e21",
"smoke","Drinkhabbit","chewnut","coffee","Exercise"
)
analysis_dataset <- final_dataset %>%
select(
newid, Screen_date, D_date, death,
all_of(bone_vars),
all_of(cis_vars),
all_of(d4_vars)
)
cat("最終資料集維度 (列, 欄):", dim(analysis_dataset), "\n")
## 最終資料集維度 (列, 欄): 8222 74