data-preprocessing-3

資料預處理摘要：本研究以 d4_die 原始資料庫與 cis_newid 資料庫之交集個案作為研究主體，確保每位個案皆具備出生年資歷。針對重複篩檢者，統一保留其最早（Earliest）之一筆紀錄作為基準點。骨密度資料（bone_newid 且年齡滿 65 歲者）則採補件方式併入，若個案缺乏骨密度檢測資料，相關欄位以缺失值（NA）處理，不影響個案之納入。最終分析族群總人數為 4,354 人。

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(haven)
library(dplyr)
library(ggvenn)
# 從原始 SAS 檔案中讀取資料
d4_die <- read_sas("d4_die.sas7bdat")
cis_newid <- read_sas("cis_newid.sas7bdat")
bone_newid <- read_sas("bone_newid.sas7bdat")

建立 ID 清單

出現在 bone_newid 且 _COL3 == 0 (滿 65 歲)。出現在 cis_newid 資料中。

# 保留滿 65 歲的人，並轉化欄位名稱以利對接
bone65 <- bone_newid %>%
  filter(`_COL3` == 0) %>%
  select(newid = newidno2) %>%
  distinct()

# 提取 CIS 資料的唯一 ID
cis_id <- cis_newid %>%
  select(newid = newidno2) %>%
  distinct()

# 取聯集 (Union)：只要符合其中一個條件就納入
#all_target_ids <- bind_rows(bone65, cis_id) %>%
all_target_ids <- cis_id %>%
  distinct(newid)

cat("符合納入標準 (Bone 65+ 或 CIS) 的總人數:", nrow(all_target_ids), "\n")

## 符合納入標準 (Bone 65+ 或 CIS) 的總人數: 14378

篩選

以 d4_die 為主體進行合併，並僅保留符合上述 ID 清單的人員。篩選：僅保留屬於上述目標 ID 清單的人。

去重複：先按 newid 與 Screen_date 排序，保留時間上最遠（最早）的一筆資料作為分析基準。。

final_base <- d4_die %>%
  filter(newid %in% all_target_ids$newid) %>%
  arrange(newid, Screen_date) %>%
  distinct(newid, .keep_all = TRUE) %>%
  mutate(death = ifelse(!is.na(D_date) & nchar(as.character(D_date)) == 8, 1, 0)) #D_date 不為空值且字元數等於 8 為死亡 (1)，其餘為存活 (0)

cat("最終納入分析的研究族群人數 (N):", nrow(final_base), "\n")

## 最終納入分析的研究族群人數 (N): 400

合併外部變項 (Bone & CIS)

將 bone_newid (滿65歲) 與 cis_newid 的特定欄位併入主表。

# 定義欲保留的變項
bone_vars <- c("_COL1","_COL2","_COL3","_COL4","_COL5","_COL6","_COL17","_COL19")
cis_vars  <- c("bir_y")
d4_vars   <- c("gender","residence","marriage","edu","go_out","visit","helpful",
               "conversation","memory","depressed_mood","anxiety","stress",
               paste0("PHQ_", 1:9), "PHQ_total", paste0("AD_", 1:8), "AD_total",
               "BH","BW","BW_loss_year","stand_up","Not_energetic","grip","walk_4s",
               paste0("ADL_", 1:10), "ADL_total","e1","e2","e3","e4","e5","e8","e21",
               "smoke","Drinkhabbit","chewnut","coffee","Exercise")

final_dataset <- final_base %>%
  left_join(
    bone_newid %>% 
      filter(`_COL3` == 0) %>%
      select(newid = newidno2, all_of(bone_vars)) %>%
      distinct(newid, .keep_all = TRUE), 
    by = "newid"
  ) %>%
  left_join(
    cis_newid %>% 
      select(newid = newidno2, all_of(cis_vars)) %>%
      distinct(newid, .keep_all = TRUE), 
    by = "newid"
  )

analysis_dataset <- final_dataset %>%
  select(
    newid, Screen_date, D_date, death,
    all_of(bone_vars),
    all_of(cis_vars),
    all_of(d4_vars)
  )

cat("最終資料集維度:", dim(analysis_dataset), "\n")

## 最終資料集維度: 400 74

venn_list <- list(
  "d4_die" = d4_die$newid,
  "Bone 65+" = bone65$newid,
  "CIS" = cis_id$newid
)
ggvenn(venn_list, fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF"), text_size = 3)

data-preprocessing-3

kellysi666s@gmail.com

2025-12-25

建立 ID 清單

篩選

合併外部變項 (Bone & CIS)