データ読み込み
# Attach every package used by this analysis (pacman::p_load loads them in one call).
pacman::p_load(
  tidyverse, lubridate, tableone, survival,
  survminer, car, svglite, missForest
)
# Read the dataset prepared for missing-value imputation.
user.data_8.detect <- read.csv(
  file = "/Users/abekohei/Desktop/HDS演習_DeSC/HDS_DeSC/user.data_8.欠測値補完用.csv"
)
# List the column names to confirm what was read.
names(user.data_8.detect)
## [1] "ID" "gender" "age"
## [4] "kencom登録年月" "資格取得年月" "資格喪失年月"
## [7] "レセプトデータ開始年月" "レセプトデータ終了年月" "baseline健診受診年月"
## [10] "weight_nz.b" "systolicBP_nz.b" "diastolicBP_nz.b"
## [13] "ldl_c_nz.b" "hdl_c_nz.b" "TG_nz.b"
## [16] "HbA1c_nz.b" "データ有り月数_total" "データ有り月数_pre"
## [19] "データ有り月数_post" "月ごとの平均データ数" "step_mean_pre"
## [22] "step_mean_post" "step_sd_pre" "step_sd_post"
## [25] "step_mean_pre.wd" "step_mean_post.wd" "step_sd_pre.wd"
## [28] "step_sd_post.wd" "step_mean_pre.we" "step_mean_post.we"
## [31] "step_sd_pre.we" "step_sd_post.we" "step.diff"
## [34] "step.diff.wd" "step.diff.we" "weight_nz.f"
## [37] "systolicBP_nz.f" "diastolicBP_nz.f" "ldl_c_nz.f"
## [40] "hdl_c_nz.f" "TG_nz.f" "HbA1c_nz.f"
## [43] "followup健診受診年月" "weight_nz.diff" "systolicBP_nz.diff"
## [46] "diastolicBP_nz.diff" "ldl_c_nz.diff" "hdl_c_nz.diff"
## [49] "TG_nz.diff" "HbA1c_nz.diff"
欠測値の確認
# Count the missing values in each column.
user.data_8.detect %>%
  is.na() %>%
  colSums()
## ID gender age
## 0 0 0
## kencom登録年月 資格取得年月 資格喪失年月
## 0 0 0
## レセプトデータ開始年月 レセプトデータ終了年月 baseline健診受診年月
## 0 0 0
## weight_nz.b systolicBP_nz.b diastolicBP_nz.b
## 4 4 4
## ldl_c_nz.b hdl_c_nz.b TG_nz.b
## 24 23 23
## HbA1c_nz.b データ有り月数_total データ有り月数_pre
## 175 0 0
## データ有り月数_post 月ごとの平均データ数 step_mean_pre
## 0 0 0
## step_mean_post step_sd_pre step_sd_post
## 0 0 0
## step_mean_pre.wd step_mean_post.wd step_sd_pre.wd
## 0 0 0
## step_sd_post.wd step_mean_pre.we step_mean_post.we
## 0 0 0
## step_sd_pre.we step_sd_post.we step.diff
## 0 0 0
## step.diff.wd step.diff.we weight_nz.f
## 0 0 2
## systolicBP_nz.f diastolicBP_nz.f ldl_c_nz.f
## 1 1 9
## hdl_c_nz.f TG_nz.f HbA1c_nz.f
## 8 8 104
## followup健診受診年月 weight_nz.diff systolicBP_nz.diff
## 0 6 5
## diastolicBP_nz.diff ldl_c_nz.diff hdl_c_nz.diff
## 5 26 24
## TG_nz.diff HbA1c_nz.diff
## 24 203
データ型の確認(missForest関数はcharacter型の列を含むデータを扱えないため。factor型、numeric型、integer型の列は扱える)
# Show the class of every column to spot the character columns.
user.data_8.detect %>%
  sapply(class)
## ID gender age
## "character" "character" "integer"
## kencom登録年月 資格取得年月 資格喪失年月
## "character" "character" "character"
## レセプトデータ開始年月 レセプトデータ終了年月 baseline健診受診年月
## "character" "character" "character"
## weight_nz.b systolicBP_nz.b diastolicBP_nz.b
## "numeric" "numeric" "numeric"
## ldl_c_nz.b hdl_c_nz.b TG_nz.b
## "numeric" "numeric" "numeric"
## HbA1c_nz.b データ有り月数_total データ有り月数_pre
## "numeric" "integer" "integer"
## データ有り月数_post 月ごとの平均データ数 step_mean_pre
## "integer" "numeric" "numeric"
## step_mean_post step_sd_pre step_sd_post
## "numeric" "numeric" "numeric"
## step_mean_pre.wd step_mean_post.wd step_sd_pre.wd
## "numeric" "numeric" "numeric"
## step_sd_post.wd step_mean_pre.we step_mean_post.we
## "numeric" "numeric" "numeric"
## step_sd_pre.we step_sd_post.we step.diff
## "numeric" "numeric" "numeric"
## step.diff.wd step.diff.we weight_nz.f
## "numeric" "numeric" "numeric"
## systolicBP_nz.f diastolicBP_nz.f ldl_c_nz.f
## "numeric" "numeric" "numeric"
## hdl_c_nz.f TG_nz.f HbA1c_nz.f
## "numeric" "numeric" "numeric"
## followup健診受診年月 weight_nz.diff systolicBP_nz.diff
## "character" "numeric" "numeric"
## diastolicBP_nz.diff ldl_c_nz.diff hdl_c_nz.diff
## "numeric" "numeric" "numeric"
## TG_nz.diff HbA1c_nz.diff
## "numeric" "numeric"
# Convert column types ahead of imputation: gender becomes a factor, and age
# plus the three month-count columns become numeric.
# ID is not converted because it is dropped before imputation:
# user.data_8.detect$ID <- as.factor(user.data_8.detect$ID)
user.data_8.detect <- user.data_8.detect %>%
  mutate(
    gender = as.factor(gender),
    across(
      all_of(c(
        "age",
        "データ有り月数_total",
        "データ有り月数_pre",
        "データ有り月数_post"
      )),
      as.numeric
    )
  )
missForestが対応していない列(character型の列)を削除
# Build the missForest input by dropping the ID and all year-month columns,
# which are the character columns of the dataset.
user.data_8.detect_missF <- user.data_8.detect %>%
  ungroup() %>% # remove any grouping before selecting columns
  select(-all_of(c(
    "ID",
    "kencom登録年月",
    "baseline健診受診年月",
    "followup健診受診年月",
    "資格取得年月",
    "資格喪失年月",
    "レセプトデータ開始年月",
    "レセプトデータ終了年月"
  )))
# Keep the IDs aside: they are excluded from the imputation input but are
# merged back into the missForest result afterwards.
id <- user.data_8.detect[["ID"]]
# Confirm that no character column remains in the imputation input.
user.data_8.detect_missF %>%
  sapply(class)
## gender age weight_nz.b
## "factor" "numeric" "numeric"
## systolicBP_nz.b diastolicBP_nz.b ldl_c_nz.b
## "numeric" "numeric" "numeric"
## hdl_c_nz.b TG_nz.b HbA1c_nz.b
## "numeric" "numeric" "numeric"
## データ有り月数_total データ有り月数_pre データ有り月数_post
## "numeric" "numeric" "numeric"
## 月ごとの平均データ数 step_mean_pre step_mean_post
## "numeric" "numeric" "numeric"
## step_sd_pre step_sd_post step_mean_pre.wd
## "numeric" "numeric" "numeric"
## step_mean_post.wd step_sd_pre.wd step_sd_post.wd
## "numeric" "numeric" "numeric"
## step_mean_pre.we step_mean_post.we step_sd_pre.we
## "numeric" "numeric" "numeric"
## step_sd_post.we step.diff step.diff.wd
## "numeric" "numeric" "numeric"
## step.diff.we weight_nz.f systolicBP_nz.f
## "numeric" "numeric" "numeric"
## diastolicBP_nz.f ldl_c_nz.f hdl_c_nz.f
## "numeric" "numeric" "numeric"
## TG_nz.f HbA1c_nz.f weight_nz.diff
## "numeric" "numeric" "numeric"
## systolicBP_nz.diff diastolicBP_nz.diff ldl_c_nz.diff
## "numeric" "numeric" "numeric"
## hdl_c_nz.diff TG_nz.diff HbA1c_nz.diff
## "numeric" "numeric" "numeric"
missForest実行
# Impute the missing values with missForest (random-forest-based imputation).
# The forests rely on random sampling, so the RNG seed is fixed first;
# without it every run produces different imputed values and the saved
# dataset cannot be reproduced.
# NOTE(review): the *.diff columns are imputed independently of their *.b and
# *.f counterparts, so an imputed diff presumably no longer equals the
# difference between the (imputed) baseline and follow-up values. Confirm
# whether the diffs should instead be recomputed after imputation.
set.seed(1234)
imp_missForest <- missForest(
  xmis = as.data.frame(user.data_8.detect_missF),
  maxiter = 30, # upper bound on the number of imputation iterations
  ntree = 20 # number of trees grown in each forest
)
結果の取り出し・IDの統合・保存
# Re-attach the IDs set aside before imputation to the completed data
# (imp_missForest$ximp); the columns are bound by row position.
# The former standalone assignment of imp_missForest$ximp was overwritten
# immediately and has been removed.
user.data_9_hokan <- cbind(ID = id, imp_missForest$ximp)
# Save the imputed dataset as CSV without row names.
write.csv(
  user.data_9_hokan,
  file = "/Users/abekohei/Desktop/HDS演習_DeSC/HDS_DeSC/user.data_9.欠測値補完済.csv",
  row.names = FALSE
)