データ読み込み

# Load the packages used by this analysis.
pacman::p_load(
  tidyverse, lubridate, tableone, survival, survminer, car, svglite, missForest
)

# Read the dataset that was prepared for missing-value imputation.
user.data_8.detect <- read.csv(
  file = "/Users/abekohei/Desktop/HDS演習_DeSC/HDS_DeSC/user.data_8.欠測値補完用.csv"
)

# Print the column names to confirm what was read.
colnames(user.data_8.detect)
##  [1] "ID"                     "gender"                 "age"                   
##  [4] "kencom登録年月"         "資格取得年月"           "資格喪失年月"          
##  [7] "レセプトデータ開始年月" "レセプトデータ終了年月" "baseline健診受診年月"  
## [10] "weight_nz.b"            "systolicBP_nz.b"        "diastolicBP_nz.b"      
## [13] "ldl_c_nz.b"             "hdl_c_nz.b"             "TG_nz.b"               
## [16] "HbA1c_nz.b"             "データ有り月数_total"   "データ有り月数_pre"    
## [19] "データ有り月数_post"    "月ごとの平均データ数"   "step_mean_pre"         
## [22] "step_mean_post"         "step_sd_pre"            "step_sd_post"          
## [25] "step_mean_pre.wd"       "step_mean_post.wd"      "step_sd_pre.wd"        
## [28] "step_sd_post.wd"        "step_mean_pre.we"       "step_mean_post.we"     
## [31] "step_sd_pre.we"         "step_sd_post.we"        "step.diff"             
## [34] "step.diff.wd"           "step.diff.we"           "weight_nz.f"           
## [37] "systolicBP_nz.f"        "diastolicBP_nz.f"       "ldl_c_nz.f"            
## [40] "hdl_c_nz.f"             "TG_nz.f"                "HbA1c_nz.f"            
## [43] "followup健診受診年月"   "weight_nz.diff"         "systolicBP_nz.diff"    
## [46] "diastolicBP_nz.diff"    "ldl_c_nz.diff"          "hdl_c_nz.diff"         
## [49] "TG_nz.diff"             "HbA1c_nz.diff"

欠測値の確認

# Count the missing values in each column.
colSums(is.na(user.data_8.detect))
##                     ID                 gender                    age 
##                      0                      0                      0 
##         kencom登録年月           資格取得年月           資格喪失年月 
##                      0                      0                      0 
## レセプトデータ開始年月 レセプトデータ終了年月   baseline健診受診年月 
##                      0                      0                      0 
##            weight_nz.b        systolicBP_nz.b       diastolicBP_nz.b 
##                      4                      4                      4 
##             ldl_c_nz.b             hdl_c_nz.b                TG_nz.b 
##                     24                     23                     23 
##             HbA1c_nz.b   データ有り月数_total     データ有り月数_pre 
##                    175                      0                      0 
##    データ有り月数_post   月ごとの平均データ数          step_mean_pre 
##                      0                      0                      0 
##         step_mean_post            step_sd_pre           step_sd_post 
##                      0                      0                      0 
##       step_mean_pre.wd      step_mean_post.wd         step_sd_pre.wd 
##                      0                      0                      0 
##        step_sd_post.wd       step_mean_pre.we      step_mean_post.we 
##                      0                      0                      0 
##         step_sd_pre.we        step_sd_post.we              step.diff 
##                      0                      0                      0 
##           step.diff.wd           step.diff.we            weight_nz.f 
##                      0                      0                      2 
##        systolicBP_nz.f       diastolicBP_nz.f             ldl_c_nz.f 
##                      1                      1                      9 
##             hdl_c_nz.f                TG_nz.f             HbA1c_nz.f 
##                      8                      8                    104 
##   followup健診受診年月         weight_nz.diff     systolicBP_nz.diff 
##                      0                      6                      5 
##    diastolicBP_nz.diff          ldl_c_nz.diff          hdl_c_nz.diff 
##                      5                     26                     24 
##             TG_nz.diff          HbA1c_nz.diff 
##                     24                    203

データ型の確認(missForest関数はcharacter型の列を含むデータを扱えないため。factor型、numeric型、integer型の列は扱える)

# Report the class of every column as a named character vector.
# vapply() guarantees a character result: a column carrying several classes is
# collapsed into one "a/b" label instead of silently turning the whole result
# into a list, which is what sapply() would do.
vapply(
  user.data_8.detect,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
##                     ID                 gender                    age 
##            "character"            "character"              "integer" 
##         kencom登録年月           資格取得年月           資格喪失年月 
##            "character"            "character"            "character" 
## レセプトデータ開始年月 レセプトデータ終了年月   baseline健診受診年月 
##            "character"            "character"            "character" 
##            weight_nz.b        systolicBP_nz.b       diastolicBP_nz.b 
##              "numeric"              "numeric"              "numeric" 
##             ldl_c_nz.b             hdl_c_nz.b                TG_nz.b 
##              "numeric"              "numeric"              "numeric" 
##             HbA1c_nz.b   データ有り月数_total     データ有り月数_pre 
##              "numeric"              "integer"              "integer" 
##    データ有り月数_post   月ごとの平均データ数          step_mean_pre 
##              "integer"              "numeric"              "numeric" 
##         step_mean_post            step_sd_pre           step_sd_post 
##              "numeric"              "numeric"              "numeric" 
##       step_mean_pre.wd      step_mean_post.wd         step_sd_pre.wd 
##              "numeric"              "numeric"              "numeric" 
##        step_sd_post.wd       step_mean_pre.we      step_mean_post.we 
##              "numeric"              "numeric"              "numeric" 
##         step_sd_pre.we        step_sd_post.we              step.diff 
##              "numeric"              "numeric"              "numeric" 
##           step.diff.wd           step.diff.we            weight_nz.f 
##              "numeric"              "numeric"              "numeric" 
##        systolicBP_nz.f       diastolicBP_nz.f             ldl_c_nz.f 
##              "numeric"              "numeric"              "numeric" 
##             hdl_c_nz.f                TG_nz.f             HbA1c_nz.f 
##              "numeric"              "numeric"              "numeric" 
##   followup健診受診年月         weight_nz.diff     systolicBP_nz.diff 
##            "character"              "numeric"              "numeric" 
##    diastolicBP_nz.diff          ldl_c_nz.diff          hdl_c_nz.diff 
##              "numeric"              "numeric"              "numeric" 
##             TG_nz.diff          HbA1c_nz.diff 
##              "numeric"              "numeric"
# ID is dropped before imputation, so it does not need to be converted to a
# factor. Recode gender as a factor and store the integer columns as doubles.
user.data_8.detect <- user.data_8.detect %>%
  mutate(
    gender = as.factor(gender),
    across(
      c(age, データ有り月数_total, データ有り月数_pre, データ有り月数_post),
      as.numeric
    )
  )

missForestに対応していない型の列(character型の列)は削除

# Build the imputation input: drop the identifier and the year-month columns,
# all of which are character columns.
user.data_8.detect_missF <- user.data_8.detect %>%
  ungroup() %>% # remove any grouping
  select(
    -c(
      ID,
      kencom登録年月,
      baseline健診受診年月,
      followup健診受診年月,
      資格取得年月,
      資格喪失年月,
      レセプトデータ開始年月,
      レセプトデータ終了年月
    )
  )

# Keep the ID column aside: it is excluded from the imputation input but has to
# be re-attached to the missForest result afterwards.
id <- user.data_8.detect$ID

# Confirm the class of every column that will be passed to missForest.
# vapply() guarantees a named character vector even if a column carries more
# than one class (sapply() would return a list in that case).
vapply(
  user.data_8.detect_missF,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
##               gender                  age          weight_nz.b 
##             "factor"            "numeric"            "numeric" 
##      systolicBP_nz.b     diastolicBP_nz.b           ldl_c_nz.b 
##            "numeric"            "numeric"            "numeric" 
##           hdl_c_nz.b              TG_nz.b           HbA1c_nz.b 
##            "numeric"            "numeric"            "numeric" 
## データ有り月数_total   データ有り月数_pre  データ有り月数_post 
##            "numeric"            "numeric"            "numeric" 
## 月ごとの平均データ数        step_mean_pre       step_mean_post 
##            "numeric"            "numeric"            "numeric" 
##          step_sd_pre         step_sd_post     step_mean_pre.wd 
##            "numeric"            "numeric"            "numeric" 
##    step_mean_post.wd       step_sd_pre.wd      step_sd_post.wd 
##            "numeric"            "numeric"            "numeric" 
##     step_mean_pre.we    step_mean_post.we       step_sd_pre.we 
##            "numeric"            "numeric"            "numeric" 
##      step_sd_post.we            step.diff         step.diff.wd 
##            "numeric"            "numeric"            "numeric" 
##         step.diff.we          weight_nz.f      systolicBP_nz.f 
##            "numeric"            "numeric"            "numeric" 
##     diastolicBP_nz.f           ldl_c_nz.f           hdl_c_nz.f 
##            "numeric"            "numeric"            "numeric" 
##              TG_nz.f           HbA1c_nz.f       weight_nz.diff 
##            "numeric"            "numeric"            "numeric" 
##   systolicBP_nz.diff  diastolicBP_nz.diff        ldl_c_nz.diff 
##            "numeric"            "numeric"            "numeric" 
##        hdl_c_nz.diff           TG_nz.diff        HbA1c_nz.diff 
##            "numeric"            "numeric"            "numeric"

missForest実行

# Impute the missing values with missForest (random-forest based imputation).
# The forests are grown with random sampling, so fix the RNG seed first;
# otherwise the imputed values differ on every run and the saved result cannot
# be reproduced.
set.seed(1234)
imp_missForest <- missForest(
  xmis = as.data.frame(user.data_8.detect_missF),
  maxiter = 30, # upper bound on the number of imputation iterations
  ntree = 20 # number of trees grown in each forest
)

結果の確認と保存

# Re-attach the IDs set aside earlier to the imputed data (`ximp`). Rows keep
# their original order, so a plain column bind lines the IDs up correctly.
# (The earlier bare assignment of `ximp` was immediately overwritten and has
# been removed.)
user.data_9_hokan <- cbind(ID = id, imp_missForest$ximp)

# Save the imputed dataset.
write.csv(
  user.data_9_hokan,
  "/Users/abekohei/Desktop/HDS演習_DeSC/HDS_DeSC/user.data_9.欠測値補完済.csv",
  row.names = FALSE
)