knitr::opts_chunk$set(warning = FALSE, message = FALSE)

package

pacman::p_load(tidyverse, lubridate, tableone, skimr,
                duckdb, arrow, RSQLite)

データベース

FドライブのDatabaseフォルダ内のmimic.dbにアクセスする場合は以下のコード(各自の状況に合わせて適宜修正)。 (mimic.dbが無い場合は以下の操作で空のmimic.dbが作成される)

con_mimic <- dbConnect(duckdb::duckdb(), "F:/Database/mimic.db")

mimic.dbの中に入っているデータリスト

dbListTables(con_mimic)
##  [1] "admissions"                 "apsiii"                    
##  [3] "callout"                    "caregivers"                
##  [5] "chartevents"                "comorbidities_AHRQ"        
##  [7] "comorbidities_Quan"         "cptevents"                 
##  [9] "d_cpt"                      "d_icd_diagnoses"           
## [11] "d_icd_procedures"           "d_items"                   
## [13] "d_labitems"                 "datetimeevents"            
## [15] "db_HtWt"                    "db_LVEF"                   
## [17] "db_LVEF_assess"             "db_PEO"                    
## [19] "db_comorb"                  "db_ethnicity"              
## [21] "db_patients"                "db_rrt"                    
## [23] "db_sofa"                    "db_urine"                  
## [25] "db_venti"                   "db_vital"                  
## [27] "df_lab_list"                "diagnoses_icd"             
## [29] "drgcodes"                   "height_firstday"           
## [31] "height_weight"              "icustay_details"           
## [33] "icustays"                   "inputevents_cv"            
## [35] "inputevents_mv"             "labevents"                 
## [37] "microbiologyevents"         "noteevents"                
## [39] "outputevents"               "patients"                  
## [41] "pivot_RRT"                  "pivot_height"              
## [43] "pivot_labo"                 "pivot_urine_output"        
## [45] "pivot_vitals"               "prescriptions"             
## [47] "procedureevents_mv"         "procedures_icd"            
## [49] "services"                   "sofa"                      
## [51] "test"                       "transfers"                 
## [53] "urine_output"               "urine_output_firstday"     
## [55] "ventilation_classification" "ventilation_duration"      
## [57] "weight_duration"            "weight_firstday"

その他の処理

削除

# con_mimic %>% dbRemoveTable("caregivers")

最後に接続解除する時

# dbDisconnect(con_mimic, shutdown=TRUE)

—dataの読み込み —

## ファイル名リストの抽出

以下は、 F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4 の下にmimic iiiの全データファイルが入っていた場合。
同じフォルダに他の.csv.gzファイルを入れない事!

csv_list <- list.files(path = "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/",
                       pattern = ".csv.gz$",
                       full.names = T)

csv_list
##  [1] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/ADMISSIONS.csv.gz"        
##  [2] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/CALLOUT.csv.gz"           
##  [3] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/CAREGIVERS.csv.gz"        
##  [4] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/CHARTEVENTS.csv.gz"       
##  [5] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/CPTEVENTS.csv.gz"         
##  [6] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_CPT.csv.gz"             
##  [7] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_ICD_DIAGNOSES.csv.gz"   
##  [8] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_ICD_PROCEDURES.csv.gz"  
##  [9] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_ITEMS.csv.gz"           
## [10] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_LABITEMS.csv.gz"        
## [11] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/DATETIMEEVENTS.csv.gz"    
## [12] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/DIAGNOSES_ICD.csv.gz"     
## [13] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/DRGCODES.csv.gz"          
## [14] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz"          
## [15] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/INPUTEVENTS_CV.csv.gz"    
## [16] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/INPUTEVENTS_MV.csv.gz"    
## [17] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/LABEVENTS.csv.gz"         
## [18] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/MICROBIOLOGYEVENTS.csv.gz"
## [19] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/NOTEEVENTS.csv.gz"        
## [20] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/OUTPUTEVENTS.csv.gz"      
## [21] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/PATIENTS.csv.gz"          
## [22] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/PRESCRIPTIONS.csv.gz"     
## [23] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/PROCEDUREEVENTS_MV.csv.gz"
## [24] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/PROCEDURES_ICD.csv.gz"    
## [25] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/SERVICES.csv.gz"          
## [26] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/TRANSFERS.csv.gz"

この場合、スラッシュ(/)はバックスラッシュ2つ(\\)と同じ意味になるので、以下の様なパス指定方法でも可。

short_list <- 
  list.files(path = "F:\\BigDatas\\mimic-iii-clinical-database-1.4\\mimic-iii-clinical-database-1.4",
                       pattern = ".csv.gz$",
                       full.names = F)

name_list <- short_list |> 
  str_remove_all(pattern = ".csv.gz") |> 
  str_to_lower()

name_list
##  [1] "admissions"         "callout"            "caregivers"        
##  [4] "chartevents"        "cptevents"          "d_cpt"             
##  [7] "d_icd_diagnoses"    "d_icd_procedures"   "d_items"           
## [10] "d_labitems"         "datetimeevents"     "diagnoses_icd"     
## [13] "drgcodes"           "icustays"           "inputevents_cv"    
## [16] "inputevents_mv"     "labevents"          "microbiologyevents"
## [19] "noteevents"         "outputevents"       "patients"          
## [22] "prescriptions"      "procedureevents_mv" "procedures_icd"    
## [25] "services"           "transfers"
# name_list_2 <- name_list[c(1:3, 5:26)]

各ファイルををDB上に登録する

一回読んだら二度と読まない!
append = TRUEなので、同じ処理がエラーにならずにデータ量が追加されていく。危険。

name_list名で各ファイルをDB上に登録する

# for (i in seq_along(name_list)) {
# 
# callback <- function(df, pos) {
#    dbWriteTable(con_mimic, name_list[i], df, append = TRUE)
# }
# 
# # デフォルトではチャンクサイズは 10,000 レコードずつ。
# read_csv_chunked(csv_list[i],
#                  DataFrameCallback$new(callback),
#                  chunk_size = 100000)
# }

データの確認

charteventsは3.3億行!

db_ad <- tbl(con_mimic, "admissions") 

db_ad |> colnames()
##  [1] "ROW_ID"               "SUBJECT_ID"           "HADM_ID"             
##  [4] "ADMITTIME"            "DISCHTIME"            "DEATHTIME"           
##  [7] "ADMISSION_TYPE"       "ADMISSION_LOCATION"   "DISCHARGE_LOCATION"  
## [10] "INSURANCE"            "LANGUAGE"             "RELIGION"            
## [13] "MARITAL_STATUS"       "ETHNICITY"            "EDREGTIME"           
## [16] "EDOUTTIME"            "DIAGNOSIS"            "HOSPITAL_EXPIRE_FLAG"
## [19] "HAS_CHARTEVENTS_DATA"
db_ad |> count()
## # Source:   SQL [?? x 1]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
##       n
##   <dbl>
## 1 58976
db_ad |> head(5)
## # Source:   SQL [?? x 19]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
##   ROW_ID SUBJECT_ID HADM_ID ADMITTIME           DISCHTIME          
##    <dbl>      <dbl>   <dbl> <dttm>              <dttm>             
## 1     21         22  165315 2196-04-09 12:26:00 2196-04-10 15:54:00
## 2     22         23  152223 2153-09-03 07:15:00 2153-09-08 19:10:00
## 3     23         23  124321 2157-10-18 19:34:00 2157-10-25 14:00:00
## 4     24         24  161859 2139-06-06 16:14:00 2139-06-09 12:48:00
## 5     25         25  129635 2160-11-02 02:06:00 2160-11-05 14:55:00
## # ℹ 14 more variables: DEATHTIME <dttm>, ADMISSION_TYPE <chr>,
## #   ADMISSION_LOCATION <chr>, DISCHARGE_LOCATION <chr>, INSURANCE <chr>,
## #   LANGUAGE <chr>, RELIGION <chr>, MARITAL_STATUS <chr>, ETHNICITY <chr>,
## #   EDREGTIME <dttm>, EDOUTTIME <dttm>, DIAGNOSIS <chr>,
## #   HOSPITAL_EXPIRE_FLAG <dbl>, HAS_CHARTEVENTS_DATA <dbl>

SUBJECT_IDのユニーク数は46520 (< 58976) なので、重複がある事が分かる。

db_ad |> distinct(SUBJECT_ID) |> count()
## # Source:   SQL [?? x 1]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
##       n
##   <dbl>
## 1 46520
db_pt <- tbl(con_mimic, "patients")

db_pt |> colnames()
## [1] "ROW_ID"      "SUBJECT_ID"  "GENDER"      "DOB"         "DOD"        
## [6] "DOD_HOSP"    "DOD_SSN"     "EXPIRE_FLAG"
db_pt |> count()
## # Source:   SQL [?? x 1]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
##       n
##   <dbl>
## 1 46520
db_pt |> head(5)
## # Source:   SQL [?? x 8]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
##   ROW_ID SUBJECT_ID GENDER DOB                 DOD                
##    <dbl>      <dbl> <chr>  <dttm>              <dttm>             
## 1    234        249 F      2075-03-13 00:00:00 NA                 
## 2    235        250 F      2164-12-27 00:00:00 2188-11-22 00:00:00
## 3    236        251 M      2090-03-15 00:00:00 NA                 
## 4    237        252 M      2078-03-06 00:00:00 NA                 
## 5    238        253 F      2089-11-26 00:00:00 NA                 
## # ℹ 3 more variables: DOD_HOSP <dttm>, DOD_SSN <dttm>, EXPIRE_FLAG <dbl>
# dbDisconnect(con_mimic, shutdown=TRUE)

事前課題の内容はスライド参照

END