knitr::opts_chunk$set(warning = FALSE, message = FALSE)
pacman::p_load(tidyverse, lubridate, tableone, skimr,
duckdb, arrow, RSQLite)
FドライブのDatabaseフォルダ内のmimic.dbにアクセスする場合は以下のコード(各自の状況に合わせて適宜修正)。 (mimic.dbが無い場合は以下の操作で空のmimic.dbが作成される)
con_mimic <- dbConnect(duckdb::duckdb(), "F:/Database/mimic.db")
mimic.dbの中に入っているデータリスト
dbListTables(con_mimic)
## [1] "admissions" "apsiii"
## [3] "callout" "caregivers"
## [5] "chartevents" "comorbidities_AHRQ"
## [7] "comorbidities_Quan" "cptevents"
## [9] "d_cpt" "d_icd_diagnoses"
## [11] "d_icd_procedures" "d_items"
## [13] "d_labitems" "datetimeevents"
## [15] "db_HtWt" "db_LVEF"
## [17] "db_LVEF_assess" "db_PEO"
## [19] "db_comorb" "db_ethnicity"
## [21] "db_patients" "db_rrt"
## [23] "db_sofa" "db_urine"
## [25] "db_venti" "db_vital"
## [27] "df_lab_list" "diagnoses_icd"
## [29] "drgcodes" "height_firstday"
## [31] "height_weight" "icustay_details"
## [33] "icustays" "inputevents_cv"
## [35] "inputevents_mv" "labevents"
## [37] "microbiologyevents" "noteevents"
## [39] "outputevents" "patients"
## [41] "pivot_RRT" "pivot_height"
## [43] "pivot_labo" "pivot_urine_output"
## [45] "pivot_vitals" "prescriptions"
## [47] "procedureevents_mv" "procedures_icd"
## [49] "services" "sofa"
## [51] "test" "transfers"
## [53] "urine_output" "urine_output_firstday"
## [55] "ventilation_classification" "ventilation_duration"
## [57] "weight_duration" "weight_firstday"
削除
# con_mimic %>% dbRemoveTable("caregivers")
最後に接続解除する時
# dbDisconnect(con_mimic, shutdown=TRUE)
## ファイル名リストの抽出
以下は、
F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4
の下にmimic iiiの全データファイルが入っていた場合。
同じフォルダに他の.csv.gzファイルを入れない事!
csv_list <- list.files(path = "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/",
pattern = ".csv.gz$",
full.names = T)
csv_list
## [1] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/ADMISSIONS.csv.gz"
## [2] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/CALLOUT.csv.gz"
## [3] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/CAREGIVERS.csv.gz"
## [4] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/CHARTEVENTS.csv.gz"
## [5] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/CPTEVENTS.csv.gz"
## [6] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_CPT.csv.gz"
## [7] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_ICD_DIAGNOSES.csv.gz"
## [8] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_ICD_PROCEDURES.csv.gz"
## [9] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_ITEMS.csv.gz"
## [10] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/D_LABITEMS.csv.gz"
## [11] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/DATETIMEEVENTS.csv.gz"
## [12] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/DIAGNOSES_ICD.csv.gz"
## [13] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/DRGCODES.csv.gz"
## [14] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz"
## [15] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/INPUTEVENTS_CV.csv.gz"
## [16] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/INPUTEVENTS_MV.csv.gz"
## [17] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/LABEVENTS.csv.gz"
## [18] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/MICROBIOLOGYEVENTS.csv.gz"
## [19] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/NOTEEVENTS.csv.gz"
## [20] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/OUTPUTEVENTS.csv.gz"
## [21] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/PATIENTS.csv.gz"
## [22] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/PRESCRIPTIONS.csv.gz"
## [23] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/PROCEDUREEVENTS_MV.csv.gz"
## [24] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/PROCEDURES_ICD.csv.gz"
## [25] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/SERVICES.csv.gz"
## [26] "F:/BigDatas/mimic-iii-clinical-database-1.4/mimic-iii-clinical-database-1.4/TRANSFERS.csv.gz"
この場合、スラッシュ(/)はバックスラッシュ2つ(\\)と同じ意味になるので、以下の様なパス指定方法でも可。
short_list <-
list.files(path = "F:\\BigDatas\\mimic-iii-clinical-database-1.4\\mimic-iii-clinical-database-1.4",
pattern = ".csv.gz$",
full.names = F)
name_list <- short_list |>
str_remove_all(pattern = ".csv.gz") |>
str_to_lower()
name_list
## [1] "admissions" "callout" "caregivers"
## [4] "chartevents" "cptevents" "d_cpt"
## [7] "d_icd_diagnoses" "d_icd_procedures" "d_items"
## [10] "d_labitems" "datetimeevents" "diagnoses_icd"
## [13] "drgcodes" "icustays" "inputevents_cv"
## [16] "inputevents_mv" "labevents" "microbiologyevents"
## [19] "noteevents" "outputevents" "patients"
## [22] "prescriptions" "procedureevents_mv" "procedures_icd"
## [25] "services" "transfers"
# name_list_2 <- name_list[c(1:3, 5:26)]
一回読んだら二度と読まない!
append =
TRUEなので、同じ処理がエラーにならずにデータ量が追加されていく。危険。
name_list名で各ファイルをDB上に登録する
# for (i in seq_along(name_list)) {
#
# callback <- function(df, pos) {
# dbWriteTable(con_mimic, name_list[i], df, append = TRUE)
# }
#
# # デフォルトではチャンクサイズは 10,000 レコードずつ。
# read_csv_chunked(csv_list[i],
# DataFrameCallback$new(callback),
# chunk_size = 100000)
# }
charteventsは3.3億行!
db_ad <- tbl(con_mimic, "admissions")
db_ad |> colnames()
## [1] "ROW_ID" "SUBJECT_ID" "HADM_ID"
## [4] "ADMITTIME" "DISCHTIME" "DEATHTIME"
## [7] "ADMISSION_TYPE" "ADMISSION_LOCATION" "DISCHARGE_LOCATION"
## [10] "INSURANCE" "LANGUAGE" "RELIGION"
## [13] "MARITAL_STATUS" "ETHNICITY" "EDREGTIME"
## [16] "EDOUTTIME" "DIAGNOSIS" "HOSPITAL_EXPIRE_FLAG"
## [19] "HAS_CHARTEVENTS_DATA"
db_ad |> count()
## # Source: SQL [?? x 1]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
## n
## <dbl>
## 1 58976
db_ad |> head(5)
## # Source: SQL [?? x 19]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
## ROW_ID SUBJECT_ID HADM_ID ADMITTIME DISCHTIME
## <dbl> <dbl> <dbl> <dttm> <dttm>
## 1 21 22 165315 2196-04-09 12:26:00 2196-04-10 15:54:00
## 2 22 23 152223 2153-09-03 07:15:00 2153-09-08 19:10:00
## 3 23 23 124321 2157-10-18 19:34:00 2157-10-25 14:00:00
## 4 24 24 161859 2139-06-06 16:14:00 2139-06-09 12:48:00
## 5 25 25 129635 2160-11-02 02:06:00 2160-11-05 14:55:00
## # ℹ 14 more variables: DEATHTIME <dttm>, ADMISSION_TYPE <chr>,
## # ADMISSION_LOCATION <chr>, DISCHARGE_LOCATION <chr>, INSURANCE <chr>,
## # LANGUAGE <chr>, RELIGION <chr>, MARITAL_STATUS <chr>, ETHNICITY <chr>,
## # EDREGTIME <dttm>, EDOUTTIME <dttm>, DIAGNOSIS <chr>,
## # HOSPITAL_EXPIRE_FLAG <dbl>, HAS_CHARTEVENTS_DATA <dbl>
SUBJECT_IDのユニーク数は46520 (< 58976) なので、重複がある事が分かる。
db_ad |> distinct(SUBJECT_ID) |> count()
## # Source: SQL [?? x 1]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
## n
## <dbl>
## 1 46520
db_pt <- tbl(con_mimic, "patients")
db_pt |> colnames()
## [1] "ROW_ID" "SUBJECT_ID" "GENDER" "DOB" "DOD"
## [6] "DOD_HOSP" "DOD_SSN" "EXPIRE_FLAG"
db_pt |> count()
## # Source: SQL [?? x 1]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
## n
## <dbl>
## 1 46520
db_pt |> head(5)
## # Source: SQL [?? x 8]
## # Database: DuckDB v1.3.2 [mihara@Windows 10 x64:R 4.5.0/F:\Database\mimic.db]
## ROW_ID SUBJECT_ID GENDER DOB DOD
## <dbl> <dbl> <chr> <dttm> <dttm>
## 1 234 249 F 2075-03-13 00:00:00 NA
## 2 235 250 F 2164-12-27 00:00:00 2188-11-22 00:00:00
## 3 236 251 M 2090-03-15 00:00:00 NA
## 4 237 252 M 2078-03-06 00:00:00 NA
## 5 238 253 F 2089-11-26 00:00:00 NA
## # ℹ 3 more variables: DOD_HOSP <dttm>, DOD_SSN <dttm>, EXPIRE_FLAG <dbl>
# dbDisconnect(con_mimic, shutdown=TRUE)