必要パッケージのよみこみとlibrary

##パッケージのlibrary

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(RSQLite)
library(lubridate)
## 
##  次のパッケージを付け加えます: 'lubridate' 
## 
##  以下のオブジェクトは 'package:base' からマスクされています:
## 
##     date, intersect, setdiff, union
library(tableone)

データベースの作成

con_mimic <- dbConnect(SQLite(),"/Users/ichitachikamasa/Desktop/RSQLite/temp/mimic.db")

データの確認(最初は何もない)

dbListTables(con_mimic)
##  [1] "admissions"         "callout"            "caregivers"        
##  [4] "chartevents"        "cptevents"          "d_cpt"             
##  [7] "d_icd_diagnoses"    "d_icd_procedures"   "d_items"           
## [10] "d_labitems"         "datetimeevents"     "diagnoses_icd"     
## [13] "drgcodes"           "icustays"           "inputevents_cv"    
## [16] "inputevents_mv"     "labevents"          "microbiologyevents"
## [19] "noteevents"         "outputevents"       "patients"          
## [22] "prescriptions"      "procedureevents_mv" "procedures_icd"    
## [25] "services"           "transfers"

ファイル名リストの抽出

csv_list <- list.files(path = "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4",
                       pattern = ".csv.gz$",
                       full.names = T)
csv_list
##  [1] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/ADMISSIONS.csv.gz"        
##  [2] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/CALLOUT.csv.gz"           
##  [3] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/CAREGIVERS.csv.gz"        
##  [4] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/CHARTEVENTS.csv.gz"       
##  [5] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/CPTEVENTS.csv.gz"         
##  [6] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/D_CPT.csv.gz"             
##  [7] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/D_ICD_DIAGNOSES.csv.gz"   
##  [8] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/D_ICD_PROCEDURES.csv.gz"  
##  [9] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/D_ITEMS.csv.gz"           
## [10] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/D_LABITEMS.csv.gz"        
## [11] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/DATETIMEEVENTS.csv.gz"    
## [12] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/DIAGNOSES_ICD.csv.gz"     
## [13] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/DRGCODES.csv.gz"          
## [14] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/ICUSTAYS.csv.gz"          
## [15] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/INPUTEVENTS_CV.csv.gz"    
## [16] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/INPUTEVENTS_MV.csv.gz"    
## [17] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/LABEVENTS.csv.gz"         
## [18] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/MICROBIOLOGYEVENTS.csv.gz"
## [19] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/NOTEEVENTS.csv.gz"        
## [20] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/OUTPUTEVENTS.csv.gz"      
## [21] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/PATIENTS.csv.gz"          
## [22] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/PRESCRIPTIONS.csv.gz"     
## [23] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/PROCEDUREEVENTS_MV.csv.gz"
## [24] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/PROCEDURES_ICD.csv.gz"    
## [25] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/SERVICES.csv.gz"          
## [26] "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4/TRANSFERS.csv.gz"
short_list <- list.files(path = "/Users/ichitachikamasa/Desktop/BicDatas/mimic_iii_clinical_database_1.4",
                       pattern = ".csv.gz$",
                       full.names = F)
name_list <- short_list |> str_remove_all(pattern = ".csv.gz")
name_list <- name_list |> str_to_lower()
name_list
##  [1] "admissions"         "callout"            "caregivers"        
##  [4] "chartevents"        "cptevents"          "d_cpt"             
##  [7] "d_icd_diagnoses"    "d_icd_procedures"   "d_items"           
## [10] "d_labitems"         "datetimeevents"     "diagnoses_icd"     
## [13] "drgcodes"           "icustays"           "inputevents_cv"    
## [16] "inputevents_mv"     "labevents"          "microbiologyevents"
## [19] "noteevents"         "outputevents"       "patients"          
## [22] "prescriptions"      "procedureevents_mv" "procedures_icd"    
## [25] "services"           "transfers"

各ファイルををDB上に登録する

一回読んだら二度と読まない!

append = TRUEなので、同じ処理がエラーにならずにデータ量が追加されていく。危険。

name_list名で各ファイルをDB上に登録する

#for (i in seq_along(name_list)) {
#callback <- function(df, pos) {
#   dbWriteTable(con_mimic, name_list[i], df, append = TRUE)
# }
# 
## デフォルトではチャンクサイズは 10,000 レコードずつ。
# read_csv_chunked(csv_list[i],
#                  DataFrameCallback$new(callback),
#                  chunk_size = 100000)
# }

データベース内に格納されているファイルの確認

dbListTables(con_mimic)
##  [1] "admissions"         "callout"            "caregivers"        
##  [4] "chartevents"        "cptevents"          "d_cpt"             
##  [7] "d_icd_diagnoses"    "d_icd_procedures"   "d_items"           
## [10] "d_labitems"         "datetimeevents"     "diagnoses_icd"     
## [13] "drgcodes"           "icustays"           "inputevents_cv"    
## [16] "inputevents_mv"     "labevents"          "microbiologyevents"
## [19] "noteevents"         "outputevents"       "patients"          
## [22] "prescriptions"      "procedureevents_mv" "procedures_icd"    
## [25] "services"           "transfers"

各データへのアクセスを作成

db_adm<- tbl(con_mimic, "admissions")

1.行数を数える

db_adm |> count()
## # Source:   SQL [1 x 1]
## # Database: sqlite 3.39.4 [/Users/ichitachikamasa/Desktop/RSQLite/temp/mimic.db]
##       n
##   <int>
## 1 58976

2.列名を表示する

db_adm |> colnames()
##  [1] "ROW_ID"               "SUBJECT_ID"           "HADM_ID"             
##  [4] "ADMITTIME"            "DISCHTIME"            "DEATHTIME"           
##  [7] "ADMISSION_TYPE"       "ADMISSION_LOCATION"   "DISCHARGE_LOCATION"  
## [10] "INSURANCE"            "LANGUAGE"             "RELIGION"            
## [13] "MARITAL_STATUS"       "ETHNICITY"            "EDREGTIME"           
## [16] "EDOUTTIME"            "DIAGNOSIS"            "HOSPITAL_EXPIRE_FLAG"
## [19] "HAS_CHARTEVENTS_DATA"

3.先頭の数行表示する

db_adm |> head(5)
## # Source:   SQL [5 x 19]
## # Database: sqlite 3.39.4 [/Users/ichitachikamasa/Desktop/RSQLite/temp/mimic.db]
##   ROW_ID SUBJE…¹ HADM_ID ADMIT…² DISCH…³ DEATH…⁴ ADMIS…⁵ ADMIS…⁶ DISCH…⁷ INSUR…⁸
##    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>   <chr>   <chr>   <chr>  
## 1     21      22  165315  7.14e9  7.14e9      NA EMERGE… EMERGE… DISC-T… Private
## 2     22      23  152223  5.80e9  5.80e9      NA ELECTI… PHYS R… HOME H… Medica…
## 3     23      23  124321  5.93e9  5.93e9      NA EMERGE… TRANSF… HOME H… Medica…
## 4     24      24  161859  5.35e9  5.35e9      NA EMERGE… TRANSF… HOME    Private
## 5     25      25  129635  6.02e9  6.02e9      NA EMERGE… EMERGE… HOME    Private
## # … with 9 more variables: LANGUAGE <chr>, RELIGION <chr>,
## #   MARITAL_STATUS <chr>, ETHNICITY <chr>, EDREGTIME <dbl>, EDOUTTIME <dbl>,
## #   DIAGNOSIS <chr>, HOSPITAL_EXPIRE_FLAG <dbl>, HAS_CHARTEVENTS_DATA <dbl>,
## #   and abbreviated variable names ¹​SUBJECT_ID, ²​ADMITTIME, ³​DISCHTIME,
## #   ⁴​DEATHTIME, ⁵​ADMISSION_TYPE, ⁶​ADMISSION_LOCATION, ⁷​DISCHARGE_LOCATION,
## #   ⁸​INSURANCE

4.特定のユニーク数を数える

db_adm |> distinct(SUBJECT_ID) |> count()
## # Source:   SQL [1 x 1]
## # Database: sqlite 3.39.4 [/Users/ichitachikamasa/Desktop/RSQLite/temp/mimic.db]
##       n
##   <int>
## 1 46520

5.処理した後のデータを一時的にmimic.dbに保存する

6.データをメモリに上げる

db_call<- tbl(con_mimic,"callout")
db_care<- tbl(con_mimic, "caregivers")
db_chart<- tbl(con_mimic, "chartevents")
db_cpt<- tbl(con_mimic, "cptevents")
master_cpt<- tbl(con_mimic, "d_cpt")
master_icd_d<- tbl(con_mimic, "d_icd_diagnoses")
master_icd_p<- tbl(con_mimic, "d_icd_procedures")
master_item<- tbl(con_mimic, "d_items")
master_lab<- tbl(con_mimic, "d_labitems")
db_datetime<- tbl(con_mimic, "datetimeevents")
db_diagnose<- tbl(con_mimic, "diagnoses_icd")
db_drg<- tbl(con_mimic, "drgcodes")
db_icustay<- tbl(con_mimic, "icustays")
db_inpcv<- tbl(con_mimic, "inputevents_cv")
db_inpmv<- tbl(con_mimic, "inputevents_mv")
db_lab<- tbl(con_mimic, "labevents")
db_micro<- tbl(con_mimic, "microbiologyevents")
db_noteeve<- tbl(con_mimic, "noteevents")
db_outputeve<- tbl(con_mimic, "outputevents")
db_patient<- tbl(con_mimic, "patients")
db_presc<- tbl(con_mimic, "prescriptions")
db_promv<- tbl(con_mimic, "procedureevents_mv")
db_proicd<- tbl(con_mimic, "procedures_icd")
db_serv<- tbl(con_mimic, "services")
db_trans<- tbl(con_mimic, "transfers")