(前回までの資料も一部記載していますが、コードは最低限にしています。復習は前回の資料を参照してください。)

package

pacman::p_load(tidyverse, lubridate, tableone, skimr,
                duckdb, arrow, RSQLite)

データベース

FドライブのDatabaseフォルダ内のmimic.dbにアクセスする場合は以下のコード(各自の状況に合わせて適宜修正)。 (mimic.dbが無い場合は以下の操作で空のmimic.dbが作成される)

con_mimic <- dbConnect(duckdb::duckdb(), "F:/Database/mimic.db")

mimic.dbの中に入っているデータリスト

dbListTables(con_mimic)
##  [1] "admissions"                 "apsiii"                    
##  [3] "callout"                    "caregivers"                
##  [5] "chartevents"                "comorbidities_AHRQ"        
##  [7] "comorbidities_Quan"         "cptevents"                 
##  [9] "d_cpt"                      "d_icd_diagnoses"           
## [11] "d_icd_procedures"           "d_items"                   
## [13] "d_labitems"                 "datetimeevents"            
## [15] "db_HtWt"                    "db_LVEF"                   
## [17] "db_LVEF_assess"             "db_PEO"                    
## [19] "db_comorb"                  "db_ethnicity"              
## [21] "db_patients"                "db_rrt"                    
## [23] "db_sofa"                    "db_urine"                  
## [25] "db_venti"                   "db_vital"                  
## [27] "df_lab_list"                "diagnoses_icd"             
## [29] "drgcodes"                   "height_firstday"           
## [31] "height_weight"              "icustay_details"           
## [33] "icustays"                   "inputevents_cv"            
## [35] "inputevents_mv"             "labevents"                 
## [37] "microbiologyevents"         "noteevents"                
## [39] "outputevents"               "patients"                  
## [41] "pivot_RRT"                  "pivot_height"              
## [43] "pivot_labo"                 "pivot_urine_output"        
## [45] "pivot_vitals"               "prescriptions"             
## [47] "procedureevents_mv"         "procedures_icd"            
## [49] "services"                   "sofa"                      
## [51] "test"                       "transfers"                 
## [53] "urine_output"               "urine_output_firstday"     
## [55] "ventilation_classification" "ventilation_duration"      
## [57] "weight_duration"            "weight_firstday"
# dbDisconnect(con_mimic, shutdown=TRUE)

使用データ

作成データ

db_patients <- tbl(con_mimic, "db_patients")
db_PEO <- tbl(con_mimic, "db_PEO")
db_HtWt <- tbl(con_mimic, "db_HtWt")
db_ad <- tbl(con_mimic, "admissions") 
db_pt <- tbl(con_mimic, "patients")
db_diag <- tbl(con_mimic, "diagnoses_icd")
db_icustay <- tbl(con_mimic, "icustays")

db_icustay_hadm <- 
  db_icustay |> 
  distinct(HADM_ID, .keep_all = T) |> 
  select(HADM_ID, ICUSTAY_ID)

hadm_icu_vec <- 
  db_icustay_hadm |> 
  select(HADM_ID) |> 
  pull()

icustay_id_vec <- 
  db_icustay_hadm |> 
  select(ICUSTAY_ID) |> 
  pull()
db_d_icd_diag <- tbl(con_mimic, "d_icd_diagnoses")
df_d_icd_diag <- collect(db_d_icd_diag)

<第2回講義・課題> 解答例

  1. BigQueryから必要なファイルをダウンロードし、自分の環境下でmimic.dbに入れる(以下の18ファイルは参考までに)
    (希望者のみでOK。希望しない人は個別に連絡ください。)

  2. 入手したテーブルを利用して、対象患者の以下の項目を抽出する
    (論文再現が目的であるが、全く同じにならないことに注意)
    (授業と同様、HADM_IDと目的変数のみのファイルを作成する)

- vitals

pivot_vital

db_vital <- tbl(con_mimic, "pivot_vitals" )
db_vital |> count()

対象患者のicustay_idに限定する

db_vital_target <- db_vital |>  filter(icustay_id %in% icustay_id_vec)

バイタル毎にicustay_idに紐づくデータを抽出する。
1入室中に複数のバイタルデータがあることがほとんどだが、どの時点でのデータを抽出するかは論文未記載(たぶん)。
以下では入室最初のデータを採用したが、平均値・中央値にすることもあり得る。  

(以下のコードをcompute()なしで回してみると、compute()の意味が分かると思います。)

db_HR <-
  db_vital_target %>% 
  select(icustay_id, charttime, heartrate) %>% 
  filter(!is.na(heartrate)) %>% 
  arrange(icustay_id, charttime) %>% 
  distinct(icustay_id, .keep_all = T) %>% 
  select(!c(charttime)) %>% 
  compute()

db_sBP <-
  db_vital_target %>% 
  select(icustay_id, charttime,sysbp) %>% 
  filter(!is.na(sysbp)) %>% 
  arrange(icustay_id, charttime) %>% 
  distinct(icustay_id, .keep_all = T) %>% 
  select(!c(charttime)) %>% 
  compute()

db_dBP <-
  db_vital_target %>% 
  select(icustay_id, charttime,diasbp) %>% 
  filter(!is.na(diasbp)) %>% 
  arrange(icustay_id, charttime) %>% 
  distinct(icustay_id, .keep_all = T) %>% 
  select(!c(charttime)) %>% 
  compute()

db_mBP <-
  db_vital_target %>% 
  select(icustay_id, charttime,meanbp) %>% 
  filter(!is.na(meanbp)) %>% 
  arrange(icustay_id, charttime) %>% 
  distinct(icustay_id, .keep_all = T) %>% 
  select(!c(charttime)) %>% 
  compute()

db_RR <-
  db_vital_target %>% 
  select(icustay_id, charttime,resprate) %>% 
  filter(!is.na(resprate))%>% 
  arrange(icustay_id, charttime) %>% 
  distinct(icustay_id, .keep_all = T) %>% 
  select(!c(charttime)) %>% 
  compute()

db_tmpc <-
  db_vital_target %>% 
  select(icustay_id, charttime,tempc) %>% 
  filter(!is.na(tempc)) %>% 
  arrange(icustay_id, charttime) %>% 
  distinct(icustay_id, .keep_all = T) %>% 
  select(!c(charttime)) %>% 
  compute()

db_spo2 <-
  db_vital_target %>% 
  select(icustay_id, charttime,spo2) %>% 
  filter(!is.na(spo2)) %>% 
  arrange(icustay_id, charttime) %>% 
  distinct(icustay_id, .keep_all = T) %>% 
  select(!c(charttime)) %>% 
  compute()


db_vitals_first <-
  db_HR |> 
  left_join(db_sBP, by = "icustay_id") |> 
  left_join(db_dBP, by = "icustay_id") |> 
  left_join(db_mBP, by = "icustay_id") |> 
  left_join(db_RR, by = "icustay_id") |> 
  left_join(db_tmpc, by = "icustay_id") |> 
  left_join(db_spo2, by = "icustay_id")
  

db_vitals_first <- db_vitals_first |> compute()
db_PEO_vital <- 
  db_PEO |> 
  left_join(db_vitals_first, by = c("ICUSTAY_ID" = "icustay_id")) 

db_PEO_vital <- db_PEO_vital |> compute()
db_PEO_vital |> count()
db_PEO_vital |> head(10)

表作成して確認

cols_vital <- c("heartrate", "sysbp", "diasbp", "meanbp", "resprate", "tempc", "spo2")
factors <- c("ADMISSION_TYPE", "INSURANCE", "GENDER",  "mortality30", "hospital_death")

db_PEO_vital |> select(c(cols_vital, "h2ras"))|> collect() |> 
  CreateTableOne(vars = cols_vital, strata = "h2ras", factorVars = factors) -> tableone

print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
##                        Stratified by h2ras
##                         0              1              p      test SMD   
##   n                       5949           4441                           
##   heartrate (mean (SD))  88.36 (20.63)  88.09 (18.75)  0.494       0.014
##   sysbp (mean (SD))     123.44 (25.95) 122.17 (25.18)  0.013       0.050
##   diasbp (mean (SD))     62.25 (17.36)  62.11 (16.40)  0.668       0.009
##   meanbp (mean (SD))     80.51 (18.28)  80.41 (17.61)  0.784       0.005
##   resprate (mean (SD))   19.99 (6.22)   18.30 (6.30)  <0.001       0.269
##   tempc (mean (SD))      36.54 (0.99)   36.49 (0.95)   0.005       0.057
##   spo2 (mean (SD))       96.42 (5.44)   97.16 (4.89)  <0.001       0.144
##                        Stratified by h2ras
##                         Missing
##   n                            
##   heartrate (mean (SD)) 1.7    
##   sysbp (mean (SD))     1.7    
##   diasbp (mean (SD))    1.7    
##   meanbp (mean (SD))    1.7    
##   resprate (mean (SD))  1.7    
##   tempc (mean (SD))     2.0    
##   spo2 (mean (SD))      1.8

ヒストグラム作成して確認

cols_continuous <- cols_vital

db_PEO_vital |> 
  select(cols_continuous) |> 
  pivot_longer(cols = cols_continuous, names_to = "name", values_to = "value") |> 
  ggplot()+
  geom_histogram(aes(x = value), color = "black")+
  facet_wrap(~ name, scales = "free", ncol = 3) +
  theme_bw()+
  theme(text = element_text(size = 20))

  • 最後のマージ用ファイル
db_vital <- 
  db_PEO_vital |> 
  select("HADM_ID", cols_vital)

データベース上に書き込む

if(dbExistsTable(con_mimic, "db_vital")) {
  dbRemoveTable(con_mimic, "db_vital")
}

dbWriteTable(con_mimic, 
             "db_vital", 
             db_vital %>% collect(),
             overwrite = TRUE)  

- urine output

使用するファイル
pivot_urine_output: 論文では使われてないっぽい
urine_output
urine_output_firstday

# db_pivot_urine <- tbl(con_mimic, "pivot_urine_output")
db_urine_out <- tbl(con_mimic, "urine_output")
db_urine_firstday <- tbl(con_mimic, "urine_output_firstday")

db_pivot_urine:
icustya_id・時刻ごとの尿量が記載されている。一日の尿量に直す必要がある。

時間を秒単位から日単位に修正し、
各icustay_idの入室時刻をもとに、
そこから1日(24時間)後までのデータから尿量を計算する。
(コードは結構乱雑です)

db_urine_out:
上記と同じくicustya_id・時刻ごとの尿量が記載されている。
同様に初日のデータを抽出する。

db_urine_out |> arrange(icustay_id)
db_urine1 <-
  db_urine_out |> 
  filter(!is.na(icustay_id)) 

db_urine_time0 <-
  db_urine1 |> 
  arrange(icustay_id, charttime) %>% 
  distinct(icustay_id, .keep_all = T) %>% 
  rename(time0 = charttime) %>% 
  select(icustay_id, time0)

db_urine2 <-
  db_urine1 %>% 
  left_join(db_urine_time0, by = "icustay_id") %>% 
  arrange(icustay_id) %>% 
  mutate(elapse = day(charttime - time0)) %>% 
  filter(elapse < 1) |> 
  compute()

db_urine2 |>
  arrange(icustay_id, charttime) %>% head(200)
db_urine3 <-
  db_urine2 |> 
  group_by(icustay_id) |> 
  summarize(value = sum(value))

db_urine3 <- db_urine3 |> compute()
db_urine3 |> head(10)

db_urine_firstday:
初日のデータのみなので処理は楽。

db_urine_firstday1 <-
  db_urine_firstday |> 
  select(icustay_id, urineoutput)

db_urine_firstday1 |> head(10)

3つのファイルをdb_PEOにマージして、
どのファイルのデータを優先して使うかを決めたうえでデータを固定する(論文未記載)

db_u1 <-
  db_PEO |> 
  left_join(db_urine_firstday1, by = c("ICUSTAY_ID" = "icustay_id")) |> 
  compute()

db_u2 <-
  db_u1 |> 
  left_join(db_urine3, by = c("ICUSTAY_ID" = "icustay_id")) |> 
  compute()

# db_u3 <-
#   db_u2 |> 
#   left_join(db_pivot_urine3, by = c("ICUSTAY_ID" = "icustay_id")) |> 
#   compute()
db_PEO_urine <-
  db_u2 |> 
  mutate(urine_out = case_when(
    !is.na(urineoutput) ~ urineoutput,
    !is.na(value) ~ value
  )) |>
  mutate(urine_out = case_when(
    between(urine_out, 0, 25000) ~ urine_out
  )) |> 
  select(!c(urineoutput, value))
# 
# db_PEO_urine <-
#   db_u3 |>
#   mutate(urine_out = case_when(
#     !is.na(urineoutput.y) ~ urineoutput.y,
#     !is.na(value) ~ value,
#     !is.na(urineoutput.x) ~ urineoutput.x
#   )) |>
#   mutate(urine_out = case_when(
#     between(urine_out, 0, 25000) ~ urine_out
#   )) |>
#   select(!c(urineoutput.x, urineoutput.y, value))


db_PEO_urine <- db_PEO_urine |> compute()

表作成

vars <- c("urine_out")

db_PEO_urine |> select(c(vars, "h2ras"))|> collect() |> 
  CreateTableOne(vars = vars, strata = "h2ras") -> tableone

print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
##                        Stratified by h2ras
##                         0                 1                 p      test SMD   
##   n                        5949              4441                             
##   urine_out (mean (SD)) 1775.04 (1252.05) 1838.51 (1214.97)  0.011       0.051
##                        Stratified by h2ras
##                         Missing
##   n                            
##   urine_out (mean (SD)) 4.0

ヒストグラム作成

cols_continuous <- c("urine_out")

db_PEO_urine |> 
  select(cols_continuous) |> 
  pivot_longer(cols = cols_continuous, names_to = "name", values_to = "value") |> 
  ggplot()+
  geom_histogram(aes(x = value), color = "black")+
  facet_wrap(~ name, scales = "free", ncol = 2) +
  theme_bw()+
  theme(text = element_text(size = 20))

  • 最後のマージ用ファイル
db_urine <- 
  db_PEO_urine |> 
  select("HADM_ID", "urine_out")

データベース上に書き込む

if(dbExistsTable(con_mimic, "db_urine")) {
  dbRemoveTable(con_mimic, "db_urine")
}

dbWriteTable(con_mimic, 
             "db_urine", 
             db_urine %>% collect(),
             overwrite = TRUE)  

- SOFA, SAPSiii

重症度スコアは各ICU入室ごとに作成されている。正直このまま使えばいいのは楽。

db_sofa <- tbl(con_mimic, "sofa")
db_saps <- tbl(con_mimic, "apsiii")

db_sofa_1 <- db_sofa |> select(icustay_id, SOFA)
db_saps_1 <- db_saps |> select(icustay_id, apsiii)
db_PEO_sofa <-
  db_PEO |> 
  left_join(db_sofa_1, by = c("ICUSTAY_ID" = "icustay_id")) |> 
  left_join(db_saps_1, by = c("ICUSTAY_ID" = "icustay_id")) 

db_PEO_sofa <- db_PEO_sofa |> compute()

表作成

vars <- c("SOFA", "apsiii")

db_PEO_sofa |> select(c(vars, "h2ras"))|> collect() |> 
  CreateTableOne(vars = vars, strata = "h2ras") -> tableone

print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
##                     Stratified by h2ras
##                      0             1             p      test SMD    Missing
##   n                   5949          4441                                   
##   SOFA (mean (SD))    4.59 (3.04)   5.04 (3.02)  <0.001       0.146 0.0    
##   apsiii (mean (SD)) 48.63 (20.54) 46.49 (19.59) <0.001       0.107 0.0

ヒストグラム作成

cols_continuous <- c( "SOFA", "apsiii")

db_PEO_sofa |> 
  select(cols_continuous) |> 
  pivot_longer(cols = cols_continuous, names_to = "name", values_to = "value") |> 
  ggplot()+
  geom_histogram(aes(x = value), color = "black")+
  facet_wrap(~ name, scales = "free", ncol = 2) +
  theme_bw()+
  theme(text = element_text(size = 20))

  • 最後のマージ用ファイル
db_sofa <- 
  db_PEO_sofa |> 
  select("HADM_ID", "SOFA", "apsiii")

データベース上に書き込む

if(dbExistsTable(con_mimic, "db_sofa")) {
  dbRemoveTable(con_mimic, "db_sofa")
}

dbWriteTable(con_mimic, 
             "db_sofa", 
             db_sofa %>% collect(),
             overwrite = TRUE)  

- CRRT

  • rrtしてる人のうち、dialysis_typeがCRRT, CVVH, CVVHD, CVVHDFの全員を対象とする。
  • pivot_RRTからicustay_idを取得する。
db_rrt <- tbl(con_mimic, "pivot_RRT")
db_rrt |> count()
db_rrt |> head(10)

そもそもdialysis_typeがどのくらいあるのかチェックする

db_rrt |> distinct(dialysis_type)

特定のdialysis_typeに限定し、該当するicustay_idを取得する

RRT_icu_id <-
  db_rrt |> filter(dialysis_type %in% c("CRRT", "CVVH", "CVVHD", "CVVHDF")) |>
  distinct(icustay_id) |> pull(icustay_id)

db_PEOにrrtの有無を示す列を作る。

db_PEO_rrt <-
  db_PEO |> 
  mutate(rrt = ifelse(ICUSTAY_ID %in% RRT_icu_id, 1, 0))

db_PEO_rrt <- db_PEO_rrt |> compute()
vars <- c("rrt")
vars_fact <- vars
  
db_PEO_rrt |> select(c(vars, "h2ras"))|> collect() |> 
  CreateTableOne(vars = vars, strata = "h2ras", factorVars = vars_fact) -> tableone

print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
##              Stratified by h2ras
##               0           1           p      test SMD    Missing
##   n           5949        4441                                  
##   rrt = 1 (%)  155 (2.6)   138 (3.1)   0.142       0.030 0.0
  • 最後のマージ用ファイル
db_rrt <- 
  db_PEO_rrt |> 
  select("HADM_ID", "rrt")

データベース上に書き込む

if(dbExistsTable(con_mimic, "db_rrt")) {
  dbRemoveTable(con_mimic, "db_rrt")
}

dbWriteTable(con_mimic, 
             "db_rrt", 
             db_rrt %>% collect(),
             overwrite = TRUE)  

- use of ventilator

  • ventilation_durationのあるICUSTAY_IDを人工呼吸ありとする
# db_venti_c <- tbl(con_mimic, "ventilation_classification")
db_venti_d <- tbl(con_mimic, "ventilation_duration")

db_venti_d |> count()
db_venti_d |> head(10)
venti_icu_id <-
  db_venti_d |>
  filter(!is.na(icustay_id)) |>
  distinct(icustay_id) |> 
  pull(icustay_id)
db_PEO_venti <-
  db_PEO |> 
  mutate(ventilation = ifelse(ICUSTAY_ID %in% venti_icu_id, 1, 0)) #|> 
  # replace_na(list(ventilation = 0))
  
db_PEO_venti <- db_PEO_venti |> compute()
vars <- c("ventilation")
vars_fact <- vars
  
db_PEO_venti |> select(c(vars, "h2ras"))|> collect() |> 
  CreateTableOne(vars = vars, strata = "h2ras", factorVars = vars_fact) -> tableone

print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
##                      Stratified by h2ras
##                       0            1            p      test SMD    Missing
##   n                   5949         4441                                   
##   ventilation = 1 (%) 2402 (40.4)  2839 (63.9)  <0.001       0.485 0.0
  • 最後のマージ用ファイル
db_venti <- 
  db_PEO_venti |> 
  select("HADM_ID", "ventilation")

データベース上に書き込む

if(dbExistsTable(con_mimic, "db_venti")) {
  dbRemoveTable(con_mimic, "db_venti")
}

dbWriteTable(con_mimic, 
             "db_venti", 
             db_venti %>% collect(),
             overwrite = TRUE)  

- details: Ethnicity

もともとEthnicityはadmissionsに入っているが分類が細かすぎる。
より一般的な分類にした表を公式が作成していて、icustay_detailsから参照できる。

db_details <- tbl(con_mimic, "icustay_details")
db_details |> count()

native, other, unknownはotherとした。

db_details_ethnicity <-
  db_details |> 
  distinct(hadm_id, .keep_all = T) |> 
  select(hadm_id, ethnicity_grouped) |> 
  mutate(ethnicity_5 = case_when(
    ethnicity_grouped %in% c("native", "other", "unknown") ~ "other",
    TRUE ~ ethnicity_grouped
  ))
db_PEO_ethnicity <-
  db_PEO |> 
  left_join(db_details_ethnicity, by = c("HADM_ID" = "hadm_id"))

db_PEO_ethnicity <- db_PEO_ethnicity |> compute()
var = c("ethnicity_5")

db_PEO_ethnicity |> select(c(var, "h2ras")) |> collect() |> 
  CreateTableOne(vars = var, strata = "h2ras", factorVars = var) -> tableone

print(tableone, smd=T, missing = T)
##                  Stratified by h2ras
##                   0            1            p      test SMD    Missing
##   n               5949         4441                                   
##   ethnicity_5 (%)                            0.002       0.081 1.0    
##      asian         101 ( 1.7)    80 ( 1.8)                            
##      black         499 ( 8.5)   362 ( 8.2)                            
##      hispanic      122 ( 2.1)   125 ( 2.8)                            
##      other         892 (15.2)   569 (12.9)                            
##      white        4262 (72.5)  3279 (74.3)
db_ethnicity <-
  db_PEO_ethnicity |> 
  select(HADM_ID, ethnicity_5)

データベース上に書き込む

if(dbExistsTable(con_mimic, "db_ethnicity")) {
  dbRemoveTable(con_mimic, "db_ethnicity")
}

dbWriteTable(con_mimic, 
             "db_ethnicity", 
             db_ethnicity %>% collect(),
             overwrite = TRUE)  

第3回講義

- comorbidity

Quanの定義による合併症リストはBigQueryから入手可能
しかし、論文ではこの合併症リストにないものが多い(H2ブロッカー使用に関連のありそうな合併症を選択している)
また、Quanのリストにある合併症に関してもBigQueryファイルと論文とでかなり食い違う。おそらく著者らは病名のICD9コードから自分たちで抽出しているっぽい

我々もQuanのリストからは採用せず、手動でデータ抽出していく事にする

hypertension

“d_icd_diagnoses”と”diagnoses_icd”を使う
やることは心不全を抽出した時と同じなので、比較的単純

HT_icd_9 <- 
  df_d_icd_diag |>
  filter(str_detect(LONG_TITLE, coll("hypertension", ignore_case = T))) |> 
  filter(!str_detect(LONG_TITLE, coll("without", ignore_case = T))) |> 
  pull(ICD9_CODE)
hadm_id_vec <- db_PEO |> pull(HADM_ID)

db_ht <- 
  db_diag |> 
  filter(HADM_ID %in% hadm_id_vec) |> 
  mutate(HT = ifelse(ICD9_CODE %in% HT_icd_9, 1, 0)) |> 
  group_by(HADM_ID) |> 
  summarize(hypertension = max(HT))
  
db_ht |> count()
db_ht |> head(10)

論文にある全ての合併症

以下にコード例を示す。自分でも試行錯誤してみると良い。

#hypertension
HT_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("hypertension", ignore_case = T))) |> 
  filter(!str_detect(LONG_TITLE, coll("without", ignore_case = T))) |> 
  pull(ICD9_CODE)

#diabetes
DM_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("diabetes me", ignore_case = T))|
         str_detect(SHORT_TITLE, coll("DMI", ignore_case = F))) |> 
  filter(str_starts(ICD9_CODE, "2")) |>
  pull(ICD9_CODE)

#anemia
anemia_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("anemia", ignore_case = T))) |> 
  filter(str_starts(ICD9_CODE, "2")) |> 
  pull(ICD9_CODE)

#af
af_icd_9 <- c("42731")

#coronary atherosclerosis
coronary_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("coronary atherosclerosis", ignore_case = T))) |> 
  pull(ICD9_CODE)

#venous thrombus
venous_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("venous embo", ignore_case = T))|
         str_detect(SHORT_TITLE, coll("vein thromb", ignore_case = T))) |> 
  pull(ICD9_CODE)

#myocardial infarction
mi_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("myocardial infarction", ignore_case = T))) |> 
  filter(!ICD9_CODE %in% c("4110", "41181", "42979")) |> 
  pull(ICD9_CODE)

#gastritis
gastritis_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("gastritis", ignore_case = T))) |> 
  pull(ICD9_CODE)

#duodenal ulcer
d_ulcer_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("duodenal ulcer", ignore_case = T))) |> 
  pull(ICD9_CODE)

#gastric ulcer
g_ulcer_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("gastric ulcer", ignore_case = T))) |> 
  pull(ICD9_CODE)

#gastric bleeding
g_bleed_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("gastr", ignore_case = T))|
         str_detect(LONG_TITLE, coll("intest", ignore_case = T))|
         str_detect(LONG_TITLE, coll("duodenal", ignore_case = T))|
         str_detect(LONG_TITLE, coll("colon", ignore_case = T))) |> 
  filter(str_detect(LONG_TITLE, coll("hemorrh", ignore_case = T))) |> 
  filter(!str_detect(LONG_TITLE, coll("without mention of hemorrhage", ignore_case = T))) |>
  pull(ICD9_CODE)

#AKI
aki_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("acute kidney", ignore_case = T))) |> 
  pull(ICD9_CODE)

#septic shock
s_shock_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("septic shock", ignore_case = T))) |> 
  pull(ICD9_CODE)

#pneumonia
pneumonia_icd_9 <- 
  df_d_icd_diag |> 
  filter(str_detect(LONG_TITLE, coll("pneumonia", ignore_case = T))) |> 
  pull(ICD9_CODE)
hadm_id_vec <- db_PEO |> pull(HADM_ID)

db_comorb_manual <- 
  db_diag |> 
  filter(HADM_ID %in% hadm_id_vec) |> 

  mutate(HT = ifelse(ICD9_CODE %in% HT_icd_9, 1, 0),
         DM = ifelse(ICD9_CODE %in% DM_icd_9, 1, 0),
         anemia = ifelse(ICD9_CODE %in% anemia_icd_9, 1, 0),
         Af = ifelse(ICD9_CODE %in% af_icd_9, 1, 0),
         coronary_atherosclerosis = ifelse(ICD9_CODE %in% coronary_icd_9, 1, 0),
         venous_thrombus = ifelse(ICD9_CODE %in% venous_icd_9, 1, 0),
         MI = ifelse(ICD9_CODE %in% mi_icd_9, 1, 0),
         gastritis = ifelse(ICD9_CODE %in% gastritis_icd_9, 1, 0),
         g_ulcer = ifelse(ICD9_CODE %in% g_ulcer_icd_9, 1, 0),
         d_ulcer = ifelse(ICD9_CODE %in% d_ulcer_icd_9, 1, 0),
         g_bleeding = ifelse(ICD9_CODE %in% g_bleed_icd_9, 1, 0),
         aki = ifelse(ICD9_CODE %in% aki_icd_9, 1, 0),
         s_shock = ifelse(ICD9_CODE %in% s_shock_icd_9, 1, 0),
         pneumonia = ifelse(ICD9_CODE %in% pneumonia_icd_9, 1, 0)) |> 
  
  group_by(HADM_ID) |> 
  summarize(hypertension = max(HT),
            DM = max(DM),
            anemia = max(anemia),
            Af = max(Af),
            coronary_atherosclerosis = max(coronary_atherosclerosis),
            venous_thrombus = max(venous_thrombus),
            MI = max(MI),
            gastritis = max(gastritis),
            gastric_ulcer = max(g_ulcer),
            duodenal_ulcer = max(d_ulcer),
            gastric_bleeding = max(g_bleeding),
            aki = max(aki),
            septic_shock = max(s_shock),
            pneumonia = max(pneumonia))
  
db_comorb_manual |> count()
db_PEO_comorb <-
  db_PEO |> 
    left_join(db_comorb_manual, by = "HADM_ID")

db_PEO_comorb <- db_PEO_comorb |> compute()

表作成

cols_comorb <- c(
  "hypertension", "DM", "anemia", "Af", "coronary_atherosclerosis", "venous_thrombus", "MI",
  "gastritis", "gastric_ulcer", "duodenal_ulcer", "gastric_bleeding", "aki", "septic_shock",
  "pneumonia")
vars <- cols_comorb
factors <- cols_comorb

db_PEO_comorb |> select(c(vars, "h2ras"))|> collect() |> 
  CreateTableOne(vars = vars, strata = "h2ras", factorVars = factors) -> tableone

print(tableone, smd = TRUE, missing = TRUE) #|> write.csv("test.csv")
##                                   Stratified by h2ras
##                                    0            1            p      test SMD   
##   n                                5949         4441                           
##   hypertension = 1 (%)             2359 (39.7)  2025 (45.6)  <0.001       0.120
##   DM = 1 (%)                       2133 (35.9)  1642 (37.0)   0.249       0.023
##   anemia = 1 (%)                   1993 (33.5)  1414 (31.8)   0.078       0.035
##   Af = 1 (%)                       2537 (42.6)  2034 (45.8)   0.001       0.064
##   coronary_atherosclerosis = 1 (%) 2264 (38.1)  2049 (46.1)  <0.001       0.164
##   venous_thrombus = 1 (%)           112 ( 1.9)    77 ( 1.7)   0.626       0.011
##   MI = 1 (%)                       1001 (16.8)   717 (16.1)   0.369       0.018
##   gastritis = 1 (%)                 121 ( 2.0)    61 ( 1.4)   0.014       0.051
##   gastric_ulcer = 1 (%)              67 ( 1.1)    20 ( 0.5)  <0.001       0.076
##   duodenal_ulcer = 1 (%)             71 ( 1.2)    25 ( 0.6)   0.001       0.068
##   gastric_bleeding = 1 (%)          514 ( 8.6)   184 ( 4.1)  <0.001       0.185
##   aki = 1 (%)                      2260 (38.0)  1409 (31.7)  <0.001       0.132
##   septic_shock = 1 (%)              414 ( 7.0)   261 ( 5.9)   0.030       0.044
##   pneumonia = 1 (%)                1343 (22.6)   927 (20.9)   0.040       0.041
##                                   Stratified by h2ras
##                                    Missing
##   n                                       
##   hypertension = 1 (%)             0.0    
##   DM = 1 (%)                       0.0    
##   anemia = 1 (%)                   0.0    
##   Af = 1 (%)                       0.0    
##   coronary_atherosclerosis = 1 (%) 0.0    
##   venous_thrombus = 1 (%)          0.0    
##   MI = 1 (%)                       0.0    
##   gastritis = 1 (%)                0.0    
##   gastric_ulcer = 1 (%)            0.0    
##   duodenal_ulcer = 1 (%)           0.0    
##   gastric_bleeding = 1 (%)         0.0    
##   aki = 1 (%)                      0.0    
##   septic_shock = 1 (%)             0.0    
##   pneumonia = 1 (%)                0.0
  • 最後のmerge用ファイル
db_comorb <- 
  db_PEO_comorb |> 
  select("HADM_ID", all_of(cols_comorb))

データベース上に書き込む

if(dbExistsTable(con_mimic, "db_comorb")) {
  dbRemoveTable(con_mimic, "db_comorb")
}

dbWriteTable(con_mimic, 
             "db_comorb", 
             db_comorb %>% collect(),
             overwrite = TRUE)  

- LVEF

noteeventsを使う
超音波検査の結果を記述した文章からEFの値を抽出するという難作業
文章データなので大きなファイルだがstringrを使いたいので、ある程度絞った後にcollectする
メモリに余裕のある人向けのタスクかもしれない(4GBだとおそらく不可、16GBなら可能、8GBは手元にないので検証できていません→是非やってみて結果を教えてください。)

db_note <- tbl(con_mimic, "noteevents")
db_note |> colnames()
##  [1] "ROW_ID"      "SUBJECT_ID"  "HADM_ID"     "CHARTDATE"   "CHARTTIME"  
##  [6] "STORETIME"   "CATEGORY"    "DESCRIPTION" "CGID"        "ISERROR"    
## [11] "TEXT"
db_note |> count()
db_note |> head(10)

文章カテゴリーを見てみる
15カテゴリーの文章情報。ここからLVEFが記載されていそうな文章を選ぶ。
通常はEchoと各種サマリー系だが、面倒であれば全部選ぶのでも問題ない。

db_note |> distinct(CATEGORY)
# db_note |>
#   filter(CATEGORY == "Echo") |>
#   head(30) |>
#   collect() |>
#   write_csv("echo_ex.csv")

対象患者のIDを取得して、データ行数を減らす。

ptids <- db_PEO |> pull(HADM_ID)

Echoデータは7412行
全てのデータでも1万行ちょいなので、全データ参照でもよい。

db_note_echo <- 
  db_note |> 
  filter(HADM_ID %in% ptids) |> 
  # filter(CATEGORY %in% c("Echo")) |>
  compute()

db_note_echo |> distinct(HADM_ID) |> count()

ここから先はstringr等を使いたいのでcollect()する。

df_note_echo <- collect(db_note_echo)

症例を絞ればcollect()しても0.8GB程度のメモリ使用で済む
しかし中間生成物も出てくるのでLVEF抽出コード全体で2GB以上使うことになる。

pryr::object_size(df_note_echo)
## 803.60 MB

LVEFの分類は論文の分け方は謎だったので採用しないことにした(論文ではhyper, normal, ?, severeの4つ)。一般的ではない気がする。

LVEFは通常以下の4つにカテゴリー化する
- normal、
- mild hypokinesis,
- moderate hypokinesis,
- severe hypokinesis

分け方は授業スライドと以下参照。
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5333239/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5333239/table/t1-2497141/?report=objectonly

まずは記述表現を網羅する

normal_text <-
  c("systolic function.{0,10}normal",
    "Normal regional LV systolic function",
    "left ventricular systolic function appears grossly preserved",
    "LV systolic function appears grossly preserved",
    "left ventricular systolic function is.{0,12}normal",
    "LV systolic function is.{0,12}normal"
    )

mild_text <-
  c("left ventricular systolic function is mildly depressed",
    "LV systolic function is mildly depressed",
    "left ventricular systolic fxn is slightly depressed",
    "LV systolic fxn is slightly depressed",
    "mild regional left ventricular systolic dysfunction",
    "mild regional LV systolic dysfunction",
    "mildly depressed LVEF",
    "mild global left ventricular hypokinesis",
    "mild global LV hypokinesis",
    "mild left ventricular hypokinesis",
    "mild LV hypokinesis",
    "left ventricular systolic function is.{0,5}slightly worse",
    "LV systolic function is.{0,5}slightly worse"
    )


moderate_text <- 
  c("left ventricular systolic function is moderately depressed",
    "LV systolic function is moderately depressed",
    "left ventricular systolic fxn is moderately depressed",
    "LV systolic fxn is moderately depressed",
    "moderate regional left ventricular systolic dysfunction",
    "moderate regional LV systolic dysfunction",
    "moderately depressed LVEF",
    "moderate global left ventricular hypokinesis",
    "moderate global LV hypokinesis",
    "moderate left ventricular hypokinesis",
    "moderate LV hypokinesis",
    "left ventricular systolic function is.{0,5}moderately worse",
    "LV systolic function is.{0,5}moderately worse"
    )

severe_text <-
  c("left ventricular systolic function is severely depressed",
    "LV systolic function is severely depressed",
    "left ventricular systolic fxn is severely depressed",
    "LV systolic fxn is severely depressed",
    "severe regional left ventricular systolic dysfunction",
    "severe regional LV systolic dysfunction",
    "severely depressed LVEF",
    "severe global left ventricular hypokinesis",
    "severe global LV hypokinesis",
    "severe left ventricular hypokinesis",
    "severe LV hypokinesis",
    "left ventricular systolic function is.{0,5}severely worse",
    "LV systolic function is.{0,5}severely worse"
    )

difficult_text <-
  c("LV systolic function is difficult to assess",
    "left ventricular systolic function is difficult to assess",
    "LV systolic function cannot be reliably assessed",
    "left ventricular systolic function cannot be reliably assessed",
    "LV not well seen",
    "LVEF cannot be estimated")


abnormal_text <-
  c("left ventricular systolic function is now depressed",
    "left ventricular systolic function is depressed",
    "LV systolic function appears depressed",
    "depressed left ventricular.{0,10}function",
    "depressed LV.{0,10}function"
  )
  • 改行があると面倒なので、まずは改行削除する
  • EF = 50%等のパターンで数値を抽出する
  • 表現抽出する
    abnormalとだけ書かれているものをどこに分類するのかは迷うが、mild hypokinesisとした。
"apple|orange|banana"
## [1] "apple|orange|banana"
strings_sample <- c("apple", "orange", "banana")

str_c(strings_sample, collapse = "|")
## [1] "apple|orange|banana"
df_note_echo_1 <-
  df_note_echo |> 
  select(HADM_ID, TEXT, CHARTDATE, CATEGORY) |> 
  
  #1
  mutate(TEXT = str_replace_all(TEXT, pattern = "\n", replacement = " ")) |> 

  mutate(
    #2
    LVEF_ex1 = str_extract(TEXT, "EF.{0,8}[0-9]{2}"),
    LVEF_ex2 = str_extract(TEXT, "ejection fraction.{0,8}[0-9]{2}"),
    
    #3
    EF_assess = case_when(
      str_detect(TEXT, regex(str_c(severe_text, collapse = "|"),ignore_case = T)) ~ "severe",
      str_detect(TEXT, regex(str_c(moderate_text, collapse = "|"),ignore_case = T)) ~ "moderate",
      str_detect(TEXT, regex(str_c(mild_text, collapse = "|"),ignore_case = T)) ~ "mild",
      str_detect(TEXT, regex(str_c(normal_text, collapse = "|"),ignore_case = T)) ~ "normal",
      # str_detect(TEXT, regex(str_c(difficult_text, collapse = "|"),ignore_case = T)) ~ "difficult",
      str_detect(TEXT,regex(str_c(abnormal_text, collapse = "|"),ignore_case = T)) ~ "mild"
      
    )) |> select(!TEXT)

df_note_echo_1

LVEFの数値部分のみ抽出する

数値から分類カテゴリー列を作成する。

分類カテゴリーは文章表現を優先し、数値は文章がなかった場合とした。このあたりの優先も論文未記載。

df_note_echo_assess <-
  df_note_echo_1 |> 
  mutate(LVEF_num = case_when(
    !is.na(LVEF_ex1) ~ str_extract(LVEF_ex1, pattern = "[0-9]{2}"),
    !is.na(LVEF_ex2) ~ str_extract(LVEF_ex2, pattern = "[0-9]{2}"))
  ) |> 
  mutate(LVEF_num_assess = case_when(
    LVEF_num >= 55 ~ "normal",
    LVEF_num >= 45 ~ "mild",
    LVEF_num >= 30 ~ "moderate",
    LVEF_num >= 0 ~ "severe"
    
  )) |> 
  mutate(LVEF_ctg = case_when(
    !is.na(EF_assess) ~ EF_assess,
    !is.na(LVEF_num_assess) ~ LVEF_num_assess
  )) |> 
  arrange(CHARTDATE) |> 
  select(HADM_ID, CHARTDATE, LVEF_ctg) |> 
  filter(!is.na(LVEF_ctg)) |> 
  distinct(HADM_ID, .keep_all = T)

df_note_echo_assess

データフレームのままだとmimic.dbファイルに結合できないので、.dbに戻す
dbRemoveTable()は初回は不要だが、2回目以降は既に作成されているdb_LVEF_assessを削除する必要があるので、ifを使って工夫している。

if(dbExistsTable(con_mimic, "db_LVEF_assess")) {
  dbRemoveTable(con_mimic, "db_LVEF_assess")
}

dbWriteTable(con_mimic, "db_LVEF_assess", df_note_echo_assess)

db_LVEF_assess <- tbl(con_mimic, "db_LVEF_assess")

後はいつも通り。

db_PEO_LVEF <-
  db_PEO |> 
    left_join(db_LVEF_assess, by = "HADM_ID")

db_PEO_LVEF <- db_PEO_LVEF |> compute()

欠測は15%未満におさまった。

cols_ctg <- c("LVEF_ctg")

db_PEO_LVEF |> select(cols_ctg, "h2ras")|> collect() |> 
  mutate(LVEF_ctg = factor(LVEF_ctg, 
                           levels = c("severe", "moderate", "mild", "normal"))) |> 
  CreateTableOne(vars = cols_ctg, strata = "h2ras", factorVars = cols_ctg) -> tableone

print(tableone, smd = TRUE, missing = TRUE) #|> write.csv("test.csv")
##               Stratified by h2ras
##                0            1            p      test SMD    Missing
##   n            5949         4441                                   
##   LVEF_ctg (%)                           <0.001       0.097 14.7   
##      severe    1289 (25.8)   850 (22.0)                            
##      moderate   936 (18.7)   708 (18.3)                            
##      mild       736 (14.7)   638 (16.5)                            
##      normal    2041 (40.8)  1663 (43.1)
  • 最後のmerge用ファイル
db_LVEF <- 
  db_PEO_LVEF |> 
  select("HADM_ID", cols_ctg)

データベース上に書き込む

if(dbExistsTable(con_mimic, "db_LVEF")) {
  dbRemoveTable(con_mimic, "db_LVEF")
}

dbWriteTable(con_mimic, 
             "db_LVEF", 
             db_LVEF %>% collect(),
             overwrite = TRUE)  
# dbDisconnect(con_mimic, shutdown=TRUE)

<第3回講義・課題>

以下の抽出を行う。今まで通りHADM_IDと目的の変数群に絞ったファイルを作成する事。
1. Laboratory data(BUN, Cr, Na, Ca, Mg, Glu, WBC, RBC, Plt)
2. Medications(RAAs, Diuritics, inotropic agents, adrenaline receptor antagonists, calcium antagonists, antiplatelets, PPIs)

END