(Some material from previous sessions is included again, but the code is kept to a minimum. Refer to the previous handouts for review.)
pacman::p_load(tidyverse, lubridate, tableone, skimr,
duckdb, arrow, RSQLite)
To access mimic.db in the Database folder on the F drive, use the code below (adjust the path to your own environment). (If mimic.db does not exist, this operation creates an empty mimic.db.)
con_mimic <- dbConnect(duckdb::duckdb(), "F:/Database/mimic.db")
List of the tables stored in mimic.db
dbListTables(con_mimic)
## [1] "admissions" "apsiii"
## [3] "callout" "caregivers"
## [5] "chartevents" "comorbidities_AHRQ"
## [7] "comorbidities_Quan" "cptevents"
## [9] "d_cpt" "d_icd_diagnoses"
## [11] "d_icd_procedures" "d_items"
## [13] "d_labitems" "datetimeevents"
## [15] "db_HtWt" "db_LVEF"
## [17] "db_LVEF_assess" "db_PEO"
## [19] "db_comorb" "db_ethnicity"
## [21] "db_patients" "db_rrt"
## [23] "db_sofa" "db_urine"
## [25] "db_venti" "db_vital"
## [27] "df_lab_list" "diagnoses_icd"
## [29] "drgcodes" "height_firstday"
## [31] "height_weight" "icustay_details"
## [33] "icustays" "inputevents_cv"
## [35] "inputevents_mv" "labevents"
## [37] "microbiologyevents" "noteevents"
## [39] "outputevents" "patients"
## [41] "pivot_RRT" "pivot_height"
## [43] "pivot_labo" "pivot_urine_output"
## [45] "pivot_vitals" "prescriptions"
## [47] "procedureevents_mv" "procedures_icd"
## [49] "services" "sofa"
## [51] "test" "transfers"
## [53] "urine_output" "urine_output_firstday"
## [55] "ventilation_classification" "ventilation_duration"
## [57] "weight_duration" "weight_firstday"
# dbDisconnect(con_mimic, shutdown=TRUE)
Previously created data
db_patients <- tbl(con_mimic, "db_patients")
db_PEO <- tbl(con_mimic, "db_PEO")
db_HtWt <- tbl(con_mimic, "db_HtWt")
db_ad <- tbl(con_mimic, "admissions")
db_pt <- tbl(con_mimic, "patients")
db_diag <- tbl(con_mimic, "diagnoses_icd")
db_icustay <- tbl(con_mimic, "icustays")
db_icustay_hadm <-
db_icustay |>
distinct(HADM_ID, .keep_all = T) |>
select(HADM_ID, ICUSTAY_ID)
hadm_icu_vec <-
db_icustay_hadm |>
select(HADM_ID) |>
pull()
icustay_id_vec <-
db_icustay_hadm |>
select(ICUSTAY_ID) |>
pull()
db_d_icd_diag <- tbl(con_mimic, "d_icd_diagnoses")
df_d_icd_diag <- collect(db_d_icd_diag)
Download the required files from BigQuery and load them into mimic.db in your own environment (the 18 files are listed for reference).
(Optional: only for those who want to. If you prefer not to, contact me individually.)
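If you export a table from BigQuery as a CSV, one way to load it into your mimic.db is sketched below. This is only a sketch: "some_table" and its path are placeholders for whatever file you actually downloaded.
# Sketch: load one downloaded CSV into mimic.db ("some_table" is a placeholder name)
df_tmp <- readr::read_csv("F:/Database/some_table.csv")
# arrow::read_csv_arrow() may be lighter on memory for very large files
if(dbExistsTable(con_mimic, "some_table")) {
dbRemoveTable(con_mimic, "some_table")
}
dbWriteTable(con_mimic, "some_table", df_tmp)
rm(df_tmp)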
Using the obtained tables, extract the following items for the target patients.
(The goal is to reproduce the paper, but note that the results will not be exactly the same.)
(As in class, create files containing only HADM_ID and the target variables.)
pivot_vitals
db_vital <- tbl(con_mimic, "pivot_vitals" )
db_vital |> count()
Restrict to the icustay_ids of the target patients.
db_vital_target <- db_vital |> filter(icustay_id %in% icustay_id_vec)
For each vital sign, extract the data linked to each icustay_id.
In most cases there are multiple measurements per ICU stay, but the paper (probably) does not state which time point to use.
Below, the first measurement after ICU admission is used; a mean or median would also be reasonable.
(Try running the code below without compute(); you will see what compute() is for. A more compact helper-function version is sketched after the seven blocks below.)
db_HR <-
db_vital_target %>%
select(icustay_id, charttime, heartrate) %>%
filter(!is.na(heartrate)) %>%
arrange(icustay_id, charttime) %>%
distinct(icustay_id, .keep_all = T) %>%
select(!c(charttime)) %>%
compute()
db_sBP <-
db_vital_target %>%
select(icustay_id, charttime,sysbp) %>%
filter(!is.na(sysbp)) %>%
arrange(icustay_id, charttime) %>%
distinct(icustay_id, .keep_all = T) %>%
select(!c(charttime)) %>%
compute()
db_dBP <-
db_vital_target %>%
select(icustay_id, charttime,diasbp) %>%
filter(!is.na(diasbp)) %>%
arrange(icustay_id, charttime) %>%
distinct(icustay_id, .keep_all = T) %>%
select(!c(charttime)) %>%
compute()
db_mBP <-
db_vital_target %>%
select(icustay_id, charttime,meanbp) %>%
filter(!is.na(meanbp)) %>%
arrange(icustay_id, charttime) %>%
distinct(icustay_id, .keep_all = T) %>%
select(!c(charttime)) %>%
compute()
db_RR <-
db_vital_target %>%
select(icustay_id, charttime,resprate) %>%
filter(!is.na(resprate))%>%
arrange(icustay_id, charttime) %>%
distinct(icustay_id, .keep_all = T) %>%
select(!c(charttime)) %>%
compute()
db_tmpc <-
db_vital_target %>%
select(icustay_id, charttime,tempc) %>%
filter(!is.na(tempc)) %>%
arrange(icustay_id, charttime) %>%
distinct(icustay_id, .keep_all = T) %>%
select(!c(charttime)) %>%
compute()
db_spo2 <-
db_vital_target %>%
select(icustay_id, charttime,spo2) %>%
filter(!is.na(spo2)) %>%
arrange(icustay_id, charttime) %>%
distinct(icustay_id, .keep_all = T) %>%
select(!c(charttime)) %>%
compute()
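The seven blocks above repeat the same steps for each vital sign. For reference, a minimal sketch of a helper function with the identical logic (first_vital is a name introduced here, not from any package):
# Sketch: first non-missing value of one vital per icustay_id, as a reusable helper
first_vital <- function(db, var) {
db |>
select(icustay_id, charttime, all_of(var)) |>
filter(!is.na(.data[[var]])) |>
arrange(icustay_id, charttime) |>
distinct(icustay_id, .keep_all = TRUE) |>
select(!charttime) |>
compute()
}
# db_HR  <- first_vital(db_vital_target, "heartrate")
# db_sBP <- first_vital(db_vital_target, "sysbp")
# ...and likewise for diasbp, meanbp, resprate, tempc, spo2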
db_vitals_first <-
db_HR |>
left_join(db_sBP, by = "icustay_id") |>
left_join(db_dBP, by = "icustay_id") |>
left_join(db_mBP, by = "icustay_id") |>
left_join(db_RR, by = "icustay_id") |>
left_join(db_tmpc, by = "icustay_id") |>
left_join(db_spo2, by = "icustay_id")
db_vitals_first <- db_vitals_first |> compute()
db_PEO_vital <-
db_PEO |>
left_join(db_vitals_first, by = c("ICUSTAY_ID" = "icustay_id"))
db_PEO_vital <- db_PEO_vital |> compute()
db_PEO_vital |> count()
db_PEO_vital |> head(10)
Create a table to check.
cols_vital <- c("heartrate", "sysbp", "diasbp", "meanbp", "resprate", "tempc", "spo2")
factors <- c("ADMISSION_TYPE", "INSURANCE", "GENDER", "mortality30", "hospital_death")
db_PEO_vital |> select(c(cols_vital, "h2ras"))|> collect() |>
CreateTableOne(vars = cols_vital, strata = "h2ras", factorVars = factors) -> tableone
print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
## Stratified by h2ras
## 0 1 p test SMD
## n 5949 4441
## heartrate (mean (SD)) 88.36 (20.63) 88.09 (18.75) 0.494 0.014
## sysbp (mean (SD)) 123.44 (25.95) 122.17 (25.18) 0.013 0.050
## diasbp (mean (SD)) 62.25 (17.36) 62.11 (16.40) 0.668 0.009
## meanbp (mean (SD)) 80.51 (18.28) 80.41 (17.61) 0.784 0.005
## resprate (mean (SD)) 19.99 (6.22) 18.30 (6.30) <0.001 0.269
## tempc (mean (SD)) 36.54 (0.99) 36.49 (0.95) 0.005 0.057
## spo2 (mean (SD)) 96.42 (5.44) 97.16 (4.89) <0.001 0.144
## Stratified by h2ras
## Missing
## n
## heartrate (mean (SD)) 1.7
## sysbp (mean (SD)) 1.7
## diasbp (mean (SD)) 1.7
## meanbp (mean (SD)) 1.7
## resprate (mean (SD)) 1.7
## tempc (mean (SD)) 2.0
## spo2 (mean (SD)) 1.8
Create histograms to check.
cols_continuous <- cols_vital
db_PEO_vital |>
select(cols_continuous) |>
pivot_longer(cols = cols_continuous, names_to = "name", values_to = "value") |>
ggplot()+
geom_histogram(aes(x = value), color = "black")+
facet_wrap(~ name, scales = "free", ncol = 3) +
theme_bw()+
theme(text = element_text(size = 20))
db_vital <-
db_PEO_vital |>
select("HADM_ID", cols_vital)
Write to the database.
if(dbExistsTable(con_mimic, "db_vital")) {
dbRemoveTable(con_mimic, "db_vital")
}
dbWriteTable(con_mimic,
"db_vital",
db_vital %>% collect(),
overwrite = TRUE)
Files used:
pivot_urine_output: apparently not used in the paper
urine_output
urine_output_firstday
# db_pivot_urine <- tbl(con_mimic, "pivot_urine_output")
db_urine_out <- tbl(con_mimic, "urine_output")
db_urine_firstday <- tbl(con_mimic, "urine_output_firstday")
db_pivot_urine:
Urine output is recorded per icustay_id and charttime, so it has to be converted into a first-day total.
Convert the elapsed time from seconds to days,
take the first recorded charttime of each icustay_id as time zero (used here as a stand-in for the admission time),
and sum the urine output recorded up to one day (24 hours) later.
(The code is fairly messy.)
db_urine_out:
As above, urine output is recorded per icustay_id and charttime.
Extract the first-day data in the same way.
db_urine_out |> arrange(icustay_id)
db_urine1 <-
db_urine_out |>
filter(!is.na(icustay_id))
db_urine_time0 <-
db_urine1 |>
arrange(icustay_id, charttime) %>%
distinct(icustay_id, .keep_all = T) %>%
rename(time0 = charttime) %>%
select(icustay_id, time0)
db_urine2 <-
db_urine1 %>%
left_join(db_urine_time0, by = "icustay_id") %>%
arrange(icustay_id) %>%
mutate(elapse = day(charttime - time0)) %>%
filter(elapse < 1) |>
compute()
db_urine2 |>
arrange(icustay_id, charttime) %>% head(200)
db_urine3 <-
db_urine2 |>
group_by(icustay_id) |>
summarize(value = sum(value))
db_urine3 <- db_urine3 |> compute()
db_urine3 |> head(10)
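For reference, the same first-24-hour total can also be computed on the R side after collect(). A sketch only, assuming the subset restricted to the target icustay_ids fits in memory and that charttime collects as a timestamp:
# Sketch: first-24-hour urine total computed in R after collect()
df_urine_first24h <-
db_urine1 |>
filter(icustay_id %in% icustay_id_vec) |> # restrict to target stays before collecting
collect() |>
group_by(icustay_id) |>
mutate(time0 = min(charttime)) |> # first recorded time as time zero
filter(charttime < time0 + hours(24)) |>
summarize(urine_out_24h = sum(value, na.rm = TRUE), .groups = "drop")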
db_urine_firstday:
This table already contains only first-day data, so processing is easy.
db_urine_firstday1 <-
db_urine_firstday |>
select(icustay_id, urineoutput)
db_urine_firstday1 |> head(10)
Merge the three files into db_PEO and,
after deciding which file's data takes priority, fix the values (not stated in the paper).
db_u1 <-
db_PEO |>
left_join(db_urine_firstday1, by = c("ICUSTAY_ID" = "icustay_id")) |>
compute()
db_u2 <-
db_u1 |>
left_join(db_urine3, by = c("ICUSTAY_ID" = "icustay_id")) |>
compute()
# db_u3 <-
# db_u2 |>
# left_join(db_pivot_urine3, by = c("ICUSTAY_ID" = "icustay_id")) |>
# compute()
db_PEO_urine <-
db_u2 |>
mutate(urine_out = case_when(
!is.na(urineoutput) ~ urineoutput,
!is.na(value) ~ value
)) |>
mutate(urine_out = case_when(
between(urine_out, 0, 25000) ~ urine_out
)) |>
select(!c(urineoutput, value))
#
# db_PEO_urine <-
# db_u3 |>
# mutate(urine_out = case_when(
# !is.na(urineoutput.y) ~ urineoutput.y,
# !is.na(value) ~ value,
# !is.na(urineoutput.x) ~ urineoutput.x
# )) |>
# mutate(urine_out = case_when(
# between(urine_out, 0, 25000) ~ urine_out
# )) |>
# select(!c(urineoutput.x, urineoutput.y, value))
db_PEO_urine <- db_PEO_urine |> compute()
Create a table.
vars <- c("urine_out")
db_PEO_urine |> select(c(vars, "h2ras"))|> collect() |>
CreateTableOne(vars = vars, strata = "h2ras") -> tableone
print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
## Stratified by h2ras
## 0 1 p test SMD
## n 5949 4441
## urine_out (mean (SD)) 1775.04 (1252.05) 1838.51 (1214.97) 0.011 0.051
## Stratified by h2ras
## Missing
## n
## urine_out (mean (SD)) 4.0
Create histograms.
cols_continuous <- c("urine_out")
db_PEO_urine |>
select(cols_continuous) |>
pivot_longer(cols = cols_continuous, names_to = "name", values_to = "value") |>
ggplot()+
geom_histogram(aes(x = value), color = "black")+
facet_wrap(~ name, scales = "free", ncol = 2) +
theme_bw()+
theme(text = element_text(size = 20))
db_urine <-
db_PEO_urine |>
select("HADM_ID", "urine_out")
Write to the database.
if(dbExistsTable(con_mimic, "db_urine")) {
dbRemoveTable(con_mimic, "db_urine")
}
dbWriteTable(con_mimic,
"db_urine",
db_urine %>% collect(),
overwrite = TRUE)
The severity scores are already computed for each ICU stay, so, conveniently, we can use them as they are.
db_sofa <- tbl(con_mimic, "sofa")
db_saps <- tbl(con_mimic, "apsiii")
db_sofa_1 <- db_sofa |> select(icustay_id, SOFA)
db_saps_1 <- db_saps |> select(icustay_id, apsiii)
db_PEO_sofa <-
db_PEO |>
left_join(db_sofa_1, by = c("ICUSTAY_ID" = "icustay_id")) |>
left_join(db_saps_1, by = c("ICUSTAY_ID" = "icustay_id"))
db_PEO_sofa <- db_PEO_sofa |> compute()
Create a table.
vars <- c("SOFA", "apsiii")
db_PEO_sofa |> select(c(vars, "h2ras"))|> collect() |>
CreateTableOne(vars = vars, strata = "h2ras") -> tableone
print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
## Stratified by h2ras
## 0 1 p test SMD Missing
## n 5949 4441
## SOFA (mean (SD)) 4.59 (3.04) 5.04 (3.02) <0.001 0.146 0.0
## apsiii (mean (SD)) 48.63 (20.54) 46.49 (19.59) <0.001 0.107 0.0
Create histograms.
cols_continuous <- c( "SOFA", "apsiii")
db_PEO_sofa |>
select(cols_continuous) |>
pivot_longer(cols = cols_continuous, names_to = "name", values_to = "value") |>
ggplot()+
geom_histogram(aes(x = value), color = "black")+
facet_wrap(~ name, scales = "free", ncol = 2) +
theme_bw()+
theme(text = element_text(size = 20))
db_sofa <-
db_PEO_sofa |>
select("HADM_ID", "SOFA", "apsiii")
Write to the database.
if(dbExistsTable(con_mimic, "db_sofa")) {
dbRemoveTable(con_mimic, "db_sofa")
}
dbWriteTable(con_mimic,
"db_sofa",
db_sofa %>% collect(),
overwrite = TRUE)
db_rrt <- tbl(con_mimic, "pivot_RRT")
db_rrt |> count()
db_rrt |> head(10)
First, check which dialysis_type values exist.
db_rrt |> distinct(dialysis_type)
Restrict to specific dialysis_type values and get the corresponding icustay_ids.
RRT_icu_id <-
db_rrt |> filter(dialysis_type %in% c("CRRT", "CVVH", "CVVHD", "CVVHDF")) |>
distinct(icustay_id) |> pull(icustay_id)
Add a column to db_PEO indicating whether RRT was performed.
db_PEO_rrt <-
db_PEO |>
mutate(rrt = ifelse(ICUSTAY_ID %in% RRT_icu_id, 1, 0))
db_PEO_rrt <- db_PEO_rrt |> compute()
vars <- c("rrt")
vars_fact <- vars
db_PEO_rrt |> select(c(vars, "h2ras"))|> collect() |>
CreateTableOne(vars = vars, strata = "h2ras", factorVars = vars_fact) -> tableone
print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
## Stratified by h2ras
## 0 1 p test SMD Missing
## n 5949 4441
## rrt = 1 (%) 155 (2.6) 138 (3.1) 0.142 0.030 0.0
db_rrt <-
db_PEO_rrt |>
select("HADM_ID", "rrt")
Write to the database.
if(dbExistsTable(con_mimic, "db_rrt")) {
dbRemoveTable(con_mimic, "db_rrt")
}
dbWriteTable(con_mimic,
"db_rrt",
db_rrt %>% collect(),
overwrite = TRUE)
# db_venti_c <- tbl(con_mimic, "ventilation_classification")
db_venti_d <- tbl(con_mimic, "ventilation_duration")
db_venti_d |> count()
db_venti_d |> head(10)
venti_icu_id <-
db_venti_d |>
filter(!is.na(icustay_id)) |>
distinct(icustay_id) |>
pull(icustay_id)
db_PEO_venti <-
db_PEO |>
mutate(ventilation = ifelse(ICUSTAY_ID %in% venti_icu_id, 1, 0)) #|>
# replace_na(list(ventilation = 0))
db_PEO_venti <- db_PEO_venti |> compute()
vars <- c("ventilation")
vars_fact <- vars
db_PEO_venti |> select(c(vars, "h2ras"))|> collect() |>
CreateTableOne(vars = vars, strata = "h2ras", factorVars = vars_fact) -> tableone
print(tableone, smd = TRUE, missing = TRUE) |> write.csv("test.csv")
## Stratified by h2ras
## 0 1 p test SMD Missing
## n 5949 4441
## ventilation = 1 (%) 2402 (40.4) 2839 (63.9) <0.001 0.485 0.0
db_venti <-
db_PEO_venti |>
select("HADM_ID", "ventilation")
Write to the database.
if(dbExistsTable(con_mimic, "db_venti")) {
dbRemoveTable(con_mimic, "db_venti")
}
dbWriteTable(con_mimic,
"db_venti",
db_venti %>% collect(),
overwrite = TRUE)
Ethnicity is originally stored in admissions, but the classification there is too fine-grained.
An officially provided table with a more general grouping is available and can be referenced via icustay_details.
db_details <- tbl(con_mimic, "icustay_details")
db_details |> count()
native, other, and unknown were grouped together as other.
db_details_ethnicity <-
db_details |>
distinct(hadm_id, .keep_all = T) |>
select(hadm_id, ethnicity_grouped) |>
mutate(ethnicity_5 = case_when(
ethnicity_grouped %in% c("native", "other", "unknown") ~ "other",
TRUE ~ ethnicity_grouped
))
db_PEO_ethnicity <-
db_PEO |>
left_join(db_details_ethnicity, by = c("HADM_ID" = "hadm_id"))
db_PEO_ethnicity <- db_PEO_ethnicity |> compute()
var = c("ethnicity_5")
db_PEO_ethnicity |> select(c(var, "h2ras")) |> collect() |>
CreateTableOne(vars = var, strata = "h2ras", factorVars = var) -> tableone
print(tableone, smd=T, missing = T)
## Stratified by h2ras
## 0 1 p test SMD Missing
## n 5949 4441
## ethnicity_5 (%) 0.002 0.081 1.0
## asian 101 ( 1.7) 80 ( 1.8)
## black 499 ( 8.5) 362 ( 8.2)
## hispanic 122 ( 2.1) 125 ( 2.8)
## other 892 (15.2) 569 (12.9)
## white 4262 (72.5) 3279 (74.3)
db_ethnicity <-
db_PEO_ethnicity |>
select(HADM_ID, ethnicity_5)
Write to the database.
if(dbExistsTable(con_mimic, "db_ethnicity")) {
dbRemoveTable(con_mimic, "db_ethnicity")
}
dbWriteTable(con_mimic,
"db_ethnicity",
db_ethnicity %>% collect(),
overwrite = TRUE)
A comorbidity list based on Quan's definitions is available from BigQuery.
However, many of the comorbidities used in the paper are not in that list (the authors selected comorbidities plausibly related to H2-blocker use).
Also, even for comorbidities that are in Quan's list, the BigQuery file and the paper disagree considerably; the authors apparently extracted them themselves from the ICD-9 diagnosis codes.
We therefore do not use Quan's list either, and extract the data manually.
We use "d_icd_diagnoses" and "diagnoses_icd".
The procedure is the same as when we extracted heart failure, so it is relatively simple.
HT_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("hypertension", ignore_case = T))) |>
filter(!str_detect(LONG_TITLE, coll("without", ignore_case = T))) |>
pull(ICD9_CODE)
hadm_id_vec <- db_PEO |> pull(HADM_ID)
db_ht <-
db_diag |>
filter(HADM_ID %in% hadm_id_vec) |>
mutate(HT = ifelse(ICD9_CODE %in% HT_icd_9, 1, 0)) |>
group_by(HADM_ID) |>
summarize(hypertension = max(HT))
db_ht |> count()
db_ht |> head(10)
Example code is shown below. It is worth experimenting on your own as well.
#hypertension
HT_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("hypertension", ignore_case = T))) |>
filter(!str_detect(LONG_TITLE, coll("without", ignore_case = T))) |>
pull(ICD9_CODE)
#diabetes
DM_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("diabetes me", ignore_case = T))|
str_detect(SHORT_TITLE, coll("DMI", ignore_case = F))) |>
filter(str_starts(ICD9_CODE, "2")) |>
pull(ICD9_CODE)
#anemia
anemia_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("anemia", ignore_case = T))) |>
filter(str_starts(ICD9_CODE, "2")) |>
pull(ICD9_CODE)
#af
af_icd_9 <- c("42731")
#coronary atherosclerosis
coronary_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("coronary atherosclerosis", ignore_case = T))) |>
pull(ICD9_CODE)
#venous thrombus
venous_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("venous embo", ignore_case = T))|
str_detect(SHORT_TITLE, coll("vein thromb", ignore_case = T))) |>
pull(ICD9_CODE)
#myocardial infarction
mi_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("myocardial infarction", ignore_case = T))) |>
filter(!ICD9_CODE %in% c("4110", "41181", "42979")) |>
pull(ICD9_CODE)
#gastritis
gastritis_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("gastritis", ignore_case = T))) |>
pull(ICD9_CODE)
#duodenal ulcer
d_ulcer_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("duodenal ulcer", ignore_case = T))) |>
pull(ICD9_CODE)
#gastric ulcer
g_ulcer_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("gastric ulcer", ignore_case = T))) |>
pull(ICD9_CODE)
#gastric bleeding
g_bleed_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("gastr", ignore_case = T))|
str_detect(LONG_TITLE, coll("intest", ignore_case = T))|
str_detect(LONG_TITLE, coll("duodenal", ignore_case = T))|
str_detect(LONG_TITLE, coll("colon", ignore_case = T))) |>
filter(str_detect(LONG_TITLE, coll("hemorrh", ignore_case = T))) |>
filter(!str_detect(LONG_TITLE, coll("without mention of hemorrhage", ignore_case = T))) |>
pull(ICD9_CODE)
#AKI
aki_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("acute kidney", ignore_case = T))) |>
pull(ICD9_CODE)
#septic shock
s_shock_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("septic shock", ignore_case = T))) |>
pull(ICD9_CODE)
#pneumonia
pneumonia_icd_9 <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, coll("pneumonia", ignore_case = T))) |>
pull(ICD9_CODE)
hadm_id_vec <- db_PEO |> pull(HADM_ID)
db_comorb_manual <-
db_diag |>
filter(HADM_ID %in% hadm_id_vec) |>
mutate(HT = ifelse(ICD9_CODE %in% HT_icd_9, 1, 0),
DM = ifelse(ICD9_CODE %in% DM_icd_9, 1, 0),
anemia = ifelse(ICD9_CODE %in% anemia_icd_9, 1, 0),
Af = ifelse(ICD9_CODE %in% af_icd_9, 1, 0),
coronary_atherosclerosis = ifelse(ICD9_CODE %in% coronary_icd_9, 1, 0),
venous_thrombus = ifelse(ICD9_CODE %in% venous_icd_9, 1, 0),
MI = ifelse(ICD9_CODE %in% mi_icd_9, 1, 0),
gastritis = ifelse(ICD9_CODE %in% gastritis_icd_9, 1, 0),
g_ulcer = ifelse(ICD9_CODE %in% g_ulcer_icd_9, 1, 0),
d_ulcer = ifelse(ICD9_CODE %in% d_ulcer_icd_9, 1, 0),
g_bleeding = ifelse(ICD9_CODE %in% g_bleed_icd_9, 1, 0),
aki = ifelse(ICD9_CODE %in% aki_icd_9, 1, 0),
s_shock = ifelse(ICD9_CODE %in% s_shock_icd_9, 1, 0),
pneumonia = ifelse(ICD9_CODE %in% pneumonia_icd_9, 1, 0)) |>
group_by(HADM_ID) |>
summarize(hypertension = max(HT),
DM = max(DM),
anemia = max(anemia),
Af = max(Af),
coronary_atherosclerosis = max(coronary_atherosclerosis),
venous_thrombus = max(venous_thrombus),
MI = max(MI),
gastritis = max(gastritis),
gastric_ulcer = max(g_ulcer),
duodenal_ulcer = max(d_ulcer),
gastric_bleeding = max(g_bleeding),
aki = max(aki),
septic_shock = max(s_shock),
pneumonia = max(pneumonia))
db_comorb_manual |> count()
db_PEO_comorb <-
db_PEO |>
left_join(db_comorb_manual, by = "HADM_ID")
db_PEO_comorb <- db_PEO_comorb |> compute()
Create a table.
cols_comorb <- c(
"hypertension", "DM", "anemia", "Af", "coronary_atherosclerosis", "venous_thrombus", "MI",
"gastritis", "gastric_ulcer", "duodenal_ulcer", "gastric_bleeding", "aki", "septic_shock",
"pneumonia")
vars <- cols_comorb
factors <- cols_comorb
db_PEO_comorb |> select(c(vars, "h2ras"))|> collect() |>
CreateTableOne(vars = vars, strata = "h2ras", factorVars = factors) -> tableone
print(tableone, smd = TRUE, missing = TRUE) #|> write.csv("test.csv")
## Stratified by h2ras
## 0 1 p test SMD
## n 5949 4441
## hypertension = 1 (%) 2359 (39.7) 2025 (45.6) <0.001 0.120
## DM = 1 (%) 2133 (35.9) 1642 (37.0) 0.249 0.023
## anemia = 1 (%) 1993 (33.5) 1414 (31.8) 0.078 0.035
## Af = 1 (%) 2537 (42.6) 2034 (45.8) 0.001 0.064
## coronary_atherosclerosis = 1 (%) 2264 (38.1) 2049 (46.1) <0.001 0.164
## venous_thrombus = 1 (%) 112 ( 1.9) 77 ( 1.7) 0.626 0.011
## MI = 1 (%) 1001 (16.8) 717 (16.1) 0.369 0.018
## gastritis = 1 (%) 121 ( 2.0) 61 ( 1.4) 0.014 0.051
## gastric_ulcer = 1 (%) 67 ( 1.1) 20 ( 0.5) <0.001 0.076
## duodenal_ulcer = 1 (%) 71 ( 1.2) 25 ( 0.6) 0.001 0.068
## gastric_bleeding = 1 (%) 514 ( 8.6) 184 ( 4.1) <0.001 0.185
## aki = 1 (%) 2260 (38.0) 1409 (31.7) <0.001 0.132
## septic_shock = 1 (%) 414 ( 7.0) 261 ( 5.9) 0.030 0.044
## pneumonia = 1 (%) 1343 (22.6) 927 (20.9) 0.040 0.041
## Stratified by h2ras
## Missing
## n
## hypertension = 1 (%) 0.0
## DM = 1 (%) 0.0
## anemia = 1 (%) 0.0
## Af = 1 (%) 0.0
## coronary_atherosclerosis = 1 (%) 0.0
## venous_thrombus = 1 (%) 0.0
## MI = 1 (%) 0.0
## gastritis = 1 (%) 0.0
## gastric_ulcer = 1 (%) 0.0
## duodenal_ulcer = 1 (%) 0.0
## gastric_bleeding = 1 (%) 0.0
## aki = 1 (%) 0.0
## septic_shock = 1 (%) 0.0
## pneumonia = 1 (%) 0.0
db_comorb <-
db_PEO_comorb |>
select("HADM_ID", all_of(cols_comorb))
Write to the database.
if(dbExistsTable(con_mimic, "db_comorb")) {
dbRemoveTable(con_mimic, "db_comorb")
}
dbWriteTable(con_mimic,
"db_comorb",
db_comorb %>% collect(),
overwrite = TRUE)
Use noteevents.
This is the hard task of extracting the EF value from free-text echocardiography reports.
Because it is text data the table is large, but since we want to use stringr, we narrow it down first and then collect().
This may be a task for those with plenty of memory (probably not feasible with 4 GB; feasible with 16 GB; I have no 8 GB machine to test, so please try it and let me know the result).
db_note <- tbl(con_mimic, "noteevents")
db_note |> colnames()
## [1] "ROW_ID" "SUBJECT_ID" "HADM_ID" "CHARTDATE" "CHARTTIME"
## [6] "STORETIME" "CATEGORY" "DESCRIPTION" "CGID" "ISERROR"
## [11] "TEXT"
db_note |> count()
db_note |> head(10)
Look at the note categories.
There are 15 note categories; from these, choose the ones likely to mention LVEF.
Usually Echo and the various summary categories, but selecting everything is also fine if that is easier.
db_note |> distinct(CATEGORY)
# db_note |>
# filter(CATEGORY == "Echo") |>
# head(30) |>
# collect() |>
# write_csv("echo_ex.csv")
Get the IDs of the target patients to reduce the number of rows.
ptids <- db_PEO |> pull(HADM_ID)
The Echo notes come to 7,412 rows.
Even the full set is only a little over 10,000 rows, so using all the data is also fine.
db_note_echo <-
db_note |>
filter(HADM_ID %in% ptids) |>
# filter(CATEGORY %in% c("Echo")) |>
compute()
db_note_echo |> distinct(HADM_ID) |> count()
From here on we want to use stringr and related tools, so we collect().
df_note_echo <- collect(db_note_echo)
If the cases are narrowed down first, collect() uses only about 0.8 GB of memory.
However, intermediate objects are also created, so the LVEF-extraction code as a whole ends up using more than 2 GB.
pryr::object_size(df_note_echo)
## 803.60 MB
The paper's LVEF categorization was puzzling, so we decided not to adopt it (the paper uses four categories: hyper, normal, ?, and severe); it does not seem standard.
LVEF is usually categorized into the following four groups:
- normal
- mild hypokinesis
- moderate hypokinesis
- severe hypokinesis
See the class slides and the links below for the cutoffs.
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5333239/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5333239/table/t1-2497141/?report=objectonly
First, cover the descriptive expressions used in the reports.
normal_text <-
c("systolic function.{0,10}normal",
"Normal regional LV systolic function",
"left ventricular systolic function appears grossly preserved",
"LV systolic function appears grossly preserved",
"left ventricular systolic function is.{0,12}normal",
"LV systolic function is.{0,12}normal"
)
mild_text <-
c("left ventricular systolic function is mildly depressed",
"LV systolic function is mildly depressed",
"left ventricular systolic fxn is slightly depressed",
"LV systolic fxn is slightly depressed",
"mild regional left ventricular systolic dysfunction",
"mild regional LV systolic dysfunction",
"mildly depressed LVEF",
"mild global left ventricular hypokinesis",
"mild global LV hypokinesis",
"mild left ventricular hypokinesis",
"mild LV hypokinesis",
"left ventricular systolic function is.{0,5}slightly worse",
"LV systolic function is.{0,5}slightly worse"
)
moderate_text <-
c("left ventricular systolic function is moderately depressed",
"LV systolic function is moderately depressed",
"left ventricular systolic fxn is moderately depressed",
"LV systolic fxn is moderately depressed",
"moderate regional left ventricular systolic dysfunction",
"moderate regional LV systolic dysfunction",
"moderately depressed LVEF",
"moderate global left ventricular hypokinesis",
"moderate global LV hypokinesis",
"moderate left ventricular hypokinesis",
"moderate LV hypokinesis",
"left ventricular systolic function is.{0,5}moderately worse",
"LV systolic function is.{0,5}moderately worse"
)
severe_text <-
c("left ventricular systolic function is severely depressed",
"LV systolic function is severely depressed",
"left ventricular systolic fxn is severely depressed",
"LV systolic fxn is severely depressed",
"severe regional left ventricular systolic dysfunction",
"severe regional LV systolic dysfunction",
"severely depressed LVEF",
"severe global left ventricular hypokinesis",
"severe global LV hypokinesis",
"severe left ventricular hypokinesis",
"severe LV hypokinesis",
"left ventricular systolic function is.{0,5}severely worse",
"LV systolic function is.{0,5}severely worse"
)
difficult_text <-
c("LV systolic function is difficult to assess",
"left ventricular systolic function is difficult to assess",
"LV systolic function cannot be reliably assessed",
"left ventricular systolic function cannot be reliably assessed",
"LV not well seen",
"LVEF cannot be estimated")
abnormal_text <-
c("left ventricular systolic function is now depressed",
"left ventricular systolic function is depressed",
"LV systolic function appears depressed",
"depressed left ventricular.{0,10}function",
"depressed LV.{0,10}function"
)
These pattern vectors are combined into a single regular expression using | (alternation); str_c(..., collapse = "|") builds such a pattern:
"apple|orange|banana"
## [1] "apple|orange|banana"
strings_sample <- c("apple", "orange", "banana")
str_c(strings_sample, collapse = "|")
## [1] "apple|orange|banana"
df_note_echo_1 <-
df_note_echo |>
select(HADM_ID, TEXT, CHARTDATE, CATEGORY) |>
#1 replace line breaks with spaces so patterns can match across lines
mutate(TEXT = str_replace_all(TEXT, pattern = "\n", replacement = " ")) |>
mutate(
#2 extract snippets like "EF ... 60" or "ejection fraction ... 60"
LVEF_ex1 = str_extract(TEXT, "EF.{0,8}[0-9]{2}"),
LVEF_ex2 = str_extract(TEXT, "ejection fraction.{0,8}[0-9]{2}"),
#3 classify severity from the free-text expressions (checked from severe to normal)
EF_assess = case_when(
str_detect(TEXT, regex(str_c(severe_text, collapse = "|"),ignore_case = T)) ~ "severe",
str_detect(TEXT, regex(str_c(moderate_text, collapse = "|"),ignore_case = T)) ~ "moderate",
str_detect(TEXT, regex(str_c(mild_text, collapse = "|"),ignore_case = T)) ~ "mild",
str_detect(TEXT, regex(str_c(normal_text, collapse = "|"),ignore_case = T)) ~ "normal",
# str_detect(TEXT, regex(str_c(difficult_text, collapse = "|"),ignore_case = T)) ~ "difficult",
str_detect(TEXT,regex(str_c(abnormal_text, collapse = "|"),ignore_case = T)) ~ "mild"
)) |> select(!TEXT)
df_note_echo_1
Extract only the numeric part of the LVEF.
Create a category column from the numeric value.
The text-based category takes priority; the numeric value is used only when no text expression was found. This priority is also not stated in the paper.
df_note_echo_assess <-
df_note_echo_1 |>
# convert the extracted text to a number before comparing against the cutoffs below
mutate(LVEF_num = as.numeric(case_when(
!is.na(LVEF_ex1) ~ str_extract(LVEF_ex1, pattern = "[0-9]{2}"),
!is.na(LVEF_ex2) ~ str_extract(LVEF_ex2, pattern = "[0-9]{2}")))
) |>
mutate(LVEF_num_assess = case_when(
LVEF_num >= 55 ~ "normal",
LVEF_num >= 45 ~ "mild",
LVEF_num >= 30 ~ "moderate",
LVEF_num >= 0 ~ "severe"
)) |>
mutate(LVEF_ctg = case_when(
!is.na(EF_assess) ~ EF_assess,
!is.na(LVEF_num_assess) ~ LVEF_num_assess
)) |>
arrange(CHARTDATE) |>
select(HADM_ID, CHARTDATE, LVEF_ctg) |>
filter(!is.na(LVEF_ctg)) |>
distinct(HADM_ID, .keep_all = T)
df_note_echo_assess
A plain data frame cannot be joined with the tables inside the mimic.db file, so write it back into the .db.
dbRemoveTable() is unnecessary on the first run, but from the second run onward the existing db_LVEF_assess must be deleted first, hence the if() guard.
if(dbExistsTable(con_mimic, "db_LVEF_assess")) {
dbRemoveTable(con_mimic, "db_LVEF_assess")
}
dbWriteTable(con_mimic, "db_LVEF_assess", df_note_echo_assess)
db_LVEF_assess <- tbl(con_mimic, "db_LVEF_assess")
The rest is as usual.
db_PEO_LVEF <-
db_PEO |>
left_join(db_LVEF_assess, by = "HADM_ID")
db_PEO_LVEF <- db_PEO_LVEF |> compute()
Missingness stayed below 15%.
cols_ctg <- c("LVEF_ctg")
db_PEO_LVEF |> select(cols_ctg, "h2ras")|> collect() |>
mutate(LVEF_ctg = factor(LVEF_ctg,
levels = c("severe", "moderate", "mild", "normal"))) |>
CreateTableOne(vars = cols_ctg, strata = "h2ras", factorVars = cols_ctg) -> tableone
print(tableone, smd = TRUE, missing = TRUE) #|> write.csv("test.csv")
## Stratified by h2ras
## 0 1 p test SMD Missing
## n 5949 4441
## LVEF_ctg (%) <0.001 0.097 14.7
## severe 1289 (25.8) 850 (22.0)
## moderate 936 (18.7) 708 (18.3)
## mild 736 (14.7) 638 (16.5)
## normal 2041 (40.8) 1663 (43.1)
db_LVEF <-
db_PEO_LVEF |>
select("HADM_ID", cols_ctg)
Write to the database.
if(dbExistsTable(con_mimic, "db_LVEF")) {
dbRemoveTable(con_mimic, "db_LVEF")
}
dbWriteTable(con_mimic,
"db_LVEF",
db_LVEF %>% collect(),
overwrite = TRUE)
# dbDisconnect(con_mimic, shutdown=TRUE)
Perform the following extractions. As before, create files restricted to HADM_ID and the target variables (a starting sketch follows below).
1. Laboratory data (BUN, Cr, Na, Ca, Mg, Glu, WBC, RBC, Plt)
2. Medications (RAAS inhibitors, diuretics, inotropic agents, adrenergic receptor antagonists, calcium antagonists, antiplatelets, PPIs)
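For reference, a minimal starting sketch for both parts. The lab column names (bun, creatinine, sodium, wbc, platelet), the key column of pivot_labo, the DRUG column of prescriptions, and the drug-name pattern are assumptions for illustration; check the actual names with colnames() / distinct() and adjust.
# Sketch: first lab value per icustay_id (column names are assumptions; check with colnames())
db_labo <- tbl(con_mimic, "pivot_labo")
# db_labo |> colnames()
db_lab_first <-
db_labo |>
filter(icustay_id %in% icustay_id_vec) |>
select(icustay_id, charttime, bun, creatinine, sodium, wbc, platelet) |>
arrange(icustay_id, charttime) |>
distinct(icustay_id, .keep_all = TRUE) |>
select(!charttime) |>
compute()
# Sketch: medication flag from prescriptions (drug-name pattern is illustrative, not exhaustive)
db_presc <- tbl(con_mimic, "prescriptions")
df_presc_target <-
db_presc |>
filter(HADM_ID %in% hadm_id_vec) |>
select(HADM_ID, DRUG) |>
collect()
ppi_hadm_id <-
df_presc_target |>
filter(str_detect(DRUG, regex("omeprazole|pantoprazole|lansoprazole|esomeprazole", ignore_case = TRUE))) |>
distinct(HADM_ID) |>
pull(HADM_ID)
db_PEO_ppi <-
db_PEO |>
mutate(ppi = ifelse(HADM_ID %in% ppi_hadm_id, 1, 0))
As with the vitals, join the results back to db_PEO, keep only HADM_ID plus the new variables, and write them to mimic.db with dbWriteTable().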