(前回までの資料も記載していますが、コードは最低限にしています。復習は前回の資料を参照してください。)
pacman::p_load(tidyverse, lubridate, tableone, skimr,
duckdb, arrow, RSQLite)
FドライブのDatabaseフォルダ内のmimic.dbにアクセスする場合は以下のコード(各自の状況に合わせて適宜修正)。 (mimic.dbが無い場合は以下の操作で空のmimic.dbが作成される)
con_mimic <- dbConnect(duckdb::duckdb(), "F:/Database/mimic.db")
mimic.dbの中に入っているデータリスト
(最初のうちは少数のはず。全ての講義終了後は50以上リストアップされる)
dbListTables(con_mimic)
## [1] "admissions" "apsiii"
## [3] "callout" "caregivers"
## [5] "chartevents" "comorbidities_AHRQ"
## [7] "comorbidities_Quan" "cptevents"
## [9] "d_cpt" "d_icd_diagnoses"
## [11] "d_icd_procedures" "d_items"
## [13] "d_labitems" "datetimeevents"
## [15] "db_HtWt" "db_LVEF"
## [17] "db_LVEF_assess" "db_PEO"
## [19] "db_comorb" "db_ethnicity"
## [21] "db_patients" "db_rrt"
## [23] "db_sofa" "db_urine"
## [25] "db_venti" "db_vital"
## [27] "df_lab_list" "diagnoses_icd"
## [29] "drgcodes" "height_firstday"
## [31] "height_weight" "icustay_details"
## [33] "icustays" "inputevents_cv"
## [35] "inputevents_mv" "labevents"
## [37] "microbiologyevents" "noteevents"
## [39] "outputevents" "patients"
## [41] "pivot_RRT" "pivot_height"
## [43] "pivot_labo" "pivot_urine_output"
## [45] "pivot_vitals" "prescriptions"
## [47] "procedureevents_mv" "procedures_icd"
## [49] "services" "sofa"
## [51] "test" "transfers"
## [53] "urine_output" "urine_output_firstday"
## [55] "ventilation_classification" "ventilation_duration"
## [57] "weight_duration" "weight_firstday"
削除
# con_mimic %>% dbRemoveTable("caregivers")
最後に接続解除する時
# dbDisconnect(con_mimic, shutdown=TRUE)
省略
3.ADMISSIONS.csv.gzで全行数とユニーク数が同じ列は?
db_ad <- tbl(con_mimic, "admissions")
db_ad |>
mutate_if(is.numeric, as.character) |>
skimr::skim()
| Name | mutate_if(db_ad, is.numer… |
| Number of rows | 58976 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 14 |
| POSIXct | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ROW_ID | 0 | 1.00 | 3 | 7 | 0 | 58976 | 0 |
| SUBJECT_ID | 0 | 1.00 | 3 | 7 | 0 | 46520 | 0 |
| HADM_ID | 0 | 1.00 | 8 | 8 | 0 | 58976 | 0 |
| ADMISSION_TYPE | 0 | 1.00 | 6 | 9 | 0 | 4 | 0 |
| ADMISSION_LOCATION | 0 | 1.00 | 17 | 25 | 0 | 9 | 0 |
| DISCHARGE_LOCATION | 0 | 1.00 | 3 | 25 | 0 | 17 | 0 |
| INSURANCE | 0 | 1.00 | 7 | 10 | 0 | 5 | 0 |
| LANGUAGE | 25332 | 0.57 | 4 | 4 | 0 | 75 | 0 |
| RELIGION | 458 | 0.99 | 5 | 22 | 0 | 20 | 0 |
| MARITAL_STATUS | 10128 | 0.83 | 6 | 17 | 0 | 7 | 0 |
| ETHNICITY | 0 | 1.00 | 5 | 56 | 0 | 41 | 0 |
| DIAGNOSIS | 25 | 1.00 | 2 | 190 | 0 | 15646 | 0 |
| HOSPITAL_EXPIRE_FLAG | 0 | 1.00 | 3 | 3 | 0 | 2 | 0 |
| HAS_CHARTEVENTS_DATA | 0 | 1.00 | 3 | 3 | 0 | 2 | 0 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ADMITTIME | 0 | 1.00 | 2100-06-07 19:59:00 | 2210-08-17 17:13:00 | 2151-01-15 09:39:00 | 58651 |
| DISCHTIME | 0 | 1.00 | 2100-06-09 17:09:00 | 2210-08-24 19:43:00 | 2151-01-29 16:49:30 | 58657 |
| DEATHTIME | 53122 | 0.10 | 2100-06-19 08:15:00 | 2208-02-05 11:45:00 | 2150-09-15 07:08:00 | 5834 |
| EDREGTIME | 28099 | 0.52 | 2100-06-07 13:14:00 | 2210-08-17 10:14:00 | 2150-12-12 00:32:00 | 30874 |
| EDOUTTIME | 28099 | 0.52 | 2100-06-08 00:06:00 | 2210-08-17 18:25:00 | 2150-12-12 03:28:00 | 30864 |
1列ずつcount()して行ってもいいですし、まとめてforループで処理しても良い。
db_ad %>%
distinct(HADM_ID) %>%
count()
cols_ad <-
db_ad |>
head(1) |>
collect() |>
select(!ends_with("TIME"))|>
colnames()
cols_count <-
data.frame(name = cols_ad,
count = NA)
for (i in seq_along(cols_ad)) {
cols_count[i,2] <-
db_ad |>
select(cols_ad[i]) |>
filter(!is.na(.data[[cols_ad[i]]])) |>
distinct() |>
count() |>
pull()
}
cols_count
4.DIAGNOSES_ICD.csv.gzとD_ICD_DIAGNOSES.csv.gzで共通する列は? ICD9_CODEは共通(ROW_IDはどちらにもあるが、各ファイルで個別のもの)
db_diag_ICD <- tbl(con_mimic, "diagnoses_icd")
db_d_ICD_diag <- tbl(con_mimic, "d_icd_diagnoses")
db_diag_ICD |> colnames()
## [1] "ROW_ID" "SUBJECT_ID" "HADM_ID" "SEQ_NUM" "ICD9_CODE"
db_d_ICD_diag |> colnames()
## [1] "ROW_ID" "ICD9_CODE" "SHORT_TITLE" "LONG_TITLE"
db_pt <- tbl(con_mimic, "patients")
db_pt %>% colnames()
## [1] "ROW_ID" "SUBJECT_ID" "GENDER" "DOB" "DOD"
## [6] "DOD_HOSP" "DOD_SSN" "EXPIRE_FLAG"
db_pt %>% count()
以下のコードでは回帰年を考慮して日数差分を365.242で割って年数に変換している。
https://ja.wikipedia.org/wiki/%E5%A4%AA%E9%99%BD%E5%B9%B4
より正確に計算するコード(以下のage_exact)も書けるが、ほとんど変わらないのでどちらでも良い。
db_pt_age <-
db_pt |>
left_join(db_ad, by = "SUBJECT_ID") |>
mutate(age = day(ADMITTIME - DOB) / 365.242) %>%
mutate(age_exact = year(ADMITTIME) - year(DOB)
- ifelse(month(ADMITTIME) < month(DOB) |
(month(ADMITTIME) == month(DOB) &
day(ADMITTIME) < day(DOB)), 1, 0))
db_pt_age %>%
select(ADMITTIME, DOB, age, age_exact) %>%
head(10)
db_pt_age |>
dbplyr::window_order(ADMITTIME) |>
distinct(SUBJECT_ID, .keep_all = T) |>
# filter(age < 1) |>
ggplot() +
geom_histogram(aes(x = age), color= "black", binwidth = 10) +
# coord_cartesian(xlim = c(0, 0.05)) +
theme_bw()
# db_pt_age |>
# filter(age < 1) |>
# count(EXPIRE_FLAG)
#
# # pdf("Normal_vs_other.pdf", height = 6, width = 12)
#
# db_pt_age |>
# filter(age < 1) |>
# mutate(normal_deli = if_else(ADMISSION_LOCATION=="PHYS REFERRAL/NORMAL DELI", "PHYS REFERRAL/NORMAL DELI", "Other")) %>%
# mutate(hosp_death = if_else(HOSPITAL_EXPIRE_FLAG==1, "death", "alive")) %>%
# mutate(ent = day(DISCHTIME - ADMITTIME)) |>
# ggplot() +
# geom_histogram(aes(x = ent), color="black", binwidth = 1, show.legend = F) +
# geom_vline(xintercept = 7, lty = 2) +
# facet_wrap(~ normal_deli) +
# theme_bw() +
# xlab("days") +
# labs(title = "Days until discharge") +
# # coord_cartesian(ylim = c(0, 50)) +
# theme(text = element_text(size = 18))
#
# # dev.off()
db_pt_age |>
mutate(age_70 = ifelse(age >= 70, "over70", "under70")) |>
group_by(GENDER, age_70) |>
tally(EXPIRE_FLAG) |>
# count(GENDER, age_70, EXPIRE_FLAG) %>%
# filter(EXPIRE_FLAG ==1) %>%
pivot_wider(id_cols = c(GENDER), names_from = age_70, values_from = n)
table <-
db_pt_age |>
mutate(age_70 = ifelse(age >= 70, "over70", "under70")) |>
select(GENDER, age_70, EXPIRE_FLAG) %>%
collect() |>
CreateTableOne(vars = "EXPIRE_FLAG",
strata = c("GENDER", "age_70"), factorVars = "EXPIRE_FLAG")
print(table, test = F) |> as.data.frame()
## Stratified by GENDER:age_70
## F:over70 M:over70 F:under70 M:under70
## n 10224 10476 15802 22474
## EXPIRE_FLAG = 1 (%) 6221 (60.8) 6391 (61.0) 4015 (25.4) 5959 (26.5)
# pryr::object_size(table)
#——————————-
Huang Y-H, Cai W-K, Yin S-J, Wang P, Li Z-R, Yang Q, et al. Histamine H2 receptor antagonist exposure was related to decreased all-cause mortality in critical ill patients with heart failure: a cohort study. Eur J Prev Cardiol. 2022;29: 1854–1865. doi:10.1093/eurjpc/zwac122
“diagnoses_icd”
“d_icd_diagnoses”
# dbListTables(con_mimic)
db_ad <- tbl(con_mimic, "admissions")
db_ad |> colnames()
## [1] "ROW_ID" "SUBJECT_ID" "HADM_ID"
## [4] "ADMITTIME" "DISCHTIME" "DEATHTIME"
## [7] "ADMISSION_TYPE" "ADMISSION_LOCATION" "DISCHARGE_LOCATION"
## [10] "INSURANCE" "LANGUAGE" "RELIGION"
## [13] "MARITAL_STATUS" "ETHNICITY" "EDREGTIME"
## [16] "EDOUTTIME" "DIAGNOSIS" "HOSPITAL_EXPIRE_FLAG"
## [19] "HAS_CHARTEVENTS_DATA"
db_ad |> count()
## # Source: SQL [?? x 1]
## # Database: DuckDB v0.10.1 [mihar@Windows 10 x64:R 4.5.1/F:\Database\mimic.db]
## n
## <dbl>
## 1 58976
db_pt <- tbl(con_mimic, "patients")
db_pt |> colnames()
## [1] "ROW_ID" "SUBJECT_ID" "GENDER" "DOB" "DOD"
## [6] "DOD_HOSP" "DOD_SSN" "EXPIRE_FLAG"
db_pt |> count()
## # Source: SQL [?? x 1]
## # Database: DuckDB v0.10.1 [mihar@Windows 10 x64:R 4.5.1/F:\Database\mimic.db]
## n
## <dbl>
## 1 46520
db_diag <- tbl(con_mimic, "diagnoses_icd")
db_diag |> colnames()
## [1] "ROW_ID" "SUBJECT_ID" "HADM_ID" "SEQ_NUM" "ICD9_CODE"
db_diag |> count()
## # Source: SQL [?? x 1]
## # Database: DuckDB v0.10.1 [mihar@Windows 10 x64:R 4.5.1/F:\Database\mimic.db]
## n
## <dbl>
## 1 651047
db_diag |> head(10)
## # Source: SQL [?? x 5]
## # Database: DuckDB v0.10.1 [mihar@Windows 10 x64:R 4.5.1/F:\Database\mimic.db]
## ROW_ID SUBJECT_ID HADM_ID SEQ_NUM ICD9_CODE
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1297 109 172335 1 40301
## 2 1298 109 172335 2 486
## 3 1299 109 172335 3 58281
## 4 1300 109 172335 4 5855
## 5 1301 109 172335 5 4254
## 6 1302 109 172335 6 2762
## 7 1303 109 172335 7 7100
## 8 1304 109 172335 8 2767
## 9 1305 109 172335 9 7243
## 10 1306 109 172335 10 45829
db_d_icd_diag <- tbl(con_mimic, "d_icd_diagnoses")
db_d_icd_diag |> colnames()
## [1] "ROW_ID" "ICD9_CODE" "SHORT_TITLE" "LONG_TITLE"
db_d_icd_diag |> count()
## # Source: SQL [?? x 1]
## # Database: DuckDB v0.10.1 [mihar@Windows 10 x64:R 4.5.1/F:\Database\mimic.db]
## n
## <dbl>
## 1 14567
db_d_icd_diag |> head(10)
## # Source: SQL [?? x 4]
## # Database: DuckDB v0.10.1 [mihar@Windows 10 x64:R 4.5.1/F:\Database\mimic.db]
## ROW_ID ICD9_CODE SHORT_TITLE LONG_TITLE
## <dbl> <chr> <chr> <chr>
## 1 174 01166 TB pneumonia-oth test Tuberculous pneumonia [any form], …
## 2 175 01170 TB pneumothorax-unspec Tuberculous pneumothorax, unspecif…
## 3 176 01171 TB pneumothorax-no exam Tuberculous pneumothorax, bacterio…
## 4 177 01172 TB pneumothorx-exam unkn Tuberculous pneumothorax, bacterio…
## 5 178 01173 TB pneumothorax-micro dx Tuberculous pneumothorax, tubercle…
## 6 179 01174 TB pneumothorax-cult dx Tuberculous pneumothorax, tubercle…
## 7 180 01175 TB pneumothorax-histo dx Tuberculous pneumothorax, tubercle…
## 8 181 01176 TB pneumothorax-oth test Tuberculous pneumothorax, tubercle…
## 9 182 01180 Pulmonary TB NEC-unspec Other specified pulmonary tubercul…
## 10 183 01181 Pulmonary TB NEC-no exam Other specified pulmonary tubercul…
14567行しかないので、collect()して操作する。文字列検索のためのstringrパッケージが使える。
df_d_icd_diag <- collect(db_d_icd_diag)
# df_d_icd_diag |> View()
icd_hf <-
df_d_icd_diag |>
filter(str_detect(LONG_TITLE, regex("heart failure", ignore_case = T))) |>
filter(!str_detect(LONG_TITLE, regex("without heart failure", ignore_case = T))) |>
select(ICD9_CODE) |>
pull()
icd_hf
## [1] "39891" "40201" "40211" "40291" "40401" "40403" "40411" "40413" "40491"
## [10] "40493" "4280" "4281" "42820" "42821" "42822" "42823" "42830" "42831"
## [19] "42832" "42833" "42840" "42841" "42842" "42843" "4289"
再現性の配慮のために論文上でも使用したICDコードくらいは記載されているのが普通。
論文のICDと全く同じ事を以下で確認する。
icd_hf_from_article <-
c(4280, 4281, 4289, 39891, 40201, 40211, 40291, 40401, 40403, 40411, 40413, 40491,
40493, 42820, 42821, 42822, 42823, 42830, 42831, 42832, 42833, 42840, 42841, 42842, 42843)
icd_hf %in% icd_hf_from_article |> as.character()
## [1] "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE"
## [11] "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE" "TRUE"
## [21] "TRUE" "TRUE" "TRUE" "TRUE" "TRUE"
length(icd_hf)
## [1] 25
length(icd_hf_from_article)
## [1] 25
db_diagの中から心不全ICD9コードをつけられているHADM_IDを同定する
db_diag_hf <-
db_diag |>
filter(ICD9_CODE %in% icd_hf)
db_diag_hf |> count()
## # Source: SQL [?? x 1]
## # Database: DuckDB v0.10.1 [mihar@Windows 10 x64:R 4.5.1/F:\Database\mimic.db]
## n
## <dbl>
## 1 21274
db_diag_hf |> head(10)
## # Source: SQL [?? x 5]
## # Database: DuckDB v0.10.1 [mihar@Windows 10 x64:R 4.5.1/F:\Database\mimic.db]
## ROW_ID SUBJECT_ID HADM_ID SEQ_NUM ICD9_CODE
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 136523 12098 166746 5 4280
## 2 136571 12104 126725 3 42831
## 3 136576 12104 131802 2 42832
## 4 136577 12104 131802 3 4280
## 5 138005 12232 182524 2 4280
## 6 138015 12233 147721 3 42833
## 7 138022 12233 147721 10 42832
## 8 138035 12233 147721 23 4280
## 9 138049 12233 199265 4 4280
## 10 131452 11702 171625 1 42833
ユニークなHADM_ID数は14040 distinct()は重複判定するキー項目(群)を与えると、キー項目が重複するデータは初出を残し二回目以降を削除する。引数の.keep_all = TRUE とすると、キーでない項目も含めて全項目が戻り値となる。
db_diag_hf |>
distinct(HADM_ID) |>
count()
skimr::skim()を使って全ての列のユニーク数をカウントするのは結構便利
数値型をまとめて文字型にしておく必要がある。
db_diag_hf |>
mutate_if(is.numeric, as.character) |>
skim()
| Name | mutate_if(db_diag_hf, is…. |
| Number of rows | 21274 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ROW_ID | 0 | 1 | 3 | 8 | 0 | 21274 | 0 |
| SUBJECT_ID | 0 | 1 | 3 | 7 | 0 | 10436 | 0 |
| HADM_ID | 0 | 1 | 8 | 8 | 0 | 14040 | 0 |
| SEQ_NUM | 0 | 1 | 3 | 4 | 0 | 36 | 0 |
| ICD9_CODE | 0 | 1 | 4 | 5 | 0 | 25 | 0 |
この患者10436人のうち実はICU入室していない患者が含まれているので、除外する必要がある。 まずICU入室記録からICU入室歴のある患者を同定して、それに合致する心不全患者に限定する
db_icustay <- tbl(con_mimic, "icustays")
db_icustay |> colnames()
## [1] "ROW_ID" "SUBJECT_ID" "HADM_ID" "ICUSTAY_ID"
## [5] "DBSOURCE" "FIRST_CAREUNIT" "LAST_CAREUNIT" "FIRST_WARDID"
## [9] "LAST_WARDID" "INTIME" "OUTTIME" "LOS"
db_icustay |> count()
db_icustay |> head(5)
ICU入室歴のあるHADM_IDのベクトル取得 後でICUSTAY_IDをjoinするので、ICUSTAY_IDとHADM_IDと対応させたデータフレームを作り、そこからHADM_IDやICUSTAY_IDのベクトルを取得した。(複数ICU入室がある場合何回目の入室情報を論文で使ったのかは未記載だが、最初のICU入室とした)
db_icustay_hadm <-
db_icustay |>
distinct(HADM_ID, .keep_all = T) |>
select(HADM_ID, ICUSTAY_ID)
hadm_icu_vec <-
db_icustay_hadm |>
select(HADM_ID) |>
pull()
icustay_id_vec <-
db_icustay_hadm |>
select(ICUSTAY_ID) |>
pull()
ICU入室歴のある心不全患者 SUBJECT_IDは10405例に限定される - 論文では10402となっているので若干異なるが、許容範囲。
db_diag_hf_icu <-
db_diag_hf |>
filter(HADM_ID %in% hadm_icu_vec)
db_diag_hf_icu |>
mutate_if(is.numeric, as.character) |>
skim()
| Name | mutate_if(db_diag_hf_icu,… |
| Number of rows | 21061 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ROW_ID | 0 | 1 | 3 | 8 | 0 | 21061 | 0 |
| SUBJECT_ID | 0 | 1 | 3 | 7 | 0 | 10405 | 0 |
| HADM_ID | 0 | 1 | 8 | 8 | 0 | 13874 | 0 |
| SEQ_NUM | 0 | 1 | 3 | 4 | 0 | 36 | 0 |
| ICD9_CODE | 0 | 1 | 4 | 5 | 0 | 25 | 0 |
心不全&ICU入室した入院ID(HADM_ID)をベクトルで取得しておく
hadm_hf <-
db_diag_hf_icu |>
distinct(HADM_ID) |>
pull(HADM_ID)
length(hadm_hf)
## [1] 13874
“patients”に誕生日(DOB),
“admissions”に入院日(ADMITTIME)が入っている。
差分を取って年齢を算出する。
情報量の多いadmissions, patientsデータを統合したうえで、
1.18歳以上に絞り、
2.上記で取得した心不全&ICU入室した入院IDに限定する(hadm_hf)
3.複数回入院した患者は初回入院に限定する(と論文に記載があった)
db_ad_pt_age <-
db_ad %>%
left_join(db_pt, by = "SUBJECT_ID") %>%
mutate(age_day = day(ADMITTIME - DOB),
age = age_day/365.242)
db_ad_pt_age |> head(10)
db_ad_pt_age18 <-
db_ad_pt_age |>
filter(age >= 18) |>
filter(HADM_ID %in% hadm_hf) |>
arrange(SUBJECT_ID, ADMITTIME) |> #初回入院に限定するため
compute() |>
distinct(SUBJECT_ID, .keep_all = T)
db_ad_pt_age18 |>
mutate_if(is.numeric, as.character) |>
skim()
| Name | mutate_if(db_ad_pt_age18,… |
| Number of rows | 10390 |
| Number of columns | 28 |
| _______________________ | |
| Column type frequency: | |
| character | 19 |
| POSIXct | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ROW_ID.x | 0 | 1.00 | 3 | 7 | 0 | 10390 | 0 |
| SUBJECT_ID | 0 | 1.00 | 3 | 7 | 0 | 10390 | 0 |
| HADM_ID | 0 | 1.00 | 8 | 8 | 0 | 10390 | 0 |
| ADMISSION_TYPE | 0 | 1.00 | 6 | 9 | 0 | 3 | 0 |
| ADMISSION_LOCATION | 0 | 1.00 | 20 | 25 | 0 | 7 | 0 |
| DISCHARGE_LOCATION | 0 | 1.00 | 3 | 25 | 0 | 16 | 0 |
| INSURANCE | 0 | 1.00 | 7 | 10 | 0 | 5 | 0 |
| LANGUAGE | 4282 | 0.59 | 4 | 4 | 0 | 53 | 0 |
| RELIGION | 77 | 0.99 | 5 | 22 | 0 | 19 | 0 |
| MARITAL_STATUS | 439 | 0.96 | 6 | 17 | 0 | 7 | 0 |
| ETHNICITY | 0 | 1.00 | 5 | 56 | 0 | 35 | 0 |
| DIAGNOSIS | 1 | 1.00 | 2 | 190 | 0 | 4210 | 0 |
| HOSPITAL_EXPIRE_FLAG | 0 | 1.00 | 3 | 3 | 0 | 2 | 0 |
| HAS_CHARTEVENTS_DATA | 0 | 1.00 | 3 | 3 | 0 | 2 | 0 |
| ROW_ID.y | 0 | 1.00 | 3 | 7 | 0 | 10390 | 0 |
| GENDER | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| EXPIRE_FLAG | 0 | 1.00 | 3 | 3 | 0 | 2 | 0 |
| age_day | 0 | 1.00 | 4 | 6 | 0 | 7071 | 0 |
| age | 0 | 1.00 | 13 | 18 | 0 | 7071 | 0 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ADMITTIME | 0 | 1.00 | 2100-06-28 19:29:00 | 2208-05-27 02:33:00 | 2151-05-06 06:06:00 | 10381 |
| DISCHTIME | 0 | 1.00 | 2100-07-07 18:05:00 | 2208-06-06 14:20:00 | 2151-05-18 04:10:30 | 10379 |
| DEATHTIME | 8837 | 0.15 | 2100-07-21 10:34:00 | 2205-03-12 13:13:00 | 2150-01-05 02:00:00 | 1553 |
| EDREGTIME | 4578 | 0.56 | 2100-07-02 14:07:00 | 2208-05-27 00:40:00 | 2151-01-20 03:33:00 | 5812 |
| EDOUTTIME | 4578 | 0.56 | 2100-07-03 00:18:00 | 2208-05-27 04:01:00 | 2151-01-20 07:00:30 | 5812 |
| DOB | 0 | 1.00 | 1800-07-02 00:00:00 | 2183-07-09 00:00:00 | 2074-12-01 00:00:00 | 9448 |
| DOD | 4223 | 0.59 | 2100-07-21 00:00:00 | 2211-06-10 00:00:00 | 2152-10-30 00:00:00 | 5762 |
| DOD_HOSP | 6498 | 0.37 | 2100-07-21 00:00:00 | 2208-02-07 00:00:00 | 2151-07-19 12:00:00 | 3722 |
| DOD_SSN | 5033 | 0.52 | 2100-09-13 00:00:00 | 2211-06-10 00:00:00 | 2152-11-27 00:00:00 | 5055 |
年齢の分布異常: 300歳が多数存在する
データベース上で89歳以上の患者はHIPAAに準拠するために生年月日をシフトしているため。
シフトの方法:生年月日を最初の入院からちょうど300年前に設定した。
よって、これらのデータは89歳以上は確定だが、どう扱うか論文未記載。
ここでは暫定的に年齢を95歳で補完する。
db_ad_pt_age18 |>
filter(age >150) |>
summarize(count = n())
db_ad_pt_age18 |> select(age, GENDER) |>
ggplot() +
geom_histogram(aes(x = age), color = "black", alpha = 0.8) +
theme_bw()
db_ad_pt_age18_mod <-
db_ad_pt_age18 |>
mutate(age = if_else(age > 100, 95, age))
db_ad_pt_age18_mod |>
select(age, GENDER) |>
ggplot() +
geom_histogram(aes(x = age), color = "black", alpha = 0.8) +
theme_bw()
データは必要なものに限定する(保険とか人種とか含め)
(ICUSTAY_IDも必要なので手動で追加) (dischday, deathday,
hosp_deathday:後から計算する)
# db_ad_pt_age18_mod |> colnames() |> dput()
cols <- c("SUBJECT_ID", "HADM_ID", "ICUSTAY_ID", "age", "ADMISSION_TYPE",
"INSURANCE", "ETHNICITY", "DIAGNOSIS",
"HOSPITAL_EXPIRE_FLAG", "GENDER", "EXPIRE_FLAG",
"DOB", "ADMITTIME", "DISCHTIME", "DOD", "DOD_HOSP")
後から必要になるのでICUSTAY_IDを紐づける。
<ここまでの集大成をdb_patientsとする> データベース上に書き込んでおけば、次回呼び出すことができる。
dbListTables(con_mimic)
## [1] "admissions" "apsiii"
## [3] "callout" "caregivers"
## [5] "chartevents" "comorbidities_AHRQ"
## [7] "comorbidities_Quan" "cptevents"
## [9] "d_cpt" "d_icd_diagnoses"
## [11] "d_icd_procedures" "d_items"
## [13] "d_labitems" "datetimeevents"
## [15] "db_HtWt" "db_LVEF"
## [17] "db_LVEF_assess" "db_PEO"
## [19] "db_comorb" "db_ethnicity"
## [21] "db_patients" "db_rrt"
## [23] "db_sofa" "db_urine"
## [25] "db_venti" "db_vital"
## [27] "dbplyr_dtBYaEiOR3" "df_lab_list"
## [29] "diagnoses_icd" "drgcodes"
## [31] "height_firstday" "height_weight"
## [33] "icustay_details" "icustays"
## [35] "inputevents_cv" "inputevents_mv"
## [37] "labevents" "microbiologyevents"
## [39] "noteevents" "outputevents"
## [41] "patients" "pivot_RRT"
## [43] "pivot_height" "pivot_labo"
## [45] "pivot_urine_output" "pivot_vitals"
## [47] "prescriptions" "procedureevents_mv"
## [49] "procedures_icd" "services"
## [51] "sofa" "test"
## [53] "transfers" "urine_output"
## [55] "urine_output_firstday" "ventilation_classification"
## [57] "ventilation_duration" "weight_duration"
## [59] "weight_firstday"
db_patients <-
db_ad_pt_age18_mod |>
left_join(db_icustay_hadm, by = "HADM_ID") |>
select(cols)
db_patients |> head(10) # %>% collect() %>% View()
db_patients |> count()
データベース上に書き込む
既にcon_mimicで接続しているデータベース内にdb_patientsがある場合には、既存のdb_patientsを削除してから新しくdb_patientsを作成するようにする。
if (dbExistsTable(con_mimic, "db_patients")) {
dbRemoveTable(con_mimic, "db_patients")
}
dbWriteTable(con_mimic,
"db_patients",
db_patients %>% collect(),
overwrite = TRUE)
# dbDisconnect(con_mimic, shutdown=TRUE)
Exposureについて:H2RAR使用したかどうかを示す列をdb_patientsに追加する。
ヒント:どのデータを利用するのか考えてください。str_detect()など覚えた関数は最大限利用しましょう。
薬剤使用群と非使用群がそれぞれ何名ずつかカウントし、論文の記載と近いことを確認しましょう。
Outcomeについて:入院日を起点として30日死亡したかどうかを示す列を上記にさらに追加する。
ヒント:NA処理が必要になるかも。
(多分簡単にできると思います。)