必要パッケージのDL
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(tableone)
library(ggplot2)
データの読み込み
df1<-read.csv("C:/Users/eishin/Desktop/dpc/DPC1.csv", fileEncoding = "CP932")
df2<-read.csv("C:/Users/eishin/Desktop/dpc/DPC2.csv", fileEncoding = "CP932")
統合のため、df1とdf2で重複する列名のみでdfを再構築
colnames(df1)
## [1] "adladm" "adldis" "admpath" "age"
## [5] "ambul" "brthw" "brthwk" "bunketu"
## [9] "dadm" "dcc1" "dcc2" "dcc3"
## [13] "dcc4" "dcin1" "dcin2" "dcin3"
## [17] "dcin4" "dislasts" "disto" "dmain"
## [21] "dres1" "dres2" "dth24h" "height"
## [25] "HospDmycd" "jcs" "jcsdis" "kacd"
## [29] "nyuseqno" "opean1" "opean2" "opean3"
## [33] "opean4" "opean5" "opek1" "opek2"
## [37] "opek3" "opek4" "opek5" "outcome"
## [41] "preg" "pregwkadm" "PtDmyCd" "readmkind"
## [45] "readmrea" "sex" "smokingidx" "urgency"
## [49] "weight" "マンハッタン距離" "ユークリッド距離" "手術1相対日"
## [53] "手術2相対日" "手術3相対日" "手術4相対日" "手術5相対日"
## [57] "特定機能" "入院期間" "年度"
colnames(df2)
## [1] "adladm" "adldis" "admpath" "age" "ambul"
## [6] "brthw" "brthwk" "bunketu" "dadm" "dcc1"
## [11] "dcc2" "dcc3" "dcc4" "dcin1" "dcin2"
## [16] "dcin3" "dcin4" "dislasts" "disto" "dmain"
## [21] "dres1" "dres2" "dth24h" "height" "HospDmycd"
## [26] "idx_nyuseqno" "jcs" "jcsdis" "kacd" "nyuseqno"
## [31] "opean1" "opean2" "opean3" "opean4" "opean5"
## [36] "opek1" "opek2" "opek3" "opek4" "opek5"
## [41] "outcome" "preg" "pregwkadm" "PtDmyCd" "readmkind"
## [46] "readmrea" "sex" "smokingidx" "urgency" "weight"
## [51] "手術1相対日" "手術2相対日" "手術3相対日" "手術4相対日" "手術5相対日"
## [56] "相対入院日" "入院期間"
df3<- df1[c("PtDmyCd", "age", "dadm", "dcc1", "dcc2", "dcc3", "dcc4", "dcin1", "dcin2", "dcin3", "dcin4", "opek1", "opek2", "opek3", "opek4", "opek5", "outcome", "brthw", "brthwk", "bunketu", "入院期間", "HospDmycd")]
df4<- df2[c("PtDmyCd", "age", "dadm", "dcc1", "dcc2", "dcc3", "dcc4", "dcin1", "dcin2", "dcin3", "dcin4", "opek1", "opek2", "opek3", "opek4", "opek5", "outcome", "brthw", "brthwk", "bunketu", "入院期間", "HospDmycd")]
df<-rbind(df3,df4)
skim(df)
Data summary
| Name |
df |
| Number of rows |
197215 |
| Number of columns |
22 |
| _______________________ |
|
| Column type frequency: |
|
| character |
14 |
| numeric |
8 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| dadm |
0 |
1 |
3 |
5 |
0 |
1307 |
0 |
| dcc1 |
0 |
1 |
0 |
5 |
54634 |
1192 |
0 |
| dcc2 |
0 |
1 |
0 |
5 |
90043 |
1040 |
0 |
| dcc3 |
0 |
1 |
0 |
5 |
122501 |
899 |
0 |
| dcc4 |
0 |
1 |
0 |
5 |
150278 |
771 |
0 |
| dcin1 |
0 |
1 |
0 |
5 |
51731 |
1613 |
0 |
| dcin2 |
0 |
1 |
0 |
5 |
102526 |
1422 |
0 |
| dcin3 |
0 |
1 |
0 |
5 |
141941 |
1171 |
0 |
| dcin4 |
0 |
1 |
0 |
5 |
167668 |
934 |
0 |
| opek1 |
0 |
1 |
0 |
7 |
52568 |
425 |
0 |
| opek2 |
0 |
1 |
0 |
7 |
173242 |
244 |
0 |
| opek3 |
0 |
1 |
0 |
7 |
193260 |
140 |
0 |
| opek4 |
0 |
1 |
0 |
7 |
196493 |
75 |
0 |
| opek5 |
0 |
1 |
0 |
7 |
197025 |
48 |
0 |
Variable type: numeric
| PtDmyCd |
0 |
1.00 |
13680573.12 |
15718578.85 |
155 |
2335435 |
4676216 |
26068062 |
49705857 |
▇▁▁▁▂ |
| age |
0 |
1.00 |
33.76 |
6.65 |
0 |
30 |
34 |
37 |
102 |
▁▇▁▁▁ |
| outcome |
0 |
1.00 |
1.90 |
2.46 |
1 |
1 |
1 |
1 |
9 |
▇▁▁▁▁ |
| brthw |
192315 |
0.02 |
3000.75 |
522.74 |
300 |
2726 |
3024 |
3328 |
4530 |
▁▁▃▇▁ |
| brthwk |
180298 |
0.09 |
37.11 |
4.49 |
0 |
37 |
38 |
39 |
53 |
▁▁▁▇▁ |
| bunketu |
20795 |
0.89 |
9891.95 |
27796.14 |
0 |
1010 |
1241 |
1735 |
99999 |
▇▁▁▁▁ |
| 入院期間 |
0 |
1.00 |
12.06 |
17.81 |
1 |
7 |
8 |
10 |
3631 |
▇▁▁▁▁ |
| HospDmycd |
0 |
1.00 |
899.86 |
524.44 |
6 |
429 |
934 |
1369 |
1837 |
▇▇▇▇▇ |
sapply(df,class)
## PtDmyCd age dadm dcc1 dcc2 dcc3
## "integer" "integer" "character" "character" "character" "character"
## dcc4 dcin1 dcin2 dcin3 dcin4 opek1
## "character" "character" "character" "character" "character" "character"
## opek2 opek3 opek4 opek5 outcome brthw
## "character" "character" "character" "character" "integer" "integer"
## brthwk bunketu 入院期間 HospDmycd
## "integer" "integer" "integer" "integer"
抽出定義にのっとり、df内から「産科出血患者」のみを抽出する。
条件①病名としてO679, O724, O721, O723のいずれかを含む行の抽出
grep関数で文字列の正規表現を抽出するが、“|”表現でgrepを結合するとエラーが生じる
面倒だが各病名列毎にコードを実行し、重複しないようrbindする
df_dadm<- df[grep("O679|O724|O721|O723", df$dadm),]
df_dcc1<-df[grep("O679|O724|O721|O723", df$dcc1),]
df_dcc2<-df[grep("O679|O724|O721|O723", df$dcc2),]
df_dcc3<-df[grep("O679|O724|O721|O723", df$dcc3),]
df_dcc4<-df[grep("O679|O724|O721|O723", df$dcc4),]
df_dcin1<-df[grep("O679|O724|O721|O723", df$dcin1),]
df_dcin2<-df[grep("O679|O724|O721|O723", df$dcin2),]
df_dcin3<-df[grep("O679|O724|O721|O723", df$dcin3),]
df_dcin4<-df[grep("O679|O724|O721|O723", df$dcin4),]
df_combined1 <- unique(rbind(df_dadm,df_dcc1,df_dcc2, df_dcc3,df_dcc4,df_dcin1,df_dcin2, df_dcin3,df_dcin4))
## 条件②分娩時出血量1000以上の症例のみを抽出する
df_combined2 <-filter(df, bunketu>=1000)
最後に、条件①OR条件②で新しいdfを作成
df<-unique(rbind(df_combined1, df_combined2))
連続変数の変換
df$age <- as.numeric(df$age)
df$bunketu <- as.numeric(df$bunketu)
df$入院期間 <- as.numeric(df$入院期間)
sapply(df, class)
## PtDmyCd age dadm dcc1 dcc2 dcc3
## "integer" "numeric" "character" "character" "character" "character"
## dcc4 dcin1 dcin2 dcin3 dcin4 opek1
## "character" "character" "character" "character" "character" "character"
## opek2 opek3 opek4 opek5 outcome brthw
## "character" "character" "character" "character" "integer" "integer"
## brthwk bunketu 入院期間 HospDmycd
## "integer" "numeric" "numeric" "integer"
連続変数の傾向確認
hist(df$age)

hist(df$bunketu)

hist(df$入院期間)

## age, bunketu, 入院期間に明らかな外れ値があるため、除外する。
df<- df[df$age <= 60 & df$bunketu <= 9000 & df$入院期間 <= 365, ]
hist(df$age)

skim(df$age)
Data summary
| Name |
df$age |
| Number of rows |
78426 |
| Number of columns |
1 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| data |
1491 |
0.98 |
33.5 |
5.31 |
0 |
30 |
34 |
37 |
56 |
▁▁▇▇▁ |
hist(df$bunketu)

skim(df$bunketu)
Data summary
| Name |
df$bunketu |
| Number of rows |
78426 |
| Number of columns |
1 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| data |
1491 |
0.98 |
1326.26 |
698.64 |
0 |
1010 |
1205 |
1561 |
9000 |
▇▂▁▁▁ |
hist(df$入院期間)

skim(df$入院期間)
Data summary
| Name |
df$入院期間 |
| Number of rows |
78426 |
| Number of columns |
1 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| data |
1491 |
0.98 |
12.56 |
13.52 |
1 |
7 |
8 |
10 |
244 |
▇▁▁▁▁ |
以上で、2020-2021年に分娩した「産科出血患者」全てを抽出したとする。
計78426例
unique_count <- length(unique(df$PtDmyCd))
unique_count
## [1] 76475
ユニーク数は76475例であり、約2000例が重複したIDからの症例
同一症例が2回以上の分娩、
いずれも産科出血に該当する可能性はあるため問題ないと判断した。
続いて、各変数の構成を表示する。
vars<-c("age", "outcome", "brthw", "brthwk", "bunketu", "入院期間")
table <- CreateTableOne(vars = vars,
includeNA = T,
addOverall = T,
smd = T,
data = df)
table
##
## Overall
## n 78426
## age (mean (SD)) 33.50 (5.31)
## outcome (mean (SD)) 1.90 (2.50)
## brthw (mean (SD)) 3004.69 (517.85)
## brthwk (mean (SD)) 37.22 (4.15)
## bunketu (mean (SD)) 1326.26 (698.64)
## 入院期間 (mean (SD)) 12.56 (13.52)
outcomeの内訳
outcome_freq <- table(df$outcome)
outcome_freq
##
## 1 3 4 5 6 7 9
## 67898 74 510 85 10 2 8356
## outcome=6,7が死亡症例であり、total 12人という結果に
## そのうちわけを確認してみる。
df_death<- filter(df, outcome %in% c(6, 7))
df_death
## PtDmyCd age dadm dcc1 dcc2 dcc3 dcc4 dcin1 dcin2 dcin3 dcin4 opek1
## 5174 41011392 33 O721 D696 I469 D65 K6021
## 56298 3819969 31 O721 D62 D695 K626 I639 O450 O723 O142 N179 K6151
## 81275 930521 37 O721 D65 U071
## 16200 5280120 32 Z349 O814 O721 D65 K877
## 56424 32299658 42 O600 R571 N179 R579 J9609 O300 O721 I460 O723 K8981
## 1496 3387145 37 O459 I469 I710 I319 O459 K810 J980 D65 K8981
## 920410 5340825 30 O459 O904 D696 D62 O600 R571 O881 I460 K904
## 1170610 3103211 35 Q282 Z349 K1492
## 263791 4522329 36 O600 J9609 D696 C509 D649 O342 O600 K8981
## 340091 16387579 38 O300 R21 O411 O996 O908 Z33 U071 O600 O342 K8982
## 44962 1396216 28 I469 I469 Z33 K386-2
## 52540 3716427 44 O142 D65 T796 S3611 S3610 O142 I460 O364 R571 K8981
## opek2 opek3 opek4 opek5 outcome brthw brthwk bunketu 入院期間
## 5174 K6022 6 NA NA 4093 2
## 56298 K386 K0002 K909-2 6 NA NA 3100 97
## 81275 6 NA NA 5350 1
## 16200 K893 7 NA NA 4908 1
## 56424 K6151 K901 K876 K6021 6 NA NA 2537 71
## 1496 7 NA 37 1208 3
## 920410 K545 6 NA NA 1200 37
## 1170610 K8981 K145 6 NA NA 1900 16
## 263791 6 NA 28 2350 8
## 340091 6 NA NA 2050 96
## 44962 6 NA NA 1305 1
## 52540 K6072 K6151 K6072 K6331 6 NA NA 6770 26
## HospDmycd
## 5174 117
## 56298 1165
## 81275 1650
## 16200 320
## 56424 1165
## 1496 42
## 920410 247
## 1170610 314
## 263791 729
## 340091 956
## 44962 1245
## 52540 1426
死亡症例の内訳は大きな矛盾はないと判断する。
df$death <- ifelse(df$outcome %in% c(6, 7), "1", "0")
続いて、df内から子宮全摘術を行った症例をカウントする。
count <- sum(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")],
function(x) grepl("K877|K876|K903|K904", x))) > 0)
count
## [1] 615
## 全78426症例中、子宮全摘を要した症例は615例であった。
IVRを行った症例をカウントする。
count1 <- sum(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")],
function(x) grepl("K6151|K6153", x))) > 0)
count1
## [1] 1073
全78426例中、IVRを行ったのは1073例
df内に、“hysterectomy”という子宮全摘の有無を表記する列を追加する
df$hysterectomy <- as.integer(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")],
function(x) grepl("K877|K876|K903|K904", x))) > 0)
同様に、子宮動脈塞栓となった症例の列を追加する。
df$tae<-as.integer(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")],
function(x) grepl("6151|K6153", x))) > 0)
同様に、帝王切開分娩の症例の列を追加する。
df$cs<-as.integer(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")],
function(x) grepl("K8981|K8982", x))) > 0)
IVRが施行可能な施設を、タグ付けする。
taeが”1”となったことがあるHospDmycdを同定する。
summary(df$HospDmycd)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 6 436 956 912 1380 1837 1491
df$ivr <- ave(df$tae, df$HospDmycd, FUN = function(x) ifelse(any(x == 1), 1, 0))
df %>%
group_by(HospDmycd) %>%
summarise(ivr = ifelse(any(tae == 1), 1, 0))
## # A tibble: 548 × 2
## HospDmycd ivr
## <int> <dbl>
## 1 6 0
## 2 10 0
## 3 15 0
## 4 17 0
## 5 26 1
## 6 29 1
## 7 30 0
## 8 31 0
## 9 33 0
## 10 35 0
## # ℹ 538 more rows
summary(df$ivr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.6024 1.0000 1.0000
病院のユニーク数を表示
unique(df$HospDmycd)
## [1] 6 10 15 NA 17 26 29 30 35 36 40 42 43 48 62
## [16] 66 68 78 80 81 90 93 95 107 115 117 122 124 126 136
## [31] 142 144 147 150 155 164 171 172 177 179 186 188 195 196 206
## [46] 207 210 211 213 215 216 217 220 228 230 246 247 253 262 264
## [61] 272 275 287 289 290 291 296 303 304 307 309 314 318 327 329
## [76] 334 335 337 338 341 346 350 351 363 366 367 368 369 374 378
## [91] 380 393 394 395 399 401 402 410 415 416 421 423 429 430 436
## [106] 442 445 449 451 452 453 454 456 458 459 464 467 473 475 483
## [121] 494 499 500 505 508 515 516 518 525 527 530 531 548 551 553
## [136] 556 563 571 572 578 581 583 589 595 596 597 598 599 601 605
## [151] 619 624 634 642 643 649 661 666 676 681 682 684 702 703 718
## [166] 724 729 736 742 753 754 760 770 779 814 816 823 824 827 829
## [181] 832 833 839 845 847 852 853 858 859 866 868 869 873 876 878
## [196] 879 894 898 912 913 921 924 934 935 951 952 956 957 963 968
## [211] 975 979 984 991 995 997 1002 1009 1014 1016 1020 1021 1023 1034 1035
## [226] 1040 1044 1050 1052 1061 1066 1067 1071 1072 1074 1076 1077 1079 1081 1083
## [241] 1088 1096 1100 1111 1115 1121 1124 1127 1130 1132 1134 1135 1139 1141 1143
## [256] 1144 1151 1161 1165 1170 1173 1177 1178 1180 1183 1189 1190 1200 1202 1203
## [271] 1207 1211 1215 1222 1223 1231 1232 1233 1235 1245 1246 1248 1252 1253 1255
## [286] 1264 1267 1268 1271 1275 1277 1289 1293 1296 1302 1310 1320 1321 1322 1335
## [301] 1339 1340 1344 1345 1348 1352 1355 1359 1362 1369 1370 1371 1375 1378 1380
## [316] 1381 1386 1397 1399 1401 1405 1407 1408 1409 1411 1413 1424 1426 1431 1437
## [331] 1440 1446 1447 1448 1453 1455 1461 1462 1466 1467 1477 1480 1481 1485 1503
## [346] 1504 1507 1510 1519 1521 1525 1527 1528 1531 1533 1535 1541 1545 1552 1553
## [361] 1554 1562 1567 1569 1571 1572 1576 1577 1583 1587 1592 1595 1597 1609 1625
## [376] 1626 1628 1629 1631 1633 1637 1641 1643 1650 1651 1655 1656 1670 1678 1679
## [391] 1680 1685 1690 1695 1697 1702 1710 1712 1717 1721 1725 1730 1733 1734 1745
## [406] 1749 1753 1754 1772 1773 1777 1778 1784 1786 1792 1793 1794 1795 1802 1805
## [421] 1815 1821 1829 1834 31 33 44 45 51 67 106 193 194 288 297
## [436] 299 312 320 345 355 392 444 471 484 492 617 651 654 657 699
## [451] 706 723 739 773 776 788 790 795 806 810 822 840 844 909 928
## [466] 942 964 996 1005 1041 1048 1086 1091 1093 1133 1163 1194 1242 1301 1332
## [481] 1358 1360 1395 1428 1450 1498 1520 1547 1602 1604 1644 1693 1694 1696 1704
## [496] 1750 1762 1785 1790 1801 1813 1816 1837 221 480 658 835 889 1054 1082
## [511] 1208 1470 1561 1610 1723 627 1185 1688 1063 1672 365 472 1258 38 60
## [526] 118 127 156 300 301 388 748 771 854 917 925 961 969 970 1057
## [541] 1243 1266 1392 1458 1472 1683 1698 1781
unique_count_hosp <- df %>%
distinct(HospDmycd) %>%
count()
unique_count_hosp
## n
## 1 548
全548病院
そのうちIVR可能なのは?
count_ivr <- df %>%
group_by(HospDmycd) %>%
summarise(ivr = ifelse(any(tae == 1), 1, 0)) %>%
filter(ivr == 1) %>%
count()
count_ivr
## # A tibble: 1 × 1
## n
## <int>
## 1 225
IVR対応施設は、225施設と判明した。
各変数の尺度の設定
sapply(df, class)
## PtDmyCd age dadm dcc1 dcc2 dcc3
## "integer" "numeric" "character" "character" "character" "character"
## dcc4 dcin1 dcin2 dcin3 dcin4 opek1
## "character" "character" "character" "character" "character" "character"
## opek2 opek3 opek4 opek5 outcome brthw
## "character" "character" "character" "character" "integer" "integer"
## brthwk bunketu 入院期間 HospDmycd death hysterectomy
## "integer" "numeric" "numeric" "integer" "character" "integer"
## tae cs ivr
## "integer" "integer" "numeric"
df$tae <- as.factor(df$tae)
df$hysterectomy <- as.factor(df$hysterectomy)
df$death <- as.factor(df$death)
df$cs<-as.factor(df$cs)
df$ivr<-as.factor(df$ivr)
sapply(df,class)
## PtDmyCd age dadm dcc1 dcc2 dcc3
## "integer" "numeric" "character" "character" "character" "character"
## dcc4 dcin1 dcin2 dcin3 dcin4 opek1
## "character" "character" "character" "character" "character" "character"
## opek2 opek3 opek4 opek5 outcome brthw
## "character" "character" "character" "character" "integer" "integer"
## brthwk bunketu 入院期間 HospDmycd death hysterectomy
## "integer" "numeric" "numeric" "integer" "factor" "factor"
## tae cs ivr
## "factor" "factor" "factor"
これで、全ての下準備は完成した。
# 子宮全摘症例と、非全摘症例の背景の比較
table2 <- CreateTableOne(vars = c("入院期間", "age", "bunketu", "death", "tae", "ivr"),
test = TRUE,
strata = "hysterectomy",
testExact = fisher.test,
addOverall = TRUE,
data = df)
table2
## Stratified by hysterectomy
## Overall 0 1
## n 78426 77811 615
## 入院期間 (mean (SD)) 12.56 (13.52) 12.45 (13.35) 25.25 (24.75)
## age (mean (SD)) 33.50 (5.31) 33.48 (5.31) 36.09 (4.55)
## bunketu (mean (SD)) 1326.26 (698.64) 1311.81 (662.76) 3118.94 (1820.68)
## death = 1 (%) 12 ( 0.0) 9 ( 0.0) 3 ( 0.5)
## tae = 1 (%) 1073 ( 1.4) 967 ( 1.2) 106 (17.2)
## ivr = 1 (%) 47247 (60.2) 46752 (60.1) 495 (80.5)
## Stratified by hysterectomy
## p test
## n
## 入院期間 (mean (SD)) <0.001
## age (mean (SD)) <0.001
## bunketu (mean (SD)) <0.001
## death = 1 (%) <0.001
## tae = 1 (%) <0.001
## ivr = 1 (%) <0.001
## 無事に表記出来た。