必要パッケージのDL

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(tableone)
library(ggplot2)

データの読み込み

df1<-read.csv("C:/Users/eishin/Desktop/dpc/DPC1.csv", fileEncoding = "CP932")
df2<-read.csv("C:/Users/eishin/Desktop/dpc/DPC2.csv", fileEncoding = "CP932")

統合のため、df1とdf2で重複する列名のみでdfを再構築

colnames(df1)
##  [1] "adladm"           "adldis"           "admpath"          "age"             
##  [5] "ambul"            "brthw"            "brthwk"           "bunketu"         
##  [9] "dadm"             "dcc1"             "dcc2"             "dcc3"            
## [13] "dcc4"             "dcin1"            "dcin2"            "dcin3"           
## [17] "dcin4"            "dislasts"         "disto"            "dmain"           
## [21] "dres1"            "dres2"            "dth24h"           "height"          
## [25] "HospDmycd"        "jcs"              "jcsdis"           "kacd"            
## [29] "nyuseqno"         "opean1"           "opean2"           "opean3"          
## [33] "opean4"           "opean5"           "opek1"            "opek2"           
## [37] "opek3"            "opek4"            "opek5"            "outcome"         
## [41] "preg"             "pregwkadm"        "PtDmyCd"          "readmkind"       
## [45] "readmrea"         "sex"              "smokingidx"       "urgency"         
## [49] "weight"           "マンハッタン距離" "ユークリッド距離" "手術1相対日"     
## [53] "手術2相対日"      "手術3相対日"      "手術4相対日"      "手術5相対日"     
## [57] "特定機能"         "入院期間"         "年度"
colnames(df2)
##  [1] "adladm"       "adldis"       "admpath"      "age"          "ambul"       
##  [6] "brthw"        "brthwk"       "bunketu"      "dadm"         "dcc1"        
## [11] "dcc2"         "dcc3"         "dcc4"         "dcin1"        "dcin2"       
## [16] "dcin3"        "dcin4"        "dislasts"     "disto"        "dmain"       
## [21] "dres1"        "dres2"        "dth24h"       "height"       "HospDmycd"   
## [26] "idx_nyuseqno" "jcs"          "jcsdis"       "kacd"         "nyuseqno"    
## [31] "opean1"       "opean2"       "opean3"       "opean4"       "opean5"      
## [36] "opek1"        "opek2"        "opek3"        "opek4"        "opek5"       
## [41] "outcome"      "preg"         "pregwkadm"    "PtDmyCd"      "readmkind"   
## [46] "readmrea"     "sex"          "smokingidx"   "urgency"      "weight"      
## [51] "手術1相対日"  "手術2相対日"  "手術3相対日"  "手術4相対日"  "手術5相対日" 
## [56] "相対入院日"   "入院期間"
df3<- df1[c("PtDmyCd", "age", "dadm", "dcc1", "dcc2", "dcc3", "dcc4", "dcin1", "dcin2", "dcin3", "dcin4", "opek1", "opek2", "opek3", "opek4", "opek5", "outcome", "brthw", "brthwk", "bunketu", "入院期間", "HospDmycd")]
df4<- df2[c("PtDmyCd", "age", "dadm", "dcc1", "dcc2", "dcc3", "dcc4", "dcin1", "dcin2", "dcin3", "dcin4", "opek1", "opek2", "opek3", "opek4", "opek5", "outcome", "brthw", "brthwk", "bunketu", "入院期間", "HospDmycd")]
df<-rbind(df3,df4)
skim(df)
Data summary
Name df
Number of rows 197215
Number of columns 22
_______________________
Column type frequency:
character 14
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
dadm 0 1 3 5 0 1307 0
dcc1 0 1 0 5 54634 1192 0
dcc2 0 1 0 5 90043 1040 0
dcc3 0 1 0 5 122501 899 0
dcc4 0 1 0 5 150278 771 0
dcin1 0 1 0 5 51731 1613 0
dcin2 0 1 0 5 102526 1422 0
dcin3 0 1 0 5 141941 1171 0
dcin4 0 1 0 5 167668 934 0
opek1 0 1 0 7 52568 425 0
opek2 0 1 0 7 173242 244 0
opek3 0 1 0 7 193260 140 0
opek4 0 1 0 7 196493 75 0
opek5 0 1 0 7 197025 48 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PtDmyCd 0 1.00 13680573.12 15718578.85 155 2335435 4676216 26068062 49705857 ▇▁▁▁▂
age 0 1.00 33.76 6.65 0 30 34 37 102 ▁▇▁▁▁
outcome 0 1.00 1.90 2.46 1 1 1 1 9 ▇▁▁▁▁
brthw 192315 0.02 3000.75 522.74 300 2726 3024 3328 4530 ▁▁▃▇▁
brthwk 180298 0.09 37.11 4.49 0 37 38 39 53 ▁▁▁▇▁
bunketu 20795 0.89 9891.95 27796.14 0 1010 1241 1735 99999 ▇▁▁▁▁
入院期間 0 1.00 12.06 17.81 1 7 8 10 3631 ▇▁▁▁▁
HospDmycd 0 1.00 899.86 524.44 6 429 934 1369 1837 ▇▇▇▇▇
sapply(df,class)
##     PtDmyCd         age        dadm        dcc1        dcc2        dcc3 
##   "integer"   "integer" "character" "character" "character" "character" 
##        dcc4       dcin1       dcin2       dcin3       dcin4       opek1 
## "character" "character" "character" "character" "character" "character" 
##       opek2       opek3       opek4       opek5     outcome       brthw 
## "character" "character" "character" "character"   "integer"   "integer" 
##      brthwk     bunketu    入院期間   HospDmycd 
##   "integer"   "integer"   "integer"   "integer"

抽出定義にのっとり、df内から「産科出血患者」のみを抽出する。

条件①病名としてO679, O724, O721, O723のいずれかを含む行の抽出

grep関数で文字列の正規表現を抽出するが、“|”表現でgrepを結合するとエラーが生じる

面倒だが各病名列毎にコードを実行し、重複しないようrbindする

df_dadm<- df[grep("O679|O724|O721|O723", df$dadm),]
df_dcc1<-df[grep("O679|O724|O721|O723", df$dcc1),] 
df_dcc2<-df[grep("O679|O724|O721|O723", df$dcc2),] 
df_dcc3<-df[grep("O679|O724|O721|O723", df$dcc3),]
df_dcc4<-df[grep("O679|O724|O721|O723", df$dcc4),]
df_dcin1<-df[grep("O679|O724|O721|O723", df$dcin1),]
df_dcin2<-df[grep("O679|O724|O721|O723", df$dcin2),]
df_dcin3<-df[grep("O679|O724|O721|O723", df$dcin3),]
df_dcin4<-df[grep("O679|O724|O721|O723", df$dcin4),]

df_combined1 <- unique(rbind(df_dadm,df_dcc1,df_dcc2, df_dcc3,df_dcc4,df_dcin1,df_dcin2, df_dcin3,df_dcin4))

## 条件②分娩時出血量1000以上の症例のみを抽出する

df_combined2 <-filter(df, bunketu>=1000)

最後に、条件①OR条件②で新しいdfを作成

df<-unique(rbind(df_combined1, df_combined2))

連続変数の変換

df$age <- as.numeric(df$age)
df$bunketu <- as.numeric(df$bunketu)
df$入院期間 <- as.numeric(df$入院期間)
sapply(df, class)
##     PtDmyCd         age        dadm        dcc1        dcc2        dcc3 
##   "integer"   "numeric" "character" "character" "character" "character" 
##        dcc4       dcin1       dcin2       dcin3       dcin4       opek1 
## "character" "character" "character" "character" "character" "character" 
##       opek2       opek3       opek4       opek5     outcome       brthw 
## "character" "character" "character" "character"   "integer"   "integer" 
##      brthwk     bunketu    入院期間   HospDmycd 
##   "integer"   "numeric"   "numeric"   "integer"

連続変数の傾向確認

hist(df$age)

hist(df$bunketu)

hist(df$入院期間)

## age, bunketu, 入院期間に明らかな外れ値があるため、除外する。
df<- df[df$age <= 60 & df$bunketu <= 9000 & df$入院期間 <= 365, ]
hist(df$age)

skim(df$age)
Data summary
Name df$age
Number of rows 78426
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 1491 0.98 33.5 5.31 0 30 34 37 56 ▁▁▇▇▁
hist(df$bunketu)

skim(df$bunketu)
Data summary
Name df$bunketu
Number of rows 78426
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 1491 0.98 1326.26 698.64 0 1010 1205 1561 9000 ▇▂▁▁▁
hist(df$入院期間)

skim(df$入院期間)
Data summary
Name df$入院期間
Number of rows 78426
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 1491 0.98 12.56 13.52 1 7 8 10 244 ▇▁▁▁▁

以上で、2020-2021年に分娩した「産科出血患者」全てを抽出したとする。

計78426例

unique_count <- length(unique(df$PtDmyCd))
unique_count
## [1] 76475

ユニーク数は76475例であり、約2000例が重複したIDからの症例

同一症例が2回以上の分娩、

いずれも産科出血に該当する可能性はあるため問題ないと判断した。

続いて、各変数の構成を表示する。

vars<-c("age", "outcome", "brthw", "brthwk", "bunketu", "入院期間")
table <- CreateTableOne(vars = vars,
                         includeNA = T,
                         addOverall = T,
                         smd = T,
                         data = df)
table
##                       
##                        Overall         
##   n                      78426         
##   age (mean (SD))        33.50 (5.31)  
##   outcome (mean (SD))     1.90 (2.50)  
##   brthw (mean (SD))    3004.69 (517.85)
##   brthwk (mean (SD))     37.22 (4.15)  
##   bunketu (mean (SD))  1326.26 (698.64)
##   入院期間 (mean (SD))   12.56 (13.52)

outcomeの内訳

outcome_freq <- table(df$outcome)
outcome_freq
## 
##     1     3     4     5     6     7     9 
## 67898    74   510    85    10     2  8356
## outcome=6,7が死亡症例であり、total 12人という結果に
## そのうちわけを確認してみる。
df_death<- filter(df, outcome %in% c(6, 7))
df_death
##          PtDmyCd age dadm  dcc1 dcc2  dcc3  dcc4 dcin1 dcin2 dcin3 dcin4  opek1
## 5174    41011392  33 O721                         D696  I469   D65        K6021
## 56298    3819969  31 O721   D62 D695  K626  I639  O450  O723  O142  N179  K6151
## 81275     930521  37 O721                          D65  U071                   
## 16200    5280120  32 Z349  O814 O721   D65                                 K877
## 56424   32299658  42 O600  R571 N179  R579 J9609  O300  O721  I460  O723  K8981
## 1496     3387145  37 O459  I469 I710  I319        O459  K810  J980   D65  K8981
## 920410   5340825  30 O459  O904 D696   D62        O600  R571  O881  I460   K904
## 1170610  3103211  35 Q282                         Z349                    K1492
## 263791   4522329  36 O600 J9609 D696  C509  D649  O342  O600              K8981
## 340091  16387579  38 O300   R21 O411  O996  O908   Z33  U071  O600  O342  K8982
## 44962    1396216  28 I469                         I469   Z33             K386-2
## 52540    3716427  44 O142   D65 T796 S3611 S3610  O142  I460  O364  R571  K8981
##         opek2 opek3  opek4 opek5 outcome brthw brthwk bunketu 入院期間
## 5174    K6022                          6    NA     NA    4093        2
## 56298    K386 K0002 K909-2             6    NA     NA    3100       97
## 81275                                  6    NA     NA    5350        1
## 16200    K893                          7    NA     NA    4908        1
## 56424   K6151  K901   K876 K6021       6    NA     NA    2537       71
## 1496                                   7    NA     37    1208        3
## 920410   K545                          6    NA     NA    1200       37
## 1170610 K8981  K145                    6    NA     NA    1900       16
## 263791                                 6    NA     28    2350        8
## 340091                                 6    NA     NA    2050       96
## 44962                                  6    NA     NA    1305        1
## 52540   K6072 K6151  K6072 K6331       6    NA     NA    6770       26
##         HospDmycd
## 5174          117
## 56298        1165
## 81275        1650
## 16200         320
## 56424        1165
## 1496           42
## 920410        247
## 1170610       314
## 263791        729
## 340091        956
## 44962        1245
## 52540        1426

死亡症例の内訳は大きな矛盾はないと判断する。

df$death <- ifelse(df$outcome %in% c(6, 7), "1", "0")

続いて、df内から子宮全摘術を行った症例をカウントする。

count <- sum(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")], 
                            function(x) grepl("K877|K876|K903|K904", x))) > 0)
count
## [1] 615
## 全78426症例中、子宮全摘を要した症例は615例であった。

IVRを行った症例をカウントする。

count1 <- sum(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")], 
                            function(x) grepl("K6151|K6153", x))) > 0)
count1
## [1] 1073

全78426例中、IVRを行ったのは1073例

df内に、“hysterectomy”という子宮全摘の有無を表記する列を追加する

df$hysterectomy <- as.integer(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")], 
                                             function(x) grepl("K877|K876|K903|K904", x))) > 0)

同様に、子宮動脈塞栓となった症例の列を追加する。

df$tae<-as.integer(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")], 
                                  function(x) grepl("6151|K6153", x))) > 0)

同様に、帝王切開分娩の症例の列を追加する。

df$cs<-as.integer(rowSums(sapply(df[c("opek1", "opek2", "opek3", "opek4", "opek5")], 
                                  function(x) grepl("K8981|K8982", x))) > 0)

IVRが施行可能な施設を、タグ付けする。

taeが”1”となったことがあるHospDmycdを同定する。

summary(df$HospDmycd)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       6     436     956     912    1380    1837    1491
df$ivr <- ave(df$tae, df$HospDmycd, FUN = function(x) ifelse(any(x == 1), 1, 0))
df %>% 
  group_by(HospDmycd) %>% 
  summarise(ivr = ifelse(any(tae == 1), 1, 0))
## # A tibble: 548 × 2
##    HospDmycd   ivr
##        <int> <dbl>
##  1         6     0
##  2        10     0
##  3        15     0
##  4        17     0
##  5        26     1
##  6        29     1
##  7        30     0
##  8        31     0
##  9        33     0
## 10        35     0
## # ℹ 538 more rows
summary(df$ivr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.6024  1.0000  1.0000

病院のユニーク数を表示

unique(df$HospDmycd)
##   [1]    6   10   15   NA   17   26   29   30   35   36   40   42   43   48   62
##  [16]   66   68   78   80   81   90   93   95  107  115  117  122  124  126  136
##  [31]  142  144  147  150  155  164  171  172  177  179  186  188  195  196  206
##  [46]  207  210  211  213  215  216  217  220  228  230  246  247  253  262  264
##  [61]  272  275  287  289  290  291  296  303  304  307  309  314  318  327  329
##  [76]  334  335  337  338  341  346  350  351  363  366  367  368  369  374  378
##  [91]  380  393  394  395  399  401  402  410  415  416  421  423  429  430  436
## [106]  442  445  449  451  452  453  454  456  458  459  464  467  473  475  483
## [121]  494  499  500  505  508  515  516  518  525  527  530  531  548  551  553
## [136]  556  563  571  572  578  581  583  589  595  596  597  598  599  601  605
## [151]  619  624  634  642  643  649  661  666  676  681  682  684  702  703  718
## [166]  724  729  736  742  753  754  760  770  779  814  816  823  824  827  829
## [181]  832  833  839  845  847  852  853  858  859  866  868  869  873  876  878
## [196]  879  894  898  912  913  921  924  934  935  951  952  956  957  963  968
## [211]  975  979  984  991  995  997 1002 1009 1014 1016 1020 1021 1023 1034 1035
## [226] 1040 1044 1050 1052 1061 1066 1067 1071 1072 1074 1076 1077 1079 1081 1083
## [241] 1088 1096 1100 1111 1115 1121 1124 1127 1130 1132 1134 1135 1139 1141 1143
## [256] 1144 1151 1161 1165 1170 1173 1177 1178 1180 1183 1189 1190 1200 1202 1203
## [271] 1207 1211 1215 1222 1223 1231 1232 1233 1235 1245 1246 1248 1252 1253 1255
## [286] 1264 1267 1268 1271 1275 1277 1289 1293 1296 1302 1310 1320 1321 1322 1335
## [301] 1339 1340 1344 1345 1348 1352 1355 1359 1362 1369 1370 1371 1375 1378 1380
## [316] 1381 1386 1397 1399 1401 1405 1407 1408 1409 1411 1413 1424 1426 1431 1437
## [331] 1440 1446 1447 1448 1453 1455 1461 1462 1466 1467 1477 1480 1481 1485 1503
## [346] 1504 1507 1510 1519 1521 1525 1527 1528 1531 1533 1535 1541 1545 1552 1553
## [361] 1554 1562 1567 1569 1571 1572 1576 1577 1583 1587 1592 1595 1597 1609 1625
## [376] 1626 1628 1629 1631 1633 1637 1641 1643 1650 1651 1655 1656 1670 1678 1679
## [391] 1680 1685 1690 1695 1697 1702 1710 1712 1717 1721 1725 1730 1733 1734 1745
## [406] 1749 1753 1754 1772 1773 1777 1778 1784 1786 1792 1793 1794 1795 1802 1805
## [421] 1815 1821 1829 1834   31   33   44   45   51   67  106  193  194  288  297
## [436]  299  312  320  345  355  392  444  471  484  492  617  651  654  657  699
## [451]  706  723  739  773  776  788  790  795  806  810  822  840  844  909  928
## [466]  942  964  996 1005 1041 1048 1086 1091 1093 1133 1163 1194 1242 1301 1332
## [481] 1358 1360 1395 1428 1450 1498 1520 1547 1602 1604 1644 1693 1694 1696 1704
## [496] 1750 1762 1785 1790 1801 1813 1816 1837  221  480  658  835  889 1054 1082
## [511] 1208 1470 1561 1610 1723  627 1185 1688 1063 1672  365  472 1258   38   60
## [526]  118  127  156  300  301  388  748  771  854  917  925  961  969  970 1057
## [541] 1243 1266 1392 1458 1472 1683 1698 1781
unique_count_hosp <- df %>% 
  distinct(HospDmycd) %>% 
  count()

unique_count_hosp
##     n
## 1 548

全548病院

そのうちIVR可能なのは?

count_ivr <- df %>% 
  group_by(HospDmycd) %>% 
  summarise(ivr = ifelse(any(tae == 1), 1, 0)) %>% 
  filter(ivr == 1) %>% 
  count()

count_ivr
## # A tibble: 1 × 1
##       n
##   <int>
## 1   225

IVR対応施設は、225施設と判明した。

各変数の尺度の設定

sapply(df, class)
##      PtDmyCd          age         dadm         dcc1         dcc2         dcc3 
##    "integer"    "numeric"  "character"  "character"  "character"  "character" 
##         dcc4        dcin1        dcin2        dcin3        dcin4        opek1 
##  "character"  "character"  "character"  "character"  "character"  "character" 
##        opek2        opek3        opek4        opek5      outcome        brthw 
##  "character"  "character"  "character"  "character"    "integer"    "integer" 
##       brthwk      bunketu     入院期間    HospDmycd        death hysterectomy 
##    "integer"    "numeric"    "numeric"    "integer"  "character"    "integer" 
##          tae           cs          ivr 
##    "integer"    "integer"    "numeric"
df$tae <- as.factor(df$tae)
df$hysterectomy <- as.factor(df$hysterectomy)
df$death <- as.factor(df$death)
df$cs<-as.factor(df$cs)
df$ivr<-as.factor(df$ivr)
sapply(df,class)
##      PtDmyCd          age         dadm         dcc1         dcc2         dcc3 
##    "integer"    "numeric"  "character"  "character"  "character"  "character" 
##         dcc4        dcin1        dcin2        dcin3        dcin4        opek1 
##  "character"  "character"  "character"  "character"  "character"  "character" 
##        opek2        opek3        opek4        opek5      outcome        brthw 
##  "character"  "character"  "character"  "character"    "integer"    "integer" 
##       brthwk      bunketu     入院期間    HospDmycd        death hysterectomy 
##    "integer"    "numeric"    "numeric"    "integer"     "factor"     "factor" 
##          tae           cs          ivr 
##     "factor"     "factor"     "factor"

これで、全ての下準備は完成した。

# 子宮全摘症例と、非全摘症例の背景の比較
table2 <- CreateTableOne(vars = c("入院期間", "age", "bunketu", "death", "tae", "ivr"),
                         test = TRUE,
                         strata = "hysterectomy",
                         testExact = fisher.test, 
                         addOverall = TRUE,
                         data = df)
table2
##                       Stratified by hysterectomy
##                        Overall          0                1                
##   n                      78426            77811              615          
##   入院期間 (mean (SD))   12.56 (13.52)    12.45 (13.35)    25.25 (24.75)  
##   age (mean (SD))        33.50 (5.31)     33.48 (5.31)     36.09 (4.55)   
##   bunketu (mean (SD))  1326.26 (698.64) 1311.81 (662.76) 3118.94 (1820.68)
##   death = 1 (%)             12 ( 0.0)         9 ( 0.0)         3 ( 0.5)   
##   tae = 1 (%)             1073 ( 1.4)       967 ( 1.2)       106 (17.2)   
##   ivr = 1 (%)            47247 (60.2)     46752 (60.1)       495 (80.5)   
##                       Stratified by hysterectomy
##                        p      test
##   n                               
##   入院期間 (mean (SD)) <0.001     
##   age (mean (SD))      <0.001     
##   bunketu (mean (SD))  <0.001     
##   death = 1 (%)        <0.001     
##   tae = 1 (%)          <0.001     
##   ivr = 1 (%)          <0.001
## 無事に表記出来た。