#0-1 パッケージの読み込み
pacman::p_load(dplyr, tidyverse,ggplot2,tidylog,openxlsx,skimr)
#0-2 データの読み込み
tekiyo_all<-read_tsv("tekiyo.tsv",locale = locale(encoding = "UTF-8"))
## Rows: 1021972 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): observable_start_ym, observable_end_ym, insurer_shubetsu, birth_ym
## dbl (2): kojin_id, sex_code
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
exam_interview<-read_tsv("exam_interview_processed.tsv")
## Rows: 1043 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (10): kojin_id, weight, bmi, hokou_or_shintai_katsudou_code, kitsuen_co...
## date (1): exam_ymd
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
receipt<-read_tsv("receipt.tsv")
## Rows: 441335 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): receipt_ym
## dbl (4): kojin_id, receipt_id, receipt_shubetsu_code, nyugai_kbn_code
## date (2): nyuin_ymd, taiin_ymd
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diseases<-read_tsv("receipt_diseases.tsv",
locale = locale(encoding = "UTF-8"))
## Rows: 230510 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): receipt_ym, diseases_name
## dbl (5): kojin_id, receipt_id, line_no, diseases_code, utagai_flg
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diseases_icd10<-read_tsv("receipt_diseases_icd10.tsv",
locale = locale(encoding = "UTF-8"))
## Rows: 230510 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): receipt_ym, icd10_major_code_1, icd10_major_name_1, icd10_medium_co...
## dbl (6): kojin_id, receipt_id, line_no, serial_no, diseases_code, utagai_flg
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drug<-read_tsv("receipt_drug.tsv")
## Rows: 246979 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): receipt_ym
## dbl (6): kojin_id, receipt_id, line_no, drug_code, shiyouryou, points
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drug_atc<-read_tsv("receipt_drug_atc.tsv",locale = locale(encoding = "UTF-8"))
## Rows: 247021 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): receipt_ym, atc_major_code, atc_major_name, atc_medium_code, atc_me...
## dbl (5): kojin_id, receipt_id, line_no, serial_no, drug_code
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drug_santei<-read_tsv("receipt_drug_santei_ymd.tsv")
## Rows: 308116 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): receipt_ym
## dbl (5): kojin_id, receipt_id, line_no, serial_no, kaisuu
## date (2): shohou_ymd, dispensing_ymd
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
steroid<-read.csv("steroid.csv",fileEncoding = "Shift-JIS")
osteo<-read.csv("osteo.csv",fileEncoding = "UTF-8")
#0-3 データの統合
## receipt上の入院年月日と、icd10の情報を統合する。
receipt_disease_joined <- inner_join(receipt, diseases_icd10, by = "receipt_id")
## inner_join: added 16 columns (kojin_id.x, receipt_ym.x, kojin_id.y, receipt_ym.y, line_no, …)
## > rows only in x (217,527)
## > rows only in y ( 0)
## > matched rows 230,510 (includes duplicates)
## > =========
## > rows total 230,510
receipt_disease_joined<-
receipt_disease_joined %>% rename(kojin_id=kojin_id.x)
## rename: renamed one variable (kojin_id)
## drug_atcとdrug_santeiの結合
receipt_drug_joined <- inner_join(drug_atc, drug_santei, by = c("receipt_id", "line_no"))
## inner_join: added 9 columns (kojin_id.x, receipt_ym.x, serial_no.x, kojin_id.y, receipt_ym.y, …)
## > rows only in x ( 0)
## > rows only in y ( 0)
## > matched rows 308,180 (includes duplicates)
## > =========
## > rows total 308,180
receipt_drug_joined<-receipt_drug_joined %>% rename(kojin_id=kojin_id.x)
## rename: renamed one variable (kojin_id)
### どれとどれを結合するのか?
# 0-4 データフレームの解析、及び必要な情報の取得
tekiyo_all %>% select(kojin_id) %>% nrow()
## select: dropped 5 variables (sex_code, observable_start_ym, observable_end_ym, insurer_shubetsu, birth_ym)
## [1] 1021972
unique_tekiyo <- length(unique(tekiyo_all$kojin_id))
print(unique_tekiyo)
## [1] 1021972
## 全102万のデータあり、tekiyoには患者idの重複は無し。
receipt_disease_joined %>% select(kojin_id) %>% nrow()
## select: dropped 20 variables (receipt_ym.x, receipt_id, receipt_shubetsu_code, nyugai_kbn_code, nyuin_ymd, …)
## [1] 230510
unique_disease<- length(unique(receipt_disease_joined$kojin_id))
print(unique_disease)
## [1] 2465
## レセプト病名データには23万人分のデータがあるが、uniqueでカウントすると2465人。
receipt_drug_joined %>% select(kojin_id) %>% nrow()
## select: dropped 19 variables (receipt_ym.x, receipt_id, line_no, serial_no.x, drug_code, …)
## [1] 308180
unique_drug<-length(unique(receipt_drug_joined$kojin_id))
print(unique_drug)
## [1] 2465
## レセプト処方データには30万人分のデータあり、そのうち2465人分のuniqueデータあり
## 観察期間中に2回以上の処方があるということ。
## 入院データとユニーク数が完全に一致している…?
#0-5データ編集
receipt_disease_joined %>% head(10)
## # A tibble: 10 × 21
## kojin_id receipt_ym.x receipt_id receipt_shubetsu_code nyugai_kbn_code
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 3229443 2021/01 480719322 1 1
## 2 7272780 2016/10 255129083 1 1
## 3 7695817 2015/06 638940123 2 2
## 4 6649918 2018/02 593100299 6 2
## 5 6649918 2018/02 593100299 6 2
## 6 6649918 2018/02 593100299 6 2
## 7 2782415 2019/08 279595856 1 1
## 8 2782415 2019/08 279595856 1 1
## 9 8032004 2019/03 127117574 1 1
## 10 9698559 2015/11 519771932 1 1
## # ℹ 16 more variables: nyuin_ymd <date>, taiin_ymd <date>, kojin_id.y <dbl>,
## # receipt_ym.y <chr>, line_no <dbl>, serial_no <dbl>, diseases_code <dbl>,
## # utagai_flg <dbl>, icd10_major_code_1 <chr>, icd10_major_name_1 <chr>,
## # icd10_medium_code_1 <chr>, icd10_medium_name_1 <chr>,
## # icd10_sub_code_1 <chr>, icd10_sub_name_1 <chr>, icd10_subdiv_code_1 <chr>,
## # icd10_subdiv_name_1 <chr>
colnames(receipt_disease_joined)
## [1] "kojin_id" "receipt_ym.x" "receipt_id"
## [4] "receipt_shubetsu_code" "nyugai_kbn_code" "nyuin_ymd"
## [7] "taiin_ymd" "kojin_id.y" "receipt_ym.y"
## [10] "line_no" "serial_no" "diseases_code"
## [13] "utagai_flg" "icd10_major_code_1" "icd10_major_name_1"
## [16] "icd10_medium_code_1" "icd10_medium_name_1" "icd10_sub_code_1"
## [19] "icd10_sub_name_1" "icd10_subdiv_code_1" "icd10_subdiv_name_1"
filtered_df <- receipt_disease_joined %>%
filter(
grepl("S72", icd10_major_code_1) |
grepl("S72", icd10_medium_code_1) |
grepl("S72", icd10_sub_code_1) |
grepl("S72", icd10_subdiv_code_1))
## filter: removed 199,411 rows (87%), 31,099 rows remaining
filter_unique_disease<- length(unique(filtered_df$kojin_id))
print(filter_unique_disease)
## [1] 2465
## やはり、全ての患者が"S72"を含む。
##方針
##receipt_disease_joined は病名と、その登録日の情報あり
##→「大腿骨骨折」の罹患日と個人IDのデータ
##exam_interviewは健康診断データ
##→喫煙と、やせ/肥満のデータ
##receipt_drug_joinedは処方薬剤と処方日のデータ
# 1-1 喫煙者の抽出
summary(exam_interview$kitsuen_code)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 1.873 2.000 2.000
# 1-2 痩身、肥満の抽出
summary(exam_interview$bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.80 18.70 20.85 21.63 23.40 41.10 1
exam_interview <- exam_interview %>%
mutate(thinness = if_else(is.na(bmi), NA_real_, ifelse(bmi < 18.5, 1, 0)))
## mutate: new variable 'thinness' (double) with 3 unique values and <1% NA
summary(exam_interview$thinness)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.2217 0.0000 1.0000 1
exam_interview <- exam_interview %>%
mutate(obesity = if_else(is.na(bmi), NA_real_, ifelse(bmi >= 25, 1, 0)))
## mutate: new variable 'obesity' (double) with 3 unique values and <1% NA
summary(exam_interview$obesity)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.1708 0.0000 1.0000 1
# 1-3 ステロイドユーザーの抽出
### ステロイドの使用状況を細かく分析する
### レセ電算コードから抽出していく。
table(steroid$レセプト電算処理システムコード.1.)
##
## 610408661 610431117 610454071 612450070 612450118 620000694 620000695 620000696
## 6 3 1 1 3 3 3 1
## 620000697 620001894 620001895 620002513 620004294 620004387 620004578 620005125
## 4 1 1 1 5 3 2 1
## 620005126 620005133 620005134 620005848 620006161 620006162 620006903 620006985
## 1 1 1 4 1 3 2 1
## 620006986 620007078 620008651 620009010 620009011 620521501 620530701 620530801
## 2 2 1 2 2 1 6 2
## 620531001 620531101 620531301 620531401 620531501 620531601 621559301 621997701
## 4 2 2 3 4 2 3 1
## 622359901 662450002 662450003
## 2 1 1
### steroid内のレセ電算コードの内容を含めば1、そうでなければ0を出力する、"steroid_use"という変数を作る
receipt_drug_joined$steroid_use <-
ifelse(receipt_drug_joined$drug_code %in% steroid$レセプト電算処理システムコード.1., 1, 0)
summary(receipt_drug_joined$steroid_use)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0424 0.0000 1.0000
count_steroid_user <- sum(receipt_drug_joined$steroid_use == 1)
print(count_steroid_user)
## [1] 13067
### ステロイド使用者、として新しい変数を作る
df_steroid_user<-receipt_drug_joined %>%
group_by(kojin_id) %>%
summarize(steroid_user = max(steroid_use)) %>%
ungroup()
## group_by: one grouping variable (kojin_id)
## summarize: now 2,465 rows and 2 columns, ungrouped
## ungroup: no grouping variables
sum(df_steroid_user$steroid_user)
## [1] 553
### 2465人のうち、ステロイドの内服を使っていた者の数は553人だった。
# 1-4 骨粗鬆症の薬使ってるひとの抽出
colnames(osteo)
## [1] "レセプト電算処理システムコード" "告示名称"
receipt_drug_joined$osteo_use <-
ifelse(receipt_drug_joined$drug_code %in% osteo$レセプト電算処理システムコード, 1, 0)
summary(receipt_drug_joined$osteo_use)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1595 0.0000 1.0000
count_osteo_user <- sum(receipt_drug_joined$osteo_use == 1)
print(count_osteo_user)
## [1] 49169
### 骨粗鬆症治療薬使用者、として新しい変数を作る
df_osteo_user<-receipt_drug_joined %>%
group_by(kojin_id) %>%
summarize(osteo_user = max(osteo_use)) %>%
ungroup()
## group_by: one grouping variable (kojin_id)
## summarize: now 2,465 rows and 2 columns, ungrouped
## ungroup: no grouping variables
sum(df_osteo_user$osteo_user)
## [1] 1095
### 2465人のうち、骨粗鬆症治療薬を内服していたのは1095人