#0-1 パッケージの読み込み
pacman::p_load(dplyr, tidyverse,ggplot2,tidylog,openxlsx,skimr)

#0-2 データの読み込み
tekiyo_all<-read_tsv("tekiyo.tsv",locale = locale(encoding = "UTF-8"))
## Rows: 1021972 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): observable_start_ym, observable_end_ym, insurer_shubetsu, birth_ym
## dbl (2): kojin_id, sex_code
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
exam_interview<-read_tsv("exam_interview_processed.tsv")
## Rows: 1043 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl  (10): kojin_id, weight, bmi, hokou_or_shintai_katsudou_code, kitsuen_co...
## date  (1): exam_ymd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
receipt<-read_tsv("receipt.tsv")
## Rows: 441335 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): receipt_ym
## dbl  (4): kojin_id, receipt_id, receipt_shubetsu_code, nyugai_kbn_code
## date (2): nyuin_ymd, taiin_ymd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diseases<-read_tsv("receipt_diseases.tsv",
                         locale = locale(encoding = "UTF-8"))
## Rows: 230510 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): receipt_ym, diseases_name
## dbl (5): kojin_id, receipt_id, line_no, diseases_code, utagai_flg
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diseases_icd10<-read_tsv("receipt_diseases_icd10.tsv",
                         locale = locale(encoding = "UTF-8"))
## Rows: 230510 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): receipt_ym, icd10_major_code_1, icd10_major_name_1, icd10_medium_co...
## dbl (6): kojin_id, receipt_id, line_no, serial_no, diseases_code, utagai_flg
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drug<-read_tsv("receipt_drug.tsv")
## Rows: 246979 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): receipt_ym
## dbl (6): kojin_id, receipt_id, line_no, drug_code, shiyouryou, points
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drug_atc<-read_tsv("receipt_drug_atc.tsv",locale = locale(encoding = "UTF-8"))
## Rows: 247021 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): receipt_ym, atc_major_code, atc_major_name, atc_medium_code, atc_me...
## dbl (5): kojin_id, receipt_id, line_no, serial_no, drug_code
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drug_santei<-read_tsv("receipt_drug_santei_ymd.tsv")
## Rows: 308116 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): receipt_ym
## dbl  (5): kojin_id, receipt_id, line_no, serial_no, kaisuu
## date (2): shohou_ymd, dispensing_ymd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
steroid<-read.csv("steroid.csv",fileEncoding = "Shift-JIS")
osteo<-read.csv("osteo.csv",fileEncoding = "UTF-8")
#0-3 データの統合 
## receipt上の入院年月日と、icd10の情報を統合する。
receipt_disease_joined <- inner_join(receipt, diseases_icd10, by = "receipt_id")
## inner_join: added 16 columns (kojin_id.x, receipt_ym.x, kojin_id.y, receipt_ym.y, line_no, …)
##             > rows only in x  (217,527)
##             > rows only in y  (      0)
##             > matched rows     230,510    (includes duplicates)
##             >                 =========
##             > rows total       230,510
receipt_disease_joined<-
  receipt_disease_joined %>% rename(kojin_id=kojin_id.x)
## rename: renamed one variable (kojin_id)
## drug_atcとdrug_santeiの結合
receipt_drug_joined <- inner_join(drug_atc, drug_santei, by = c("receipt_id", "line_no"))
## inner_join: added 9 columns (kojin_id.x, receipt_ym.x, serial_no.x, kojin_id.y, receipt_ym.y, …)
##             > rows only in x  (      0)
##             > rows only in y  (      0)
##             > matched rows     308,180    (includes duplicates)
##             >                 =========
##             > rows total       308,180
receipt_drug_joined<-receipt_drug_joined %>% rename(kojin_id=kojin_id.x)
## rename: renamed one variable (kojin_id)
### どれとどれを結合するのか?

# 0-4 データフレームの解析、及び必要な情報の取得
tekiyo_all %>% select(kojin_id) %>% nrow()
## select: dropped 5 variables (sex_code, observable_start_ym, observable_end_ym, insurer_shubetsu, birth_ym)
## [1] 1021972
unique_tekiyo <- length(unique(tekiyo_all$kojin_id))
print(unique_tekiyo)
## [1] 1021972
## 全102万のデータあり、tekiyoには患者idの重複は無し。

receipt_disease_joined %>% select(kojin_id) %>% nrow()
## select: dropped 20 variables (receipt_ym.x, receipt_id, receipt_shubetsu_code, nyugai_kbn_code, nyuin_ymd, …)
## [1] 230510
unique_disease<- length(unique(receipt_disease_joined$kojin_id))
print(unique_disease)
## [1] 2465
## レセプト病名データには23万人分のデータがあるが、uniqueでカウントすると2465人。

receipt_drug_joined %>% select(kojin_id) %>% nrow()
## select: dropped 19 variables (receipt_ym.x, receipt_id, line_no, serial_no.x, drug_code, …)
## [1] 308180
unique_drug<-length(unique(receipt_drug_joined$kojin_id))
print(unique_drug)
## [1] 2465
## レセプト処方データには30万人分のデータあり、そのうち2465人分のuniqueデータあり
## 観察期間中に2回以上の処方があるということ。
## 入院データとユニーク数が完全に一致している…?

#0-5データ編集
receipt_disease_joined %>% head(10)
## # A tibble: 10 × 21
##    kojin_id receipt_ym.x receipt_id receipt_shubetsu_code nyugai_kbn_code
##       <dbl> <chr>             <dbl>                 <dbl>           <dbl>
##  1  3229443 2021/01       480719322                     1               1
##  2  7272780 2016/10       255129083                     1               1
##  3  7695817 2015/06       638940123                     2               2
##  4  6649918 2018/02       593100299                     6               2
##  5  6649918 2018/02       593100299                     6               2
##  6  6649918 2018/02       593100299                     6               2
##  7  2782415 2019/08       279595856                     1               1
##  8  2782415 2019/08       279595856                     1               1
##  9  8032004 2019/03       127117574                     1               1
## 10  9698559 2015/11       519771932                     1               1
## # ℹ 16 more variables: nyuin_ymd <date>, taiin_ymd <date>, kojin_id.y <dbl>,
## #   receipt_ym.y <chr>, line_no <dbl>, serial_no <dbl>, diseases_code <dbl>,
## #   utagai_flg <dbl>, icd10_major_code_1 <chr>, icd10_major_name_1 <chr>,
## #   icd10_medium_code_1 <chr>, icd10_medium_name_1 <chr>,
## #   icd10_sub_code_1 <chr>, icd10_sub_name_1 <chr>, icd10_subdiv_code_1 <chr>,
## #   icd10_subdiv_name_1 <chr>
colnames(receipt_disease_joined)
##  [1] "kojin_id"              "receipt_ym.x"          "receipt_id"           
##  [4] "receipt_shubetsu_code" "nyugai_kbn_code"       "nyuin_ymd"            
##  [7] "taiin_ymd"             "kojin_id.y"            "receipt_ym.y"         
## [10] "line_no"               "serial_no"             "diseases_code"        
## [13] "utagai_flg"            "icd10_major_code_1"    "icd10_major_name_1"   
## [16] "icd10_medium_code_1"   "icd10_medium_name_1"   "icd10_sub_code_1"     
## [19] "icd10_sub_name_1"      "icd10_subdiv_code_1"   "icd10_subdiv_name_1"
filtered_df <- receipt_disease_joined %>%
  filter(
    grepl("S72", icd10_major_code_1) |
      grepl("S72", icd10_medium_code_1) |
      grepl("S72", icd10_sub_code_1) |
      grepl("S72", icd10_subdiv_code_1))
## filter: removed 199,411 rows (87%), 31,099 rows remaining
filter_unique_disease<- length(unique(filtered_df$kojin_id))
print(filter_unique_disease)
## [1] 2465
## やはり、全ての患者が"S72"を含む。

##方針
##receipt_disease_joined は病名と、その登録日の情報あり
##→「大腿骨骨折」の罹患日と個人IDのデータ
##exam_interviewは健康診断データ
##→喫煙と、やせ/肥満のデータ
##receipt_drug_joinedは処方薬剤と処方日のデータ

# 1-1 喫煙者の抽出
summary(exam_interview$kitsuen_code)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   1.873   2.000   2.000
# 1-2 痩身、肥満の抽出
summary(exam_interview$bmi)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.80   18.70   20.85   21.63   23.40   41.10       1
exam_interview <- exam_interview %>%
  mutate(thinness = if_else(is.na(bmi), NA_real_, ifelse(bmi < 18.5, 1, 0)))
## mutate: new variable 'thinness' (double) with 3 unique values and <1% NA
summary(exam_interview$thinness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.2217  0.0000  1.0000       1
exam_interview <- exam_interview %>%
  mutate(obesity = if_else(is.na(bmi), NA_real_, ifelse(bmi >= 25, 1, 0)))
## mutate: new variable 'obesity' (double) with 3 unique values and <1% NA
summary(exam_interview$obesity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.1708  0.0000  1.0000       1
# 1-3 ステロイドユーザーの抽出
### ステロイドの使用状況を細かく分析する
### レセ電算コードから抽出していく。
table(steroid$レセプト電算処理システムコード.1.)
## 
## 610408661 610431117 610454071 612450070 612450118 620000694 620000695 620000696 
##         6         3         1         1         3         3         3         1 
## 620000697 620001894 620001895 620002513 620004294 620004387 620004578 620005125 
##         4         1         1         1         5         3         2         1 
## 620005126 620005133 620005134 620005848 620006161 620006162 620006903 620006985 
##         1         1         1         4         1         3         2         1 
## 620006986 620007078 620008651 620009010 620009011 620521501 620530701 620530801 
##         2         2         1         2         2         1         6         2 
## 620531001 620531101 620531301 620531401 620531501 620531601 621559301 621997701 
##         4         2         2         3         4         2         3         1 
## 622359901 662450002 662450003 
##         2         1         1
### steroid内のレセ電算コードの内容を含めば1、そうでなければ0を出力する、"steroid_use"という変数を作る
receipt_drug_joined$steroid_use <- 
  ifelse(receipt_drug_joined$drug_code %in% steroid$レセプト電算処理システムコード.1., 1, 0)
summary(receipt_drug_joined$steroid_use)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0424  0.0000  1.0000
count_steroid_user <- sum(receipt_drug_joined$steroid_use == 1)
print(count_steroid_user)
## [1] 13067
### ステロイド使用者、として新しい変数を作る
df_steroid_user<-receipt_drug_joined %>%
  group_by(kojin_id) %>%
  summarize(steroid_user = max(steroid_use)) %>%
  ungroup()
## group_by: one grouping variable (kojin_id)
## summarize: now 2,465 rows and 2 columns, ungrouped
## ungroup: no grouping variables
sum(df_steroid_user$steroid_user)
## [1] 553
### 2465人のうち、ステロイドの内服を使っていた者の数は553人だった。


# 1-4 骨粗鬆症の薬使ってるひとの抽出
colnames(osteo)
## [1] "レセプト電算処理システムコード" "告示名称"
receipt_drug_joined$osteo_use <- 
  ifelse(receipt_drug_joined$drug_code %in% osteo$レセプト電算処理システムコード, 1, 0)
summary(receipt_drug_joined$osteo_use)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1595  0.0000  1.0000
count_osteo_user <- sum(receipt_drug_joined$osteo_use == 1)
print(count_osteo_user)
## [1] 49169
### 骨粗鬆症治療薬使用者、として新しい変数を作る
df_osteo_user<-receipt_drug_joined %>%
  group_by(kojin_id) %>%
  summarize(osteo_user = max(osteo_use)) %>%
  ungroup()
## group_by: one grouping variable (kojin_id)
## summarize: now 2,465 rows and 2 columns, ungrouped
## ungroup: no grouping variables
sum(df_osteo_user$osteo_user)
## [1] 1095
### 2465人のうち、骨粗鬆症治療薬を内服していたのは1095人