腎臓内科

#0-1 必要パッケージ読みこみ
pacman::p_load(tidyverse , RSQLite , lubridate, tableone, skimr,rms,ggplot2,
               gtsummary,summarytools,naniar,mice,readr,zoo)

# 0-2 Load the data
# Empty strings, a single full-width space, and the literal "NA" are all read
# as missing. `na.strings` is spelled out in full rather than relying on
# partial argument matching of `na`.
df <- read.csv(
  "尿酸_中村.csv",
  na.strings = c("", "　", "NA"),
  fileEncoding = "Shift-JIS"
)

# 0-3 Data editing
## Blank patient IDs and years are filled in further below; first inspect
## the column names.
colnames(df)

##  [1] "患者ID"         "年"             "月"             "アルブミン"    
##  [5] "クレアチニン"   "eGFR"           "蛋白"           "尿クレアチニン"
##  [9] "尿酸"           "尿蛋白"         "総計"

# Preview the first 10 rows of the raw data.
head(df, 10)

##    患者ID     年   月 アルブミン クレアチニン eGFR 蛋白 尿クレアチニン 尿酸
## 1     265 2020年  2月         NA           NA   NA    0           43.0  7.4
## 2     299 2016年 11月        4.1         0.65 69.3    0             NA  3.1
## 3    1362 2019年  1月        2.0         3.07 12.0    2           32.3  6.4
## 4      NA   <NA>  2月        2.2         3.96 11.7    2           84.8  7.2
## 5      NA   <NA>  3月        2.1         4.56 10.2   NA             NA  5.4
## 6    1677 2018年 12月        3.6        10.89  4.3   NA             NA  8.1
## 7      NA 2019年  1月        3.5        11.44  3.8   NA             NA  6.5
## 8    2188 2015年  8月        4.1         0.63 76.1    0           39.8  4.1
## 9      NA 2016年  2月        4.3         0.62 77.4    0           54.7  4.0
## 10     NA   <NA>  8月        4.2         0.62 77.0    0           79.4  3.8
##    尿蛋白   総計
## 1      12  43.00
## 2      NA  69.30
## 3     254 254.00
## 4     571 571.00
## 5      NA  10.20
## 6      NA  10.89
## 7      NA  11.44
## 8      NA  76.10
## 9      NA  77.40
## 10     NA  79.40

# Carry the last observed patient ID and year forward into the blank cells.
# na.rm = FALSE keeps the result the same length as the column: the default
# (na.rm = TRUE) drops leading NAs, which would make the assignment fail or
# misalign if the first row were blank.
df$患者ID <- na.locf(df$患者ID, na.rm = FALSE)
df$年 <- na.locf(df$年, na.rm = FALSE)

# Number of distinct patients in the raw data.
length(unique(df$患者ID))

## [1] 13319

# 0-4 Exclude rows where any of eGFR, creatinine, or uric acid is missing.
# All three must be present for a row to be kept. The previous version joined
# the conditions with `|`, which kept rows having at least one of the three
# values, so missing values survived into the later data sets.
# NOTE(review): counts and summaries recorded further down were produced with
# the old condition and need to be regenerated.
df2 <- df %>%
  filter(
    !is.na(eGFR),
    !is.na(クレアチニン),
    !is.na(尿酸)
  )

# Number of distinct patients remaining after the exclusion.
length(unique(df2$患者ID))

## [1] 12573

# df2は、eGFR、クレアチニン、尿酸の完全症例のみのデータ

# 0-5 Exclude patients with only a single blood draw.
# Patients that have exactly one row in df2.
unique_patient_ids <- df2 %>%
  count(患者ID, name = "count") %>%
  filter(count == 1) %>%
  select(患者ID)

# Keep every row belonging to a patient with two or more rows.
df3 <- df2 %>%
  filter(!(患者ID %in% unique_patient_ids$患者ID))

# Number of distinct patients remaining after the exclusion. This must be
# computed on df3; counting df2 here just repeats the pre-exclusion number.
length(unique(df3$患者ID))

## [1] 12573

## df3は、2回以上採血をしている者だけのデータとなった。


# 0-6 Combine year and month into a single year-month value
# Strip the trailing "年" / "月" suffix from the two columns.
df3 <- df3 %>%
  mutate(
    年 = sub("年$", "", 年),
    月 = sub("月$", "", 月)
  )

# Text key of the form "YYYY-MM", with the month zero-padded to two digits.
df4 <- df3 %>%
  mutate(年月 = sprintf("%s-%02d", 年, as.integer(月)))
# 0-7 Screen patients on the gap between consecutive tests
# Turn 年月 into a Date (first day of the month) so gaps can be computed.
df4 <- df4 %>%
  mutate(年月 = as.Date(sprintf("%s-%02d-01", 年, as.integer(月))))

# Per patient: the smallest gap between consecutive tests, expressed in
# months as days / 30. The last test of each patient has no following test,
# so its gap is NA and is dropped by na.rm = TRUE.
df4_grouped <- df4 %>%
  arrange(患者ID, 年月) %>%
  group_by(患者ID) %>%
  mutate(
    interval_months =
      as.numeric(difftime(lead(年月), 年月, units = "days")) / 30
  ) %>%
  summarise(min_interval_months = min(interval_months, na.rm = TRUE))

# Patients whose smallest gap is under 15 months.
# NOTE(review): this keeps a patient as soon as ONE gap is < 15 months; it
# does not drop patients who also have a gap of 15 months or more. If the
# intent is "no gap of 15 months or more", the maximum gap should be tested
# instead -- confirm with the study protocol.
valid_patient_ids_min <- df4_grouped %>%
  filter(min_interval_months < 15) %>%
  select(患者ID)

df5 <- filter(df4, 患者ID %in% valid_patient_ids_min$患者ID)

# 0-8 Keep only patients whose first and last tests are at least 10 months
# apart
# Per patient: first and last test month, and the span between them in
# months (days / 30).
df5_grouped <- df5 %>%
  arrange(患者ID, 年月) %>%
  group_by(患者ID) %>%
  summarise(
    first_year_month = min(年月),
    last_year_month = max(年月)
  ) %>%
  mutate(
    interval_months =
      as.numeric(last_year_month - first_year_month, units = "days") / 30
  )

# Patients whose first-to-last span is 10 months or longer.
valid_patient_ids_max <- df5_grouped %>%
  filter(interval_months >= 10) %>%
  select(患者ID)

df6 <- filter(df5, 患者ID %in% valid_patient_ids_max$患者ID)

### df6 holds the patients that passed every filter above: the missing-value
### exclusion, the two-or-more-tests rule, and the two interval screens.
n_distinct(df6$患者ID)

## [1] 4931

# Summary statistics and missingness for every column of the cleaned data.
skimr::skim(df6)

Data summary
Name	df6
Number of rows	104500
Number of columns	12
_______________________
Column type frequency:
character	2
Date	1
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
年	0	1	4	4	0	8	0
月	0	1	1	2	0	12	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
年月	0	1	2014-01-01	2021-12-01	2018-01-01	96

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
患者ID	0	1.00	4958117.80	2708279.05	2188.00	2560670.00	5452495.00	7372634.00	10561967.00	▅▅▅▇▁
アルブミン	4486	0.96	4.00	1.27	0.60	3.70	4.00	4.30	82.10	▇▁▁▁▁
クレアチニン	630	0.99	1.71	1.62	0.31	0.83	1.15	1.87	25.52	▇▁▁▁▁
eGFR	719	0.99	48.43	26.87	2.30	26.90	46.60	67.50	806.20	▇▁▁▁▁
蛋白	6365	0.94	1.01	0.91	0.00	0.00	1.00	2.00	2.00	▇▁▃▁▇
尿クレアチニン	36028	0.66	87.50	65.45	0.50	41.60	70.80	114.10	734.00	▇▁▁▁▁
尿酸	1907	0.98	5.95	1.49	0.40	5.00	5.90	6.80	21.30	▂▇▁▁▁
尿蛋白	48849	0.53	110.20	286.79	2.00	10.00	31.00	112.00	18989.00	▇▁▁▁▁
総計	0	1.00	114.04	210.79	2.80	47.20	73.60	118.10	18989.00	▇▁▁▁▁

### 対象は4931人
### write.csv(df6, "尿酸研究対象患者クリーニング後_20231003.csv",  fileEncoding = "SHIFT-JIS",row.names = FALSE)

腎臓内科

eishin

2023-10-03