library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.4.0     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(table1)

## 
## Attaching package: 'table1'
## 
## The following objects are masked from 'package:base':
## 
##     units, units<-

1. Step1 : Delete duplicated id

## Importing the original dataset
# dat <- read.csv("C:/Users/Momo/Desktop/Post-doc/National Institute of Biomedical Innovation - Health and Nutrition/01.csv")
dat <- read.csv("~/Desktop/R-dir/R studying/01.csv")
## Identify duplicated rows
duplicated_rows <- duplicated(dat[, "id"], fromLast = FALSE)

## Check number of duplicates
n_duplicates <- sum(duplicated_rows)
index_of_duplicated_rows <- which(duplicated_rows)
list_of_duplicated_rows <- dat[index_of_duplicated_rows, ]

cat("Number of duplicates:", n_duplicates, "\n") # --> Number of duplicates: 2

## Number of duplicates: 2

## Remove duplicated rows
data_df_unique <- dat[!duplicated_rows, ]

## Check number of unique rows
n_unique_rows <- nrow(data_df_unique)
cat("Number of unique rows:", n_unique_rows, "\n")

## Number of unique rows: 7670

2. Step 2: Create target_st

dat2 = data_df_unique
dat2$target_st = 0
dat2$target_st[dat2$dx_st == 1] = 1


table1(~ dx_st + factor(target_st), dat2)

	Overall (N=7670)
dx_st
Mean (SD)	0.0593 (0.236)
Median [Min, Max]	0 [0, 1.00]
Missing	279 (3.6%)
factor(target_st)
0	7232 (94.3%)
1	438 (5.7%)

data_df_s02 <- dat2[!is.na(dat2$dx_st), ]

table(data_df_s02$dx_st)

## 
##    0    1 
## 6953  438

table(data_df_s02$target_st)

## 
##    0    1 
## 6953  438

Check “dx_” series data

dx_cols = colnames(data_df_s02)[startsWith(colnames(data_df_s02), "dx_")]
dx_cols

##  [1] "dx_ht"   "dx_tia"  "dx_bd"   "dx_chf"  "dx_af"   "dx_ar"   "dx_hl"  
##  [8] "dx_dm"   "dx_ckd"  "dx_li"   "dx_ane"  "dx_hua"  "dx_ulc"  "dx_etc1"
## [15] "dx_mi"   "dx_ihd"  "dx_sd"   "dx_int"  "dx_pmi"  "dx_st"   "dx_ich" 
## [22] "dx_ci"   "dx_sah"  "dx_cvd"

3. Step 3: Create BMI and delete height and weight

data_df_s03 = data_df_s02

data_df_s03$BMI = data_df_s03$weight/((data_df_s03$height/100)^2)

dat3drop <- data_df_s03 %>% select(-c(weight, height))

4. Step 4: Calculate mean of sbp and dbp.

data_df_s04 = dat3drop
data_df_s04$sbp = (data_df_s04$sbp1 + data_df_s04$sbp2)/2
data_df_s04$dbp = (data_df_s04$dbp1 + data_df_s04$dbp2)/2

dat4drop <- data_df_s04 %>% select(-c(sbp1, sbp2, dbp1, dbp2))

#dat$pulse = (dat$pulse1 + dat$pulse2)/2

5. Step 5: Proteinuria (upr)

Sokytu_or_groups <- unique(dat4drop$upr)
n_groups <- length(Sokytu_or_groups)
any_missing_values <- any(is.na(dat4drop$upr))

table(dat4drop$upr)

## 
##         -    +   +-   2+   3+    5 
##   50 6025  303  901   83   28    1

u_cols <- c("upr")


# use convert dic version 2
conv_dic <- list(
  `+` = "1",
  `-` = "-1",
  `-1` = "-1",
  `±` = "0",
  `+-` = "0",
  `0` = "-1",
  `1` = "1",
  `2` = "2",
  `3` = "3",
  `4` = "4",
  `5` = "5",
  `1+` = "1",
  `2+` = "2",
  `3+` = "3",
  `4+` = "4",
  `5+` = "5"
)

data_df_s05 <- dat4drop
cat(paste0("before_nan:\n", sum(is.na(data_df_s05[, u_cols]))))

## before_nan:
## 0

for (cl in u_cols) {
  # (missing value was replaced into mode)
  #data_df_s05[, cl] <- ifelse(is.na(data_df_s05[, cl]), mode(data_df_s05[, cl], na.rm = TRUE), data_df_s05[, cl])
  data_df_s05[, cl] <- as.numeric(factor(data_df_s05[, cl], levels = names(conv_dic), labels = unlist(conv_dic)))
  cat(paste0("cl: ", cl, "\nunique: ", unique(data_df_s05[, cl]), "\n"))
  print(table(data_df_s05[, cl]))
}

## cl: upr
## unique: 2
##  cl: upr
## unique: 3
##  cl: upr
## unique: 1
##  cl: upr
## unique: 4
##  cl: upr
## unique: NA
##  cl: upr
## unique: 5
##  cl: upr
## unique: 7
## 
##    1    2    3    4    5    7 
##  303 6025  901   83   28    1

cat(paste0("clean: ", dim(data_df_s05)[1], "\n"))

## clean: 7391

cat(paste0("after_nan:\n", colSums(is.na(as.matrix(data_df_s05[, u_cols])))))

## after_nan:
## 50

6. Step 6: Calculate non_hdl

# non_hdl = tc - hdl
data_df_s06 = data_df_s05
data_df_s06$non_hdl = data_df_s06$tc - data_df_s06$hdl
summary(data_df_s06$non_hdl)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    22.0   127.0   150.0   152.6   176.0   403.0      31

7. Step7 Calculate eGFR using “crea” value

# eGFR formula ref. :JMAJ 54(6):403-405, 2011
# Delete "crea" col.
# eGFR - CKD-EPI
data_df_s07 = data_df_s06 %>%
  mutate(eGFR = case_when(
    sex == 2 & crea <= 0.7 ~ 144 * ((crea/0.7)^-0.329 * 0.993^age) * 0.813,
    sex == 2 & crea > 0.7 ~ 144 * (crea/0.7)^-1.209 * 0.993^age * 0.813,
    sex == 1 & crea <= 0.9 ~ 141 * ((crea/0.9)^-0.411 * 0.993^age)*0.813,
    sex == 1 & crea > 0.9 ~ 141 * ((crea/0.9)^-1.209* 0.993^age)*0.813))

dat7drop <- data_df_s07 %>% select(-crea)

summary(data_df_s07$eGFR)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   4.354  69.128  78.997  77.805  87.911 179.145       2

8. Step 8: Conver mine 1- 5 (minesota code) into mine_af & mine_lvh

data_df_s09 <- dat7drop

mine_col <- paste0("mine", 1:5)
af_code <- c(831)
lvh_code <- c(411, 412, 42, 43, 51, 52, 53)
code_dic <- list(mine_af = af_code, mine_lvh = lvh_code)

for (na in names(code_dic)) {
  tmp_df <- data_df_s09[, mine_col]

  for (m_cl in mine_col) {
    tmp_df[, m_cl] <- tmp_df[, m_cl] %in% code_dic[[na]]
  }
  
  tmp_col <- rowSums(tmp_df)
  tmp_col <- as.integer(tmp_col > 0)
  data_df_s09[[na]] <- tmp_col
}

data_df_s09 <- data_df_s09[, !names(data_df_s09) %in% mine_col]

cat("clean: ", dim(data_df_s09), "\n")

## clean:  7391 165

cat("nan_num: ", sum(is.na(data_df_s09[c("mine_af", "mine_lvh")])), "\n")

## nan_num:  0

cat("mine_af:\n", table(data_df_s09$mine_af), "\n")

## mine_af:
##  7349 42

cat("mine_lvh:\n", table(data_df_s09$mine_lvh), "\n")

## mine_lvh:
##  6964 427

10. Step10: Conversion of arrhythmias (arith1, arith2).

#Those with a flag standing for one of them are merged together into arith12.

data_df_s10 <- data_df_s09
# Define the arrhythmias columns
arith_col <- c("arith1", "arith2")
arith_col <- c("arith1", "arith2")

# Print the data before the conversion
cat("******* before ********\n")

## ******* before ********

cat(sprintf("shape: %s\n", dim(data_df_s10)))

## shape: 7391
##  shape: 165

cat(sprintf("nan_num:\n%s\n", colSums(is.na(data_df_s10[, arith_col]))))

## nan_num:
## 602
##  nan_num:
## 673

cat(sprintf("arith1:\n%s\n", table(data_df_s10$arith1)))

## arith1:
## 6588
##  arith1:
## 201

cat(sprintf("arith2:\n%s\n", table(data_df_s10$arith2)))

## arith2:
## 6518
##  arith2:
## 200

# arith1, arith2: covert 1, 2 -> 0, 1. (missing value was replaced into mode.)
for (cl in arith_col) {
  data_df_s10[, cl] <- data_df_s10[, cl] - 1
  #data_df_s10[, cl] <- ifelse(is.na(data_df_s10[, cl]), mode(data_df_s10[, cl], na.rm = TRUE), data_df_s10[, cl])
}

# merge arith1 and arith2 and put merge data into new columns "arith12".
data_df_s10$arith12 <- data_df_s10$arith1 + data_df_s10$arith2
data_df_s10$arith12[data_df_s10$arith12 > 0] <- 1
data_df_s10 <- data_df_s10[, !names(data_df_s10) %in% arith_col]

cat("************** after *************\n")

## ************** after *************

cat(sprintf("clean: %s\n", dim(data_df_s10)))

## clean: 7391
##  clean: 164

cat(sprintf("nan_num: %s\n", sum(is.na(data_df_s10$arith12))))

## nan_num: 691

cat(sprintf("arith12:\n%s\n", table(data_df_s10$arith12)))

## arith12:
## 6476
##  arith12:
## 224

head(data_df_s10[, "arith12"], 15)

##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA

11. Step11: Delete “ldl” columns

data_df_s11 <- data_df_s10 %>% select(-ldl)

12. Step12: Murmur (pathological heart murmur) conversion

data_df_s12 = data_df_s11
# Check rate of missing values.
cat("******** before *********\n")

## ******** before *********

cat(sprintf("shape: %s\n", dim(data_df_s12)))

## shape: 7391
##  shape: 163

cat(sprintf("nan_num: %s\n", sum(is.na(data_df_s12$murmur))))

## nan_num: 55

cat(sprintf("murmur:\n%s\n", table(data_df_s12$murmur)))

## murmur:
## 180
##  murmur:
## 7156

# (Missing values are replaced by mode values)
#data_df_s12$murmur <- ifelse(is.na(data_df_s12$murmur), mode(data_df_s12$murmur, na.rm = TRUE), data_df_s12$murmur)

# Murmur (pathological heart murmur) conversion 2 -> 0, 1 -> 1
# so murmur = 1 (Yes), =0 (No)
data_df_s12$murmur[data_df_s12$murmur == 2] <- 0

cat("******** after *********\n")

## ******** after *********

cat(sprintf("shape: %s\n", dim(data_df_s12)))

## shape: 7391
##  shape: 163

cat(sprintf("nan_num: %s\n", sum(is.na(data_df_s12$murmur))))

## nan_num: 55

cat(sprintf("murmur:\n%s\n", table(data_df_s12$murmur)))

## murmur:
## 7156
##  murmur:
## 180

Preprocessing

Vu Thien

2023-02-14

1. Step1 : Delete duplicated id

2. Step 2: Create target_st

Check “dx_” series data

3. Step 3: Create BMI and delete height and weight

4. Step 4: Calculate mean of sbp and dbp.

5. Step 5: Proteinuria (upr)

6. Step 6: Calculate non_hdl

7. Step7 Calculate eGFR using “crea” value

8. Step 8: Conver mine 1- 5 (minesota code) into mine_af & mine_lvh

10. Step10: Conversion of arrhythmias (arith1, arith2).

11. Step11: Delete “ldl” columns

12. Step12: Murmur (pathological heart murmur) conversion