library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(table1)
##
## Attaching package: 'table1'
##
## The following objects are masked from 'package:base':
##
## units, units<-
## Importing the original dataset
# dat <- read.csv("C:/Users/Momo/Desktop/Post-doc/National Institute of Biomedical Innovation - Health and Nutrition/01.csv")
dat <- read.csv("~/Desktop/R-dir/R studying/01.csv")
## Identify duplicated rows
duplicated_rows <- duplicated(dat[, "id"], fromLast = FALSE)
## Check number of duplicates
n_duplicates <- sum(duplicated_rows)
index_of_duplicated_rows <- which(duplicated_rows)
list_of_duplicated_rows <- dat[index_of_duplicated_rows, ]
cat("Number of duplicates:", n_duplicates, "\n") # --> Number of duplicates: 2
## Number of duplicates: 2
## Remove duplicated rows
data_df_unique <- dat[!duplicated_rows, ]
## Check number of unique rows
n_unique_rows <- nrow(data_df_unique)
cat("Number of unique rows:", n_unique_rows, "\n")
## Number of unique rows: 7670
dat2 = data_df_unique
dat2$target_st = 0
dat2$target_st[dat2$dx_st == 1] = 1
table1(~ dx_st + factor(target_st), dat2)
| Overall (N=7670) |
|
|---|---|
| dx_st | |
| Mean (SD) | 0.0593 (0.236) |
| Median [Min, Max] | 0 [0, 1.00] |
| Missing | 279 (3.6%) |
| factor(target_st) | |
| 0 | 7232 (94.3%) |
| 1 | 438 (5.7%) |
data_df_s02 <- dat2[!is.na(dat2$dx_st), ]
table(data_df_s02$dx_st)
##
## 0 1
## 6953 438
table(data_df_s02$target_st)
##
## 0 1
## 6953 438
dx_cols = colnames(data_df_s02)[startsWith(colnames(data_df_s02), "dx_")]
dx_cols
## [1] "dx_ht" "dx_tia" "dx_bd" "dx_chf" "dx_af" "dx_ar" "dx_hl"
## [8] "dx_dm" "dx_ckd" "dx_li" "dx_ane" "dx_hua" "dx_ulc" "dx_etc1"
## [15] "dx_mi" "dx_ihd" "dx_sd" "dx_int" "dx_pmi" "dx_st" "dx_ich"
## [22] "dx_ci" "dx_sah" "dx_cvd"
data_df_s03 = data_df_s02
data_df_s03$BMI = data_df_s03$weight/((data_df_s03$height/100)^2)
dat3drop <- data_df_s03 %>% select(-c(weight, height))
data_df_s04 = dat3drop
data_df_s04$sbp = (data_df_s04$sbp1 + data_df_s04$sbp2)/2
data_df_s04$dbp = (data_df_s04$dbp1 + data_df_s04$dbp2)/2
dat4drop <- data_df_s04 %>% select(-c(sbp1, sbp2, dbp1, dbp2))
#dat$pulse = (dat$pulse1 + dat$pulse2)/2
Sokytu_or_groups <- unique(dat4drop$upr)
n_groups <- length(Sokytu_or_groups)
any_missing_values <- any(is.na(dat4drop$upr))
table(dat4drop$upr)
##
## - + +- 2+ 3+ 5
## 50 6025 303 901 83 28 1
u_cols <- c("upr")
# use convert dic version 2
conv_dic <- list(
`+` = "1",
`-` = "-1",
`-1` = "-1",
`±` = "0",
`+-` = "0",
`0` = "-1",
`1` = "1",
`2` = "2",
`3` = "3",
`4` = "4",
`5` = "5",
`1+` = "1",
`2+` = "2",
`3+` = "3",
`4+` = "4",
`5+` = "5"
)
data_df_s05 <- dat4drop
cat(paste0("before_nan:\n", sum(is.na(data_df_s05[, u_cols]))))
## before_nan:
## 0
for (cl in u_cols) {
# (missing value was replaced into mode)
#data_df_s05[, cl] <- ifelse(is.na(data_df_s05[, cl]), mode(data_df_s05[, cl], na.rm = TRUE), data_df_s05[, cl])
data_df_s05[, cl] <- as.numeric(factor(data_df_s05[, cl], levels = names(conv_dic), labels = unlist(conv_dic)))
cat(paste0("cl: ", cl, "\nunique: ", unique(data_df_s05[, cl]), "\n"))
print(table(data_df_s05[, cl]))
}
## cl: upr
## unique: 2
## cl: upr
## unique: 3
## cl: upr
## unique: 1
## cl: upr
## unique: 4
## cl: upr
## unique: NA
## cl: upr
## unique: 5
## cl: upr
## unique: 7
##
## 1 2 3 4 5 7
## 303 6025 901 83 28 1
cat(paste0("clean: ", dim(data_df_s05)[1], "\n"))
## clean: 7391
cat(paste0("after_nan:\n", colSums(is.na(as.matrix(data_df_s05[, u_cols])))))
## after_nan:
## 50
# non_hdl = tc - hdl
data_df_s06 = data_df_s05
data_df_s06$non_hdl = data_df_s06$tc - data_df_s06$hdl
summary(data_df_s06$non_hdl)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 22.0 127.0 150.0 152.6 176.0 403.0 31
# eGFR formula ref. :JMAJ 54(6):403-405, 2011
# Delete "crea" col.
# eGFR - CKD-EPI
data_df_s07 = data_df_s06 %>%
mutate(eGFR = case_when(
sex == 2 & crea <= 0.7 ~ 144 * ((crea/0.7)^-0.329 * 0.993^age) * 0.813,
sex == 2 & crea > 0.7 ~ 144 * (crea/0.7)^-1.209 * 0.993^age * 0.813,
sex == 1 & crea <= 0.9 ~ 141 * ((crea/0.9)^-0.411 * 0.993^age)*0.813,
sex == 1 & crea > 0.9 ~ 141 * ((crea/0.9)^-1.209* 0.993^age)*0.813))
dat7drop <- data_df_s07 %>% select(-crea)
summary(data_df_s07$eGFR)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.354 69.128 78.997 77.805 87.911 179.145 2
data_df_s09 <- dat7drop
mine_col <- paste0("mine", 1:5)
af_code <- c(831)
lvh_code <- c(411, 412, 42, 43, 51, 52, 53)
code_dic <- list(mine_af = af_code, mine_lvh = lvh_code)
for (na in names(code_dic)) {
tmp_df <- data_df_s09[, mine_col]
for (m_cl in mine_col) {
tmp_df[, m_cl] <- tmp_df[, m_cl] %in% code_dic[[na]]
}
tmp_col <- rowSums(tmp_df)
tmp_col <- as.integer(tmp_col > 0)
data_df_s09[[na]] <- tmp_col
}
data_df_s09 <- data_df_s09[, !names(data_df_s09) %in% mine_col]
cat("clean: ", dim(data_df_s09), "\n")
## clean: 7391 165
cat("nan_num: ", sum(is.na(data_df_s09[c("mine_af", "mine_lvh")])), "\n")
## nan_num: 0
cat("mine_af:\n", table(data_df_s09$mine_af), "\n")
## mine_af:
## 7349 42
cat("mine_lvh:\n", table(data_df_s09$mine_lvh), "\n")
## mine_lvh:
## 6964 427
#Those with a flag standing for one of them are merged together into arith12.
data_df_s10 <- data_df_s09
# Define the arrhythmias columns
arith_col <- c("arith1", "arith2")
arith_col <- c("arith1", "arith2")
# Print the data before the conversion
cat("******* before ********\n")
## ******* before ********
cat(sprintf("shape: %s\n", dim(data_df_s10)))
## shape: 7391
## shape: 165
cat(sprintf("nan_num:\n%s\n", colSums(is.na(data_df_s10[, arith_col]))))
## nan_num:
## 602
## nan_num:
## 673
cat(sprintf("arith1:\n%s\n", table(data_df_s10$arith1)))
## arith1:
## 6588
## arith1:
## 201
cat(sprintf("arith2:\n%s\n", table(data_df_s10$arith2)))
## arith2:
## 6518
## arith2:
## 200
# arith1, arith2: covert 1, 2 -> 0, 1. (missing value was replaced into mode.)
for (cl in arith_col) {
data_df_s10[, cl] <- data_df_s10[, cl] - 1
#data_df_s10[, cl] <- ifelse(is.na(data_df_s10[, cl]), mode(data_df_s10[, cl], na.rm = TRUE), data_df_s10[, cl])
}
# merge arith1 and arith2 and put merge data into new columns "arith12".
data_df_s10$arith12 <- data_df_s10$arith1 + data_df_s10$arith2
data_df_s10$arith12[data_df_s10$arith12 > 0] <- 1
data_df_s10 <- data_df_s10[, !names(data_df_s10) %in% arith_col]
cat("************** after *************\n")
## ************** after *************
cat(sprintf("clean: %s\n", dim(data_df_s10)))
## clean: 7391
## clean: 164
cat(sprintf("nan_num: %s\n", sum(is.na(data_df_s10$arith12))))
## nan_num: 691
cat(sprintf("arith12:\n%s\n", table(data_df_s10$arith12)))
## arith12:
## 6476
## arith12:
## 224
head(data_df_s10[, "arith12"], 15)
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
data_df_s11 <- data_df_s10 %>% select(-ldl)
data_df_s12 = data_df_s11
# Check rate of missing values.
cat("******** before *********\n")
## ******** before *********
cat(sprintf("shape: %s\n", dim(data_df_s12)))
## shape: 7391
## shape: 163
cat(sprintf("nan_num: %s\n", sum(is.na(data_df_s12$murmur))))
## nan_num: 55
cat(sprintf("murmur:\n%s\n", table(data_df_s12$murmur)))
## murmur:
## 180
## murmur:
## 7156
# (Missing values are replaced by mode values)
#data_df_s12$murmur <- ifelse(is.na(data_df_s12$murmur), mode(data_df_s12$murmur, na.rm = TRUE), data_df_s12$murmur)
# Murmur (pathological heart murmur) conversion 2 -> 0, 1 -> 1
# so murmur = 1 (Yes), =0 (No)
data_df_s12$murmur[data_df_s12$murmur == 2] <- 0
cat("******** after *********\n")
## ******** after *********
cat(sprintf("shape: %s\n", dim(data_df_s12)))
## shape: 7391
## shape: 163
cat(sprintf("nan_num: %s\n", sum(is.na(data_df_s12$murmur))))
## nan_num: 55
cat(sprintf("murmur:\n%s\n", table(data_df_s12$murmur)))
## murmur:
## 7156
## murmur:
## 180