library(readxl)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(ggplot2)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
df <- read_excel("~/Desktop/Misc/Clean data.xlsx")
table1(~ age + sex + bmi + sbp_mean + dbp_mean + tob + exer + kaidan + bs + non_hdl + hyper +
diabetes_over_boundary + MetS|target_st, df)
## Warning in table1.formula(~age + sex + bmi + sbp_mean + dbp_mean + tob + : Terms
## to the right of '|' in formula 'x' define table columns and are expected to be
## factors with meaningful labels.
| 0 (N=6951) |
1 (N=438) |
Overall (N=7389) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 54.6 (13.1) | 64.4 (9.78) | 55.1 (13.1) |
| Median [Min, Max] | 55.0 [30.0, 84.0] | 66.0 [32.0, 80.0] | 56.0 [30.0, 84.0] |
| sex | |||
| Mean (SD) | 0.548 (0.498) | 0.466 (0.499) | 0.543 (0.498) |
| Median [Min, Max] | 1.00 [0, 1.00] | 0 [0, 1.00] | 1.00 [0, 1.00] |
| bmi | |||
| Mean (SD) | 22.5 (3.00) | 23.1 (3.21) | 22.5 (3.01) |
| Median [Min, Max] | 22.3 [14.2, 30.5] | 23.0 [14.2, 30.5] | 22.3 [14.2, 30.5] |
| sbp_mean | |||
| Mean (SD) | 126 (20.5) | 138 (21.9) | 126 (20.8) |
| Median [Min, Max] | 123 [78.0, 180] | 137 [86.0, 180] | 124 [78.0, 180] |
| dbp_mean | |||
| Mean (SD) | 77.4 (11.7) | 81.0 (12.5) | 77.6 (11.8) |
| Median [Min, Max] | 77.0 [47.5, 108] | 81.0 [47.5, 108] | 77.0 [47.5, 108] |
| tob | |||
| Mean (SD) | 2.27 (0.879) | 2.16 (0.882) | 2.26 (0.879) |
| Median [Min, Max] | 3.00 [1.00, 3.00] | 2.00 [1.00, 3.00] | 3.00 [1.00, 3.00] |
| exer | |||
| Mean (SD) | 1.61 (0.488) | 1.60 (0.491) | 1.61 (0.488) |
| Median [Min, Max] | 2.00 [1.00, 2.00] | 2.00 [1.00, 2.00] | 2.00 [1.00, 2.00] |
| kaidan | |||
| Mean (SD) | 3.35 (1.26) | 3.39 (1.27) | 3.35 (1.26) |
| Median [Min, Max] | 3.00 [1.00, 5.00] | 3.00 [1.00, 5.00] | 3.00 [1.00, 5.00] |
| bs | |||
| Mean (SD) | 96.1 (9.78) | 100 (11.0) | 96.4 (9.90) |
| Median [Min, Max] | 95.0 [72.0, 120] | 98.0 [72.0, 120] | 95.0 [72.0, 120] |
| non_hdl | |||
| Mean (SD) | 152 (36.1) | 159 (36.1) | 152 (36.1) |
| Median [Min, Max] | 150 [53.5, 250] | 155 [78.0, 250] | 150 [53.5, 250] |
| hyper | |||
| Mean (SD) | 0.295 (0.456) | 0.550 (0.498) | 0.311 (0.463) |
| Median [Min, Max] | 0 [0, 1.00] | 1.00 [0, 1.00] | 0 [0, 1.00] |
| diabetes_over_boundary | |||
| Mean (SD) | 0.115 (0.319) | 0.228 (0.420) | 0.122 (0.327) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| MetS | |||
| Mean (SD) | 0.234 (0.424) | 0.413 (0.493) | 0.245 (0.430) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
df2 <- df %>% rename(stroke = target_st, sbp = sbp_mean, dbp = dbp_mean,
eGFR = egfr_ckd_epi, smsts = tob, hypertension = hyper,
DM = diabetes_over_boundary)
# Unbalanced data
table(df2$stroke)
##
## 0 1
## 6951 438
prop.table(table(df2$stroke))
##
## 0 1
## 0.9407227 0.0592773
# Change to factor
df2$stroke = as.factor(df2$stroke)
df2$sex = as.factor(df2$sex)
df2$smsts = as.factor(df2$smsts)
df2$exer = as.factor(df2$exer)
df2$kaidan = as.factor(df2$kaidan)
df2$hypertension = as.factor(df2$hypertension)
df2$DM = as.factor(df2$DM)
df2$MetS = as.factor(df2$MetS)
df2$stroke = as.factor(df2$stroke)
df3 = df2
df3<-downSample(x=df3[,-ncol(df3)],y=df3$stroke)
table(df3$stroke)
##
## 0 1
## 438 438
set.seed(13032019)
index <- createDataPartition(df3$stroke, p = 0.8, list = F)
train_set = df3[index,]
test_set = df3[-index,]
#
table(train_set$stroke)
##
## 0 1
## 351 351
table(test_set$stroke)
##
## 0 1
## 87 87
################################################################
p1 = ggplot(data = df2, mapping = aes(stroke, fill = stroke)) +
geom_bar(width = 0.6) +
theme_bw() +
labs(title = "Stroke distribution before down-sampling", x = "stroke") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_discrete(labels = c("No", "Yes")) +
theme(legend.position = "none") # turn off fill legend for p1
p2 = ggplot(data = df3, mapping = aes(stroke, fill = stroke)) +
geom_bar(width = 0.6) +
theme_bw() +
labs(title = "Stroke distribution after down-sampling", x = "stroke") +
scale_x_discrete(labels = c("No", "Yes")) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(legend.position = "none") # turn off fill legend for p2
gridExtra::grid.arrange(p1, p2, nrow = 2)
s1 = ggplot(df2, aes(x = sex, fill = stroke)) +
geom_bar(width = 0.6) +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_discrete(labels = c("Male", "Female"))
s2 = ggplot(df3, aes(x = sex, fill = stroke)) +
geom_bar(width = 0.6) +
labs(x = "Gender") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_discrete(labels = c("Male", "Female"))
gridExtra::grid.arrange(s1, s2, nrow = 2, top = "Stroke incidence according to gender")
sm1 = ggplot(df2, aes(x = smsts, fill = stroke)) +
geom_bar(width = 0.6) +
labs(x = "Smoking") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_discrete(labels = c("Current", "Past", "Never"))
sm2 = ggplot(df3, aes(x = smsts, fill = stroke)) +
geom_bar(width = 0.6) +
labs(x = "Smoking") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_discrete(labels = c("Current", "Past", "Never"))
gridExtra::grid.arrange(sm1, sm2, nrow = 2, top = "Stroke incidence according to smoking status")
i = ggplot(df3, aes(x = age, fill = stroke, color = stroke)) +
geom_histogram(position = "dodge") +
geom_density(alpha = 0.1)
i
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
i1 = ggplot(df3, aes(x = age, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
i1
# The relationship between age and stroke by sex
i2 = ggplot(df2, aes(x = age, y = stroke, color = sex)) +
geom_boxplot() + labs(title = "The relationship between age and stroke by sex") + theme(plot.title = element_text(hjust = 0.5))
i2
# It seems there are no different stroke incidence between male and female
i3 = ggplot(df2, aes(x = age, y = stroke, color = hypertension)) +
geom_boxplot() + labs(title = "The relationship between age and stroke by hypertension") + theme(plot.title = element_text(hjust = 0.5))
# The group of individuals with stroke showed a high prevalence of hypertension.
i3
i4 = ggplot(df2, aes(x = age, y = stroke, color = DM)) +
geom_boxplot() + labs(title = "The relationship between age and stroke by diabetes") + theme(plot.title = element_text(hjust = 0.5))
# It seems the group of individuals with stroke showed a high prevalence of DM
i4
i5 = ggplot(df2, aes(x = age, y = stroke, color = MetS)) +
geom_boxplot() + labs(title = "The relationship between age and stroke by MetS") + theme(plot.title = element_text(hjust = 0.5))
i5
gridExtra::grid.arrange( i2, i3, i4, i5, nrow = 4)
ggplot(df3, aes(x = sbp, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = dbp, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = eGFR, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = non_hdl, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = hdl, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = bmi, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = waist, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = wt20, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = bs, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = rbc, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1) # not good
ggplot(df3, aes(x = wbc, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = tg, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = cpk, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = alb, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
ggplot(df3, aes(x = bf, fill = stroke, color = stroke)) +
geom_density(alpha = 0.1)
cor_data = cor(df3[,c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "bmi",
"bs", "wt20", "rbc", "wbc", "alb", "cpk", "ua", "bf")])
cor_data
## age sbp dbp eGFR non_hdl
## age 1.00000000 0.4025379760 0.07359780 -0.549114689 0.15222176
## sbp 0.40253798 1.0000000000 0.67338394 -0.248530790 0.20446973
## dbp 0.07359780 0.6733839448 1.00000000 -0.034468573 0.19130179
## eGFR -0.54911469 -0.2485307896 -0.03446857 1.000000000 -0.18722367
## non_hdl 0.15222176 0.2044697275 0.19130179 -0.187223672 1.00000000
## waist 0.20129635 0.2876818164 0.27977209 -0.076697549 0.24217556
## bmi 0.04771788 0.2694061087 0.30068571 -0.062256966 0.29288861
## bs 0.18935703 0.2305298428 0.16366977 -0.137719538 0.18383785
## wt20 -0.01701748 -0.0009918792 0.07209918 -0.036976638 -0.10352265
## rbc -0.20154748 0.0929441255 0.31680076 0.142686454 0.15391495
## wbc -0.01988770 0.0003986677 0.06926732 -0.005669301 0.09551122
## alb -0.15351230 0.0823913553 0.18910256 0.010472884 0.18711360
## cpk 0.02752169 0.0534955661 0.10646878 -0.088251617 0.04372156
## ua 0.09095670 0.2055006979 0.23525928 -0.260337988 0.12217388
## bf 0.03809760 0.1336351593 0.10583662 -0.070681893 0.30799232
## waist bmi bs wt20 rbc
## age 0.20129635 0.04771788 0.18935703 -0.0170174754 -0.20154748
## sbp 0.28768182 0.26940611 0.23052984 -0.0009918792 0.09294413
## dbp 0.27977209 0.30068571 0.16366977 0.0720991789 0.31680076
## eGFR -0.07669755 -0.06225697 -0.13771954 -0.0369766375 0.14268645
## non_hdl 0.24217556 0.29288861 0.18383785 -0.1035226506 0.15391495
## waist 1.00000000 0.80637128 0.23010363 0.2090992222 0.21479824
## bmi 0.80637128 1.00000000 0.22082258 0.1891500517 0.22029290
## bs 0.23010363 0.22082258 1.00000000 0.0487830374 0.18009105
## wt20 0.20909922 0.18915005 0.04878304 1.0000000000 0.23574838
## rbc 0.21479824 0.22029290 0.18009105 0.2357483825 1.00000000
## wbc 0.14987282 0.10827836 0.12342669 0.1310255716 0.26021887
## alb 0.04226280 0.08861803 0.08049792 0.0189703447 0.31504102
## cpk 0.07077697 0.08428466 0.02858005 0.1834279067 -0.02285759
## ua 0.28151913 0.23865199 0.11859207 0.3302661213 0.27108522
## bf 0.39979788 0.51319166 0.05759017 -0.3242298196 -0.08481575
## wbc alb cpk ua bf
## age -0.0198876991 -0.15351230 0.02752169 0.09095670 0.03809760
## sbp 0.0003986677 0.08239136 0.05349557 0.20550070 0.13363516
## dbp 0.0692673172 0.18910256 0.10646878 0.23525928 0.10583662
## eGFR -0.0056693015 0.01047288 -0.08825162 -0.26033799 -0.07068189
## non_hdl 0.0955112194 0.18711360 0.04372156 0.12217388 0.30799232
## waist 0.1498728171 0.04226280 0.07077697 0.28151913 0.39979788
## bmi 0.1082783638 0.08861803 0.08428466 0.23865199 0.51319166
## bs 0.1234266854 0.08049792 0.02858005 0.11859207 0.05759017
## wt20 0.1310255716 0.01897034 0.18342791 0.33026612 -0.32422982
## rbc 0.2602188726 0.31504102 -0.02285759 0.27108522 -0.08481575
## wbc 1.0000000000 0.11024950 -0.05248856 0.19526562 -0.05692357
## alb 0.1102495040 1.00000000 0.03679963 0.12128387 0.11975035
## cpk -0.0524885636 0.03679963 1.00000000 0.10705871 -0.13788311
## ua 0.1952656250 0.12128387 0.10705871 1.00000000 -0.07698699
## bf -0.0569235735 0.11975035 -0.13788311 -0.07698699 1.00000000
corrplot::corrplot(cor_data, type = "upper", method = "number")
corrplot::corrplot(cor_data, method = "pie")