Import library

library(readxl)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(ggplot2)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()

1. Import data set

df <- read_excel("~/Desktop/Misc/Clean data.xlsx")

table1(~ age + sex + bmi + sbp_mean + dbp_mean + tob + exer + kaidan + bs + non_hdl + hyper + 
         diabetes_over_boundary + MetS|target_st, df)
## Warning in table1.formula(~age + sex + bmi + sbp_mean + dbp_mean + tob + : Terms
## to the right of '|' in formula 'x' define table columns and are expected to be
## factors with meaningful labels.
0
(N=6951)
1
(N=438)
Overall
(N=7389)
age
Mean (SD) 54.6 (13.1) 64.4 (9.78) 55.1 (13.1)
Median [Min, Max] 55.0 [30.0, 84.0] 66.0 [32.0, 80.0] 56.0 [30.0, 84.0]
sex
Mean (SD) 0.548 (0.498) 0.466 (0.499) 0.543 (0.498)
Median [Min, Max] 1.00 [0, 1.00] 0 [0, 1.00] 1.00 [0, 1.00]
bmi
Mean (SD) 22.5 (3.00) 23.1 (3.21) 22.5 (3.01)
Median [Min, Max] 22.3 [14.2, 30.5] 23.0 [14.2, 30.5] 22.3 [14.2, 30.5]
sbp_mean
Mean (SD) 126 (20.5) 138 (21.9) 126 (20.8)
Median [Min, Max] 123 [78.0, 180] 137 [86.0, 180] 124 [78.0, 180]
dbp_mean
Mean (SD) 77.4 (11.7) 81.0 (12.5) 77.6 (11.8)
Median [Min, Max] 77.0 [47.5, 108] 81.0 [47.5, 108] 77.0 [47.5, 108]
tob
Mean (SD) 2.27 (0.879) 2.16 (0.882) 2.26 (0.879)
Median [Min, Max] 3.00 [1.00, 3.00] 2.00 [1.00, 3.00] 3.00 [1.00, 3.00]
exer
Mean (SD) 1.61 (0.488) 1.60 (0.491) 1.61 (0.488)
Median [Min, Max] 2.00 [1.00, 2.00] 2.00 [1.00, 2.00] 2.00 [1.00, 2.00]
kaidan
Mean (SD) 3.35 (1.26) 3.39 (1.27) 3.35 (1.26)
Median [Min, Max] 3.00 [1.00, 5.00] 3.00 [1.00, 5.00] 3.00 [1.00, 5.00]
bs
Mean (SD) 96.1 (9.78) 100 (11.0) 96.4 (9.90)
Median [Min, Max] 95.0 [72.0, 120] 98.0 [72.0, 120] 95.0 [72.0, 120]
non_hdl
Mean (SD) 152 (36.1) 159 (36.1) 152 (36.1)
Median [Min, Max] 150 [53.5, 250] 155 [78.0, 250] 150 [53.5, 250]
hyper
Mean (SD) 0.295 (0.456) 0.550 (0.498) 0.311 (0.463)
Median [Min, Max] 0 [0, 1.00] 1.00 [0, 1.00] 0 [0, 1.00]
diabetes_over_boundary
Mean (SD) 0.115 (0.319) 0.228 (0.420) 0.122 (0.327)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
MetS
Mean (SD) 0.234 (0.424) 0.413 (0.493) 0.245 (0.430)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]

1.1 Rename variables

df2 <- df %>% rename(stroke = target_st, sbp = sbp_mean, dbp = dbp_mean, 
                     eGFR = egfr_ckd_epi, smsts = tob, hypertension = hyper,
                     DM = diabetes_over_boundary)

# Unbalanced data
table(df2$stroke)
## 
##    0    1 
## 6951  438
prop.table(table(df2$stroke))
## 
##         0         1 
## 0.9407227 0.0592773
# Change to factor
df2$stroke = as.factor(df2$stroke)
df2$sex = as.factor(df2$sex)
df2$smsts = as.factor(df2$smsts)
df2$exer = as.factor(df2$exer)
df2$kaidan = as.factor(df2$kaidan)
df2$hypertension = as.factor(df2$hypertension)
df2$DM = as.factor(df2$DM)
df2$MetS = as.factor(df2$MetS)
df2$stroke = as.factor(df2$stroke)

1.2 Down-sampling

df3 = df2

df3<-downSample(x=df3[,-ncol(df3)],y=df3$stroke)
                      
table(df3$stroke)
## 
##   0   1 
## 438 438

2.Splitting the dataset

set.seed(13032019)
index <- createDataPartition(df3$stroke, p = 0.8, list = F)
train_set = df3[index,]
test_set = df3[-index,]

#
table(train_set$stroke)
## 
##   0   1 
## 351 351
table(test_set$stroke)
## 
##  0  1 
## 87 87

3. Exploratory data analysis

3.1 Stroke distribution

################################################################
p1 = ggplot(data = df2, mapping = aes(stroke, fill = stroke)) +
  geom_bar(width = 0.6) +
  theme_bw() +
  labs(title = "Stroke distribution before down-sampling", x = "stroke") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("No", "Yes")) +
  theme(legend.position = "none") # turn off fill legend for p1


p2 = ggplot(data = df3, mapping = aes(stroke, fill = stroke)) +
  geom_bar(width = 0.6) +
  theme_bw() +
  labs(title = "Stroke distribution after down-sampling", x = "stroke") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none") # turn off fill legend for p2

gridExtra::grid.arrange(p1, p2, nrow = 2)

3.1.1 Stroke incidence distributions according to gender

s1 = ggplot(df2, aes(x = sex, fill = stroke)) +
  geom_bar(width = 0.6) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Male", "Female"))

s2 = ggplot(df3, aes(x = sex, fill = stroke)) +
  geom_bar(width = 0.6) +
  labs(x = "Gender") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Male", "Female"))
 
gridExtra::grid.arrange(s1, s2, nrow = 2, top = "Stroke incidence according to gender") 

3.1.2 Stroke incidence distributions according to Smoking status

sm1 = ggplot(df2, aes(x = smsts, fill = stroke)) +
  geom_bar(width = 0.6) +
  labs(x = "Smoking") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Current", "Past", "Never")) 

sm2 = ggplot(df3, aes(x = smsts, fill = stroke)) +
  geom_bar(width = 0.6) +
  labs(x = "Smoking") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Current", "Past", "Never")) 

gridExtra::grid.arrange(sm1, sm2, nrow = 2, top = "Stroke incidence according to smoking status")

3.2 About the interaction beetween factors

3.2.1 The relationship between age and stroke

i = ggplot(df3, aes(x = age, fill = stroke, color = stroke)) +
  geom_histogram(position = "dodge") +
  geom_density(alpha = 0.1)
i
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

i1 = ggplot(df3, aes(x = age, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)
i1

# The relationship between age and stroke by sex
i2 = ggplot(df2, aes(x = age, y = stroke, color = sex)) +
  geom_boxplot() + labs(title = "The relationship between age and stroke by sex") + theme(plot.title = element_text(hjust = 0.5))
i2

# It seems there are no different stroke incidence between male and female
i3 = ggplot(df2, aes(x = age, y = stroke, color = hypertension)) +
  geom_boxplot() + labs(title = "The relationship between age and stroke by hypertension") + theme(plot.title = element_text(hjust = 0.5))
# The group of individuals with stroke showed a high prevalence of hypertension.
i3

i4 = ggplot(df2, aes(x = age, y = stroke, color = DM)) +
  geom_boxplot() + labs(title = "The relationship between age and stroke by diabetes") + theme(plot.title = element_text(hjust = 0.5))
# It seems the group of individuals with stroke showed a high prevalence of DM

i4

i5 = ggplot(df2, aes(x = age, y = stroke, color = MetS)) +
  geom_boxplot() + labs(title = "The relationship between age and stroke by MetS") + theme(plot.title = element_text(hjust = 0.5))
i5

gridExtra::grid.arrange( i2, i3, i4, i5, nrow = 4)

3.3 With numerical variables

3.3.1 sbp, dbp

ggplot(df3, aes(x = sbp, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

ggplot(df3, aes(x = dbp, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.2 eGFR

ggplot(df3, aes(x = eGFR, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.3 non_hdl

ggplot(df3, aes(x = non_hdl, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.4 hdl

ggplot(df3, aes(x = hdl, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.5 bmi

ggplot(df3, aes(x = bmi, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.6 waist

ggplot(df3, aes(x = waist, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.7 wt20

ggplot(df3, aes(x = wt20, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.8 bs

ggplot(df3, aes(x = bs, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.9 rbc

ggplot(df3, aes(x = rbc, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1) # not good

3.3.10 wbc

ggplot(df3, aes(x = wbc, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.11 tg

ggplot(df3, aes(x = tg, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.12 cpk

ggplot(df3, aes(x = cpk, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.13 alb

ggplot(df3, aes(x = alb, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.14 bf

ggplot(df3, aes(x = bf, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1) 

4. Correlation Matrix

cor_data = cor(df3[,c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "bmi", 
                      "bs", "wt20", "rbc", "wbc", "alb", "cpk", "ua", "bf")])
                                          
cor_data
##                 age           sbp         dbp         eGFR     non_hdl
## age      1.00000000  0.4025379760  0.07359780 -0.549114689  0.15222176
## sbp      0.40253798  1.0000000000  0.67338394 -0.248530790  0.20446973
## dbp      0.07359780  0.6733839448  1.00000000 -0.034468573  0.19130179
## eGFR    -0.54911469 -0.2485307896 -0.03446857  1.000000000 -0.18722367
## non_hdl  0.15222176  0.2044697275  0.19130179 -0.187223672  1.00000000
## waist    0.20129635  0.2876818164  0.27977209 -0.076697549  0.24217556
## bmi      0.04771788  0.2694061087  0.30068571 -0.062256966  0.29288861
## bs       0.18935703  0.2305298428  0.16366977 -0.137719538  0.18383785
## wt20    -0.01701748 -0.0009918792  0.07209918 -0.036976638 -0.10352265
## rbc     -0.20154748  0.0929441255  0.31680076  0.142686454  0.15391495
## wbc     -0.01988770  0.0003986677  0.06926732 -0.005669301  0.09551122
## alb     -0.15351230  0.0823913553  0.18910256  0.010472884  0.18711360
## cpk      0.02752169  0.0534955661  0.10646878 -0.088251617  0.04372156
## ua       0.09095670  0.2055006979  0.23525928 -0.260337988  0.12217388
## bf       0.03809760  0.1336351593  0.10583662 -0.070681893  0.30799232
##               waist         bmi          bs          wt20         rbc
## age      0.20129635  0.04771788  0.18935703 -0.0170174754 -0.20154748
## sbp      0.28768182  0.26940611  0.23052984 -0.0009918792  0.09294413
## dbp      0.27977209  0.30068571  0.16366977  0.0720991789  0.31680076
## eGFR    -0.07669755 -0.06225697 -0.13771954 -0.0369766375  0.14268645
## non_hdl  0.24217556  0.29288861  0.18383785 -0.1035226506  0.15391495
## waist    1.00000000  0.80637128  0.23010363  0.2090992222  0.21479824
## bmi      0.80637128  1.00000000  0.22082258  0.1891500517  0.22029290
## bs       0.23010363  0.22082258  1.00000000  0.0487830374  0.18009105
## wt20     0.20909922  0.18915005  0.04878304  1.0000000000  0.23574838
## rbc      0.21479824  0.22029290  0.18009105  0.2357483825  1.00000000
## wbc      0.14987282  0.10827836  0.12342669  0.1310255716  0.26021887
## alb      0.04226280  0.08861803  0.08049792  0.0189703447  0.31504102
## cpk      0.07077697  0.08428466  0.02858005  0.1834279067 -0.02285759
## ua       0.28151913  0.23865199  0.11859207  0.3302661213  0.27108522
## bf       0.39979788  0.51319166  0.05759017 -0.3242298196 -0.08481575
##                   wbc         alb         cpk          ua          bf
## age     -0.0198876991 -0.15351230  0.02752169  0.09095670  0.03809760
## sbp      0.0003986677  0.08239136  0.05349557  0.20550070  0.13363516
## dbp      0.0692673172  0.18910256  0.10646878  0.23525928  0.10583662
## eGFR    -0.0056693015  0.01047288 -0.08825162 -0.26033799 -0.07068189
## non_hdl  0.0955112194  0.18711360  0.04372156  0.12217388  0.30799232
## waist    0.1498728171  0.04226280  0.07077697  0.28151913  0.39979788
## bmi      0.1082783638  0.08861803  0.08428466  0.23865199  0.51319166
## bs       0.1234266854  0.08049792  0.02858005  0.11859207  0.05759017
## wt20     0.1310255716  0.01897034  0.18342791  0.33026612 -0.32422982
## rbc      0.2602188726  0.31504102 -0.02285759  0.27108522 -0.08481575
## wbc      1.0000000000  0.11024950 -0.05248856  0.19526562 -0.05692357
## alb      0.1102495040  1.00000000  0.03679963  0.12128387  0.11975035
## cpk     -0.0524885636  0.03679963  1.00000000  0.10705871 -0.13788311
## ua       0.1952656250  0.12128387  0.10705871  1.00000000 -0.07698699
## bf      -0.0569235735  0.11975035 -0.13788311 -0.07698699  1.00000000
corrplot::corrplot(cor_data, type = "upper", method = "number")

corrplot::corrplot(cor_data, method = "pie")