Import library

library(readxl)
library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(ggplot2)
library(caret)

## Loading required package: lattice

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()

1. Import data set

df <- read_excel("~/Desktop/Misc/Clean data.xlsx")

table1(~ age + sex + bmi + sbp_mean + dbp_mean + tob + exer + kaidan + bs + non_hdl + hyper + 
         diabetes_over_boundary + MetS|target_st, df)

## Warning in table1.formula(~age + sex + bmi + sbp_mean + dbp_mean + tob + : Terms
## to the right of '|' in formula 'x' define table columns and are expected to be
## factors with meaningful labels.

	0 (N=6951)	1 (N=438)	Overall (N=7389)
age
Mean (SD)	54.6 (13.1)	64.4 (9.78)	55.1 (13.1)
Median [Min, Max]	55.0 [30.0, 84.0]	66.0 [32.0, 80.0]	56.0 [30.0, 84.0]
sex
Mean (SD)	0.548 (0.498)	0.466 (0.499)	0.543 (0.498)
Median [Min, Max]	1.00 [0, 1.00]	0 [0, 1.00]	1.00 [0, 1.00]
bmi
Mean (SD)	22.5 (3.00)	23.1 (3.21)	22.5 (3.01)
Median [Min, Max]	22.3 [14.2, 30.5]	23.0 [14.2, 30.5]	22.3 [14.2, 30.5]
sbp_mean
Mean (SD)	126 (20.5)	138 (21.9)	126 (20.8)
Median [Min, Max]	123 [78.0, 180]	137 [86.0, 180]	124 [78.0, 180]
dbp_mean
Mean (SD)	77.4 (11.7)	81.0 (12.5)	77.6 (11.8)
Median [Min, Max]	77.0 [47.5, 108]	81.0 [47.5, 108]	77.0 [47.5, 108]
tob
Mean (SD)	2.27 (0.879)	2.16 (0.882)	2.26 (0.879)
Median [Min, Max]	3.00 [1.00, 3.00]	2.00 [1.00, 3.00]	3.00 [1.00, 3.00]
exer
Mean (SD)	1.61 (0.488)	1.60 (0.491)	1.61 (0.488)
Median [Min, Max]	2.00 [1.00, 2.00]	2.00 [1.00, 2.00]	2.00 [1.00, 2.00]
kaidan
Mean (SD)	3.35 (1.26)	3.39 (1.27)	3.35 (1.26)
Median [Min, Max]	3.00 [1.00, 5.00]	3.00 [1.00, 5.00]	3.00 [1.00, 5.00]
bs
Mean (SD)	96.1 (9.78)	100 (11.0)	96.4 (9.90)
Median [Min, Max]	95.0 [72.0, 120]	98.0 [72.0, 120]	95.0 [72.0, 120]
non_hdl
Mean (SD)	152 (36.1)	159 (36.1)	152 (36.1)
Median [Min, Max]	150 [53.5, 250]	155 [78.0, 250]	150 [53.5, 250]
hyper
Mean (SD)	0.295 (0.456)	0.550 (0.498)	0.311 (0.463)
Median [Min, Max]	0 [0, 1.00]	1.00 [0, 1.00]	0 [0, 1.00]
diabetes_over_boundary
Mean (SD)	0.115 (0.319)	0.228 (0.420)	0.122 (0.327)
Median [Min, Max]	0 [0, 1.00]	0 [0, 1.00]	0 [0, 1.00]
MetS
Mean (SD)	0.234 (0.424)	0.413 (0.493)	0.245 (0.430)
Median [Min, Max]	0 [0, 1.00]	0 [0, 1.00]	0 [0, 1.00]

1.1 Rename variables

df2 <- df %>% rename(stroke = target_st, sbp = sbp_mean, dbp = dbp_mean, 
                     eGFR = egfr_ckd_epi, smsts = tob, hypertension = hyper,
                     DM = diabetes_over_boundary)

# Unbalanced data
table(df2$stroke)

## 
##    0    1 
## 6951  438

prop.table(table(df2$stroke))

## 
##         0         1 
## 0.9407227 0.0592773

# Change to factor
df2$stroke = as.factor(df2$stroke)
df2$sex = as.factor(df2$sex)
df2$smsts = as.factor(df2$smsts)
df2$exer = as.factor(df2$exer)
df2$kaidan = as.factor(df2$kaidan)
df2$hypertension = as.factor(df2$hypertension)
df2$DM = as.factor(df2$DM)
df2$MetS = as.factor(df2$MetS)
df2$stroke = as.factor(df2$stroke)

1.2 Down-sampling

df3 = df2

df3<-downSample(x=df3[,-ncol(df3)],y=df3$stroke)
                      
table(df3$stroke)

## 
##   0   1 
## 438 438

2.Splitting the dataset

set.seed(13032019)
index <- createDataPartition(df3$stroke, p = 0.8, list = F)
train_set = df3[index,]
test_set = df3[-index,]

#
table(train_set$stroke)

## 
##   0   1 
## 351 351

table(test_set$stroke)

## 
##  0  1 
## 87 87

3. Exploratory data analysis

3.1 Stroke distribution

################################################################
p1 = ggplot(data = df2, mapping = aes(stroke, fill = stroke)) +
  geom_bar(width = 0.6) +
  theme_bw() +
  labs(title = "Stroke distribution before down-sampling", x = "stroke") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("No", "Yes")) +
  theme(legend.position = "none") # turn off fill legend for p1


p2 = ggplot(data = df3, mapping = aes(stroke, fill = stroke)) +
  geom_bar(width = 0.6) +
  theme_bw() +
  labs(title = "Stroke distribution after down-sampling", x = "stroke") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none") # turn off fill legend for p2

gridExtra::grid.arrange(p1, p2, nrow = 2)

3.1.1 Stroke incidence distributions according to gender

s1 = ggplot(df2, aes(x = sex, fill = stroke)) +
  geom_bar(width = 0.6) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Male", "Female"))

s2 = ggplot(df3, aes(x = sex, fill = stroke)) +
  geom_bar(width = 0.6) +
  labs(x = "Gender") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Male", "Female"))
 
gridExtra::grid.arrange(s1, s2, nrow = 2, top = "Stroke incidence according to gender")

3.1.2 Stroke incidence distributions according to Smoking status

sm1 = ggplot(df2, aes(x = smsts, fill = stroke)) +
  geom_bar(width = 0.6) +
  labs(x = "Smoking") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Current", "Past", "Never")) 

sm2 = ggplot(df3, aes(x = smsts, fill = stroke)) +
  geom_bar(width = 0.6) +
  labs(x = "Smoking") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Current", "Past", "Never")) 

gridExtra::grid.arrange(sm1, sm2, nrow = 2, top = "Stroke incidence according to smoking status")

3.2 About the interaction beetween factors

3.2.1 The relationship between age and stroke

i = ggplot(df3, aes(x = age, fill = stroke, color = stroke)) +
  geom_histogram(position = "dodge") +
  geom_density(alpha = 0.1)
i

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

i1 = ggplot(df3, aes(x = age, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)
i1

# The relationship between age and stroke by sex
i2 = ggplot(df2, aes(x = age, y = stroke, color = sex)) +
  geom_boxplot() + labs(title = "The relationship between age and stroke by sex") + theme(plot.title = element_text(hjust = 0.5))
i2

# It seems there are no different stroke incidence between male and female

i3 = ggplot(df2, aes(x = age, y = stroke, color = hypertension)) +
  geom_boxplot() + labs(title = "The relationship between age and stroke by hypertension") + theme(plot.title = element_text(hjust = 0.5))
# The group of individuals with stroke showed a high prevalence of hypertension.
i3

i4 = ggplot(df2, aes(x = age, y = stroke, color = DM)) +
  geom_boxplot() + labs(title = "The relationship between age and stroke by diabetes") + theme(plot.title = element_text(hjust = 0.5))
# It seems the group of individuals with stroke showed a high prevalence of DM

i4

i5 = ggplot(df2, aes(x = age, y = stroke, color = MetS)) +
  geom_boxplot() + labs(title = "The relationship between age and stroke by MetS") + theme(plot.title = element_text(hjust = 0.5))
i5

gridExtra::grid.arrange( i2, i3, i4, i5, nrow = 4)

3.3 With numerical variables

3.3.1 sbp, dbp

ggplot(df3, aes(x = sbp, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

ggplot(df3, aes(x = dbp, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.2 eGFR

ggplot(df3, aes(x = eGFR, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.3 non_hdl

ggplot(df3, aes(x = non_hdl, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.4 hdl

ggplot(df3, aes(x = hdl, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.5 bmi

ggplot(df3, aes(x = bmi, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.6 waist

ggplot(df3, aes(x = waist, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.7 wt20

ggplot(df3, aes(x = wt20, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.8 bs

ggplot(df3, aes(x = bs, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.9 rbc

ggplot(df3, aes(x = rbc, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1) # not good

3.3.10 wbc

ggplot(df3, aes(x = wbc, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.11 tg

ggplot(df3, aes(x = tg, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.12 cpk

ggplot(df3, aes(x = cpk, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.13 alb

ggplot(df3, aes(x = alb, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

3.3.14 bf

ggplot(df3, aes(x = bf, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1)

4. Correlation Matrix

cor_data = cor(df3[,c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "bmi", 
                      "bs", "wt20", "rbc", "wbc", "alb", "cpk", "ua", "bf")])
                                          
cor_data

##                 age           sbp         dbp         eGFR     non_hdl
## age      1.00000000  0.4025379760  0.07359780 -0.549114689  0.15222176
## sbp      0.40253798  1.0000000000  0.67338394 -0.248530790  0.20446973
## dbp      0.07359780  0.6733839448  1.00000000 -0.034468573  0.19130179
## eGFR    -0.54911469 -0.2485307896 -0.03446857  1.000000000 -0.18722367
## non_hdl  0.15222176  0.2044697275  0.19130179 -0.187223672  1.00000000
## waist    0.20129635  0.2876818164  0.27977209 -0.076697549  0.24217556
## bmi      0.04771788  0.2694061087  0.30068571 -0.062256966  0.29288861
## bs       0.18935703  0.2305298428  0.16366977 -0.137719538  0.18383785
## wt20    -0.01701748 -0.0009918792  0.07209918 -0.036976638 -0.10352265
## rbc     -0.20154748  0.0929441255  0.31680076  0.142686454  0.15391495
## wbc     -0.01988770  0.0003986677  0.06926732 -0.005669301  0.09551122
## alb     -0.15351230  0.0823913553  0.18910256  0.010472884  0.18711360
## cpk      0.02752169  0.0534955661  0.10646878 -0.088251617  0.04372156
## ua       0.09095670  0.2055006979  0.23525928 -0.260337988  0.12217388
## bf       0.03809760  0.1336351593  0.10583662 -0.070681893  0.30799232
##               waist         bmi          bs          wt20         rbc
## age      0.20129635  0.04771788  0.18935703 -0.0170174754 -0.20154748
## sbp      0.28768182  0.26940611  0.23052984 -0.0009918792  0.09294413
## dbp      0.27977209  0.30068571  0.16366977  0.0720991789  0.31680076
## eGFR    -0.07669755 -0.06225697 -0.13771954 -0.0369766375  0.14268645
## non_hdl  0.24217556  0.29288861  0.18383785 -0.1035226506  0.15391495
## waist    1.00000000  0.80637128  0.23010363  0.2090992222  0.21479824
## bmi      0.80637128  1.00000000  0.22082258  0.1891500517  0.22029290
## bs       0.23010363  0.22082258  1.00000000  0.0487830374  0.18009105
## wt20     0.20909922  0.18915005  0.04878304  1.0000000000  0.23574838
## rbc      0.21479824  0.22029290  0.18009105  0.2357483825  1.00000000
## wbc      0.14987282  0.10827836  0.12342669  0.1310255716  0.26021887
## alb      0.04226280  0.08861803  0.08049792  0.0189703447  0.31504102
## cpk      0.07077697  0.08428466  0.02858005  0.1834279067 -0.02285759
## ua       0.28151913  0.23865199  0.11859207  0.3302661213  0.27108522
## bf       0.39979788  0.51319166  0.05759017 -0.3242298196 -0.08481575
##                   wbc         alb         cpk          ua          bf
## age     -0.0198876991 -0.15351230  0.02752169  0.09095670  0.03809760
## sbp      0.0003986677  0.08239136  0.05349557  0.20550070  0.13363516
## dbp      0.0692673172  0.18910256  0.10646878  0.23525928  0.10583662
## eGFR    -0.0056693015  0.01047288 -0.08825162 -0.26033799 -0.07068189
## non_hdl  0.0955112194  0.18711360  0.04372156  0.12217388  0.30799232
## waist    0.1498728171  0.04226280  0.07077697  0.28151913  0.39979788
## bmi      0.1082783638  0.08861803  0.08428466  0.23865199  0.51319166
## bs       0.1234266854  0.08049792  0.02858005  0.11859207  0.05759017
## wt20     0.1310255716  0.01897034  0.18342791  0.33026612 -0.32422982
## rbc      0.2602188726  0.31504102 -0.02285759  0.27108522 -0.08481575
## wbc      1.0000000000  0.11024950 -0.05248856  0.19526562 -0.05692357
## alb      0.1102495040  1.00000000  0.03679963  0.12128387  0.11975035
## cpk     -0.0524885636  0.03679963  1.00000000  0.10705871 -0.13788311
## ua       0.1952656250  0.12128387  0.10705871  1.00000000 -0.07698699
## bf      -0.0569235735  0.11975035 -0.13788311 -0.07698699  1.00000000

corrplot::corrplot(cor_data, type = "upper", method = "number")

corrplot::corrplot(cor_data, method = "pie")

ML - Down sampling & Exploring data

Vu Thien

2023-03-16

Import library

1. Import data set

1.1 Rename variables

1.2 Down-sampling

2.Splitting the dataset

3. Exploratory data analysis

3.1 Stroke distribution

3.1.1 Stroke incidence distributions according to gender

3.1.2 Stroke incidence distributions according to Smoking status

3.2 About the interaction beetween factors

3.2.1 The relationship between age and stroke

3.3 With numerical variables

3.3.1 sbp, dbp

3.3.2 eGFR

3.3.3 non_hdl

3.3.4 hdl

3.3.5 bmi

3.3.6 waist

3.3.7 wt20

3.3.8 bs

3.3.9 rbc

3.3.10 wbc

3.3.11 tg

3.3.12 cpk

3.3.13 alb

3.3.14 bf

4. Correlation Matrix