# Preview the first five rows of `data` as a styled kable table.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
head(data, n = 5) %>%
  kbl(caption = "First five rows of analysed data") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria")
First five rows of analysed data
Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR ExerciseAngina Oldpeak ST_Slope HeartDisease
40 M ATA 140 289 0 Normal 172 N 0.0 Up 0
49 F NAP 160 180 0 Normal 156 N 1.0 Flat 1
37 M ATA 130 283 0 ST 98 N 0.0 Up 0
48 F ASY 138 214 0 Normal 108 Y 1.5 Flat 1
54 M NAP 150 195 0 Normal 122 N 0.0 Up 0
# Preview the last five rows of `data` as a styled kable table.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
tail(data, n = 5) %>%
  kbl(caption = "Last five rows of analysed data") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria")
Last five rows of analysed data
Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR ExerciseAngina Oldpeak ST_Slope HeartDisease
45 M TA 110 264 0 Normal 132 N 1.2 Flat 1
68 M ASY 144 193 1 Normal 141 N 3.4 Flat 1
57 M ASY 130 131 0 Normal 115 Y 1.2 Flat 1
57 F ATA 130 236 0 LVH 174 N 0.0 Flat 1
38 M NAP 138 175 0 Normal 173 N 0.0 Up 0

Initial data check:

# basic statistics
# Load the project helper from functions/initial_info.R and run it on `data`.
# Judging by the rendered output that follows, it reports the number of
# variables, observations, missing values, duplicated rows and the count of
# variables per type (binary, factor, continuous, character).
# NOTE(review): the helper's implementation is not visible here - confirm.
source('functions/initial_info.R')
initial_info(data)
x
Number of variables: 12
x
Number of observations: 918
x
Number of missings: 0
x
Percentage of missings: 0
x
Number of duplicated rows: 0
x
Number of binary variables: 2
x
Number of factors variables: 0
x
Number of continuous variables: 5
x
Number of character variables: 5
# basic information
# Per-column descriptive statistics from psych::describe(), rendered as a
# full-width styled kable table.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
psych::describe(data) %>%
  kbl() %>%
  kable_classic(full_width = TRUE, html_font = "Cambria")
vars n mean sd median trimmed mad min max range skew kurtosis se
Age 1 918 53.511 9.433 54.0 53.711 10.38 28.0 77.0 49.0 -0.195 -0.396 0.311
Sex* 2 918 1.790 0.408 2.0 1.861 0.00 1.0 2.0 1.0 -1.420 0.016 0.013
ChestPainType* 3 918 1.781 0.957 1.0 1.664 0.00 1.0 4.0 3.0 0.791 -0.725 0.032
RestingBP 4 918 132.397 18.514 130.0 131.501 14.83 0.0 200.0 200.0 0.179 3.233 0.611
Cholesterol 5 918 198.800 109.384 223.0 204.413 68.20 0.0 603.0 603.0 -0.608 0.104 3.610
FastingBS 6 918 0.233 0.423 0.0 0.167 0.00 0.0 1.0 1.0 1.260 -0.412 0.014
RestingECG* 7 918 1.989 0.632 2.0 1.986 0.00 1.0 3.0 2.0 0.008 -0.497 0.021
MaxHR 8 918 136.809 25.460 138.0 137.231 26.69 60.0 202.0 142.0 -0.144 -0.458 0.840
ExerciseAngina* 9 918 1.404 0.491 1.0 1.380 0.00 1.0 2.0 1.0 0.390 -1.850 0.016
Oldpeak 10 918 0.887 1.067 0.6 0.738 0.89 -2.6 6.2 8.8 1.020 1.181 0.035
ST_Slope* 11 918 2.362 0.607 2.0 2.413 0.00 1.0 3.0 2.0 -0.380 -0.674 0.020
HeartDisease 12 918 0.553 0.497 1.0 0.567 0.00 0.0 1.0 1.0 -0.214 -1.956 0.016

Missing values

# missing values plots
# Load the project helper from functions/calc_missings.R and run it on `data`.
# NOTE(review): presumably it draws the missing-value plots for the report;
# the helper's implementation is not visible here - confirm.
source('functions/calc_missings.R')
calc_missings(data)

Individual observations

Age

Age
Min. :28.0
1st Qu.:47.0
Median :54.0
Mean :53.5
3rd Qu.:60.0
Max. :77.0
x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 0
=============== LINEAR MODEL ===============
term estimate std.error statistic p.value
(Intercept) 54.264 2.730 19.874 0.000
RestingBP 0.093 0.015 6.131 0.000
Cholesterol -0.001 0.003 -0.259 0.796
FastingBS 2.756 0.687 4.015 0.000
MaxHR -0.111 0.012 -9.366 0.000
Oldpeak 1.383 0.285 4.850 0.000
HeartDisease 0.832 0.673 1.236 0.217

Sex

Sex
Length:918
Class :character
Mode :character
x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 0

ChestPainType

ChestPainType
Length:918
Class :character
Mode :character
x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 0

RestingBP

RestingBP
Min. : 0
1st Qu.:120
Median :130
Mean :132
3rd Qu.:140
Max. :200
x
** Too many levels! Histogram hard to read! **

NULL

x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 1
=============== LINEAR MODEL ===============
term estimate std.error statistic p.value
(Intercept) 106.098 6.060 17.508 0.000
Age 0.427 0.070 6.131 0.000
Cholesterol 0.024 0.006 4.197 0.000
FastingBS 2.218 1.482 1.496 0.135
MaxHR -0.026 0.027 -0.967 0.334
Oldpeak 1.497 0.617 2.425 0.015
HeartDisease 0.631 1.444 0.437 0.662

Cholesterol

Cholesterol
Min. : 0
1st Qu.:173
Median :223
Mean :199
3rd Qu.:267
Max. :603
x
** Too many levels! Histogram hard to read! **

NULL

x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 172
=============== LINEAR MODEL ===============
term estimate std.error statistic p.value
(Intercept) 20.809 39.907 0.521 0.602
Age -0.105 0.404 -0.259 0.796
RestingBP 0.785 0.187 4.197 0.000
FastingBS -53.238 8.270 -6.437 0.000
MaxHR 0.738 0.150 4.915 0.000
Oldpeak 14.436 3.494 4.131 0.000
HeartDisease -39.046 8.126 -4.805 0.000

FastingBS

FastingBS
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.233
3rd Qu.:0.000
Max. :1.000
x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 704
=============== LINEAR MODEL ===============
term estimate std.error statistic p.value
(Intercept) -0.298 0.156 -1.91 0.057
Age 0.006 0.002 4.01 0.000
RestingBP 0.001 0.001 1.50 0.135
Cholesterol -0.001 0.000 -6.44 0.000
MaxHR 0.001 0.001 1.56 0.119
Oldpeak -0.024 0.014 -1.75 0.080
HeartDisease 0.187 0.032 5.92 0.000

RestingECG

RestingECG
Length:918
Class :character
Mode :character
x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 0

MaxHR

MaxHR
Min. : 60
1st Qu.:120
Median :138
Mean :137
3rd Qu.:156
Max. :202
x
** Too many levels! Histogram hard to read! **

NULL

x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 0
=============== LINEAR MODEL ===============
term estimate std.error statistic p.value
(Intercept) 184.592 6.182 29.859 0.000
Age -0.788 0.084 -9.366 0.000
RestingBP -0.040 0.041 -0.967 0.334
Cholesterol 0.035 0.007 4.915 0.000
FastingBS 2.874 1.840 1.562 0.119
Oldpeak 0.792 0.768 1.032 0.302
HeartDisease -15.668 1.716 -9.131 0.000

ExerciseAngina

ExerciseAngina
Length:918
Class :character
Mode :character
x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 0

Oldpeak

Oldpeak
Min. :-2.60
1st Qu.: 0.00
Median : 0.60
Mean : 0.89
3rd Qu.: 1.50
Max. : 6.20
x
** Too many levels! Histogram hard to read! **

NULL

x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 368
=============== LINEAR MODEL ===============
term estimate std.error statistic p.value
(Intercept) -1.562 0.371 -4.21 0.000
Age 0.018 0.004 4.85 0.000
RestingBP 0.004 0.002 2.42 0.015
Cholesterol 0.001 0.000 4.13 0.000
FastingBS -0.139 0.079 -1.75 0.080
MaxHR 0.001 0.001 1.03 0.302
HeartDisease 0.879 0.072 12.27 0.000

ST_Slope

ST_Slope
Length:918
Class :character
Mode :character
x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 0

HeartDisease

HeartDisease
Min. :0.000
1st Qu.:0.000
Median :1.000
Mean :0.553
3rd Qu.:1.000
Max. :1.000
x
Number of missing values: 0
x
Percentage of missing values: 0%
x
Number of zeros: 410
=============== LINEAR MODEL ===============
term estimate std.error statistic p.value
(Intercept) 1.070 0.157 6.828 0.000
Age 0.002 0.002 1.236 0.217
RestingBP 0.000 0.001 0.437 0.662
Cholesterol -0.001 0.000 -4.805 0.000
FastingBS 0.198 0.033 5.923 0.000
MaxHR -0.005 0.001 -9.131 0.000
Oldpeak 0.161 0.013 12.272 0.000

Histograms

# plot histograms for all numerical variables together
# Keep only the numeric columns and reshape them to long form (one row per
# column/value pair) so that each variable gets its own facet.
# pivot_longer() and select(where()) replace the superseded gather() and
# select_if(); `names_to`/`values_to` keep the `key`/`value` column names that
# the aesthetics and the facet formula rely on.
ww <- data %>%
  dplyr::select(where(is.numeric)) %>%
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 30, fill = "#464040") +
  facet_wrap(~key, scales = "free_x") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())

# Render the plot explicitly; warnings raised while drawing are silenced.
suppressWarnings(print(ww))

Bar plots

# plot bar plots
# Load the project helper from functions/names_bar.R and call it directly.
# It is not wrapped in all_of(): that is a tidyselect selection helper, and
# using it outside a selecting function is deprecated (it only passes its
# argument through there).
# NOTE(review): presumably names_bar() draws the bar plots for this section;
# its implementation is not visible here - confirm.
source('functions/names_bar.R')
names_bar(data)

Correlations

Pearson’s

# Pearson
# Correlation matrix of the numeric columns (pairwise-complete observations,
# cor()'s default method), drawn as a mixed corrplot: pies in the upper
# triangle, numbers in the lower one. Warnings from the pipeline are silenced.
suppressWarnings(
  data %>%
    dplyr::select_if(is.numeric) %>%
    cor(use = "pairwise.complete.obs") %>%
    corrplot.mixed(
      bg = "white",
      upper = "pie",
      lower = "number",
      tl.col = "black",
      tl.pos = "lt",
      diag = "l",
      number.font = 0.5,
      tl.cex = 1,
      number.cex = 0.55,
      title = "Pearson's correlation",
      mar = c(0, 0, 1, 0)
    )
)

Kendall’s

# Kendall
# Kendall correlation matrix of the numeric columns, computed on a random
# sample of at most 750 rows and drawn as a mixed corrplot.
# NOTE(review): the 750-row cap is presumably there to keep Kendall's tau fast
# on large data sets - confirm. The sample is unseeded, so the plot can differ
# between runs.
# The method name and the plot title are spelled out in full ("kendall"); the
# truncated spelling only worked through partial argument matching.
# `drop = FALSE` keeps the row subset a data frame even for a single column.
suppressWarnings(
  data[sample(seq_len(nrow(data)), min(750, nrow(data))), , drop = FALSE] %>%
    dplyr::select_if(is.numeric) %>%
    cor(use = "pairwise.complete.obs", method = "kendall") %>%
    corrplot.mixed(
      bg = "white",
      upper = "pie",
      lower = "number",
      tl.col = "black",
      tl.pos = "lt",
      diag = "l",
      number.font = 0.5,
      tl.cex = 1,
      number.cex = 0.55,
      title = "Kendall's correlation",
      mar = c(0, 0, 1, 0)
    )
)

Spearman’s

# Spearman
# Spearman rank correlation matrix of the numeric columns (pairwise-complete
# observations), drawn as a mixed corrplot: pies in the upper triangle,
# numbers in the lower one. Warnings from the pipeline are silenced.
suppressWarnings(
  data %>%
    dplyr::select_if(is.numeric) %>%
    cor(use = "pairwise.complete.obs", method = "spearman") %>%
    corrplot.mixed(
      bg = "white",
      upper = "pie",
      lower = "number",
      tl.col = "black",
      tl.pos = "lt",
      diag = "l",
      number.font = 0.5,
      tl.cex = 1,
      number.cex = 0.55,
      title = "Spearman's correlation",
      mar = c(0, 0, 1, 0)
    )
)

Principal Component Analysis

source('functions/clustering.R')
# PCA
# Run the project helper clustering() on a random sample of at most 750 rows.
# The sample size is capped at nrow(data): asking sample() for more rows than
# exist raises an error, which the handler below would otherwise report as a
# data-type problem.
# `drop = FALSE` keeps the row subset a data frame even for a single column.
# NOTE(review): clustering()'s implementation is not visible here - confirm
# what it computes and plots. The sample is unseeded, so results can differ
# between runs.
tryCatch(
  suppressWarnings(
    clustering(
      data[sample(seq_len(nrow(data)), min(750, nrow(data))), , drop = FALSE]
    )
  ),
  error = function(e) {
    # Surface the underlying failure before the generic notice so that the
    # real cause is not lost.
    message("PCA failed: ", conditionMessage(e))
    print("Wrong type of data for PCA")
  }
)