This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to this date. The “goal” field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4.

Attribute Information:

library(tidyverse) # metapackage with lots of helpful functions
## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(readr)
library(corrplot)
## corrplot 0.84 loaded
library(qgraph)
## Registered S3 methods overwritten by 'huge':
##   method    from   
##   plot.sim  BDgraph
##   print.sim BDgraph
library(jtools)
library(caret)
## Loading required package: lattice
## Registered S3 methods overwritten by 'lava':
##   method    from
##   plot.sim  huge
##   print.sim huge
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(DataExplorer)
library(funModeling)
## Loading required package: Hmisc
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:jtools':
## 
##     %nin%
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Registered S3 methods overwritten by 'pROC':
##   method    from
##   print.roc huge
##   plot.roc  huge
## funModeling v.1.8 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggm)
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## 
## Attaching package: 'ggm'
## The following object is masked from 'package:igraph':
## 
##     pa
## The following object is masked from 'package:Hmisc':
## 
##     rcorr
# Load the heart-disease data. "UTF-8-BOM" strips the byte-order mark at the
# start of the file, which otherwise garbles the first column name ("ï..age").
heart <- read.csv(
  "C:/Users/Joe/Documents/Datasets/heart-disease-uci/heart.csv",
  fileEncoding = "UTF-8-BOM"
)

# Preview the data as a tibble; as_tibble() replaces the deprecated tbl_df().
as_tibble(heart)
## # A tibble: 303 x 14
##    ï..age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak
##     <int> <int> <int>    <int> <int> <int>   <int>   <int> <int>   <dbl>
##  1     63     1     3      145   233     1       0     150     0     2.3
##  2     37     1     2      130   250     0       1     187     0     3.5
##  3     41     0     1      130   204     0       0     172     0     1.4
##  4     56     1     1      120   236     0       1     178     0     0.8
##  5     57     0     0      120   354     0       1     163     1     0.6
##  6     57     1     0      140   192     0       1     148     0     0.4
##  7     56     0     1      140   294     0       0     153     0     1.3
##  8     44     1     1      120   263     0       1     173     0     0  
##  9     52     1     2      172   199     1       1     162     0     0.5
## 10     57     1     2      150   168     0       1     174     0     1.6
## # ... with 293 more rows, and 4 more variables: slope <int>, ca <int>,
## #   thal <int>, target <int>
# Compact overview of every column's type and leading values.
str(object = heart)
## 'data.frame':    303 obs. of  14 variables:
##  $ ï..age  : int  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : int  1 1 0 1 0 1 0 1 1 1 ...
##  $ cp      : int  3 2 1 1 0 0 1 1 2 2 ...
##  $ trestbps: int  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : int  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg : int  0 1 0 1 1 1 0 1 1 1 ...
##  $ thalach : int  150 187 172 178 163 148 153 173 162 174 ...
##  $ exang   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope   : int  0 0 2 2 2 1 1 2 2 2 ...
##  $ ca      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal    : int  1 2 2 2 2 1 2 3 3 2 ...
##  $ target  : int  1 1 1 1 1 1 1 1 1 1 ...

**Data cleaning** 1. The name of the first column is garbled (`ï..age`) by the byte-order mark at the start of the CSV file, so it is renamed to `age`.

# Give the first column its proper label.
names(heart)[1] <- "age"

# Compute the correlation matrix over all columns.
corr <- cor(heart)

# Draw the lower triangle with printed coefficients, clustering the variables.
corrplot(
  corr,
  method = "number",
  type = "lower",
  order = "hclust",
  title = "Correlations between Variables "
)

# ---- Recode numerically coded variables as descriptive labels ----
# Each ifelse() chain maps the listed codes to labels; every other value falls
# through to the final label of the chain.
# NOTE(review): the code values below (cp 1-3, slope 1-2, thal 3/6) presumably
# follow the original UCI coding, while the str() output of this file shows cp
# and slope values starting at 0 and thal values 1-3 -- confirm the coding used
# by this CSV before relying on the labels.

heart$sex <- ifelse(heart$sex == 1, "male", "female")

# Fixed: the third test used to compare the whole data frame (`heart == 3`)
# rather than the cp column, so code 3 was never labelled "non-anginal pain".
heart$cp <- ifelse(
  heart$cp == 1, "typical angina",
  ifelse(
    heart$cp == 2, "atypical angina",
    ifelse(heart$cp == 3, "non-anginal pain", "asymptomatic")
  )
)

# Fixed: the second test used to inspect trestbps instead of restecg, so
# "ST-T abnormality" was never assigned.
heart$restecg <- ifelse(
  heart$restecg == 0, "normal",
  ifelse(heart$restecg == 1, "ST-T abnormality", "hypertrophy")
)

heart$exang <- ifelse(heart$exang == 1, "yes", "no")

heart$slope <- ifelse(
  heart$slope == 1, "upsloping",
  ifelse(heart$slope == 2, "flat", "downsloping")
)

heart$thal <- ifelse(
  heart$thal == 3, "normal",
  ifelse(heart$thal == 6, "fixed defect", "reversable defect")
)

# presumably flags fasting blood sugar above a threshold -- confirm against
# the dataset documentation
heart$fbs <- ifelse(heart$fbs == 1, "true", "false")

heart$target <- ifelse(heart$target == 0, "typical", "worse")

Numeric categories only

# Inspect the column types again after the recoding step.
str(object = heart)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : int  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : chr  "male" "male" "female" "male" ...
##  $ cp      : chr  "asymptomatic" "atypical angina" "typical angina" "typical angina" ...
##  $ trestbps: int  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : int  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : chr  "true" "false" "false" "false" ...
##  $ restecg : chr  "normal" "hypertrophy" "normal" "hypertrophy" ...
##  $ thalach : int  150 187 172 178 163 148 153 173 162 174 ...
##  $ exang   : chr  "no" "no" "no" "no" ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope   : chr  "downsloping" "downsloping" "flat" "flat" ...
##  $ ca      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal    : chr  "reversable defect" "reversable defect" "reversable defect" "reversable defect" ...
##  $ target  : chr  "worse" "worse" "worse" "worse" ...
# Per-column summary of the data frame.
summary(object = heart)
##       age            sex                 cp               trestbps    
##  Min.   :29.00   Length:303         Length:303         Min.   : 94.0  
##  1st Qu.:47.50   Class :character   Class :character   1st Qu.:120.0  
##  Median :55.00   Mode  :character   Mode  :character   Median :130.0  
##  Mean   :54.37                                         Mean   :131.6  
##  3rd Qu.:61.00                                         3rd Qu.:140.0  
##  Max.   :77.00                                         Max.   :200.0  
##       chol           fbs              restecg             thalach     
##  Min.   :126.0   Length:303         Length:303         Min.   : 71.0  
##  1st Qu.:211.0   Class :character   Class :character   1st Qu.:133.5  
##  Median :240.0   Mode  :character   Mode  :character   Median :153.0  
##  Mean   :246.3                                         Mean   :149.6  
##  3rd Qu.:274.5                                         3rd Qu.:166.0  
##  Max.   :564.0                                         Max.   :202.0  
##     exang              oldpeak        slope                 ca        
##  Length:303         Min.   :0.00   Length:303         Min.   :0.0000  
##  Class :character   1st Qu.:0.00   Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :0.80   Mode  :character   Median :0.0000  
##                     Mean   :1.04                      Mean   :0.7294  
##                     3rd Qu.:1.60                      3rd Qu.:1.0000  
##                     Max.   :6.20                      Max.   :4.0000  
##      thal              target         
##  Length:303         Length:303        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

**Exploratory Data Analysis**

# Per-variable counts of zeros, NAs and infinite values, plus type and
# number of unique values.
df_status(data = heart)
##    variable q_zeros p_zeros q_na p_na q_inf p_inf      type unique
## 1       age       0    0.00    0    0     0     0   integer     41
## 2       sex       0    0.00    0    0     0     0 character      2
## 3        cp       0    0.00    0    0     0     0 character      3
## 4  trestbps       0    0.00    0    0     0     0   integer     49
## 5      chol       0    0.00    0    0     0     0   integer    152
## 6       fbs       0    0.00    0    0     0     0 character      2
## 7   restecg       0    0.00    0    0     0     0 character      2
## 8   thalach       0    0.00    0    0     0     0   integer     91
## 9     exang       0    0.00    0    0     0     0 character      2
## 10  oldpeak      99   32.67    0    0     0     0   numeric     40
## 11    slope       0    0.00    0    0     0     0 character      3
## 12       ca     175   57.76    0    0     0     0   integer      5
## 13     thal       0    0.00    0    0     0     0 character      2
## 14   target       0    0.00    0    0     0     0 character      2
# One-row overview of the dataset: dimensions, column kinds, missing values.
introduce(data = heart)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1  303      14                8                  6                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0           303               4242        32584
# Graphical version of the dataset overview.
plot_intro(heart)

# Correlation heatmap of the continuous columns, leaving out column 14.
plot_correlation(
  heart[, -14],
  type = "c"
)

# Bar chart for every discrete column, again leaving out column 14.
plot_bar(
  heart[, -14],
  binary_as_factor = TRUE,
  ggtheme = theme_classic(),
  title = "Barplots for each variable"
)

# Histograms of four numeric columns on a 2 x 2 grid.
par(mfrow = c(2, 2))

# One entry per panel: the column to plot, its fill colour and its labels.
hist_specs <- list(
  list(
    var = "trestbps", col = "skyblue", xlab = "Blood Pressure",
    main = "Histogram of trestbps: resting blood pressure"
  ),
  list(
    var = "age", col = "lightblue", xlab = "Age",
    main = "Histogram of age distribution"
  ),
  list(
    var = "chol", col = "lightgreen", xlab = "Cholesterol",
    main = "Histogram of Cholesterol distribution"
  ),
  list(
    var = "thalach", col = "green", xlab = "Heart rate",
    main = "maximum heart rate achieved"
  )
)

for (spec in hist_specs) {
  hist(
    heart[[spec$var]],
    col = spec$col,
    xlab = spec$xlab,
    main = spec$main
  )
}

We will look at the data distribution using box plots.

# Box plots of four numeric columns, split by target, on a 2 x 2 grid.
par(mfrow = c(2, 2))

boxplot(
  trestbps ~ target,
  data = heart,
  main = "Resting Blood Pressure",
  col = "lightpink3"
)
boxplot(
  age ~ target,
  data = heart,
  main = "Age Distribution",
  col = "antiquewhite1"
)
boxplot(
  chol ~ target,
  data = heart,
  main = "Cholesterol level",
  col = "lightskyblue4"
)
boxplot(
  thalach ~ target,
  data = heart,
  main = "Maximum Heart Rate",
  col = "orange1"
)

Visualize principal component analysis

# Principal component plots on scaled inputs, capped at 80% of the variance
# and laid out two by two.
plot_prcomp(
  heart,
  variance_cap = 0.8,
  maxcat = 50L,
  prcomp_args = list(scale. = TRUE),
  geom_label_args = list(),
  title = NULL,
  ggtheme = theme_light(),
  theme_config = list(),
  nrow = 2L,
  ncol = 2L,
  parallel = FALSE
)

Histogram plots

# Histograms of the continuous columns with 30 bins, three rows by two columns.
plot_histogram(
  heart,
  binary_as_factor = TRUE,
  geom_histogram_args = list(bins = 30L),
  title = "Histogram Charts",
  ggtheme = theme_get(),
  nrow = 3L,
  ncol = 2L
)

Scatterplot for variables

# Scatter plot of every variable against age.
plot_scatterplot(
  heart,
  by = "age"
)

Frequency table for categorical variables

# Frequency and percentage table for each categorical variable.
freq(data = heart)

##      sex frequency percentage cumulative_perc
## 1   male       207      68.32           68.32
## 2 female        96      31.68          100.00

##                cp frequency percentage cumulative_perc
## 1    asymptomatic       166      54.79           54.79
## 2 atypical angina        87      28.71           83.50
## 3  typical angina        50      16.50          100.00

##     fbs frequency percentage cumulative_perc
## 1 false       258      85.15           85.15
## 2  true        45      14.85          100.00

##       restecg frequency percentage cumulative_perc
## 1 hypertrophy       156      51.49           51.49
## 2      normal       147      48.51          100.00

##   exang frequency percentage cumulative_perc
## 1    no       204      67.33           67.33
## 2   yes        99      32.67          100.00

##         slope frequency percentage cumulative_perc
## 1        flat       142      46.86           46.86
## 2   upsloping       140      46.20           93.06
## 3 downsloping        21       6.93          100.00

##                thal frequency percentage cumulative_perc
## 1 reversable defect       186      61.39           61.39
## 2            normal       117      38.61          100.00

##    target frequency percentage cumulative_perc
## 1   worse       165      54.46           54.46
## 2 typical       138      45.54          100.00
## [1] "Variables processed: sex, cp, fbs, restecg, exang, slope, thal, target"

Get correlation against target variable

# Correlation of the numeric variables with age.
correlation_table(
  data = heart,
  target = "age"
)
##   Variable   age
## 1      age  1.00
## 2 trestbps  0.28
## 3       ca  0.28
## 4     chol  0.21
## 5  oldpeak  0.21
## 6  thalach -0.40
# Correlation of the numeric variables with cholesterol.
correlation_table(data=heart,target = "chol")
##   Variable  chol
## 1     chol  1.00
## 2      age  0.21
## 3 trestbps  0.12
## 4       ca  0.07
## 5  oldpeak  0.05
## 6  thalach -0.01
# Bar chart of resting blood pressure by chest-pain category.
# NOTE(review): stat = "identity" stacks the individual trestbps values, so
# each bar shows a per-category sum -- confirm that this is intended.
ggplot(data=heart, aes(x=cp,y=trestbps,fill=as.factor(cp)))+
  geom_bar(stat = "identity")+
  xlab("Chest Pain")+
  ylab("Rest Blood Pressure")

# ---- Correlations between the numeric variables ----
# Select the numeric columns by name so the selection does not depend on
# column order (these are columns 1, 4, 5, 8 and 10 of the data frame).
numeric_vars <- c("age", "trestbps", "chol", "thalach", "oldpeak")
corr_mat <- cor(heart[, numeric_vars])

# The same matrix is reused for every plot instead of being recomputed.
corrplot(corr_mat, type = "lower", method = "number") # excluding Outcome

corrplot(corr_mat, order = "hclust", tl.cex = 1, addrect = 5)

corrplot(corr_mat, method = "ellipse")

corrplot(corr_mat, method = "color")

corrplot(corr_mat, method = "number")

# Network mapping
# Fixed: this used to plot cor(corr_mat), i.e. the correlation of the
# correlation matrix; the network should be drawn from corr_mat itself.
qgraph(corr_mat)

#————————–

#————————————–

  # Resting blood pressure against age, coloured by target, with black
  # linear-model fit lines.
  # Fixed: the colour legend was titled "cp" although colour is mapped to
  # target.
  ggplot(data = heart, aes(x = age, y = trestbps, col = target)) +
    geom_point() +
    geom_smooth(method = "lm", colour = "black") +
    xlab("Age") +
    ylab("T rest blood pressure") +
    scale_color_discrete(name = "target")

# Covariance, variance and correlation matrices of the numeric columns.
# First the covariance matrix of columns 1, 4, 5, 8 and 10.
cov(x = heart[c(1, 4, 5, 8, 10)])
##                 age   trestbps        chol    thalach   oldpeak
## age       82.484558  44.495902  100.585076 -82.903318  2.214583
## trestbps  44.495902 307.586453  111.967215 -18.759131  3.934486
## chol     100.585076 111.967215 2686.426748 -11.800494  3.246794
## thalach  -82.903318 -18.759131  -11.800494 524.646406 -9.153518
## oldpeak    2.214583   3.934486    3.246794  -9.153518  1.348095
# Variance matrix of the same columns.
var(x = heart[c(1, 4, 5, 8, 10)])
##                 age   trestbps        chol    thalach   oldpeak
## age       82.484558  44.495902  100.585076 -82.903318  2.214583
## trestbps  44.495902 307.586453  111.967215 -18.759131  3.934486
## chol     100.585076 111.967215 2686.426748 -11.800494  3.246794
## thalach  -82.903318 -18.759131  -11.800494 524.646406 -9.153518
## oldpeak    2.214583   3.934486    3.246794  -9.153518  1.348095
# Correlation matrix of the same columns.
cor(x = heart[c(1, 4, 5, 8, 10)])
##                 age    trestbps         chol      thalach     oldpeak
## age       1.0000000  0.27935091  0.213677957 -0.398521938  0.21001257
## trestbps  0.2793509  1.00000000  0.123174207 -0.046697728  0.19321647
## chol      0.2136780  0.12317421  1.000000000 -0.009939839  0.05395192
## thalach  -0.3985219 -0.04669773 -0.009939839  1.000000000 -0.34418695
## oldpeak   0.2100126  0.19321647  0.053951920 -0.344186948  1.00000000

#—————————————–

Correlations between numeric variables

# Correlation matrix of columns 1, 4, 5, 8 and 10, computed once and reused
# for the plot.
corr_mat <- cor(heart[, c(1, 4, 5, 8, 10)])
corrplot(
  corr_mat,
  type = "lower",
  method = "number"
)

Visualization

# Base plot shared by the three charts below: oldpeak against resting blood
# pressure, coloured by chest-pain category.
p <- ggplot(
  heart,
  aes(x = trestbps, y = oldpeak, col = as.factor(cp))
)

# Plain scatter plot.
p +
  geom_point(alpha = 0.7, size = 1.5)

# Jittered points with box plots, one panel per chest-pain category.
p +
  geom_jitter(alpha = 0.7, size = 1) +
  stat_boxplot(alpha = 0.5) +
  facet_grid(. ~ cp)

# Same layers, one panel per fbs value.
p +
  geom_jitter(alpha = 0.7, size = 1) +
  stat_boxplot(alpha = 0.5) +
  facet_grid(. ~ fbs)