This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to this date. The “goal” field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4.
Attribute Information:
library(tidyverse) # metapackage with lots of helpful functions
## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
library(corrplot)
## corrplot 0.84 loaded
library(qgraph)
## Registered S3 methods overwritten by 'huge':
## method from
## plot.sim BDgraph
## print.sim BDgraph
library(jtools)
library(caret)
## Loading required package: lattice
## Registered S3 methods overwritten by 'lava':
## method from
## plot.sim huge
## print.sim huge
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(DataExplorer)
library(funModeling)
## Loading required package: Hmisc
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:jtools':
##
## %nin%
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Registered S3 methods overwritten by 'pROC':
## method from
## print.roc huge
## plot.roc huge
## funModeling v.1.8 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggm)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
##
## Attaching package: 'ggm'
## The following object is masked from 'package:igraph':
##
## pa
## The following object is masked from 'package:Hmisc':
##
## rcorr
# Load the heart-disease CSV into a base data.frame.
# NOTE(review): the path is absolute and machine-specific; the script only runs
# where this file exists at exactly this location.
# NOTE(review): the first header is read as a garbled name (see the printed
# output below), presumably from a byte-order mark at the start of the file —
# confirm; the column is renamed to "age" further down.
heart <- read.csv("C:/Users/Joe/Documents/Datasets/heart-disease-uci/heart.csv")
# Print a compact tibble preview; as_tibble() replaces the deprecated tbl_df().
as_tibble(heart)
## # A tibble: 303 x 14
## ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl>
## 1 63 1 3 145 233 1 0 150 0 2.3
## 2 37 1 2 130 250 0 1 187 0 3.5
## 3 41 0 1 130 204 0 0 172 0 1.4
## 4 56 1 1 120 236 0 1 178 0 0.8
## 5 57 0 0 120 354 0 1 163 1 0.6
## 6 57 1 0 140 192 0 1 148 0 0.4
## 7 56 0 1 140 294 0 0 153 0 1.3
## 8 44 1 1 120 263 0 1 173 0 0
## 9 52 1 2 172 199 1 1 162 0 0.5
## 10 57 1 2 150 168 0 1 174 0 1.6
## # ... with 293 more rows, and 4 more variables: slope <int>, ca <int>,
## # thal <int>, target <int>
# Show the type and first values of every column of the raw data
str(heart)
## 'data.frame': 303 obs. of 14 variables:
## $ ï..age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : int 1 1 0 1 0 1 0 1 1 1 ...
## $ cp : int 3 2 1 1 0 0 1 1 2 2 ...
## $ trestbps: int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : int 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : int 0 1 0 1 1 1 0 1 1 1 ...
## $ thalach : int 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : int 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : int 0 0 2 2 2 1 1 2 2 2 ...
## $ ca : int 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : int 1 2 2 2 2 1 2 3 3 2 ...
## $ target : int 1 1 1 1 1 1 1 1 1 1 ...
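**Data cleaning** 1. The name of the first column is garbled (`ï..age`). This looks less like a typographical error than a UTF-8 byte-order mark at the start of the file being read as part of the header, so the column is renamed to `age`.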
# Replace the garbled first column name with the intended one
names(heart)[1] <- "age"
# Compute the correlation matrix of all 14 columns; this has to run while
# every column is still numeric, i.e. before the recoding to labels below.
corr <- cor(heart)
# Visualize the correlation matrix (lower triangle, numeric coefficients,
# variables ordered by hierarchical clustering).
# corrplot() defaults to zero margins, which clips the title; `mar` reserves
# space at the top so the title is actually visible.
corrplot(
  corr,
  method = "number",
  type = "lower",
  order = "hclust",
  title = "Correlations between Variables ",
  mar = c(0, 0, 2, 0)
)
#———————————- Reseting more variables to categorical
# Recode the integer-coded variables as labelled character categories.
# Each nested ifelse() maps the listed codes to a label; every other code
# falls through to the last label of that chain.
heart$sex <- ifelse(heart$sex == 1, "male", "female")

# Fixed: the innermost test used `heart == 3` (the whole data frame) instead
# of `heart$cp == 3`, so code 3 was never labelled "non-anginal pain" and
# ended up as "asymptomatic".
# NOTE(review): cp takes the values 0-3 in this file; confirm the code-to-label
# mapping against the dataset documentation.
heart$cp <- ifelse(
  heart$cp == 1, "typical angina",
  ifelse(
    heart$cp == 2, "atypical angina",
    ifelse(heart$cp == 3, "non-anginal pain", "asymptomatic")
  )
)

# Fixed: the second test read `heart$trestbps` (resting blood pressure, whose
# minimum in this data is 94) instead of `heart$restecg`, so code 1 was never
# labelled "ST-T abnormality" and ended up as "hypertrophy".
heart$restecg <- ifelse(
  heart$restecg == 0, "normal",
  ifelse(heart$restecg == 1, "ST-T abnormality", "hypertrophy")
)

heart$exang <- ifelse(heart$exang == 1, "yes", "no")

heart$slope <- ifelse(
  heart$slope == 1, "upsloping",
  ifelse(heart$slope == 2, "flat", "downsloping")
)

# NOTE(review): the raw str() output shows thal values such as 1, 2 and 3, so
# the `== 6` test may never match in this file — confirm the coding used by
# this version of the dataset.
heart$thal <- ifelse(
  heart$thal == 3, "normal",
  ifelse(heart$thal == 6, "fixed defect", "reversable defect")
)

# NOTE(review): presumably the fasting-blood-sugar threshold flag rather than
# "fasting or not" — confirm against the dataset documentation.
heart$fbs <- ifelse(heart$fbs == 1, "true", "false")

heart$target <- ifelse(heart$target == 0, "typical", "worse")
Numeric categories only
# Re-inspect the structure: the recoded columns are now character vectors
str(heart)
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : chr "male" "male" "female" "male" ...
## $ cp : chr "asymptomatic" "atypical angina" "typical angina" "typical angina" ...
## $ trestbps: int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : chr "true" "false" "false" "false" ...
## $ restecg : chr "normal" "hypertrophy" "normal" "hypertrophy" ...
## $ thalach : int 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : chr "no" "no" "no" "no" ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : chr "downsloping" "downsloping" "flat" "flat" ...
## $ ca : int 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : chr "reversable defect" "reversable defect" "reversable defect" "reversable defect" ...
## $ target : chr "worse" "worse" "worse" "worse" ...
# Per-column summary: quartiles and mean for the numeric columns, length and
# class for the character columns
summary(heart)
## age sex cp trestbps
## Min. :29.00 Length:303 Length:303 Min. : 94.0
## 1st Qu.:47.50 Class :character Class :character 1st Qu.:120.0
## Median :55.00 Mode :character Mode :character Median :130.0
## Mean :54.37 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:140.0
## Max. :77.00 Max. :200.0
## chol fbs restecg thalach
## Min. :126.0 Length:303 Length:303 Min. : 71.0
## 1st Qu.:211.0 Class :character Class :character 1st Qu.:133.5
## Median :240.0 Mode :character Mode :character Median :153.0
## Mean :246.3 Mean :149.6
## 3rd Qu.:274.5 3rd Qu.:166.0
## Max. :564.0 Max. :202.0
## exang oldpeak slope ca
## Length:303 Min. :0.00 Length:303 Min. :0.0000
## Class :character 1st Qu.:0.00 Class :character 1st Qu.:0.0000
## Mode :character Median :0.80 Mode :character Median :0.0000
## Mean :1.04 Mean :0.7294
## 3rd Qu.:1.60 3rd Qu.:1.0000
## Max. :6.20 Max. :4.0000
## thal target
## Length:303 Length:303
## Class :character Class :character
## Mode :character Mode :character
##
##
##
**Exploratory Data Analysis**
# Per-variable data-quality table: counts and percentages of zeros, NAs and
# infinite values, plus type and number of unique values
df_status(heart)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 age 0 0.00 0 0 0 0 integer 41
## 2 sex 0 0.00 0 0 0 0 character 2
## 3 cp 0 0.00 0 0 0 0 character 3
## 4 trestbps 0 0.00 0 0 0 0 integer 49
## 5 chol 0 0.00 0 0 0 0 integer 152
## 6 fbs 0 0.00 0 0 0 0 character 2
## 7 restecg 0 0.00 0 0 0 0 character 2
## 8 thalach 0 0.00 0 0 0 0 integer 91
## 9 exang 0 0.00 0 0 0 0 character 2
## 10 oldpeak 99 32.67 0 0 0 0 numeric 40
## 11 slope 0 0.00 0 0 0 0 character 3
## 12 ca 175 57.76 0 0 0 0 integer 5
## 13 thal 0 0.00 0 0 0 0 character 2
## 14 target 0 0.00 0 0 0 0 character 2
# One-row overview of the dataset: row/column counts, discrete vs continuous
# columns, missing values and memory usage
introduce(heart)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 303 14 8 6 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 303 4242 32584
# Chart of the basic dataset metrics
plot_intro(heart)
# Correlation heatmap restricted to the continuous columns; column 14
# (target) is left out
plot_correlation(heart[, -14], type = "c")
# Bar charts of the discrete columns, again without column 14 (target)
plot_bar(
  heart[, -14],
  binary_as_factor = TRUE,
  ggtheme = theme_classic(),
  title = "Barplots for each variable"
)
# Histograms of the individual continuous columns, drawn on a 2 x 2 grid.
# par() returns the previous settings; they are kept so the layout can be
# restored afterwards instead of leaking into every later base-graphics plot.
old_par <- par(mfrow = c(2, 2))
hist(heart$trestbps,
  col = "skyblue", xlab = "Blood Pressure",
  main = "Histogram of trestbps: resting blood pressure"
)
hist(heart$age,
  col = "lightblue", xlab = "Age",
  main = "Histogram of age distribution"
)
hist(heart$chol,
  col = "lightgreen", xlab = "Cholesterol",
  main = "Histogram of Cholesterol distribution"
)
hist(heart$thalach,
  col = "green", xlab = "Heart rate",
  main = "maximum heart rate achieved"
)

# Boxplots of the same four columns split by target group, on a fresh 2 x 2 grid
par(mfrow = c(2, 2))
boxplot(trestbps ~ target, heart, main = "Resting Blood Pressure", col = "lightpink3")
boxplot(age ~ target, heart, main = "Age Distribution", col = "antiquewhite1")
boxplot(chol ~ target, heart, main = "Cholesterol level", col = "lightskyblue4")
boxplot(thalach ~ target, heart, main = "Maximum Heart Rate", col = "orange1")

# Restore the graphics parameters that were active before this section
par(old_par)
Visualize principal component analysis
# Principal component plots on scaled inputs (scale. = TRUE is passed through
# to prcomp), laid out two per row and column
plot_prcomp(
  heart,
  variance_cap = 0.8,
  maxcat = 50L,
  prcomp_args = list(scale. = TRUE),
  geom_label_args = list(),
  title = NULL,
  ggtheme = theme_light(),
  theme_config = list(),
  nrow = 2L,
  ncol = 2L,
  parallel = FALSE
)
Histogram plots
# Histograms with 30 bins each, arranged three rows by two columns per page,
# using the currently active ggplot2 theme
plot_histogram(
  heart,
  binary_as_factor = TRUE,
  geom_histogram_args = list(bins = 30L),
  title = "Histogram Charts",
  ggtheme = theme_get(),
  nrow = 3L,
  ncol = 2L
)
Scatterplot for variables
# Scatterplots of the variables in `heart` against age
plot_scatterplot(heart, by="age")
Frequency table for categorical variables
# Frequency, percentage and cumulative percentage of every level of each
# character column
freq(heart)
## sex frequency percentage cumulative_perc
## 1 male 207 68.32 68.32
## 2 female 96 31.68 100.00
## cp frequency percentage cumulative_perc
## 1 asymptomatic 166 54.79 54.79
## 2 atypical angina 87 28.71 83.50
## 3 typical angina 50 16.50 100.00
## fbs frequency percentage cumulative_perc
## 1 false 258 85.15 85.15
## 2 true 45 14.85 100.00
## restecg frequency percentage cumulative_perc
## 1 hypertrophy 156 51.49 51.49
## 2 normal 147 48.51 100.00
## exang frequency percentage cumulative_perc
## 1 no 204 67.33 67.33
## 2 yes 99 32.67 100.00
## slope frequency percentage cumulative_perc
## 1 flat 142 46.86 46.86
## 2 upsloping 140 46.20 93.06
## 3 downsloping 21 6.93 100.00
## thal frequency percentage cumulative_perc
## 1 reversable defect 186 61.39 61.39
## 2 normal 117 38.61 100.00
## target frequency percentage cumulative_perc
## 1 worse 165 54.46 54.46
## 2 typical 138 45.54 100.00
## [1] "Variables processed: sex, cp, fbs, restecg, exang, slope, thal, target"
Get the correlation of the numeric variables against a chosen variable (here age, then chol)
# Correlation of each numeric column with age, sorted from highest to lowest
correlation_table(data=heart,target = "age")
## Variable age
## 1 age 1.00
## 2 trestbps 0.28
## 3 ca 0.28
## 4 chol 0.21
## 5 oldpeak 0.21
## 6 thalach -0.40
# Correlation of each numeric column with chol, sorted from highest to lowest
correlation_table(data=heart,target = "chol")
## Variable chol
## 1 chol 1.00
## 2 age 0.21
## 3 trestbps 0.12
## 4 ca 0.07
## 5 oldpeak 0.05
## 6 thalach -0.01
# Bars of resting blood pressure per chest-pain category, filled by category.
# With stat = "identity" the individual values are stacked, so each bar's
# height is the sum of trestbps within that category.
ggplot(heart, aes(cp, trestbps, fill = as.factor(cp))) +
  geom_bar(stat = "identity") +
  ylab("Rest Blood Pressure") +
  xlab("Chest Pain")
#------------------------------
# Correlations among the five continuous measurements (columns 1, 4, 5, 8 and
# 10), selected by name so the code does not depend on column positions.
num_vars <- c("age", "trestbps", "chol", "thalach", "oldpeak")
corr_mat <- cor(heart[, num_vars])
# The same matrix rendered in several corrplot styles; it is computed once
# and reused rather than recomputed per plot.
corrplot(corr_mat, type = "lower", method = "number")
corrplot(corr_mat, order = "hclust", tl.cex = 1, addrect = 5)
corrplot(corr_mat, method = "ellipse")
corrplot(corr_mat, method = "color")
corrplot(corr_mat, method = "number")
# Network mapping
# Fixed: the graph was built from cor(corr_mat), i.e. the correlations between
# the columns of the correlation matrix, rather than from the variable
# correlations themselves.
qgraph(corr_mat)
#————————–
#————————————–
# Resting blood pressure against age, points coloured by target group, with a
# linear-model smooth drawn in black.
ggplot(data = heart, aes(x = age, y = trestbps, col = target)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "black") +
  xlab("Age") +
  ylab("T rest blood pressure") +
  # Fixed: the legend was titled "cp" although colour is mapped to `target`
  scale_color_discrete(name = "target")
# Correlation, Variance and Covariance Matrices
# Covariance matrix of columns 1, 4, 5, 8 and 10 (age, trestbps, chol,
# thalach, oldpeak)
cov(heart[c(1,4,5,8,10)])
## age trestbps chol thalach oldpeak
## age 82.484558 44.495902 100.585076 -82.903318 2.214583
## trestbps 44.495902 307.586453 111.967215 -18.759131 3.934486
## chol 100.585076 111.967215 2686.426748 -11.800494 3.246794
## thalach -82.903318 -18.759131 -11.800494 524.646406 -9.153518
## oldpeak 2.214583 3.934486 3.246794 -9.153518 1.348095
# var() on these columns prints the same matrix as the cov() call above
var(heart[c(1,4,5,8,10)])
## age trestbps chol thalach oldpeak
## age 82.484558 44.495902 100.585076 -82.903318 2.214583
## trestbps 44.495902 307.586453 111.967215 -18.759131 3.934486
## chol 100.585076 111.967215 2686.426748 -11.800494 3.246794
## thalach -82.903318 -18.759131 -11.800494 524.646406 -9.153518
## oldpeak 2.214583 3.934486 3.246794 -9.153518 1.348095
# Correlation matrix of the same five numeric columns
cor(heart[c(1,4,5,8,10)])
## age trestbps chol thalach oldpeak
## age 1.0000000 0.27935091 0.213677957 -0.398521938 0.21001257
## trestbps 0.2793509 1.00000000 0.123174207 -0.046697728 0.19321647
## chol 0.2136780 0.12317421 1.000000000 -0.009939839 0.05395192
## thalach -0.3985219 -0.04669773 -0.009939839 1.000000000 -0.34418695
## oldpeak 0.2100126 0.19321647 0.053951920 -0.344186948 1.00000000
#—————————————–
# Recompute the correlation matrix of the five numeric columns and show its
# lower triangle as numeric coefficients
numeric_cols <- c(1, 4, 5, 8, 10)
corr_mat <- cor(heart[, numeric_cols])
corrplot(corr_mat, method = "number", type = "lower")
Visualization
# Base plot: oldpeak against trestbps, coloured by chest-pain category
p <- ggplot(heart, aes(trestbps, oldpeak, col = as.factor(cp)))

# Plain scatter
p + geom_point(size = 1.5, alpha = 0.7)

# Jittered points with boxplots overlaid, one panel per chest-pain category
p +
  geom_jitter(size = 1, alpha = 0.7) +
  stat_boxplot(alpha = 0.5) +
  facet_grid(. ~ cp)

# The same layers, one panel per fbs value
p +
  geom_jitter(size = 1, alpha = 0.7) +
  stat_boxplot(alpha = 0.5) +
  facet_grid(. ~ fbs)