# HEART DISEASE PREDICTION USING R
#Loading Libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages ------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'dplyr' was built under R version 4.0.4
## Warning: package 'stringr' was built under R version 4.0.4
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the heart-disease data set from the current working directory.
# NOTE(review): the first column is imported as "ï..age" (see the output
# below), which looks like a UTF-8 byte-order mark fused into the header.
# Reading with fileEncoding = "UTF-8-BOM" would presumably give a clean "age"
# name, but later statements refer to `ï..age`, so the import is unchanged.
data <- read.csv("heart.csv")
# View() opens a data-viewer window, so it is only called when a user is
# present; this keeps batch runs (Rscript, knitting) from failing here.
if (interactive()) {
  View(data)
}
#Data Exploration
# Show the first six rows (the default count, spelled out).
head(data, n = 6L)
## ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 1 3 145 233 1 0 150 0 2.3 0 0 1
## 2 37 1 2 130 250 0 1 187 0 3.5 0 0 2
## 3 41 0 1 130 204 0 0 172 0 1.4 2 0 2
## 4 56 1 1 120 236 0 1 178 0 0.8 2 0 2
## 5 57 0 0 120 354 0 1 163 1 0.6 2 0 2
## 6 57 1 0 140 192 0 1 148 0 0.4 1 0 1
## target
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Show the last six rows (the default count, spelled out).
tail(data, n = 6L)
## ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 298 59 1 0 164 176 1 0 90 0 1.0 1 2 1
## 299 57 0 0 140 241 0 1 123 1 0.2 1 0 3
## 300 45 1 3 110 264 0 1 132 0 1.2 1 0 3
## 301 68 1 0 144 193 1 1 141 0 3.4 1 2 3
## 302 57 1 0 130 131 0 1 115 1 1.2 1 1 3
## 303 57 0 1 130 236 0 0 174 0 0.0 1 1 2
## target
## 298 0
## 299 0
## 300 0
## 301 0
## 302 0
## 303 0
# Compact overview: one line per column with its type and leading values.
dplyr::glimpse(data)
## Rows: 303
## Columns: 14
## $ ï..age <int> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58...
## $ sex <int> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0...
## $ cp <int> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3...
## $ trestbps <int> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130...
## $ chol <int> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275...
## $ fbs <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0...
## $ restecg <int> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1...
## $ thalach <int> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139...
## $ exang <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ oldpeak <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2...
## $ slope <int> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2...
## $ ca <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2...
## $ thal <int> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ target <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# Number of columns (second element of the dimensions).
dim(data)[2L]
## [1] 14
# Number of rows (first element of the dimensions).
dim(data)[1L]
## [1] 303
# Column names of the data frame.
names(data)
## [1] "ï..age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
# Per-column summary statistics (min, quartiles, mean, max).
summary(object = data)
## ï..age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.000 Min. : 94.0
## 1st Qu.:47.50 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:120.0
## Median :55.00 Median :1.0000 Median :1.000 Median :130.0
## Mean :54.37 Mean :0.6832 Mean :0.967 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.000 Max. :200.0
## chol fbs restecg thalach
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5
## Median :240.0 Median :0.0000 Median :1.0000 Median :153.0
## Mean :246.3 Mean :0.1485 Mean :0.5281 Mean :149.6
## 3rd Qu.:274.5 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.00 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.80 Median :1.000 Median :0.0000
## Mean :0.3267 Mean :1.04 Mean :1.399 Mean :0.7294
## 3rd Qu.:1.0000 3rd Qu.:1.60 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.20 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.314 Mean :0.5446
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
#Data Transformation
# Build `data2`: recode the integer-coded categorical columns of `data` into
# labelled factors and move them in front of the remaining (numeric) columns.
data2 <- data %>%
  mutate(
    sex = if_else(sex == 1, "MALE", "FEMALE"),
    fbs = if_else(fbs == 1, ">120", "<=120"),
    exang = if_else(exang == 1, "YES", "NO"),
    # cp takes the values 0..3 in this data set. The previous nested if_else
    # sent both 0 and 3 to "ASYMPTOMATIC", collapsing two chest-pain
    # categories; each code now gets its own label.
    # NOTE(review): 0 = typical angina / 3 = asymptomatic assumes the usual
    # 0-based reading of the UCI coding -- confirm against the data dictionary
    # that ships with this CSV.
    cp = case_when(
      cp == 0 ~ "TYPICAL ANGINA",
      cp == 1 ~ "ATYPICAL ANGINA",
      cp == 2 ~ "NON-ANGINAL PAIN",
      cp == 3 ~ "ASYMPTOMATIC"
    ),
    # Codes outside 0..2 become NA instead of being labelled silently.
    restecg = case_when(
      restecg == 0 ~ "NORMAL",
      restecg == 1 ~ "ABNORMALITY",
      restecg == 2 ~ "PROBABLE OR DEFINITE"
    ),
    slope = as.factor(slope),
    ca = as.factor(ca),
    thal = as.factor(thal),
    target = if_else(target == 1, "YES", "NO")
  ) %>%
  # Convert every character column created above into a factor.
  mutate(across(where(is.character), as.factor)) %>%
  # dplyr:: prefix kept so another attached package's select() cannot mask it.
  dplyr::select(
    target, sex, fbs, exang, cp, restecg, slope, ca, thal,
    everything()
  )
#Data Visualisation
#Bar plot for target (heart disease)
# Count of patients per target class. Columns are referenced by bare name
# inside aes(); `data2$target` triggered ggplot2's "Use of `data2$target` is
# discouraged" warning, so the plot now runs without warnings.
ggplot(data2, aes(x = target, fill = target)) +
  geom_bar() +
  xlab("Heart Disease") +
  ylab("count") +
  ggtitle("Presence and Absence of Heart Disease") +
  # Legend labels follow the factor level order of `target` (NO, then YES).
  scale_fill_discrete(
    name = "Heart Disease",
    labels = c("Absence", "Presence")
  )

# Share of observations in each target class.
prop.table(table(data2[["target"]]))
##
## NO YES
## 0.4554455 0.5445545
#Count the frequency of the values of age
# Tally observations per age, keep ages seen more than ten times, and draw
# the remaining counts as columns.
data2 %>%
  count(ï..age) %>%
  filter(n > 10) %>%
  ggplot(aes(x = ï..age, y = n)) +
  geom_col(fill = "yellow") +
  labs(title = "Age Analysis", x = "Age", y = "AgeCount")

#Compare blood pressure between sexes, faceted by chest pain type
# Box plots of trestbps for each sex, one panel per chest-pain category.
ggplot(data = data2, mapping = aes(x = sex, y = trestbps)) +
  geom_boxplot(fill = "violet") +
  labs(x = "sex", y = "BP") +
  facet_grid(cols = vars(cp))

#Compare cholesterol between sexes, faceted by chest pain type
# Box plots of chol for each sex, one panel per chest-pain category.
ggplot(data = data2, mapping = aes(x = sex, y = chol)) +
  geom_boxplot(fill = "orange") +
  labs(x = "sex", y = "Chol") +
  facet_grid(cols = vars(cp))

#Correlation
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.4
## corrplot 0.84 loaded
library(ggplot2)
# Pairwise correlations between the numeric columns of `data2`.
# Columns are chosen by type instead of by position (formerly 10:14), so the
# result no longer depends on the column order produced by the earlier
# select() step; with the current `data2` this yields the same five columns.
cor_heart <- data2 %>%
  dplyr::select(where(is.numeric)) %>%
  cor()
cor_heart
## ï..age trestbps chol thalach oldpeak
## ï..age 1.0000000 0.27935091 0.213677957 -0.398521938 0.21001257
## trestbps 0.2793509 1.00000000 0.123174207 -0.046697728 0.19321647
## chol 0.2136780 0.12317421 1.000000000 -0.009939839 0.05395192
## thalach -0.3985219 -0.04669773 -0.009939839 1.000000000 -0.34418695
## oldpeak 0.2100126 0.19321647 0.053951920 -0.344186948 1.00000000
# Plot the upper triangle of the correlation matrix with the "square" method.
corrplot(
  corr = cor_heart,
  method = "square",
  type = "upper"
)
