#                     HEART DISEASE PREDICTION USING R


#Loading Libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages ------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'dplyr' was built under R version 4.0.4
## Warning: package 'stringr' was built under R version 4.0.4
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the heart-disease data set from heart.csv in the working directory.
# NOTE(review): the first column is read as `ï..age` (see the head() output
# below), which looks like a UTF-8 byte-order mark in the file. Confirm before
# changing the encoding, since later code refers to that column by name.
data <- read.csv("heart.csv")
# View() opens a spreadsheet-style viewer, so only call it when a user is
# actually present; a batch run (Rscript / knit) skips it.
if (interactive()) {
  View(data)
}

#Data Exploration
# Preview the first six rows of the raw data.
head(data, n = 6L)
##   ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1     63   1  3      145  233   1       0     150     0     2.3     0  0    1
## 2     37   1  2      130  250   0       1     187     0     3.5     0  0    2
## 3     41   0  1      130  204   0       0     172     0     1.4     2  0    2
## 4     56   1  1      120  236   0       1     178     0     0.8     2  0    2
## 5     57   0  0      120  354   0       1     163     1     0.6     2  0    2
## 6     57   1  0      140  192   0       1     148     0     0.4     1  0    1
##   target
## 1      1
## 2      1
## 3      1
## 4      1
## 5      1
## 6      1
# Preview the last six rows of the raw data.
tail(data, n = 6L)
##     ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 298     59   1  0      164  176   1       0      90     0     1.0     1  2    1
## 299     57   0  0      140  241   0       1     123     1     0.2     1  0    3
## 300     45   1  3      110  264   0       1     132     0     1.2     1  0    3
## 301     68   1  0      144  193   1       1     141     0     3.4     1  2    3
## 302     57   1  0      130  131   0       1     115     1     1.2     1  1    3
## 303     57   0  1      130  236   0       0     174     0     0.0     1  1    2
##     target
## 298      0
## 299      0
## 300      0
## 301      0
## 302      0
## 303      0
# Compact column-wise overview: one line per column with its type and the
# first few values.
dplyr::glimpse(data)
## Rows: 303
## Columns: 14
## $ ï..age   <int> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58...
## $ sex      <int> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0...
## $ cp       <int> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3...
## $ trestbps <int> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130...
## $ chol     <int> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275...
## $ fbs      <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0...
## $ restecg  <int> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1...
## $ thalach  <int> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139...
## $ exang    <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ oldpeak  <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2...
## $ slope    <int> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2...
## $ ca       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2...
## $ thal     <int> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ target   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# Number of columns, number of rows, and the column names of the raw data.
NCOL(data)
## [1] 14
NROW(data)
## [1] 303
names(data)
##  [1] "ï..age"   "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"
# Per-column summary statistics (min, quartiles, mean, max) of the raw data.
summary(object = data)
##      ï..age           sex               cp           trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.000   Min.   : 94.0  
##  1st Qu.:47.50   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:120.0  
##  Median :55.00   Median :1.0000   Median :1.000   Median :130.0  
##  Mean   :54.37   Mean   :0.6832   Mean   :0.967   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.000   Max.   :200.0  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.5  
##  Median :240.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :246.3   Mean   :0.1485   Mean   :0.5281   Mean   :149.6  
##  3rd Qu.:274.5   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak         slope             ca        
##  Min.   :0.0000   Min.   :0.00   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.80   Median :1.000   Median :0.0000  
##  Mean   :0.3267   Mean   :1.04   Mean   :1.399   Mean   :0.7294  
##  3rd Qu.:1.0000   3rd Qu.:1.60   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.20   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.314   Mean   :0.5446  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000
#Data Transformation
# Build data2: recode the integer-coded categorical columns of `data` into
# labelled factors and move the categorical columns to the front, leaving the
# remaining (numeric) columns at the end.
data2 <- data %>%
  mutate(
    sex = if_else(sex == 1, "MALE", "FEMALE"),
    fbs = if_else(fbs == 1, ">120", "<=120"),
    exang = if_else(exang == 1, "YES", "NO"),
    # cp takes four codes (0-3, see summary(data)); give each its own label
    # instead of folding 0 and 3 into one level. Codes outside 0-3 become NA.
    # NOTE(review): labels for 0 and 3 assume the usual 0-based coding of this
    # data set (0 = typical angina, 3 = asymptomatic) -- confirm against the
    # data dictionary of heart.csv.
    cp = case_when(
      cp == 0 ~ "TYPICAL ANGINA",
      cp == 1 ~ "ATYPICAL ANGINA",
      cp == 2 ~ "NON-ANGINAL PAIN",
      cp == 3 ~ "ASYMPTOMATIC"
    ),
    restecg = if_else(
      restecg == 0, "NORMAL",
      if_else(restecg == 1, "ABNORMALITY", "PROBABLE OR DEFINITE")
    ),
    slope = as.factor(slope),
    ca = as.factor(ca),
    thal = as.factor(thal),
    target = if_else(target == 1, "YES", "NO")
  ) %>%
  # Turn every remaining character column into a factor.
  mutate(across(where(is.character), as.factor)) %>%
  dplyr::select(
    target, sex, fbs, exang, cp, restecg, slope, ca, thal,
    everything()
  )

#Data Visualisation

#Bar plot for target (heart disease)
# Bar chart of the number of rows in each target class.
# Columns are referenced by bare name inside aes() so ggplot2 resolves them in
# data2; using data2$target there triggers a "discouraged" warning.
ggplot(data2, aes(x = target, fill = target)) +
  geom_bar() +
  xlab("Heart Disease") +
  ylab("count") +
  ggtitle("Presence and Absence of Heart Disease") +
  # The factor levels are NO, YES (in that order), so the legend labels are
  # given in the same order.
  scale_fill_discrete(
    name = "Heart Disease",
    labels = c("Absence", "Presence")
  )

# Share of rows in each target class (the counts divided by their total).
prop.table(table(data2[["target"]]), margin = NULL)
## 
##        NO       YES 
## 0.4554455 0.5445545
#Count the frequency of the values of age
# Bar chart of the ages that occur more than 10 times in the data.
# The age column is read from the CSV as `ï..age` (see colnames(data)), a
# mangled, non-ASCII name. Match it by its "age" suffix and rename it, so this
# block works whether the column is called `ï..age` or plain `age`.
data2 %>%
  rename(age = matches("age$")) %>%
  count(age) %>%
  filter(n > 10) %>%
  ggplot() +
  geom_col(aes(age, n), fill = "yellow") +
  ggtitle("Age Analysis") +
  xlab("Age") +
  ylab("AgeCount")

#Compare blood pressure by sex across chest pain types
# Box plots of blood pressure (trestbps) for each sex, drawn as one panel per
# chest pain category.
ggplot(data = data2, mapping = aes(x = sex, y = trestbps)) +
  geom_boxplot(fill = "violet") +
  labs(x = "sex", y = "BP") +
  facet_grid(cols = vars(cp))

#Compare cholesterol by sex across chest pain types
# Box plots of cholesterol (chol) for each sex, drawn as one panel per chest
# pain category.
ggplot(data = data2, mapping = aes(x = sex, y = chol)) +
  geom_boxplot(fill = "orange") +
  labs(x = "sex", y = "Chol") +
  facet_grid(cols = vars(cp))

#Correlation
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.4
## corrplot 0.84 loaded
library(ggplot2)
# Correlation matrix of the numeric columns of data2.
# Columns are picked by type rather than by position (previously 10:14), so
# the result does not depend on the column order of data2. The categorical
# columns were converted to factors earlier, so only the numeric measurements
# remain.
cor_heart <- cor(data2[, vapply(data2, is.numeric, logical(1)), drop = FALSE])
cor_heart
##              ï..age    trestbps         chol      thalach     oldpeak
## ï..age    1.0000000  0.27935091  0.213677957 -0.398521938  0.21001257
## trestbps  0.2793509  1.00000000  0.123174207 -0.046697728  0.19321647
## chol      0.2136780  0.12317421  1.000000000 -0.009939839  0.05395192
## thalach  -0.3985219 -0.04669773 -0.009939839  1.000000000 -0.34418695
## oldpeak   0.2100126  0.19321647  0.053951920 -0.344186948  1.00000000
# Draw the upper triangle of the correlation matrix as coloured squares.
corrplot::corrplot(corr = cor_heart, type = "upper", method = "square")