Dataset: HEART Disease as per my discussion post

found at: https://www.kaggle.com/ronitf/heart-disease-uci

Columns definition:

  1. age

  2. sex

  3. chest pain type (4 values)

  4. resting blood pressure

  5. serum cholesterol in mg/dl

  6. fasting blood sugar > 120 mg/dl

  7. resting electrocardiographic results (values 0,1,2)

  8. maximum heart rate achieved

  9. exercise induced angina

  10. oldpeak = ST depression induced by exercise relative to rest

  11. the slope of the peak exercise ST segment

  12. number of major vessels (0-3) colored by fluoroscopy

  13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

#Load all required packages
library(DT)
library(tidyr)
library(dplyr)    
library(ggplot2)
library(tidyverse)

Load data and examine columns

# Pull the heart-disease CSV straight from the GitHub raw URL.
# read.csv() already assumes a header row and a comma separator.
# NOTE(review): the first header prints as `ï..age` in the output below,
# presumably a UTF-8 byte-order mark in the file; reading with
# fileEncoding = "UTF-8-BOM" would likely avoid it -- confirm.
heartDS <- read.csv(
  file = "https://raw.githubusercontent.com/theoracley/Data607/master/Project2/heart.csv"
)

# Preview the first six rows
head(heartDS, n = 6L)
##   ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca
## 1     63   1  3      145  233   1       0     150     0     2.3     0  0
## 2     37   1  2      130  250   0       1     187     0     3.5     0  0
## 3     41   0  1      130  204   0       0     172     0     1.4     2  0
## 4     56   1  1      120  236   0       1     178     0     0.8     2  0
## 5     57   0  0      120  354   0       1     163     1     0.6     2  0
## 6     57   1  0      140  192   0       1     148     0     0.4     1  0
##   thal target
## 1    1      1
## 2    2      1
## 3    2      1
## 4    2      1
## 5    2      1
## 6    1      1
# Column names exactly as delivered by the CSV
names(heartDS)
##  [1] "ï..age"   "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"
# Swap in descriptive names; they are assigned by position, so the order
# here must follow the column order of the file.
names(heartDS) <- c(
  "age", "sex", "chest_pain", "blood_pressure", "cholesterol", "sugar",
  "cardio", "heart_rate", "exercise", "oldpeak", "slope", "ca", "thal",
  "target"
)
names(heartDS)
##  [1] "age"            "sex"            "chest_pain"     "blood_pressure"
##  [5] "cholesterol"    "sugar"          "cardio"         "heart_rate"    
##  [9] "exercise"       "oldpeak"        "slope"          "ca"            
## [13] "thal"           "target"
# str(heartDS)
# summary(heartDS)

# Show the renamed data with datatable()
datatable(heartDS)

Clean up time

# Keep only the eight columns used in the rest of the analysis.
ChoosenColumns <- c(
  "age", "sex", "chest_pain", "blood_pressure",
  "cholesterol", "sugar", "cardio", "heart_rate"
)
heartDS_new <- heartDS[ChoosenColumns]

# Check out the reduced data
datatable(heartDS_new)
# Data dimensions
dim(heartDS_new)
## [1] 303   8

# Derive the helper columns in one pass:
#   M / F              - 0/1 indicators built from sex (1 -> M, 0 -> F)
#   blood_pressure_cat - "high" at 120 or above, "Normal" below 120
#   has_chest_pain     - 1 when chest_pain is greater than 2, otherwise 0
# One if_else() is enough for blood_pressure_cat: the former inner fallback
# "" could never be produced, because a missing blood_pressure gives NA from
# if_else() instead of reaching the fallback.
heartDS_new <- heartDS_new %>%
  mutate(
    M = if_else(sex == 1, 1, 0),
    F = if_else(sex == 0, 1, 0),
    blood_pressure_cat = if_else(blood_pressure >= 120, "high", "Normal"),
    has_chest_pain = if_else(chest_pain > 2, 1, 0)
  )

# Reshape the M / F indicators to long form: sex_col holds the indicator
# name ("M" or "F") and sex_count its 0/1 value. pivot_longer() replaces the
# superseded gather(); as.data.frame() keeps the plain data.frame class that
# gather() returned. Rows now come out interleaved per patient (M, F, M, F,
# ...) where gather() stacked all M rows before all F rows.
# NOTE(review): every patient appears twice after this step (303 rows become
# 606), so anything computed or plotted from heartDS_new counts each patient
# twice. Keeping only rows with sex_count == 1 would restore one row per
# patient -- confirm what is intended.
heartDS_new <- heartDS_new %>%
  pivot_longer(
    cols = c("M", "F"),
    names_to = "sex_col",
    values_to = "sex_count"
  ) %>%
  as.data.frame()
datatable(heartDS_new)

Eyeball the graph

# Scatter plot of cholesterol against heart rate for the entire population,
# coloured by blood-pressure category. Built with ggplot() + geom_point()
# because qplot() is deprecated as of ggplot2 3.4.0; point size, transparency,
# colour mapping and title are the same as in the former qplot() call.
ggplot(
  heartDS_new,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate")

According to this plot, there is no clear relationship between cholesterol and heart rate. The points are scattered across the whole plot regardless of blood-pressure category (high or normal), and no pattern is noticeable.

Consider the population above 60

# Patients strictly older than 60.
heartDS_new_above_60 <- heartDS_new %>% filter(age > 60)

# Male portion (sex == 1). The age condition is already applied above, so it
# is not repeated here.
heartDS_new_above_60_M <- heartDS_new_above_60 %>% filter(sex == 1)

# Female portion (sex == 0)
heartDS_new_above_60_F <- heartDS_new_above_60 %>% filter(sex == 0)

# Add blood_pressure_per = blood_pressure / 100. The male/female subsets
# above were taken before this step and therefore do not carry the column.
heartDS_new_above_60 <- heartDS_new_above_60 %>%
  mutate(blood_pressure_per = blood_pressure / 100)

# Check it
datatable(heartDS_new_above_60)
# Plot it. ggplot() + geom_point() replaces qplot(), which is deprecated as
# of ggplot2 3.4.0; the aesthetics and title are unchanged.
ggplot(
  heartDS_new_above_60,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Above 60)")

Consider the population below 60

# Patients strictly younger than 60.
# NOTE(review): patients aged exactly 60 are in neither this group nor the
# `age > 60` group -- confirm whether one side should include them.
heartDS_new_below_60 <- heartDS_new %>% filter(age < 60)

# Male portion (sex == 1). The age condition is already applied above, so it
# is not repeated here.
heartDS_new_below_60_M <- heartDS_new_below_60 %>% filter(sex == 1)

# Female portion (sex == 0)
heartDS_new_below_60_F <- heartDS_new_below_60 %>% filter(sex == 0)

# Add blood_pressure_per = blood_pressure / 100. The male/female subsets
# above were taken before this step and therefore do not carry the column.
heartDS_new_below_60 <- heartDS_new_below_60 %>%
  mutate(blood_pressure_per = blood_pressure / 100)

# Check it
datatable(heartDS_new_below_60)
# Plot it. ggplot() + geom_point() replaces qplot(), which is deprecated as
# of ggplot2 3.4.0; the aesthetics and title are unchanged.
ggplot(
  heartDS_new_below_60,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Below 60)")

Plot the population groups

# Cholesterol against heart rate for male patients older than 60, coloured by
# blood-pressure category. ggplot() + geom_point() replaces qplot(), which is
# deprecated as of ggplot2 3.4.0; the aesthetics and title are unchanged.
ggplot(
  heartDS_new_above_60_M,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Male Above 60)")