Columns definition:
age
sex
chest pain type (4 values)
resting blood pressure
serum cholesterol in mg/dl
fasting blood sugar > 120 mg/dl
resting electrocardiographic results (values 0,1,2)
maximum heart rate achieved
exercise induced angina
oldpeak = ST depression induced by exercise relative to rest
the slope of the peak exercise ST segment
number of major vessels (0-3) colored by fluoroscopy
thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
#Load all required packages
library(DT)
library(tidyr)
library(dplyr)
library(ggplot2)
library(tidyverse)
Load data and examine columns
# Fetch the heart-disease CSV straight from the GitHub repository.
# The first header is read as `ï..age` (see the printed column names);
# presumably byte-order-mark residue. The columns are renamed positionally
# further down, so the mangled name does not survive.
heartDS <- read.csv(
  file = "https://raw.githubusercontent.com/theoracley/Data607/master/Project2/heart.csv",
  header = TRUE,
  sep = ","
)
# Show the first six rows as a quick sanity check
head(heartDS)
## ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca
## 1 63 1 3 145 233 1 0 150 0 2.3 0 0
## 2 37 1 2 130 250 0 1 187 0 3.5 0 0
## 3 41 0 1 130 204 0 0 172 0 1.4 2 0
## 4 56 1 1 120 236 0 1 178 0 0.8 2 0
## 5 57 0 0 120 354 0 1 163 1 0.6 2 0
## 6 57 1 0 140 192 0 1 148 0 0.4 1 0
## thal target
## 1 1 1
## 2 2 1
## 3 2 1
## 4 2 1
## 5 2 1
## 6 1 1
# Column names exactly as they came out of the CSV
names(heartDS)
## [1] "ï..age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
# Swap the terse source headers for descriptive ones. The assignment is
# positional, so the vector below must follow the column order of the file.
names(heartDS) <- c(
  "age", "sex", "chest_pain", "blood_pressure", "cholesterol", "sugar",
  "cardio", "heart_rate", "exercise", "oldpeak", "slope", "ca", "thal",
  "target"
)
names(heartDS)
## [1] "age" "sex" "chest_pain" "blood_pressure"
## [5] "cholesterol" "sugar" "cardio" "heart_rate"
## [9] "exercise" "oldpeak" "slope" "ca"
## [13] "thal" "target"
# Optional structural checks, left disabled:
# str(heartDS)
# summary(heartDS)
# Render the full, renamed data set as an interactive table
DT::datatable(data = heartDS)
Clean up time
# Keep only the columns used in the rest of the analysis
ChoosenColumns <- c(
  "age", "sex", "chest_pain", "blood_pressure",
  "cholesterol", "sugar", "cardio", "heart_rate"
)
heartDS_new <- heartDS[, ChoosenColumns, drop = FALSE]
# Browse the reduced table
heartDS_new %>% datatable()
# Rows and columns of the reduced table
dim(heartDS_new)
## [1] 303 8
# Indicator columns for sex: M is 1 when sex == 1, F is 1 when sex == 0.
heartDS_new <- heartDS_new %>%
  mutate(
    M = if_else(sex == 1, 1, 0),
    F = if_else(sex == 0, 1, 0)
  )
# Bucket blood pressure: 120 and above is "high", anything lower is "Normal".
# A missing blood pressure stays NA (if_else() propagates NA), so no third
# fallback label is needed.
heartDS_new <- heartDS_new %>%
  mutate(blood_pressure_cat = if_else(blood_pressure >= 120, "high", "Normal"))
# Binarize chest_pain: only values above 2 are flagged as 1.
# NOTE(review): confirm this threshold against the data dictionary.
heartDS_new <- heartDS_new %>%
  mutate(has_chest_pain = if_else(chest_pain > 2, 1, 0))
# Fold the M/F indicators into a single sex_col label.
# gather() emits one M row and one F row per patient, which doubles the table
# (303 rows become 606) and inflates every later count and plot. Keeping only
# the row whose indicator is 1 restores one row per patient.
heartDS_new <- heartDS_new %>%
  gather(key = sex_col, value = sex_count, M, F) %>%
  filter(sex_count == 1)
datatable(heartDS_new)
eyeball the graph
# Scatter of cholesterol against heart rate for the entire population,
# colored by blood-pressure category. Written with ggplot() because qplot()
# is deprecated in current ggplot2 releases; point size 3 and alpha 0.6 are
# fixed settings rather than mapped aesthetics, matching the earlier I() use.
ggplot(
  heartDS_new,
  aes(x = cholesterol, y = heart_rate, color = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate")
According to this plot, there is no clear relationship between cholesterol and heart rate. The points are scattered everywhere regardless of blood-pressure category (high or normal); no pattern is noticeable.
Consider the population above 60
# Rows for patients strictly older than 60.
# NOTE(review): age == 60 falls in neither this group nor the "below 60" one.
heartDS_new_above_60 <- filter(heartDS_new, age > 60)
# Male portion (the age condition is already applied above)
heartDS_new_above_60_M <- filter(heartDS_new_above_60, sex == 1)
# Female portion
heartDS_new_above_60_F <- filter(heartDS_new_above_60, sex == 0)
# Blood pressure divided by 100. This is a rescaling, not a true percentage,
# and it is added after the M/F subsets are taken, so they do not carry it.
heartDS_new_above_60 <- mutate(
  heartDS_new_above_60,
  blood_pressure_per = blood_pressure / 100
)
# Check it
datatable(heartDS_new_above_60)
# Plot it: ggplot() replaces the deprecated qplot(), same fixed size and alpha
ggplot(
  heartDS_new_above_60,
  aes(x = cholesterol, y = heart_rate, color = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Above 60)")
Consider the population below 60
# Rows for patients strictly younger than 60.
# NOTE(review): age == 60 falls in neither this group nor the "above 60" one.
heartDS_new_below_60 <- filter(heartDS_new, age < 60)
# Male portion (the age condition is already applied above)
heartDS_new_below_60_M <- filter(heartDS_new_below_60, sex == 1)
# Female portion
heartDS_new_below_60_F <- filter(heartDS_new_below_60, sex == 0)
# Blood pressure divided by 100. This is a rescaling, not a true percentage,
# and it is added after the M/F subsets are taken, so they do not carry it.
heartDS_new_below_60 <- mutate(
  heartDS_new_below_60,
  blood_pressure_per = blood_pressure / 100
)
# Check it
datatable(heartDS_new_below_60)
# Plot it: ggplot() replaces the deprecated qplot(), same fixed size and alpha
ggplot(
  heartDS_new_below_60,
  aes(x = cholesterol, y = heart_rate, color = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Below 60)")
Plot the population groups
# Shared recipe for the four subgroup scatter plots: cholesterol on x,
# heart rate on y, colored by blood-pressure category, fixed point size 3 and
# alpha 0.6. Uses ggplot() because qplot() is deprecated in current ggplot2.
#   df    - data frame with cholesterol, heart_rate and blood_pressure_cat
#   title - plot title
# Returns the ggplot object, which prints when called at top level.
plot_chol_vs_heart_rate <- function(df, title) {
  ggplot(
    df,
    aes(x = cholesterol, y = heart_rate, color = blood_pressure_cat)
  ) +
    geom_point(size = 3, alpha = 0.6) +
    ggtitle(title)
}
plot_chol_vs_heart_rate(heartDS_new_above_60_M, "Cholesterol vs Heart Rate (Male Above 60)")
plot_chol_vs_heart_rate(heartDS_new_above_60_F, "Cholesterol vs Heart Rate (Female above 60)")
plot_chol_vs_heart_rate(heartDS_new_below_60_M, "Cholesterol vs Heart Rate (Male Below 60)")
plot_chol_vs_heart_rate(heartDS_new_below_60_F, "Cholesterol vs Heart Rate (Female Below 60)")
What about age and cholesterol
# Cholesterol (x) against age (y) for all rows, colored by blood-pressure
# category. ggplot() replaces the deprecated qplot(); size and alpha are fixed.
ggplot(
  heartDS_new,
  aes(x = cholesterol, y = age, color = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Age vs Cholesterol (All)")
From the different plots of cholesterol vs. heart rate, there seems to be no relationship between these two variables. But when plotting age against cholesterol, it is clear that cholesterol increases as age increases; therefore there is a strong positive correlation between age and cholesterol level.
We could continue computing statistics, but we would be trying combinations at random. There is, however, a statistical method, feature selection, that can pick for us the best columns, the ones most likely to lead to a fruitful conclusion. Until then, let me show off some skills I picked up in statistics.
# Overall mean blood pressure, ignoring missing values
heartDS_new %>%
  summarise(mean(blood_pressure, na.rm = TRUE))
## mean(blood_pressure, na.rm = TRUE)
## 1 131.6238
# Mean blood pressure within each sex group
heartDS_new %>%
  group_by(sex) %>%
  summarise(mean(blood_pressure, na.rm = TRUE))
## # A tibble: 2 x 2
## sex `mean(blood_pressure, na.rm = TRUE)`
## <int> <dbl>
## 1 0 133.
## 2 1 131.
# Interactive table of 20 rows drawn at random
heartDS_new %>%
  sample_n(size = 20) %>%
  datatable()
# Same idea, but drawing 15% of the rows
heartDS_new %>%
  sample_frac(size = 0.15) %>%
  datatable()
# Number of rows for each value of sex
heartDS_new %>% count(sex)
## # A tibble: 2 x 2
## sex n
## <int> <int>
## 1 0 192
## 2 1 414
# Interactive table ordered by descending age, ties broken by sex
heartDS_new %>%
  arrange(desc(age), sex) %>%
  datatable()
# And finally: column-wise summary statistics for the rows with sex == 1.
# summary() is called with the data alone. A grouping step has no effect on
# summary(), and an extra positional argument would be bound to its `maxsum`
# parameter rather than evaluated against the data.
heartDS_new %>%
  filter(sex == 1) %>%
  summary()
## age sex chest_pain blood_pressure
## Min. :29.00 Min. :1 Min. :0.0000 Min. : 94.0
## 1st Qu.:47.00 1st Qu.:1 1st Qu.:0.0000 1st Qu.:120.0
## Median :54.00 Median :1 Median :0.0000 Median :130.0
## Mean :53.76 Mean :1 Mean :0.9324 Mean :130.9
## 3rd Qu.:59.75 3rd Qu.:1 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1 Max. :3.0000 Max. :192.0
## cholesterol sugar cardio heart_rate
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71
## 1st Qu.:208.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132
## Median :235.0 Median :0.0000 Median :1.0000 Median :151
## Mean :239.3 Mean :0.1594 Mean :0.5072 Mean :149
## 3rd Qu.:268.5 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:168
## Max. :353.0 Max. :1.0000 Max. :2.0000 Max. :202
## blood_pressure_cat has_chest_pain sex_col sex_count
## Length:414 Min. :0.00000 Length:414 Min. :0.0
## Class :character 1st Qu.:0.00000 Class :character 1st Qu.:0.0
## Mode :character Median :0.00000 Mode :character Median :0.5
## Mean :0.09179 Mean :0.5
## 3rd Qu.:0.00000 3rd Qu.:1.0
## Max. :1.00000 Max. :1.0