Project2c

Dataset: HEART Disease as per my discussion post

found at: https://www.kaggle.com/ronitf/heart-disease-uci

Columns definition:

age
sex
chest pain type (4 values)
resting blood pressure
serum cholesterol in mg/dl
fasting blood sugar > 120 mg/dl
resting electrocardiographic results (values 0,1,2)
maximum heart rate achieved
exercise induced angina
oldpeak = ST depression induced by exercise relative to rest
the slope of the peak exercise ST segment
number of major vessels (0-3) colored by fluoroscopy
thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

#Load all required packages
library(DT)
library(tidyr)
library(dplyr)    
library(ggplot2)
library(tidyverse)

Load data and examine columns

# Read the heart-disease CSV straight from GitHub.
# Without an explicit encoding the first header comes back as "ï..age", i.e. a
# UTF-8 byte-order mark gets folded into the column name. Reading with
# fileEncoding = "UTF-8-BOM" strips the mark so the first column is plain "age".
heartDS <- read.csv(
  "https://raw.githubusercontent.com/theoracley/Data607/master/Project2/heart.csv",
  header = TRUE,
  sep = ",",
  fileEncoding = "UTF-8-BOM"
)

# Preview the first rows of the raw data.
head(heartDS)

##   ï..age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca
## 1     63   1  3      145  233   1       0     150     0     2.3     0  0
## 2     37   1  2      130  250   0       1     187     0     3.5     0  0
## 3     41   0  1      130  204   0       0     172     0     1.4     2  0
## 4     56   1  1      120  236   0       1     178     0     0.8     2  0
## 5     57   0  0      120  354   0       1     163     1     0.6     2  0
## 6     57   1  0      140  192   0       1     148     0     0.4     1  0
##   thal target
## 1    1      1
## 2    2      1
## 3    2      1
## 4    2      1
## 5    2      1
## 6    1      1

# Inspect the raw column names before renaming them.
names(heartDS)

##  [1] "ï..age"   "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"

# Replace the terse source headers with descriptive names.
# The new names are assigned by position, so the order must match the CSV.
names(heartDS) <- c(
  "age", "sex", "chest_pain", "blood_pressure", "cholesterol", "sugar",
  "cardio", "heart_rate", "exercise", "oldpeak", "slope", "ca", "thal",
  "target"
)
names(heartDS)

##  [1] "age"            "sex"            "chest_pain"     "blood_pressure"
##  [5] "cholesterol"    "sugar"          "cardio"         "heart_rate"    
##  [9] "exercise"       "oldpeak"        "slope"          "ca"            
## [13] "thal"           "target"

# Optional structural checks, left disabled:
# str(heartDS)
# summary(heartDS)

# Browse the full data set in an interactive table.
heartDS %>% datatable()

Clean up time

# Keep only the columns used in the analysis below.
ChoosenColumns <- c(
  "age", "sex", "chest_pain", "blood_pressure",
  "cholesterol", "sugar", "cardio", "heart_rate"
)
heartDS_new <- heartDS[, ChoosenColumns, drop = FALSE]

# Browse the reduced data set.
heartDS_new %>% datatable()

# Number of rows and columns in the reduced data set.
dim(heartDS_new)

## [1] 303   8

# Derive helper columns on heartDS_new, then reshape the sex indicators into
# long form (sex_col / sex_count) while keeping exactly one row per subject.

# One-hot encode sex: M is 1 where sex == 1, F is 1 where sex == 0.
heartDS_new <- heartDS_new %>%
  mutate(
    M = if_else(sex == 1, 1, 0),
    F = if_else(sex == 0, 1, 0)
  )

# Categorize blood pressure: "high" at 120 and above, "Normal" below 120.
# A missing blood_pressure yields NA, as if_else() propagates NA conditions.
heartDS_new <- heartDS_new %>%
  mutate(blood_pressure_cat = if_else(blood_pressure >= 120, "high", "Normal"))

# Binarize chest pain: 1 when chest_pain is greater than 2, otherwise 0.
heartDS_new <- heartDS_new %>%
  mutate(has_chest_pain = if_else(chest_pain > 2, 1, 0))

# Gather the M/F indicator columns into key/value form. gather() emits one row
# per subject per indicator, which would double the data set (and every count
# and plotted point derived from it), so keep only the indicator that is set.
# After the filter sex_count is always 1 and sex_col names the subject's sex.
heartDS_new <- heartDS_new %>%
  gather(key = sex_col, value = sex_count, M, F) %>%
  filter(sex_count == 1)

# Browse the reshaped data set.
datatable(heartDS_new)

Eyeball the graph

# Scatter plot for the entire population: cholesterol against heart rate,
# colored by blood pressure category.
ggplot(
  heartDS_new,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate")

According to this plot, there is no clear relationship between cholesterol and heart rate. The points are scattered everywhere regardless of the blood pressure category (high or normal), and no pattern is noticeable.

Consider the population above 60

# Subjects strictly older than 60.
# NOTE(review): the "below 60" subset in this file uses age < 60, so subjects
# aged exactly 60 fall into neither group -- confirm that this is intended.
heartDS_new_above_60 <- heartDS_new %>% filter(age > 60)

# Male portion (sex == 1); the age condition already holds for the parent subset.
heartDS_new_above_60_M <- heartDS_new_above_60 %>% filter(sex == 1)

# Female portion (sex == 0).
heartDS_new_above_60_F <- heartDS_new_above_60 %>% filter(sex == 0)

# Add blood pressure divided by 100. This is done after the male/female
# subsets are taken, so those subsets do not carry the new column.
heartDS_new_above_60 <- heartDS_new_above_60 %>%
  mutate(blood_pressure_per = blood_pressure / 100)

# Check the result.
heartDS_new_above_60 %>% datatable()

# Plot cholesterol against heart rate for this age group.
ggplot(
  heartDS_new_above_60,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Above 60)")

Consider the population below 60

# Subjects strictly younger than 60.
# NOTE(review): the "above 60" subset in this file uses age > 60, so subjects
# aged exactly 60 fall into neither group -- confirm that this is intended.
heartDS_new_below_60 <- heartDS_new %>% filter(age < 60)

# Male portion (sex == 1); the age condition already holds for the parent subset.
heartDS_new_below_60_M <- heartDS_new_below_60 %>% filter(sex == 1)

# Female portion (sex == 0).
heartDS_new_below_60_F <- heartDS_new_below_60 %>% filter(sex == 0)

# Add blood pressure divided by 100. This is done after the male/female
# subsets are taken, so those subsets do not carry the new column.
heartDS_new_below_60 <- heartDS_new_below_60 %>%
  mutate(blood_pressure_per = blood_pressure / 100)

# Check the result.
heartDS_new_below_60 %>% datatable()

# Plot cholesterol against heart rate for this age group.
ggplot(
  heartDS_new_below_60,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Below 60)")

Plot the population groups

# Cholesterol against heart rate for each age/sex subset, colored by blood
# pressure category. Each plot is a top-level expression so it prints itself.

# Males above 60.
ggplot(
  heartDS_new_above_60_M,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Male Above 60)")

# Females above 60.
ggplot(
  heartDS_new_above_60_F,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Female above 60)")

# Males below 60.
ggplot(
  heartDS_new_below_60_M,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Male Below 60)")

# Females below 60.
ggplot(
  heartDS_new_below_60_F,
  aes(x = cholesterol, y = heart_rate, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Cholesterol vs Heart Rate (Female Below 60)")

What about age and cholesterol

# Age against cholesterol for the whole population, colored by blood pressure
# category.
ggplot(
  heartDS_new,
  aes(x = cholesterol, y = age, colour = blood_pressure_cat)
) +
  geom_point(size = 3, alpha = 0.6) +
  ggtitle("Age vs Cholesterol (All)")

Conclusion

From the different plots of cholesterol vs. heart rate, there seems to be no relation between these two variables. But when plotting age against cholesterol, it is clear that cholesterol increases as age increases; therefore, there is a strong positive correlation between age and the cholesterol level.

We could continue computing statistics, but we would only be trying combinations at random. There are, however, statistical methods that perform feature selection and pick for us the best columns, which may lead to a fruitful conclusion. Until then, let me show off some skills I picked up in statistics.

# Overall mean blood pressure, ignoring missing values.
# The summary column is left unnamed so it keeps its expression-derived header.
heartDS_new %>%
  summarise(mean(blood_pressure, na.rm = TRUE))

##   mean(blood_pressure, na.rm = TRUE)
## 1                           131.6238

# Mean blood pressure per sex, ignoring missing values.
heartDS_new %>%
  group_by(sex) %>%
  summarise(mean(blood_pressure, na.rm = TRUE))

## # A tibble: 2 x 2
##     sex `mean(blood_pressure, na.rm = TRUE)`
##   <int>                                <dbl>
## 1     0                                 133.
## 2     1                                 131.

# Random sample of 20 rows.
heartDS_new %>%
  sample_n(size = 20) %>%
  datatable()

# Random sample of 15% of the rows.
heartDS_new %>%
  sample_frac(size = .15) %>%
  datatable()

# Number of rows per sex.
heartDS_new %>% count(sex)

## # A tibble: 2 x 2
##     sex     n
##   <int> <int>
## 1     0   192
## 2     1   414

# Sort by age (oldest first), breaking ties by sex, and browse the result.
heartDS_new %>%
  arrange(desc(age), sex) %>%
  datatable()

# Finally: per-column summary statistics for the male subjects (sex == 1).
# summary() describes every column of the data frame; it takes no expression
# to aggregate and ignores dplyr grouping, so it is called on the filtered
# data directly, with no grouping step and no extra argument.
heartDS_new %>%
  filter(sex == 1) %>%
  summary()

##       age             sex      chest_pain     blood_pressure 
##  Min.   :29.00   Min.   :1   Min.   :0.0000   Min.   : 94.0  
##  1st Qu.:47.00   1st Qu.:1   1st Qu.:0.0000   1st Qu.:120.0  
##  Median :54.00   Median :1   Median :0.0000   Median :130.0  
##  Mean   :53.76   Mean   :1   Mean   :0.9324   Mean   :130.9  
##  3rd Qu.:59.75   3rd Qu.:1   3rd Qu.:2.0000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1   Max.   :3.0000   Max.   :192.0  
##   cholesterol        sugar            cardio         heart_rate 
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71  
##  1st Qu.:208.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:132  
##  Median :235.0   Median :0.0000   Median :1.0000   Median :151  
##  Mean   :239.3   Mean   :0.1594   Mean   :0.5072   Mean   :149  
##  3rd Qu.:268.5   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:168  
##  Max.   :353.0   Max.   :1.0000   Max.   :2.0000   Max.   :202  
##  blood_pressure_cat has_chest_pain      sex_col            sex_count  
##  Length:414         Min.   :0.00000   Length:414         Min.   :0.0  
##  Class :character   1st Qu.:0.00000   Class :character   1st Qu.:0.0  
##  Mode  :character   Median :0.00000   Mode  :character   Median :0.5  
##                     Mean   :0.09179                      Mean   :0.5  
##                     3rd Qu.:0.00000                      3rd Qu.:1.0  
##                     Max.   :1.00000                      Max.   :1.0

Project2c

Abdelmalek Hajjam

10/2/2019

Dataset: HEART Disease as per my discussion post

found at: https://www.kaggle.com/ronitf/heart-disease-uci

Conclusion