Columns | Description |
---|---|
age | ranges from the late 20s to the late 70s (29 to 77 in this data set) |
sex | 1 = male; 0 = female |
chest pain type | 4 value types from 0 - 3 |
resting blood pressure | in mm Hg |
serum cholesterol | in mg/dl |
fasting blood sugar (> 120 mg/dl) | 1 = true; 0 = false |
resting electrocardiograph results | values 0, 1, 2 |
thalach | maximum heart rate achieved |
exercise induced angina | 1 = yes; 0 = no |
oldpeak | ST depression induced by exercise relative to rest |
slope | slope of the peak exercise ST segment |
ca | number of major vessels (0 - 3) colored by fluoroscopy |
thal | 1 = normal; 2 = fixed defect; 3 = reversible defect |
target | presence of heart disease in the patient: 0 = no disease; 1 = disease |
# load libraries needed
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("dplyr")
library("tidyr")
library("ggplot2")
library("hrbrthemes")
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
# install the packages first if they are not already installed
# install.packages("tidyverse")
# install.packages("dplyr")
# install.packages("tidyr")
# install.packages("ggplot2")
# hrbrthemes::import_roboto_condensed()
# load csv file
# The data set is read directly from GitHub. `na.strings` is spelled out in
# full instead of relying on partial matching of the abbreviated `na`.
heart_disease <- read.csv(
  "https://raw.githubusercontent.com/letisalba/Data-607/main/Tidyverse/heart.csv",
  header = TRUE,
  na.strings = ","
)
# `n` sets how many rows head() displays
head(heart_disease, n = 4)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 52 1 0 125 212 0 1 168 0 1.0 2 2 3
## 2 53 1 0 140 203 1 0 155 1 3.1 0 0 3
## 3 70 1 0 145 174 0 1 125 1 2.6 0 0 3
## 4 61 1 0 148 203 0 1 161 0 0.0 2 1 3
## target
## 1 0
## 2 0
## 3 0
## 4 0
# compact overview: one line per column with its type and first values
heart_disease %>% glimpse()
## Rows: 1,025
## Columns: 14
## $ age <int> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51, 52, 3…
## $ sex <int> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1…
## $ cp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 2…
## $ trestbps <int> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112, 132, 1…
## $ chol <int> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149, 341, 2…
## $ fbs <int> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ restecg <int> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0…
## $ thalach <int> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125, 136, 1…
## $ exang <int> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0…
## $ oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6, 3.0, 0…
## $ slope <int> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1…
## $ ca <int> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0…
## $ thal <int> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2, 2, 2…
## $ target <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0…
# descriptive statistics for every column of the data set
heart_disease %>% summary()
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.0000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :1.0000 Median :130.0
## Mean :54.43 Mean :0.6956 Mean :0.9424 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.0000 Max. :200.0
## chol fbs restecg thalach
## Min. :126 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132.0
## Median :240 Median :0.0000 Median :1.0000 Median :152.0
## Mean :246 Mean :0.1493 Mean :0.5298 Mean :149.1
## 3rd Qu.:275 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :1.000 Median :0.0000
## Mean :0.3366 Mean :1.072 Mean :1.385 Mean :0.7541
## 3rd Qu.:1.0000 3rd Qu.:1.800 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.324 Mean :0.5132
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
# descriptive statistics for a single column, selected by name
summary(heart_disease[["chol"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 126 211 240 246 275 564
# count the missing (NA) values in each column
heart_disease %>%
  is.na() %>%
  colSums()
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal target
## 0 0 0 0 0 0
# list the column names of the data frame
names(heart_disease)
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
# give every column a descriptive name; the vector must list the new names
# in the same order as the existing columns
colnames(heart_disease) <- c(
  "Age", "Gender", "Chest_Pain_Type", "Resting_Blood_Pressure",
  "Cholesterol", "Fasting_Blood_Sugar", "Resting_ECG", "Thalach",
  "Exercise_Induced_Angina", "Oldpeak", "Slope", "Num_Major_Vessels",
  "Thalassemia", "Target"
)
# to rename a single column instead, index it by position (here column 1):
# names(heart_disease)[1] <- 'Age'
head(heart_disease, 2)
## Age Gender Chest_Pain_Type Resting_Blood_Pressure Cholesterol
## 1 52 1 0 125 212
## 2 53 1 0 140 203
## Fasting_Blood_Sugar Resting_ECG Thalach Exercise_Induced_Angina Oldpeak Slope
## 1 0 1 168 0 1.0 2
## 2 1 0 155 1 3.1 0
## Num_Major_Vessels Thalassemia Target
## 1 2 3 0
## 2 0 3 0
# Recode the numeric category codes as readable labels:
#   Gender:                  1 = Male, 0 = Female
#   Fasting_Blood_Sugar:     1 = True, 0 = False
#   Exercise_Induced_Angina: 1 = Yes, 0 = No
#   Thalassemia:             1 = Normal, 2 = Fixed Defect, 3 = Reversible Defect
#   Target:                  0 = No Disease, 1 = Disease
# A code without a label (e.g. the Thalassemia value 0 that appears in the
# summary output) is kept unchanged, converted to character, so every
# recoded column becomes a character column.
heart <- heart_disease %>%
  mutate(
    Gender = case_when(
      Gender == 1 ~ "Male",
      Gender == 0 ~ "Female",
      TRUE ~ as.character(Gender)
    ),
    Fasting_Blood_Sugar = case_when(
      Fasting_Blood_Sugar == 1 ~ "True",
      Fasting_Blood_Sugar == 0 ~ "False",
      TRUE ~ as.character(Fasting_Blood_Sugar)
    ),
    Exercise_Induced_Angina = case_when(
      Exercise_Induced_Angina == 1 ~ "Yes",
      Exercise_Induced_Angina == 0 ~ "No",
      TRUE ~ as.character(Exercise_Induced_Angina)
    ),
    Thalassemia = case_when(
      Thalassemia == 1 ~ "Normal",
      Thalassemia == 2 ~ "Fixed Defect",
      Thalassemia == 3 ~ "Reversible Defect",
      TRUE ~ as.character(Thalassemia)
    ),
    Target = case_when(
      Target == 0 ~ "No Disease",
      Target == 1 ~ "Disease",
      TRUE ~ as.character(Target)
    )
  )
head(heart, n = 4)
## Age Gender Chest_Pain_Type Resting_Blood_Pressure Cholesterol
## 1 52 Male 0 125 212
## 2 53 Male 0 140 203
## 3 70 Male 0 145 174
## 4 61 Male 0 148 203
## Fasting_Blood_Sugar Resting_ECG Thalach Exercise_Induced_Angina Oldpeak Slope
## 1 False 1 168 No 1.0 2
## 2 True 0 155 Yes 3.1 0
## 3 False 1 125 Yes 2.6 0
## 4 False 1 161 No 0.0 2
## Num_Major_Vessels Thalassemia Target
## 1 2 Reversible Defect No Disease
## 2 0 Reversible Defect No Disease
## 3 0 Reversible Defect No Disease
## 4 1 Reversible Defect No Disease
# Age Analysis histogram
# Distribution of patient ages in 3-year bins; the plot object is stored
# first and then printed on its own line.
age_histogram <- heart %>%
  ggplot(aes(x = Age)) +
  geom_histogram(
    binwidth = 3,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  ggtitle("Age Analysis Histogram") +
  theme_ipsum() +
  theme(
    plot.title = element_text(size = 15)
  )
age_histogram
# Fasting Blood Sugar and Target stacked bar plot
# One bar per Fasting_Blood_Sugar value, with the fill colour showing how
# the counts split by Target.
barplot <- heart %>%
  ggplot(aes(x = Fasting_Blood_Sugar, fill = Target)) +
  geom_bar() +
  ggtitle("Fasting Blood Sugar in Relation to Heart Disease Histogram") +
  xlab("Fasting Blood Sugar") +
  ylab("Count") +
  theme_ipsum() +
  theme(legend.position = "right")
barplot
# side by side bar plot
# Counts per Chest_Pain_Type, filled by Target, drawn as one panel per
# Gender via facet_wrap().
plot <- heart %>%
  ggplot(aes(x = Chest_Pain_Type, fill = Target)) +
  theme_bw() +
  geom_bar() +
  facet_wrap(~Gender) +
  labs(
    x = "Chest Pain Type",
    y = "Count",
    title = "Distribution of Target by Gender based on Chest Pain Type"
  )
plot
# Blood Pressure and Cholesterol scatter plot
# Each point is one row of `heart`; the red line is a linear-model fit
# (method = lm) drawn without its confidence band (se = FALSE).
scatterplot <- heart %>%
  ggplot(aes(x = Resting_Blood_Pressure, y = Cholesterol)) +
  geom_point(color = "#69b3a2") +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  theme_ipsum()
scatterplot
## `geom_smooth()` using formula 'y ~ x'