#Data Description (Heartdata) & Preprocessing
https://archive.ics.uci.edu/ml/datasets/Heart+Disease
data <- read_csv("heart.csv")
## Parsed with column specification:
## cols(
## age = col_double(),
## sex = col_double(),
## cp = col_double(),
## trestbps = col_double(),
## chol = col_double(),
## fbs = col_double(),
## restecg = col_double(),
## thalach = col_double(),
## exang = col_double(),
## oldpeak = col_double(),
## slope = col_double(),
## ca = col_double(),
## thal = col_double(),
## target = col_double()
## )
str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 303 obs. of 14 variables:
## $ age : num 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : num 1 1 0 1 0 1 0 1 1 1 ...
## $ cp : num 3 2 1 1 0 0 1 1 2 2 ...
## $ trestbps: num 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : num 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : num 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : num 0 1 0 1 1 1 0 1 1 1 ...
## $ thalach : num 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : num 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : num 0 0 2 2 2 1 1 2 2 2 ...
## $ ca : num 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : num 1 2 2 2 2 1 2 3 3 2 ...
## $ target : num 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_double(),
## .. cp = col_double(),
## .. trestbps = col_double(),
## .. chol = col_double(),
## .. fbs = col_double(),
## .. restecg = col_double(),
## .. thalach = col_double(),
## .. exang = col_double(),
## .. oldpeak = col_double(),
## .. slope = col_double(),
## .. ca = col_double(),
## .. thal = col_double(),
## .. target = col_double()
## .. )
The dataset consists of 14 physological patient attributes as follows:
Attribute Information
age: age (continuous)
sex: gender (categorical, 0=male, 1=female)
cp: chest pain type (4 values, ordinal/categorical))
trestbps resting blood pressure (continuous)
chol: serum cholestoral in mg/dl (continuous)
fbs: fasting blood sugar > 120 mg/dl (categorical is >120 =1 or <120 =0)
restecg: resting electrocardiographic results (values 0,1,2, categorical)
thalack: maximum heart rate achieved (continuous)
exang: exercise induced angina (categorical, 1=angina 0 = no agina)
oldpeak: ST depression induced by exercise relative to rest (continuous)
slope: the slope of the peak exercise ST segment (continuous)
ca : number of major vessels (0-3) colored by flourosopy (factor)
thal 3 = normal; 6 = fixed defect; 7 = reversable defect (categorical)
target 1= heart disease present; 0 = no heart disease (categorical, dependant variable)
plot_missing(data)
Data engineering
data2 <- data %>%
mutate(sex = if_else(sex == 1, "MALE", "FEMALE"),
fbs = if_else(fbs == 1, ">120", "<=120"),
exang = if_else(exang == 1, "YES" ,"NO"),
cp = if_else(cp == 1, "ATYPICAL ANGINA",
if_else(cp == 2, "NON-ANGINAL PAIN", "ASYMPTOMATIC")),
restecg = if_else(restecg == 0, "NORMAL",
if_else(restecg == 1, "ABNORMALITY", "PROBABLE OR DEFINITE")),
target = if_else(target == 1, "YES", "NO")
) %>%
mutate_if(is.character, as.factor) %>%
dplyr::select(target, sex, fbs, exang, cp, restecg, slope, ca, thal, everything())
str(data2)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 303 obs. of 14 variables:
## $ target : Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 2 2 2 ...
## $ sex : Factor w/ 2 levels "FEMALE","MALE": 2 2 1 2 1 2 1 2 2 2 ...
## $ fbs : Factor w/ 2 levels "<=120",">120": 2 1 1 1 1 1 1 1 2 1 ...
## $ exang : Factor w/ 2 levels "NO","YES": 1 1 1 1 2 1 1 1 1 1 ...
## $ cp : Factor w/ 3 levels "ASYMPTOMATIC",..: 1 3 2 2 1 1 2 2 3 3 ...
## $ restecg : Factor w/ 3 levels "ABNORMALITY",..: 2 1 2 1 1 1 2 1 1 1 ...
## $ slope : num 0 0 2 2 2 1 1 2 2 2 ...
## $ ca : num 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : num 1 2 2 2 2 1 2 3 3 2 ...
## $ age : num 63 37 41 56 57 57 56 44 52 57 ...
## $ trestbps: num 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : num 233 250 204 236 354 192 294 263 199 168 ...
## $ thalach : num 150 187 172 178 163 148 153 173 162 174 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
cols <- c("slope","ca","thal")
Heart <- data2 %>%
mutate_each_(funs(factor(.)),cols)
## Warning: mutate_each() is deprecated
## Please use mutate_if(), mutate_at(), or mutate_all() instead:
##
## - To map `funs` over all variables, use mutate_all()
## - To map `funs` over a selection of variables, use mutate_at()
## This warning is displayed once per session.
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
str(Heart)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 303 obs. of 14 variables:
## $ target : Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 2 2 2 ...
## $ sex : Factor w/ 2 levels "FEMALE","MALE": 2 2 1 2 1 2 1 2 2 2 ...
## $ fbs : Factor w/ 2 levels "<=120",">120": 2 1 1 1 1 1 1 1 2 1 ...
## $ exang : Factor w/ 2 levels "NO","YES": 1 1 1 1 2 1 1 1 1 1 ...
## $ cp : Factor w/ 3 levels "ASYMPTOMATIC",..: 1 3 2 2 1 1 2 2 3 3 ...
## $ restecg : Factor w/ 3 levels "ABNORMALITY",..: 2 1 2 1 1 1 2 1 1 1 ...
## $ slope : Factor w/ 3 levels "0","1","2": 1 1 3 3 3 2 2 3 3 3 ...
## $ ca : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ thal : Factor w/ 4 levels "0","1","2","3": 2 3 3 3 3 2 3 4 4 3 ...
## $ age : num 63 37 41 56 57 57 56 44 52 57 ...
## $ trestbps: num 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : num 233 250 204 236 354 192 294 263 199 168 ...
## $ thalach : num 150 187 172 178 163 148 153 173 162 174 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##Attribute Statistcs
Basic statistics about the data are obtained in the below table:
## target sex fbs exang cp
## NO :138 FEMALE: 96 <=120:258 NO :204 ASYMPTOMATIC :166
## YES:165 MALE :207 >120 : 45 YES: 99 ATYPICAL ANGINA : 50
## NON-ANGINAL PAIN: 87
##
##
##
## restecg slope ca thal age
## ABNORMALITY :152 0: 21 0:175 0: 2 Min. :29.00
## NORMAL :147 1:140 1: 65 1: 18 1st Qu.:47.50
## PROBABLE OR DEFINITE: 4 2:142 2: 38 2:166 Median :55.00
## 3: 20 3:117 Mean :54.37
## 4: 5 3rd Qu.:61.00
## Max. :77.00
## trestbps chol thalach oldpeak
## Min. : 94.0 Min. :126.0 Min. : 71.0 Min. :0.00
## 1st Qu.:120.0 1st Qu.:211.0 1st Qu.:133.5 1st Qu.:0.00
## Median :130.0 Median :240.0 Median :153.0 Median :0.80
## Mean :131.6 Mean :246.3 Mean :149.6 Mean :1.04
## 3rd Qu.:140.0 3rd Qu.:274.5 3rd Qu.:166.0 3rd Qu.:1.60
## Max. :200.0 Max. :564.0 Max. :202.0 Max. :6.20
From the summary, we can conclude there are no common issues with unclean data.
There are no “N/A” values and no negative values where one would not expect to see them.
The summary function in R would show those if they existed in the data.
45.54% no heart disease
Heart %>% ggplot(aes(age)) +
geom_histogram(fill= "lightblue",
color = 'blue',
binwidth = 1) +
labs(title= "Age Distribution") +
theme(plot.title = element_text(hjust = 0.5))
Heart %>% ggplot(aes(age)) +
geom_histogram(fill= "lightblue",
color = 'blue',
binwidth = 5) +
labs(title= "Age Distribution") +
theme(plot.title = element_text(hjust = 0.5))
Heart %>% ggplot(aes(age)) +
geom_histogram(aes(fill= target),
color = 'grey',
binwidth = 1) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
labs(title= "Age Distribution for Income")+
theme(plot.title = element_text(hjust = 0.5))
Heart %>%
ggplot(aes(age,
fill= target)) +
geom_density(alpha= 0.7, color = 'blue') +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
labs(x = "Age", y = "Density", title = "Density graph of age distribution")
GENDER
library(ggpubr)
## Loading required package: magrittr
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
gender_prop <- Heart %>%
group_by(sex) %>%
summarise(count = n()) %>%
ungroup()%>%
arrange(desc(sex)) %>%
mutate(percentage = round(count/sum(count),4)*100,
lab.pos = cumsum(percentage)-0.5*percentage)
gender_distr <- ggplot(data = gender_prop,
aes(x = "",
y = percentage,
fill = sex))+
geom_bar(stat = "identity")+
coord_polar("y") +
geom_text(aes(y = lab.pos,
label = paste(percentage,"%", sep = "")), col = "blue", size = 4) +
scale_fill_manual(values=c("orange", "lightblue"),
name = "Gender") +
theme_void() +
theme(legend.title = element_text(color = "black", size = 12),
legend.text = element_text(color = "black", size = 12))
gender_prop <- Heart %>%
group_by(sex, target) %>%
summarize(n = n()) %>%
mutate(pct = n*100/sum(n)) %>%
ggplot(aes(x = reorder(sex, n),
y = pct,
fill = target)) +
geom_bar(stat = "identity", width = 0.6) +
scale_x_discrete(name = "") +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct,0),"%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "black") +
theme(axis.text.y = element_blank(),
axis.text.x = element_text(color = "black", size = 12),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
legend.title = element_text(color = "black", size = 12),
legend.text = element_text(color = "black", size = 12))
ggarrange(gender_distr, gender_prop, nrow = 1)
31.68 % of people are male 68.32 % are female
75% of males had heart disease. 45% of female had heart disease.
#Distribution of Male and Female population across Age parameter
Heart %>%
ggplot(aes(x=age,fill=sex))+
geom_histogram()+
xlab("Age") +
ylab("Number")+
scale_fill_manual(values=c("orange", "lightblue"),
name = "Gender")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Chest pain type (cp)
cp_distr <- Heart %>%
group_by(cp) %>%
summarise(counts = n()) %>%
mutate(Percentage = round(counts*100/sum(counts),2)) %>%
arrange(desc(counts)) %>%
ggplot(aes(x= reorder(cp, counts),
y = counts)) +
geom_bar(stat = "identity",
width = 0.6,
fill = "steelblue") +
geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")),
vjust = 0.5,
hjust = -0.5,
color = "darkblue",
size = 4) +
scale_y_continuous(limits = c(0,200)) +
theme_minimal() +
labs(x = "Chest pain types",y = "Frequency") +
coord_flip()
cp_prop <- Heart %>%
group_by(cp, target) %>%
summarize(n = n()) %>%
mutate(pct = n*100/sum(n)) %>%
ggplot(aes(x = reorder(cp, n),
y = pct/100,
fill = target)) +
geom_bar(stat = "identity", width = 0.6) +
scale_x_discrete(name = "") +
scale_y_continuous(name= "Percentage",
labels = percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct,0),"%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "black") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y=element_blank()) +
coord_flip()
ggarrange(cp_distr, cp_prop, nrow = 1)
Fasting blood sugar (fbs)
fbs_distr <- Heart %>%
group_by(fbs) %>%
dplyr::summarise(counts = n()) %>%
mutate(Percentage = round(counts*100/sum(counts),2)) %>%
arrange(desc(counts)) %>%
ggplot(aes(x= reorder(fbs, counts),
y = counts)) +
geom_bar(stat = "identity",
width = 0.6,
fill = "steelblue") +
geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")),
vjust = 0.5,
hjust = -0.5,
color = "darkblue",
size = 4) +
scale_y_continuous(limits = c(0,350)) +
theme_minimal() +
labs(x = "Chest pain types",y = "Frequency") +
coord_flip()
fbs_prop <- Heart %>%
group_by(fbs, target) %>%
dplyr::summarize(n = n()) %>%
mutate(pct = n*100/sum(n)) %>%
ggplot(aes(x = reorder(fbs, n),
y = pct/100,
fill = target)) +
geom_bar(stat = "identity", width = 0.6) +
scale_x_discrete(name = "") +
scale_y_continuous(name= "Percentage",
labels = percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct,0),"%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "black") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y=element_blank()) +
coord_flip()
ggarrange(fbs_distr,fbs_prop, nrow = 1)
It seems that there is a slight difference in percentage of pp having heart disease for two groups (fasting blood sugar)
chisq.test(Heart$target, Heart$fbs)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Heart$target and Heart$fbs
## X-squared = 0.10627, df = 1, p-value = 0.7444
The p = 0.744 > 0.05. There is no relationship between fast blood sugar and heart disease for this data.
restecg resting electrocardiographic results (values 0,1,2, categorical)
restecg_distr <- Heart %>%
group_by(restecg) %>%
dplyr::summarise(counts = n()) %>%
mutate(Percentage = round(counts*100/sum(counts),2)) %>%
arrange(desc(counts)) %>%
ggplot(aes(x= restecg,
y = counts)) +
geom_bar(stat = "identity",
width = 0.6,
fill = "steelblue") +
geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")),
vjust = 0.5,
hjust = -0.5,
color = "darkblue",
size = 4) +
scale_y_continuous(limits = c(0,250)) +
theme_minimal() +
labs(x = "resting electrocardiographic results",y = "Frequency") +
coord_flip()
restecg_prop <- Heart %>%
group_by(restecg, target) %>%
dplyr::summarize(n = n()) %>%
mutate(pct = n*100/sum(n)) %>%
ggplot(aes(x = restecg,
y = pct/100,
fill = target)) +
geom_bar(stat = "identity", width = 0.6) +
scale_x_discrete(name = "") +
scale_y_continuous(name= "Percentage",
labels = percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct,0),"%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "black") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y=element_blank()) +
coord_flip()
ggarrange(restecg_distr,restecg_prop, nrow = 1)
chisq.test(data$target, Heart$restecg)
## Warning in chisq.test(data$target, Heart$restecg): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: data$target and Heart$restecg
## X-squared = 10.023, df = 2, p-value = 0.006661
exang exercise induced angina (categorical, 1=angina 0 = no agina)
Angina (pronounced ANN-juh-nuh or ann-JIE-nuh) is pain in the chest that comes on with exercise, stress, or other things that make the heart work harder.
exang_distr <- Heart %>%
group_by(exang) %>%
dplyr::summarise(counts = n()) %>%
mutate(Percentage = round(counts*100/sum(counts),2)) %>%
arrange(desc(counts)) %>%
ggplot(aes(x= exang,
y = counts)) +
geom_bar(stat = "identity",
width = 0.6,
fill = "steelblue") +
geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")),
vjust = 0.5,
hjust = -0.5,
color = "darkblue",
size = 4) +
scale_y_continuous(limits = c(0,300)) +
theme_minimal() +
labs(x = "Exercise induced anginas",y = "Frequency") +
coord_flip()
exang_prop <- Heart %>%
group_by(exang, target) %>%
dplyr::summarize(n = n()) %>%
mutate(pct = n*100/sum(n)) %>%
ggplot(aes(x = exang,
y = pct/100,
fill = target)) +
geom_bar(stat = "identity", width = 0.6) +
scale_x_discrete(name = "") +
scale_y_continuous(name= "Percentage",
labels = percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct,0),"%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "black") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y=element_blank()) +
coord_flip()
ggarrange(exang_distr,exang_prop, nrow = 1)
ca number of major vessels (0-3) colored by flourosopy (factor)
ca_distr <- Heart %>%
group_by(ca) %>%
dplyr::summarise(counts = n()) %>%
mutate(Percentage = round(counts*100/sum(counts),2)) %>%
arrange(desc(counts)) %>%
ggplot(aes(x= ca,
y = counts)) +
geom_bar(stat = "identity",
width = 0.6,
fill = "steelblue") +
geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")),
vjust = 0.5,
hjust = -0.5,
color = "darkblue",
size = 4) +
scale_y_continuous(limits = c(0,300)) +
theme_minimal() +
labs(x = "Number of major vessels colored by fluoroscopy",y = "Frequency") +
coord_flip()
ca_prop <- Heart %>%
group_by(ca, target) %>%
dplyr::summarize(n = n()) %>%
mutate(pct = n*100/sum(n)) %>%
ggplot(aes(x = ca,
y = pct/100,
fill = target)) +
geom_bar(stat = "identity", width = 0.6) +
scale_x_discrete(name = "") +
scale_y_continuous(name= "Percentage",
labels = percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct,0),"%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "black") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y=element_blank()) +
coord_flip()
ggarrange(ca_distr,ca_prop, nrow = 1)
thal
thal_distr <- Heart %>%
group_by(thal ) %>%
dplyr::summarise(counts = n()) %>%
mutate(Percentage = round(counts*100/sum(counts),2)) %>%
arrange(desc(counts)) %>%
ggplot(aes(x= thal ,
y = counts)) +
geom_bar(stat = "identity",
width = 0.6,
fill = "steelblue") +
geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")),
vjust = 0.5,
hjust = -0.5,
color = "darkblue",
size = 4) +
scale_y_continuous(limits = c(0,300)) +
theme_minimal() +
labs(x = "Thalium stress test result",y = "Frequency") +
coord_flip()
thal_prop <- Heart %>%
group_by(thal , target) %>%
dplyr::summarize(n = n()) %>%
mutate(pct = n*100/sum(n)) %>%
ggplot(aes(x = thal ,
y = pct/100,
fill = target)) +
geom_bar(stat = "identity", width = 0.6) +
scale_x_discrete(name = "") +
scale_y_continuous(name= "Percentage",
labels = percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct,0),"%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "black") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y=element_blank()) +
coord_flip()
ggarrange(thal_distr,thal_prop, nrow = 1)
slope Slope of peak exercise ST segment (upsloping, flat, or downsloping
slope_distr <- Heart %>%
group_by(slope) %>%
dplyr::summarise(counts = n()) %>%
mutate(Percentage = round(counts*100/sum(counts),2)) %>%
arrange(desc(counts)) %>%
ggplot(aes(x= slope,
y = counts)) +
geom_bar(stat = "identity",
width = 0.4,
fill = "steelblue") +
geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")),
vjust = 0.5,
hjust = -0.5,
color = "darkblue",
size = 4) +
scale_y_continuous(limits = c(0,200)) +
theme_minimal() +
labs(x = "Slope of peak exercise ST segment",y = "Frequency") +
coord_flip()
slope_prop <- Heart %>%
group_by(slope, target) %>%
dplyr::summarize(n = n()) %>%
mutate(pct = n*100/sum(n)) %>%
ggplot(aes(x = slope,
y = pct/100,
fill = target)) +
geom_bar(stat = "identity", width = 0.6) +
scale_x_discrete(name = "") +
scale_y_continuous(name= "Percentage",
labels = percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct,0),"%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "black") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y=element_blank()) +
coord_flip()
ggarrange(slope_distr,slope_prop, nrow = 1)
Numerical variable trestbps Resting blood pressure (mm Hg)
Heart %>% ggplot(aes(trestbps)) +
geom_histogram(fill= "lightblue",
color = 'blue',
binwidth = 1) +
labs(title= "Resting blood pressure") +
theme(plot.title = element_text(hjust = 0.5))
Heart %>%
ggplot(aes(trestbps,
fill= target)) +
geom_density(alpha= 0.7, color = 'blue') +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
labs(x = "Age", y = "Density", title = "Density graph of trestbps distribution")
chol Serum cholesterol (mg/dl)
Heart %>% ggplot(aes(chol)) +
geom_histogram(fill= "lightblue",
color = 'blue',
binwidth = 3) +
labs(title= "Serum cholesterol (mg/dl)") +
theme(plot.title = element_text(hjust = 0.5))
Heart$chol2 <- cut(Heart$chol, breaks = c(100,200,400,600),
labels = c("100-200","200-400","400-600"), include.lowest = TRUE)
head(Heart$chol2)
## [1] 200-400 200-400 200-400 200-400 200-400 100-200
## Levels: 100-200 200-400 400-600
#######Plotting Age(groups) vs Attrition
p1_chol2 <- ggplot(Heart,aes(x = chol2,fill = target)) +
geom_bar(position = "dodge") +
ggtitle("chol2(group) vs target - count")
p2_chole2 <- ggplot(Heart,aes(x = chol2,fill = target)) +
geom_bar(position = "fill") +
ggtitle("chol2(groups) vs target - proportion")
ggarrange(p1_chol2, p2_chole2, ncol=1)
All aged 19 and younger : At most 170 mg/dL
Females aged 20 and older 125–200 mg/dL
Males aged 20 and older
In the data, the youngest are 29 years old.
There are some people whose the serum cholesterol are more than 200 mg/dl
How many people are healthy
Representation of Cholestoral level with age and gender
Heart %>%
ggplot(aes(x = age,
y = chol))+
geom_point(alpha=0.7) +
geom_smooth(method='lm')+
xlab("Age") +
ylab("Serum cholesterol")
## `geom_smooth()` using formula 'y ~ x'
Heart %>%
ggplot(aes(x = age,
y = chol,
color = sex,
size = chol))+
geom_point(alpha=0.7) +
xlab("Age") +
ylab("Serum cholesterol") +
scale_fill_manual(values=c("orange", "lightblue"),
name = "Gender")
Comparison of Blood pressure across pain type
Heart %>%
ggplot(aes(x=sex,
y=trestbps, colour = sex))+
geom_boxplot()+
xlab("Sex")+
ylab("BP")+
facet_grid(~cp)
Heart %>%
ggplot(aes(x=sex,
y=chol))+
geom_boxplot(fill="#D55E00")+
xlab("Sex")+
ylab("Chol")+
facet_grid(~cp)
Heart %>%
ggplot(aes(chol,
fill= target)) +
geom_density(alpha= 0.7, color = 'blue') +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
labs(x = "Serum cholesterol (mg/dl)", y = "Density", title = "Density graph of chol distribution")
Heart %>%
filter(chol >= 125 & chol <= 200) %>%
summarise(count = n())
## # A tibble: 1 x 1
## count
## <int>
## 1 51
Heart %>%
filter(chol >= 125 & chol <= 200) %>%
group_by(target) %>%
summarise(count = n())
## # A tibble: 2 x 2
## target count
## <fct> <int>
## 1 NO 21
## 2 YES 30
about 60% of them having heart disease
Heart %>%
filter(chol >= 400) %>%
group_by(target) %>%
summarise(count = n())
## # A tibble: 2 x 2
## target count
## <fct> <int>
## 1 NO 2
## 2 YES 2
thalach
Heart %>% ggplot(aes(thalach)) +
geom_histogram(fill= "lightblue",
color = 'blue',
binwidth = 1) +
labs(title= "Max. heart rate achieved during thalium stress test") +
theme(plot.title = element_text(hjust = 0.5))
Heart %>%
ggplot(aes(thalach,
fill= target)) +
geom_density(alpha= 0.7, color = 'blue') +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
labs(x = "Max. heart rate achieved during thalium stress test", y = "Density", title = "Density graph of thalach distribution")
oldpeak ST depression induced by exercise relative to rest
Heart %>% ggplot(aes(oldpeak)) +
geom_histogram(fill= "lightblue",
color = 'blue',
binwidth = 0.1) +
labs(title= "ST depression induced by exercise relative to rest") +
theme(plot.title = element_text(hjust = 0.5))
Heart %>%
ggplot(aes(oldpeak,
fill= target)) +
geom_density(alpha= 0.7, color = 'blue') +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
labs(x = "ST depression induced by exercise relative to rest", y = "Density", title = "Density graph of oldpeak distribution")
CORRELATION
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
cor_heart <- cor(data2[,10:14])
cor_heart
## age trestbps chol thalach oldpeak
## age 1.0000000 0.27935091 0.213677957 -0.398521938 0.21001257
## trestbps 0.2793509 1.00000000 0.123174207 -0.046697728 0.19321647
## chol 0.2136780 0.12317421 1.000000000 -0.009939839 0.05395192
## thalach -0.3985219 -0.04669773 -0.009939839 1.000000000 -0.34418695
## oldpeak 0.2100126 0.19321647 0.053951920 -0.344186948 1.00000000
corrplot(cor_heart, method = "ellipse", type="upper",)
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.6.3
ggcorrplot(cor_heart,lab = T)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:pander':
##
## wrap
## The following object is masked from 'package:dplyr':
##
## nasa
ggcorr(cor_heart, label = T, label_round = 2)
heartcor <- cor(cor_heart) #the correlation graph is saved to `heartcor`
corrplot(heartcor, method = "pie", type = "lower")
p1 <- ggplot(Heart,
aes(x = age, y = trestbps, color = target, shape = target)) +
geom_point(size = 2) + geom_smooth(se = FALSE)
p2 <- ggplot(Heart, aes(x = age, y = chol, color = target, shape = target)) +
geom_point() +
geom_smooth(se = FALSE)
p3 <- ggplot(Heart, aes(x = age, y = thalach, color = target, shape = target))+
geom_point()+
geom_smooth(se = FALSE)
p4 <- ggplot(data2, aes(x = age, y = oldpeak, color = target, shape = target))+
geom_point()+
geom_smooth(se = FALSE)
ggarrange(p1,p2,p3,p4, ncol = 2, nrow= 2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
K-means
library(funModeling)
## Warning: package 'funModeling' was built under R version 3.6.3
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
##
## Attaching package: 'funModeling'
## The following object is masked from 'package:GGally':
##
## range01
uns_df <- scale(data2[,10:14])
head(as_tibble(uns_df))
## # A tibble: 6 x 5
## age trestbps chol thalach oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.951 0.763 -0.256 0.0154 1.09
## 2 -1.91 -0.0926 0.0721 1.63 2.12
## 3 -1.47 -0.0926 -0.815 0.976 0.310
## 4 0.180 -0.663 -0.198 1.24 -0.206
## 5 0.290 -0.663 2.08 0.583 -0.379
## 6 0.290 0.478 -1.05 -0.0719 -0.551
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(uns_df)
head(distance)
## [1] 3.565404 2.904427 2.406805 3.222464 1.956767 1.679101
fviz_dist(distance, gradient = list(low = "blue", mid = "white", high = "red"))
k2 <- kmeans(uns_df,
center = 2,
nstart = 25 )
str(k2)
## List of 9
## $ cluster : int [1:303] 1 2 2 2 2 2 1 2 2 2 ...
## $ centers : num [1:2, 1:5] 0.711 -0.542 0.441 -0.336 0.317 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:5] "age" "trestbps" "chol" "thalach" ...
## $ totss : num 1510
## $ withinss : num [1:2] 623 512
## $ tot.withinss: num 1136
## $ betweenss : num 374
## $ size : int [1:2] 131 172
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = uns_df)
k3 <- kmeans(uns_df, centers = 3, nstart = 25)
k4 <- kmeans(uns_df, centers = 4, nstart = 25)
k5 <- kmeans(uns_df, centers = 5, nstart = 25)
pKM1 <- fviz_cluster(k2, geom = "point", data = uns_df)+
ggtitle("k = 2")
pKM2 <- fviz_cluster(k3, geom = "point", data = uns_df)+
ggtitle("k = 3")
pKM3 <- fviz_cluster(k4, geom = "point", data = uns_df)+
ggtitle("k = 4")
pKM4 <- fviz_cluster(k5, geom = "point", data = uns_df)+
ggtitle("k = 5")
ggarrange(pKM1,pKM2,pKM3,pKM4, ncol = 2, nrow = 2)
8.1. Optimum Cluster Number
set.seed(123)
fviz_nbclust(uns_df, kmeans, method = "wss")
# Average Silouette Method
fviz_nbclust(uns_df, kmeans, method = "silhouette")
library(cluster)
## Warning: package 'cluster' was built under R version 3.6.3
# Gap Statistics
set.seed(123)
gap_stat <- clusGap(uns_df, FUN = kmeans, nstart = 25, K.max = 10, B = 50)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = uns_df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 2
## logW E.logW gap SE.sim
## [1,] 5.410459 5.963889 0.5534297 0.01063475
## [2,] 5.262299 5.836118 0.5738189 0.01081094
## [3,] 5.187107 5.742668 0.5555616 0.01139483
## [4,] 5.131819 5.665213 0.5333938 0.01186413
## [5,] 5.079557 5.605857 0.5263005 0.01183466
## [6,] 5.033868 5.553238 0.5193703 0.01176835
## [7,] 4.994771 5.511709 0.5169386 0.01201947
## [8,] 4.959897 5.476290 0.5163929 0.01258630
## [9,] 4.925016 5.444976 0.5199599 0.01242377
## [10,] 4.904663 5.416427 0.5117637 0.01250435
fviz_gap_stat(gap_stat)
set.seed(123)
final <- kmeans(uns_df, 2, nstart = 25)
final
## K-means clustering with 2 clusters of sizes 172, 131
##
## Cluster means:
## age trestbps chol thalach oldpeak
## 1 -0.5415776 -0.3355766 -0.2416645 0.5202796 -0.4537289
## 2 0.7110790 0.4406044 0.3173000 -0.6831152 0.5957356
##
## Clustering vector:
## [1] 2 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1
## [38] 1 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 1 2 2 1 1 1 1 2 1 1 1 2 2 1 1 1 2
## [112] 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 1
## [149] 1 1 2 2 2 2 1 2 1 1 1 1 1 2 1 1 1 2 2 2 2 2 1 1 2 2 2 1 1 2 1 2 2 2 1 1 2
## [186] 1 2 2 1 1 2 2 2 2 2 2 2 1 2 1 1 2 2 2 2 1 1 2 1 1 1 2 1 2 2 2 2 2 2 1 2 2
## [223] 2 2 2 2 2 1 2 2 1 2 2 2 2 1 1 2 2 1 2 2 2 2 2 1 2 2 1 2 2 1 2 2 2 1 2 2 2
## [260] 1 2 1 2 1 1 2 2 1 2 2 1 2 2 1 1 1 2 1 2 2 2 1 2 1 2 2 1 1 2 2 1 2 2 2 1 2
## [297] 1 2 2 1 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 512.4914 623.1961
## (between_SS / total_SS = 24.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Descriptive Statistics for Clusters
Heart[,10:14] %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 2 x 6
## Cluster age trestbps chol thalach oldpeak
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 49.4 126. 234. 162. 0.513
## 2 2 60.8 139. 263. 134 1.73
MACHINE LEARNING
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
train_indeks <- createDataPartition(Heart$target, p = 0.7, list = FALSE, times = 1)
train <- Heart[train_indeks,]
test <- Heart[-train_indeks,]
train_x <- train %>% dplyr::select(-target)
train_y <- train$target
test_x <- test %>% dplyr::select(-target)
test_y <- test$target
training <- data.frame(train_x, target = train_y)
model_glm <- glm(target~.,
data = training,
family = "binomial")
model_glm
##
## Call: glm(formula = target ~ ., family = "binomial", data = training)
##
## Coefficients:
## (Intercept) sexMALE
## -1.44693 -1.91424
## fbs>120 exangYES
## 0.92358 -0.65496
## cpATYPICAL ANGINA cpNON-ANGINAL PAIN
## -0.13140 1.45518
## restecgNORMAL restecgPROBABLE OR DEFINITE
## -0.53504 -0.92066
## slope1 slope2
## -0.04961 1.63488
## ca1 ca2
## -2.47021 -3.31721
## ca3 ca4
## -3.40393 -1.21654
## thal1 thal2
## 4.06481 3.85934
## thal3 age
## 1.89929 0.02966
## trestbps chol
## -0.01930 -0.01028
## thalach oldpeak
## 0.02033 -0.61554
## chol2200-400 chol2400-600
## 1.64309 3.51099
##
## Degrees of Freedom: 212 Total (i.e. Null); 189 Residual
## Null Deviance: 293.6
## Residual Deviance: 121.6 AIC: 169.6
summary(model_glm)
##
## Call:
## glm(formula = target ~ ., family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.90153 -0.26747 0.07939 0.41668 2.92331
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.446930 3.799194 -0.381 0.703313
## sexMALE -1.914244 0.709138 -2.699 0.006947 **
## fbs>120 0.923576 0.780387 1.183 0.236617
## exangYES -0.654962 0.563591 -1.162 0.245185
## cpATYPICAL ANGINA -0.131397 0.758416 -0.173 0.862454
## cpNON-ANGINAL PAIN 1.455178 0.606339 2.400 0.016398 *
## restecgNORMAL -0.535036 0.488536 -1.095 0.273437
## restecgPROBABLE OR DEFINITE -0.920665 2.385379 -0.386 0.699525
## slope1 -0.049611 1.032670 -0.048 0.961683
## slope2 1.634875 1.171889 1.395 0.162992
## ca1 -2.470214 0.666300 -3.707 0.000209 ***
## ca2 -3.317206 0.989400 -3.353 0.000800 ***
## ca3 -3.403930 1.311543 -2.595 0.009449 **
## ca4 -1.216536 2.551628 -0.477 0.633527
## thal1 4.064807 2.436510 1.668 0.095258 .
## thal2 3.859338 2.331877 1.655 0.097917 .
## thal3 1.899289 2.302151 0.825 0.409368
## age 0.029664 0.029341 1.011 0.312021
## trestbps -0.019299 0.015063 -1.281 0.200120
## chol -0.010284 0.007555 -1.361 0.173460
## thalach 0.020327 0.013631 1.491 0.135918
## oldpeak -0.615541 0.321895 -1.912 0.055845 .
## chol2200-400 1.643086 0.916214 1.793 0.072918 .
## chol2400-600 3.510991 7.269844 0.483 0.629129
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 293.58 on 212 degrees of freedom
## Residual deviance: 121.65 on 189 degrees of freedom
## AIC: 169.65
##
## Number of Fisher Scoring iterations: 6
10.2. Random Forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf_fit <- randomForest(train_x, train_y, importance = TRUE)
rf_fit
##
## Call:
## randomForest(x = train_x, y = train_y, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 18.31%
## Confusion matrix:
## NO YES class.error
## NO 76 21 0.2164948
## YES 18 98 0.1551724
importance(rf_fit)
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## sex 2.407462 11.3684400 10.6118227 3.4329081
## fbs -1.633384 1.4594114 -0.1214795 0.7720273
## exang 5.197280 4.3947063 6.7215988 4.9442285
## cp 11.274378 4.9598032 11.5600559 7.5094053
## restecg 1.555085 -0.6593266 0.6736534 1.7695557
## slope 9.930101 3.7652346 9.6312023 6.4720831
## ca 11.961786 18.4525033 20.0110037 12.4436797
## thal 12.801312 18.5404195 21.9523963 15.2866274
## age 2.891410 7.5518669 7.5258026 8.9835041
## trestbps -1.086904 3.2221793 1.8022123 7.1038118
## chol -2.005635 0.7519465 -0.8313648 8.0635006
## thalach 4.785756 7.2135341 9.1349684 12.2929163
## oldpeak 14.138700 9.6829965 15.8083655 14.2598394
## chol2 1.246250 -2.1411801 -0.4668505 0.8338564
varImpPlot(rf_fit)
confusionMatrix(predict(rf_fit, test_x), test_y, positive = "YES")
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 33 8
## YES 8 41
##
## Accuracy : 0.8222
## 95% CI : (0.7274, 0.8948)
## No Information Rate : 0.5444
## P-Value [Acc > NIR] : 2.84e-08
##
## Kappa : 0.6416
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8367
## Specificity : 0.8049
## Pos Pred Value : 0.8367
## Neg Pred Value : 0.8049
## Prevalence : 0.5444
## Detection Rate : 0.4556
## Detection Prevalence : 0.5444
## Balanced Accuracy : 0.8208
##
## 'Positive' Class : YES
##
# Model Tuning
control <- trainControl(method='cv',
number=10,
search='grid')
tunegrid <- expand.grid(mtry = (1:10))
rf_gridsearch <- train(target ~ .,
data = train,
method = 'rf',
metric = 'Accuracy',
tuneGrid = tunegrid)
rf_gridsearch
## Random Forest
##
## 213 samples
## 14 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 213, 213, 213, 213, 213, 213, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.8078647 0.6095593
## 2 0.8052842 0.6057955
## 3 0.7957809 0.5875444
## 4 0.7892162 0.5742515
## 5 0.7869512 0.5699819
## 6 0.7810923 0.5582653
## 7 0.7826607 0.5610224
## 8 0.7773592 0.5500318
## 9 0.7766169 0.5490764
## 10 0.7762688 0.5480922
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
plot(rf_gridsearch)
confusionMatrix(predict(rf_gridsearch, test_x), test_y, positive = "YES")
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 30 6
## YES 11 43
##
## Accuracy : 0.8111
## 95% CI : (0.7149, 0.8859)
## No Information Rate : 0.5444
## P-Value [Acc > NIR] : 1.061e-07
##
## Kappa : 0.6154
##
## Mcnemar's Test P-Value : 0.332
##
## Sensitivity : 0.8776
## Specificity : 0.7317
## Pos Pred Value : 0.7963
## Neg Pred Value : 0.8333
## Prevalence : 0.5444
## Detection Rate : 0.4778
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.8046
##
## 'Positive' Class : YES
##
rf_cm <- confusionMatrix(predict(rf_gridsearch, test_x), test_y, positive = "YES")
rf_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 30 6
## YES 11 43
##
## Accuracy : 0.8111
## 95% CI : (0.7149, 0.8859)
## No Information Rate : 0.5444
## P-Value [Acc > NIR] : 1.061e-07
##
## Kappa : 0.6154
##
## Mcnemar's Test P-Value : 0.332
##
## Sensitivity : 0.8776
## Specificity : 0.7317
## Pos Pred Value : 0.7963
## Neg Pred Value : 0.8333
## Prevalence : 0.5444
## Detection Rate : 0.4778
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.8046
##
## 'Positive' Class : YES
##