#Data Description (Heartdata) & Preprocessing

https://archive.ics.uci.edu/ml/datasets/Heart+Disease

data <- read_csv("heart.csv")
## Parsed with column specification:
## cols(
##   age = col_double(),
##   sex = col_double(),
##   cp = col_double(),
##   trestbps = col_double(),
##   chol = col_double(),
##   fbs = col_double(),
##   restecg = col_double(),
##   thalach = col_double(),
##   exang = col_double(),
##   oldpeak = col_double(),
##   slope = col_double(),
##   ca = col_double(),
##   thal = col_double(),
##   target = col_double()
## )
str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 303 obs. of  14 variables:
##  $ age     : num  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : num  1 1 0 1 0 1 0 1 1 1 ...
##  $ cp      : num  3 2 1 1 0 0 1 1 2 2 ...
##  $ trestbps: num  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : num  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg : num  0 1 0 1 1 1 0 1 1 1 ...
##  $ thalach : num  150 187 172 178 163 148 153 173 162 174 ...
##  $ exang   : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope   : num  0 0 2 2 2 1 1 2 2 2 ...
##  $ ca      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal    : num  1 2 2 2 2 1 2 3 3 2 ...
##  $ target  : num  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   cp = col_double(),
##   ..   trestbps = col_double(),
##   ..   chol = col_double(),
##   ..   fbs = col_double(),
##   ..   restecg = col_double(),
##   ..   thalach = col_double(),
##   ..   exang = col_double(),
##   ..   oldpeak = col_double(),
##   ..   slope = col_double(),
##   ..   ca = col_double(),
##   ..   thal = col_double(),
##   ..   target = col_double()
##   .. )

The dataset consists of 14 physological patient attributes as follows:

Attribute Information

age: age (continuous)

sex: gender (categorical, 0=male, 1=female)

cp: chest pain type (4 values, ordinal/categorical))

trestbps resting blood pressure (continuous)

chol: serum cholestoral in mg/dl (continuous)

fbs: fasting blood sugar > 120 mg/dl (categorical is >120 =1 or <120 =0)

restecg: resting electrocardiographic results (values 0,1,2, categorical)

thalack: maximum heart rate achieved (continuous)

exang: exercise induced angina (categorical, 1=angina 0 = no agina)

oldpeak: ST depression induced by exercise relative to rest (continuous)

slope: the slope of the peak exercise ST segment (continuous)

ca : number of major vessels (0-3) colored by flourosopy (factor)

thal 3 = normal; 6 = fixed defect; 7 = reversable defect (categorical)

target 1= heart disease present; 0 = no heart disease (categorical, dependant variable)

plot_missing(data)

Data engineering

data2 <- data %>% 
  mutate(sex = if_else(sex == 1, "MALE", "FEMALE"),
         fbs = if_else(fbs == 1, ">120", "<=120"),
         exang = if_else(exang == 1, "YES" ,"NO"),
         cp = if_else(cp == 1, "ATYPICAL ANGINA",
                      if_else(cp == 2, "NON-ANGINAL PAIN", "ASYMPTOMATIC")),
         restecg = if_else(restecg == 0, "NORMAL",
                           if_else(restecg == 1, "ABNORMALITY", "PROBABLE OR DEFINITE")),
         target = if_else(target == 1, "YES", "NO")
         ) %>% 
  mutate_if(is.character, as.factor) %>% 
  dplyr::select(target, sex, fbs, exang, cp, restecg, slope, ca, thal, everything())
str(data2)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 303 obs. of  14 variables:
##  $ target  : Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 2 2 2 ...
##  $ sex     : Factor w/ 2 levels "FEMALE","MALE": 2 2 1 2 1 2 1 2 2 2 ...
##  $ fbs     : Factor w/ 2 levels "<=120",">120": 2 1 1 1 1 1 1 1 2 1 ...
##  $ exang   : Factor w/ 2 levels "NO","YES": 1 1 1 1 2 1 1 1 1 1 ...
##  $ cp      : Factor w/ 3 levels "ASYMPTOMATIC",..: 1 3 2 2 1 1 2 2 3 3 ...
##  $ restecg : Factor w/ 3 levels "ABNORMALITY",..: 2 1 2 1 1 1 2 1 1 1 ...
##  $ slope   : num  0 0 2 2 2 1 1 2 2 2 ...
##  $ ca      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal    : num  1 2 2 2 2 1 2 3 3 2 ...
##  $ age     : num  63 37 41 56 57 57 56 44 52 57 ...
##  $ trestbps: num  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : num  233 250 204 236 354 192 294 263 199 168 ...
##  $ thalach : num  150 187 172 178 163 148 153 173 162 174 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
cols <- c("slope","ca","thal")

Heart <- data2 %>% 
  mutate_each_(funs(factor(.)),cols)
## Warning: mutate_each() is deprecated
## Please use mutate_if(), mutate_at(), or mutate_all() instead: 
## 
##   - To map `funs` over all variables, use mutate_all()
##   - To map `funs` over a selection of variables, use mutate_at()
## This warning is displayed once per session.
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
str(Heart)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 303 obs. of  14 variables:
##  $ target  : Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 2 2 2 ...
##  $ sex     : Factor w/ 2 levels "FEMALE","MALE": 2 2 1 2 1 2 1 2 2 2 ...
##  $ fbs     : Factor w/ 2 levels "<=120",">120": 2 1 1 1 1 1 1 1 2 1 ...
##  $ exang   : Factor w/ 2 levels "NO","YES": 1 1 1 1 2 1 1 1 1 1 ...
##  $ cp      : Factor w/ 3 levels "ASYMPTOMATIC",..: 1 3 2 2 1 1 2 2 3 3 ...
##  $ restecg : Factor w/ 3 levels "ABNORMALITY",..: 2 1 2 1 1 1 2 1 1 1 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 1 1 3 3 3 2 2 3 3 3 ...
##  $ ca      : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 2 3 3 3 3 2 3 4 4 3 ...
##  $ age     : num  63 37 41 56 57 57 56 44 52 57 ...
##  $ trestbps: num  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : num  233 250 204 236 354 192 294 263 199 168 ...
##  $ thalach : num  150 187 172 178 163 148 153 173 162 174 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...

##Attribute Statistcs

Basic statistics about the data are obtained in the below table:

##  target        sex         fbs      exang                    cp     
##  NO :138   FEMALE: 96   <=120:258   NO :204   ASYMPTOMATIC    :166  
##  YES:165   MALE  :207   >120 : 45   YES: 99   ATYPICAL ANGINA : 50  
##                                               NON-ANGINAL PAIN: 87  
##                                                                     
##                                                                     
##                                                                     
##                  restecg    slope   ca      thal         age       
##  ABNORMALITY         :152   0: 21   0:175   0:  2   Min.   :29.00  
##  NORMAL              :147   1:140   1: 65   1: 18   1st Qu.:47.50  
##  PROBABLE OR DEFINITE:  4   2:142   2: 38   2:166   Median :55.00  
##                                     3: 20   3:117   Mean   :54.37  
##                                     4:  5           3rd Qu.:61.00  
##                                                     Max.   :77.00  
##     trestbps          chol          thalach         oldpeak    
##  Min.   : 94.0   Min.   :126.0   Min.   : 71.0   Min.   :0.00  
##  1st Qu.:120.0   1st Qu.:211.0   1st Qu.:133.5   1st Qu.:0.00  
##  Median :130.0   Median :240.0   Median :153.0   Median :0.80  
##  Mean   :131.6   Mean   :246.3   Mean   :149.6   Mean   :1.04  
##  3rd Qu.:140.0   3rd Qu.:274.5   3rd Qu.:166.0   3rd Qu.:1.60  
##  Max.   :200.0   Max.   :564.0   Max.   :202.0   Max.   :6.20

From the summary, we can conclude there are no common issues with unclean data.

There are no “N/A” values and no negative values where one would not expect to see them.

The summary function in R would show those if they existed in the data.

Visual exploration of Categorical Variables

45.54% no heart disease

Heart %>% ggplot(aes(age)) + 
  geom_histogram(fill= "lightblue",
                 color = 'blue',
                 binwidth = 1) +   
  labs(title= "Age Distribution") +
  theme(plot.title = element_text(hjust = 0.5))

Heart %>% ggplot(aes(age)) + 
  geom_histogram(fill= "lightblue",
                 color = 'blue',
                 binwidth = 5) +   
  labs(title= "Age Distribution") +
  theme(plot.title = element_text(hjust = 0.5))

Heart %>% ggplot(aes(age)) + 
  geom_histogram(aes(fill= target),
                 color = 'grey',
                 binwidth = 1) +   
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  labs(title= "Age Distribution for Income")+
  theme(plot.title = element_text(hjust = 0.5))

Heart %>% 
  ggplot(aes(age, 
             fill= target)) +
  geom_density(alpha= 0.7, color = 'blue') +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  labs(x = "Age", y = "Density", title = "Density graph of age distribution")

GENDER

library(ggpubr)
## Loading required package: magrittr
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
gender_prop <- Heart %>% 
  group_by(sex) %>% 
  summarise(count = n()) %>% 
  ungroup()%>% 
  arrange(desc(sex)) %>%
  mutate(percentage = round(count/sum(count),4)*100,
         lab.pos = cumsum(percentage)-0.5*percentage)

gender_distr <- ggplot(data = gender_prop, 
       aes(x = "", 
           y = percentage, 
           fill = sex))+
  geom_bar(stat = "identity")+
  coord_polar("y") +
  geom_text(aes(y = lab.pos, 
                label = paste(percentage,"%", sep = "")), col = "blue", size = 4) +
  scale_fill_manual(values=c("orange", "lightblue"),
                    name = "Gender") +
  theme_void() +
  theme(legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(color = "black", size = 12))


gender_prop <- Heart %>% 
  group_by(sex, target) %>% 
  summarize(n = n()) %>% 
  mutate(pct = n*100/sum(n)) %>% 
  ggplot(aes(x = reorder(sex, n), 
             y = pct, 
             fill = target)) +
  geom_bar(stat = "identity", width = 0.6) + 
  scale_x_discrete(name = "") +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct,0),"%")), 
            position = position_stack(vjust = 0.5), 
            size = 4, 
            color = "black") +
  theme(axis.text.y = element_blank(),
        axis.text.x = element_text(color = "black", size = 12),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.title = element_text(color = "black", size = 12),
        legend.text = element_text(color = "black", size = 12))

ggarrange(gender_distr, gender_prop, nrow = 1)

31.68 % of people are male 68.32 % are female

75% of males had heart disease. 45% of female had heart disease.

#Distribution of Male and Female population across Age parameter
Heart %>%
  ggplot(aes(x=age,fill=sex))+
  geom_histogram()+
  xlab("Age") + 
  ylab("Number")+
  scale_fill_manual(values=c("orange", "lightblue"),
                    name = "Gender") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Chest pain type (cp)

cp_distr <- Heart %>%
  group_by(cp) %>%
  summarise(counts = n()) %>% 
  mutate(Percentage = round(counts*100/sum(counts),2)) %>% 
  arrange(desc(counts)) %>% 
  ggplot(aes(x= reorder(cp, counts),
             y  = counts)) +
  geom_bar(stat = "identity",
           width = 0.6,
           fill = "steelblue") +
  geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")), 
            vjust = 0.5, 
            hjust = -0.5,
            color = "darkblue", 
            size = 4) +
  scale_y_continuous(limits = c(0,200)) +
  theme_minimal() +
  labs(x = "Chest pain types",y = "Frequency") + 
  coord_flip()

cp_prop <- Heart %>% 
  group_by(cp, target) %>% 
  summarize(n = n()) %>% 
  mutate(pct = n*100/sum(n)) %>% 
  ggplot(aes(x = reorder(cp, n), 
             y = pct/100, 
             fill = target)) +
  geom_bar(stat = "identity", width = 0.6) + 
  scale_x_discrete(name = "") +
  scale_y_continuous(name= "Percentage", 
                     labels = percent) +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct,0),"%")), 
            position = position_stack(vjust = 0.5), 
            size = 4, 
            color = "black") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.y=element_blank()) + 
  coord_flip()

ggarrange(cp_distr, cp_prop, nrow = 1)

Fasting blood sugar (fbs)

fbs_distr <- Heart %>%
  group_by(fbs) %>%
  dplyr::summarise(counts = n()) %>% 
  mutate(Percentage = round(counts*100/sum(counts),2)) %>% 
  arrange(desc(counts)) %>% 
  ggplot(aes(x= reorder(fbs, counts),
             y  = counts)) +
  geom_bar(stat = "identity",
           width = 0.6,
           fill = "steelblue") +
  geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")), 
            vjust = 0.5, 
            hjust = -0.5,
            color = "darkblue", 
            size = 4) +
  scale_y_continuous(limits = c(0,350)) +
  theme_minimal() +
  labs(x = "Chest pain types",y = "Frequency") + 
  coord_flip()

fbs_prop <- Heart %>% 
  group_by(fbs, target) %>% 
  dplyr::summarize(n = n()) %>% 
  mutate(pct = n*100/sum(n)) %>% 
  ggplot(aes(x = reorder(fbs, n), 
             y = pct/100, 
             fill = target)) +
  geom_bar(stat = "identity", width = 0.6) + 
  scale_x_discrete(name = "") +
  scale_y_continuous(name= "Percentage", 
                     labels = percent) +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct,0),"%")), 
            position = position_stack(vjust = 0.5), 
            size = 4, 
            color = "black") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.y=element_blank()) + 
  coord_flip()

ggarrange(fbs_distr,fbs_prop, nrow = 1)

It seems that there is a slight difference in percentage of pp having heart disease for two groups (fasting blood sugar)

chisq.test(Heart$target, Heart$fbs)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Heart$target and Heart$fbs
## X-squared = 0.10627, df = 1, p-value = 0.7444

The p = 0.744 > 0.05. There is no relationship between fast blood sugar and heart disease for this data.

restecg resting electrocardiographic results (values 0,1,2, categorical)

restecg_distr <- Heart %>%
  group_by(restecg) %>%
  dplyr::summarise(counts = n()) %>% 
  mutate(Percentage = round(counts*100/sum(counts),2)) %>% 
  arrange(desc(counts)) %>% 
  ggplot(aes(x= restecg,
             y  = counts)) +
  geom_bar(stat = "identity",
           width = 0.6,
           fill = "steelblue") +
  geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")), 
            vjust = 0.5, 
            hjust = -0.5,
            color = "darkblue", 
            size = 4) +
  scale_y_continuous(limits = c(0,250)) +
  theme_minimal() +
  labs(x = "resting electrocardiographic results",y = "Frequency") + 
  coord_flip()

restecg_prop <- Heart %>% 
  group_by(restecg, target) %>% 
  dplyr::summarize(n = n()) %>% 
  mutate(pct = n*100/sum(n)) %>% 
  ggplot(aes(x = restecg, 
             y = pct/100, 
             fill = target)) +
  geom_bar(stat = "identity", width = 0.6) + 
  scale_x_discrete(name = "") +
  scale_y_continuous(name= "Percentage", 
                     labels = percent) +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct,0),"%")), 
            position = position_stack(vjust = 0.5), 
            size = 4, 
            color = "black") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.y=element_blank()) + 
  coord_flip()

ggarrange(restecg_distr,restecg_prop, nrow = 1)

chisq.test(data$target, Heart$restecg)
## Warning in chisq.test(data$target, Heart$restecg): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  data$target and Heart$restecg
## X-squared = 10.023, df = 2, p-value = 0.006661

exang exercise induced angina (categorical, 1=angina 0 = no agina)

Angina (pronounced ANN-juh-nuh or ann-JIE-nuh) is pain in the chest that comes on with exercise, stress, or other things that make the heart work harder.

exang_distr <- Heart %>%
  group_by(exang) %>%
  dplyr::summarise(counts = n()) %>% 
  mutate(Percentage = round(counts*100/sum(counts),2)) %>% 
  arrange(desc(counts)) %>% 
  ggplot(aes(x= exang,
             y  = counts)) +
  geom_bar(stat = "identity",
           width = 0.6,
           fill = "steelblue") +
  geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")), 
            vjust = 0.5, 
            hjust = -0.5,
            color = "darkblue", 
            size = 4) +
  scale_y_continuous(limits = c(0,300)) +
  theme_minimal() +
  labs(x = "Exercise induced anginas",y = "Frequency") + 
  coord_flip()

exang_prop <- Heart %>% 
  group_by(exang, target) %>% 
  dplyr::summarize(n = n()) %>% 
  mutate(pct = n*100/sum(n)) %>% 
  ggplot(aes(x = exang, 
             y = pct/100, 
             fill = target)) +
  geom_bar(stat = "identity", width = 0.6) + 
  scale_x_discrete(name = "") +
  scale_y_continuous(name= "Percentage", 
                     labels = percent) +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct,0),"%")), 
            position = position_stack(vjust = 0.5), 
            size = 4, 
            color = "black") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.y=element_blank()) + 
  coord_flip()

ggarrange(exang_distr,exang_prop, nrow = 1)

ca number of major vessels (0-3) colored by flourosopy (factor)

ca_distr <- Heart %>%
  group_by(ca) %>%
  dplyr::summarise(counts = n()) %>% 
  mutate(Percentage = round(counts*100/sum(counts),2)) %>% 
  arrange(desc(counts)) %>% 
  ggplot(aes(x= ca,
             y  = counts)) +
  geom_bar(stat = "identity",
           width = 0.6,
           fill = "steelblue") +
  geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")), 
            vjust = 0.5, 
            hjust = -0.5,
            color = "darkblue", 
            size = 4) +
  scale_y_continuous(limits = c(0,300)) +
  theme_minimal() +
  labs(x = "Number of major vessels colored by fluoroscopy",y = "Frequency") + 
  coord_flip()

ca_prop <- Heart %>% 
  group_by(ca, target) %>% 
  dplyr::summarize(n = n()) %>% 
  mutate(pct = n*100/sum(n)) %>% 
  ggplot(aes(x = ca, 
             y = pct/100, 
             fill = target)) +
  geom_bar(stat = "identity", width = 0.6) + 
  scale_x_discrete(name = "") +
  scale_y_continuous(name= "Percentage", 
                     labels = percent) +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct,0),"%")), 
            position = position_stack(vjust = 0.5), 
            size = 4, 
            color = "black") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.y=element_blank()) + 
  coord_flip()

ggarrange(ca_distr,ca_prop, nrow = 1)

thal

thal_distr <- Heart %>%
  group_by(thal ) %>%
  dplyr::summarise(counts = n()) %>% 
  mutate(Percentage = round(counts*100/sum(counts),2)) %>% 
  arrange(desc(counts)) %>% 
  ggplot(aes(x= thal ,
             y  = counts)) +
  geom_bar(stat = "identity",
           width = 0.6,
           fill = "steelblue") +
  geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")), 
            vjust = 0.5, 
            hjust = -0.5,
            color = "darkblue", 
            size = 4) +
  scale_y_continuous(limits = c(0,300)) +
  theme_minimal() +
  labs(x = "Thalium stress test result",y = "Frequency") + 
  coord_flip()

thal_prop <- Heart %>% 
  group_by(thal , target) %>% 
  dplyr::summarize(n = n()) %>% 
  mutate(pct = n*100/sum(n)) %>% 
  ggplot(aes(x = thal , 
             y = pct/100, 
             fill = target)) +
  geom_bar(stat = "identity", width = 0.6) + 
  scale_x_discrete(name = "") +
  scale_y_continuous(name= "Percentage", 
                     labels = percent) +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct,0),"%")), 
            position = position_stack(vjust = 0.5), 
            size = 4, 
            color = "black") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.y=element_blank()) + 
  coord_flip()

ggarrange(thal_distr,thal_prop, nrow = 1)

slope Slope of peak exercise ST segment (upsloping, flat, or downsloping

slope_distr <- Heart %>%
  group_by(slope) %>%
  dplyr::summarise(counts = n()) %>% 
  mutate(Percentage = round(counts*100/sum(counts),2)) %>% 
  arrange(desc(counts)) %>% 
  ggplot(aes(x= slope,
             y  = counts)) +
  geom_bar(stat = "identity",
           width = 0.4,
           fill = "steelblue") +
  geom_text(aes(label = paste0(round(counts,1),"\n",Percentage,"%")), 
            vjust = 0.5, 
            hjust = -0.5,
            color = "darkblue", 
            size = 4) +
  scale_y_continuous(limits = c(0,200)) +
  theme_minimal() +
  labs(x = "Slope of peak exercise ST segment",y = "Frequency") + 
  coord_flip()

slope_prop <- Heart %>% 
  group_by(slope, target) %>% 
  dplyr::summarize(n = n()) %>% 
  mutate(pct = n*100/sum(n)) %>% 
  ggplot(aes(x = slope, 
             y = pct/100, 
             fill = target)) +
  geom_bar(stat = "identity", width = 0.6) + 
  scale_x_discrete(name = "") +
  scale_y_continuous(name= "Percentage", 
                     labels = percent) +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct,0),"%")), 
            position = position_stack(vjust = 0.5), 
            size = 4, 
            color = "black") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.y=element_blank()) + 
  coord_flip()

ggarrange(slope_distr,slope_prop, nrow = 1)

Numerical variable trestbps Resting blood pressure (mm Hg)

Heart %>% ggplot(aes(trestbps)) + 
  geom_histogram(fill= "lightblue",
                 color = 'blue',
                 binwidth = 1) +   
  labs(title= "Resting blood pressure") +
  theme(plot.title = element_text(hjust = 0.5))

Heart %>% 
  ggplot(aes(trestbps, 
             fill= target)) +
  geom_density(alpha= 0.7, color = 'blue') +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  labs(x = "Age", y = "Density", title = "Density graph of trestbps distribution")

chol Serum cholesterol (mg/dl)

Heart %>% ggplot(aes(chol)) + 
  geom_histogram(fill= "lightblue",
                 color = 'blue',
                 binwidth = 3) +   
  labs(title= "Serum cholesterol (mg/dl)") +
  theme(plot.title = element_text(hjust = 0.5))

Heart$chol2 <- cut(Heart$chol, breaks = c(100,200,400,600),
                   labels = c("100-200","200-400","400-600"), include.lowest = TRUE)
head(Heart$chol2)
## [1] 200-400 200-400 200-400 200-400 200-400 100-200
## Levels: 100-200 200-400 400-600
#######Plotting Age(groups) vs Attrition
p1_chol2 <- ggplot(Heart,aes(x = chol2,fill = target)) +
  geom_bar(position = "dodge") +
    ggtitle("chol2(group) vs target - count")


p2_chole2 <- ggplot(Heart,aes(x = chol2,fill = target)) +
  geom_bar(position = "fill") +
  ggtitle("chol2(groups) vs target - proportion")

ggarrange(p1_chol2, p2_chole2, ncol=1)

All aged 19 and younger : At most 170 mg/dL

Females aged 20 and older 125–200 mg/dL

Males aged 20 and older

In the data, the youngest are 29 years old.

There are some people whose the serum cholesterol are more than 200 mg/dl

How many people are healthy

Representation of Cholestoral level with age and gender

Heart %>%
  ggplot(aes(x = age, 
             y = chol))+
  geom_point(alpha=0.7) +   
  geom_smooth(method='lm')+
  xlab("Age") +
  ylab("Serum cholesterol")
## `geom_smooth()` using formula 'y ~ x'

Heart %>%
  ggplot(aes(x = age, 
             y = chol, 
             color = sex, 
             size = chol))+
  geom_point(alpha=0.7) + 
  xlab("Age") +
  ylab("Serum cholesterol") +
  scale_fill_manual(values=c("orange", "lightblue"),
                    name = "Gender")

Comparison of Blood pressure across pain type

Heart %>%
  ggplot(aes(x=sex,
             y=trestbps, colour = sex))+
  geom_boxplot()+
  xlab("Sex")+
  ylab("BP")+ 
  facet_grid(~cp)

Comparison of Cholestoral across pain type

Heart  %>%
  ggplot(aes(x=sex,
             y=chol))+
  geom_boxplot(fill="#D55E00")+
  xlab("Sex")+
  ylab("Chol")+
  facet_grid(~cp)

Heart %>% 
  ggplot(aes(chol, 
             fill= target)) +
  geom_density(alpha= 0.7, color = 'blue') +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  labs(x = "Serum cholesterol (mg/dl)", y = "Density", title = "Density graph of chol distribution")

Heart %>% 
  filter(chol >= 125  & chol <= 200) %>% 
  summarise(count = n())
## # A tibble: 1 x 1
##   count
##   <int>
## 1    51
Heart %>% 
  filter(chol >= 125  & chol <= 200) %>% 
  group_by(target) %>% 
  summarise(count = n())
## # A tibble: 2 x 2
##   target count
##   <fct>  <int>
## 1 NO        21
## 2 YES       30

about 60% of them having heart disease

Heart %>% 
  filter(chol >= 400) %>% 
  group_by(target) %>% 
  summarise(count = n())
## # A tibble: 2 x 2
##   target count
##   <fct>  <int>
## 1 NO         2
## 2 YES        2

thalach

Heart %>% ggplot(aes(thalach)) + 
  geom_histogram(fill= "lightblue",
                 color = 'blue',
                 binwidth = 1) +   
  labs(title= "Max. heart rate achieved during thalium stress test") +
  theme(plot.title = element_text(hjust = 0.5))

Heart %>% 
  ggplot(aes(thalach, 
             fill= target)) +
  geom_density(alpha= 0.7, color = 'blue') +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  labs(x = "Max. heart rate achieved during thalium stress test", y = "Density", title = "Density graph of thalach distribution")

oldpeak ST depression induced by exercise relative to rest

Heart %>% ggplot(aes(oldpeak)) + 
  geom_histogram(fill= "lightblue",
                 color = 'blue',
                 binwidth = 0.1) +   
  labs(title= "ST depression induced by exercise relative to rest") +
  theme(plot.title = element_text(hjust = 0.5))

Heart %>% 
  ggplot(aes(oldpeak, 
             fill= target)) +
  geom_density(alpha= 0.7, color = 'blue') +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  labs(x = "ST depression induced by exercise relative to rest", y = "Density", title = "Density graph of oldpeak distribution")

CORRELATION

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
cor_heart <- cor(data2[,10:14])
cor_heart 
##                 age    trestbps         chol      thalach     oldpeak
## age       1.0000000  0.27935091  0.213677957 -0.398521938  0.21001257
## trestbps  0.2793509  1.00000000  0.123174207 -0.046697728  0.19321647
## chol      0.2136780  0.12317421  1.000000000 -0.009939839  0.05395192
## thalach  -0.3985219 -0.04669773 -0.009939839  1.000000000 -0.34418695
## oldpeak   0.2100126  0.19321647  0.053951920 -0.344186948  1.00000000
corrplot(cor_heart, method = "ellipse", type="upper",)

library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.6.3
ggcorrplot(cor_heart,lab = T)

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:pander':
## 
##     wrap
## The following object is masked from 'package:dplyr':
## 
##     nasa
ggcorr(cor_heart, label = T, label_round = 2)

heartcor <- cor(cor_heart) #the correlation graph is saved to `heartcor`
corrplot(heartcor, method = "pie", type = "lower")

p1 <- ggplot(Heart, 
             aes(x = age, y = trestbps, color = target, shape = target)) +
  geom_point(size = 2) + geom_smooth(se = FALSE)

p2 <- ggplot(Heart, aes(x = age, y = chol, color = target, shape = target)) + 
  geom_point() +
  geom_smooth(se = FALSE)

p3 <- ggplot(Heart, aes(x = age, y = thalach, color = target, shape = target))+
  geom_point()+
  geom_smooth(se = FALSE)

p4 <- ggplot(data2, aes(x = age, y = oldpeak, color = target, shape = target))+
  geom_point()+
  geom_smooth(se = FALSE)

ggarrange(p1,p2,p3,p4, ncol = 2, nrow= 2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

K-means

library(funModeling)
## Warning: package 'funModeling' was built under R version 3.6.3
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
##  / Now in Spanish: librovivodecienciadedatos.ai
## 
## Attaching package: 'funModeling'
## The following object is masked from 'package:GGally':
## 
##     range01
uns_df <- scale(data2[,10:14])

head(as_tibble(uns_df)) 
## # A tibble: 6 x 5
##      age trestbps    chol thalach oldpeak
##    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
## 1  0.951   0.763  -0.256   0.0154   1.09 
## 2 -1.91   -0.0926  0.0721  1.63     2.12 
## 3 -1.47   -0.0926 -0.815   0.976    0.310
## 4  0.180  -0.663  -0.198   1.24    -0.206
## 5  0.290  -0.663   2.08    0.583   -0.379
## 6  0.290   0.478  -1.05   -0.0719  -0.551
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(uns_df)
head(distance)
## [1] 3.565404 2.904427 2.406805 3.222464 1.956767 1.679101
fviz_dist(distance, gradient = list(low = "blue", mid = "white", high = "red"))

k2 <- kmeans(uns_df, 
             center = 2,
             nstart = 25  )

str(k2)
## List of 9
##  $ cluster     : int [1:303] 1 2 2 2 2 2 1 2 2 2 ...
##  $ centers     : num [1:2, 1:5] 0.711 -0.542 0.441 -0.336 0.317 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:5] "age" "trestbps" "chol" "thalach" ...
##  $ totss       : num 1510
##  $ withinss    : num [1:2] 623 512
##  $ tot.withinss: num 1136
##  $ betweenss   : num 374
##  $ size        : int [1:2] 131 172
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = uns_df)

k3 <- kmeans(uns_df, centers = 3, nstart = 25)
k4 <- kmeans(uns_df, centers = 4, nstart = 25)
k5 <- kmeans(uns_df, centers = 5, nstart = 25)

pKM1 <- fviz_cluster(k2, geom = "point", data = uns_df)+
  ggtitle("k = 2")
pKM2 <- fviz_cluster(k3, geom = "point", data = uns_df)+
  ggtitle("k = 3")
pKM3 <- fviz_cluster(k4, geom = "point", data = uns_df)+
  ggtitle("k = 4")
pKM4 <- fviz_cluster(k5, geom = "point", data = uns_df)+
  ggtitle("k = 5")


ggarrange(pKM1,pKM2,pKM3,pKM4, ncol = 2, nrow = 2)

8.1. Optimum Cluster Number

Elbow Method

set.seed(123)
fviz_nbclust(uns_df, kmeans, method = "wss")

# Average Silouette Method
fviz_nbclust(uns_df, kmeans, method = "silhouette")

library(cluster)
## Warning: package 'cluster' was built under R version 3.6.3
# Gap Statistics
set.seed(123)

gap_stat <- clusGap(uns_df, FUN = kmeans, nstart = 25, K.max = 10, B = 50)

print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = uns_df, FUNcluster = kmeans, K.max = 10, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 2
##           logW   E.logW       gap     SE.sim
##  [1,] 5.410459 5.963889 0.5534297 0.01063475
##  [2,] 5.262299 5.836118 0.5738189 0.01081094
##  [3,] 5.187107 5.742668 0.5555616 0.01139483
##  [4,] 5.131819 5.665213 0.5333938 0.01186413
##  [5,] 5.079557 5.605857 0.5263005 0.01183466
##  [6,] 5.033868 5.553238 0.5193703 0.01176835
##  [7,] 4.994771 5.511709 0.5169386 0.01201947
##  [8,] 4.959897 5.476290 0.5163929 0.01258630
##  [9,] 4.925016 5.444976 0.5199599 0.01242377
## [10,] 4.904663 5.416427 0.5117637 0.01250435
fviz_gap_stat(gap_stat)

set.seed(123)

final <- kmeans(uns_df, 2, nstart = 25)
final
## K-means clustering with 2 clusters of sizes 172, 131
## 
## Cluster means:
##          age   trestbps       chol    thalach    oldpeak
## 1 -0.5415776 -0.3355766 -0.2416645  0.5202796 -0.4537289
## 2  0.7110790  0.4406044  0.3173000 -0.6831152  0.5957356
## 
## Clustering vector:
##   [1] 2 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1
##  [38] 1 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 1 2 2 1 1 1 1 2 1 1 1 2 2 1 1 1 2
## [112] 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 1
## [149] 1 1 2 2 2 2 1 2 1 1 1 1 1 2 1 1 1 2 2 2 2 2 1 1 2 2 2 1 1 2 1 2 2 2 1 1 2
## [186] 1 2 2 1 1 2 2 2 2 2 2 2 1 2 1 1 2 2 2 2 1 1 2 1 1 1 2 1 2 2 2 2 2 2 1 2 2
## [223] 2 2 2 2 2 1 2 2 1 2 2 2 2 1 1 2 2 1 2 2 2 2 2 1 2 2 1 2 2 1 2 2 2 1 2 2 2
## [260] 1 2 1 2 1 1 2 2 1 2 2 1 2 2 1 1 1 2 1 2 2 2 1 2 1 2 2 1 1 2 2 1 2 2 2 1 2
## [297] 1 2 2 1 2 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 512.4914 623.1961
##  (between_SS / total_SS =  24.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Descriptive Statistics for Clusters

Heart[,10:14] %>% 
  mutate(Cluster = final$cluster) %>% 
  group_by(Cluster) %>% 
  summarise_all("mean") 
## # A tibble: 2 x 6
##   Cluster   age trestbps  chol thalach oldpeak
##     <int> <dbl>    <dbl> <dbl>   <dbl>   <dbl>
## 1       1  49.4     126.  234.    162.   0.513
## 2       2  60.8     139.  263.    134    1.73

MACHINE LEARNING

library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
train_indeks <- createDataPartition(Heart$target, p = 0.7, list = FALSE, times = 1)

train <- Heart[train_indeks,]
test <- Heart[-train_indeks,]

train_x <- train %>% dplyr::select(-target)
train_y <- train$target

test_x <- test %>% dplyr::select(-target)
test_y <- test$target

training <- data.frame(train_x, target = train_y)
model_glm <- glm(target~., 
                 data = training, 
                 family = "binomial")
model_glm
## 
## Call:  glm(formula = target ~ ., family = "binomial", data = training)
## 
## Coefficients:
##                 (Intercept)                      sexMALE  
##                    -1.44693                     -1.91424  
##                     fbs>120                     exangYES  
##                     0.92358                     -0.65496  
##           cpATYPICAL ANGINA           cpNON-ANGINAL PAIN  
##                    -0.13140                      1.45518  
##               restecgNORMAL  restecgPROBABLE OR DEFINITE  
##                    -0.53504                     -0.92066  
##                      slope1                       slope2  
##                    -0.04961                      1.63488  
##                         ca1                          ca2  
##                    -2.47021                     -3.31721  
##                         ca3                          ca4  
##                    -3.40393                     -1.21654  
##                       thal1                        thal2  
##                     4.06481                      3.85934  
##                       thal3                          age  
##                     1.89929                      0.02966  
##                    trestbps                         chol  
##                    -0.01930                     -0.01028  
##                     thalach                      oldpeak  
##                     0.02033                     -0.61554  
##                chol2200-400                 chol2400-600  
##                     1.64309                      3.51099  
## 
## Degrees of Freedom: 212 Total (i.e. Null);  189 Residual
## Null Deviance:       293.6 
## Residual Deviance: 121.6     AIC: 169.6
summary(model_glm)
## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = training)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.90153  -0.26747   0.07939   0.41668   2.92331  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.446930   3.799194  -0.381 0.703313    
## sexMALE                     -1.914244   0.709138  -2.699 0.006947 ** 
## fbs>120                      0.923576   0.780387   1.183 0.236617    
## exangYES                    -0.654962   0.563591  -1.162 0.245185    
## cpATYPICAL ANGINA           -0.131397   0.758416  -0.173 0.862454    
## cpNON-ANGINAL PAIN           1.455178   0.606339   2.400 0.016398 *  
## restecgNORMAL               -0.535036   0.488536  -1.095 0.273437    
## restecgPROBABLE OR DEFINITE -0.920665   2.385379  -0.386 0.699525    
## slope1                      -0.049611   1.032670  -0.048 0.961683    
## slope2                       1.634875   1.171889   1.395 0.162992    
## ca1                         -2.470214   0.666300  -3.707 0.000209 ***
## ca2                         -3.317206   0.989400  -3.353 0.000800 ***
## ca3                         -3.403930   1.311543  -2.595 0.009449 ** 
## ca4                         -1.216536   2.551628  -0.477 0.633527    
## thal1                        4.064807   2.436510   1.668 0.095258 .  
## thal2                        3.859338   2.331877   1.655 0.097917 .  
## thal3                        1.899289   2.302151   0.825 0.409368    
## age                          0.029664   0.029341   1.011 0.312021    
## trestbps                    -0.019299   0.015063  -1.281 0.200120    
## chol                        -0.010284   0.007555  -1.361 0.173460    
## thalach                      0.020327   0.013631   1.491 0.135918    
## oldpeak                     -0.615541   0.321895  -1.912 0.055845 .  
## chol2200-400                 1.643086   0.916214   1.793 0.072918 .  
## chol2400-600                 3.510991   7.269844   0.483 0.629129    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 293.58  on 212  degrees of freedom
## Residual deviance: 121.65  on 189  degrees of freedom
## AIC: 169.65
## 
## Number of Fisher Scoring iterations: 6

10.2. Random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
rf_fit <- randomForest(train_x, train_y, importance = TRUE)

rf_fit
## 
## Call:
##  randomForest(x = train_x, y = train_y, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 18.31%
## Confusion matrix:
##     NO YES class.error
## NO  76  21   0.2164948
## YES 18  98   0.1551724
importance(rf_fit)
##                 NO        YES MeanDecreaseAccuracy MeanDecreaseGini
## sex       2.407462 11.3684400           10.6118227        3.4329081
## fbs      -1.633384  1.4594114           -0.1214795        0.7720273
## exang     5.197280  4.3947063            6.7215988        4.9442285
## cp       11.274378  4.9598032           11.5600559        7.5094053
## restecg   1.555085 -0.6593266            0.6736534        1.7695557
## slope     9.930101  3.7652346            9.6312023        6.4720831
## ca       11.961786 18.4525033           20.0110037       12.4436797
## thal     12.801312 18.5404195           21.9523963       15.2866274
## age       2.891410  7.5518669            7.5258026        8.9835041
## trestbps -1.086904  3.2221793            1.8022123        7.1038118
## chol     -2.005635  0.7519465           -0.8313648        8.0635006
## thalach   4.785756  7.2135341            9.1349684       12.2929163
## oldpeak  14.138700  9.6829965           15.8083655       14.2598394
## chol2     1.246250 -2.1411801           -0.4668505        0.8338564
varImpPlot(rf_fit)

confusionMatrix(predict(rf_fit, test_x), test_y, positive = "YES")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO  33   8
##        YES  8  41
##                                           
##                Accuracy : 0.8222          
##                  95% CI : (0.7274, 0.8948)
##     No Information Rate : 0.5444          
##     P-Value [Acc > NIR] : 2.84e-08        
##                                           
##                   Kappa : 0.6416          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8367          
##             Specificity : 0.8049          
##          Pos Pred Value : 0.8367          
##          Neg Pred Value : 0.8049          
##              Prevalence : 0.5444          
##          Detection Rate : 0.4556          
##    Detection Prevalence : 0.5444          
##       Balanced Accuracy : 0.8208          
##                                           
##        'Positive' Class : YES             
## 
# Model Tuning
control <- trainControl(method='cv', 
                        number=10, 
                        search='grid')
 
tunegrid <- expand.grid(mtry = (1:10)) 

rf_gridsearch <- train(target ~ ., 
                       data = train,
                       method = 'rf',
                       metric = 'Accuracy',
                       tuneGrid = tunegrid)

rf_gridsearch
## Random Forest 
## 
## 213 samples
##  14 predictor
##   2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 213, 213, 213, 213, 213, 213, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    1    0.8078647  0.6095593
##    2    0.8052842  0.6057955
##    3    0.7957809  0.5875444
##    4    0.7892162  0.5742515
##    5    0.7869512  0.5699819
##    6    0.7810923  0.5582653
##    7    0.7826607  0.5610224
##    8    0.7773592  0.5500318
##    9    0.7766169  0.5490764
##   10    0.7762688  0.5480922
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
plot(rf_gridsearch)

confusionMatrix(predict(rf_gridsearch, test_x), test_y, positive = "YES")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO  30   6
##        YES 11  43
##                                           
##                Accuracy : 0.8111          
##                  95% CI : (0.7149, 0.8859)
##     No Information Rate : 0.5444          
##     P-Value [Acc > NIR] : 1.061e-07       
##                                           
##                   Kappa : 0.6154          
##                                           
##  Mcnemar's Test P-Value : 0.332           
##                                           
##             Sensitivity : 0.8776          
##             Specificity : 0.7317          
##          Pos Pred Value : 0.7963          
##          Neg Pred Value : 0.8333          
##              Prevalence : 0.5444          
##          Detection Rate : 0.4778          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.8046          
##                                           
##        'Positive' Class : YES             
## 
rf_cm <- confusionMatrix(predict(rf_gridsearch, test_x), test_y, positive = "YES")
rf_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO YES
##        NO  30   6
##        YES 11  43
##                                           
##                Accuracy : 0.8111          
##                  95% CI : (0.7149, 0.8859)
##     No Information Rate : 0.5444          
##     P-Value [Acc > NIR] : 1.061e-07       
##                                           
##                   Kappa : 0.6154          
##                                           
##  Mcnemar's Test P-Value : 0.332           
##                                           
##             Sensitivity : 0.8776          
##             Specificity : 0.7317          
##          Pos Pred Value : 0.7963          
##          Neg Pred Value : 0.8333          
##              Prevalence : 0.5444          
##          Detection Rate : 0.4778          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.8046          
##                                           
##        'Positive' Class : YES             
##