1. Introduction

Objectives:

Dataset Information:

The characteristic variables include the following:

2. Loading and Checking the Dataset:

#Load the packages
library(dplyr)  
library(ggplot2)
library(tidyr)
library(psych)
library(psychTools)
library(tidyverse)
library(hrbrthemes)
library(randomForest)
library(forcats)
library(corrplot)
library(GGally)
library(viridis)
library(corrr)
library(DataExplorer)
library(caret)
library(caTools)
library(e1071)
library(class)
library(pROC)

Load the dataset

# define the filename
filename <- "heart.csv"
# load the CSV file from the local directory
dataset <- read.csv(filename,header=T,sep=",")
# column names of the dataset
colnames(dataset)
 [1] "age"      "sex"      "cp"       "trtbps"   "chol"     "fbs"     
 [7] "restecg"  "thalachh" "exng"     "oldpeak"  "slp"      "caa"     
[13] "thall"    "output"  
#list types for each attribute
sapply(dataset, class)
      age       sex        cp    trtbps      chol       fbs   restecg  thalachh 
"integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer" 
     exng   oldpeak       slp       caa     thall    output 
"integer" "numeric" "integer" "integer" "integer" "integer" 

Observations

# display the data
head(dataset,n=10)

3. Data Cleaning and Analysis:

Feature Classification

#classify the variables into categorical and numerical variables 
#select the numerical variables
numeric_var <-dataset %>% 
  select("age","trtbps","chol","thalachh","oldpeak")
#select the categorical values 
categorical_var<- dataset %>%
  select("sex","cp","fbs","restecg","exng","slp","caa",
         "thall","output")%>%
  mutate_if(is.numeric, as.factor)

#combine the categorical  and numerical values
dataset1 = cbind(categorical_var,numeric_var)

Dimensions of the dataset:

Bar Plot

plot_intro(dataset1,title="Dataset Information")

Table

introduce(dataset1)

Observations

Basic Descriptive Statistics

describeBy(dataset1)

Observations

Correlation Plot Analysis

## plot correlations 
correlation_tab <- cor(dataset) 
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
#png(file="corr2.png",res=150,width=900,height=700)                        
corrplot(correlation_tab, method = "color", col = col(200),
         shade.col = NA, tl.col = "black", tl.srt = 45,
         tl.cex = 4, cl.cex = 4, number.cex = 3,
         addCoef.col = "black", order = "AOE")

Observations

Positive correlations are displayed in blue and negative correlations in red. The color intensity is proportional to the magnitude of the correlation coefficient.

Output Distribution

To check whether the dataset is balanced, the class frequencies and percentages of the target variable are computed and plotted below.

# class frequencies and percentages of the target
df_output <- dataset1 %>%
  group_by(output) %>%
  summarise(freq = n()) %>%
  mutate(percentage = freq / sum(freq) * 100)
df_output

# bar chart of the target variable
output_bar <- ggplot(dataset1, aes(x = output, fill = output)) +
  geom_bar() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.30) +
  labs(x = 'Target', y = 'Number of observations') +
  scale_fill_discrete(labels = c('0 - Less chances', '1 - High chances')) +
  theme_bw()

output_bar

Observation

Categorical Feature Analysis

# select the categorical variables and plot each against the target
categorical_var <- c("sex", "cp", "fbs", "restecg", "exng", "slp", "caa", "thall")
for (i in categorical_var) {
  plot <- ggplot(dataset1, aes(x = .data[[i]], fill = output)) +
    geom_bar(position = position_dodge()) +
    scale_fill_discrete(name = "output", labels = c('0 - Less chances', '1 - High chances'))
  print(plot)
}

Observations

Numerical Feature Analysis

## plot a pairplot of the numerical features, colored by the target
plt <- ggpairs(numeric_var, columns = 1:5,
               ggplot2::aes(alpha = 0.75, color = dataset1$output), legend = 2,
               upper = list(continuous = wrap("points", alpha = 0.75, size = 2.8)),
               lower = list(continuous = wrap("points", alpha = 0.75, size = 2.8))) +
  theme(text = element_text(size = 22)) +
  scale_colour_discrete(name = "output", labels = c('0 - Less chances', '1 - High chances'))
plt

4. Handling Outliers:

There are several methods for detecting outliers in a dataset, such as box plots, scatter plots, and Z-scores. Here, I use boxplot analysis on the numerical variables to detect the outliers.
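
For comparison, a Z-score check is sketched below; it is not part of the analysis that follows, and the threshold of 3 standard deviations is an assumption chosen only for illustration.

# Illustrative Z-score check (not used below); the threshold of 3 is an assumed choice
z_scores <- as.data.frame(scale(numeric_var))    # standardise each numerical variable
sapply(z_scores, function(z) sum(abs(z) > 3))    # count of flagged values per variable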

Boxplot Analysis

Boxplot

# define the data
# reshape the numerical variables into feature and value columns
data <- numeric_var %>%
  gather(key = "feature", value = "value") %>%
  mutate(feature = gsub("\\.", " ", feature))
data %>%
  mutate(text = fct_reorder(feature, value)) %>%
  ggplot(aes(x = feature, y = value, fill = feature)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
  scale_fill_viridis(discrete = TRUE) +
  theme(legend.position = "none", text = element_text(size = 15)) +
  coord_flip()

Table

head(data)

Observations

Remove the Outliers

# create a dataframe of the numerical variables, excluding age
df_outliers<-as.data.frame(dataset1 %>%
                  select("trtbps","thalachh","chol","oldpeak"))
outliers <- function(x) {
  # interquartile range
  Q1 <- quantile(x, probs = .25)
  Q3 <- quantile(x, probs = .75)
  iqr <- Q3 - Q1

  # upper and lower limits; values outside them are outliers
  upper_limit <- Q3 + (iqr * 1.5)
  lower_limit <- Q1 - (iqr * 1.5)

  x > upper_limit | x < lower_limit
}

# remove the outliers column by column
remove_outliers <- function(df_outliers, cols = names(df_outliers)) {
  for (col in cols) {
    df_outliers <- df_outliers[!outliers(df_outliers[[col]]), ]
  }
  df_outliers
}
# apply the function to the selected features to create a new dataset without outliers
dataset2<-remove_outliers(dataset1,c("trtbps","oldpeak" ,"thalachh", "chol"))
# check the dimensions of the new dataset
dim(dataset2)
[1] 284  14

Observation

Boxplot Without Outliers

# define the data
# reshape the numerical variables into feature and value columns
numeric_feature <- dataset2 %>%
  select("trtbps", "oldpeak", "thalachh", "chol", "age")
data <- numeric_feature %>%
  gather(key = "feature", value = "value") %>%
  mutate(feature = gsub("\\.", " ", feature))

data %>%
  mutate(text = fct_reorder(feature, value)) %>%
  ggplot(aes(x = feature, y = value, fill = feature)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
  scale_fill_viridis(discrete = TRUE) +
  theme(legend.position = "none", text = element_text(size = 15)) +
  coord_flip()

5. Feature Selection

  • According to Michail T. (2018), feature (or variable) selection is the process of identifying the minimal set of features with the highest predictive performance on the target variable of interest.

  • There are several feature selection methods available in R, among them searching for and removing redundant features, Boruta, ranking features by importance, and Recursive Feature Elimination (RFE); a short sketch of the redundant-feature approach is shown after this list.

  • For this dataset we use an automatic method, Recursive Feature Elimination (RFE).
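
As a point of reference alongside RFE, the sketch below illustrates the redundant-feature approach mentioned above using caret's findCorrelation() on the numerical variables; the 0.75 cutoff is an assumed value for demonstration only, and this step is not used in the analysis below.

# Illustrative redundant-feature check (not used below); the 0.75 cutoff is an assumption
corr_matrix <- cor(numeric_var)                    # correlations among the numerical variables
high_corr <- findCorrelation(corr_matrix, cutoff = 0.75)
colnames(numeric_var)[high_corr]                   # variables that could be considered redundant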

Selected Predictors

set.seed(100)
# candidate feature-subset sizes to evaluate
subsets <- c(1:8, 10, 13)
# define the control: random forest functions with repeated 10-fold cross-validation
ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   number = 10,
                   verbose = FALSE)

#run the RFE
results <- rfe(x=dataset2[, c(1:8,10:14)], y=dataset2$output,
                 sizes = subsets,
                 rfeControl = ctrl)

# Print the selected features
print(predictors(results))
[1] "caa"      "thall"    "cp"       "oldpeak"  "sex"      "thalachh" "exng"    
[8] "age"     

Variable Importance

set.seed(100)
# variable importance of the features retained by RFE
vi <- varImp(results)
varimp_data <- data.frame(feature = row.names(vi),
                          importance = vi[, 1])

ggplot(data = varimp_data, 
       aes(x = reorder(feature, -importance), y = importance, fill = feature)) +
  geom_bar(stat="identity") + labs(x = "Features", y = "Variable Importance") + 
  geom_text(aes(label = round(importance, 2)), vjust=1.6, color="white", size=4) + 
  theme_bw() + theme(legend.position = "none")

Observations

# drop the least important columns from dataset2
set.seed(100)
data1 <- dataset2 %>%
  select( "sex","cp","caa","thall","exng","slp","age","oldpeak","thalachh","output")
head(data1)

6. Split the Dataset

set.seed(100)
split = sample.split(data1$output, SplitRatio = 0.80)
train_set = subset(data1, split == TRUE)
test_set = subset(data1, split == FALSE)
# Feature scaling of the numeric columns (age, oldpeak, thalachh)
set.seed(100)
train_set[c(7,8,9)]= scale(train_set[c(7,8,9)])
test_set[c(7,8,9)] = scale(test_set[c(7,8,9)])
head(test_set)

7. Model Classification

Naive Bayes

set.seed(100)
classifier = naiveBayes(x = train_set[-10],y = train_set$output)
y_pred = predict(classifier, newdata = test_set[-10])
cm = confusionMatrix(test_set[, 10], y_pred)
cm
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 20  5
         1  2 30
                                          
               Accuracy : 0.8772          
                 95% CI : (0.7632, 0.9492)
    No Information Rate : 0.614           
    P-Value [Acc > NIR] : 1.096e-05       
                                          
                  Kappa : 0.7473          
                                          
 Mcnemar's Test P-Value : 0.4497          
                                          
            Sensitivity : 0.9091          
            Specificity : 0.8571          
         Pos Pred Value : 0.8000          
         Neg Pred Value : 0.9375          
             Prevalence : 0.3860          
         Detection Rate : 0.3509          
   Detection Prevalence : 0.4386          
      Balanced Accuracy : 0.8831          
                                          
       'Positive' Class : 0               
                                          
set.seed(100)
naive_acc <- cm$overall["Accuracy"]
#plot confusion Matrix 
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
  geom_jitter(width = 0.2, height = 0.1, size=2) +
  labs(title="Confusion Matrix",
       y="Predicted",
       x="Truth")

Support Vector Machine (SVM)

set.seed(100)
svm_model <- svm(formula = output ~ .,
                 data = train_set,
                 type = "C-classification",
                 kernel = "linear")
y_pred= predict(svm_model,newdata = test_set[-10])

cm= confusionMatrix(test_set[, 10], y_pred)
cm
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 20  5
         1  4 28
                                          
               Accuracy : 0.8421          
                 95% CI : (0.7213, 0.9252)
    No Information Rate : 0.5789          
    P-Value [Acc > NIR] : 1.998e-05       
                                          
                  Kappa : 0.678           
                                          
 Mcnemar's Test P-Value : 1               
                                          
            Sensitivity : 0.8333          
            Specificity : 0.8485          
         Pos Pred Value : 0.8000          
         Neg Pred Value : 0.8750          
             Prevalence : 0.4211          
         Detection Rate : 0.3509          
   Detection Prevalence : 0.4386          
      Balanced Accuracy : 0.8409          
                                          
       'Positive' Class : 0               
                                          
set.seed(100)
svm_acc <- cm$overall["Accuracy"]
#plot confusion Matrix 
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
  geom_jitter(width = 0.2, height = 0.1, size=2) +
  labs(title="Confusion Matrix",
       y="Predicted",
       x="Truth")

Random Forest Model

set.seed(100)
#Initial model
random_frst <- randomForest( output ~ .,
                             data=train_set)

y_pred <- predict(random_frst, test_set)

#plot confusion matrix
cm<-confusionMatrix(factor(y_pred),test_set$output)
cm
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 18  3
         1  7 29
                                          
               Accuracy : 0.8246          
                 95% CI : (0.7009, 0.9125)
    No Information Rate : 0.5614          
    P-Value [Acc > NIR] : 2.515e-05       
                                          
                  Kappa : 0.6374          
                                          
 Mcnemar's Test P-Value : 0.3428          
                                          
            Sensitivity : 0.7200          
            Specificity : 0.9062          
         Pos Pred Value : 0.8571          
         Neg Pred Value : 0.8056          
             Prevalence : 0.4386          
         Detection Rate : 0.3158          
   Detection Prevalence : 0.3684          
      Balanced Accuracy : 0.8131          
                                          
       'Positive' Class : 0               
                                          
set.seed(100)
random_acc <- cm$overall["Accuracy"]
#plot confusion Matrix 
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
  geom_jitter(width = 0.2, height = 0.1, size=2) +
  labs(title="Confusion Matrix",
       y="Predicted",
       x="Truth")

Logistic Regression

#logistic regression model
set.seed(100)
logistic_reg = glm(formula = output ~ .,
                 family = binomial,
                 data = train_set)
prob_pred= predict(logistic_reg,type='response',newdata = test_set[-10])
y_pred = ifelse(prob_pred > 0.5, 1, 0)

#confusion matrix
cm = confusionMatrix(test_set[,10],factor(y_pred))
cm
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 19  6
         1  2 30
                                          
               Accuracy : 0.8596          
                 95% CI : (0.7421, 0.9374)
    No Information Rate : 0.6316          
    P-Value [Acc > NIR] : 0.0001263       
                                          
                  Kappa : 0.7099          
                                          
 Mcnemar's Test P-Value : 0.2888444       
                                          
            Sensitivity : 0.9048          
            Specificity : 0.8333          
         Pos Pred Value : 0.7600          
         Neg Pred Value : 0.9375          
             Prevalence : 0.3684          
         Detection Rate : 0.3333          
   Detection Prevalence : 0.4386          
      Balanced Accuracy : 0.8690          
                                          
       'Positive' Class : 0               
                                          
set.seed(100)
glm_acc <- cm$overall["Accuracy"]
#plot confusion Matrix 
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
  geom_jitter(width = 0.2, height = 0.1, size=2) +
  labs(title="Confusion Matrix",
       y="Predicted",
       x="Truth")

K-Nearest Neighbors (KNN)

set.seed(100)
# initial model: use only the predictor columns (exclude the target in column 10)
y_pred <- knn(train = train_set[, 1:9], test = test_set[, 1:9],
              cl = train_set$output, k = 5)

#confusion matrix 
cm=confusionMatrix(test_set$output,y_pred)
cm
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 21  4
         1  1 31
                                         
               Accuracy : 0.9123         
                 95% CI : (0.807, 0.9709)
    No Information Rate : 0.614          
    P-Value [Acc > NIR] : 4.061e-07      
                                         
                  Kappa : 0.8195         
                                         
 Mcnemar's Test P-Value : 0.3711         
                                         
            Sensitivity : 0.9545         
            Specificity : 0.8857         
         Pos Pred Value : 0.8400         
         Neg Pred Value : 0.9688         
             Prevalence : 0.3860         
         Detection Rate : 0.3684         
   Detection Prevalence : 0.4386         
      Balanced Accuracy : 0.9201         
                                         
       'Positive' Class : 0              
                                         
set.seed(100)
Knn_acc <- cm$overall["Accuracy"]
#plot confusion Matrix 
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
  geom_jitter(width = 0.2, height = 0.1, size=2) +
  labs(title="Confusion Matrix",
       y="Predicted",
       x="Truth")

Observations

8. Model Comparison

set.seed(100)
# create a dataframe of model accuracies
model_names <- c("Naive Bayes", "Logistic Regression", "SVM", "Random Forest", "KNN")
# extract the accuracy of each model
acc <- c(naive_acc, glm_acc, svm_acc, random_acc, Knn_acc)
df_acc <- data.frame(model_names, acc)
df_acc$model_names <- factor(df_acc$model_names,levels = df_acc$model_names)

ggplot(df_acc, aes(x = model_names, y = acc, fill = model_names)) +
  geom_col(width = 0.9, show.legend = FALSE) +
  geom_text(aes(label = scales::percent(acc, accuracy = 0.1)), size = 4, vjust = -1) +
  ylim(0, 1) +
  labs(y = "Accuracy", x = "") +
  theme(text = element_text(size = 15))
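
Accuracy alone can mask differences in sensitivity and specificity; the sketch below shows one way the comparison could be extended, assuming each confusion matrix above were saved to its own variable (cm_nb, cm_glm, cm_svm, cm_rf, cm_knn are hypothetical names, not used in the chunks above).

# Hypothetical extension: collect more metrics per model, assuming the confusionMatrix
# objects above were stored separately as cm_nb, cm_glm, cm_svm, cm_rf, cm_knn (assumed names)
cm_list <- list("Naive Bayes" = cm_nb, "Logistic Regression" = cm_glm,
                "SVM" = cm_svm, "Random Forest" = cm_rf, "KNN" = cm_knn)
data.frame(model       = names(cm_list),
           accuracy    = sapply(cm_list, function(m) m$overall["Accuracy"]),
           sensitivity = sapply(cm_list, function(m) m$byClass["Sensitivity"]),
           specificity = sapply(cm_list, function(m) m$byClass["Specificity"]))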