The dataset can be downloaded using this link.
The Dataset consist of 14 columns of which are both numerical and categorical variables and one of them is a Output variable-which consist of 1 or 0 indicating whether a person has a more chance of heart attack(1) or less chance of heart attack(0).
The characteristics variables includes the following
#Load the packages
library(dplyr)
library(ggplot2)
library(tidyr)
library(psych)
library(psychTools)
library(tidyverse)
library(hrbrthemes)
library(randomForest)
library(forcats)
library(corrplot)
library(GGally)
library(viridis)
library(corrr)
library(DataExplorer)
library(caret)
library(caTools)
library(e1071)
library(class)
library(pROC)
# define the filename
filename <- "heart.csv"
# load the CSV file from the local directory
dataset <- read.csv(filename,header=T,sep=",")
# column names of the dataset
colnames(dataset)
[1] "age" "sex" "cp" "trtbps" "chol" "fbs"
[7] "restecg" "thalachh" "exng" "oldpeak" "slp" "caa"
[13] "thall" "output"
#list types for each attribute
sapply(dataset, class)
age sex cp trtbps chol fbs restecg thalachh
"integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer"
exng oldpeak slp caa thall output
"integer" "numeric" "integer" "integer" "integer" "integer"
observations
# display the data
head(dataset,n=10)
#classify the variables into categorical and numerical variables
#select the numerical variables
numeric_var <-dataset %>%
select("age","trtbps","chol","thalachh","oldpeak")
#select the categorical values
categorical_var<- dataset %>%
select("sex","cp","fbs","restecg","exng","slp","caa",
"thall","output")%>%
mutate_if(is.numeric, as.factor)
#combine the categorical and numerical values
dataset1 = cbind(categorical_var,numeric_var)
plot_intro(dataset1,title="Dataset Information")
introduce(dataset1)
Observations
describeBy(dataset1)
Observations
## plot correlations
correlation_tab <- cor(dataset)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
#png(file="corr2.png",res=150,width=900,height=700)
corrplot(correlation_tab, method = "color", shade.col = NA, tl.col = "black", tl.srt = 45,tl.cex =4,cl.cex=4,col = col(200), addCoef.col = "black", order = "AOE",number.cex = 3)
Observations
Positive correlations are displayed in blue and negative correlations in red color. Color intensities are proportional to the correlation coefficients
To see if the dataset is balanced or not
df_output <- dataset1 %>%
group_by(output) %>%
summarise(freq= n()) %>%
mutate(percentage= freq/sum(freq)*100)
output_bar <- ggplot(dataset1, aes(x=output, fill=output)) +
geom_bar( ) +geom_text(stat='count',aes(label=..count..),vjust=-0.30)+
labs( x = 'Target', y = 'Number of observations')+scale_fill_discrete(labels=c('0-less chances', '1-High Chances'))+theme_bw()
output_bar
Observation
#select the categorical variable
par(mfrow=c(2,2))
categorical_var<- list("sex","cp","fbs","restecg","exng","slp","caa",
"thall")
for(i in categorical_var){
plot<- ggplot(dataset1,aes_string(x=i,fill=dataset1$output))+geom_bar(position = position_dodge()) + scale_fill_discrete(name="output",labels=c('0-less chances', '1-High Chances'))
print(plot)
}
Observations
## select the numerical features
## plot a pairplot
plt <- ggpairs(numeric_var,columns=1:5,ggplot2::aes(alpha=0.75,color=dataset1$output),legend=2,upper = list(continuous = wrap("points",alpha = 0.75,size=2.8)),
lower = list(continuous = wrap("points",alpha = 0.75,size=2.8)))+ theme(text=element_text(size=22))+scale_colour_discrete(name="output",labels=c('0-less chances', '1-High Chances'))
plt
There are several methods on how to detect outliers in a dataset such as using Box plots,Scatter plot,Z score etc.Therefore,i have used boxplot analysis on numerical variables to detect the outliers
#define the data
#classify the numerical variables into feature and value columns
data <- numeric_var %>%
gather(key="feature", value="value") %>%
mutate(feature = gsub("\\.", " ",feature))
data %>%
mutate(text=fct_reorder(feature, value))%>%
ggplot(aes(x=feature, y=value, fill=feature))+
geom_boxplot(outlier.colour = "red",outlier.shape = 1.5)+
scale_fill_viridis(discrete=TRUE)+theme(legend.position="false",
text=element_text(size=15)) +coord_flip()
head(data)
Observations
# create a dataframe from numerical variables and exclude age
df_outliers<-as.data.frame(dataset1 %>%
select("trtbps","thalachh","chol","oldpeak"))
outliers <- function(x) {
#IQR
Q1 <- quantile(x, probs=.25)
Q3 <- quantile(x, probs=.75)
iqr = Q3-Q1
#Upper Range
upper_limit = Q3 + (iqr*1.5)
#Lower Range Eliminating Outliers
lower_limit = Q1 - (iqr*1.5)
x > upper_limit | x < lower_limit
}
# remove the outliers
remove_outliers <- function(df_outliers, cols = names(df_outliers)) {
for (col in cols) {
df_outliers<- df_outliers[!outliers(df_outliers[[col]]),]
}
df_outliers
}
# we have removed the outliers from the selected features
# create new dataset without outliers
dataset2<-remove_outliers(dataset1,c("trtbps","oldpeak" ,"thalachh", "chol"))
#check the dimesions of the new dataset
dim(dataset2)
[1] 284 14
Observation
#define the data
#classify the numerical variables into feature and value columns
numeric_feature <- dataset2%>%
select("trtbps","oldpeak" ,"thalachh", "chol","age")
data <- numeric_feature %>%
gather(key="feature", value="value") %>%
mutate(feature = gsub("\\.", " ",feature))
data %>%
mutate(text = fct_reorder(feature, value)) %>%
ggplot( aes(x=feature, y=value, fill=feature)) +
geom_boxplot(outlier.colour = "red",outlier.shape = 1.5) +
scale_fill_viridis(discrete=TRUE) +
theme(legend.position="false",text=element_text(size=15)
) +
coord_flip()
According to Michail.T(2018),Feature (or variable) selection is the process of identifying the minimal set of features with the highest predictive performance on the target variable of interest.
There are several feature selection methods provided by the caret R package,and among are Searching for and removing redundant feature,Boruta,Ranking features by importance,RFE etc.
In this dataset we are going to use automatic methods, using Recursive Feature Elimination(RFE).
set.seed(100)
#create the subsets for sizes
subsets <- c(1:8,10,13)
# define the control using random forest selection
ctrl <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 5,
number = 10,
verbose = FALSE)
#run the RFE
results <- rfe(x=dataset2[, c(1:8,10:14)], y=dataset2$output,
sizes = subsets,
rfeControl = ctrl)
# Print the selected features
print(predictors(results))
[1] "caa" "thall" "cp" "oldpeak" "sex" "thalachh" "exng"
[8] "age"
set.seed(100)
varimp_data <- data.frame(feature = row.names(varImp(results))[1:9],
importance = varImp(results)[1:9, 1])
ggplot(data = varimp_data,
aes(x = reorder(feature, -importance), y = importance, fill = feature)) +
geom_bar(stat="identity") + labs(x = "Features", y = "Variable Importance") +
geom_text(aes(label = round(importance, 2)), vjust=1.6, color="white", size=4) +
theme_bw() + theme(legend.position = "none")
Observations
#drop the least columns from the dataset2
set.seed(100)
data1 <- dataset2 %>%
select( "sex","cp","caa","thall","exng","slp","age","oldpeak","thalachh","output")
head(data1)
set.seed(100)
split = sample.split(data1$output, SplitRatio = 0.80)
train_set = subset(data1, split == TRUE)
test_set = subset(data1, split == FALSE)
# Feature Scaling
set.seed(100)
train_set[c(7,8,9)]= scale(train_set[c(7,8,9)])
test_set[c(7,8,9)] = scale(test_set[c(7,8,9)])
head(test_set)
set.seed(100)
classifier = naiveBayes(x = train_set[-10],y = train_set$output)
y_pred = predict(classifier, newdata = test_set[-10])
cm = confusionMatrix(test_set[, 10], y_pred)
cm
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 20 5
1 2 30
Accuracy : 0.8772
95% CI : (0.7632, 0.9492)
No Information Rate : 0.614
P-Value [Acc > NIR] : 1.096e-05
Kappa : 0.7473
Mcnemar's Test P-Value : 0.4497
Sensitivity : 0.9091
Specificity : 0.8571
Pos Pred Value : 0.8000
Neg Pred Value : 0.9375
Prevalence : 0.3860
Detection Rate : 0.3509
Detection Prevalence : 0.4386
Balanced Accuracy : 0.8831
'Positive' Class : 0
set.seed(100)
naive_acc <- cm$overall["Accuracy"]
#plot confusion Matrix
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
geom_jitter(width = 0.2, height = 0.1, size=2) +
labs(title="Confusion Matrix",
y="Predicted",
x="Truth")
set.seed(100)
svm_model= svm(formula=output~.,
data=train_set,
type="C-classification",
kernal="linear")
y_pred= predict(svm_model,newdata = test_set[-10])
cm= confusionMatrix(test_set[, 10], y_pred)
cm
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 20 5
1 4 28
Accuracy : 0.8421
95% CI : (0.7213, 0.9252)
No Information Rate : 0.5789
P-Value [Acc > NIR] : 1.998e-05
Kappa : 0.678
Mcnemar's Test P-Value : 1
Sensitivity : 0.8333
Specificity : 0.8485
Pos Pred Value : 0.8000
Neg Pred Value : 0.8750
Prevalence : 0.4211
Detection Rate : 0.3509
Detection Prevalence : 0.4386
Balanced Accuracy : 0.8409
'Positive' Class : 0
set.seed(100)
svm_acc <- cm$overall["Accuracy"]
#plot confusion Matrix
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
geom_jitter(width = 0.2, height = 0.1, size=2) +
labs(title="Confusion Matrix",
y="Predicted",
x="Truth")
set.seed(100)
#Initial model
random_frst <- randomForest( output ~ .,
data=train_set)
y_pred <- predict(random_frst, test_set)
#plot confusion matrix
cm<-confusionMatrix(factor(y_pred),test_set$output)
cm
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 18 3
1 7 29
Accuracy : 0.8246
95% CI : (0.7009, 0.9125)
No Information Rate : 0.5614
P-Value [Acc > NIR] : 2.515e-05
Kappa : 0.6374
Mcnemar's Test P-Value : 0.3428
Sensitivity : 0.7200
Specificity : 0.9062
Pos Pred Value : 0.8571
Neg Pred Value : 0.8056
Prevalence : 0.4386
Detection Rate : 0.3158
Detection Prevalence : 0.3684
Balanced Accuracy : 0.8131
'Positive' Class : 0
set.seed(100)
random_acc <- cm$overall["Accuracy"]
#plot confusion Matrix
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
geom_jitter(width = 0.2, height = 0.1, size=2) +
labs(title="Confusion Matrix",
y="Predicted",
x="Truth")
#### Logistic Regression
#logistic regression model
set.seed(100)
logistic_reg = glm(formula = output ~ .,
family = binomial,
data = train_set)
prob_pred= predict(logistic_reg,type='response',newdata = test_set[-10])
y_pred = ifelse(prob_pred > 0.5, 1, 0)
#confusion matrix
cm = confusionMatrix(test_set[,10],factor(y_pred))
cm
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 19 6
1 2 30
Accuracy : 0.8596
95% CI : (0.7421, 0.9374)
No Information Rate : 0.6316
P-Value [Acc > NIR] : 0.0001263
Kappa : 0.7099
Mcnemar's Test P-Value : 0.2888444
Sensitivity : 0.9048
Specificity : 0.8333
Pos Pred Value : 0.7600
Neg Pred Value : 0.9375
Prevalence : 0.3684
Detection Rate : 0.3333
Detection Prevalence : 0.4386
Balanced Accuracy : 0.8690
'Positive' Class : 0
set.seed(100)
glm_acc <- cm$overall["Accuracy"]
#plot confusion Matrix
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
geom_jitter(width = 0.2, height = 0.1, size=2) +
labs(title="Confusion Matrix",
y="Predicted",
x="Truth")
set.seed(100)
#initial model
y_pred<-knn(train=train_set[,1:10],test = test_set[,1:10],cl=train_set$output,k=5)
#confusion matrix
cm=confusionMatrix(test_set$output,y_pred)
cm
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 21 4
1 1 31
Accuracy : 0.9123
95% CI : (0.807, 0.9709)
No Information Rate : 0.614
P-Value [Acc > NIR] : 4.061e-07
Kappa : 0.8195
Mcnemar's Test P-Value : 0.3711
Sensitivity : 0.9545
Specificity : 0.8857
Pos Pred Value : 0.8400
Neg Pred Value : 0.9688
Prevalence : 0.3860
Detection Rate : 0.3684
Detection Prevalence : 0.4386
Balanced Accuracy : 0.9201
'Positive' Class : 0
set.seed(100)
Knn_acc <- cm$overall["Accuracy"]
#plot confusion Matrix
test_set$pred <- y_pred
ggplot(test_set, aes(output, pred, color = output)) +
geom_jitter(width = 0.2, height = 0.1, size=2) +
labs(title="Confusion Matrix",
y="Predicted",
x="Truth")
Observations
set.seed(100)
# create a dataframe for models accuracy
model_names <- c("Naive Bayes", "Logistic Regression","SVM","Random Forest",'KNN')
# extract accuracy for various models
acc<- c(naive_acc, glm_acc, svm_acc,random_acc,Knn_acc)
df_acc <- data.frame(model_names, acc)
df_acc$model_names <- factor(df_acc$model_names,levels = df_acc$model_names)
ggplot( mapping = aes(x=df_acc$model_names)) +
geom_bar(aes(y = ..acc.., fill = df_acc$model_names),width = 0.9,show.legend = FALSE) + geom_text(aes( y = ..acc.., label = scales::percent(..acc..)),size=4, stat = "count", vjust = -1)+ ylim(0, 1)+labs(y = "Accuracy", x="")+
theme(text = element_text(size = 15))