Machine Learning - Classification[Heart Disease Dataset(uci)]

1. Importing Dataset

setwd("D:/PDFs/5th semester/Data Science Programming/lab/R files")

# --> Importing the dataset :
dataset = read.csv('heart.csv')

2. Visualizing and Analyzing the dataset

# --> Analyzing the dataset :
View(dataset)
str(dataset)

## 'data.frame':    303 obs. of  14 variables:
##  $ ï..age  : int  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : int  1 1 0 1 0 1 0 1 1 1 ...
##  $ cp      : int  3 2 1 1 0 0 1 1 2 2 ...
##  $ trestbps: int  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : int  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg : int  0 1 0 1 1 1 0 1 1 1 ...
##  $ thalach : int  150 187 172 178 163 148 153 173 162 174 ...
##  $ exang   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope   : int  0 0 2 2 2 1 1 2 2 2 ...
##  $ ca      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal    : int  1 2 2 2 2 1 2 3 3 2 ...
##  $ target  : int  1 1 1 1 1 1 1 1 1 1 ...

summary(dataset)

##      ï..age           sex               cp           trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.000   Min.   : 94.0  
##  1st Qu.:47.50   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:120.0  
##  Median :55.00   Median :1.0000   Median :1.000   Median :130.0  
##  Mean   :54.37   Mean   :0.6832   Mean   :0.967   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.000   Max.   :200.0  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.5  
##  Median :240.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :246.3   Mean   :0.1485   Mean   :0.5281   Mean   :149.6  
##  3rd Qu.:274.5   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak         slope             ca        
##  Min.   :0.0000   Min.   :0.00   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.80   Median :1.000   Median :0.0000  
##  Mean   :0.3267   Mean   :1.04   Mean   :1.399   Mean   :0.7294  
##  3rd Qu.:1.0000   3rd Qu.:1.60   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.20   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.314   Mean   :0.5446  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000

#Renaming the column name For Further Plotting(Data Cleaning):
colnames(dataset)

##  [1] "ï..age"   "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"

colnames(dataset)[1] = "age"
colnames(dataset)

##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"

#Analyzing through Plots:
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.1.3

   #Univariate Analysis :
#Bar Plot:
gg0 <- ggplot(dataset, aes(x=target, fill=target)) +
  geom_bar() +
  xlab("Heart Disease") +
  ylab("Count") +
  scale_fill_discrete(name = "Heart Disease", labels = c("Absence",
                                                         "Presence")) +
  labs(title = "BarPlot-Analysis of Presence and Absence of Heart
Disease", x = "HearDisease", y = "Count")
gg0

ggplot(data=dataset, aes(chol)) + 
  geom_histogram(bins = 50) + 
  theme_bw() + 
  labs(title = "Histogram of chol")

ggplot(data=dataset, aes(trestbps)) + 
  geom_histogram(bins = 50) + 
  theme_bw() + 
  labs(title = "Histogram of trestbps")

ggplot(data=dataset, aes(target)) + 
  geom_bar(aes(fill=factor(sex)), position = "dodge") +
  labs(title = "Barplot of target with sex as factor")

ggplot(data=dataset, aes(target)) + 
  geom_bar(aes(fill=factor(cp)), position = "dodge") + 
  labs(title = "Bar plot target with cp as factor")

ggplot(data=dataset, aes(target)) + 
  geom_bar(aes(fill=factor(restecg)), position = "dodge") +
  labs(title = "Bar plot of target with restecg as factor")

ggplot(data=dataset, aes(target)) + 
  geom_bar(aes(fill=factor(thal)), position = "dodge")+
  labs(title = "Bar plot of target with thal as factor")

ggplot(data=dataset, aes(target)) + 
  geom_bar(aes(fill=factor(slope)), position = "dodge") +
  labs(title = "Bar plot of target with slope as factor")

  #Bivariate Analysis:

  #Box Plot between age and target :
ggplot(dataset, aes(y = age, x = target, col = target)) +
  geom_boxplot() + 
  theme_minimal() +
  labs(title = "Box plot between age and target")

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

  #Box plot between target and trestbps :
ggplot(data=dataset, aes(target, trestbps)) + 
  geom_boxplot() + 
  theme_bw() +
  labs(title = "Box plot between target and trestbps")

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

  #Box plot between target and trestbps
ggplot(data=dataset, aes(target, thalach)) + 
  geom_boxplot() + 
  theme_bw() +
  labs(title = "Box plot between target and thalach")

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

  #Box plot between target and chol :
ggplot(data=dataset, aes(target, chol)) + 
  geom_boxplot() + 
  theme_bw() +
  labs(title = "Box plot between target and chol")

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

  #Box plot between target and oldpeak :
ggplot(data=dataset, aes(target, oldpeak)) + 
  geom_boxplot() + 
  theme_bw() +
  labs(title = "Box plot between target and oldpeak")

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

  #Box plot between target and ca :
ggplot(data=dataset, aes(target, ca)) + 
  geom_boxplot() + 
  theme_bw() +
  labs(title = "Box plot between target and ca")

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

ggplot(dataset, aes(x = target, y = age, fill = sex, col = sex)) +
  geom_jitter() +
  theme_minimal()

ggplot(dataset, aes(x = target, y = thalach, col = age)) +
  geom_jitter() +
  theme_minimal()

#Line Plot
ggplot(dataset, aes(y = trestbps, x = chol))+
  labs(title = "Line Plot for trestbps and chol(serum cholesterol") +
  geom_line()

ggplot(dataset, aes(y = ca, x = oldpeak))+
  labs(title = "Line Plot for oldpeak(ST depression) and ca(major
vessels(n0.))") +
  geom_line()

#Scatter Plot:
gg1 <- ggplot(dataset, aes(x = chol, y = trestbps)) +
  geom_point(aes(col = target, size = oldpeak))+
  geom_smooth(method = "loess", se= F) +
  xlim(c(100, 430)) + ylim(c(75, 200)) +
  labs(title = "ScatterPlot-chol Vs trestbps",
       x = "chol", y = "trestbps")
gg1

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

   #Multivariate Analysis:
plot(dataset)

#Correlogram-Correlation plot

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.1.3

p.mat <- cor_pmat(dataset)
ggcorrplot(p.mat) +
  labs(title = "Correlation Matrix")

#Heat Map
cormat <- round(cor(dataset),2)
library(reshape2)

## Warning: package 'reshape2' was built under R version 4.1.3

melted_cormat <- melt(cormat)
ggplot(data=melted_cormat,aes(x = Var1, y = Var2, fill = value))+
  geom_tile()+scale_fill_gradient(high = "green", low = "red")+
  ggtitle("Heat Map")+
  theme(plot.title = element_text(face = "bold"))+
  geom_text(aes(label = round(value, 3)))+
  scale_fill_continuous(low = "red", high = "lightgreen")

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

3. Cleaning the dataset

# ==> Cleaning the dataset :

colnames(dataset) #first column name is not clearly specified.

##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.1.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Checking for NA values:
dataset %>% count(target)

##   target   n
## 1      0 138
## 2      1 165

table(is.na(dataset))

## 
## FALSE 
##  4242

#Checking for duplicated rows:
table(duplicated(dataset))

## 
## FALSE  TRUE 
##   302     1

#Removing Duplicated rows:
dataset <- dataset[-c(165),]
dataset %>% count(target)

##   target   n
## 1      0 138
## 2      1 164

4. a) Model Creation Phase - First Model : Naive Bayes

# Splitting the dataset into the Training set and Test set
library(caTools)

## Warning: package 'caTools' was built under R version 4.1.3

set.seed(123)
split = sample.split(dataset$target, SplitRatio = 0.80)
train_set_naivebayes = subset(dataset, split == TRUE)
test_set_naivebayes = subset(dataset, split == FALSE)

library(e1071)

## Warning: package 'e1071' was built under R version 4.1.3

classifier_naivebayes = naiveBayes(x = train_set_naivebayes[-14],
                        y = train_set_naivebayes$target)

# Predicting the Test set results
y_pred_naivebayes = predict(classifier_naivebayes, newdata = test_set_naivebayes[-14])

# Making the Confusion Matrix
confusion_matrix_naivebayes = table(test_set_naivebayes[, 14], y_pred_naivebayes)

accuracy_of_naivebayes = sum(diag(confusion_matrix_naivebayes))/sum(confusion_matrix_naivebayes)

library(scales)

## Warning: package 'scales' was built under R version 4.1.3

5. Analyzing the Model- Naive Bayes

classifier_naivebayes

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = train_set_naivebayes[-14], y = train_set_naivebayes$target)
## 
## A-priori probabilities:
## train_set_naivebayes$target
##         0         1 
## 0.4564315 0.5435685 
## 
## Conditional probabilities:
##                            age
## train_set_naivebayes$target     [,1]     [,2]
##                           0 56.13636 7.880387
##                           1 52.02290 9.209916
## 
##                            sex
## train_set_naivebayes$target      [,1]      [,2]
##                           0 0.8272727 0.3797414
##                           1 0.5496183 0.4994418
## 
##                            cp
## train_set_naivebayes$target      [,1]      [,2]
##                           0 0.4545455 0.8844878
##                           1 1.3816794 0.9401965
## 
##                            trestbps
## train_set_naivebayes$target     [,1]     [,2]
##                           0 133.6000 17.92912
##                           1 129.8092 15.85080
## 
##                            chol
## train_set_naivebayes$target     [,1]     [,2]
##                           0 250.8909 49.23271
##                           1 245.7786 56.32055
## 
##                            fbs
## train_set_naivebayes$target      [,1]      [,2]
##                           0 0.1545455 0.3631252
##                           1 0.1297710 0.3373413
## 
##                            restecg
## train_set_naivebayes$target      [,1]      [,2]
##                           0 0.4636364 0.5531772
##                           1 0.5496183 0.4994418
## 
##                            thalach
## train_set_naivebayes$target     [,1]     [,2]
##                           0 138.7455 22.15891
##                           1 159.4504 18.59165
## 
##                            exang
## train_set_naivebayes$target      [,1]      [,2]
##                           0 0.5545455 0.4992906
##                           1 0.1374046 0.3455956
## 
##                            oldpeak
## train_set_naivebayes$target      [,1]      [,2]
##                           0 1.6200000 1.2983264
##                           1 0.5473282 0.7968523
## 
##                            slope
## train_set_naivebayes$target     [,1]      [,2]
##                           0 1.145455 0.5558094
##                           1 1.618321 0.5877992
## 
##                            ca
## train_set_naivebayes$target      [,1]      [,2]
##                           0 1.1272727 1.0414271
##                           1 0.3587786 0.8326852
## 
##                            thal
## train_set_naivebayes$target     [,1]      [,2]
##                           0 2.545455 0.6858513
##                           1 2.099237 0.4447119

y_pred_naivebayes

##  [1] 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0
## [39] 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## Levels: 0 1

confusion_matrix_naivebayes

##    y_pred_naivebayes
##      0  1
##   0 23  5
##   1  6 27

percent(accuracy_of_naivebayes)

## [1] "82%"

4. b)Model Creation Phase: Second Model - Decision Tree

library(caTools)
set.seed(1200000) 
split = sample.split(dataset$target, SplitRatio = 0.63)
training_set_dt = subset(dataset, split == TRUE)
test_set_dt = subset(dataset, split == FALSE)


# Feature scaling
training_set_dt[,c(1,4,5,8,10,12)] = scale(training_set_dt[,c(1,4,5,8,10,12)])
test_set_dt[,c(1,4,5,8,10,12)] = scale(test_set_dt[,c(1,4,5,8,10,12)])


# Fitting Decision Tree Classification to the Training set

library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.1.3

classifier_decisiontree = rpart(formula = target ~ .,data = training_set_dt, method = 'class')

rpart.plot(classifier_decisiontree)

# Predicting the Test set results
y_pred_decisiontree = predict(classifier_decisiontree, newdata = test_set_dt[-14], type = 'class')

# Making the Confusion Matrix
confusionmatrix_dt = table(test_set_dt[, 14], y_pred_decisiontree)

Accuracy_for_Decision_Tree = sum(diag(confusionmatrix_dt))/sum(confusionmatrix_dt)

5. Analysing the model - Decision Tree

classifier_decisiontree

## n= 190 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 190 87 1 (0.45789474 0.54210526)  
##    2) cp< 0.5 96 27 0 (0.71875000 0.28125000)  
##      4) ca>=-0.2720426 54  4 0 (0.92592593 0.07407407) *
##      5) ca< -0.2720426 42 19 1 (0.45238095 0.54761905)  
##       10) thal>=2.5 17  4 0 (0.76470588 0.23529412) *
##       11) thal< 2.5 25  6 1 (0.24000000 0.76000000)  
##         22) exang>=0.5 9  4 0 (0.55555556 0.44444444) *
##         23) exang< 0.5 16  1 1 (0.06250000 0.93750000) *
##    3) cp>=0.5 94 18 1 (0.19148936 0.80851064)  
##      6) oldpeak>=0.7833287 10  3 0 (0.70000000 0.30000000) *
##      7) oldpeak< 0.7833287 84 11 1 (0.13095238 0.86904762) *

y_pred_decisiontree

##   1   3   9  11  12  15  17  18  23  26  27  28  29  33  38  40  42  45  46  47 
##   0   1   1   1   1   1   1   0   1   1   1   1   1   1   1   1   1   1   1   1 
##  49  57  58  59  60  64  65  73  74  78  79  84  87  91  95  97 100 109 111 113 
##   1   1   1   1   0   1   1   1   0   1   1   1   1   1   1   1   1   1   0   1 
## 114 116 117 118 119 120 122 129 130 135 136 138 141 145 150 154 155 160 161 162 
##   0   1   0   1   1   0   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 164 166 167 170 172 173 174 175 180 181 182 183 185 188 189 191 192 200 201 203 
##   1   0   0   0   1   1   0   0   0   0   0   1   0   0   1   0   0   0   0   0 
## 204 205 215 216 217 218 222 224 226 227 229 233 241 242 243 244 245 248 249 255 
##   1   0   0   0   1   0   0   0   0   1   1   0   0   0   0   0   0   1   1   1 
## 263 264 270 271 272 278 280 282 290 293 294 300 
##   0   0   0   0   0   1   0   0   0   0   1   1 
## Levels: 0 1

confusionmatrix_dt

##    y_pred_decisiontree
##      0  1
##   0 37 14
##   1  8 53

Accuracy_for_Decision_Tree

## [1] 0.8035714

Machine Learning - Classification[Heart Disease Dataset(uci)]

by guru

2022-11-11