1. Importing Dataset
# Read the heart-disease data into `dataset`.
# The CSV is addressed through file.path() rather than setwd(), so loading the
# data does not change the working directory of the R session.
# NOTE(review): the first column arrives as "ï..age" (see the str() output
# below), which looks like a UTF-8 byte-order mark in the file; passing
# fileEncoding = "UTF-8-BOM" to read.csv() would probably make the later
# rename unnecessary - confirm before changing.
data_dir <- "D:/PDFs/5th semester/Data Science Programming/lab/R files"
dataset <- read.csv(file.path(data_dir, "heart.csv"))
2. Visualizing and Analyzing the dataset
# --> Analyzing the dataset :
# View() opens the data frame in the interactive spreadsheet-style viewer.
View(dataset)
# str() lists every column with its type and first values (output below).
str(dataset)
## 'data.frame': 303 obs. of 14 variables:
## $ ï..age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : int 1 1 0 1 0 1 0 1 1 1 ...
## $ cp : int 3 2 1 1 0 0 1 1 2 2 ...
## $ trestbps: int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : int 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : int 0 1 0 1 1 1 0 1 1 1 ...
## $ thalach : int 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : int 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : int 0 0 2 2 2 1 1 2 2 2 ...
## $ ca : int 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : int 1 2 2 2 2 1 2 3 3 2 ...
## $ target : int 1 1 1 1 1 1 1 1 1 1 ...
# Per-column minimum, quartiles, mean and maximum (output below).
summary(dataset)
## ï..age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.000 Min. : 94.0
## 1st Qu.:47.50 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:120.0
## Median :55.00 Median :1.0000 Median :1.000 Median :130.0
## Mean :54.37 Mean :0.6832 Mean :0.967 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.000 Max. :200.0
## chol fbs restecg thalach
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5
## Median :240.0 Median :0.0000 Median :1.0000 Median :153.0
## Mean :246.3 Mean :0.1485 Mean :0.5281 Mean :149.6
## 3rd Qu.:274.5 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.00 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.80 Median :1.000 Median :0.0000
## Mean :0.3267 Mean :1.04 Mean :1.399 Mean :0.7294
## 3rd Qu.:1.0000 3rd Qu.:1.60 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.20 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.314 Mean :0.5446
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
# Data cleaning: the first column was read in with a garbled name, so give it
# the plain name "age" before it is used in the plots.
names(dataset)
## [1] "ï..age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
names(dataset)[1] <- "age"
names(dataset)
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
#Analyzing through Plots:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Univariate analysis ----
# Bar plot of the class balance (how many rows have target 0 vs 1).
# `target` is stored as an integer, so it is wrapped in factor(): the raw
# integer is a continuous variable and cannot supply the two discrete groups
# that scale_fill_discrete() colours and labels.
# Axis labels are set once in labs(); the title is kept on a single line.
gg0 <- ggplot(dataset, aes(x = factor(target), fill = factor(target))) +
  geom_bar() +
  scale_fill_discrete(
    name = "Heart Disease",
    labels = c("Absence", "Presence")
  ) +
  labs(
    title = "BarPlot-Analysis of Presence and Absence of Heart Disease",
    x = "Heart Disease",
    y = "Count"
  )
gg0
# Histogram of serum cholesterol (50 bins).
ggplot(dataset, aes(x = chol)) +
  geom_histogram(bins = 50) +
  ggtitle("Histogram of chol") +
  theme_bw()
# Histogram of resting blood pressure (50 bins).
ggplot(dataset, aes(x = trestbps)) +
  geom_histogram(bins = 50) +
  ggtitle("Histogram of trestbps") +
  theme_bw()
# Counts of target, with side-by-side bars for each level of one categorical
# column per plot (sex, cp, restecg, thal, slope).
ggplot(dataset, aes(x = target, fill = factor(sex))) +
  geom_bar(position = "dodge") +
  ggtitle("Barplot of target with sex as factor")
ggplot(dataset, aes(x = target, fill = factor(cp))) +
  geom_bar(position = "dodge") +
  ggtitle("Bar plot target with cp as factor")
ggplot(dataset, aes(x = target, fill = factor(restecg))) +
  geom_bar(position = "dodge") +
  ggtitle("Bar plot of target with restecg as factor")
ggplot(dataset, aes(x = target, fill = factor(thal))) +
  geom_bar(position = "dodge") +
  ggtitle("Bar plot of target with thal as factor")
ggplot(dataset, aes(x = target, fill = factor(slope))) +
  geom_bar(position = "dodge") +
  ggtitle("Bar plot of target with slope as factor")
# Bivariate analysis ----
# Box plot of age for each target class.
# target is an integer column; mapped directly to x it is treated as a
# continuous variable, so ggplot2 draws a single box and warns
# "Continuous x aesthetic -- did you forget aes(group=...)?". Converting it to
# a factor yields one box per class. The axis and legend titles are set
# explicitly so they still read "target".
ggplot(dataset, aes(x = factor(target), y = age, col = factor(target))) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Box plot between age and target",
    x = "target",
    col = "target"
  )
# Box plots of the remaining measures for each target class.
# In every plot target is wrapped in factor(): as a raw integer on the x axis
# it is continuous, which makes ggplot2 draw one box for all rows and warn
# "Continuous x aesthetic -- did you forget aes(group=...)?". The x-axis title
# is set explicitly so it still reads "target".

# Box plot between target and trestbps:
ggplot(data = dataset, aes(x = factor(target), y = trestbps)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Box plot between target and trestbps", x = "target")

# Box plot between target and thalach:
ggplot(data = dataset, aes(x = factor(target), y = thalach)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Box plot between target and thalach", x = "target")

# Box plot between target and chol:
ggplot(data = dataset, aes(x = factor(target), y = chol)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Box plot between target and chol", x = "target")

# Box plot between target and oldpeak:
ggplot(data = dataset, aes(x = factor(target), y = oldpeak)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Box plot between target and oldpeak", x = "target")

# Box plot between target and ca:
ggplot(data = dataset, aes(x = factor(target), y = ca)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Box plot between target and ca", x = "target")
# Jittered points of age against target, shaded by sex.
ggplot(
  data = dataset,
  mapping = aes(x = target, y = age, fill = sex, colour = sex)
) +
  theme_minimal() +
  geom_jitter()
# Jittered points of thalach against target, shaded by age.
ggplot(
  data = dataset,
  mapping = aes(x = target, y = thalach, colour = age)
) +
  theme_minimal() +
  geom_jitter()
# Line plots ----
# Titles are corrected: the first was missing its closing parenthesis, and the
# second contained a stray line break and "n0." in place of "no.".

# trestbps traced against chol.
ggplot(dataset, aes(x = chol, y = trestbps)) +
  geom_line() +
  labs(title = "Line Plot for trestbps and chol(serum cholesterol)")

# ca traced against oldpeak.
ggplot(dataset, aes(x = oldpeak, y = ca)) +
  geom_line() +
  labs(
    title = "Line Plot for oldpeak(ST depression) and ca(major vessels(no.))"
  )
# Scatter plot of chol against trestbps with a loess trend line ----
# Points are coloured by target and sized by oldpeak. target is converted to a
# factor so the two classes get two distinct colours instead of a continuous
# gradient; the legend title is kept as "target".
# xlim()/ylim() drop observations outside the limits before the smoother is
# fitted, which is what the "Removed 1 rows" warnings below report.
gg1 <- ggplot(dataset, aes(x = chol, y = trestbps)) +
  geom_point(aes(col = factor(target), size = oldpeak)) +
  geom_smooth(method = "loess", se = FALSE) +
  xlim(c(100, 430)) +
  ylim(c(75, 200)) +
  labs(
    title = "ScatterPlot-chol Vs trestbps",
    x = "chol",
    y = "trestbps",
    col = "target"
  )
gg1
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
#Multivariate Analysis:
# plot() on a data frame draws a scatter-plot matrix of every pair of columns.
plot(dataset)
# Correlogram - correlation plot ----
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.1.3
# cor_pmat() returns the matrix of correlation-test p-values, not the
# correlations themselves. The plot titled "Correlation Matrix" must therefore
# be drawn from cor(dataset); the p-values are passed through `p.mat` so that
# ggcorrplot can mark the coefficients that are not significant.
p.mat <- cor_pmat(dataset)
ggcorrplot(cor(dataset), p.mat = p.mat) +
  labs(title = "Correlation Matrix")
# Heat map of the pairwise correlations ----
# Correlations are rounded to two decimals once, here, so the tile labels need
# no further rounding.
cormat <- round(cor(dataset), 2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.1.3
# melt() reshapes the square matrix into long form: Var1, Var2, value.
melted_cormat <- melt(cormat)
# A single fill scale is declared. The original added two, and ggplot2 kept
# only the later one (red -> lightgreen) while printing a
# "Scale for 'fill' is already present" message; that surviving scale is the
# one used here.
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value)) +
  scale_fill_gradient(low = "red", high = "lightgreen") +
  ggtitle("Heat Map") +
  theme(plot.title = element_text(face = "bold"))
3. Cleaning the dataset
# ==> Cleaning the dataset :
colnames(dataset) # first column already carries the name "age" assigned earlier
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Class balance before cleaning:
dataset %>% count(target)
## target n
## 1 0 138
## 2 1 165
# Checking for NA values (a single FALSE count means no cell is missing):
table(is.na(dataset))
##
## FALSE
## 4242
# Checking for duplicated rows:
table(duplicated(dataset))
##
## FALSE TRUE
## 302 1
# Removing duplicated rows.
# duplicated() flags every repeat of an earlier row, so filtering on it drops
# the duplicate found above without hard-coding its row number, and keeps
# working if the input file gains or loses rows.
dataset <- dataset[!duplicated(dataset), ]
dataset %>% count(target)
## target n
## 1 0 138
## 2 1 164
4. a) Model Creation Phase - First Model : Naive Bayes
# Hold out 20% of the rows as a test set; sample.split() builds the logical
# row mask from the target column.
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
set.seed(123)
split <- sample.split(dataset$target, SplitRatio = 0.80)
train_set_naivebayes <- dataset[split, ]
test_set_naivebayes <- dataset[!split, ]
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
# Column 14 is target: it is dropped from the predictors and passed as y.
classifier_naivebayes <- naiveBayes(
  x = train_set_naivebayes[-14],
  y = train_set_naivebayes$target
)
# Class predictions for the held-out rows
y_pred_naivebayes <- predict(
  classifier_naivebayes,
  newdata = test_set_naivebayes[-14]
)
# Confusion matrix: actual labels in rows, predicted labels in columns
confusion_matrix_naivebayes <- table(
  test_set_naivebayes[[14]],
  y_pred_naivebayes
)
# Accuracy = correctly classified (diagonal) / all test rows
accuracy_of_naivebayes <-
  sum(diag(confusion_matrix_naivebayes)) / sum(confusion_matrix_naivebayes)
library(scales)
## Warning: package 'scales' was built under R version 4.1.3
## Warning: package 'scales' was built under R version 4.1.3
5. Analyzing the Model - Naive Bayes
# Print the fitted model: class priors, then one two-column table per
# predictor for each target class (output below).
classifier_naivebayes
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = train_set_naivebayes[-14], y = train_set_naivebayes$target)
##
## A-priori probabilities:
## train_set_naivebayes$target
## 0 1
## 0.4564315 0.5435685
##
## Conditional probabilities:
## age
## train_set_naivebayes$target [,1] [,2]
## 0 56.13636 7.880387
## 1 52.02290 9.209916
##
## sex
## train_set_naivebayes$target [,1] [,2]
## 0 0.8272727 0.3797414
## 1 0.5496183 0.4994418
##
## cp
## train_set_naivebayes$target [,1] [,2]
## 0 0.4545455 0.8844878
## 1 1.3816794 0.9401965
##
## trestbps
## train_set_naivebayes$target [,1] [,2]
## 0 133.6000 17.92912
## 1 129.8092 15.85080
##
## chol
## train_set_naivebayes$target [,1] [,2]
## 0 250.8909 49.23271
## 1 245.7786 56.32055
##
## fbs
## train_set_naivebayes$target [,1] [,2]
## 0 0.1545455 0.3631252
## 1 0.1297710 0.3373413
##
## restecg
## train_set_naivebayes$target [,1] [,2]
## 0 0.4636364 0.5531772
## 1 0.5496183 0.4994418
##
## thalach
## train_set_naivebayes$target [,1] [,2]
## 0 138.7455 22.15891
## 1 159.4504 18.59165
##
## exang
## train_set_naivebayes$target [,1] [,2]
## 0 0.5545455 0.4992906
## 1 0.1374046 0.3455956
##
## oldpeak
## train_set_naivebayes$target [,1] [,2]
## 0 1.6200000 1.2983264
## 1 0.5473282 0.7968523
##
## slope
## train_set_naivebayes$target [,1] [,2]
## 0 1.145455 0.5558094
## 1 1.618321 0.5877992
##
## ca
## train_set_naivebayes$target [,1] [,2]
## 0 1.1272727 1.0414271
## 1 0.3587786 0.8326852
##
## thal
## train_set_naivebayes$target [,1] [,2]
## 0 2.545455 0.6858513
## 1 2.099237 0.4447119
# Predicted class for each test row (a factor with levels 0 and 1).
y_pred_naivebayes
## [1] 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0
## [39] 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## Levels: 0 1
# Confusion matrix: rows are the actual test labels, columns the predictions.
confusion_matrix_naivebayes
## y_pred_naivebayes
## 0 1
## 0 23 5
## 1 6 27
# Accuracy formatted as a percentage via scales::percent().
percent(accuracy_of_naivebayes)
## [1] "82%"
4. b) Model Creation Phase - Second Model : Decision Tree
# Split the data 63/37 into training and test sets for the decision tree.
library(caTools)
set.seed(1200000)
split <- sample.split(dataset$target, SplitRatio = 0.63)
training_set_dt <- subset(dataset, split == TRUE)
test_set_dt <- subset(dataset, split == FALSE)

# Feature scaling of age, trestbps, chol, thalach, oldpeak and ca
# (columns 1, 4, 5, 8, 10, 12).
# The centring and scaling values are estimated on the training set only and
# then reused for the test set. Standardising the test set with its own mean
# and standard deviation, as before, puts the two sets on different scales, so
# the split thresholds learned by the tree would not line up with the test
# values.
# NOTE(review): the printed tree, confusion matrix and accuracy recorded
# further down were produced with the earlier per-set scaling; regenerate them
# after this change.
scaled_cols <- c(1, 4, 5, 8, 10, 12)
train_scaled <- scale(training_set_dt[, scaled_cols])
training_set_dt[, scaled_cols] <- train_scaled
test_set_dt[, scaled_cols] <- scale(
  test_set_dt[, scaled_cols],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

# Fitting Decision Tree Classification to the Training set
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
classifier_decisiontree <- rpart(
  formula = target ~ .,
  data = training_set_dt,
  method = "class"
)
rpart.plot(classifier_decisiontree)

# Predicting the Test set results (column 14, target, is withheld)
y_pred_decisiontree <- predict(
  classifier_decisiontree,
  newdata = test_set_dt[-14],
  type = "class"
)

# Confusion matrix: actual labels in rows, predicted labels in columns
confusionmatrix_dt <- table(test_set_dt[, 14], y_pred_decisiontree)
# Accuracy = correctly classified (diagonal) / all test rows
Accuracy_for_Decision_Tree <-
  sum(diag(confusionmatrix_dt)) / sum(confusionmatrix_dt)
5. Analyzing the Model - Decision Tree
# Print the fitted tree as text: one line per node with its split rule, row
# count, misclassified count, predicted class and class probabilities.
classifier_decisiontree
## n= 190
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 190 87 1 (0.45789474 0.54210526)
## 2) cp< 0.5 96 27 0 (0.71875000 0.28125000)
## 4) ca>=-0.2720426 54 4 0 (0.92592593 0.07407407) *
## 5) ca< -0.2720426 42 19 1 (0.45238095 0.54761905)
## 10) thal>=2.5 17 4 0 (0.76470588 0.23529412) *
## 11) thal< 2.5 25 6 1 (0.24000000 0.76000000)
## 22) exang>=0.5 9 4 0 (0.55555556 0.44444444) *
## 23) exang< 0.5 16 1 1 (0.06250000 0.93750000) *
## 3) cp>=0.5 94 18 1 (0.19148936 0.80851064)
## 6) oldpeak>=0.7833287 10 3 0 (0.70000000 0.30000000) *
## 7) oldpeak< 0.7833287 84 11 1 (0.13095238 0.86904762) *
# Predicted class for each test row; the names printed above the values are
# the row names of the test rows.
y_pred_decisiontree
## 1 3 9 11 12 15 17 18 23 26 27 28 29 33 38 40 42 45 46 47
## 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
## 49 57 58 59 60 64 65 73 74 78 79 84 87 91 95 97 100 109 111 113
## 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1
## 114 116 117 118 119 120 122 129 130 135 136 138 141 145 150 154 155 160 161 162
## 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 164 166 167 170 172 173 174 175 180 181 182 183 185 188 189 191 192 200 201 203
## 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0
## 204 205 215 216 217 218 222 224 226 227 229 233 241 242 243 244 245 248 249 255
## 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1
## 263 264 270 271 272 278 280 282 290 293 294 300
## 0 0 0 0 0 1 0 0 0 0 1 1
## Levels: 0 1
# Confusion matrix: rows are the actual test labels, columns the predictions.
confusionmatrix_dt
## y_pred_decisiontree
## 0 1
## 0 37 14
## 1 8 53
# Share of test rows on the diagonal of the confusion matrix.
Accuracy_for_Decision_Tree
## [1] 0.8035714