3. Understand the dataset
## Dimensions of the data frame: number of rows, then number of columns
dim(df)
## [1] 1025 14
## Compact structure of the data frame: each column's type and first values
str(df)
## 'data.frame': 1025 obs. of 14 variables:
## $ age : int 52 53 70 61 62 58 58 55 46 54 ...
## $ sex : int 1 1 1 1 0 0 1 1 1 1 ...
## $ cp : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trestbps: int 125 140 145 148 138 100 114 160 120 122 ...
## $ chol : int 212 203 174 203 294 248 318 289 249 286 ...
## $ fbs : int 0 1 0 0 1 0 0 0 0 0 ...
## $ restecg : int 1 0 1 1 1 0 2 0 0 0 ...
## $ thalach : int 168 155 125 161 106 122 140 145 144 116 ...
## $ exang : int 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
## $ slope : int 2 0 0 2 1 1 0 1 2 1 ...
## $ ca : int 2 0 0 1 3 0 3 1 0 2 ...
## $ thal : int 3 3 3 3 2 2 1 3 3 2 ...
## $ target : int 0 0 0 0 0 1 0 0 0 0 ...
## Column-wise preview: row/column counts, then each column's type and values
glimpse(df)
## Rows: 1,025
## Columns: 14
## $ age <int> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51, 52, 3…
## $ sex <int> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1…
## $ cp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 2…
## $ trestbps <int> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112, 132, 1…
## $ chol <int> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149, 341, 2…
## $ fbs <int> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ restecg <int> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0…
## $ thalach <int> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125, 136, 1…
## $ exang <int> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0…
## $ oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6, 3.0, 0…
## $ slope <int> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1…
## $ ca <int> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0…
## $ thal <int> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2, 2, 2…
## $ target <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0…
## Per-column summary statistics (min, quartiles, median, mean, max)
summary(df)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.0000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :1.0000 Median :130.0
## Mean :54.43 Mean :0.6956 Mean :0.9424 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.0000 Max. :200.0
## chol fbs restecg thalach
## Min. :126 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132.0
## Median :240 Median :0.0000 Median :1.0000 Median :152.0
## Mean :246 Mean :0.1493 Mean :0.5298 Mean :149.1
## 3rd Qu.:275 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :1.000 Median :0.0000
## Mean :0.3366 Mean :1.072 Mean :1.385 Mean :0.7541
## 3rd Qu.:1.0000 3rd Qu.:1.800 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.324 Mean :0.5132
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
4. Exploratory data analysis
## Count missing values (NA) in each column
colSums(is.na(df))
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal target
## 0 0 0 0 0 0
## Visualizing the distribution of the target variable.
## `target` is stored as an integer (0/1); wrapping it in factor() gives a
## discrete x axis with one tick per class instead of a continuous scale.
ggplot(df, aes(x = factor(target))) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of Target Variable",
    x = "Target",
    y = "Frequency"
  )
## Patients with heart disease (target = 1) are slightly more numerous than
## those without heart disease (target = 0).

## Age distribution by heart-disease status, drawn as one panel per target
## class. scales = "free_y" gives each panel its own y axis so the shapes of
## the two distributions can be compared even though the class sizes differ.
ggplot(df, aes(x = age, fill = factor(target))) +
  geom_histogram(binwidth = 4, position = "dodge", color = "grey") +
  scale_fill_manual(
    name = "Heart disease",
    values = c("0" = "blue", "1" = "red"),
    labels = c("No Disease", "Disease")
  ) +
  facet_wrap(~target, scales = "free_y") +
  labs(
    title = "Distribution of Age by Heart Disease Status",
    x = "Age",
    y = "Frequency"
  )

## Stacked bar chart: number of patients per gender, split by disease status
ggplot(df, aes(x = factor(sex), fill = factor(target))) +
  geom_bar() +
  scale_fill_manual(
    labels = c("No Disease", "Disease"),
    values = c("0" = "blue", "1" = "red")
  ) +
  ggtitle("Distribution of Gender by Heart Disease Status") +
  xlab("Gender (0 = Female, 1 = Male)") +
  ylab("Frequency")

## Pairwise correlation heat map of the columns, with each coefficient
## printed inside its cell
ggcorr(
  data = df,
  label = TRUE,
  label_size = 2.5,
  hjust = 1,
  layout.exp = 2
)

# The variables "slope", "thalach" and "cp" have a positive correlation with the target variable.
# On the other hand, the variable "fbs" has a correlation of approximately 0 with the target, i.e. no linear association; this alone does not rule out a non-linear relationship.
7. Model testing (Evaluation)
## Making predictions on the test set; type = "response" returns values on
## the response scale, which the threshold below treats as probabilities.
predictions <- predict(
  model,
  newdata = as.data.frame(X_test),
  type = "response"
)
## Converting probabilities to binary predictions based on threshold 0.5
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)
## Combining actual values and predicted values into a data frame
result <- data.frame(actual = Y_test, predicted = binary_predictions)
## Evaluating the model. Both factors are given explicit levels so the call
## still works when only one class happens to be predicted; as.factor() would
## then produce a one-level factor that does not match the reference.
## positive = "1" makes class 1 the positive class for sensitivity,
## specificity and the other class-specific statistics.
confusionMatrix(
  data = factor(binary_predictions, levels = c(0, 1)),
  reference = factor(Y_test, levels = c(0, 1)),
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 77 12
## 1 18 98
##
## Accuracy : 0.8537
## 95% CI : (0.7977, 0.899)
## No Information Rate : 0.5366
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7045
##
## Mcnemar's Test P-Value : 0.3613
##
## Sensitivity : 0.8909
## Specificity : 0.8105
## Pos Pred Value : 0.8448
## Neg Pred Value : 0.8652
## Prevalence : 0.5366
## Detection Rate : 0.4780
## Detection Prevalence : 0.5659
## Balanced Accuracy : 0.8507
##
## 'Positive' Class : 1
##
## Create a confusion matrix. table() puts its first argument on the rows and
## its second on the columns, so predictions are the rows and actual labels
## are the columns; the dimensions are named accordingly. Explicit factor
## levels keep the table 2 x 2 even if one class never occurs.
conf_matrix <- table(
  Predicted = factor(binary_predictions, levels = c("0", "1")),
  Actual = factor(Y_test, levels = c("0", "1"))
)
## Plot the fourfold plot with color and main title
fourfoldplot(
  conf_matrix,
  color = c("lightblue", "yellow"),
  main = "Confusion Matrix"
)

# Repeat the evaluation from a single data frame, 'test_data', that holds the
# target in its first column followed by the same features used for training.
## Combine features and target into a single data frame for the test set
test_data <- as.data.frame(
  cbind(target = Y_test, X_test)
)
## Predict from the feature columns only (everything except column 1)
predictions <- predict(
  model,
  newdata = as.data.frame(test_data[, -1]),
  type = "response"
)
## Threshold at 0.5: values of at least 0.5 become 1, the rest become 0
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)
## Put the actual and predicted labels side by side
result <- data.frame(
  actual = test_data[["target"]],
  predicted = binary_predictions
)
## Display every row of the comparison
print(result)
## actual predicted
## 4 0 0
## 12 0 0
## 17 1 1
## 21 0 0
## 25 1 1
## 37 1 0
## 39 1 0
## 42 1 1
## 44 0 0
## 46 1 1
## 48 0 0
## 49 1 1
## 50 0 0
## 52 0 0
## 55 0 0
## 67 1 1
## 72 0 0
## 73 0 0
## 76 1 1
## 83 0 0
## 84 1 1
## 89 0 0
## 93 0 0
## 96 1 1
## 97 1 1
## 98 0 0
## 105 1 1
## 116 0 0
## 117 0 0
## 118 0 0
## 122 0 0
## 127 1 0
## 135 1 1
## 147 1 1
## 161 0 0
## 162 1 1
## 165 0 0
## 170 1 1
## 171 1 1
## 176 0 0
## 181 0 0
## 185 1 1
## 189 0 0
## 190 0 0
## 200 0 0
## 213 0 0
## 217 0 0
## 224 1 1
## 232 1 1
## 237 0 0
## 243 1 1
## 249 1 1
## 256 1 1
## 258 1 1
## 260 1 0
## 261 1 1
## 262 1 1
## 267 0 0
## 273 1 1
## 276 0 0
## 278 1 1
## 279 0 0
## 290 0 1
## 297 0 0
## 298 0 0
## 310 1 1
## 326 1 1
## 332 0 0
## 333 1 1
## 334 1 1
## 335 0 0
## 337 1 1
## 339 0 1
## 349 0 0
## 354 1 0
## 370 1 1
## 374 0 0
## 380 1 1
## 386 1 1
## 387 1 1
## 388 0 0
## 399 1 0
## 415 0 1
## 420 1 1
## 422 1 1
## 427 1 1
## 432 0 0
## 435 1 1
## 446 1 1
## 450 0 1
## 457 0 1
## 458 1 1
## 462 1 1
## 466 1 1
## 471 1 1
## 475 0 1
## 477 0 0
## 485 0 1
## 495 1 1
## 516 0 0
## 520 0 0
## 523 1 1
## 524 0 0
## 526 1 1
## 529 1 0
## 530 1 1
## 532 1 1
## 534 1 1
## 538 1 1
## 541 0 0
## 546 0 1
## 555 0 1
## 559 1 1
## 564 0 0
## 570 1 1
## 577 1 1
## 584 0 0
## 585 0 0
## 586 1 1
## 587 0 0
## 589 0 0
## 591 1 1
## 593 0 0
## 595 0 0
## 597 1 1
## 598 1 1
## 604 1 1
## 612 0 0
## 640 1 1
## 652 1 1
## 653 1 1
## 654 0 0
## 656 1 1
## 661 0 0
## 674 1 1
## 676 0 1
## 678 0 0
## 687 0 1
## 693 0 0
## 698 0 0
## 714 1 1
## 716 1 1
## 718 0 1
## 719 1 1
## 723 1 1
## 724 1 1
## 739 0 0
## 743 0 0
## 745 1 1
## 747 0 1
## 748 0 0
## 754 1 1
## 762 1 1
## 764 1 1
## 765 0 0
## 782 0 0
## 785 1 1
## 795 0 0
## 796 1 1
## 798 0 0
## 801 0 0
## 802 1 1
## 803 0 1
## 812 0 1
## 816 1 0
## 826 1 1
## 833 1 1
## 851 0 0
## 856 1 1
## 858 1 1
## 859 1 1
## 862 0 1
## 865 0 1
## 869 1 1
## 872 1 1
## 879 0 0
## 887 0 0
## 906 0 0
## 914 1 0
## 915 0 0
## 922 0 0
## 923 1 1
## 933 1 1
## 938 0 1
## 945 0 0
## 946 1 1
## 956 1 0
## 963 1 0
## 965 1 1
## 966 1 1
## 968 1 1
## 977 0 0
## 980 0 0
## 981 1 1
## 984 1 0
## 993 1 1
## 994 0 0
## 996 1 1
## 1002 1 1
## 1007 1 1
## 1009 1 1
## 1011 0 0
## 1014 0 0
## 1020 1 1
## 1024 1 1
# ref: https://www.geeksforgeeks.org/heart-disease-prediction-using-logistic-regression-in-r/