1. Import libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(readr) 
library(caret)
## Warning: package 'caret' was built under R version 4.2.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(ggplot2)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

2. Importing dataset

df <- read.csv("C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/heart.csv")
head(df)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  52   1  0      125  212   0       1     168     0     1.0     2  2    3
## 2  53   1  0      140  203   1       0     155     1     3.1     0  0    3
## 3  70   1  0      145  174   0       1     125     1     2.6     0  0    3
## 4  61   1  0      148  203   0       1     161     0     0.0     2  1    3
## 5  62   0  0      138  294   1       1     106     0     1.9     1  3    2
## 6  58   0  0      100  248   0       0     122     0     1.0     1  0    2
##   target
## 1      0
## 2      0
## 3      0
## 4      0
## 5      0
## 6      1

3. Try to understand dataset

## Getting the dimensions of the dataframe
dim(df)
## [1] 1025   14
## Getting the structure of the dataframe
str(df)
## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : int  52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : int  1 1 1 1 0 0 1 1 1 1 ...
##  $ cp      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps: int  125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : int  212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : int  0 1 0 0 1 0 0 0 0 0 ...
##  $ restecg : int  1 0 1 1 1 0 2 0 0 0 ...
##  $ thalach : int  168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : int  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : int  2 0 0 2 1 1 0 1 2 1 ...
##  $ ca      : int  2 0 0 1 3 0 3 1 0 2 ...
##  $ thal    : int  3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : int  0 0 0 0 0 1 0 0 0 0 ...
glimpse(df)
## Rows: 1,025
## Columns: 14
## $ age      <int> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51, 52, 3…
## $ sex      <int> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1…
## $ cp       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 2…
## $ trestbps <int> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112, 132, 1…
## $ chol     <int> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149, 341, 2…
## $ fbs      <int> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ restecg  <int> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0…
## $ thalach  <int> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125, 136, 1…
## $ exang    <int> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0…
## $ oldpeak  <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6, 3.0, 0…
## $ slope    <int> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1…
## $ ca       <int> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0…
## $ thal     <int> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2, 2, 2…
## $ target   <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0…
summary(df)
##       age             sex               cp            trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.0000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :1.0000   Median :130.0  
##  Mean   :54.43   Mean   :0.6956   Mean   :0.9424   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.0000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.0000   Max.   :200.0  
##       chol          fbs            restecg          thalach     
##  Min.   :126   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:132.0  
##  Median :240   Median :0.0000   Median :1.0000   Median :152.0  
##  Mean   :246   Mean   :0.1493   Mean   :0.5298   Mean   :149.1  
##  3rd Qu.:275   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.000   Median :0.0000  
##  Mean   :0.3366   Mean   :1.072   Mean   :1.385   Mean   :0.7541  
##  3rd Qu.:1.0000   3rd Qu.:1.800   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.324   Mean   :0.5132  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000

4. Exploratory data analysis

## Count missing values in each column
colSums(is.na(df))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0
## Visualizing the distribution of the target variable
ggplot(df, aes(x = target)) +
  geom_bar(fill = "skyblue", color = "black", stat = "count") +
  labs(title = "Distribution of Target Variable", x = "Target", y = "Frequency") # patients with heart disease is slightly more than those without heart disease.

## Creating two separate plots for heart disease and no heart disease
ggplot(df, aes(x = age, fill = factor(target))) +
  geom_histogram(binwidth = 4, position = "dodge", color = 'grey') +
  scale_fill_manual(values = c("0" = "blue", "1" = "red"), 
                    labels = c("No Disease", "Disease")) +
  facet_wrap(~target, scales = "free_y") # a histogram to show the relation between heart disease and age

## Visualizing the relationship between gender and heart disease
ggplot(df, aes(x = factor(sex), fill = factor(target))) + geom_bar() +
  labs(title = "Distribution of Gender by Heart Disease Status",
       x = "Gender (0 = Female, 1 = Male)", y = "Frequency") +
  scale_fill_manual(values = c("0" = "blue", "1" = "red"), 
                    labels = c("No Disease", "Disease"))

## Correlation matrix
ggcorr(df, label = TRUE, label_size = 2.5, hjust = 1, layout.exp = 2)

# The variables “slope”, “thalach” and “cp” have a positive correlation with the target variable
# On the other hand, the variable “fbs” has 0 correlation indicating it doesn’t have any relationship with our target variable.

5. Data preparing before machine learning

5.1 Data Encoding

heart <- df %>%
  mutate(sex = as.factor(sex),
         cp = as.factor(cp),
         fbs = as.factor(fbs),
         restecg = as.factor(restecg),
         exang = as.factor(exang),
         slope = as.factor(slope),
         ca = as.factor(ca),
         thal = as.factor(thal),
         target = as.factor(target)) # This encoding allows for efficient handling of categorical variables in statistical models and data analysis.
## Checking the structure of the dataset
str(heart)
## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : int  52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ trestbps: int  125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : int  212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 1 1 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 1 3 1 1 1 ...
##  $ thalach : int  168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
##  $ oldpeak : num  1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 3 1 1 3 2 2 1 2 3 2 ...
##  $ ca      : Factor w/ 5 levels "0","1","2","3",..: 3 1 1 2 4 1 4 2 1 3 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 3 3 2 4 4 3 ...
##  $ target  : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...

5.2. Normalization and Data Splitting

## Feature selection
features <- df[, c('age', 'sex',  'cp', 'trestbps', 'chol', 'restecg', 'thalach', 
                   'exang', 'oldpeak', 'slope', 'ca', 'thal')]
target <- df$target

## Data normalization
preprocessParams <- preProcess(features, method = c("center", "scale"))
features_normalized <- predict(preprocessParams, features)
## Splitting the data
split <- createDataPartition(target, p = 0.8, list = FALSE)
X_train <- features_normalized[split, ]
X_test <- features_normalized[-split, ]
Y_train <- target[split]
Y_test <- target[-split]

## Print the shape of the training and test sets
print(paste("X_train shape:", paste(dim(X_train), collapse = "x")))
## [1] "X_train shape: 820x12"
print(paste("X_test shape:", paste(dim(X_test), collapse = "x")))
## [1] "X_test shape: 205x12"
## Combine features and target into a single data frame
train_data <- as.data.frame(cbind(target = Y_train, X_train))

6. Training the logistic regression model

model <- glm(target ~ ., data = train_data, family = "binomial")

7. Model testing (Evaluation)

## Making predictions on the test set
predictions <- predict(model, newdata = as.data.frame(X_test), type = "response")
## Converting probabilities to binary predictions based on threshold 0.5
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)
## Combining actual values and predicted values into a data frame
result <- data.frame(actual = Y_test, predicted = binary_predictions)
## Evaluating the model
confusionMatrix(data = as.factor(binary_predictions), reference = as.factor(Y_test), 
                positive = "1") # positive = "1": very important
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 77 12
##          1 18 98
##                                          
##                Accuracy : 0.8537         
##                  95% CI : (0.7977, 0.899)
##     No Information Rate : 0.5366         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.7045         
##                                          
##  Mcnemar's Test P-Value : 0.3613         
##                                          
##             Sensitivity : 0.8909         
##             Specificity : 0.8105         
##          Pos Pred Value : 0.8448         
##          Neg Pred Value : 0.8652         
##              Prevalence : 0.5366         
##          Detection Rate : 0.4780         
##    Detection Prevalence : 0.5659         
##       Balanced Accuracy : 0.8507         
##                                          
##        'Positive' Class : 1              
## 
## Create a confusion matrix
conf_matrix <- table(factor(binary_predictions, levels = c("0", "1")), 
                     factor(Y_test, levels = c("0", "1")))
## Set the dimension names of the confusion matrix
dimnames(conf_matrix) <- list(Actual = c("0", "1"), Predicted = c("0", "1"))
## Plot the fourfold plot with color and main title
fourfoldplot(conf_matrix, color = c("lightblue", "yellow"), main = "Confusion Matrix")

# Assuming you have a test dataset named 'test_data' with the same features the training
## Combine features and target into a single data frame for the test set
test_data <- as.data.frame(cbind(target = Y_test, X_test))

## Making predictions on the test set
predictions <- predict(model, newdata = as.data.frame(test_data[, -1]),type ="response")

## Converting probabilities to binary predictions based on threshold 0.5
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)

## Combining actual values and predicted values into a data frame
result <- data.frame(actual = test_data$target, predicted = binary_predictions)

## Displaying the results
print(result)
##      actual predicted
## 4         0         0
## 12        0         0
## 17        1         1
## 21        0         0
## 25        1         1
## 37        1         0
## 39        1         0
## 42        1         1
## 44        0         0
## 46        1         1
## 48        0         0
## 49        1         1
## 50        0         0
## 52        0         0
## 55        0         0
## 67        1         1
## 72        0         0
## 73        0         0
## 76        1         1
## 83        0         0
## 84        1         1
## 89        0         0
## 93        0         0
## 96        1         1
## 97        1         1
## 98        0         0
## 105       1         1
## 116       0         0
## 117       0         0
## 118       0         0
## 122       0         0
## 127       1         0
## 135       1         1
## 147       1         1
## 161       0         0
## 162       1         1
## 165       0         0
## 170       1         1
## 171       1         1
## 176       0         0
## 181       0         0
## 185       1         1
## 189       0         0
## 190       0         0
## 200       0         0
## 213       0         0
## 217       0         0
## 224       1         1
## 232       1         1
## 237       0         0
## 243       1         1
## 249       1         1
## 256       1         1
## 258       1         1
## 260       1         0
## 261       1         1
## 262       1         1
## 267       0         0
## 273       1         1
## 276       0         0
## 278       1         1
## 279       0         0
## 290       0         1
## 297       0         0
## 298       0         0
## 310       1         1
## 326       1         1
## 332       0         0
## 333       1         1
## 334       1         1
## 335       0         0
## 337       1         1
## 339       0         1
## 349       0         0
## 354       1         0
## 370       1         1
## 374       0         0
## 380       1         1
## 386       1         1
## 387       1         1
## 388       0         0
## 399       1         0
## 415       0         1
## 420       1         1
## 422       1         1
## 427       1         1
## 432       0         0
## 435       1         1
## 446       1         1
## 450       0         1
## 457       0         1
## 458       1         1
## 462       1         1
## 466       1         1
## 471       1         1
## 475       0         1
## 477       0         0
## 485       0         1
## 495       1         1
## 516       0         0
## 520       0         0
## 523       1         1
## 524       0         0
## 526       1         1
## 529       1         0
## 530       1         1
## 532       1         1
## 534       1         1
## 538       1         1
## 541       0         0
## 546       0         1
## 555       0         1
## 559       1         1
## 564       0         0
## 570       1         1
## 577       1         1
## 584       0         0
## 585       0         0
## 586       1         1
## 587       0         0
## 589       0         0
## 591       1         1
## 593       0         0
## 595       0         0
## 597       1         1
## 598       1         1
## 604       1         1
## 612       0         0
## 640       1         1
## 652       1         1
## 653       1         1
## 654       0         0
## 656       1         1
## 661       0         0
## 674       1         1
## 676       0         1
## 678       0         0
## 687       0         1
## 693       0         0
## 698       0         0
## 714       1         1
## 716       1         1
## 718       0         1
## 719       1         1
## 723       1         1
## 724       1         1
## 739       0         0
## 743       0         0
## 745       1         1
## 747       0         1
## 748       0         0
## 754       1         1
## 762       1         1
## 764       1         1
## 765       0         0
## 782       0         0
## 785       1         1
## 795       0         0
## 796       1         1
## 798       0         0
## 801       0         0
## 802       1         1
## 803       0         1
## 812       0         1
## 816       1         0
## 826       1         1
## 833       1         1
## 851       0         0
## 856       1         1
## 858       1         1
## 859       1         1
## 862       0         1
## 865       0         1
## 869       1         1
## 872       1         1
## 879       0         0
## 887       0         0
## 906       0         0
## 914       1         0
## 915       0         0
## 922       0         0
## 923       1         1
## 933       1         1
## 938       0         1
## 945       0         0
## 946       1         1
## 956       1         0
## 963       1         0
## 965       1         1
## 966       1         1
## 968       1         1
## 977       0         0
## 980       0         0
## 981       1         1
## 984       1         0
## 993       1         1
## 994       0         0
## 996       1         1
## 1002      1         1
## 1007      1         1
## 1009      1         1
## 1011      0         0
## 1014      0         0
## 1020      1         1
## 1024      1         1
# ref: https://www.geeksforgeeks.org/heart-disease-prediction-using-logistic-regression-in-r/