1. Import libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(readr) 
library(caret)

## Warning: package 'caret' was built under R version 4.2.1

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.2.3

## Loading required package: lattice

library(ggplot2)
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

2. Importing dataset

df <- read.csv("C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/heart.csv")
head(df)

##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  52   1  0      125  212   0       1     168     0     1.0     2  2    3
## 2  53   1  0      140  203   1       0     155     1     3.1     0  0    3
## 3  70   1  0      145  174   0       1     125     1     2.6     0  0    3
## 4  61   1  0      148  203   0       1     161     0     0.0     2  1    3
## 5  62   0  0      138  294   1       1     106     0     1.9     1  3    2
## 6  58   0  0      100  248   0       0     122     0     1.0     1  0    2
##   target
## 1      0
## 2      0
## 3      0
## 4      0
## 5      0
## 6      1

3. Try to understand dataset

## Getting the dimensions of the dataframe
dim(df)

## [1] 1025   14

## Getting the structure of the dataframe
str(df)

## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : int  52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : int  1 1 1 1 0 0 1 1 1 1 ...
##  $ cp      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps: int  125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : int  212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : int  0 1 0 0 1 0 0 0 0 0 ...
##  $ restecg : int  1 0 1 1 1 0 2 0 0 0 ...
##  $ thalach : int  168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : int  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : int  2 0 0 2 1 1 0 1 2 1 ...
##  $ ca      : int  2 0 0 1 3 0 3 1 0 2 ...
##  $ thal    : int  3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : int  0 0 0 0 0 1 0 0 0 0 ...

glimpse(df)

## Rows: 1,025
## Columns: 14
## $ age      <int> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51, 52, 3…
## $ sex      <int> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1…
## $ cp       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 2…
## $ trestbps <int> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112, 132, 1…
## $ chol     <int> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149, 341, 2…
## $ fbs      <int> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ restecg  <int> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0…
## $ thalach  <int> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125, 136, 1…
## $ exang    <int> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0…
## $ oldpeak  <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6, 3.0, 0…
## $ slope    <int> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1…
## $ ca       <int> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0…
## $ thal     <int> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2, 2, 2…
## $ target   <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0…

summary(df)

##       age             sex               cp            trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.0000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :1.0000   Median :130.0  
##  Mean   :54.43   Mean   :0.6956   Mean   :0.9424   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.0000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.0000   Max.   :200.0  
##       chol          fbs            restecg          thalach     
##  Min.   :126   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:132.0  
##  Median :240   Median :0.0000   Median :1.0000   Median :152.0  
##  Mean   :246   Mean   :0.1493   Mean   :0.5298   Mean   :149.1  
##  3rd Qu.:275   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.000   Median :0.0000  
##  Mean   :0.3366   Mean   :1.072   Mean   :1.385   Mean   :0.7541  
##  3rd Qu.:1.0000   3rd Qu.:1.800   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.324   Mean   :0.5132  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000

4. Exploratory data analysis

## Count missing values in each column
colSums(is.na(df))

##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0

## Visualizing the distribution of the target variable
ggplot(df, aes(x = target)) +
  geom_bar(fill = "skyblue", color = "black", stat = "count") +
  labs(title = "Distribution of Target Variable", x = "Target", y = "Frequency") # patients with heart disease is slightly more than those without heart disease.

## Creating two separate plots for heart disease and no heart disease
ggplot(df, aes(x = age, fill = factor(target))) +
  geom_histogram(binwidth = 4, position = "dodge", color = 'grey') +
  scale_fill_manual(values = c("0" = "blue", "1" = "red"), 
                    labels = c("No Disease", "Disease")) +
  facet_wrap(~target, scales = "free_y") # a histogram to show the relation between heart disease and age

## Visualizing the relationship between gender and heart disease
ggplot(df, aes(x = factor(sex), fill = factor(target))) + geom_bar() +
  labs(title = "Distribution of Gender by Heart Disease Status",
       x = "Gender (0 = Female, 1 = Male)", y = "Frequency") +
  scale_fill_manual(values = c("0" = "blue", "1" = "red"), 
                    labels = c("No Disease", "Disease"))

## Correlation matrix
ggcorr(df, label = TRUE, label_size = 2.5, hjust = 1, layout.exp = 2)

# The variables “slope”, “thalach” and “cp” have a positive correlation with the target variable
# On the other hand, the variable “fbs” has 0 correlation indicating it doesn’t have any relationship with our target variable.

5. Data preparing before machine learning

5.1 Data Encoding

heart <- df %>%
  mutate(sex = as.factor(sex),
         cp = as.factor(cp),
         fbs = as.factor(fbs),
         restecg = as.factor(restecg),
         exang = as.factor(exang),
         slope = as.factor(slope),
         ca = as.factor(ca),
         thal = as.factor(thal),
         target = as.factor(target)) # This encoding allows for efficient handling of categorical variables in statistical models and data analysis.

## Checking the structure of the dataset
str(heart)

## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : int  52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ trestbps: int  125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : int  212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 1 1 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 1 3 1 1 1 ...
##  $ thalach : int  168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
##  $ oldpeak : num  1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 3 1 1 3 2 2 1 2 3 2 ...
##  $ ca      : Factor w/ 5 levels "0","1","2","3",..: 3 1 1 2 4 1 4 2 1 3 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 3 3 2 4 4 3 ...
##  $ target  : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...

5.2. Normalization and Data Splitting

## Feature selection
features <- df[, c('age', 'sex',  'cp', 'trestbps', 'chol', 'restecg', 'thalach', 
                   'exang', 'oldpeak', 'slope', 'ca', 'thal')]
target <- df$target

## Data normalization
preprocessParams <- preProcess(features, method = c("center", "scale"))
features_normalized <- predict(preprocessParams, features)

## Splitting the data
split <- createDataPartition(target, p = 0.8, list = FALSE)
X_train <- features_normalized[split, ]
X_test <- features_normalized[-split, ]
Y_train <- target[split]
Y_test <- target[-split]

## Print the shape of the training and test sets
print(paste("X_train shape:", paste(dim(X_train), collapse = "x")))

## [1] "X_train shape: 820x12"

print(paste("X_test shape:", paste(dim(X_test), collapse = "x")))

## [1] "X_test shape: 205x12"

## Combine features and target into a single data frame
train_data <- as.data.frame(cbind(target = Y_train, X_train))

6. Training the logistic regression model

model <- glm(target ~ ., data = train_data, family = "binomial")

7. Model testing (Evaluation)

## Making predictions on the test set
predictions <- predict(model, newdata = as.data.frame(X_test), type = "response")

## Converting probabilities to binary predictions based on threshold 0.5
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)

## Combining actual values and predicted values into a data frame
result <- data.frame(actual = Y_test, predicted = binary_predictions)

## Evaluating the model
confusionMatrix(data = as.factor(binary_predictions), reference = as.factor(Y_test), 
                positive = "1") # positive = "1": very important

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 77 12
##          1 18 98
##                                          
##                Accuracy : 0.8537         
##                  95% CI : (0.7977, 0.899)
##     No Information Rate : 0.5366         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.7045         
##                                          
##  Mcnemar's Test P-Value : 0.3613         
##                                          
##             Sensitivity : 0.8909         
##             Specificity : 0.8105         
##          Pos Pred Value : 0.8448         
##          Neg Pred Value : 0.8652         
##              Prevalence : 0.5366         
##          Detection Rate : 0.4780         
##    Detection Prevalence : 0.5659         
##       Balanced Accuracy : 0.8507         
##                                          
##        'Positive' Class : 1              
##

## Create a confusion matrix
conf_matrix <- table(factor(binary_predictions, levels = c("0", "1")), 
                     factor(Y_test, levels = c("0", "1")))

## Set the dimension names of the confusion matrix
dimnames(conf_matrix) <- list(Actual = c("0", "1"), Predicted = c("0", "1"))

## Plot the fourfold plot with color and main title
fourfoldplot(conf_matrix, color = c("lightblue", "yellow"), main = "Confusion Matrix")

# Assuming you have a test dataset named 'test_data' with the same features the training
## Combine features and target into a single data frame for the test set
test_data <- as.data.frame(cbind(target = Y_test, X_test))

## Making predictions on the test set
predictions <- predict(model, newdata = as.data.frame(test_data[, -1]),type ="response")

## Converting probabilities to binary predictions based on threshold 0.5
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)

## Combining actual values and predicted values into a data frame
result <- data.frame(actual = test_data$target, predicted = binary_predictions)

## Displaying the results
print(result)

##      actual predicted
## 4         0         0
## 12        0         0
## 17        1         1
## 21        0         0
## 25        1         1
## 37        1         0
## 39        1         0
## 42        1         1
## 44        0         0
## 46        1         1
## 48        0         0
## 49        1         1
## 50        0         0
## 52        0         0
## 55        0         0
## 67        1         1
## 72        0         0
## 73        0         0
## 76        1         1
## 83        0         0
## 84        1         1
## 89        0         0
## 93        0         0
## 96        1         1
## 97        1         1
## 98        0         0
## 105       1         1
## 116       0         0
## 117       0         0
## 118       0         0
## 122       0         0
## 127       1         0
## 135       1         1
## 147       1         1
## 161       0         0
## 162       1         1
## 165       0         0
## 170       1         1
## 171       1         1
## 176       0         0
## 181       0         0
## 185       1         1
## 189       0         0
## 190       0         0
## 200       0         0
## 213       0         0
## 217       0         0
## 224       1         1
## 232       1         1
## 237       0         0
## 243       1         1
## 249       1         1
## 256       1         1
## 258       1         1
## 260       1         0
## 261       1         1
## 262       1         1
## 267       0         0
## 273       1         1
## 276       0         0
## 278       1         1
## 279       0         0
## 290       0         1
## 297       0         0
## 298       0         0
## 310       1         1
## 326       1         1
## 332       0         0
## 333       1         1
## 334       1         1
## 335       0         0
## 337       1         1
## 339       0         1
## 349       0         0
## 354       1         0
## 370       1         1
## 374       0         0
## 380       1         1
## 386       1         1
## 387       1         1
## 388       0         0
## 399       1         0
## 415       0         1
## 420       1         1
## 422       1         1
## 427       1         1
## 432       0         0
## 435       1         1
## 446       1         1
## 450       0         1
## 457       0         1
## 458       1         1
## 462       1         1
## 466       1         1
## 471       1         1
## 475       0         1
## 477       0         0
## 485       0         1
## 495       1         1
## 516       0         0
## 520       0         0
## 523       1         1
## 524       0         0
## 526       1         1
## 529       1         0
## 530       1         1
## 532       1         1
## 534       1         1
## 538       1         1
## 541       0         0
## 546       0         1
## 555       0         1
## 559       1         1
## 564       0         0
## 570       1         1
## 577       1         1
## 584       0         0
## 585       0         0
## 586       1         1
## 587       0         0
## 589       0         0
## 591       1         1
## 593       0         0
## 595       0         0
## 597       1         1
## 598       1         1
## 604       1         1
## 612       0         0
## 640       1         1
## 652       1         1
## 653       1         1
## 654       0         0
## 656       1         1
## 661       0         0
## 674       1         1
## 676       0         1
## 678       0         0
## 687       0         1
## 693       0         0
## 698       0         0
## 714       1         1
## 716       1         1
## 718       0         1
## 719       1         1
## 723       1         1
## 724       1         1
## 739       0         0
## 743       0         0
## 745       1         1
## 747       0         1
## 748       0         0
## 754       1         1
## 762       1         1
## 764       1         1
## 765       0         0
## 782       0         0
## 785       1         1
## 795       0         0
## 796       1         1
## 798       0         0
## 801       0         0
## 802       1         1
## 803       0         1
## 812       0         1
## 816       1         0
## 826       1         1
## 833       1         1
## 851       0         0
## 856       1         1
## 858       1         1
## 859       1         1
## 862       0         1
## 865       0         1
## 869       1         1
## 872       1         1
## 879       0         0
## 887       0         0
## 906       0         0
## 914       1         0
## 915       0         0
## 922       0         0
## 923       1         1
## 933       1         1
## 938       0         1
## 945       0         0
## 946       1         1
## 956       1         0
## 963       1         0
## 965       1         1
## 966       1         1
## 968       1         1
## 977       0         0
## 980       0         0
## 981       1         1
## 984       1         0
## 993       1         1
## 994       0         0
## 996       1         1
## 1002      1         1
## 1007      1         1
## 1009      1         1
## 1011      0         0
## 1014      0         0
## 1020      1         1
## 1024      1         1

# ref: https://www.geeksforgeeks.org/heart-disease-prediction-using-logistic-regression-in-r/

Logistic Regression - Machine Learning study

Vu Thien

2024-05-15

1. Import libraries

2. Importing dataset

3. Try to understand dataset

4. Exploratory data analysis

5. Data preparing before machine learning

5.1 Data Encoding

5.2. Normalization and Data Splitting

6. Training the logistic regression model

7. Model testing (Evaluation)