library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
heart_data<- read_excel("data.xlsx")
head(5)
## [1] 5
# Transform continuous variables that should be factors
heart_data_transformed <- heart_data %>%
  mutate(
    sex = factor(sex, levels = 0:1,labels = c("female", "male")),
    cp = factor(cp, levels = 0:3, labels = c("non-anginal pain", "atypical angina", "typical angina", "asymptomatic")),
    fbs = factor(fbs, levels = 0:1, labels = c("false", "true")),
    restecg = factor(restecg, levels = 0:2, labels = c("normal", "ST-T wave abnormality", "left ventricular hypertrophy")),
    exang = factor(exang, levels = 0:1, labels = c("no", "yes")),
    slope = factor(slope, levels = 0:2, labels = c("slope 0", "slope 1", "slope 2")),
    thal = factor(thal, levels = 1:3, labels = c("normal", "fixed defect", "reversible defect")),
    target = factor(target, levels = 0:1, labels = c("no disease", "disease")),
    ca = factor(ca)
  )

# Print the transformed dataset
print(heart_data_transformed)
## # A tibble: 303 × 14
##      age sex    cp      trestbps  chol fbs   restecg thalach exang oldpeak slope
##    <dbl> <fct>  <fct>      <dbl> <dbl> <fct> <fct>     <dbl> <fct>   <dbl> <fct>
##  1    63 male   asympt…      145   233 true  normal      150 no        2.3 slop…
##  2    37 male   typica…      130   250 false ST-T w…     187 no        3.5 slop…
##  3    41 female atypic…      130   204 false normal      172 no        1.4 slop…
##  4    56 male   atypic…      120   236 false ST-T w…     178 no        0.8 slop…
##  5    57 female non-an…      120   354 false ST-T w…     163 yes       0.6 slop…
##  6    57 male   non-an…      140   192 false ST-T w…     148 no        0.4 slop…
##  7    56 female atypic…      140   294 false normal      153 no        1.3 slop…
##  8    44 male   atypic…      120   263 false ST-T w…     173 no        0   slop…
##  9    52 male   typica…      172   199 true  ST-T w…     162 no        0.5 slop…
## 10    57 male   typica…      150   168 false ST-T w…     174 no        1.6 slop…
## # ℹ 293 more rows
## # ℹ 3 more variables: ca <fct>, thal <fct>, target <fct>
str(heart_data_transformed)
## tibble [303 × 14] (S3: tbl_df/tbl/data.frame)
##  $ age     : num [1:303] 63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "non-anginal pain",..: 4 3 2 2 1 1 2 2 3 3 ...
##  $ trestbps: num [1:303] 145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : num [1:303] 233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : Factor w/ 2 levels "false","true": 2 1 1 1 1 1 1 1 2 1 ...
##  $ restecg : Factor w/ 3 levels "normal","ST-T wave abnormality",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ thalach : num [1:303] 150 187 172 178 163 148 153 173 162 174 ...
##  $ exang   : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ oldpeak : num [1:303] 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope   : Factor w/ 3 levels "slope 0","slope 1",..: 1 1 3 3 3 2 2 3 3 3 ...
##  $ ca      : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ thal    : Factor w/ 3 levels "normal","fixed defect",..: 1 2 2 2 2 1 2 3 3 2 ...
##  $ target  : Factor w/ 2 levels "no disease","disease": 2 2 2 2 2 2 2 2 2 2 ...
# Check for missing values
missing_values <- heart_data_transformed %>%
  summarise_all(~ sum(is.na(.)))
missing_values
## # A tibble: 1 × 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <int> <int> <int>    <int> <int> <int>   <int>   <int> <int>   <int> <int>
## 1     0     0     0        0     0     0       0       0     0       0     0
## # ℹ 3 more variables: ca <int>, thal <int>, target <int>
#checking duolicates
duplicates <- heart_data_transformed[duplicated(heart_data_transformed), ]
duplicates
## # A tibble: 1 × 14
##     age sex   cp        trestbps  chol fbs   restecg thalach exang oldpeak slope
##   <dbl> <fct> <fct>        <dbl> <dbl> <fct> <fct>     <dbl> <fct>   <dbl> <fct>
## 1    38 male  typical …      138   175 false ST-T w…     173 no          0 slop…
## # ℹ 3 more variables: ca <fct>, thal <fct>, target <fct>
# Preliminary statistical summary
summary(heart_data_transformed)
##       age            sex                     cp         trestbps    
##  Min.   :29.00   female: 96   non-anginal pain:143   Min.   : 94.0  
##  1st Qu.:47.50   male  :207   atypical angina : 50   1st Qu.:120.0  
##  Median :55.00                typical angina  : 87   Median :130.0  
##  Mean   :54.37                asymptomatic    : 23   Mean   :131.6  
##  3rd Qu.:61.00                                       3rd Qu.:140.0  
##  Max.   :77.00                                       Max.   :200.0  
##       chol          fbs                              restecg       thalach     
##  Min.   :126.0   false:258   normal                      :147   Min.   : 71.0  
##  1st Qu.:211.0   true : 45   ST-T wave abnormality       :152   1st Qu.:133.5  
##  Median :240.0               left ventricular hypertrophy:  4   Median :153.0  
##  Mean   :246.3                                                  Mean   :149.6  
##  3rd Qu.:274.5                                                  3rd Qu.:166.0  
##  Max.   :564.0                                                  Max.   :202.0  
##  exang        oldpeak         slope     ca                     thal    
##  no :204   Min.   :0.00   slope 0: 21   0:175   normal           : 20  
##  yes: 99   1st Qu.:0.00   slope 1:140   1: 65   fixed defect     :166  
##            Median :0.80   slope 2:142   2: 38   reversible defect:117  
##            Mean   :1.04                 3: 20                          
##            3rd Qu.:1.60                 4:  5                          
##            Max.   :6.20                                                
##         target   
##  no disease:138  
##  disease   :165  
##                  
##                  
##                  
## 
# Explore categorical variables using count plots
ggplot(heart_data_transformed, aes(x = cp, fill = cp)) +
  geom_bar() +
  labs(title = "Chest Pain Type Distribution")

ggplot(heart_data_transformed, aes(x = fbs, fill = fbs)) +
  geom_bar() +
  labs(title = "Fasting Blood Sugar Distribution")

ggplot(heart_data_transformed, aes(x = restecg, fill = restecg)) +
  geom_bar() +
  labs(title = "Resting ECG Distribution")

ggplot(heart_data_transformed, aes(x = exang, fill = exang)) +
  geom_bar() +
  labs(title = "Exercise Induced Angina Distribution")

ggplot(heart_data_transformed, aes(x = slope, fill = slope)) +
  geom_bar() +
  labs(title = "Slope Distribution")

ggplot(heart_data_transformed, aes(x = ca, fill = ca)) +
  geom_bar() +
  labs(title = "Number of Major Vessels Distribution")

ggplot(heart_data_transformed, aes(x = thal, fill = thal)) +
  geom_bar() +
  labs(title = "Thal Distribution")

ggplot(heart_data_transformed, aes(x = sex, fill = sex)) +
  geom_bar() +
  labs(title = "Composition of Patients by Gender")

# Study the occurrence of CVD across different ages
ggplot(heart_data_transformed, aes(x = age, fill = target)) +
  geom_histogram(binwidth = 5, position = "dodge") +
  labs(title = "Occurrence of CVD Across Ages", x = "Age", y = "Count", fill = "CVD")

# Study the composition of patients by gender
ggplot(heart_data_transformed, aes(x = sex, fill = target)) +
  geom_bar() +
  labs(title = "Composition of Patients by Gender", fill = "CVD")

# Detect heart attack based on anomalies in resting blood pressure
ggplot(heart_data_transformed, aes(x = target, y = trestbps)) +
  geom_boxplot() +
  labs(title = "Detecting Heart Attack Based on Resting Blood Pressure", x = "Heart Attack", y = "Resting Blood Pressure")

# Relationship between cholesterol levels and target variable
ggplot(heart_data_transformed, aes(x = chol, fill = target)) +
  geom_histogram(binwidth = 10, position = "dodge") +
  labs(title = "Cholesterol Levels vs. CVD Occurrence", x = "Cholesterol Levels", y = "Count", fill = "CVD")

# Relationship between peak exercising and occurrence of heart attack
ggplot(heart_data_transformed, aes(x = target, y = oldpeak)) +
  geom_boxplot() +
  labs(title = "Peak Exercise ST Depression vs. Heart Attack", x = "Heart Attack", y = "ST Depression")

# Thalassemia and CVD relationship
ggplot(heart_data_transformed, aes(x = thal, fill = target)) +
  geom_bar() +
  labs(title = "Thalassemia and CVD Relationship", fill = "CVD")

# Use logistic regression to explore relationship between factors and CVD occurrence
log_reg_model <- glm(target ~ ., data = heart_data_transformed, family = "binomial")
summary(log_reg_model)
## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = heart_data_transformed)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9404  -0.2750   0.1042   0.4539   3.1088  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          2.625315   2.963517   0.886 0.375683    
## age                                  0.027403   0.025394   1.079 0.280524    
## sexmale                             -1.828525   0.571743  -3.198 0.001383 ** 
## cpatypical angina                    0.884012   0.576235   1.534 0.125001    
## cptypical angina                     2.008412   0.532184   3.774 0.000161 ***
## cpasymptomatic                       2.413208   0.717417   3.364 0.000769 ***
## trestbps                            -0.025479   0.011880  -2.145 0.031971 *  
## chol                                -0.004082   0.004240  -0.963 0.335663    
## fbstrue                              0.341972   0.574744   0.595 0.551844    
## restecgST-T wave abnormality         0.446960   0.399511   1.119 0.263240    
## restecgleft ventricular hypertrophy -0.714066   2.772148  -0.258 0.796727    
## thalach                              0.019191   0.011778   1.629 0.103229    
## exangyes                            -0.807201   0.450351  -1.792 0.073071 .  
## oldpeak                             -0.390368   0.241556  -1.616 0.106082    
## slopeslope 1                        -0.819597   0.875754  -0.936 0.349337    
## slopeslope 2                         0.645486   0.945384   0.683 0.494748    
## ca1                                 -2.298937   0.524000  -4.387 1.15e-05 ***
## ca2                                 -3.425023   0.806938  -4.244 2.19e-05 ***
## ca3                                 -2.203505   0.934782  -2.357 0.018411 *  
## ca4                                  1.315756   1.743752   0.755 0.450516    
## thalfixed defect                    -0.049327   0.801377  -0.062 0.950918    
## thalreversible defect               -1.498076   0.786286  -1.905 0.056747 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 180.76  on 281  degrees of freedom
## AIC: 224.76
## 
## Number of Fisher Scoring iterations: 6
# Pair plot to understand relationship between all variables
pairs(heart_data_transformed)

# Build baseline logistic regression model
baseline_model <- glm(target ~ ., data = heart_data, family = "binomial")
summary(baseline_model)
## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = heart_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6018  -0.3901   0.1549   0.5824   2.6399  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.584930   2.581363   1.389 0.164902    
## age         -0.004633   0.023261  -0.199 0.842117    
## sex         -1.736286   0.469927  -3.695 0.000220 ***
## cp           0.858124   0.185604   4.623 3.77e-06 ***
## trestbps    -0.019687   0.010350  -1.902 0.057155 .  
## chol        -0.004599   0.003779  -1.217 0.223556    
## fbs          0.056954   0.528307   0.108 0.914150    
## restecg      0.486371   0.349920   1.390 0.164545    
## thalach      0.023577   0.010486   2.249 0.024544 *  
## exang       -0.962881   0.410811  -2.344 0.019086 *  
## oldpeak     -0.544410   0.214936  -2.533 0.011313 *  
## slope        0.575268   0.351105   1.638 0.101328    
## ca          -0.780812   0.191813  -4.071 4.69e-05 ***
## thal        -0.986962   0.297936  -3.313 0.000924 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 210.14  on 289  degrees of freedom
## AIC: 238.14
## 
## Number of Fisher Scoring iterations: 6
# Variable importance chart based on the logistic regression model
var_importance <- data.frame(variable = names(coef(baseline_model)), importance = abs(coef(baseline_model)))
var_importance <- var_importance %>%
  arrange(desc(importance))

ggplot(var_importance, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance for CVD Prediction", x = "Variable", y = "Importance")

# Split data into training and testing sets
set.seed(123)  # For reproducibility
sample_indices <- sample(1:nrow(heart_data), 0.7 * nrow(heart_data))
train_data <- heart_data[sample_indices, ]
test_data <- heart_data[-sample_indices, ]

# Build logistic regression model on training data
model <- glm(target ~ ., data = train_data, family = "binomial")

# Predict on test data
test_predictions <- predict(model, newdata = test_data, type = "response")
test_predictions <- ifelse(test_predictions > 0.5, 1, 0)

# Calculate accuracy
accuracy <- sum(test_predictions == test_data$target) / nrow(test_data)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.846153846153846"