CVD Predictors Analysis

Importing dataset:

# Importing the dataset
heart_data <- read.csv("C:/Users/63650/Downloads/heart_failure_clinical_records_dataset.csv")

We choose to exclude the follow up period (time) variable, as this is highly correlated to the DEATH_EVENT outcome variable, as by common sense, it will be obvious if a patient is alive or not from their follow- up visit or lack thereof.

heart_data <- heart_data[, !(names(heart_data) %in% "time")]

Hypotheses Outline:

# Create the hypothesis data as a data frame
table_data <- data.frame(
    Hypothesis_age = c(
        "We predict ages over 60 and serum creatinine exceeding 1.4 mg/dL to be reliable predictors of CVD.",
        "We predict ages under 60 and serum creatinine exceeding 1.4 mg/dL to be reliable predictors of CVD."
    ),
    Hypothesis_creatinine = c(
        "We predict ages over 60 and serum creatinine less than 1.4 mg/dL to NOT be reliable predictors of CVD.",
        "We predict ages under 60 and serum creatinine less than 1.4 mg/dL to NOT be reliable predictors of CVD."
    )
)

# Create the 2x2 table using knitr
kable(table_data, "html", caption = "Hypotheses") %>%
    kable_styling(full_width = FALSE, position = "center")

Hypotheses
Hypothesis_age	Hypothesis_creatinine
We predict ages over 60 and serum creatinine exceeding 1.4 mg/dL to be reliable predictors of CVD.	We predict ages over 60 and serum creatinine less than 1.4 mg/dL to NOT be reliable predictors of CVD.
We predict ages under 60 and serum creatinine exceeding 1.4 mg/dL to be reliable predictors of CVD.	We predict ages under 60 and serum creatinine less than 1.4 mg/dL to NOT be reliable predictors of CVD.

# Check for missing values
colSums(is.na(heart_data))

##                      age                  anaemia creatinine_phosphokinase 
##                        0                        0                        0 
##                 diabetes        ejection_fraction      high_blood_pressure 
##                        0                        0                        0 
##                platelets         serum_creatinine             serum_sodium 
##                        0                        0                        0 
##                      sex                  smoking              DEATH_EVENT 
##                        0                        0                        0

# Outlier detection via IQR
numeric_columns <- select_if(heart_data, is.numeric)
outliers <- lapply(numeric_columns, function(x) x[x < (quantile(x, 0.25) - 1.5 * IQR(x)) | x > (quantile(x, 0.75) + 1.5 * IQR(x))])
outliers

## $age
## numeric(0)
## 
## $creatinine_phosphokinase
##  [1] 7861 2656 1380 3964 7702 5882 5209 1876 1808 4540 1548 1610 2261 1846 2334
## [16] 2442 3966 1419 1896 1767 2281 2794 2017 2522 2695 1688 1820 2060 2413
## 
## $ejection_fraction
## [1] 80 70
## 
## $platelets
##  [1] 454000  47000 451000 461000 497000 621000 850000 507000 448000  75000
## [11]  70000  73000 481000 504000  62000 533000  25100 451000  51000 543000
## [21] 742000
## 
## $serum_creatinine
##  [1] 2.7 9.4 4.0 5.8 3.0 3.5 2.3 3.0 4.4 6.8 2.2 2.7 2.3 2.9 2.5 2.3 3.2 3.7 3.4
## [20] 6.1 2.5 2.4 2.5 3.5 9.0 5.0 2.4 2.7 3.8
## 
## $serum_sodium
## [1] 116 121 124 113

Checking for missing values:

##                      age                  anaemia creatinine_phosphokinase 
##                        0                        0                        0 
##                 diabetes        ejection_fraction      high_blood_pressure 
##                        0                        0                        0 
##                platelets         serum_creatinine             serum_sodium 
##                        0                        0                        0 
##                      sex                  smoking              DEATH_EVENT 
##                        0                        0                        0

Outlier classification:

# Outlier detection via IQR
numeric_columns <- select_if(heart_data, is.numeric)
outliers <- lapply(numeric_columns, function(x) x[x < (quantile(x, 0.25) - 1.5 * IQR(x)) | x > (quantile(x, 0.75) + 1.5 * IQR(x))])
outliers

## $age
## numeric(0)
## 
## $creatinine_phosphokinase
##  [1] 7861 2656 1380 3964 7702 5882 5209 1876 1808 4540 1548 1610 2261 1846 2334
## [16] 2442 3966 1419 1896 1767 2281 2794 2017 2522 2695 1688 1820 2060 2413
## 
## $ejection_fraction
## [1] 80 70
## 
## $platelets
##  [1] 454000  47000 451000 461000 497000 621000 850000 507000 448000  75000
## [11]  70000  73000 481000 504000  62000 533000  25100 451000  51000 543000
## [21] 742000
## 
## $serum_creatinine
##  [1] 2.7 9.4 4.0 5.8 3.0 3.5 2.3 3.0 4.4 6.8 2.2 2.7 2.3 2.9 2.5 2.3 3.2 3.7 3.4
## [20] 6.1 2.5 2.4 2.5 3.5 9.0 5.0 2.4 2.7 3.8
## 
## $serum_sodium
## [1] 116 121 124 113

Proportion of 0 (no death) to 1 (death):

## 
##         0         1 
## 0.6789298 0.3210702

Metrics as defined in the hypotheses:

# Age Average
mean(heart_data$age)

## [1] 60.83389

# Creatinine Average
mean(heart_data$serum_creatinine)

## [1] 1.39388

# Age Median 
median(heart_data$age)

## [1] 60

# Creatinine Median
median(heart_data$serum_creatinine)

## [1] 1.1

Correlation Matrix:

correlation_matrix <- cor(select_if(heart_data, is.numeric))
corrplot::corrplot(correlation_matrix, method = "circle")

Key Logistic Regression Metrics
Term	Definition
p- value	The probability that the observed results occurred by chance, with values less than 0.05 indicating significance.
Odds Ratios	A statistic that expresses the change in odds of an outcome for a one-unit change in a predictor.
AIC Value	Akaike Information Criterion, a measure of model quality with lower values indicating better fit.
Confusion Matrix	A table used to evaluate the performance of a classification model by comparing predictions with actual results.
ROC Curve	Receiver Operating Characteristic curve, a plot that shows the performance of a binary classifier.
Multicollinearity	A situation in regression analysis where predictor variables are highly correlated with each other.
VIF	Variance Inflation Factor, a measure used to detect multicollinearity in a regression model.
Cross Validation	A technique to assess model reliability by training and testing on different subsets of data.

Model Summary:

logistic_model <- glm(DEATH_EVENT ~ ., data = heart_data, family = binomial)
summary(logistic_model)

## 
## Call:
## glm(formula = DEATH_EVENT ~ ., family = binomial, data = heart_data)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               4.964e+00  4.601e+00   1.079 0.280625    
## age                       5.569e-02  1.313e-02   4.241 2.23e-05 ***
## anaemia1                  4.179e-01  3.009e-01   1.389 0.164904    
## creatinine_phosphokinase  2.905e-04  1.428e-04   2.034 0.041907 *  
## diabetes1                 1.514e-01  2.974e-01   0.509 0.610644    
## ejection_fraction        -7.032e-02  1.486e-02  -4.731 2.23e-06 ***
## high_blood_pressure1      4.189e-01  3.061e-01   1.369 0.171092    
## platelets                -7.094e-07  1.617e-06  -0.439 0.660857    
## serum_creatinine          6.619e-01  1.734e-01   3.817 0.000135 ***
## serum_sodium             -5.667e-02  3.338e-02  -1.698 0.089558 .  
## sex1                     -3.990e-01  3.508e-01  -1.137 0.255394    
## smoking1                  1.356e-01  3.486e-01   0.389 0.697300    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 375.35  on 298  degrees of freedom
## Residual deviance: 294.28  on 287  degrees of freedom
## AIC: 318.28
## 
## Number of Fisher Scoring iterations: 5

# Compute Odds Ratios
odds_ratios <- exp(coef(logistic_model))
cat("\n--- Odds Ratios ---\n")

## 
## --- Odds Ratios ---

print(odds_ratios)

##              (Intercept)                      age                 anaemia1 
##              143.2080946                1.0572709                1.5188140 
## creatinine_phosphokinase                diabetes1        ejection_fraction 
##                1.0002906                1.1634708                0.9320936 
##     high_blood_pressure1                platelets         serum_creatinine 
##                1.5203456                0.9999993                1.9383955 
##             serum_sodium                     sex1                 smoking1 
##                0.9449089                0.6709813                1.1452113

# Compute AIC Value
aic_value <- AIC(logistic_model)
cat("\n--- AIC Value ---\n")

## 
## --- AIC Value ---

print(aic_value)

## [1] 318.2807

# Predicted Values and Confusion Matrix
predicted_values <- ifelse(predict(logistic_model, heart_data, type = "response") > 0.5, 1, 0)
true_values <- heart_data$DEATH_EVENT  # Replace DEATH_EVENT if the actual outcome variable is different

cat("\n--- Confusion Matrix ---\n")

## 
## --- Confusion Matrix ---

library(caret)  # Ensure 'caret' package is loaded for confusionMatrix
conf_matrix <- confusionMatrix(as.factor(predicted_values), as.factor(true_values))
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 183  49
##          1  20  47
##                                           
##                Accuracy : 0.7692          
##                  95% CI : (0.7173, 0.8158)
##     No Information Rate : 0.6789          
##     P-Value [Acc > NIR] : 0.0003752       
##                                           
##                   Kappa : 0.4249          
##                                           
##  Mcnemar's Test P-Value : 0.0007495       
##                                           
##             Sensitivity : 0.9015          
##             Specificity : 0.4896          
##          Pos Pred Value : 0.7888          
##          Neg Pred Value : 0.7015          
##              Prevalence : 0.6789          
##          Detection Rate : 0.6120          
##    Detection Prevalence : 0.7759          
##       Balanced Accuracy : 0.6955          
##                                           
##        'Positive' Class : 0               
##

# Predicted Probabilities and ROC Curve
cat("\n--- ROC Curve and AUC ---\n")

## 
## --- ROC Curve and AUC ---

library(pROC)  # Ensure 'pROC' package is loaded for ROC analysis
predicted_probabilities <- predict(logistic_model, heart_data, type = "response")
roc_curve <- roc(true_values, predicted_probabilities)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(roc_curve, main = "ROC Curve")  # Plot the ROC curve

auc_value <- auc(roc_curve)  # Calculate AUC
cat("AUC Value: ", auc_value, "\n")

## AUC Value:  0.8085489

# Compute VIF Values
cat("\n--- VIF Values ---\n")

## 
## --- VIF Values ---

library(car)  # Ensure 'car' package is loaded for VIF
vif_values <- vif(logistic_model)
print(vif_values)

##                      age                  anaemia creatinine_phosphokinase 
##                 1.128444                 1.091638                 1.112243 
##                 diabetes        ejection_fraction      high_blood_pressure 
##                 1.056803                 1.122656                 1.059847 
##                platelets         serum_creatinine             serum_sodium 
##                 1.057122                 1.059148                 1.073572 
##                      sex                  smoking 
##                 1.377926                 1.260903

Odds Ratios:

# Odds Ratios
odds_ratios <- exp(coef(logistic_model))
print(odds_ratios)

##              (Intercept)                      age                 anaemia1 
##              143.2080946                1.0572709                1.5188140 
## creatinine_phosphokinase                diabetes1        ejection_fraction 
##                1.0002906                1.1634708                0.9320936 
##     high_blood_pressure1                platelets         serum_creatinine 
##                1.5203456                0.9999993                1.9383955 
##             serum_sodium                     sex1                 smoking1 
##                0.9449089                0.6709813                1.1452113

AIC Value:

# AIC Value
aic_value <- AIC(logistic_model)
print(aic_value)

## [1] 318.2807

Confusion Matrix:

# Confusion Matrix
# Assuming 'predicted_values' are the model predictions (binary) and 'true_values' are actual outcomes
predicted_values <- ifelse(predict(logistic_model, heart_data, type = "response") > 0.5, 1, 0)
true_values <- heart_data$DEATH_EVENT  # replace DEATH_EVENT with the actual outcome variable name if different
conf_matrix <- confusionMatrix(as.factor(predicted_values), as.factor(true_values))
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 183  49
##          1  20  47
##                                           
##                Accuracy : 0.7692          
##                  95% CI : (0.7173, 0.8158)
##     No Information Rate : 0.6789          
##     P-Value [Acc > NIR] : 0.0003752       
##                                           
##                   Kappa : 0.4249          
##                                           
##  Mcnemar's Test P-Value : 0.0007495       
##                                           
##             Sensitivity : 0.9015          
##             Specificity : 0.4896          
##          Pos Pred Value : 0.7888          
##          Neg Pred Value : 0.7015          
##              Prevalence : 0.6789          
##          Detection Rate : 0.6120          
##    Detection Prevalence : 0.7759          
##       Balanced Accuracy : 0.6955          
##                                           
##        'Positive' Class : 0               
##

ROC Curve:

# ROC Curve
predicted_probabilities <- predict(logistic_model, heart_data, type = "response")
roc_curve <- roc(true_values, predicted_probabilities)
plot(roc_curve, main = "ROC Curve")

auc_value <- auc(roc_curve)
print(auc_value)

## Area under the curve: 0.8085

Multicollinearity Check:

# Multicollinearity Check with VIF
vif_values <- vif(logistic_model)
print(vif_values)

##                      age                  anaemia creatinine_phosphokinase 
##                 1.128444                 1.091638                 1.112243 
##                 diabetes        ejection_fraction      high_blood_pressure 
##                 1.056803                 1.122656                 1.059847 
##                platelets         serum_creatinine             serum_sodium 
##                 1.057122                 1.059148                 1.073572 
##                      sex                  smoking 
##                 1.377926                 1.260903

Cross Validation:

# Model Validation with Cross-Validation
cv_results <- cv.glm(heart_data, logistic_model, K = 10)
print(cv_results$delta)

## [1] 0.1796879 0.1786616

Now we attempt the same analysis via stepwise regression to see if our results may be improved:

# 1. Stepwise Selection to Identify Important Variables
stepwise_model <- step(logistic_model, direction = "both")  # Both-direction stepwise selection

## Start:  AIC=318.28
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + diabetes + 
##     ejection_fraction + high_blood_pressure + platelets + serum_creatinine + 
##     serum_sodium + sex + smoking
## 
##                            Df Deviance    AIC
## - smoking                   1   294.43 316.43
## - platelets                 1   294.47 316.47
## - diabetes                  1   294.54 316.54
## - sex                       1   295.58 317.58
## - high_blood_pressure       1   296.15 318.15
## - anaemia                   1   296.22 318.22
## <none>                          294.28 318.28
## - serum_sodium              1   297.21 319.21
## - creatinine_phosphokinase  1   298.46 320.46
## - serum_creatinine          1   314.04 336.04
## - age                       1   314.06 336.06
## - ejection_fraction         1   320.96 342.96
## 
## Step:  AIC=316.43
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + diabetes + 
##     ejection_fraction + high_blood_pressure + platelets + serum_creatinine + 
##     serum_sodium + sex
## 
##                            Df Deviance    AIC
## - platelets                 1   294.61 314.61
## - diabetes                  1   294.68 314.68
## - sex                       1   295.59 315.59
## - anaemia                   1   296.28 316.28
## - high_blood_pressure       1   296.29 316.29
## <none>                          294.43 316.43
## - serum_sodium              1   297.35 317.35
## + smoking                   1   294.28 318.28
## - creatinine_phosphokinase  1   298.51 318.51
## - serum_creatinine          1   314.07 334.07
## - age                       1   314.17 334.17
## - ejection_fraction         1   321.18 341.18
## 
## Step:  AIC=314.61
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + diabetes + 
##     ejection_fraction + high_blood_pressure + serum_creatinine + 
##     serum_sodium + sex
## 
##                            Df Deviance    AIC
## - diabetes                  1   294.83 312.83
## - sex                       1   295.66 313.66
## - high_blood_pressure       1   296.38 314.38
## - anaemia                   1   296.49 314.49
## <none>                          294.61 314.61
## - serum_sodium              1   297.63 315.63
## + platelets                 1   294.43 316.43
## + smoking                   1   294.47 316.47
## - creatinine_phosphokinase  1   298.62 316.62
## - age                       1   314.29 332.29
## - serum_creatinine          1   314.39 332.39
## - ejection_fraction         1   321.53 339.53
## 
## Step:  AIC=312.83
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + ejection_fraction + 
##     high_blood_pressure + serum_creatinine + serum_sodium + sex
## 
##                            Df Deviance    AIC
## - sex                       1   296.05 312.05
## - high_blood_pressure       1   296.58 312.58
## - anaemia                   1   296.71 312.71
## <none>                          294.83 312.83
## - serum_sodium              1   298.02 314.02
## + diabetes                  1   294.61 314.61
## + platelets                 1   294.68 314.68
## + smoking                   1   294.71 314.71
## - creatinine_phosphokinase  1   298.84 314.84
## - age                       1   314.29 330.29
## - serum_creatinine          1   314.46 330.46
## - ejection_fraction         1   321.68 337.68
## 
## Step:  AIC=312.05
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + ejection_fraction + 
##     high_blood_pressure + serum_creatinine + serum_sodium
## 
##                            Df Deviance    AIC
## <none>                          296.05 312.05
## - anaemia                   1   298.16 312.16
## - high_blood_pressure       1   298.42 312.42
## + sex                       1   294.83 312.83
## - serum_sodium              1   299.06 313.06
## + diabetes                  1   295.66 313.66
## - creatinine_phosphokinase  1   299.72 313.72
## + platelets                 1   296.01 314.01
## + smoking                   1   296.03 314.03
## - age                       1   314.73 328.73
## - serum_creatinine          1   315.76 329.76
## - ejection_fraction         1   321.83 335.83

summary(stepwise_model)  # View the model summary to see selected variables

## 
## Call:
## glm(formula = DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + 
##     ejection_fraction + high_blood_pressure + serum_creatinine + 
##     serum_sodium, family = binomial, data = heart_data)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               4.7298351  4.5190849   1.047  0.29527    
## age                       0.0530244  0.0127859   4.147 3.37e-05 ***
## anaemia1                  0.4309325  0.2980595   1.446  0.14824    
## creatinine_phosphokinase  0.0002674  0.0001405   1.902  0.05713 .  
## ejection_fraction        -0.0679014  0.0145626  -4.663 3.12e-06 ***
## high_blood_pressure1      0.4606311  0.2992103   1.539  0.12368    
## serum_creatinine          0.6619016  0.1711437   3.868  0.00011 ***
## serum_sodium             -0.0568437  0.0330681  -1.719  0.08562 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 375.35  on 298  degrees of freedom
## Residual deviance: 296.05  on 291  degrees of freedom
## AIC: 312.05
## 
## Number of Fisher Scoring iterations: 5

# Extract variable names, excluding the intercept term
selected_variables <- names(coef(stepwise_model))
selected_variables <- selected_variables[selected_variables != "(Intercept)"]
selected_variables <- selected_variables[selected_variables != "time"]
selected_variables[selected_variables == "high_blood_pressure1"] <- "high_blood_pressure"
selected_variables[selected_variables == "anaemia1"] <- "anaemia"
# Fit the refined model with the selected variables
final_model <- glm(DEATH_EVENT ~ ., data = heart_data[, c(selected_variables, "DEATH_EVENT")], family = binomial)

Final Model Summary:

# Check Model Summary
summary(final_model)

## 
## Call:
## glm(formula = DEATH_EVENT ~ ., family = binomial, data = heart_data[, 
##     c(selected_variables, "DEATH_EVENT")])
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               4.7298351  4.5190849   1.047  0.29527    
## age                       0.0530244  0.0127859   4.147 3.37e-05 ***
## anaemia1                  0.4309325  0.2980595   1.446  0.14824    
## creatinine_phosphokinase  0.0002674  0.0001405   1.902  0.05713 .  
## ejection_fraction        -0.0679014  0.0145626  -4.663 3.12e-06 ***
## high_blood_pressure1      0.4606311  0.2992103   1.539  0.12368    
## serum_creatinine          0.6619016  0.1711437   3.868  0.00011 ***
## serum_sodium             -0.0568437  0.0330681  -1.719  0.08562 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 375.35  on 298  degrees of freedom
## Residual deviance: 296.05  on 291  degrees of freedom
## AIC: 312.05
## 
## Number of Fisher Scoring iterations: 5

Final Odds Ratios:

# Calculate Odds Ratios
odds_ratios_f <- exp(coef(final_model))
print(odds_ratios_f)

##              (Intercept)                      age                 anaemia1 
##              113.2768864                1.0544554                1.5386917 
## creatinine_phosphokinase        ejection_fraction     high_blood_pressure1 
##                1.0002674                0.9343526                1.5850741 
##         serum_creatinine             serum_sodium 
##                1.9384750                0.9447417

Final AIC Value:

# Assess Model Fit with AIC
aic_value_f <- AIC(final_model)
print(aic_value_f)

## [1] 312.0522

Final Confusion Matrix:

# Evaluate Model Performance with Confusion Matrix
predicted_values_f <- ifelse(predict(final_model, heart_data, type = "response") > 0.5, 1, 0)
true_values <- heart_data$DEATH_EVENT
conf_matrix_f <- confusionMatrix(as.factor(predicted_values_f), as.factor(true_values))
print(conf_matrix_f)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 182  51
##          1  21  45
##                                           
##                Accuracy : 0.7592          
##                  95% CI : (0.7066, 0.8066)
##     No Information Rate : 0.6789          
##     P-Value [Acc > NIR] : 0.0014572       
##                                           
##                   Kappa : 0.3981          
##                                           
##  Mcnemar's Test P-Value : 0.0006316       
##                                           
##             Sensitivity : 0.8966          
##             Specificity : 0.4688          
##          Pos Pred Value : 0.7811          
##          Neg Pred Value : 0.6818          
##              Prevalence : 0.6789          
##          Detection Rate : 0.6087          
##    Detection Prevalence : 0.7793          
##       Balanced Accuracy : 0.6827          
##                                           
##        'Positive' Class : 0               
##

Final ROC Curve:

# Plot ROC Curve and Calculate AUC
predicted_probabilities_f <- predict(final_model, heart_data, type = "response")
roc_curve_f <- roc(true_values, predicted_probabilities_f)
plot(roc_curve_f, main = "ROC Curve")

auc_value_f <- auc(roc_curve_f)
print(auc_value_f)

## Area under the curve: 0.8035

Final VIF Values:

# Multicollinearity Check with VIF
vif_values_f <- vif(final_model)
print(vif_values_f)

##                      age                  anaemia creatinine_phosphokinase 
##                 1.085777                 1.076613                 1.088111 
##        ejection_fraction      high_blood_pressure         serum_creatinine 
##                 1.096840                 1.022362                 1.050969 
##             serum_sodium 
##                 1.057987

Final Cross Validation:

# Model Validation with Cross-Validation
cv_results_f <- cv.glm(heart_data, final_model, K = 10)
print(cv_results_f$delta)

## [1] 0.1759261 0.1762559

# Compute Final Odds Ratios
odds_ratios_f <- exp(coef(final_model))
cat("\n--- Final Odds Ratios ---\n")

## 
## --- Final Odds Ratios ---

print(odds_ratios_f)

##              (Intercept)                      age                 anaemia1 
##              113.2768864                1.0544554                1.5386917 
## creatinine_phosphokinase        ejection_fraction     high_blood_pressure1 
##                1.0002674                0.9343526                1.5850741 
##         serum_creatinine             serum_sodium 
##                1.9384750                0.9447417

# Compute Final AIC Value
aic_value_f <- AIC(final_model)
cat("\n--- Final AIC Value ---\n")

## 
## --- Final AIC Value ---

print(aic_value_f)

## [1] 312.0522

# Final Predicted Values and Confusion Matrix
predicted_values_f <- ifelse(predict(final_model, heart_data, type = "response") > 0.5, 1, 0)
true_values <- heart_data$DEATH_EVENT  # Replace DEATH_EVENT with actual outcome variable if different

cat("\n--- Final Confusion Matrix ---\n")

## 
## --- Final Confusion Matrix ---

library(caret)  # Ensure 'caret' package is loaded
conf_matrix_f <- confusionMatrix(as.factor(predicted_values_f), as.factor(true_values))
print(conf_matrix_f)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 182  51
##          1  21  45
##                                           
##                Accuracy : 0.7592          
##                  95% CI : (0.7066, 0.8066)
##     No Information Rate : 0.6789          
##     P-Value [Acc > NIR] : 0.0014572       
##                                           
##                   Kappa : 0.3981          
##                                           
##  Mcnemar's Test P-Value : 0.0006316       
##                                           
##             Sensitivity : 0.8966          
##             Specificity : 0.4688          
##          Pos Pred Value : 0.7811          
##          Neg Pred Value : 0.6818          
##              Prevalence : 0.6789          
##          Detection Rate : 0.6087          
##    Detection Prevalence : 0.7793          
##       Balanced Accuracy : 0.6827          
##                                           
##        'Positive' Class : 0               
##

# Final Predicted Probabilities and ROC Curve
cat("\n--- Final ROC Curve and AUC ---\n")

## 
## --- Final ROC Curve and AUC ---

library(pROC)  # Ensure 'pROC' package is loaded
predicted_probabilities_f <- predict(final_model, heart_data, type = "response")
roc_curve_f <- roc(true_values, predicted_probabilities_f)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(roc_curve_f, main = "ROC Curve")  # Plot the ROC curve

auc_value_f <- auc(roc_curve_f)  # Calculate AUC
cat("AUC Value: ", auc_value_f, "\n")

## AUC Value:  0.8034688

# Compute Final VIF Values
cat("\n--- Final VIF Values ---\n")

## 
## --- Final VIF Values ---

library(car)  # Ensure 'car' package is loaded
vif_values_f <- vif(final_model)
print(vif_values_f)

##                      age                  anaemia creatinine_phosphokinase 
##                 1.085777                 1.076613                 1.088111 
##        ejection_fraction      high_blood_pressure         serum_creatinine 
##                 1.096840                 1.022362                 1.050969 
##             serum_sodium 
##                 1.057987

comparison_table <- data.frame(
  Metric = c("AIC", "AUC", "VIF (Average)"),
  `Before Stepwise` = c(
    aic_value,
    auc_value,
    mean(vif_values)
  ),
  `After Stepwise` = c(
    aic_value_f,
    auc_value_f,
    mean(vif_values_f)
  )
)

# Print the table with kable
kable(comparison_table, caption = "Comparison of Model Metrics Before and After Stepwise Selection")

Comparison of Model Metrics Before and After Stepwise Selection
Metric	Before.Stepwise	After.Stepwise
AIC	318.2807066	312.0522349
AUC	0.8085489	0.8034688
VIF (Average)	1.1273001	1.0683801

Comparison of Model Metrics Before and After Stepwise Selection
Metric	Before.Stepwise	After.Stepwise
AIC	318.2807066	312.0522349
AUC	0.8085489	0.8034688
VIF (Average)	1.1273001	1.0683801
Cross-Validation Delta (Raw)	0.1796879	0.1759261
Cross-Validation Delta (Bias-Corrected)	0.1786616	0.1762559

We continue our analysis with a decision tree analysis to validate our results prior:

Key Decision Tree Metrics
Term	Definition
Complexity Parameter	A parameter that controls the size of the decision tree by penalizing more complex models. Smaller values lead to larger trees, while larger values prune the tree.
1-SE Rule	The rule that selects the simplest model within one standard error of the minimum cross-validated error. It helps to avoid overfitting.
Performance	A general measure of the effectiveness of the model, often based on metrics like accuracy, precision, recall, and F1 score.
Accuracy	The proportion of correctly predicted instances out of the total instances. Measures how often the model is correct.
Precision	The proportion of true positive predictions out of all positive predictions made by the model. Measures accuracy in predicting positive cases.
Gini Impurity	A measure of how often instances are classified correctly by splitting on a variable. It focuses on purity within nodes.
F1 Score	A metric combining precision and recall. It is the harmonic mean of precision and recall, providing a balanced score for the model’s accuracy.

Via Gini Impurity:

# Fit the decision tree model using Gini impurity
tree_model <- rpart(DEATH_EVENT ~ ., data = heart_data, method = "class", parms = list(split = "gini"))

# Plot the tree
rpart.plot(tree_model, main = "Decision Tree for DEATH_EVENT")

Complexity Parameter:

# Display complexity parameter (CP) table
printcp(tree_model)

## 
## Classification tree:
## rpart(formula = DEATH_EVENT ~ ., data = heart_data, method = "class", 
##     parms = list(split = "gini"))
## 
## Variables actually used in tree construction:
## [1] age               ejection_fraction platelets         serum_creatinine 
## [5] serum_sodium     
## 
## Root node error: 96/299 = 0.32107
## 
## n= 299 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.229167      0   1.00000 1.00000 0.084096
## 2 0.083333      1   0.77083 0.80208 0.078762
## 3 0.062500      2   0.68750 0.80208 0.078762
## 4 0.015625      3   0.62500 0.77083 0.077732
## 5 0.010000      8   0.54167 0.83333 0.079737

Optimal Parameter:

# Choose the optimal CP value based on 1-SE rule
optimal_cp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]), "CP"]

# Prune the tree using the optimal CP value
pruned_tree <- prune(tree_model, cp = optimal_cp)

# Plot the pruned tree
rpart.plot(pruned_tree, main = "Pruned Decision Tree for DEATH_EVENT")

Performance of Pruned Tree:

# Predict on the training data
predicted <- predict(pruned_tree, heart_data, type = "class")

# Generate confusion matrix to calculate performance metrics
confusion_matrix <- table(Predicted = predicted, Actual = heart_data$DEATH_EVENT)

# Calculate accuracy, precision, recall, and F1 score
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
f1_score <- 2 * ((precision * recall) / (precision + recall))

# Print results
cat("Accuracy:", accuracy, "\n")

## Accuracy: 0.7993311

cat("Precision:", precision, "\n")

## Precision: 0.6730769

cat("Recall:", recall, "\n")

## Recall: 0.7291667

cat("F1 Score:", f1_score, "\n")

## F1 Score: 0.7

Data Splitting:

# Split 1: Age > 60 and Serum Creatinine > 1.4
heart_data_age60_creat1.4 <- subset(heart_data, age > 60 & serum_creatinine > 1.4)

# Split 2: Age > 60 and Serum Creatinine < 1.4
heart_data_age60_creat_less1.4 <- subset(heart_data, age > 60 & serum_creatinine < 1.4)

# Split 3: Age < 60 and Serum Creatinine > 1.4
heart_data_age_less60_creat1.4 <- subset(heart_data, age < 60 & serum_creatinine > 1.4)

# Split 4: Age < 60 and Serum Creatinine < 1.4
heart_data_age_less60_creat_less1.4 <- subset(heart_data, age < 60 & serum_creatinine < 1.4)

# Save each subset as separate files if needed
write.csv(heart_data_age60_creat1.4, "heart_data_age60_creat1.4.csv", row.names = FALSE)
write.csv(heart_data_age60_creat_less1.4, "heart_data_age60_creat_less1.4.csv", row.names = FALSE)
write.csv(heart_data_age_less60_creat1.4, "heart_data_age_less60_creat1.4.csv", row.names = FALSE)
write.csv(heart_data_age_less60_creat_less1.4, "heart_data_age_less60_creat_less1.4.csv", row.names = FALSE)

Final Splits and Testing:

# Subset 1: Age > 60 and Serum Creatinine > 1.4
heart_data_age60_creat1.4 <- subset(heart_data, age > 60 & serum_creatinine > 1.4)
model_age60_creat1.4 <- glm(DEATH_EVENT ~ ., data = heart_data_age60_creat1.4, family = binomial)
summary_age60_creat1.4 <- summary(model_age60_creat1.4)

# Extract significant variables (p < 0.05)
significant_vars_age60_creat1.4 <- summary_age60_creat1.4$coefficients[summary_age60_creat1.4$coefficients[, "Pr(>|z|)"] < 0.05, ]
if (nrow(significant_vars_age60_creat1.4) == 0) {
  significant_vars_age60_creat1.4 <- data.frame(Variable = "None", Estimate = NA, `Pr(>|z|)` = NA)
} else {
  significant_vars_age60_creat1.4 <- as.data.frame(significant_vars_age60_creat1.4)
  significant_vars_age60_creat1.4$Variable <- rownames(significant_vars_age60_creat1.4)
}

# Subset 2: Age > 60 and Serum Creatinine < 1.4
heart_data_age60_creat_less1.4 <- subset(heart_data, age > 60 & serum_creatinine < 1.4)
model_age60_creat_less1.4 <- glm(DEATH_EVENT ~ ., data = heart_data_age60_creat_less1.4, family = binomial)
summary_age60_creat_less1.4 <- summary(model_age60_creat_less1.4)

# Extract significant variables (p < 0.05)
significant_vars_age60_creat_less1.4 <- summary_age60_creat_less1.4$coefficients[summary_age60_creat_less1.4$coefficients[, "Pr(>|z|)"] < 0.05, ]
if (nrow(significant_vars_age60_creat_less1.4) == 0) {
  significant_vars_age60_creat_less1.4 <- data.frame(Variable = "None", Estimate = NA, `Pr(>|z|)` = NA)
} else {
  significant_vars_age60_creat_less1.4 <- as.data.frame(significant_vars_age60_creat_less1.4)
  significant_vars_age60_creat_less1.4$Variable <- rownames(significant_vars_age60_creat_less1.4)
}

# Subset 3: Age < 60 and Serum Creatinine > 1.4
heart_data_age_less60_creat1.4 <- subset(heart_data, age < 60 & serum_creatinine > 1.4)
model_age_less60_creat1.4 <- glm(DEATH_EVENT ~ ., data = heart_data_age_less60_creat1.4, family = binomial)
summary_age_less60_creat1.4 <- summary(model_age_less60_creat1.4)

# Extract significant variables (p < 0.05)
significant_vars_age_less60_creat1.4 <- summary_age_less60_creat1.4$coefficients[summary_age_less60_creat1.4$coefficients[, "Pr(>|z|)"] < 0.05, ]
if (nrow(significant_vars_age_less60_creat1.4) == 0) {
  significant_vars_age_less60_creat1.4 <- data.frame(Variable = "None", Estimate = NA, `Pr(>|z|)` = NA)
} else {
  significant_vars_age_less60_creat1.4 <- as.data.frame(significant_vars_age_less60_creat1.4)
  significant_vars_age_less60_creat1.4$Variable <- rownames(significant_vars_age_less60_creat1.4)
}

# Subset 4: Age < 60 and Serum Creatinine < 1.4
heart_data_age_less60_creat_less1.4 <- subset(heart_data, age < 60 & serum_creatinine < 1.4)
model_age_less60_creat_less1.4 <- glm(DEATH_EVENT ~ ., data = heart_data_age_less60_creat_less1.4, family = binomial)
summary_age_less60_creat_less1.4 <- summary(model_age_less60_creat_less1.4)

# Extract significant variables (p < 0.05)
significant_vars_age_less60_creat_less1.4 <- summary_age_less60_creat_less1.4$coefficients[summary_age_less60_creat_less1.4$coefficients[, "Pr(>|z|)"] < 0.05, ]
if (nrow(significant_vars_age_less60_creat_less1.4) == 0) {
  significant_vars_age_less60_creat_less1.4 <- data.frame(Variable = "None", Estimate = NA, `Pr(>|z|)` = NA)
} else {
  significant_vars_age_less60_creat_less1.4 <- as.data.frame(significant_vars_age_less60_creat_less1.4)
  significant_vars_age_less60_creat_less1.4$Variable <- rownames(significant_vars_age_less60_creat_less1.4)
}

Data Summary:

Significant Variables by Subset Data (p < 0.05)
Split Datasets	Significant Variables
Age > 60, Serum Creatinine > 1.4	None
Age > 60, Serum Creatinine < 1.4	age, serum_creatinine
Age < 60, Serum Creatinine > 1.4	None
Age < 60, Serum Creatinine < 1.4	creatinine_phosphokinase, ejection_fraction

3D Plot:

Interactive 3D plot: drag to rotate, scroll to zoom.

CVD Predictors Analysis

Ivan Jimenez- Acosta

2024-11-05