Importing dataset:
# Importing the dataset
heart_data <- read.csv("C:/Users/63650/Downloads/heart_failure_clinical_records_dataset.csv")
We choose to exclude the follow up period (time) variable, as this is highly correlated to the DEATH_EVENT outcome variable, as by common sense, it will be obvious if a patient is alive or not from their follow- up visit or lack thereof.
heart_data <- heart_data[, !(names(heart_data) %in% "time")]
Hypotheses Outline:
# Load necessary library
library(knitr)
# Create the table data as a data frame
table_data <- data.frame(
Hypothesis_age = c(
"We predict ages over 60 and serum creatinine exceeding 1.4 mg/dL to be reliable predictors of CVD.",
"We predict ages under 60 and serum creatinine exceeding 1.4 mg/dL to be reliable predictors of CVD."
),
Hypothesis_creatinine = c(
"We predict ages over 60 and serum creatinine less than 1.4 mg/dL to NOT be reliable predictors of CVD.",
"We predict ages under 60 and serum creatinine less than 1.4 mg/dL to NOT be reliable predictors of CVD."
)
)
# Create the 2x2 table using knitr
kable(table_data, "html", caption = "Hypotheses") %>%
kable_styling(full_width = FALSE, position = "center")
| Hypothesis_age | Hypothesis_creatinine |
|---|---|
| We predict ages over 60 and serum creatinine exceeding 1.4 mg/dL to be reliable predictors of CVD. | We predict ages over 60 and serum creatinine less than 1.4 mg/dL to NOT be reliable predictors of CVD. |
| We predict ages under 60 and serum creatinine exceeding 1.4 mg/dL to be reliable predictors of CVD. | We predict ages under 60 and serum creatinine less than 1.4 mg/dL to NOT be reliable predictors of CVD. |
Checking for missing values:
## age anaemia creatinine_phosphokinase
## 0 0 0
## diabetes ejection_fraction high_blood_pressure
## 0 0 0
## platelets serum_creatinine serum_sodium
## 0 0 0
## sex smoking DEATH_EVENT
## 0 0 0
Outlier classification:
# Outlier detection via IQR
numeric_columns <- select_if(heart_data, is.numeric)
outliers <- lapply(numeric_columns, function(x) x[x < (quantile(x, 0.25) - 1.5 * IQR(x)) | x > (quantile(x, 0.75) + 1.5 * IQR(x))])
outliers
## $age
## numeric(0)
##
## $creatinine_phosphokinase
## [1] 7861 2656 1380 3964 7702 5882 5209 1876 1808 4540 1548 1610 2261 1846 2334
## [16] 2442 3966 1419 1896 1767 2281 2794 2017 2522 2695 1688 1820 2060 2413
##
## $ejection_fraction
## [1] 80 70
##
## $platelets
## [1] 454000 47000 451000 461000 497000 621000 850000 507000 448000 75000
## [11] 70000 73000 481000 504000 62000 533000 25100 451000 51000 543000
## [21] 742000
##
## $serum_creatinine
## [1] 2.7 9.4 4.0 5.8 3.0 3.5 2.3 3.0 4.4 6.8 2.2 2.7 2.3 2.9 2.5 2.3 3.2 3.7 3.4
## [20] 6.1 2.5 2.4 2.5 3.5 9.0 5.0 2.4 2.7 3.8
##
## $serum_sodium
## [1] 116 121 124 113
Proportion of 0 (no death) to 1 (death):
##
## 0 1
## 0.6789298 0.3210702
Metrics as defined in the hypotheses:
# Age Average
mean(heart_data$age)
## [1] 60.83389
# Creatinine Average
mean(heart_data$serum_creatinine)
## [1] 1.39388
# Age Median
median(heart_data$age)
## [1] 60
# Creatinine Median
median(heart_data$serum_creatinine)
## [1] 1.1
Correlation Matrix:
correlation_matrix <- cor(select_if(heart_data, is.numeric))
corrplot::corrplot(correlation_matrix, method = "circle")
| Term | Definition |
|---|---|
| p- value | The probability that the observed results occurred by chance, with values less than 0.05 indicating significance. |
| Odds Ratios | A statistic that expresses the change in odds of an outcome for a one-unit change in a predictor. |
| AIC Value | Akaike Information Criterion, a measure of model quality with lower values indicating better fit. |
| Confusion Matrix | A table used to evaluate the performance of a classification model by comparing predictions with actual results. |
| ROC Curve | Receiver Operating Characteristic curve, a plot that shows the performance of a binary classifier. |
| Multicollinearity | A situation in regression analysis where predictor variables are highly correlated with each other. |
| VIF | Variance Inflation Factor, a measure used to detect multicollinearity in a regression model. |
| Cross Validation | A technique to assess model reliability by training and testing on different subsets of data. |
Model Summary:
logistic_model <- glm(DEATH_EVENT ~ ., data = heart_data, family = binomial)
summary(logistic_model)
##
## Call:
## glm(formula = DEATH_EVENT ~ ., family = binomial, data = heart_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.964e+00 4.601e+00 1.079 0.280625
## age 5.569e-02 1.313e-02 4.241 2.23e-05 ***
## anaemia1 4.179e-01 3.009e-01 1.389 0.164904
## creatinine_phosphokinase 2.905e-04 1.428e-04 2.034 0.041907 *
## diabetes1 1.514e-01 2.974e-01 0.509 0.610644
## ejection_fraction -7.032e-02 1.486e-02 -4.731 2.23e-06 ***
## high_blood_pressure1 4.189e-01 3.061e-01 1.369 0.171092
## platelets -7.094e-07 1.617e-06 -0.439 0.660857
## serum_creatinine 6.619e-01 1.734e-01 3.817 0.000135 ***
## serum_sodium -5.667e-02 3.338e-02 -1.698 0.089558 .
## sex1 -3.990e-01 3.508e-01 -1.137 0.255394
## smoking1 1.356e-01 3.486e-01 0.389 0.697300
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 375.35 on 298 degrees of freedom
## Residual deviance: 294.28 on 287 degrees of freedom
## AIC: 318.28
##
## Number of Fisher Scoring iterations: 5
Odds Ratios:
odds_ratios <- exp(coef(logistic_model))
print(odds_ratios)
## (Intercept) age anaemia1
## 143.2080946 1.0572709 1.5188140
## creatinine_phosphokinase diabetes1 ejection_fraction
## 1.0002906 1.1634708 0.9320936
## high_blood_pressure1 platelets serum_creatinine
## 1.5203456 0.9999993 1.9383955
## serum_sodium sex1 smoking1
## 0.9449089 0.6709813 1.1452113
AIC Value:
aic_value <- AIC(logistic_model)
print(aic_value)
## [1] 318.2807
Confusion Matrix:
# Assuming 'predicted_values' are the model predictions (binary) and 'true_values' are actual outcomes
predicted_values <- ifelse(predict(logistic_model, heart_data, type = "response") > 0.5, 1, 0)
true_values <- heart_data$DEATH_EVENT # replace DEATH_EVENT with the actual outcome variable name if different
conf_matrix <- confusionMatrix(as.factor(predicted_values), as.factor(true_values))
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 183 49
## 1 20 47
##
## Accuracy : 0.7692
## 95% CI : (0.7173, 0.8158)
## No Information Rate : 0.6789
## P-Value [Acc > NIR] : 0.0003752
##
## Kappa : 0.4249
##
## Mcnemar's Test P-Value : 0.0007495
##
## Sensitivity : 0.9015
## Specificity : 0.4896
## Pos Pred Value : 0.7888
## Neg Pred Value : 0.7015
## Prevalence : 0.6789
## Detection Rate : 0.6120
## Detection Prevalence : 0.7759
## Balanced Accuracy : 0.6955
##
## 'Positive' Class : 0
##
ROC Curve:
# Calculate predicted probabilities
predicted_probabilities <- predict(logistic_model, heart_data, type = "response")
roc_curve <- roc(true_values, predicted_probabilities)
plot(roc_curve, main = "ROC Curve")
auc_value <- auc(roc_curve)
print(auc_value)
## Area under the curve: 0.8085
Multicollinearity Check:
# Multicollinearity Check with VIF
vif_values <- vif(logistic_model)
print(vif_values)
## age anaemia creatinine_phosphokinase
## 1.128444 1.091638 1.112243
## diabetes ejection_fraction high_blood_pressure
## 1.056803 1.122656 1.059847
## platelets serum_creatinine serum_sodium
## 1.057122 1.059148 1.073572
## sex smoking
## 1.377926 1.260903
Cross Validation:
# Model Validation with Cross-Validation
cv_results <- cv.glm(heart_data, logistic_model, K = 10)
print(cv_results$delta)
## [1] 0.1767792 0.1759412
Now we attempt the same analysis via stepwise regression to see if our results may be improved:
# 1. Stepwise Selection to Identify Important Variables
stepwise_model <- step(logistic_model, direction = "both") # Both-direction stepwise selection
## Start: AIC=318.28
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + diabetes +
## ejection_fraction + high_blood_pressure + platelets + serum_creatinine +
## serum_sodium + sex + smoking
##
## Df Deviance AIC
## - smoking 1 294.43 316.43
## - platelets 1 294.47 316.47
## - diabetes 1 294.54 316.54
## - sex 1 295.58 317.58
## - high_blood_pressure 1 296.15 318.15
## - anaemia 1 296.22 318.22
## <none> 294.28 318.28
## - serum_sodium 1 297.21 319.21
## - creatinine_phosphokinase 1 298.46 320.46
## - serum_creatinine 1 314.04 336.04
## - age 1 314.06 336.06
## - ejection_fraction 1 320.96 342.96
##
## Step: AIC=316.43
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + diabetes +
## ejection_fraction + high_blood_pressure + platelets + serum_creatinine +
## serum_sodium + sex
##
## Df Deviance AIC
## - platelets 1 294.61 314.61
## - diabetes 1 294.68 314.68
## - sex 1 295.59 315.59
## - anaemia 1 296.28 316.28
## - high_blood_pressure 1 296.29 316.29
## <none> 294.43 316.43
## - serum_sodium 1 297.35 317.35
## + smoking 1 294.28 318.28
## - creatinine_phosphokinase 1 298.51 318.51
## - serum_creatinine 1 314.07 334.07
## - age 1 314.17 334.17
## - ejection_fraction 1 321.18 341.18
##
## Step: AIC=314.61
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + diabetes +
## ejection_fraction + high_blood_pressure + serum_creatinine +
## serum_sodium + sex
##
## Df Deviance AIC
## - diabetes 1 294.83 312.83
## - sex 1 295.66 313.66
## - high_blood_pressure 1 296.38 314.38
## - anaemia 1 296.49 314.49
## <none> 294.61 314.61
## - serum_sodium 1 297.63 315.63
## + platelets 1 294.43 316.43
## + smoking 1 294.47 316.47
## - creatinine_phosphokinase 1 298.62 316.62
## - age 1 314.29 332.29
## - serum_creatinine 1 314.39 332.39
## - ejection_fraction 1 321.53 339.53
##
## Step: AIC=312.83
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + ejection_fraction +
## high_blood_pressure + serum_creatinine + serum_sodium + sex
##
## Df Deviance AIC
## - sex 1 296.05 312.05
## - high_blood_pressure 1 296.58 312.58
## - anaemia 1 296.71 312.71
## <none> 294.83 312.83
## - serum_sodium 1 298.02 314.02
## + diabetes 1 294.61 314.61
## + platelets 1 294.68 314.68
## + smoking 1 294.71 314.71
## - creatinine_phosphokinase 1 298.84 314.84
## - age 1 314.29 330.29
## - serum_creatinine 1 314.46 330.46
## - ejection_fraction 1 321.68 337.68
##
## Step: AIC=312.05
## DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase + ejection_fraction +
## high_blood_pressure + serum_creatinine + serum_sodium
##
## Df Deviance AIC
## <none> 296.05 312.05
## - anaemia 1 298.16 312.16
## - high_blood_pressure 1 298.42 312.42
## + sex 1 294.83 312.83
## - serum_sodium 1 299.06 313.06
## + diabetes 1 295.66 313.66
## - creatinine_phosphokinase 1 299.72 313.72
## + platelets 1 296.01 314.01
## + smoking 1 296.03 314.03
## - age 1 314.73 328.73
## - serum_creatinine 1 315.76 329.76
## - ejection_fraction 1 321.83 335.83
summary(stepwise_model) # View the model summary to see selected variables
##
## Call:
## glm(formula = DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase +
## ejection_fraction + high_blood_pressure + serum_creatinine +
## serum_sodium, family = binomial, data = heart_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7298351 4.5190849 1.047 0.29527
## age 0.0530244 0.0127859 4.147 3.37e-05 ***
## anaemia1 0.4309325 0.2980595 1.446 0.14824
## creatinine_phosphokinase 0.0002674 0.0001405 1.902 0.05713 .
## ejection_fraction -0.0679014 0.0145626 -4.663 3.12e-06 ***
## high_blood_pressure1 0.4606311 0.2992103 1.539 0.12368
## serum_creatinine 0.6619016 0.1711437 3.868 0.00011 ***
## serum_sodium -0.0568437 0.0330681 -1.719 0.08562 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 375.35 on 298 degrees of freedom
## Residual deviance: 296.05 on 291 degrees of freedom
## AIC: 312.05
##
## Number of Fisher Scoring iterations: 5
# Extract variable names, excluding the intercept term
selected_variables <- names(coef(stepwise_model))
selected_variables <- selected_variables[selected_variables != "(Intercept)"]
selected_variables <- selected_variables[selected_variables != "time"]
selected_variables[selected_variables == "high_blood_pressure1"] <- "high_blood_pressure"
selected_variables[selected_variables == "anaemia1"] <- "anaemia"
# Fit the refined model with the selected variables
final_model <- glm(DEATH_EVENT ~ ., data = heart_data[, c(selected_variables, "DEATH_EVENT")], family = binomial)
Final Model Summary:
# Check Model Summary
summary(final_model)
##
## Call:
## glm(formula = DEATH_EVENT ~ ., family = binomial, data = heart_data[,
## c(selected_variables, "DEATH_EVENT")])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7298351 4.5190849 1.047 0.29527
## age 0.0530244 0.0127859 4.147 3.37e-05 ***
## anaemia1 0.4309325 0.2980595 1.446 0.14824
## creatinine_phosphokinase 0.0002674 0.0001405 1.902 0.05713 .
## ejection_fraction -0.0679014 0.0145626 -4.663 3.12e-06 ***
## high_blood_pressure1 0.4606311 0.2992103 1.539 0.12368
## serum_creatinine 0.6619016 0.1711437 3.868 0.00011 ***
## serum_sodium -0.0568437 0.0330681 -1.719 0.08562 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 375.35 on 298 degrees of freedom
## Residual deviance: 296.05 on 291 degrees of freedom
## AIC: 312.05
##
## Number of Fisher Scoring iterations: 5
Final Odds Ratios:
# Calculate Odds Ratios
odds_ratios_f <- exp(coef(final_model))
print(odds_ratios_f)
## (Intercept) age anaemia1
## 113.2768864 1.0544554 1.5386917
## creatinine_phosphokinase ejection_fraction high_blood_pressure1
## 1.0002674 0.9343526 1.5850741
## serum_creatinine serum_sodium
## 1.9384750 0.9447417
Final AIC Value:
# Assess Model Fit with AIC
aic_value_f <- AIC(final_model)
print(aic_value_f)
## [1] 312.0522
Final Confusion Matrix:
# Evaluate Model Performance with Confusion Matrix
predicted_values_f <- ifelse(predict(final_model, heart_data, type = "response") > 0.5, 1, 0)
true_values <- heart_data$DEATH_EVENT
conf_matrix_f <- confusionMatrix(as.factor(predicted_values_f), as.factor(true_values))
print(conf_matrix_f)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 182 51
## 1 21 45
##
## Accuracy : 0.7592
## 95% CI : (0.7066, 0.8066)
## No Information Rate : 0.6789
## P-Value [Acc > NIR] : 0.0014572
##
## Kappa : 0.3981
##
## Mcnemar's Test P-Value : 0.0006316
##
## Sensitivity : 0.8966
## Specificity : 0.4688
## Pos Pred Value : 0.7811
## Neg Pred Value : 0.6818
## Prevalence : 0.6789
## Detection Rate : 0.6087
## Detection Prevalence : 0.7793
## Balanced Accuracy : 0.6827
##
## 'Positive' Class : 0
##
Final ROC Curve:
# Plot ROC Curve and Calculate AUC
predicted_probabilities_f <- predict(final_model, heart_data, type = "response")
roc_curve_f <- roc(true_values, predicted_probabilities_f)
plot(roc_curve_f, main = "ROC Curve")
auc_value_f <- auc(roc_curve_f)
print(auc_value_f)
## Area under the curve: 0.8035
Final VIF Values:
# Multicollinearity Check with VIF
vif_values_f <- vif(final_model)
print(vif_values_f)
## age anaemia creatinine_phosphokinase
## 1.085777 1.076613 1.088111
## ejection_fraction high_blood_pressure serum_creatinine
## 1.096840 1.022362 1.050969
## serum_sodium
## 1.057987
Final Cross Validation:
# Model Validation with Cross-Validation
cv_results_f <- cv.glm(heart_data, final_model, K = 10)
print(cv_results_f$delta)
## [1] 0.1756983 0.1760051
| Metric | Before.Stepwise | After.Stepwise |
|---|---|---|
| AIC | 318.2807066 | 312.0522349 |
| AUC | 0.8085489 | 0.8034688 |
| VIF (Average) | 1.1273001 | 1.0683801 |
| Cross-Validation Delta (Raw) | 0.1767792 | 0.1756983 |
| Cross-Validation Delta (Bias-Corrected) | 0.1759412 | 0.1760051 |
We continue our analysis with a decision tree analysis to validate our results prior:
| Term | Definition |
|---|---|
| Complexity Parameter | A parameter that controls the size of the decision tree by penalizing more complex models. Smaller values lead to larger trees, while larger values prune the tree. |
| 1-SE Rule | The rule that selects the simplest model within one standard error of the minimum cross-validated error. It helps to avoid overfitting. |
| Performance | A general measure of the effectiveness of the model, often based on metrics like accuracy, precision, recall, and F1 score. |
| Accuracy | The proportion of correctly predicted instances out of the total instances. Measures how often the model is correct. |
| Precision | The proportion of true positive predictions out of all positive predictions made by the model. Measures accuracy in predicting positive cases. |
| Gini Impurity | A measure of how often instances are classified correctly by splitting on a variable. It focuses on purity within nodes. |
| F1 Score | A metric combining precision and recall. It is the harmonic mean of precision and recall, providing a balanced score for the model’s accuracy. |
Via Gini Impurity:
# Fit the decision tree model using Gini impurity
tree_model <- rpart(DEATH_EVENT ~ ., data = heart_data, method = "class", parms = list(split = "gini"))
# Plot the tree
rpart.plot(tree_model, main = "Decision Tree for DEATH_EVENT")
Complexity Parameter:
# Display complexity parameter (CP) table
printcp(tree_model)
##
## Classification tree:
## rpart(formula = DEATH_EVENT ~ ., data = heart_data, method = "class",
## parms = list(split = "gini"))
##
## Variables actually used in tree construction:
## [1] age ejection_fraction platelets serum_creatinine
## [5] serum_sodium
##
## Root node error: 96/299 = 0.32107
##
## n= 299
##
## CP nsplit rel error xerror xstd
## 1 0.229167 0 1.00000 1.00000 0.084096
## 2 0.083333 1 0.77083 0.85417 0.080358
## 3 0.062500 2 0.68750 0.89583 0.081533
## 4 0.015625 3 0.62500 0.91667 0.082087
## 5 0.010000 8 0.54167 0.84375 0.080050
Optimal Parameter:
# Choose the optimal CP value based on 1-SE rule
optimal_cp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]), "CP"]
# Prune the tree using the optimal CP value
pruned_tree <- prune(tree_model, cp = optimal_cp)
# Plot the pruned tree
rpart.plot(pruned_tree, main = "Pruned Decision Tree for DEATH_EVENT")
Performance of Pruned Tree:
# Predict on the training data
predicted <- predict(pruned_tree, heart_data, type = "class")
# Generate confusion matrix to calculate performance metrics
confusion_matrix <- table(Predicted = predicted, Actual = heart_data$DEATH_EVENT)
# Calculate accuracy, precision, recall, and F1 score
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
f1_score <- 2 * ((precision * recall) / (precision + recall))
# Print results
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.826087
cat("Precision:", precision, "\n")
## Precision: 0.8142857
cat("Recall:", recall, "\n")
## Recall: 0.59375
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.686747
Data Splitting:
# Split 1: Age > 60 and Serum Creatinine > 1.4
heart_data_age60_creat1.4 <- subset(heart_data, age > 60 & serum_creatinine > 1.4)
# Split 2: Age > 60 and Serum Creatinine < 1.4
heart_data_age60_creat_less1.4 <- subset(heart_data, age > 60 & serum_creatinine < 1.4)
# Split 3: Age < 60 and Serum Creatinine > 1.4
heart_data_age_less60_creat1.4 <- subset(heart_data, age < 60 & serum_creatinine > 1.4)
# Split 4: Age < 60 and Serum Creatinine < 1.4
heart_data_age_less60_creat_less1.4 <- subset(heart_data, age < 60 & serum_creatinine < 1.4)
# Save each subset as separate files if needed
write.csv(heart_data_age60_creat1.4, "heart_data_age60_creat1.4.csv", row.names = FALSE)
write.csv(heart_data_age60_creat_less1.4, "heart_data_age60_creat_less1.4.csv", row.names = FALSE)
write.csv(heart_data_age_less60_creat1.4, "heart_data_age_less60_creat1.4.csv", row.names = FALSE)
write.csv(heart_data_age_less60_creat_less1.4, "heart_data_age_less60_creat_less1.4.csv", row.names = FALSE)
Final Splits and Testing:
# Subset 1: Age > 60 and Serum Creatinine > 1.4
heart_data_age60_creat1.4 <- subset(heart_data, age > 60 & serum_creatinine > 1.4)
model_age60_creat1.4 <- glm(DEATH_EVENT ~ ., data = heart_data_age60_creat1.4, family = binomial)
summary_age60_creat1.4 <- summary(model_age60_creat1.4)
# Extract significant variables (p < 0.05)
significant_vars_age60_creat1.4 <- summary_age60_creat1.4$coefficients[summary_age60_creat1.4$coefficients[, "Pr(>|z|)"] < 0.05, ]
if (nrow(significant_vars_age60_creat1.4) == 0) {
significant_vars_age60_creat1.4 <- data.frame(Variable = "None", Estimate = NA, `Pr(>|z|)` = NA)
} else {
significant_vars_age60_creat1.4 <- as.data.frame(significant_vars_age60_creat1.4)
significant_vars_age60_creat1.4$Variable <- rownames(significant_vars_age60_creat1.4)
}
# Subset 2: Age > 60 and Serum Creatinine < 1.4
heart_data_age60_creat_less1.4 <- subset(heart_data, age > 60 & serum_creatinine < 1.4)
model_age60_creat_less1.4 <- glm(DEATH_EVENT ~ ., data = heart_data_age60_creat_less1.4, family = binomial)
summary_age60_creat_less1.4 <- summary(model_age60_creat_less1.4)
# Extract significant variables (p < 0.05)
significant_vars_age60_creat_less1.4 <- summary_age60_creat_less1.4$coefficients[summary_age60_creat_less1.4$coefficients[, "Pr(>|z|)"] < 0.05, ]
if (nrow(significant_vars_age60_creat_less1.4) == 0) {
significant_vars_age60_creat_less1.4 <- data.frame(Variable = "None", Estimate = NA, `Pr(>|z|)` = NA)
} else {
significant_vars_age60_creat_less1.4 <- as.data.frame(significant_vars_age60_creat_less1.4)
significant_vars_age60_creat_less1.4$Variable <- rownames(significant_vars_age60_creat_less1.4)
}
# Subset 3: Age < 60 and Serum Creatinine > 1.4
heart_data_age_less60_creat1.4 <- subset(heart_data, age < 60 & serum_creatinine > 1.4)
model_age_less60_creat1.4 <- glm(DEATH_EVENT ~ ., data = heart_data_age_less60_creat1.4, family = binomial)
summary_age_less60_creat1.4 <- summary(model_age_less60_creat1.4)
# Extract significant variables (p < 0.05)
significant_vars_age_less60_creat1.4 <- summary_age_less60_creat1.4$coefficients[summary_age_less60_creat1.4$coefficients[, "Pr(>|z|)"] < 0.05, ]
if (nrow(significant_vars_age_less60_creat1.4) == 0) {
significant_vars_age_less60_creat1.4 <- data.frame(Variable = "None", Estimate = NA, `Pr(>|z|)` = NA)
} else {
significant_vars_age_less60_creat1.4 <- as.data.frame(significant_vars_age_less60_creat1.4)
significant_vars_age_less60_creat1.4$Variable <- rownames(significant_vars_age_less60_creat1.4)
}
# Subset 4: Age < 60 and Serum Creatinine < 1.4
heart_data_age_less60_creat_less1.4 <- subset(heart_data, age < 60 & serum_creatinine < 1.4)
model_age_less60_creat_less1.4 <- glm(DEATH_EVENT ~ ., data = heart_data_age_less60_creat_less1.4, family = binomial)
summary_age_less60_creat_less1.4 <- summary(model_age_less60_creat_less1.4)
# Extract significant variables (p < 0.05)
significant_vars_age_less60_creat_less1.4 <- summary_age_less60_creat_less1.4$coefficients[summary_age_less60_creat_less1.4$coefficients[, "Pr(>|z|)"] < 0.05, ]
if (nrow(significant_vars_age_less60_creat_less1.4) == 0) {
significant_vars_age_less60_creat_less1.4 <- data.frame(Variable = "None", Estimate = NA, `Pr(>|z|)` = NA)
} else {
significant_vars_age_less60_creat_less1.4 <- as.data.frame(significant_vars_age_less60_creat_less1.4)
significant_vars_age_less60_creat_less1.4$Variable <- rownames(significant_vars_age_less60_creat_less1.4)
}
Data Summary:
| Split Datasets | Significant Variables |
|---|---|
| Age > 60, Serum Creatinine > 1.4 | None |
| Age > 60, Serum Creatinine < 1.4 | age, serum_creatinine |
| Age < 60, Serum Creatinine > 1.4 | None |
| Age < 60, Serum Creatinine < 1.4 | creatinine_phosphokinase, ejection_fraction |