R Markdown
# Load required libraries
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
library(ggplot2)
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(yardstick)
## Warning: package 'yardstick' was built under R version 4.3.3
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:yardstick':
##
## accuracy
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Load datasets
getwd()
## [1] "C:/Users/tsapa/Downloads"
setwd("C:/Users/tsapa/Downloads")
train_data <- read.csv("train2.csv")
test_data <- read.csv("test2.csv")
################################################################################
df_frmgham2 <- read.csv("frmgham2.csv")
# Display structure and summary
str(df_frmgham2)
## 'data.frame': 11627 obs. of 39 variables:
## $ RANDID : int 2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
## $ SEX : int 1 1 2 2 2 1 1 2 2 2 ...
## $ TOTCHOL : int 195 209 250 260 237 245 283 225 232 285 ...
## $ AGE : int 39 52 46 52 58 48 54 61 67 46 ...
## $ SYSBP : num 106 121 121 105 108 ...
## $ DIABP : num 70 66 81 69.5 66 80 89 95 109 84 ...
## $ CURSMOKE: int 0 0 0 0 0 1 1 1 1 1 ...
## $ CIGPDAY : int 0 0 0 0 0 20 30 30 20 23 ...
## $ BMI : num 27 NA 28.7 29.4 28.5 ...
## $ DIABETES: int 0 0 0 0 0 0 0 0 0 0 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE: int 80 69 95 80 80 75 75 65 60 85 ...
## $ GLUCOSE : int 77 92 76 86 71 70 87 103 89 85 ...
## $ educ : int 4 4 2 2 2 1 1 3 3 3 ...
## $ PREVCHD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVAP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVMI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVSTRK: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVHYP : int 0 0 0 0 0 0 0 1 1 0 ...
## $ TIME : int 0 4628 0 2156 4344 0 2199 0 1977 0 ...
## $ PERIOD : int 1 3 1 2 3 1 2 1 2 1 ...
## $ HDLC : int NA 31 NA NA 54 NA NA NA NA NA ...
## $ LDLC : int NA 178 NA NA 141 NA NA NA NA NA ...
## $ DEATH : int 0 0 0 0 0 0 0 1 1 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 1 1 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 1 1 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 1 1 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 0 1 1 0 ...
## $ CVD : int 1 1 0 0 0 0 0 1 1 0 ...
## $ HYPERTEN: int 0 0 0 0 0 0 0 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEMI : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEMIFC: int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMECHD : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMESTRK: int 8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ TIMECVD : int 6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ TIMEDTH : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEHYP : int 8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...
# Ensure required libraries are loaded
library(dplyr)
library(DataExplorer) # For plot_missing()
## Warning: package 'DataExplorer' was built under R version 4.3.3
library(psych) # For describe()
## Warning: package 'psych' was built under R version 4.3.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Plot missing values
plot_missing(df_frmgham2)

# Describe the data
df_frmgham2 %>%
dplyr::select(-RANDID) %>%
psych::describe() %>%
as.data.frame() %>% # Convert describe() output to a data frame for further manipulation
dplyr::select(-c(trimmed, mad, skew, kurtosis)) # Remove unwanted columns
## vars n mean sd median min max range
## SEX 1 11627 1.568074e+00 0.4953655 2.00 1.00 2.0 1.00
## TOTCHOL 2 11218 2.411624e+02 45.3680304 238.00 107.00 696.0 589.00
## AGE 3 11627 5.479281e+01 9.5642992 54.00 32.00 81.0 49.00
## SYSBP 4 11627 1.363241e+02 22.7986248 132.00 83.50 295.0 211.50
## DIABP 5 11627 8.303776e+01 11.6601443 82.00 30.00 150.0 120.00
## CURSMOKE 6 11627 4.325277e-01 0.4954479 0.00 0.00 1.0 1.00
## CIGPDAY 7 11548 8.250346e+00 12.1868882 0.00 0.00 90.0 90.00
## BMI 8 11575 2.587735e+01 4.1026400 25.48 14.43 56.8 42.37
## DIABETES 9 11627 4.558356e-02 0.2085892 0.00 0.00 1.0 1.00
## BPMEDS 10 11034 8.555374e-02 0.2797166 0.00 0.00 1.0 1.00
## HEARTRTE 11 11621 7.678152e+01 12.4633586 75.00 37.00 220.0 183.00
## GLUCOSE 12 10187 8.412487e+01 24.9937812 80.00 39.00 478.0 439.00
## educ 13 11332 1.990205e+00 1.0274630 2.00 1.00 4.0 3.00
## PREVCHD 14 11627 7.241765e-02 0.2591893 0.00 0.00 1.0 1.00
## PREVAP 15 11627 5.392621e-02 0.2258817 0.00 0.00 1.0 1.00
## PREVMI 16 11627 3.216651e-02 0.1764497 0.00 0.00 1.0 1.00
## PREVSTRK 17 11627 1.307302e-02 0.1135924 0.00 0.00 1.0 1.00
## PREVHYP 18 11627 4.596199e-01 0.4983882 0.00 0.00 1.0 1.00
## TIME 19 11627 1.957019e+03 1758.7769265 2156.00 0.00 4854.0 4854.00
## PERIOD 20 11627 1.899286e+00 0.8074072 2.00 1.00 3.0 2.00
## HDLC 21 3027 4.936472e+01 15.6266689 48.00 10.00 189.0 179.00
## LDLC 22 3026 1.764670e+02 46.8633928 173.00 20.00 565.0 545.00
## DEATH 23 11627 3.033457e-01 0.4597230 0.00 0.00 1.0 1.00
## ANGINA 24 11627 1.635848e-01 0.3699143 0.00 0.00 1.0 1.00
## HOSPMI 25 11627 9.925174e-02 0.2990126 0.00 0.00 1.0 1.00
## MI_FCHD 26 11627 1.537800e-01 0.3607532 0.00 0.00 1.0 1.00
## ANYCHD 27 11627 2.716092e-01 0.4448086 0.00 0.00 1.0 1.00
## STROKE 28 11627 9.125312e-02 0.2879811 0.00 0.00 1.0 1.00
## CVD 29 11627 2.493334e-01 0.4326458 0.00 0.00 1.0 1.00
## HYPERTEN 30 11627 7.432700e-01 0.4368480 1.00 0.00 1.0 1.00
## TIMEAP 31 11627 7.241557e+03 2477.7800095 8766.00 0.00 8766.0 8766.00
## TIMEMI 32 11627 7.593847e+03 2136.7302846 8766.00 0.00 8766.0 8766.00
## TIMEMIFC 33 11627 7.543037e+03 2192.1203114 8766.00 0.00 8766.0 8766.00
## TIMECHD 34 11627 7.008154e+03 2641.3445127 8766.00 0.00 8766.0 8766.00
## TIMESTRK 35 11627 7.660880e+03 2011.0770914 8766.00 0.00 8766.0 8766.00
## TIMECVD 36 11627 7.166083e+03 2541.6684767 8766.00 0.00 8766.0 8766.00
## TIMEDTH 37 11627 7.854103e+03 1788.3696229 8766.00 26.00 8766.0 8740.00
## TIMEHYP 38 11627 3.598956e+03 3464.1646589 2429.00 0.00 8766.0 8766.00
## se
## SEX 0.004594010
## TOTCHOL 0.428343527
## AGE 0.088699122
## SYSBP 0.211433995
## DIABP 0.108135947
## CURSMOKE 0.004594774
## CIGPDAY 0.113406889
## BMI 0.038133170
## DIABETES 0.001934452
## BPMEDS 0.002662881
## HEARTRTE 0.115614775
## GLUCOSE 0.247633166
## educ 0.009651902
## PREVCHD 0.002403717
## PREVAP 0.002094823
## PREVMI 0.001636391
## PREVSTRK 0.001053453
## PREVHYP 0.004622042
## TIME 16.310862371
## PERIOD 0.007487879
## HDLC 0.284027374
## LDLC 0.851920886
## DEATH 0.004263462
## ANGINA 0.003430577
## HOSPMI 0.002773037
## MI_FCHD 0.003345618
## ANYCHD 0.004125146
## STROKE 0.002670731
## CVD 0.004012348
## HYPERTEN 0.004051319
## TIMEAP 22.978882718
## TIMEMI 19.815994326
## TIMEMIFC 20.329680337
## TIMECHD 24.495776680
## TIMESTRK 18.650689102
## TIMECVD 23.571383097
## TIMEDTH 16.585304452
## TIMEHYP 32.126594414
# Count unique observations in RANDID
unique_randids <- df_frmgham2 %>%
distinct(RANDID) %>%
nrow()
print(unique_randids)
## [1] 4434
# Print first and last few rows
head(df_frmgham2)
## RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY BMI DIABETES BPMEDS
## 1 2448 1 195 39 106.0 70.0 0 0 26.97 0 0
## 2 2448 1 209 52 121.0 66.0 0 0 NA 0 0
## 3 6238 2 250 46 121.0 81.0 0 0 28.73 0 0
## 4 6238 2 260 52 105.0 69.5 0 0 29.43 0 0
## 5 6238 2 237 58 108.0 66.0 0 0 28.50 0 0
## 6 9428 1 245 48 127.5 80.0 1 20 25.34 0 0
## HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME PERIOD HDLC
## 1 80 77 4 0 0 0 0 0 0 1 NA
## 2 69 92 4 0 0 0 0 0 4628 3 31
## 3 95 76 2 0 0 0 0 0 0 1 NA
## 4 80 86 2 0 0 0 0 0 2156 2 NA
## 5 80 71 2 0 0 0 0 0 4344 3 54
## 6 75 70 1 0 0 0 0 0 0 1 NA
## LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN TIMEAP TIMEMI
## 1 NA 0 0 1 1 1 0 1 0 8766 6438
## 2 178 0 0 1 1 1 0 1 0 8766 6438
## 3 NA 0 0 0 0 0 0 0 0 8766 8766
## 4 NA 0 0 0 0 0 0 0 0 8766 8766
## 5 141 0 0 0 0 0 0 0 0 8766 8766
## 6 NA 0 0 0 0 0 0 0 0 8766 8766
## TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 1 6438 6438 8766 6438 8766 8766
## 2 6438 6438 8766 6438 8766 8766
## 3 8766 8766 8766 8766 8766 8766
## 4 8766 8766 8766 8766 8766 8766
## 5 8766 8766 8766 8766 8766 8766
## 6 8766 8766 8766 8766 8766 8766
tail(df_frmgham2)
## RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY BMI DIABETES
## 11622 9998212 1 185 40 141 98 0 0 25.60 0
## 11623 9998212 1 173 46 126 82 0 0 19.17 0
## 11624 9998212 1 153 52 143 89 0 0 25.74 0
## 11625 9999312 2 196 39 133 86 1 30 20.91 0
## 11626 9999312 2 240 46 138 79 1 20 26.39 0
## 11627 9999312 2 NA 50 147 96 1 10 24.19 0
## BPMEDS HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME
## 11622 0 67 72 3 0 0 0 0 1 0
## 11623 0 70 NA 3 0 0 0 0 1 2333
## 11624 0 65 72 3 0 0 0 0 1 4538
## 11625 0 85 80 3 0 0 0 0 0 0
## 11626 0 90 83 3 0 0 0 0 0 2390
## 11627 0 94 NA 3 0 0 0 0 1 4201
## PERIOD HDLC LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN
## 11622 1 NA NA 0 0 0 0 0 0 0 1
## 11623 2 NA NA 0 0 0 0 0 0 0 1
## 11624 3 30 123 0 0 0 0 0 0 0 1
## 11625 1 NA NA 0 0 0 0 0 0 0 1
## 11626 2 NA NA 0 0 0 0 0 0 0 1
## 11627 3 NA NA 0 0 0 0 0 0 0 1
## TIMEAP TIMEMI TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 11622 8766 8766 8766 8766 8766 8766 8766 0
## 11623 8766 8766 8766 8766 8766 8766 8766 0
## 11624 8766 8766 8766 8766 8766 8766 8766 0
## 11625 8766 8766 8766 8766 8766 8766 8766 4201
## 11626 8766 8766 8766 8766 8766 8766 8766 4201
## 11627 8766 8766 8766 8766 8766 8766 8766 4201
test_data[] <- lapply(test_data, function(x) if (is.integer(x)) as.numeric(x) else x)
train_data[] <- lapply(train_data, function(x) if (is.integer(x)) as.numeric(x) else x)
test_data[] <- lapply(test_data, function(x) if (is.character(x)) as.factor(x) else x)
train_data[] <- lapply(train_data, function(x) if (is.character(x)) as.factor(x) else x)
train_data$CVD <- factor(train_data$CVD, levels = c(0, 1))
test_data$CVD <- factor(test_data$CVD, levels = c(0, 1))
train_data$BPMEDS <- factor(train_data$BPMEDS, levels = c(0, 1))
test_data$BPMEDS <- factor(test_data$BPMEDS, levels = c(0, 1))
# Move column 'B' to the end
df_frmgham2 <- df_frmgham2[, c(setdiff(names(df_frmgham2), "CVD"), "CVD")]
test_data <- test_data[, c(setdiff(names(test_data), "CVD"), "CVD")]
train_data<- train_data[, c(setdiff(names(train_data), "CVD"), "CVD")]
############################################################
# Ensure proper formatting of the target variable
train_data$CVD <- factor(train_data$CVD, levels = c(0, 1), labels = c("Class0", "Class1"))
test_data$CVD <- factor(test_data$CVD, levels = c(0, 1), labels = c("Class0", "Class1"))
# Identify and remove constant features
constant_features_train <- colnames(train_data)[apply(train_data[, -ncol(train_data)], 2, var) == 0]
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
train_data <- train_data[, !colnames(train_data) %in% constant_features_train]
test_data <- test_data[, !colnames(test_data) %in% constant_features_train]
# which(is.na(test_data))
# which(is.na(train_data))
which(is.na(constant_features_train))
## [1] 2 3 4
# Normalize numeric features while preserving CVD
normalize <- function(data) {
numeric_features <- data[, sapply(data, is.numeric)]
mins <- apply(numeric_features, 2, min)
maxs <- apply(numeric_features, 2, max)
normalized_data <- sweep(numeric_features, 2, mins, "-")
normalized_data <- sweep(normalized_data, 2, (maxs - mins), "/")
data[, sapply(data, is.numeric)] <- normalized_data
return(data)
}
#Transform skewness
train_data$CIGPDAY<- log1p(train_data$CIGPDAY)
test_data$CIGPDAY<- log1p(test_data$CIGPDAY)
#Scale
train_data$AGE <- scale(train_data$AGE)
train_data$CIGPDAY <- scale(train_data$CIGPDAY) # Z-score scaling
test_data$AGE <- scale(test_data$AGE)
test_data$CIGPDAY <- scale(test_data$CIGPDAY) # Z-score scaling
#Excude Outliers
# Identify numeric columns
numeric_cols <- sapply(train_data, is.numeric)
# Compute z-scores for numeric columns
z_scores <- scale(train_data[, numeric_cols])
# Find rows where all z-scores are less than 3 in absolute value
rows_to_keep <- apply(abs(z_scores) < 3, 1, all)
# Filter the data to keep only those rows
filtered_train_data <- train_data[rows_to_keep, ]
# Identify numeric columns
numeric_cols <- sapply(test_data, is.numeric)
# Compute z-scores for numeric columns
z_scores <- scale(test_data[, numeric_cols])
# Find rows where all z-scores are less than 3 in absolute value
rows_to_keep <- apply(abs(z_scores) < 3, 1, all)
# Filter the data to keep only those rows
filtered_test_data <-test_data[rows_to_keep, ]
# ONLY the test is affected
test_data <- filtered_test_data
train_data <- filtered_train_data
#Encode Categorical Values
# Function to convert ordered factors to numeric
convert_factors <- function(df) {
for (i in seq_len(ncol(df) - 1)) { # Loop through all columns except the last one
if (is.factor(df[[i]])) {
df[[i]] <- as.numeric(df[[i]]) # Convert ordered factor to numeric
}
}
return(df)
}
# Apply the function to the dataset
test_data <- convert_factors(test_data)
str(train_data)
## 'data.frame': 1493 obs. of 14 variables:
## $ TOTCHOL : num 266 252 208 226 245 260 208 214 283 165 ...
## $ AGE : num [1:1493, 1] 0.584 1.435 -0.874 0.949 0.827 ...
## $ SYSBP : num 116 148 125 110 136 139 127 133 155 123 ...
## $ DIABP : num 70 84 73 65 83 90 66 88 75 70 ...
## $ CIGPDAY : num [1:1493, 1] -0.709 -0.709 -0.709 -0.709 -0.709 ...
## $ BMI : num 18.6 28.1 20.2 25.9 25 ...
## $ BPMEDS : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ HEARTRTE: num 86 76 90 80 63 80 70 75 90 85 ...
## $ GLUCOSE : num 122 98 79 93 76 90 86 81 73 80 ...
## $ educ : num 2 1 1 2 1 2 1 2 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 1 1 2 1 2 2 2 1 ...
## $ Smoker : Factor w/ 2 levels "Not current smoker",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ Diabetic: Factor w/ 2 levels "Diabetic","Non Diabetic": 2 2 2 2 2 2 1 2 1 2 ...
## $ CVD : Factor w/ 2 levels "Class0","Class1": 1 1 1 1 2 1 2 1 2 1 ...
# Apply the function to the dataset
train_data <- convert_factors(train_data)
str(test_data)
## 'data.frame': 496 obs. of 14 variables:
## $ TOTCHOL : num 280 162 206 230 251 216 231 255 216 223 ...
## $ AGE : num [1:496, 1] 0.5813 -0.7623 -0.518 -1.2509 -0.0294 ...
## $ SYSBP : num 168 152 129 142 132 ...
## $ DIABP : num 100 101 85 90.5 77 70 70 98 84 100 ...
## $ CIGPDAY : num [1:496, 1] -0.704 -0.704 2.058 1.813 1.435 ...
## $ BMI : num 25.7 26.4 26.4 24.3 19.3 ...
## $ BPMEDS : num 1 1 1 1 1 1 2 1 1 2 ...
## $ HEARTRTE: num 92 105 84 70 82 72 70 84 77 77 ...
## $ GLUCOSE : num 82 78 69 61 76 49 86 90 70 86 ...
## $ educ : num 1 2 4 3 2 2 2 2 1 1 ...
## $ Gender : num 2 2 2 2 1 2 1 2 1 1 ...
## $ Smoker : num 1 1 2 2 2 2 1 2 2 1 ...
## $ Diabetic: num 2 2 2 2 2 2 2 2 2 2 ...
## $ CVD : Factor w/ 2 levels "Class0","Class1": 1 1 1 2 1 2 1 1 1 1 ...
#Normalize
train_data_normalized <- normalize(train_data[, colnames(train_data) != "CVD"])
train_data_normalized$CVD <- train_data$CVD
test_data_normalized <- normalize(test_data[, colnames(test_data) != "CVD"])
test_data_normalized$CVD <- test_data$CVD
# Recursive Feature Elimination (RFE)
rfe_control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
rfe_results <- rfe(
train_data_normalized[, colnames(train_data_normalized) != "CVD"],
train_data_normalized$CVD,
sizes = c(1:13),
rfeControl = rfe_control
)
cat("\n With RFE")
##
## With RFE
predictors(rfe_results)
## [1] "Gender" "AGE" "Diabetic"
###################################################
### Feature Importance with Random forest
rf_model <- randomForest(
x = train_data_normalized[, colnames(train_data_normalized) != "CVD"],
y = train_data_normalized$CVD,
importance = TRUE
)
#
# Get top 10 features by importance
importance_scores <- importance(rf_model)
top_featuresRF <- rownames(importance_scores)[order(importance_scores[, "MeanDecreaseGini"], decreasing = TRUE)[1:10]]
cat("\n With RF")
##
## With RF
print(top_featuresRF)
## [1] "AGE" "SYSBP" "BMI" "TOTCHOL" "GLUCOSE" "DIABP"
## [7] "HEARTRTE" "Gender" "educ" "CIGPDAY"
###################################################
#Alternative rfe() RFA for parsimonious
#
# # Ensure 'CVD' is the response variable and is a factor
# train_data_normalized$CVD <- as.factor(train_data_normalized$CVD)
#
# # Define the train control for cross-validation
# control <- trainControl(method = "cv", number = 10)
#
# # Recursive Feature Elimination (RFE) control settings
# rfe_control <- rfeControl(functions = caretFuncs, method = "cv", number = 10)
#
# # Recursive Feature Addition using caret
# subset_sizes <- c(1:10) # Define the sizes of subsets of predictors
#
# # Perform RFE
# rfa_results <- rfe(
# train_data_normalized[, colnames(train_data_normalized) != "CVD"], # Exclude response variable
# train_data_normalized$CVD, # Response variable
# sizes = subset_sizes, # Subset sizes to evaluate
# rfeControl = rfe_control
# )
#
# # Get the top predictors
# top_predictorsRFA <- predictors(rfa_results)
# cat("\n With RFA")
# print(top_predictorsRFA)
###################################################
# Correlation based
# Exclude the last column from the correlation matrix
train_dataCorrel <- train_data_normalized
train_dataCorrel$CVD <- as.numeric(train_dataCorrel$CVD )
correlation_matrix <- cor(train_dataCorrel)
target_correlations <- abs(correlation_matrix["CVD", -ncol(train_dataCorrel)])
top_featuresCorrel <- names(sort(target_correlations, decreasing = TRUE)[1:10])
cat("\n With Correlation")
##
## With Correlation
print(top_featuresCorrel)
## [1] "AGE" "Gender" "SYSBP" "Diabetic" "BPMEDS" "DIABP"
## [7] "BMI" "educ" "GLUCOSE" "HEARTRTE"
###################################################
#Lasso Regression
# Convert data to matrix form
x <- as.matrix(train_data_normalized[, colnames(train_data_normalized) != "CVD"])
y <- train_data_normalized$CVD
lasso_model <- cv.glmnet(x, y, alpha = 1, family = "binomial")
coef_lasso <- coef(lasso_model, s = "lambda.min")
top_featuresLasso <- rownames(coef_lasso)[coef_lasso[, 1] != 0]
cat("\n With Lasso")
##
## With Lasso
print(top_featuresLasso)
## [1] "(Intercept)" "TOTCHOL" "AGE" "SYSBP" "CIGPDAY"
## [6] "BMI" "BPMEDS" "GLUCOSE" "Gender" "Smoker"
## [11] "Diabetic"
# Selected features
selected_features <- predictors(rfe_results)
cat("\nSelected Features by RFE:\n")
##
## Selected Features by RFE:
print(selected_features)
## [1] "Gender" "AGE" "Diabetic"
# Subset the training and test datasets
train_data_rfe <- train_data_normalized[, c(selected_features, "CVD")]
test_data_rfe <- test_data_normalized[, colnames(test_data_normalized) %in% c(selected_features, "CVD")]
# Train logistic regression using selected features
cv_control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
logistic_model_rfe <- train(
CVD ~ .,
data = train_data_rfe,
method = "glm",
family = binomial(link = "logit"),
trControl = cv_control,
metric = "ROC"
)
# Extract and rank coefficients
coefficients <- coef(logistic_model_rfe$finalModel)
coefficients <- as.data.frame(coefficients)
coefficients <- coefficients[order(-abs(coefficients[, 1])), , drop = FALSE]
colnames(coefficients) <- "Coefficient"
coefficients$Feature <- rownames(coefficients)
rownames(coefficients) <- NULL
coefficients$LogOdds <- coefficients$Coefficient # Log-odds interpretation
coefficients$Rank <- rank(-abs(coefficients$Coefficient)) # Rank by absolute value
# coefficients
# # Recode Feature Names
# coefficients$Feature <- recode(coefficients$Feature,
# "ANGINA" = "Angina Pectoris",
# "MI_FCHD" = "Myocardial Infarction Fatal Coronary Heart Disease",
# "HOSPMI" = "Hospitalized Myocardial Infarction",
# "DIABP" = "Diastolic Blood Pressure",
# "SYSBP" = "Systolic Blood Pressure",
# "BMI" = "Body Mass Index",
# "TOTCHOL" = "Total Cholesterol",
# "HEARTRTE" = "Heart Rate",
# "ANYCHD" = "Any Fatal Coronary Heart Disease",
# "PREVMI" = "Prevalent Myocardial Infarction",
# "PREVAP" = "Prevalent Angina",
# "PERIOD" = "Examination Cycle",
# "HYPERTEN" = "Hypertensive",
# "PREVSTRK - Stroke History_No.Stroke.History" = "No Stroke History",
# "Diabetic_Non.Diabetic" = "Non-diabetic",
# "BPMEDS" = "Anti-hypertensive Meds",
# "Gender_Male" = "Gender (Male)",
# "STROKE" = "Stroke",
# "DEATH" = "Death",
# "AGE" = "Age",
# "GLUCOSE" = "Glucose"
# )
# Exclude (Intercept) and display full coefficient table
coefficients <- coefficients[coefficients$Feature != "(Intercept)", ]
coefficients <- coefficients[, c("Feature", "Coefficient", "LogOdds", "Rank")]
# Display coefficients table in a clean format
cat("\n Coefficients and Log-Odds Table:\n")
##
## Coefficients and Log-Odds Table:
kable(coefficients, format = "pipe", align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
column_spec(2:3, width = "5em")
|
Feature
|
Coefficient
|
LogOdds
|
Rank
|
1
|
AGE
|
2.990770
|
2.990770
|
1
|
3
|
Diabetic
|
-1.259228
|
-1.259228
|
3
|
4
|
Gender
|
1.247582
|
1.247582
|
4
|
# Feature Importance Plot
top_features <- coefficients %>% top_n(10, wt = abs(Coefficient))
top_features$Feature <- str_wrap(top_features$Feature, width = 25)
feature_importance_plot <- ggplot(top_features, aes(x = reorder(Feature, abs(Coefficient)), y = abs(Coefficient))) +
geom_col(fill = "purple") +
coord_flip() +
labs(
title = "Top 10 Features - Logistic Regression",
x = "Feature",
y = "Absolute Coefficient"
) +
theme_minimal() +
theme(
text = element_text(size = 12, family = "serif"),
plot.title = element_text(hjust = 0.5)
)
# Print and save the feature importance plot
print(feature_importance_plot)

#ggsave("feature_importance_top10.png", plot = feature_importance_plot, width = 10, height = 8, dpi = 300)
# Confusion Matrix Visualization
generate_confusion_matrix <- function(truth, estimate, model_name, data_split) {
truth <- factor(truth, levels = c("Class0", "Class1"))
estimate <- factor(estimate, levels = c("Class0", "Class1"))
conf_matrix <- conf_mat(data.frame(truth = truth, estimate = estimate), truth, estimate)
cm_table <- as.data.frame(as.table(conf_matrix$table))
colnames(cm_table) <- c("Truth", "Prediction", "Freq")
cm_table$Truth <- factor(cm_table$Truth, levels = c("Class0", "Class1"), labels = c("0", "1"))
cm_table$Prediction <- factor(cm_table$Prediction, levels = c("Class0", "Class1"), labels = c("0", "1"))
cm_table$Truth <- factor(cm_table$Truth, levels = rev(levels(cm_table$Truth)))
ggplot(cm_table, aes(x = Prediction, y = Truth, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lavender", high = "purple", name = "Count") +
labs(
title = paste("Confusion Matrix -", model_name, "(", data_split, ")"),
x = "Prediction",
y = "Truth"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14),
legend.position = "none",
panel.grid.major = element_blank()
) +
coord_fixed()
}
# Function to compute performance metrics
compute_metrics <- function(truth, estimate, probabilities) {
cm <- confusionMatrix(estimate, truth, positive = "Class1")
# Extract key metrics
accuracy <- cm$overall["Accuracy"]
precision <- cm$byClass["Precision"]
recall <- cm$byClass["Recall"]
specificity <- cm$byClass["Specificity"]
f1_score <- 2 * (precision * recall) / (precision + recall)
# Compute AUC
auc <- roc(as.numeric(truth) - 1, probabilities)$auc
# Return metrics as a data frame
metrics <- data.frame(
Accuracy = accuracy,
Precision = precision,
Recall = recall,
Specificity = specificity,
F1_Score = f1_score,
AUC = auc
)
return(metrics)
}
# Compute predictions and probabilities
train_probabilities <- predict(logistic_model_rfe, newdata = train_data_rfe, type = "prob")[, "Class1"]
train_predictions <- ifelse(train_probabilities > 0.5, "Class1", "Class0")
test_probabilities <- predict(logistic_model_rfe, newdata = test_data_rfe, type = "prob")[, "Class1"]
test_predictions <- ifelse(test_probabilities > 0.5, "Class1", "Class0")
# Compute metrics for training data
train_metrics <- compute_metrics(
truth = train_data_rfe$CVD,
estimate = factor(train_predictions, levels = c("Class0", "Class1")),
probabilities = train_probabilities
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Compute metrics for test data
test_metrics <- compute_metrics(
truth = test_data_rfe$CVD,
estimate = factor(test_predictions, levels = c("Class0", "Class1")),
probabilities = test_probabilities
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Display performance metrics
cat("\n Performance Metrics (Training Data):\n")
##
## Performance Metrics (Training Data):
print(train_metrics)
## Accuracy Precision Recall Specificity F1_Score AUC
## Accuracy 0.790355 0.5689655 0.2006079 0.9570447 0.2966292 0.7495992
cat("\n Performance Metrics (Test Data):\n")
##
## Performance Metrics (Test Data):
print(test_metrics)
## Accuracy Precision Recall Specificity F1_Score AUC
## Accuracy 0.7943548 0.4705882 0.2424242 0.9319899 0.32 0.7412411
# Confusion Matrices
train_conf_matrix_plot <- generate_confusion_matrix(
truth = train_data_rfe$CVD,
estimate = factor(train_predictions, levels = c("Class0", "Class1")),
model_name = "Logistic Regression",
data_split = "Training Data"
)
test_conf_matrix_plot <- generate_confusion_matrix(
truth = test_data_rfe$CVD,
estimate = factor(test_predictions, levels = c("Class0", "Class1")),
model_name = "Logistic Regression",
data_split = "Test Data"
)
# Display confusion matrix plots
print(train_conf_matrix_plot)

print(test_conf_matrix_plot)

# ROC Curve Plotting
plot_roc_curve <- function(roc_object, data_split, model_name) {
roc_data <- data.frame(
FalsePositiveRate = 1 - roc_object$specificities,
TruePositiveRate = roc_object$sensitivities
)
ggplot(roc_data, aes(x = FalsePositiveRate, y = TruePositiveRate)) +
geom_line(color = "purple", linewidth = 1.2) +
labs(
title = paste("ROC Curve -", model_name, "(", data_split, ")"),
x = "False Positive Rate",
y = "True Positive Rate"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)
)
}
# Generate ROC Data
roc_train <- roc(as.numeric(train_data_rfe$CVD) - 1, train_probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_test <- roc(as.numeric(test_data_rfe$CVD) - 1, test_probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Display ROC Curves
print(plot_roc_curve(roc_train, "Training Set", "Logistic Regression"))

print(plot_roc_curve(roc_test, "Test Set", "Logistic Regression"))
