R Markdown

# Load required libraries
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
library(ggplot2)
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(yardstick)
## Warning: package 'yardstick' was built under R version 4.3.3
## 
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
## 
##     precision, recall, sensitivity, specificity
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:yardstick':
## 
##     accuracy
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Load datasets
getwd()
## [1] "C:/Users/tsapa/Downloads"
setwd("C:/Users/tsapa/Downloads")
train_data <- read.csv("train2.csv")
test_data <- read.csv("test2.csv")

################################################################################
df_frmgham2 <- read.csv("frmgham2.csv")

# Display structure and summary
str(df_frmgham2)
## 'data.frame':    11627 obs. of  39 variables:
##  $ RANDID  : int  2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
##  $ SEX     : int  1 1 2 2 2 1 1 2 2 2 ...
##  $ TOTCHOL : int  195 209 250 260 237 245 283 225 232 285 ...
##  $ AGE     : int  39 52 46 52 58 48 54 61 67 46 ...
##  $ SYSBP   : num  106 121 121 105 108 ...
##  $ DIABP   : num  70 66 81 69.5 66 80 89 95 109 84 ...
##  $ CURSMOKE: int  0 0 0 0 0 1 1 1 1 1 ...
##  $ CIGPDAY : int  0 0 0 0 0 20 30 30 20 23 ...
##  $ BMI     : num  27 NA 28.7 29.4 28.5 ...
##  $ DIABETES: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BPMEDS  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE: int  80 69 95 80 80 75 75 65 60 85 ...
##  $ GLUCOSE : int  77 92 76 86 71 70 87 103 89 85 ...
##  $ educ    : int  4 4 2 2 2 1 1 3 3 3 ...
##  $ PREVCHD : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVAP  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVMI  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVSTRK: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVHYP : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ TIME    : int  0 4628 0 2156 4344 0 2199 0 1977 0 ...
##  $ PERIOD  : int  1 3 1 2 3 1 2 1 2 1 ...
##  $ HDLC    : int  NA 31 NA NA 54 NA NA NA NA NA ...
##  $ LDLC    : int  NA 178 NA NA 141 NA NA NA NA NA ...
##  $ DEATH   : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ ANGINA  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI  : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD  : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ STROKE  : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ CVD     : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ HYPERTEN: int  0 0 0 0 0 0 0 1 1 1 ...
##  $ TIMEAP  : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEMI  : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEMIFC: int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMECHD : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMESTRK: int  8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ TIMECVD : int  6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ TIMEDTH : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEHYP : int  8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...
# Ensure required libraries are loaded
library(dplyr)
library(DataExplorer)  # For plot_missing()
## Warning: package 'DataExplorer' was built under R version 4.3.3
library(psych)         # For describe()
## Warning: package 'psych' was built under R version 4.3.3
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Plot missing values
plot_missing(df_frmgham2)

# Describe the data
df_frmgham2 %>%
  dplyr::select(-RANDID) %>%
  psych::describe() %>%
  as.data.frame() %>%                    # Convert describe() output to a data frame for further manipulation
  dplyr::select(-c(trimmed, mad, skew, kurtosis))  # Remove unwanted columns
##          vars     n         mean           sd  median    min    max   range
## SEX         1 11627 1.568074e+00    0.4953655    2.00   1.00    2.0    1.00
## TOTCHOL     2 11218 2.411624e+02   45.3680304  238.00 107.00  696.0  589.00
## AGE         3 11627 5.479281e+01    9.5642992   54.00  32.00   81.0   49.00
## SYSBP       4 11627 1.363241e+02   22.7986248  132.00  83.50  295.0  211.50
## DIABP       5 11627 8.303776e+01   11.6601443   82.00  30.00  150.0  120.00
## CURSMOKE    6 11627 4.325277e-01    0.4954479    0.00   0.00    1.0    1.00
## CIGPDAY     7 11548 8.250346e+00   12.1868882    0.00   0.00   90.0   90.00
## BMI         8 11575 2.587735e+01    4.1026400   25.48  14.43   56.8   42.37
## DIABETES    9 11627 4.558356e-02    0.2085892    0.00   0.00    1.0    1.00
## BPMEDS     10 11034 8.555374e-02    0.2797166    0.00   0.00    1.0    1.00
## HEARTRTE   11 11621 7.678152e+01   12.4633586   75.00  37.00  220.0  183.00
## GLUCOSE    12 10187 8.412487e+01   24.9937812   80.00  39.00  478.0  439.00
## educ       13 11332 1.990205e+00    1.0274630    2.00   1.00    4.0    3.00
## PREVCHD    14 11627 7.241765e-02    0.2591893    0.00   0.00    1.0    1.00
## PREVAP     15 11627 5.392621e-02    0.2258817    0.00   0.00    1.0    1.00
## PREVMI     16 11627 3.216651e-02    0.1764497    0.00   0.00    1.0    1.00
## PREVSTRK   17 11627 1.307302e-02    0.1135924    0.00   0.00    1.0    1.00
## PREVHYP    18 11627 4.596199e-01    0.4983882    0.00   0.00    1.0    1.00
## TIME       19 11627 1.957019e+03 1758.7769265 2156.00   0.00 4854.0 4854.00
## PERIOD     20 11627 1.899286e+00    0.8074072    2.00   1.00    3.0    2.00
## HDLC       21  3027 4.936472e+01   15.6266689   48.00  10.00  189.0  179.00
## LDLC       22  3026 1.764670e+02   46.8633928  173.00  20.00  565.0  545.00
## DEATH      23 11627 3.033457e-01    0.4597230    0.00   0.00    1.0    1.00
## ANGINA     24 11627 1.635848e-01    0.3699143    0.00   0.00    1.0    1.00
## HOSPMI     25 11627 9.925174e-02    0.2990126    0.00   0.00    1.0    1.00
## MI_FCHD    26 11627 1.537800e-01    0.3607532    0.00   0.00    1.0    1.00
## ANYCHD     27 11627 2.716092e-01    0.4448086    0.00   0.00    1.0    1.00
## STROKE     28 11627 9.125312e-02    0.2879811    0.00   0.00    1.0    1.00
## CVD        29 11627 2.493334e-01    0.4326458    0.00   0.00    1.0    1.00
## HYPERTEN   30 11627 7.432700e-01    0.4368480    1.00   0.00    1.0    1.00
## TIMEAP     31 11627 7.241557e+03 2477.7800095 8766.00   0.00 8766.0 8766.00
## TIMEMI     32 11627 7.593847e+03 2136.7302846 8766.00   0.00 8766.0 8766.00
## TIMEMIFC   33 11627 7.543037e+03 2192.1203114 8766.00   0.00 8766.0 8766.00
## TIMECHD    34 11627 7.008154e+03 2641.3445127 8766.00   0.00 8766.0 8766.00
## TIMESTRK   35 11627 7.660880e+03 2011.0770914 8766.00   0.00 8766.0 8766.00
## TIMECVD    36 11627 7.166083e+03 2541.6684767 8766.00   0.00 8766.0 8766.00
## TIMEDTH    37 11627 7.854103e+03 1788.3696229 8766.00  26.00 8766.0 8740.00
## TIMEHYP    38 11627 3.598956e+03 3464.1646589 2429.00   0.00 8766.0 8766.00
##                    se
## SEX       0.004594010
## TOTCHOL   0.428343527
## AGE       0.088699122
## SYSBP     0.211433995
## DIABP     0.108135947
## CURSMOKE  0.004594774
## CIGPDAY   0.113406889
## BMI       0.038133170
## DIABETES  0.001934452
## BPMEDS    0.002662881
## HEARTRTE  0.115614775
## GLUCOSE   0.247633166
## educ      0.009651902
## PREVCHD   0.002403717
## PREVAP    0.002094823
## PREVMI    0.001636391
## PREVSTRK  0.001053453
## PREVHYP   0.004622042
## TIME     16.310862371
## PERIOD    0.007487879
## HDLC      0.284027374
## LDLC      0.851920886
## DEATH     0.004263462
## ANGINA    0.003430577
## HOSPMI    0.002773037
## MI_FCHD   0.003345618
## ANYCHD    0.004125146
## STROKE    0.002670731
## CVD       0.004012348
## HYPERTEN  0.004051319
## TIMEAP   22.978882718
## TIMEMI   19.815994326
## TIMEMIFC 20.329680337
## TIMECHD  24.495776680
## TIMESTRK 18.650689102
## TIMECVD  23.571383097
## TIMEDTH  16.585304452
## TIMEHYP  32.126594414
# Count unique observations in RANDID
unique_randids <- df_frmgham2 %>% 
  distinct(RANDID) %>% 
  nrow()

print(unique_randids)
## [1] 4434
# Print first and last few rows
head(df_frmgham2)
##   RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
## 1   2448   1     195  39 106.0  70.0        0       0 26.97        0      0
## 2   2448   1     209  52 121.0  66.0        0       0    NA        0      0
## 3   6238   2     250  46 121.0  81.0        0       0 28.73        0      0
## 4   6238   2     260  52 105.0  69.5        0       0 29.43        0      0
## 5   6238   2     237  58 108.0  66.0        0       0 28.50        0      0
## 6   9428   1     245  48 127.5  80.0        1      20 25.34        0      0
##   HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME PERIOD HDLC
## 1       80      77    4       0      0      0        0       0    0      1   NA
## 2       69      92    4       0      0      0        0       0 4628      3   31
## 3       95      76    2       0      0      0        0       0    0      1   NA
## 4       80      86    2       0      0      0        0       0 2156      2   NA
## 5       80      71    2       0      0      0        0       0 4344      3   54
## 6       75      70    1       0      0      0        0       0    0      1   NA
##   LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN TIMEAP TIMEMI
## 1   NA     0      0      1       1      1      0   1        0   8766   6438
## 2  178     0      0      1       1      1      0   1        0   8766   6438
## 3   NA     0      0      0       0      0      0   0        0   8766   8766
## 4   NA     0      0      0       0      0      0   0        0   8766   8766
## 5  141     0      0      0       0      0      0   0        0   8766   8766
## 6   NA     0      0      0       0      0      0   0        0   8766   8766
##   TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 1     6438    6438     8766    6438    8766    8766
## 2     6438    6438     8766    6438    8766    8766
## 3     8766    8766     8766    8766    8766    8766
## 4     8766    8766     8766    8766    8766    8766
## 5     8766    8766     8766    8766    8766    8766
## 6     8766    8766     8766    8766    8766    8766
tail(df_frmgham2)
##        RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES
## 11622 9998212   1     185  40   141    98        0       0 25.60        0
## 11623 9998212   1     173  46   126    82        0       0 19.17        0
## 11624 9998212   1     153  52   143    89        0       0 25.74        0
## 11625 9999312   2     196  39   133    86        1      30 20.91        0
## 11626 9999312   2     240  46   138    79        1      20 26.39        0
## 11627 9999312   2      NA  50   147    96        1      10 24.19        0
##       BPMEDS HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME
## 11622      0       67      72    3       0      0      0        0       1    0
## 11623      0       70      NA    3       0      0      0        0       1 2333
## 11624      0       65      72    3       0      0      0        0       1 4538
## 11625      0       85      80    3       0      0      0        0       0    0
## 11626      0       90      83    3       0      0      0        0       0 2390
## 11627      0       94      NA    3       0      0      0        0       1 4201
##       PERIOD HDLC LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN
## 11622      1   NA   NA     0      0      0       0      0      0   0        1
## 11623      2   NA   NA     0      0      0       0      0      0   0        1
## 11624      3   30  123     0      0      0       0      0      0   0        1
## 11625      1   NA   NA     0      0      0       0      0      0   0        1
## 11626      2   NA   NA     0      0      0       0      0      0   0        1
## 11627      3   NA   NA     0      0      0       0      0      0   0        1
##       TIMEAP TIMEMI TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 11622   8766   8766     8766    8766     8766    8766    8766       0
## 11623   8766   8766     8766    8766     8766    8766    8766       0
## 11624   8766   8766     8766    8766     8766    8766    8766       0
## 11625   8766   8766     8766    8766     8766    8766    8766    4201
## 11626   8766   8766     8766    8766     8766    8766    8766    4201
## 11627   8766   8766     8766    8766     8766    8766    8766    4201
test_data[] <- lapply(test_data, function(x) if (is.integer(x)) as.numeric(x) else x)
train_data[] <- lapply(train_data, function(x) if (is.integer(x)) as.numeric(x) else x)
test_data[] <- lapply(test_data, function(x) if (is.character(x)) as.factor(x) else x)
train_data[] <- lapply(train_data, function(x) if (is.character(x)) as.factor(x) else x)

train_data$CVD <- factor(train_data$CVD, levels = c(0, 1))
test_data$CVD <- factor(test_data$CVD, levels = c(0, 1))
train_data$BPMEDS <- factor(train_data$BPMEDS, levels = c(0, 1))
test_data$BPMEDS <- factor(test_data$BPMEDS, levels = c(0, 1))

# Move column 'B' to the end

df_frmgham2 <- df_frmgham2[, c(setdiff(names(df_frmgham2), "CVD"), "CVD")]
test_data <- test_data[, c(setdiff(names(test_data), "CVD"), "CVD")]
train_data<- train_data[, c(setdiff(names(train_data), "CVD"), "CVD")]
############################################################
# Ensure proper formatting of the target variable



train_data$CVD <- factor(train_data$CVD, levels = c(0, 1), labels = c("Class0", "Class1"))
 test_data$CVD <- factor(test_data$CVD, levels = c(0, 1), labels = c("Class0", "Class1"))

 # Identify and remove constant features
 constant_features_train <- colnames(train_data)[apply(train_data[, -ncol(train_data)], 2, var) == 0]
## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion
 train_data <- train_data[, !colnames(train_data) %in% constant_features_train]
 test_data <- test_data[, !colnames(test_data) %in% constant_features_train]
 
 # which(is.na(test_data))
 # which(is.na(train_data))
 which(is.na(constant_features_train))
## [1] 2 3 4
# Normalize numeric features while preserving CVD
normalize <- function(data) {
  numeric_features <- data[, sapply(data, is.numeric)]
  mins <- apply(numeric_features, 2, min)
  maxs <- apply(numeric_features, 2, max)
  normalized_data <- sweep(numeric_features, 2, mins, "-")
  normalized_data <- sweep(normalized_data, 2, (maxs - mins), "/")
  data[, sapply(data, is.numeric)] <- normalized_data
  return(data)

}


#Transform skewness
train_data$CIGPDAY<- log1p(train_data$CIGPDAY)
test_data$CIGPDAY<- log1p(test_data$CIGPDAY)

#Scale
train_data$AGE <- scale(train_data$AGE) 
train_data$CIGPDAY <- scale(train_data$CIGPDAY)  # Z-score scaling
test_data$AGE <- scale(test_data$AGE) 
test_data$CIGPDAY <- scale(test_data$CIGPDAY)  # Z-score scaling

#Excude Outliers
# Identify numeric columns
numeric_cols <- sapply(train_data, is.numeric)

# Compute z-scores for numeric columns
z_scores <- scale(train_data[, numeric_cols])

# Find rows where all z-scores are less than 3 in absolute value
rows_to_keep <- apply(abs(z_scores) < 3, 1, all)

# Filter the data to keep only those rows
filtered_train_data <- train_data[rows_to_keep, ]

# Identify numeric columns
numeric_cols <- sapply(test_data, is.numeric)

# Compute z-scores for numeric columns
z_scores <- scale(test_data[, numeric_cols])

# Find rows where all z-scores are less than 3 in absolute value
rows_to_keep <- apply(abs(z_scores) < 3, 1, all)

# Filter the data to keep only those rows
filtered_test_data <-test_data[rows_to_keep, ]

# ONLY the test is affected
test_data <- filtered_test_data 
train_data <- filtered_train_data


#Encode Categorical Values
# Function to convert ordered factors to numeric
convert_factors <- function(df) {
  for (i in seq_len(ncol(df) - 1)) { # Loop through all columns except the last one
    if (is.factor(df[[i]])) {
      df[[i]] <- as.numeric(df[[i]]) # Convert ordered factor to numeric
    }
  }
  return(df)
}


# Apply the function to the dataset
test_data <- convert_factors(test_data)
str(train_data)
## 'data.frame':    1493 obs. of  14 variables:
##  $ TOTCHOL : num  266 252 208 226 245 260 208 214 283 165 ...
##  $ AGE     : num [1:1493, 1] 0.584 1.435 -0.874 0.949 0.827 ...
##  $ SYSBP   : num  116 148 125 110 136 139 127 133 155 123 ...
##  $ DIABP   : num  70 84 73 65 83 90 66 88 75 70 ...
##  $ CIGPDAY : num [1:1493, 1] -0.709 -0.709 -0.709 -0.709 -0.709 ...
##  $ BMI     : num  18.6 28.1 20.2 25.9 25 ...
##  $ BPMEDS  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ HEARTRTE: num  86 76 90 80 63 80 70 75 90 85 ...
##  $ GLUCOSE : num  122 98 79 93 76 90 86 81 73 80 ...
##  $ educ    : num  2 1 1 2 1 2 1 2 4 3 ...
##  $ Gender  : Factor w/ 2 levels "Female","Male": 1 2 1 1 2 1 2 2 2 1 ...
##  $ Smoker  : Factor w/ 2 levels "Not current smoker",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ Diabetic: Factor w/ 2 levels "Diabetic","Non Diabetic": 2 2 2 2 2 2 1 2 1 2 ...
##  $ CVD     : Factor w/ 2 levels "Class0","Class1": 1 1 1 1 2 1 2 1 2 1 ...
# Apply the function to the dataset
train_data <- convert_factors(train_data)

str(test_data)
## 'data.frame':    496 obs. of  14 variables:
##  $ TOTCHOL : num  280 162 206 230 251 216 231 255 216 223 ...
##  $ AGE     : num [1:496, 1] 0.5813 -0.7623 -0.518 -1.2509 -0.0294 ...
##  $ SYSBP   : num  168 152 129 142 132 ...
##  $ DIABP   : num  100 101 85 90.5 77 70 70 98 84 100 ...
##  $ CIGPDAY : num [1:496, 1] -0.704 -0.704 2.058 1.813 1.435 ...
##  $ BMI     : num  25.7 26.4 26.4 24.3 19.3 ...
##  $ BPMEDS  : num  1 1 1 1 1 1 2 1 1 2 ...
##  $ HEARTRTE: num  92 105 84 70 82 72 70 84 77 77 ...
##  $ GLUCOSE : num  82 78 69 61 76 49 86 90 70 86 ...
##  $ educ    : num  1 2 4 3 2 2 2 2 1 1 ...
##  $ Gender  : num  2 2 2 2 1 2 1 2 1 1 ...
##  $ Smoker  : num  1 1 2 2 2 2 1 2 2 1 ...
##  $ Diabetic: num  2 2 2 2 2 2 2 2 2 2 ...
##  $ CVD     : Factor w/ 2 levels "Class0","Class1": 1 1 1 2 1 2 1 1 1 1 ...
#Normalize
train_data_normalized <- normalize(train_data[, colnames(train_data) != "CVD"])
train_data_normalized$CVD <- train_data$CVD
test_data_normalized <- normalize(test_data[, colnames(test_data) != "CVD"])
test_data_normalized$CVD <- test_data$CVD



# Recursive Feature Elimination (RFE)
rfe_control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
rfe_results <- rfe(
  train_data_normalized[, colnames(train_data_normalized) != "CVD"], 
  train_data_normalized$CVD, 
  sizes = c(1:13), 
  rfeControl = rfe_control
)

cat("\n With RFE")
## 
##  With RFE
predictors(rfe_results)
## [1] "Gender"   "AGE"      "Diabetic"
###################################################
### Feature Importance with Random forest
rf_model <- randomForest(
  x = train_data_normalized[, colnames(train_data_normalized) != "CVD"], 
  y = train_data_normalized$CVD,
  importance = TRUE
)

#

# Get top 10 features by importance
importance_scores <- importance(rf_model)
top_featuresRF <- rownames(importance_scores)[order(importance_scores[, "MeanDecreaseGini"], decreasing = TRUE)[1:10]]
cat("\n With RF")
## 
##  With RF
print(top_featuresRF)
##  [1] "AGE"      "SYSBP"    "BMI"      "TOTCHOL"  "GLUCOSE"  "DIABP"   
##  [7] "HEARTRTE" "Gender"   "educ"     "CIGPDAY"
###################################################
#Alternative rfe() RFA for parsimonious
# 
# # Ensure 'CVD' is the response variable and is a factor
# train_data_normalized$CVD <- as.factor(train_data_normalized$CVD)
# 
# # Define the train control for cross-validation
# control <- trainControl(method = "cv", number = 10)
# 
# # Recursive Feature Elimination (RFE) control settings
# rfe_control <- rfeControl(functions = caretFuncs, method = "cv", number = 10)
# 
# # Recursive Feature Addition using caret
# subset_sizes <- c(1:10)  # Define the sizes of subsets of predictors
# 
# # Perform RFE
# rfa_results <- rfe(
#   train_data_normalized[, colnames(train_data_normalized) != "CVD"],  # Exclude response variable
#   train_data_normalized$CVD,  # Response variable
#   sizes = subset_sizes,  # Subset sizes to evaluate
#   rfeControl = rfe_control
# )
# 
# # Get the top predictors
# top_predictorsRFA <- predictors(rfa_results)
# cat("\n With RFA")
# print(top_predictorsRFA)

###################################################
# Correlation based

# Exclude the last column from the correlation matrix
train_dataCorrel <- train_data_normalized
train_dataCorrel$CVD <- as.numeric(train_dataCorrel$CVD )
correlation_matrix <- cor(train_dataCorrel)

target_correlations <- abs(correlation_matrix["CVD", -ncol(train_dataCorrel)])
top_featuresCorrel <- names(sort(target_correlations, decreasing = TRUE)[1:10])

cat("\n With Correlation")
## 
##  With Correlation
print(top_featuresCorrel)
##  [1] "AGE"      "Gender"   "SYSBP"    "Diabetic" "BPMEDS"   "DIABP"   
##  [7] "BMI"      "educ"     "GLUCOSE"  "HEARTRTE"
###################################################
#Lasso Regression


# Convert data to matrix form
x <- as.matrix(train_data_normalized[, colnames(train_data_normalized) != "CVD"])
y <- train_data_normalized$CVD

lasso_model <- cv.glmnet(x, y, alpha = 1, family = "binomial")
coef_lasso <- coef(lasso_model, s = "lambda.min")
top_featuresLasso <- rownames(coef_lasso)[coef_lasso[, 1] != 0]

cat("\n With Lasso")
## 
##  With Lasso
print(top_featuresLasso)
##  [1] "(Intercept)" "TOTCHOL"     "AGE"         "SYSBP"       "CIGPDAY"    
##  [6] "BMI"         "BPMEDS"      "GLUCOSE"     "Gender"      "Smoker"     
## [11] "Diabetic"
# Selected features
selected_features <- predictors(rfe_results)
cat("\nSelected Features by RFE:\n")
## 
## Selected Features by RFE:
print(selected_features)
## [1] "Gender"   "AGE"      "Diabetic"
# Subset the training and test datasets
train_data_rfe <- train_data_normalized[, c(selected_features, "CVD")]
test_data_rfe <- test_data_normalized[, colnames(test_data_normalized) %in% c(selected_features, "CVD")]

# Train logistic regression using selected features
cv_control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
logistic_model_rfe <- train(
  CVD ~ ., 
  data = train_data_rfe, 
  method = "glm", 
  family = binomial(link = "logit"), 
  trControl = cv_control, 
  metric = "ROC"
)

# Extract and rank coefficients
coefficients <- coef(logistic_model_rfe$finalModel)
coefficients <- as.data.frame(coefficients)
coefficients <- coefficients[order(-abs(coefficients[, 1])), , drop = FALSE]
colnames(coefficients) <- "Coefficient"
coefficients$Feature <- rownames(coefficients)
rownames(coefficients) <- NULL
coefficients$LogOdds <- coefficients$Coefficient  # Log-odds interpretation
coefficients$Rank <- rank(-abs(coefficients$Coefficient))  # Rank by absolute value

# coefficients
# # Recode Feature Names
# coefficients$Feature <- recode(coefficients$Feature,
#                                "ANGINA" = "Angina Pectoris",
#                                "MI_FCHD" = "Myocardial Infarction Fatal Coronary Heart Disease",
#                                "HOSPMI" = "Hospitalized Myocardial Infarction",
#                                "DIABP" = "Diastolic Blood Pressure",
#                                "SYSBP" = "Systolic Blood Pressure",
#                                "BMI" = "Body Mass Index",
#                                "TOTCHOL" = "Total Cholesterol",
#                                "HEARTRTE" = "Heart Rate",
#                                "ANYCHD" = "Any Fatal Coronary Heart Disease",
#                                "PREVMI" = "Prevalent Myocardial Infarction",
#                                "PREVAP" = "Prevalent Angina",
#                                "PERIOD" = "Examination Cycle",
#                                "HYPERTEN" = "Hypertensive",
#                                "PREVSTRK - Stroke History_No.Stroke.History" = "No Stroke History",
#                                "Diabetic_Non.Diabetic" = "Non-diabetic",
#                                "BPMEDS" = "Anti-hypertensive Meds",
#                                "Gender_Male" = "Gender (Male)",
#                                "STROKE" = "Stroke",
#                                "DEATH" = "Death",
#                                "AGE" = "Age",
#                                "GLUCOSE" = "Glucose"
# )

# Exclude (Intercept) and display full coefficient table
coefficients <- coefficients[coefficients$Feature != "(Intercept)", ]
coefficients <- coefficients[, c("Feature", "Coefficient", "LogOdds", "Rank")]
# Display coefficients table in a clean format
cat("\n Coefficients and Log-Odds Table:\n")
## 
##  Coefficients and Log-Odds Table:
kable(coefficients, format = "pipe", align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
  column_spec(2:3, width = "5em")
Feature Coefficient LogOdds Rank
1 AGE 2.990770 2.990770 1
3 Diabetic -1.259228 -1.259228 3
4 Gender 1.247582 1.247582 4
# Feature Importance Plot
top_features <- coefficients %>% top_n(10, wt = abs(Coefficient))
top_features$Feature <- str_wrap(top_features$Feature, width = 25)

feature_importance_plot <- ggplot(top_features, aes(x = reorder(Feature, abs(Coefficient)), y = abs(Coefficient))) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(
    title = "Top 10 Features - Logistic Regression",
    x = "Feature",
    y = "Absolute Coefficient"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 12, family = "serif"),
    plot.title = element_text(hjust = 0.5)
  )

# Print and save the feature importance plot
print(feature_importance_plot)

#ggsave("feature_importance_top10.png", plot = feature_importance_plot, width = 10, height = 8, dpi = 300)
# Confusion Matrix Visualization
generate_confusion_matrix <- function(truth, estimate, model_name, data_split) {
  truth <- factor(truth, levels = c("Class0", "Class1"))
  estimate <- factor(estimate, levels = c("Class0", "Class1"))
  
  conf_matrix <- conf_mat(data.frame(truth = truth, estimate = estimate), truth, estimate)
  cm_table <- as.data.frame(as.table(conf_matrix$table))
  colnames(cm_table) <- c("Truth", "Prediction", "Freq")
  
  cm_table$Truth <- factor(cm_table$Truth, levels = c("Class0", "Class1"), labels = c("0", "1"))
  cm_table$Prediction <- factor(cm_table$Prediction, levels = c("Class0", "Class1"), labels = c("0", "1"))
  cm_table$Truth <- factor(cm_table$Truth, levels = rev(levels(cm_table$Truth)))
  
  ggplot(cm_table, aes(x = Prediction, y = Truth, fill = Freq)) +
    geom_tile() +
    geom_text(aes(label = Freq), color = "white", size = 6) +
    scale_fill_gradient(low = "lavender", high = "purple", name = "Count") +
    labs(
      title = paste("Confusion Matrix -", model_name, "(", data_split, ")"),
      x = "Prediction",
      y = "Truth"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, size = 14),
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 14),
      legend.position = "none",
      panel.grid.major = element_blank()
    ) +
    coord_fixed()
}

# Function to compute performance metrics
compute_metrics <- function(truth, estimate, probabilities) {
  cm <- confusionMatrix(estimate, truth, positive = "Class1")
  
  # Extract key metrics
  accuracy <- cm$overall["Accuracy"]
  precision <- cm$byClass["Precision"]
  recall <- cm$byClass["Recall"]
  specificity <- cm$byClass["Specificity"]
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  # Compute AUC
  auc <- roc(as.numeric(truth) - 1, probabilities)$auc
  
  # Return metrics as a data frame
  metrics <- data.frame(
    Accuracy = accuracy,
    Precision = precision,
    Recall = recall,
    Specificity = specificity,
    F1_Score = f1_score,
    AUC = auc
  )
  return(metrics)
}
# Compute predictions and probabilities
train_probabilities <- predict(logistic_model_rfe, newdata = train_data_rfe, type = "prob")[, "Class1"]
train_predictions <- ifelse(train_probabilities > 0.5, "Class1", "Class0")
test_probabilities <- predict(logistic_model_rfe, newdata = test_data_rfe, type = "prob")[, "Class1"]
test_predictions <- ifelse(test_probabilities > 0.5, "Class1", "Class0")

# Compute metrics for training data
train_metrics <- compute_metrics(
  truth = train_data_rfe$CVD,
  estimate = factor(train_predictions, levels = c("Class0", "Class1")),
  probabilities = train_probabilities
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Compute metrics for test data
test_metrics <- compute_metrics(
  truth = test_data_rfe$CVD,
  estimate = factor(test_predictions, levels = c("Class0", "Class1")),
  probabilities = test_probabilities
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Display performance metrics
cat("\n Performance Metrics (Training Data):\n")
## 
##  Performance Metrics (Training Data):
print(train_metrics)
##          Accuracy Precision    Recall Specificity  F1_Score       AUC
## Accuracy 0.790355 0.5689655 0.2006079   0.9570447 0.2966292 0.7495992
cat("\n Performance Metrics (Test Data):\n")
## 
##  Performance Metrics (Test Data):
print(test_metrics)
##           Accuracy Precision    Recall Specificity F1_Score       AUC
## Accuracy 0.7943548 0.4705882 0.2424242   0.9319899     0.32 0.7412411
# Confusion Matrices
train_conf_matrix_plot <- generate_confusion_matrix(
  truth = train_data_rfe$CVD,
  estimate = factor(train_predictions, levels = c("Class0", "Class1")),
  model_name = "Logistic Regression",
  data_split = "Training Data"
)

test_conf_matrix_plot <- generate_confusion_matrix(
  truth = test_data_rfe$CVD,
  estimate = factor(test_predictions, levels = c("Class0", "Class1")),
  model_name = "Logistic Regression",
  data_split = "Test Data"
)

# Display confusion matrix plots
print(train_conf_matrix_plot)

print(test_conf_matrix_plot)

# ROC Curve Plotting
plot_roc_curve <- function(roc_object, data_split, model_name) {
  roc_data <- data.frame(
    FalsePositiveRate = 1 - roc_object$specificities,
    TruePositiveRate = roc_object$sensitivities
  )
  ggplot(roc_data, aes(x = FalsePositiveRate, y = TruePositiveRate)) +
    geom_line(color = "purple", linewidth = 1.2) +
    labs(
      title = paste("ROC Curve -", model_name, "(", data_split, ")"),
      x = "False Positive Rate",
      y = "True Positive Rate"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5),
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 14)
    )
}

# Generate ROC Data
roc_train <- roc(as.numeric(train_data_rfe$CVD) - 1, train_probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_test <- roc(as.numeric(test_data_rfe$CVD) - 1, test_probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Display ROC Curves
print(plot_roc_curve(roc_train, "Training Set", "Logistic Regression"))

print(plot_roc_curve(roc_test, "Test Set", "Logistic Regression"))