Data Preprocessing

Loading the data and splitting it into training and test sets.

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Med <- read.csv("C:\\Users\\John Ledesma\\Downloads\\archive (8)\\cardio_data_processed.csv")

# Drop the id column and the redundant encodings: the raw age column
# duplicates age_years, and bp_category_encoded duplicates bp_category.
Med <- Med %>% select(-age, -id, -bp_category_encoded)

# 70/30 train/test split; the seed (arbitrary value) makes it reproducible.
set.seed(123)
train_index <- sample(1:nrow(Med), 0.7 * nrow(Med))
train <- Med[train_index, ]
test <- Med[-train_index, ]
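
Since the split is random, it is worth checking that the roughly 50/50 cardio class balance (mean 0.4937 in the summary below) survives in both partitions; a minimal sketch:

prop.table(table(train$cardio))
prop.table(table(test$cardio))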

Exploratory data analysis to find the strongest correlations.

head(Med)
##   gender height weight ap_hi ap_lo cholesterol gluc smoke alco active cardio
## 1      2    168     62   110    80           1    1     0    0      1      0
## 2      1    156     85   140    90           3    1     0    0      1      1
## 3      1    165     64   130    70           3    1     0    0      0      1
## 4      2    169     82   150   100           1    1     0    0      1      1
## 5      1    156     56   100    60           1    1     0    0      0      0
## 6      1    151     67   120    80           2    2     0    0      0      0
##   age_years      bmi          bp_category
## 1        50 21.96712 Hypertension Stage 1
## 2        55 34.92768 Hypertension Stage 2
## 3        51 23.50781 Hypertension Stage 1
## 4        48 28.71048 Hypertension Stage 2
## 5        47 23.01118               Normal
## 6        60 29.38468 Hypertension Stage 1
summary(Med)
##      gender          height          weight          ap_hi      
##  Min.   :1.000   Min.   : 55.0   Min.   : 11.0   Min.   : 90.0  
##  1st Qu.:1.000   1st Qu.:159.0   1st Qu.: 65.0   1st Qu.:120.0  
##  Median :1.000   Median :165.0   Median : 72.0   Median :120.0  
##  Mean   :1.349   Mean   :164.4   Mean   : 74.1   Mean   :126.4  
##  3rd Qu.:2.000   3rd Qu.:170.0   3rd Qu.: 82.0   3rd Qu.:140.0  
##  Max.   :2.000   Max.   :250.0   Max.   :200.0   Max.   :180.0  
##      ap_lo         cholesterol         gluc           smoke        
##  Min.   : 60.00   Min.   :1.000   Min.   :1.000   Min.   :0.00000  
##  1st Qu.: 80.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.00000  
##  Median : 80.00   Median :1.000   Median :1.000   Median :0.00000  
##  Mean   : 81.26   Mean   :1.363   Mean   :1.225   Mean   :0.08766  
##  3rd Qu.: 90.00   3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:0.00000  
##  Max.   :120.00   Max.   :3.000   Max.   :3.000   Max.   :1.00000  
##       alco             active           cardio         age_years    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :29.00  
##  1st Qu.:0.00000   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:48.00  
##  Median :0.00000   Median :1.0000   Median :0.0000   Median :53.00  
##  Mean   :0.05313   Mean   :0.8035   Mean   :0.4937   Mean   :52.82  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:58.00  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :64.00  
##       bmi          bp_category       
##  Min.   :  3.472   Length:68205      
##  1st Qu.: 23.875   Class :character  
##  Median : 26.346   Mode  :character  
##  Mean   : 27.511                     
##  3rd Qu.: 30.116                     
##  Max.   :298.667
# Correlation matrix over the numeric columns, melted to long form
cor_matrix <- cor(Med[, sapply(Med, is.numeric)])
cor_melted <- as.data.frame(as.table(cor_matrix))

# Drop the trivial self-correlations on the diagonal
cor_melted <- subset(cor_melted, Var1 != Var2)

# Sort by absolute correlation to surface the strongest pairs
cor_sorted <- cor_melted[order(-abs(cor_melted$Freq)), ]

head(cor_sorted, 10)
##            Var1        Var2      Freq
## 39          bmi      weight 0.7630786
## 159      weight         bmi 0.7630786
## 44        ap_lo       ap_hi 0.7318124
## 56        ap_hi       ap_lo 0.7318124
## 2        height      gender 0.4983229
## 14       gender      height 0.4983229
## 72         gluc cholesterol 0.4504523
## 84  cholesterol        gluc 0.4504523
## 50       cardio       ap_hi 0.4338018
## 134       ap_hi      cardio 0.4338018
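
The full matrix is easier to scan as a heatmap; a minimal sketch using ggplot2 (also used later for the metrics plot):

library(ggplot2)

# Heatmap of the full correlation matrix (diagonal included)
heat_df <- as.data.frame(as.table(cor_matrix))
ggplot(heat_df, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(-1, 1)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(fill = "r", title = "Correlation heatmap of the numeric columns")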

Logistic Regression

# Logistic regression on all remaining predictors
LogisticMed <- glm(cardio ~ ., data = train, family = binomial)

# Predicted probabilities, cut at 0.5 into class labels
LogisticMedPred <- predict(LogisticMed, newdata = test, type = 'response')
threshold <- 0.5
LogisticMedClass <- ifelse(LogisticMedPred > threshold, 1, 0)
LogisticMedtable <- table(test$cardio, LogisticMedClass)  # rows = actual, cols = predicted
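
The 0.5 threshold is a choice rather than a given; a threshold-free check of the ranking quality, assuming the pROC package is installed:

library(pROC)

roc_obj <- roc(test$cardio, LogisticMedPred)  # ROC from predicted probabilities
auc(roc_obj)   # area under the curve
plot(roc_obj)  # moving along the curve trades recall against false positives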

Naive Bayes

library(e1071)
## Warning: package 'e1071' was built under R version 4.2.2
# Naive Bayes on the same predictors
nb_Med <- naiveBayes(cardio ~ ., data = train)
nb_Pred <- predict(nb_Med, test)
nb_Predtable <- table(test$cardio, nb_Pred)  # rows = actual, cols = predicted
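
naiveBayes can also return class posteriors, which makes its threshold explicit and comparable to the logistic model's; a minimal sketch:

nb_Prob <- predict(nb_Med, test, type = "raw")  # columns: P(cardio = 0), P(cardio = 1)
nb_Class <- ifelse(nb_Prob[, 2] > 0.5, 1, 0)
table(test$cardio, nb_Class)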

KNN

library(kknn)
## Warning: package 'kknn' was built under R version 4.2.3
# With a numeric response, kknn performs regression, so the fitted values
# are averaged neighbor labels that we threshold at 0.5 (see the k sweep below).
knn_model <- kknn(cardio ~ ., train, test, k = 10000, scale = TRUE)
KNN_Pred <- fitted(knn_model)
KNN_Pred_Binary <- ifelse(KNN_Pred >= 0.5, 1, 0)
KNNtable <- table(test$cardio, KNN_Pred_Binary)
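
The choice k = 10000 is extreme; a small sweep over illustrative k values shows where test accuracy peaks and motivates the sweet-spot remark below:

for (k in c(10, 100, 1000, 10000)) {
  fit_k <- kknn(cardio ~ ., train, test, k = k, scale = TRUE)
  acc_k <- mean(ifelse(fitted(fit_k) >= 0.5, 1, 0) == test$cardio)
  cat("k =", k, "accuracy =", round(acc_k, 4), "\n")
}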

Decision Trees

library(rpart)
# CART classification tree
tree_model <- rpart(cardio ~ ., data = train, method = "class")
tree_model_predictions <- predict(tree_model, newdata = test, type = "class")
tree_model_table <- table(test$cardio, tree_model_predictions)  # rows = actual
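
The fitted tree is small enough to inspect directly; a sketch using base rpart plotting (the rpart.plot package gives nicer output if installed):

plot(tree_model, uniform = TRUE, margin = 0.1)  # tree skeleton
text(tree_model, use.n = TRUE, cex = 0.8)       # split labels and node counts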

From testing different values of k with KNN (see the sweep above), there is a sweet spot: small k chases noise in the training data, while very large k, like the k = 10000 used here, smooths every prediction toward the overall class proportions. The confusion matrices for all four models on the test set are below.

LogisticMedtable
##    LogisticMedClass
##        0    1
##   0 8388 1955
##   1 3524 6595
nb_Predtable
##    nb_Pred
##        0    1
##   0 8428 1915
##   1 4045 6074
KNNtable
##    KNN_Pred_Binary
##        0    1
##   0 8986 1357
##   1 4710 5409
tree_model_table
##    tree_model_predictions
##        0    1
##   0 8351 1992
##   1 3541 6578
# Accuracy, precision, recall, and F1 from a 2x2 confusion matrix
# laid out as table(actual, predicted): rows = actual, cols = predicted.
get_metrics <- function(mat) {
  tp <- mat[2, 2]  # actual 1, predicted 1
  tn <- mat[1, 1]  # actual 0, predicted 0
  fp <- mat[1, 2]  # actual 0, predicted 1
  fn <- mat[2, 1]  # actual 1, predicted 0

  accuracy <- (tp + tn) / sum(mat)
  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  f1 <- 2 * (precision * recall) / (precision + recall)

  return(c(accuracy, precision, recall, f1))
}
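
A quick sanity check: a perfect confusion matrix should score 1 on all four metrics.

get_metrics(matrix(c(10, 0, 0, 10), nrow = 2))  # expect c(1, 1, 1, 1)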

# Each get_metrics call returns c(accuracy, precision, recall, f1)
values_Logistic <- get_metrics(LogisticMedtable)
values_nb <- get_metrics(nb_Predtable)
values_KNN <- get_metrics(KNNtable)
values_Tree <- get_metrics(tree_model_table)

# Combine all the values into a single vector
all_values <- c(values_Logistic, values_nb, values_KNN, values_Tree)

metrics <- data.frame(
  Model = rep(c("Logistic", "Naive Bayes", "KNN", "Tree"), each = 4),
  Metric = factor(rep(c("Accuracy", "Precision", "Recall", "F1"), times = 4)),
  Value = all_values
)

# Now plot the metrics
library(ggplot2)
ggplot(metrics, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(y = "Value", title = "Cardiovascular Model Performance Metrics")

For the small dataset, the student performance evaluation data is chosen.

student <- read.csv("C:\\Users\\John Ledesma\\Downloads\\archive (9)\\DATA (1).csv")

# Drop identifier columns that carry no predictive signal
student <- student %>% select(-STUDENT.ID, -COURSE.ID)

# 70/30 train/test split, seeded for reproducibility
set.seed(123)
student_index <- sample(1:nrow(student), 0.7 * nrow(student))
student_train <- student[student_index, ]
student_test <- student[-student_index, ]

summary(student)
##        X1              X2            X3              X4              X5       
##  Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:3.000   1st Qu.:1.000  
##  Median :2.000   Median :2.0   Median :2.000   Median :3.000   Median :2.000  
##  Mean   :1.621   Mean   :1.6   Mean   :1.945   Mean   :3.572   Mean   :1.662  
##  3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.0   Max.   :3.000   Max.   :5.000   Max.   :2.000  
##        X6            X7              X8              X9             X10       
##  Min.   :1.0   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.0   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.0   Median :2.000   Median :1.000   Median :1.000   Median :2.000  
##  Mean   :1.6   Mean   :1.579   Mean   :1.628   Mean   :1.621   Mean   :1.731  
##  3rd Qu.:2.0   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :2.0   Max.   :2.000   Max.   :5.000   Max.   :4.000   Max.   :4.000  
##       X11             X12             X13             X14       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :2.000   Median :3.000   Median :3.000   Median :1.000  
##  Mean   :2.283   Mean   :2.634   Mean   :2.807   Mean   :1.172  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:1.000  
##  Max.   :6.000   Max.   :6.000   Max.   :5.000   Max.   :3.000  
##       X15             X16             X17           X18             X19       
##  Min.   :1.000   Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.0   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :3.000   Median :2.0   Median :2.000   Median :2.000  
##  Mean   :2.359   Mean   :2.807   Mean   :2.2   Mean   :1.945   Mean   :2.014  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:3.0   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.0   Max.   :3.000   Max.   :3.000  
##       X20             X21             X22             X23       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :1.000   Median :1.000   Median :1.000  
##  Mean   :1.214   Mean   :1.207   Mean   :1.241   Mean   :1.338  
##  3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :2.000   Max.   :3.000   Max.   :2.000   Max.   :3.000  
##       X24             X25             X26             X27       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :1.000   Median :3.000   Median :2.000   Median :2.000  
##  Mean   :1.166   Mean   :2.545   Mean   :2.055   Mean   :2.393  
##  3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :3.000   Max.   :3.000   Max.   :3.000  
##       X28             X29             X30            GRADE      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :2.000   Median :3.000   Median :3.000   Median :3.000  
##  Mean   :1.807   Mean   :3.124   Mean   :2.724   Mean   :3.228  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :3.000   Max.   :5.000   Max.   :4.000   Max.   :7.000

Multinomial Regression

library(nnet)
## Warning: package 'nnet' was built under R version 4.2.3
# Multinomial logistic regression treating X30 as the class label
fit <- multinom(X30 ~ ., data = student_train)
## # weights:  128 (93 variable)
## initial  value 140.015730 
## iter  10 value 68.115379
## iter  20 value 33.595753
## iter  30 value 13.324618
## iter  40 value 0.047174
## final  value 0.000059 
## converged
pred <- predict(fit, newdata = student_test, type = "class")
multi_student_table <- table(pred, student_test$X30)  # rows = predicted, cols = actual
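
The optimizer's final value near zero indicates the model separates the training data almost perfectly, so overfitting is likely; a minimal check comparing train and test accuracy:

train_pred <- predict(fit, newdata = student_train, type = "class")
mean(train_pred == student_train$X30)  # near 1, per the convergence log
mean(pred == student_test$X30)         # test accuracy is typically far lower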

Naive Bayes

fit <- naiveBayes(X30 ~ ., data = student_train)
pred <- predict(fit, newdata = student_test)
Naive_Bayes_table <- table(pred, student_test$X30)  # rows = predicted, cols = actual

KNN

# kknn regresses on the numeric X30 (kknn is already loaded above), so the
# fitted values are rounded to the nearest class label.
knn_model_student <- kknn(X30 ~ ., student_train, student_test, k = 5, scale = TRUE)
KNN_Pred_student <- fitted(knn_model_student)
KNNtable_student <- table(student_test$X30, KNN_Pred_student)
KNN_Pred_student_rounded <- round(KNN_Pred_student)
KNNtable_student_rounded <- table(student_test$X30, KNN_Pred_student_rounded)  # rows = actual

KNNtable_student_rounded
##    KNN_Pred_student_rounded
##      1  2  3  4
##   1  0  2  4  0
##   2  2  4  5  0
##   3  1  3 11  1
##   4  0  3  4  4
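
Rounding a regression fit is a workaround; kknn classifies directly when the response is a factor. A minimal sketch:

knn_class_student <- kknn(factor(X30) ~ ., student_train, student_test, k = 5, scale = TRUE)
table(student_test$X30, fitted(knn_class_student))  # fitted() now returns class labels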

Decision Tree

tree_model_student <- rpart(X30 ~ ., data = student_train, method = "class")
tree_model_predictions_student <- predict(tree_model_student, newdata = student_test, type = "class")
tree_model_table_student <- table(student_test$X30, tree_model_predictions_student)  # rows = actual

tree_model_table_student
##    tree_model_predictions_student
##     1 2 3 4
##   1 4 0 1 1
##   2 3 3 5 0
##   3 0 7 7 2
##   4 0 0 2 9

Confusion Matrices

tree_model_table_student
##    tree_model_predictions_student
##     1 2 3 4
##   1 4 0 1 1
##   2 3 3 5 0
##   3 0 7 7 2
##   4 0 0 2 9
Naive_Bayes_table
##     
## pred  1  2  3  4
##    1  5 10 12  5
##    2  0  1  0  0
##    3  1  0  4  1
##    4  0  0  0  5
multi_student_table
##     
## pred 1 2 3 4
##    1 1 3 2 0
##    2 0 4 5 1
##    3 3 1 7 7
##    4 2 3 2 3
KNNtable_student_rounded
##    KNN_Pred_student_rounded
##      1  2  3  4
##   1  0  2  4  0
##   2  2  4  5  0
##   3  1  3 11  1
##   4  0  3  4  4

Model Evaluation

library(ggplot2)

# Macro-averaged metrics for a 4x4 confusion matrix. The formulas below
# assume rows = predicted and columns = actual (the table(pred, actual)
# orientation); for a table(actual, predicted) matrix the precision and
# recall averages are interchanged, while accuracy and F1 are unaffected.
calc_metrics_4x4 <- function(mat) {
  total <- sum(mat)
  diag_sum <- sum(diag(mat))
  accuracy <- diag_sum / total
  precision_per_class <- diag(mat) / rowSums(mat)
  recall_per_class <- diag(mat) / colSums(mat)
  f1_per_class <- 2 * (precision_per_class * recall_per_class) / (precision_per_class + recall_per_class)

  avg_precision <- mean(precision_per_class, na.rm = TRUE)
  avg_recall <- mean(recall_per_class, na.rm = TRUE)
  avg_f1 <- mean(f1_per_class, na.rm = TRUE)

  return(c(Accuracy = accuracy, Average_Precision = avg_precision, Average_Recall = avg_recall, Average_F1 = avg_f1))
}
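
As before, a quick sanity check: a purely diagonal matrix should score 1 everywhere.

calc_metrics_4x4(diag(4) * 5)  # expect all four metrics to equal 1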

# Note: the tree and KNN tables are table(actual, predicted) while the
# Naive Bayes and multinomial tables are table(predicted, actual), so
# precision and recall are interchanged for the first two models.
tree_metrics <- calc_metrics_4x4(tree_model_table_student)
naive_bayes_metrics <- calc_metrics_4x4(Naive_Bayes_table)
multi_student_metrics <- calc_metrics_4x4(multi_student_table)
knn_metrics <- calc_metrics_4x4(KNNtable_student_rounded)

metrics_df <- data.frame(
  Metric = rep(names(tree_metrics), 4),
  Value = c(tree_metrics, naive_bayes_metrics, multi_student_metrics, knn_metrics),
  Model = rep(c("Decision Tree", "Naive Bayes", "Multinomial", "KNN"), each = 4)
)

ggplot(metrics_df, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Metric Value") +
  xlab("Algorithm") +
  ggtitle("Algorithm Evaluation Metrics Comparison for Student Data") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal()