library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Med <- read.csv("C:\\Users\\John Ledesma\\Downloads\\archive (8)\\cardio_data_processed.csv")
# Drop the identifier and redundant columns (age in days is kept as age_years)
Med <- Med %>% select(-age, -id, -bp_category_encoded)
# 70/30 train/test split; seeding first makes the split reproducible
set.seed(123)
train_index <- sample(1:nrow(Med), 0.7 * nrow(Med))
train <- Med[train_index, ]
test <- Med[-train_index, ]
head(Med)
## gender height weight ap_hi ap_lo cholesterol gluc smoke alco active cardio
## 1 2 168 62 110 80 1 1 0 0 1 0
## 2 1 156 85 140 90 3 1 0 0 1 1
## 3 1 165 64 130 70 3 1 0 0 0 1
## 4 2 169 82 150 100 1 1 0 0 1 1
## 5 1 156 56 100 60 1 1 0 0 0 0
## 6 1 151 67 120 80 2 2 0 0 0 0
## age_years bmi bp_category
## 1 50 21.96712 Hypertension Stage 1
## 2 55 34.92768 Hypertension Stage 2
## 3 51 23.50781 Hypertension Stage 1
## 4 48 28.71048 Hypertension Stage 2
## 5 47 23.01118 Normal
## 6 60 29.38468 Hypertension Stage 1
summary(Med)
## gender height weight ap_hi
## Min. :1.000 Min. : 55.0 Min. : 11.0 Min. : 90.0
## 1st Qu.:1.000 1st Qu.:159.0 1st Qu.: 65.0 1st Qu.:120.0
## Median :1.000 Median :165.0 Median : 72.0 Median :120.0
## Mean :1.349 Mean :164.4 Mean : 74.1 Mean :126.4
## 3rd Qu.:2.000 3rd Qu.:170.0 3rd Qu.: 82.0 3rd Qu.:140.0
## Max. :2.000 Max. :250.0 Max. :200.0 Max. :180.0
## ap_lo cholesterol gluc smoke
## Min. : 60.00 Min. :1.000 Min. :1.000 Min. :0.00000
## 1st Qu.: 80.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.00000
## Median : 80.00 Median :1.000 Median :1.000 Median :0.00000
## Mean : 81.26 Mean :1.363 Mean :1.225 Mean :0.08766
## 3rd Qu.: 90.00 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:0.00000
## Max. :120.00 Max. :3.000 Max. :3.000 Max. :1.00000
## alco active cardio age_years
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :29.00
## 1st Qu.:0.00000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:48.00
## Median :0.00000 Median :1.0000 Median :0.0000 Median :53.00
## Mean :0.05313 Mean :0.8035 Mean :0.4937 Mean :52.82
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:58.00
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :64.00
## bmi bp_category
## Min. : 3.472 Length:68205
## 1st Qu.: 23.875 Class :character
## Median : 26.346 Mode :character
## Mean : 27.511
## 3rd Qu.: 30.116
## Max. :298.667
# Pairwise correlations among the numeric columns, ranked by absolute value
cor_matrix <- cor(Med[, sapply(Med, is.numeric)])
cor_melted <- as.data.frame(as.table(cor_matrix))
cor_melted <- subset(cor_melted, Var1 != Var2)   # drop the diagonal (self-correlations)
cor_sorted <- cor_melted[order(-abs(cor_melted$Freq)), ]
head(cor_sorted, 10)
## Var1 Var2 Freq
## 39 bmi weight 0.7630786
## 159 weight bmi 0.7630786
## 44 ap_lo ap_hi 0.7318124
## 56 ap_hi ap_lo 0.7318124
## 2 height gender 0.4983229
## 14 gender height 0.4983229
## 72 gluc cholesterol 0.4504523
## 84 cholesterol gluc 0.4504523
## 50 cardio ap_hi 0.4338018
## 134 ap_hi cardio 0.4338018
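Each pair above is listed twice because the correlation matrix is symmetric. A minimal sketch that keeps one row per pair, reusing the cor_matrix computed above:

# Blank out the lower triangle and diagonal so each pair is counted once
cor_upper <- cor_matrix
cor_upper[lower.tri(cor_upper, diag = TRUE)] <- NA
cor_pairs <- subset(as.data.frame(as.table(cor_upper)), !is.na(Freq))
head(cor_pairs[order(-abs(cor_pairs$Freq)), ], 5)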
# Logistic regression on all remaining predictors
LogisticMed <- glm(cardio ~ ., data = train, family = binomial)
LogisticMedPred <- predict(LogisticMed, newdata = test, type = 'response')
# Convert predicted probabilities to class labels at a 0.5 cutoff
threshold <- 0.5
LogisticMedClass <- ifelse(LogisticMedPred > threshold, 1, 0)
LogisticMedtable <- table(test$cardio, LogisticMedClass)   # rows = actual, cols = predicted
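The 0.5 cutoff above is arbitrary, so a threshold-free summary such as AUC is a useful companion. A minimal sketch, assuming the pROC package (not used elsewhere here) is installed:

library(pROC)
roc_obj <- roc(test$cardio, LogisticMedPred)   # actual labels vs. predicted probabilities
auc(roc_obj)                                   # area under the ROC curve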
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.2
# Naive Bayes; predict() returns class labels directly
nb_Med <- naiveBayes(cardio ~ ., data = train)
nb_Pred <- predict(nb_Med, test)
nb_Predtable <- table(test$cardio, nb_Pred)   # rows = actual, cols = predicted
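If a level of a categorical predictor such as bp_category never co-occurs with a class in the training data, naive Bayes assigns that combination zero probability. Laplace smoothing is the standard guard; a sketch using e1071's laplace argument (the value 1 is arbitrary, and smoothing affects only the categorical predictors):

nb_Med_smooth <- naiveBayes(cardio ~ ., data = train, laplace = 1)
nb_Pred_smooth <- predict(nb_Med_smooth, test)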
library(kknn)
## Warning: package 'kknn' was built under R version 4.2.3
# cardio is numeric, so kknn returns weighted-average (regression-style) fitted values;
# k = 10000 averages over a very large neighborhood of the ~48,000 training rows
knn_model <- kknn(cardio ~ ., train, test, k = 10000, scale = TRUE)
KNN_Pred <- fitted(knn_model)                      # fitted values in [0, 1]
KNN_Pred_Binary <- ifelse(KNN_Pred >= 0.5, 1, 0)   # threshold at 0.5
KNNtable <- table(test$cardio, KNN_Pred_Binary)    # rows = actual, cols = predicted
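Rather than fixing k at 10000, kknn's leave-one-out tuner can search a grid of k values. A sketch (kmax = 25 is an arbitrary ceiling, and leave-one-out over ~48,000 rows can be slow):

cv_knn <- train.kknn(cardio ~ ., data = train, kmax = 25, scale = TRUE)
cv_knn$best.parameters   # kernel and k with the lowest leave-one-out error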
library(rpart)
# CART decision tree; method = "class" treats cardio as a categorical outcome
tree_model <- rpart(cardio ~ ., data = train, method = "class")
tree_model_predictions <- predict(tree_model, newdata = test, type = "class")
tree_model_table <- table(test$cardio, tree_model_predictions)   # rows = actual, cols = predicted
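rpart grows the tree with internal cross-validation, so its complexity-parameter table shows whether pruning would help before comparing test-set results. A quick check with base rpart functions:

printcp(tree_model)   # cross-validated error at each candidate pruning level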
LogisticMedtable
## LogisticMedClass
## 0 1
## 0 8388 1955
## 1 3524 6595
nb_Predtable
## nb_Pred
## 0 1
## 0 8428 1915
## 1 4045 6074
KNNtable
## KNN_Pred_Binary
## 0 1
## 0 8986 1357
## 1 4710 5409
tree_model_table
## tree_model_predictions
## 0 1
## 0 8351 1992
## 1 3541 6578
# Metrics for a 2x2 confusion matrix laid out as table(actual, predicted),
# with the positive class (cardio = 1) in the second row/column
get_metrics <- function(mat) {
  tp <- mat[2, 2]   # actual 1, predicted 1
  tn <- mat[1, 1]   # actual 0, predicted 0
  fp <- mat[1, 2]   # actual 0, predicted 1
  fn <- mat[2, 1]   # actual 1, predicted 0
  accuracy <- (tp + tn) / sum(mat)
  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  f1 <- 2 * (precision * recall) / (precision + recall)
  return(c(accuracy, precision, recall, f1))
}
# Each call to get_metrics returns four values: accuracy, precision, recall, F1
values_Logistic <- get_metrics(LogisticMedtable)
values_nb <- get_metrics(nb_Predtable)
values_KNN <- get_metrics(KNNtable)
values_Tree <- get_metrics(tree_model_table)
# Combine all the values into a single vector (model-major order)
all_values <- c(values_Logistic, values_nb, values_KNN, values_Tree)
metrics <- data.frame(
  Model = rep(c("Logistic", "Naive Bayes", "KNN", "Tree"), each = 4),
  Metric = factor(rep(c("Accuracy", "Precision", "Recall", "F1"), times = 4)),
  Value = all_values
)
# Now plot the metrics
library(ggplot2)
ggplot(metrics, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(y = "Value", title = "Cardiovascular Model Performance Metrics")
### For the small dataset, a student performance dataset is used.
student <- read.csv("C:\\Users\\John Ledesma\\Downloads\\archive (9)\\DATA (1).csv")
# Drop identifier columns; X30 is the response in the models below
student <- student %>% select(-STUDENT.ID, -COURSE.ID)
# 70/30 train/test split, seeded for reproducibility
set.seed(123)
student_index <- sample(1:nrow(student), 0.7 * nrow(student))
student_train <- student[student_index, ]
student_test <- student[-student_index, ]
summary(student)
## X1 X2 X3 X4 X5
## Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:1.000
## Median :2.000 Median :2.0 Median :2.000 Median :3.000 Median :2.000
## Mean :1.621 Mean :1.6 Mean :1.945 Mean :3.572 Mean :1.662
## 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :3.000 Max. :2.0 Max. :3.000 Max. :5.000 Max. :2.000
## X6 X7 X8 X9 X10
## Min. :1.0 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.0 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.0 Median :2.000 Median :1.000 Median :1.000 Median :2.000
## Mean :1.6 Mean :1.579 Mean :1.628 Mean :1.621 Mean :1.731
## 3rd Qu.:2.0 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.0 Max. :2.000 Max. :5.000 Max. :4.000 Max. :4.000
## X11 X12 X13 X14
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000
## Median :2.000 Median :3.000 Median :3.000 Median :1.000
## Mean :2.283 Mean :2.634 Mean :2.807 Mean :1.172
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:1.000
## Max. :6.000 Max. :6.000 Max. :5.000 Max. :3.000
## X15 X16 X17 X18 X19
## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.0 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :3.000 Median :2.0 Median :2.000 Median :2.000
## Mean :2.359 Mean :2.807 Mean :2.2 Mean :1.945 Mean :2.014
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:3.0 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000 Max. :5.0 Max. :3.000 Max. :3.000
## X20 X21 X22 X23
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :1.000 Median :1.000
## Mean :1.214 Mean :1.207 Mean :1.241 Mean :1.338
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :2.000 Max. :3.000 Max. :2.000 Max. :3.000
## X24 X25 X26 X27
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :1.000 Median :3.000 Median :2.000 Median :2.000
## Mean :1.166 Mean :2.545 Mean :2.055 Mean :2.393
## 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.000 Max. :3.000 Max. :3.000 Max. :3.000
## X28 X29 X30 GRADE
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000
## Median :2.000 Median :3.000 Median :3.000 Median :3.000
## Mean :1.807 Mean :3.124 Mean :2.724 Mean :3.228
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :3.000 Max. :5.000 Max. :4.000 Max. :7.000
library(nnet)
## Warning: package 'nnet' was built under R version 4.2.3
# Multinomial logistic regression; the distinct values of X30 are treated as classes
fit <- multinom(X30 ~ ., data = student_train)
## # weights: 128 (93 variable)
## initial value 140.015730
## iter 10 value 68.115379
## iter 20 value 33.595753
## iter 30 value 13.324618
## iter 40 value 0.047174
## final value 0.000059
## converged
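A final deviance of essentially zero means the model separates the roughly 100 training rows perfectly, which with 30 predictors and a 4-level response points to overfitting. One common guard is a small weight-decay penalty, which multinom forwards to nnet; a sketch (decay = 0.1 is an arbitrary value):

fit_decay <- multinom(X30 ~ ., data = student_train, decay = 0.1)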
pred <- predict(fit, newdata = student_test, type = "class")
multi_student_table <- table(pred, student_test$X30)   # rows = predicted, cols = actual
fit <- naiveBayes(X30 ~ ., data = student_train)
pred <- predict(fit, newdata = student_test)
Naive_Bayes_table <- table(pred, student_test$X30)     # rows = predicted, cols = actual
# kknn (already loaded above) rather than class::knn; X30 is numeric, so the
# fitted values are weighted averages that must be rounded back to class labels
knn_model_student <- kknn(X30 ~ ., student_train, student_test, k = 5, scale = TRUE)
KNN_Pred_student <- fitted(knn_model_student)
KNN_Pred_student_rounded <- round(KNN_Pred_student)
KNNtable_student_rounded <- table(student_test$X30, KNN_Pred_student_rounded)   # rows = actual
# Classification tree for the student data
tree_model_student <- rpart(X30 ~ ., data = student_train, method = "class")
tree_model_predictions_student <- predict(tree_model_student, newdata = student_test, type = "class")
tree_model_table_student <- table(student_test$X30, tree_model_predictions_student)   # rows = actual
# Confusion matrices for all four models; note that the tree and KNN tables have
# actual classes in rows, while the multinom and NB tables have predictions in rows
tree_model_table_student
## tree_model_predictions_student
## 1 2 3 4
## 1 4 0 1 1
## 2 3 3 5 0
## 3 0 7 7 2
## 4 0 0 2 9
Naive_Bayes_table
##
## pred 1 2 3 4
## 1 5 10 12 5
## 2 0 1 0 0
## 3 1 0 4 1
## 4 0 0 0 5
multi_student_table
##
## pred 1 2 3 4
## 1 1 3 2 0
## 2 0 4 5 1
## 3 3 1 7 7
## 4 2 3 2 3
KNNtable_student_rounded
## KNN_Pred_student_rounded
## 1 2 3 4
## 1 0 2 4 0
## 2 2 4 5 0
## 3 1 3 11 1
## 4 0 3 4 4
library(ggplot2)
# Macro-averaged metrics for a square confusion matrix laid out as
# table(actual, predicted): rows are true classes, columns are predictions
calc_metrics_4x4 <- function(mat) {
  accuracy <- sum(diag(mat)) / sum(mat)
  precision_per_class <- diag(mat) / colSums(mat)   # correct within each predicted class
  recall_per_class <- diag(mat) / rowSums(mat)      # correct within each actual class
  f1_per_class <- 2 * (precision_per_class * recall_per_class) /
    (precision_per_class + recall_per_class)
  return(c(Accuracy = accuracy,
           Average_Precision = mean(precision_per_class, na.rm = TRUE),
           Average_Recall = mean(recall_per_class, na.rm = TRUE),
           Average_F1 = mean(f1_per_class, na.rm = TRUE)))
}
tree_metrics <- calc_metrics_4x4(tree_model_table_student)
# The multinom and NB tables were built as table(pred, actual), so transpose
# them into the table(actual, predicted) layout the function expects
naive_bayes_metrics <- calc_metrics_4x4(t(Naive_Bayes_table))
multi_student_metrics <- calc_metrics_4x4(t(multi_student_table))
knn_metrics <- calc_metrics_4x4(KNNtable_student_rounded)
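As a cross-check on calc_metrics_4x4, the caret package (an extra dependency, not used elsewhere in this document) reports the same per-class statistics. A sketch for the decision tree:

library(caret)
pred_f <- factor(tree_model_predictions_student, levels = 1:4)
truth_f <- factor(student_test$X30, levels = 1:4)
cm <- confusionMatrix(pred_f, truth_f)
cm$byClass[, c("Precision", "Recall", "F1")]   # one row per class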
metrics_df <- data.frame(
  Metric = rep(names(tree_metrics), 4),
  Value = c(tree_metrics, naive_bayes_metrics, multi_student_metrics, knn_metrics),
  Model = rep(c("Decision Tree", "Naive Bayes", "Multinomial", "KNN"), each = 4)
)
ggplot(metrics_df, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Metric Value") +
  xlab("Algorithm") +
  ggtitle("Algorithm Evaluation Metrics Comparison for Student Data") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal()