#Algoritma Naive Bayes
library(caret)
library(e1071)
# Read the maternal health risk data set. read.csv2() is used, i.e. the
# semicolon-separated / comma-decimal CSV variant.
data <- read.csv2(
  file = "C:/Users/acer/Downloads/Maternal Health Risk Data Set(Machine Learning).csv"
)
head(data)
## Age SystolicBP DiastolicBP BS BodyTemp HeartRate RiskLevel
## 1 25 130 80 15.0 98 86 high risk
## 2 35 140 90 13.0 98 70 high risk
## 3 29 90 70 8.0 100 80 high risk
## 4 30 140 85 7.0 98 70 high risk
## 5 35 120 60 6.1 98 76 low risk
## 6 23 140 80 7.0 98 70 high risk
# Inspect the column types before and after turning RiskLevel into a factor.
# NOTE(review): the recorded output shows BodyTemp is read as character, not
# numeric -- presumably some raw values do not parse as numbers; confirm
# whether it should be cleaned and converted before modelling.
str(data)
## 'data.frame': 1014 obs. of 7 variables:
## $ Age : int 25 35 29 30 35 23 23 35 32 42 ...
## $ SystolicBP : int 130 140 90 140 120 140 130 85 120 130 ...
## $ DiastolicBP: int 80 90 70 85 60 80 70 60 90 80 ...
## $ BS : num 15 13 8 7 6.1 7 7 11 6.9 18 ...
## $ BodyTemp : chr "98" "98" "100" "98" ...
## $ HeartRate : int 86 70 80 70 76 70 78 86 70 70 ...
## $ RiskLevel : chr "high risk" "high risk" "high risk" "high risk" ...
# The response must be a factor so that it is treated as a class label.
data[["RiskLevel"]] <- factor(data[["RiskLevel"]])
str(data)
## 'data.frame': 1014 obs. of 7 variables:
## $ Age : int 25 35 29 30 35 23 23 35 32 42 ...
## $ SystolicBP : int 130 140 90 140 120 140 130 85 120 130 ...
## $ DiastolicBP: int 80 90 70 85 60 80 70 60 90 80 ...
## $ BS : num 15 13 8 7 6.1 7 7 11 6.9 18 ...
## $ BodyTemp : chr "98" "98" "100" "98" ...
## $ HeartRate : int 86 70 80 70 76 70 78 86 70 70 ...
## $ RiskLevel : Factor w/ 3 levels "high risk","low risk",..: 1 1 1 1 2 1 3 1 3 1 ...
# Per-column summary statistics and class counts.
summary(data)
## Age SystolicBP DiastolicBP BS
## Min. :10.00 Min. : 70.0 Min. : 49.00 Min. : 6.000
## 1st Qu.:19.00 1st Qu.:100.0 1st Qu.: 65.00 1st Qu.: 6.900
## Median :26.00 Median :120.0 Median : 80.00 Median : 7.500
## Mean :29.87 Mean :113.2 Mean : 76.46 Mean : 8.726
## 3rd Qu.:39.00 3rd Qu.:120.0 3rd Qu.: 90.00 3rd Qu.: 8.000
## Max. :70.00 Max. :160.0 Max. :100.00 Max. :19.000
## BodyTemp HeartRate RiskLevel
## Length:1014 Min. : 7.0 high risk:272
## Class :character 1st Qu.:70.0 low risk :406
## Mode :character Median :76.0 mid risk :336
## Mean :74.3
## 3rd Qu.:80.0
## Max. :90.0
# Split the data into a training set (75% of the rows) and a testing set
# (the remaining rows). The seed is fixed so the split is reproducible.
# This stanza previously appeared twice, verbatim; one copy is enough because
# re-seeding and re-sampling yields the same rows.
set.seed(1403)
# seq_len() avoids the 1:0 pitfall of 1:nrow(data) on an empty data frame, and
# floor() makes explicit the truncation that sample() applies to a fractional
# size. The variable keeps its original name `sample`; it shadows the base
# function's name, but calls to sample() still resolve to the function.
sample <- sample(
  seq_len(nrow(data)),
  floor(0.75 * nrow(data)),
  replace = FALSE
)
training <- data[sample, ]
testing <- data[-sample, ]
# Fit a Naive Bayes classifier: RiskLevel is the response and every other
# column of the training set is a predictor.
model <- naiveBayes(RiskLevel ~ ., data = training)
# Predicted class labels for the held-out testing rows (prediksi) and for the
# training rows themselves (prediksi2).
prediksi <- predict(model, newdata = testing, type = "class")
prediksi2 <- predict(model, newdata = training, type = "class")
# Assess model accuracy: confusion matrix and statistics on the testing data.
confusionMatrix(data = prediksi, reference = testing$RiskLevel)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high risk low risk mid risk
## high risk 43 4 8
## low risk 16 84 70
## mid risk 7 1 21
##
## Overall Statistics
##
## Accuracy : 0.5827
## 95% CI : (0.5194, 0.644)
## No Information Rate : 0.3898
## P-Value [Acc > NIR] : 4.090e-10
##
## Kappa : 0.3722
##
## Mcnemar's Test P-Value : 5.061e-16
##
## Statistics by Class:
##
## Class: high risk Class: low risk Class: mid risk
## Sensitivity 0.6515 0.9438 0.21212
## Specificity 0.9362 0.4788 0.94839
## Pos Pred Value 0.7818 0.4941 0.72414
## Neg Pred Value 0.8844 0.9405 0.65333
## Prevalence 0.2598 0.3504 0.38976
## Detection Rate 0.1693 0.3307 0.08268
## Detection Prevalence 0.2165 0.6693 0.11417
## Balanced Accuracy 0.7938 0.7113 0.58025
# Same report on the training data, for comparison with the testing results.
confusionMatrix(data = prediksi2, reference = training$RiskLevel)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high risk low risk mid risk
## high risk 129 4 24
## low risk 40 306 166
## mid risk 37 7 47
##
## Overall Statistics
##
## Accuracy : 0.6342
## 95% CI : (0.5988, 0.6685)
## No Information Rate : 0.4171
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4154
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: high risk Class: low risk Class: mid risk
## Sensitivity 0.6262 0.9653 0.19831
## Specificity 0.9495 0.5350 0.91587
## Pos Pred Value 0.8217 0.5977 0.51648
## Neg Pred Value 0.8723 0.9556 0.71599
## Prevalence 0.2711 0.4171 0.31184
## Detection Rate 0.1697 0.4026 0.06184
## Detection Prevalence 0.2066 0.6737 0.11974
## Balanced Accuracy 0.7878 0.7501 0.55709
# Confusion matrix object for the testing data.
cm <- confusionMatrix(prediksi, testing$RiskLevel)
# Cross-tabulate predictions (rows) against the actual classes (columns) of
# the testing data.
conf_table <- table(prediksi, testing$RiskLevel)
print(conf_table)
##
## prediksi high risk low risk mid risk
## high risk 43 4 8
## low risk 16 84 70
## mid risk 7 1 21
# Per-class one-vs-rest counts. Because rows are predictions and columns are
# the reference labels:
#   TP = diagonal cell
#   FP = predicted as the class but actually another class (row total - TP)
#   FN = actually the class but predicted as another class (column total - TP)
#   TN = everything outside the class's row and column
# FP and FN were previously computed the other way round, which swapped
# sensitivity/recall with precision and turned specificity into the negative
# predictive value.
TP <- diag(conf_table)
TN <- sum(conf_table) - rowSums(conf_table) - colSums(conf_table) +
  diag(conf_table)
FP <- rowSums(conf_table) - diag(conf_table)
FN <- colSums(conf_table) - diag(conf_table)
# Print TP, TN, FP, FN for every class.
for (i in seq_len(nrow(conf_table))) {
  cat("Class:", rownames(conf_table)[i], "\n")
  cat("True Positives (TP):", TP[i], "\n")
  cat("True Negatives (TN):", TN[i], "\n")
  cat("False Positives (FP):", FP[i], "\n")
  cat("False Negatives (FN):", FN[i], "\n")
}
## Class: high risk
## True Positives (TP): 43
## True Negatives (TN): 176
## False Positives (FP): 12
## False Negatives (FN): 23
## Class: low risk
## True Positives (TP): 84
## True Negatives (TN): 79
## False Positives (FP): 86
## False Negatives (FN): 5
## Class: mid risk
## True Positives (TP): 21
## True Negatives (TN): 147
## False Positives (FP): 8
## False Negatives (FN): 78
# Sensitivity (true positive rate).
sensitivity <- TP / (TP + FN)
sensitivity
## high risk low risk mid risk
## 0.6515152 0.9438202 0.2121212
# Specificity (true negative rate).
specificity <- TN / (TN + FP)
specificity
## high risk low risk mid risk
## 0.9361702 0.4787879 0.9483871
# Recall is the same quantity as sensitivity (true positive rate).
recall <- TP / (TP + FN)
recall
## high risk low risk mid risk
## 0.6515152 0.9438202 0.2121212
# Precision (positive predictive value).
precision <- TP / (TP + FP)
precision
## high risk low risk mid risk
## 0.7818182 0.4941176 0.7241379
# F1-score: harmonic mean of precision and recall.
f1_score <- 2 * (precision * recall) / (precision + recall)
f1_score
## high risk low risk mid risk
## 0.7107438 0.6486486 0.3281250
# Cross-tabulate predictions (rows) against the actual classes (columns) of
# the training data.
conf_table2 <- table(prediksi2, training$RiskLevel)
print(conf_table2)
##
## prediksi2 high risk low risk mid risk
## high risk 129 4 24
## low risk 40 306 166
## mid risk 37 7 47
# Per-class one-vs-rest counts. Because rows are predictions and columns are
# the reference labels:
#   TP = diagonal cell
#   FP = predicted as the class but actually another class (row total - TP)
#   FN = actually the class but predicted as another class (column total - TP)
#   TN = everything outside the class's row and column
# FP and FN were previously computed the other way round, which swapped
# sensitivity/recall with precision and turned specificity into the negative
# predictive value.
TP2 <- diag(conf_table2)
TN2 <- sum(conf_table2) - rowSums(conf_table2) - colSums(conf_table2) +
  diag(conf_table2)
FP2 <- rowSums(conf_table2) - diag(conf_table2)
FN2 <- colSums(conf_table2) - diag(conf_table2)
# Print TP, TN, FP, FN for every class.
for (i in seq_len(nrow(conf_table2))) {
  cat("Class:", rownames(conf_table2)[i], "\n")
  cat("True Positives (TP2):", TP2[i], "\n")
  cat("True Negatives (TN2):", TN2[i], "\n")
  cat("False Positives (FP2):", FP2[i], "\n")
  cat("False Negatives (FN2):", FN2[i], "\n")
}
## Class: high risk
## True Positives (TP2): 129
## True Negatives (TN2): 526
## False Positives (FP2): 28
## False Negatives (FN2): 77
## Class: low risk
## True Positives (TP2): 306
## True Negatives (TN2): 237
## False Positives (FP2): 206
## False Negatives (FN2): 11
## Class: mid risk
## True Positives (TP2): 47
## True Negatives (TN2): 479
## False Positives (FP2): 44
## False Negatives (FN2): 190
# Sensitivity (true positive rate).
sensitivity2 <- TP2 / (TP2 + FN2)
sensitivity2
## high risk low risk mid risk
## 0.6262136 0.9652997 0.1983122
# Specificity (true negative rate).
specificity2 <- TN2 / (TN2 + FP2)
specificity2
## high risk low risk mid risk
## 0.9494585 0.5349887 0.9158700
# Recall is the same quantity as sensitivity (true positive rate).
recall2 <- TP2 / (TP2 + FN2)
recall2
## high risk low risk mid risk
## 0.6262136 0.9652997 0.1983122
# Precision (positive predictive value).
precision2 <- TP2 / (TP2 + FP2)
precision2
## high risk low risk mid risk
## 0.8216561 0.5976562 0.5164835
# F1-score: harmonic mean of precision and recall.
f1_score2 <- 2 * (precision2 * recall2) / (precision2 + recall2)
f1_score2
## high risk low risk mid risk
## 0.7107438 0.7382388 0.2865854