# Load packages

library(class)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Load data

# NOTE(review): this is an absolute path to one user's desktop, so the script
# only runs unchanged on a machine that has the file at this exact location.
data <- read.csv(
  file = "C:/Users/nandh/Desktop/SEMESTER 6/DATMIN/dataset_knn_olahraga.csv"
)

# Per-column summary of the raw data.
summary(data)
##       usia            bmi          jam_tidur      kebutuhan_olahraga
##  Min.   :18.00   Min.   :14.70   Min.   : 3.000   Length:200        
##  1st Qu.:31.00   1st Qu.:21.88   1st Qu.: 6.100   Class :character  
##  Median :41.00   Median :24.60   Median : 7.150   Mode  :character  
##  Mean   :41.79   Mean   :25.03   Mean   : 7.052                     
##  3rd Qu.:52.25   3rd Qu.:28.02   3rd Qu.: 8.100                     
##  Max.   :65.00   Max.   :41.20   Max.   :10.900

# Total count of missing values over every cell of the data frame.
sum(is.na(data))
## [1] 0

# Encode the label as a factor

# The class column is read in as character (see the summary above); convert it
# to a factor before it is used as the classification target.
data[["kebutuhan_olahraga"]] <- as.factor(data[["kebutuhan_olahraga"]])

# Normalize data

# Min-max rescaling of a numeric vector.
#
# Args:
#   x: numeric vector.
#
# Returns:
#   A vector of the same length as x, with min(x) mapped to 0 and max(x)
#   mapped to 1. A constant vector is mapped to all zeros. If x contains NA,
#   min() and max() are NA and every element of the result is NA.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # A constant vector has zero range, and the plain formula would give
  # 0 / 0 = NaN. x - lo is already all zeros, so return that instead.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
# Rescale the three predictors; normalize() is applied to each column in turn.
# NOTE(review): the min and max are taken over all rows, including the rows
# that end up in the test set below.
data_norm <- as.data.frame(
  lapply(data[c("usia", "bmi", "jam_tidur")], normalize)
)

# Split data

# Draw the training rows (p = 0.8) from the class label; the seed fixes the
# random draw so the split is repeatable.
set.seed(123)
train_index <- createDataPartition(
  y = data[["kebutuhan_olahraga"]],
  p = 0.8,
  list = FALSE
)

# Predictors: rows in train_index train the model, the rest are held out.
train_data <- data_norm[train_index, ]
test_data <- data_norm[-train_index, ]

# Matching class labels for the two row sets.
train_labels <- data[["kebutuhan_olahraga"]][train_index]
test_labels <- data[["kebutuhan_olahraga"]][-train_index]

# KNN model

# Classify every held-out row from its k = 5 nearest training rows.
predicted <- class::knn(
  train = train_data,
  test = test_data,
  cl = train_labels,
  k = 5
)

# Evaluation

# Cross-tabulate the predictions against the true test labels and print the
# resulting table and statistics.
conf_matrix <- caret::confusionMatrix(data = predicted, reference = test_labels)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##                       Reference
## Prediction             Perlu Olahraga Tidak Perlu Olahraga
##   Perlu Olahraga                   11                   11
##   Tidak Perlu Olahraga              9                    8
##                                           
##                Accuracy : 0.4872          
##                  95% CI : (0.3242, 0.6522)
##     No Information Rate : 0.5128          
##     P-Value [Acc > NIR] : 0.6847          
##                                           
##                   Kappa : -0.029          
##                                           
##  Mcnemar's Test P-Value : 0.8231          
##                                           
##             Sensitivity : 0.5500          
##             Specificity : 0.4211          
##          Pos Pred Value : 0.5000          
##          Neg Pred Value : 0.4706          
##              Prevalence : 0.5128          
##          Detection Rate : 0.2821          
##    Detection Prevalence : 0.5641          
##       Balanced Accuracy : 0.4855          
##                                           
##        'Positive' Class : Perlu Olahraga  
## 

# Case-study prediction

# Min and max of each predictor, taken from the original (unscaled) data.
# vapply() enforces exactly one numeric value per column, so the result is
# always a named numeric vector.
feature_cols <- c("usia", "bmi", "jam_tidur")
min_vals <- vapply(data[feature_cols], min, numeric(1))
max_vals <- vapply(data[feature_cols], max, numeric(1))

# Raw (unscaled) predictor values for the new case.
input_Carmen <- data.frame(
  usia = 22,
  bmi = 30,
  jam_tidur = 7
)

# Normalize the input with the same min-max formula as the training data.
range_vals <- max_vals - min_vals
# A constant column has zero range; divide by 1 instead so the scaled input
# stays finite.
range_vals[which(range_vals == 0)] <- 1
# scale() matches center/scale to columns by position, so select the columns
# by name first to guarantee they line up with min_vals and range_vals.
input_Carmen_norm <- as.data.frame(
  scale(input_Carmen[feature_cols], center = min_vals, scale = range_vals)
)

# KNN prediction for the single new case, using the same training set and k.
kebutuhan_Carmen <- knn(
  train = train_data,
  test = input_Carmen_norm,
  cl = train_labels,
  k = 5
)
cat("Kebutuhan Olahraga untuk Carmen:", as.character(kebutuhan_Carmen), "\n")
## Kebutuhan Olahraga untuk Carmen: Perlu Olahraga
library(ggplot2)

# Copy the observed class into a `label` column; it drives the point colour.
data[["label"]] <- data[["kebutuhan_olahraga"]]

# Carmen's row holds the original (unscaled) values; attach her predicted
# class under the same column name.
input_Carmen[["label"]] <- as.character(kebutuhan_Carmen)

# Build the plot: observed cases as coloured dots, Carmen as a black triangle
# with a text label next to it.
ggplot(data = data, mapping = aes(x = usia, y = bmi, colour = label)) +
  geom_point(size = 3, alpha = 0.6) +
  geom_point(
    data = input_Carmen,
    mapping = aes(x = usia, y = bmi),
    shape = 17, size = 5, stroke = 2, colour = "black"
  ) +
  annotate(
    "text",
    x = input_Carmen[["usia"]] + 1, y = input_Carmen[["bmi"]],
    label = "Carmen", size = 4.5, hjust = 0, vjust = -1
  ) +
  scale_colour_manual(
    values = c(
      "Perlu Olahraga" = "#E74C3C",
      "Tidak Perlu Olahraga" = "#2ECC71"
    )
  ) +
  labs(
    title = "Visualisasi Kebutuhan Olahraga untuk Carmen",
    x = "Usia",
    y = "BMI",
    colour = "Kebutuhan Olahraga"
  ) +
  theme_minimal()