#Library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(DT)

#Import Data

data = read.csv("data_uas.csv")
data$Purchase = factor(data$Purchase)
data$Purchase <- relevel(data$Purchase, ref = "1")
data %>% datatable(caption = "Table 1. Dataset")
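
Before splitting the data, it helps to look at the outcome distribution, since a strongly imbalanced Purchase variable would make accuracy alone misleading. A minimal sketch (it only assumes Purchase is coded 0/1, as implied by the relevel() call above):

# Structure of the predictors and class balance of the outcome
str(data)
table(data$Purchase)
prop.table(table(data$Purchase))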

#Split the dataset into training and validation sets

set.seed(123)
trainIndex <- createDataPartition(data$Purchase, p = 0.7, list = FALSE)
trainData <- data[trainIndex, ]
validationData <- data[-trainIndex, ]
trainData %>% 
  datatable(caption = "Tabel 2. Dataset training")
validationData %>%
  datatable(caption = "Tabel 3. Dataset validasi")

#Evaluating the models

# 1. Logistic Regression
logistic_model <- train(Purchase ~ Age + Income + Education_Level + Marital_Status + Job_Experience,
                        data = trainData, method = "glm", family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Now make predictions on the validation set
logistic_predictions <- predict(logistic_model, newdata = validationData)

# Evaluate each model's performance using a confusion matrix and accuracy
logistic_cm <- confusionMatrix(logistic_predictions, validationData$Purchase)

# Accuracy
logistic_cm$overall['Accuracy']
##  Accuracy 
## 0.9491525
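
The repeated glm.fit warnings during training mean that some resamples produced fitted probabilities of exactly 0 or 1, which usually points to (quasi-)complete separation: a combination of predictors splits the two Purchase classes almost perfectly in those resamples. Predictions can still be made, but the affected coefficients and their standard errors become unstable. One way to inspect this, as a sketch (extremely large estimates and standard errors are the typical symptom):

# Coefficients of the final fitted glm
summary(logistic_model$finalModel)$coefficients
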
# 2. k-Nearest Neighbors (kNN)
knn_model <- train(Purchase ~ Age + Income + Education_Level + Marital_Status + Job_Experience,
                   data = trainData, method = "knn", tuneLength = 5)

# Make predictions on the validation set
knn_predictions <- predict(knn_model, newdata = validationData)

# Evaluate each model's performance using a confusion matrix and accuracy
knn_cm <- confusionMatrix(knn_predictions, validationData$Purchase)

# Accuracy
knn_cm$overall['Accuracy']
##  Accuracy 
## 0.9491525
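
kNN is distance-based, so predictors on very different scales (for example, Income versus Age) can dominate the distance calculation. caret can standardize the predictors inside the resampling loop via the preProcess argument; a hedged variant of the call above, not part of the original analysis:

# Same model with predictors centered and scaled before computing distances
knn_model_scaled <- train(Purchase ~ Age + Income + Education_Level + Marital_Status + Job_Experience,
                          data = trainData, method = "knn", tuneLength = 5,
                          preProcess = c("center", "scale"))
knn_model_scaled$bestTune
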
# 3. Decision Tree
tree_model <- train(Purchase ~ Age + Income + Education_Level + Marital_Status + Job_Experience,
                    data = trainData, method = "rpart")

# Make predictions on the validation set
tree_predictions <- predict(tree_model, newdata = validationData)

# Evaluate each model's performance using a confusion matrix and accuracy
tree_cm <- confusionMatrix(tree_predictions, validationData$Purchase)

# Accuracy
tree_cm$overall['Accuracy']
##  Accuracy 
## 0.9491525
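
Beyond accuracy, the fitted tree itself shows which predictors drive the splits. A sketch using the rpart.plot package (an extra dependency, not loaded above):

library(rpart.plot)
# Print the split rules and draw the final tree selected by caret
print(tree_model$finalModel)
rpart.plot(tree_model$finalModel)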

#Confusion matrix

Predicted = c("Yes", "No")
Actual_Yes = c("True Positives", "False Negatives")
Actual_No = c("False Positives", "True Negatives")

data.frame(Predicted, Actual_Yes, Actual_No) %>% 
  datatable(caption = "Tabel 4. Kategori dalam confusion matrix")

#Evaluation Metrics

conmat.logistic = logistic_cm$table %>% 
  as.data.frame() %>%
  mutate(Prediction = recode(Prediction, "0" = "No", "1" = "Yes"),
         Reference = recode(Reference, "0" = "No", "1" = "Yes")) %>%
  mutate(Category = case_when(
      Prediction == "No"  & Reference == "No"  ~ "True Negative (TN)",
      Prediction == "Yes" & Reference == "No"  ~ "False Positive (FP)",
      Prediction == "No"  & Reference == "Yes" ~ "False Negative (FN)",
      Prediction == "Yes" & Reference == "Yes" ~ "True Positive (TP)"
    ))

conmat.logistic %>% datatable(caption = "Table 5. Logistic regression confusion matrix")
conmat.knn = knn_cm$table %>% 
  as.data.frame() %>%
  mutate(Prediction = recode(Prediction, "0" = "No", "1" = "Yes"),
         Reference = recode(Reference, "0" = "No", "1" = "Yes")) %>%
  mutate(Category = case_when(
      Prediction == "No"  & Reference == "No"  ~ "True Negative (TN)",
      Prediction == "Yes" & Reference == "No"  ~ "False Positive (FP)",
      Prediction == "No"  & Reference == "Yes" ~ "False Negative (FN)",
      Prediction == "Yes" & Reference == "Yes" ~ "True Positive (TP)"
    ))

conmat.knn %>% datatable(caption = "Table 6. kNN confusion matrix")
conmat.tree = tree_cm$table %>% 
  as.data.frame() %>%
  mutate(Prediction = recode(Prediction, "0" = "No", "1" = "Yes"),
         Reference = recode(Reference, "0" = "No", "1" = "Yes")) %>%
  mutate(Category = case_when(
      Prediction == "No"  & Reference == "No"  ~ "True Negative (TN)",
      Prediction == "Yes" & Reference == "No"  ~ "False Positive (FP)",
      Prediction == "No"  & Reference == "Yes" ~ "False Negative (FN)",
      Prediction == "Yes" & Reference == "Yes" ~ "True Positive (TP)"
    ))

conmat.tree %>% datatable(caption = "Table 7. Decision tree confusion matrix")
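
As a cross-check on caret's byClass output, the same metrics can be recomputed by hand from the labelled logistic regression confusion matrix (a sketch; it relies on the Category labels assigned in Table 5 and on "1" being the positive class):

# Pull the cell counts out of the labelled confusion matrix
tp <- conmat.logistic$Freq[conmat.logistic$Category == "True Positive (TP)"]
fp <- conmat.logistic$Freq[conmat.logistic$Category == "False Positive (FP)"]
fn <- conmat.logistic$Freq[conmat.logistic$Category == "False Negative (FN)"]

precision <- tp / (tp + fp)
recall    <- tp / (tp + fn)
f1        <- 2 * precision * recall / (precision + recall)
c(Precision = precision, Recall = recall, F1 = f1)
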
# Build a data frame of model evaluation results
evaluation_results <- data.frame(
  Model = c("Logistic Regression", "KNN", "Decision Tree"),
  Accuracy = c(
    logistic_cm$overall["Accuracy"],
    knn_cm$overall["Accuracy"],
    tree_cm$overall["Accuracy"]
  ),
  Precision = c(
    logistic_cm$byClass["Precision"],
    knn_cm$byClass["Precision"],
    tree_cm$byClass["Precision"]
  ),
  Recall = c(
    logistic_cm$byClass["Recall"],
    knn_cm$byClass["Recall"],
    tree_cm$byClass["Recall"]
  ),
  F1 = c(
    logistic_cm$byClass["F1"],
    knn_cm$byClass["F1"],
    tree_cm$byClass["F1"]
  )
)

evaluation_results %>% datatable(
  caption = "Tabel 8. Perbandingan Accuracy, Precision, Recall, dan F1",
  rownames = FALSE
)