library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
library(rpart)
library(rpart.plot)
# MEMBACA DATA
data <- read_excel("C:/Users/User/Downloads/app_data.xlsx")
data
## # A tibble: 782 × 58
##      Age BMI              Sex   Height Weight Length_of_Stay Management Severity
##    <dbl> <chr>            <chr>  <dbl>  <dbl>          <dbl> <chr>      <chr>   
##  1 12.7  16.899999999999… fema…    148   37                3 conservat… uncompl…
##  2 14.1  31.9             male     147   69.5              2 conservat… uncompl…
##  3 14.1  23.3             fema…    163   62                4 conservat… uncompl…
##  4 16.4  20.6             fema…    165   56                3 conservat… uncompl…
##  5 11.1  16.899999999999… fema…    163   45                3 conservat… uncompl…
##  6 11.0  30.7             male     121   45                3 conservat… uncompl…
##  7  8.98 19.399999999999… fema…    140   38.5              3 conservat… uncompl…
##  8  7.06 <NA>             fema…     NA   21.5              2 conservat… uncompl…
##  9  7.9  15.7             male     131   26.7              3 conservat… uncompl…
## 10 14.3  14.9             male     174   45.5              3 conservat… uncompl…
## # ℹ 772 more rows
## # ℹ 50 more variables: Diagnosis_Presumptive <chr>, Diagnosis <chr>,
## #   Alvarado_Score <dbl>, Paedriatic_Appendicitis_Score <dbl>,
## #   Appendix_on_US <chr>, Appendix_Diameter <dbl>, Migratory_Pain <chr>,
## #   Lower_Right_Abd_Pain <chr>, Contralateral_Rebound_Tenderness <chr>,
## #   Coughing_Pain <chr>, Nausea <chr>, Loss_of_Appetite <chr>,
## #   Body_Temperature <dbl>, WBC_Count <dbl>, Neutrophil_Percentage <dbl>, …
# CEK TIPE DATA
str(data)
## tibble [782 × 58] (S3: tbl_df/tbl/data.frame)
##  $ Age                             : num [1:782] 12.7 14.1 14.1 16.4 11.1 ...
##  $ BMI                             : chr [1:782] "16.899999999999999" "31.9" "23.3" "20.6" ...
##  $ Sex                             : chr [1:782] "female" "male" "female" "female" ...
##  $ Height                          : num [1:782] 148 147 163 165 163 121 140 NA 131 174 ...
##  $ Weight                          : num [1:782] 37 69.5 62 56 45 45 38.5 21.5 26.7 45.5 ...
##  $ Length_of_Stay                  : num [1:782] 3 2 4 3 3 3 3 2 3 3 ...
##  $ Management                      : chr [1:782] "conservative" "conservative" "conservative" "conservative" ...
##  $ Severity                        : chr [1:782] "uncomplicated" "uncomplicated" "uncomplicated" "uncomplicated" ...
##  $ Diagnosis_Presumptive           : chr [1:782] "appendicitis" "appendicitis" "appendicitis" "appendicitis" ...
##  $ Diagnosis                       : chr [1:782] "appendicitis" "no appendicitis" "no appendicitis" "no appendicitis" ...
##  $ Alvarado_Score                  : num [1:782] 4 5 5 7 5 6 5 3 7 4 ...
##  $ Paedriatic_Appendicitis_Score   : num [1:782] 3 4 3 6 6 7 6 3 6 4 ...
##  $ Appendix_on_US                  : chr [1:782] "yes" "no" "no" "no" ...
##  $ Appendix_Diameter               : num [1:782] 7.1 NA NA NA 7 NA NA NA 3.7 8 ...
##  $ Migratory_Pain                  : chr [1:782] "no" "yes" "no" "yes" ...
##  $ Lower_Right_Abd_Pain            : chr [1:782] "yes" "yes" "yes" "yes" ...
##  $ Contralateral_Rebound_Tenderness: chr [1:782] "yes" "yes" "yes" "no" ...
##  $ Coughing_Pain                   : chr [1:782] "no" "no" "no" "no" ...
##  $ Nausea                          : chr [1:782] "no" "no" "no" "yes" ...
##  $ Loss_of_Appetite                : chr [1:782] "yes" "yes" "no" "yes" ...
##  $ Body_Temperature                : num [1:782] 37 36.9 36.6 36 36.9 36.9 36.7 36.8 37.3 37.1 ...
##  $ WBC_Count                       : num [1:782] 7.7 8.1 13.2 11.4 8.1 9.5 10 8 20.9 5.8 ...
##  $ Neutrophil_Percentage           : num [1:782] 68.2 64.8 74.8 63 44 71.4 69.1 79.6 76 47.2 ...
##  $ Segmented_Neutrophils           : num [1:782] NA NA NA NA NA NA NA NA NA NA ...
##  $ Neutrophilia                    : chr [1:782] "no" "no" "no" "no" ...
##  $ RBC_Count                       : num [1:782] 5.27 5.26 3.98 4.64 4.44 4.96 4.77 4.89 4.61 4.78 ...
##  $ Hemoglobin                      : num [1:782] 14.8 15.7 11.4 13.6 12.6 12.5 12.7 12 13.4 12.9 ...
##  $ RDW                             : num [1:782] 12.2 12.7 12.2 13.2 13.6 13.3 12.6 13.9 12 12.6 ...
##  $ Thrombocyte_Count               : num [1:782] 254 151 300 258 311 249 337 412 350 220 ...
##  $ Ketones_in_Urine                : chr [1:782] "++" "no" "no" "no" ...
##  $ RBC_in_Urine                    : chr [1:782] "+" "no" "no" "no" ...
##  $ WBC_in_Urine                    : chr [1:782] "no" "no" "no" "no" ...
##  $ CRP                             : num [1:782] 0 3 3 0 0 63 9 0 20 0 ...
##  $ Dysuria                         : chr [1:782] "no" "yes" "no" "yes" ...
##  $ Stool                           : chr [1:782] "normal" "normal" "constipation" "normal" ...
##  $ Peritonitis                     : chr [1:782] "no" "no" "no" "no" ...
##  $ Psoas_Sign                      : chr [1:782] "yes" "yes" "yes" "yes" ...
##  $ Ipsilateral_Rebound_Tenderness  : chr [1:782] "no" "no" "no" "no" ...
##  $ US_Performed                    : chr [1:782] "yes" "yes" "yes" "yes" ...
##  $ US_Number                       : num [1:782] 882 883 884 886 887 888 889 890 891 893 ...
##  $ Free_Fluids                     : chr [1:782] "no" "no" "no" "no" ...
##  $ Appendix_Wall_Layers            : chr [1:782] "intact" NA NA NA ...
##  $ Target_Sign                     : chr [1:782] NA NA NA NA ...
##  $ Appendicolith                   : chr [1:782] "suspected" NA NA NA ...
##  $ Perfusion                       : chr [1:782] NA NA NA NA ...
##  $ Perforation                     : chr [1:782] "no" NA NA NA ...
##  $ Surrounding_Tissue_Reaction     : chr [1:782] "yes" NA NA NA ...
##  $ Appendicular_Abscess            : chr [1:782] "no" NA NA NA ...
##  $ Abscess_Location                : chr [1:782] NA NA NA NA ...
##  $ Pathological_Lymph_Nodes        : chr [1:782] "yes" NA NA "yes" ...
##  $ Lymph_Nodes_Location            : chr [1:782] "reUB" NA NA "reUB" ...
##  $ Bowel_Wall_Thickening           : chr [1:782] NA NA NA NA ...
##  $ Conglomerate_of_Bowel_Loops     : chr [1:782] NA NA NA NA ...
##  $ Ileus                           : chr [1:782] NA NA NA NA ...
##  $ Coprostasis                     : chr [1:782] NA NA NA NA ...
##  $ Meteorism                       : chr [1:782] NA "yes" "yes" NA ...
##  $ Enteritis                       : chr [1:782] NA NA "yes" "yes" ...
##  $ Gynecological_Findings          : chr [1:782] NA NA NA NA ...
# MENGUBAH VARIABEL BMI DARI CHARACTER MENJADI NUMERIC
# karena BMI harus berupa angka untuk analisis
data$BMI <- as.numeric(data$BMI)

# MENGHAPUS BARIS YANG SEMUA NILAINYA NA
# baris tanpa informasi tidak bisa dipakai untuk modeling
data <- data[rowSums(is.na(data)) != ncol(data), ]

# MENGHAPUS VARIABEL DENGAN MISSING VALUE > 50%
# variabel dengan terlalu banyak NA biasanya tidak informatif
data <- data[, colMeans(is.na(data)) < 0.5]

data
## # A tibble: 781 × 40
##      Age   BMI Sex    Height Weight Length_of_Stay Management   Severity     
##    <dbl> <dbl> <chr>   <dbl>  <dbl>          <dbl> <chr>        <chr>        
##  1 12.7   16.9 female    148   37                3 conservative uncomplicated
##  2 14.1   31.9 male      147   69.5              2 conservative uncomplicated
##  3 14.1   23.3 female    163   62                4 conservative uncomplicated
##  4 16.4   20.6 female    165   56                3 conservative uncomplicated
##  5 11.1   16.9 female    163   45                3 conservative uncomplicated
##  6 11.0   30.7 male      121   45                3 conservative uncomplicated
##  7  8.98  19.4 female    140   38.5              3 conservative uncomplicated
##  8  7.06  NA   female     NA   21.5              2 conservative uncomplicated
##  9  7.9   15.7 male      131   26.7              3 conservative uncomplicated
## 10 14.3   14.9 male      174   45.5              3 conservative uncomplicated
## # ℹ 771 more rows
## # ℹ 32 more variables: Diagnosis_Presumptive <chr>, Diagnosis <chr>,
## #   Alvarado_Score <dbl>, Paedriatic_Appendicitis_Score <dbl>,
## #   Appendix_on_US <chr>, Appendix_Diameter <dbl>, Migratory_Pain <chr>,
## #   Lower_Right_Abd_Pain <chr>, Contralateral_Rebound_Tenderness <chr>,
## #   Coughing_Pain <chr>, Nausea <chr>, Loss_of_Appetite <chr>,
## #   Body_Temperature <dbl>, WBC_Count <dbl>, Neutrophil_Percentage <dbl>, …
# MENGUBAH SEMUA VARIABEL KATEGORIK MENJADI FACTOR
data <- data %>%
  mutate(across(where(is.character), as.factor))
# IMPUTASI MEDIAN DAN MODUS
# NUMERIK -> MEDIAN
# KATEGORIK -> MODUS

# Fungsi Modus
get_mode <- function(v){
  uniqv <- unique(na.omit(v))
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

data <- data %>%
  mutate(across(where(is.numeric),
                ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

data <- data %>%
  mutate(across(where(is.factor),
                ~ ifelse(is.na(.), get_mode(.), .)))

data
## # A tibble: 781 × 40
##      Age   BMI   Sex Height Weight Length_of_Stay Management Severity
##    <dbl> <dbl> <int>  <dbl>  <dbl>          <dbl>      <int>    <int>
##  1 12.7   16.9     1   148    37                3          1        2
##  2 14.1   31.9     2   147    69.5              2          1        2
##  3 14.1   23.3     1   163    62                4          1        2
##  4 16.4   20.6     1   165    56                3          1        2
##  5 11.1   16.9     1   163    45                3          1        2
##  6 11.0   30.7     2   121    45                3          1        2
##  7  8.98  19.4     1   140    38.5              3          1        2
##  8  7.06  18.1     1   150.   21.5              2          1        2
##  9  7.9   15.7     2   131    26.7              3          1        2
## 10 14.3   14.9     2   174    45.5              3          1        2
## # ℹ 771 more rows
## # ℹ 32 more variables: Diagnosis_Presumptive <int>, Diagnosis <int>,
## #   Alvarado_Score <dbl>, Paedriatic_Appendicitis_Score <dbl>,
## #   Appendix_on_US <int>, Appendix_Diameter <dbl>, Migratory_Pain <int>,
## #   Lower_Right_Abd_Pain <int>, Contralateral_Rebound_Tenderness <int>,
## #   Coughing_Pain <int>, Nausea <int>, Loss_of_Appetite <int>,
## #   Body_Temperature <dbl>, WBC_Count <dbl>, Neutrophil_Percentage <dbl>, …
# MENGHAPUS VARIABEL YANG TIDAK DIGUNAKAN
data <- data[, !(names(data) %in% c(
  "Management",
  "Severity",
  "US_Number",
  "Length_of_Stay", 
  "Diagnosis_Presumptive"
))]

data
## # A tibble: 781 × 35
##      Age   BMI   Sex Height Weight Diagnosis Alvarado_Score
##    <dbl> <dbl> <int>  <dbl>  <dbl>     <int>          <dbl>
##  1 12.7   16.9     1   148    37           1              4
##  2 14.1   31.9     2   147    69.5         2              5
##  3 14.1   23.3     1   163    62           2              5
##  4 16.4   20.6     1   165    56           2              7
##  5 11.1   16.9     1   163    45           1              5
##  6 11.0   30.7     2   121    45           2              6
##  7  8.98  19.4     1   140    38.5         2              5
##  8  7.06  18.1     1   150.   21.5         2              3
##  9  7.9   15.7     2   131    26.7         2              7
## 10 14.3   14.9     2   174    45.5         1              4
## # ℹ 771 more rows
## # ℹ 28 more variables: Paedriatic_Appendicitis_Score <dbl>,
## #   Appendix_on_US <int>, Appendix_Diameter <dbl>, Migratory_Pain <int>,
## #   Lower_Right_Abd_Pain <int>, Contralateral_Rebound_Tenderness <int>,
## #   Coughing_Pain <int>, Nausea <int>, Loss_of_Appetite <int>,
## #   Body_Temperature <dbl>, WBC_Count <dbl>, Neutrophil_Percentage <dbl>,
## #   Neutrophilia <int>, RBC_Count <dbl>, Hemoglobin <dbl>, RDW <dbl>, …
library(FSelectorRcpp)

# Hitung Information Gain
ig <- information_gain(Diagnosis ~ ., data)
ig_sorted <- ig[order(-ig$importance), ]
ig_sorted
##                          attributes importance
## 9                 Appendix_Diameter 0.29297069
## 8                    Appendix_on_US 0.09733408
## 6                    Alvarado_Score 0.09035710
## 27                              CRP 0.08616099
## 17                        WBC_Count 0.08316729
## 30                      Peritonitis 0.07519595
## 7     Paedriatic_Appendicitis_Score 0.05794704
## 18            Neutrophil_Percentage 0.04664516
## 19                     Neutrophilia 0.04158565
## 34                      Free_Fluids 0.02187316
## 16                 Body_Temperature 0.02025787
## 14                           Nausea 0.01593093
## 12 Contralateral_Rebound_Tenderness 0.01497887
## 15                 Loss_of_Appetite 0.01378416
## 1                               Age 0.00000000
## 2                               BMI 0.00000000
## 3                               Sex 0.00000000
## 4                            Height 0.00000000
## 5                            Weight 0.00000000
## 10                   Migratory_Pain 0.00000000
## 11             Lower_Right_Abd_Pain 0.00000000
## 13                    Coughing_Pain 0.00000000
## 20                        RBC_Count 0.00000000
## 21                       Hemoglobin 0.00000000
## 22                              RDW 0.00000000
## 23                Thrombocyte_Count 0.00000000
## 24                 Ketones_in_Urine 0.00000000
## 25                     RBC_in_Urine 0.00000000
## 26                     WBC_in_Urine 0.00000000
## 28                          Dysuria 0.00000000
## 29                            Stool 0.00000000
## 31                       Psoas_Sign 0.00000000
## 32   Ipsilateral_Rebound_Tenderness 0.00000000
## 33                     US_Performed 0.00000000
top6_vars <- ig_sorted$attributes[1:10]
top6_vars
##  [1] "Appendix_Diameter"             "Appendix_on_US"               
##  [3] "Alvarado_Score"                "CRP"                          
##  [5] "WBC_Count"                     "Peritonitis"                  
##  [7] "Paedriatic_Appendicitis_Score" "Neutrophil_Percentage"        
##  [9] "Neutrophilia"                  "Free_Fluids"
data_model <- data[, c("Diagnosis", top6_vars)]
# MEMBAGI DATA MENJADI TRAINING DAN TESTING
set.seed(123)
train_index <- sample(1:nrow(data_model), 0.8*nrow(data_model))
trainData <- data_model[train_index, ]
testData  <- data_model[-train_index, ]
# MEMBUAT MODEL DECISION TREE
model_dt <- rpart(
  Diagnosis ~ .,
  data = trainData,
  method = "class",
  parms = list(split = "information")
)

rpart.plot(
  model_dt,
  type = 2,
  extra = 104,
  fallen.leaves = TRUE,
  box.palette = c("#2ECC71","#E74C3C"),
  shadow.col = "gray",
  tweak = 1.1,
  main = "Pohon Keputusan Diagnosis Apendisitis"
)

# Prediksi data training
pred_train <- predict(model_dt, trainData, type = "class")

# Confusion matrix training
print("DATA TRAINING")
## [1] "DATA TRAINING"
table(Predicted = pred_train, Actual = trainData$Diagnosis)
##          Actual
## Predicted   1   2
##         1 341  18
##         2  28 237
# Accuracy training
mean(pred_train == trainData$Diagnosis)
## [1] 0.9262821
# Prediksi data testing
pred_test <- predict(model_dt, testData, type = "class")

# Confusion matrix testing
print("DATA TESTING")
## [1] "DATA TESTING"
table(Predicted = pred_test, Actual = testData$Diagnosis)
##          Actual
## Predicted  1  2
##         1 83  6
##         2 12 56
# Accuracy testing
mean(pred_test == testData$Diagnosis)
## [1] 0.8853503