library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(rpart)
library(rpart.plot)
# MEMBACA DATA
data <- read_excel("C:/Users/User/Downloads/app_data.xlsx")
data
## # A tibble: 782 × 58
## Age BMI Sex Height Weight Length_of_Stay Management Severity
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 12.7 16.899999999999… fema… 148 37 3 conservat… uncompl…
## 2 14.1 31.9 male 147 69.5 2 conservat… uncompl…
## 3 14.1 23.3 fema… 163 62 4 conservat… uncompl…
## 4 16.4 20.6 fema… 165 56 3 conservat… uncompl…
## 5 11.1 16.899999999999… fema… 163 45 3 conservat… uncompl…
## 6 11.0 30.7 male 121 45 3 conservat… uncompl…
## 7 8.98 19.399999999999… fema… 140 38.5 3 conservat… uncompl…
## 8 7.06 <NA> fema… NA 21.5 2 conservat… uncompl…
## 9 7.9 15.7 male 131 26.7 3 conservat… uncompl…
## 10 14.3 14.9 male 174 45.5 3 conservat… uncompl…
## # ℹ 772 more rows
## # ℹ 50 more variables: Diagnosis_Presumptive <chr>, Diagnosis <chr>,
## # Alvarado_Score <dbl>, Paedriatic_Appendicitis_Score <dbl>,
## # Appendix_on_US <chr>, Appendix_Diameter <dbl>, Migratory_Pain <chr>,
## # Lower_Right_Abd_Pain <chr>, Contralateral_Rebound_Tenderness <chr>,
## # Coughing_Pain <chr>, Nausea <chr>, Loss_of_Appetite <chr>,
## # Body_Temperature <dbl>, WBC_Count <dbl>, Neutrophil_Percentage <dbl>, …
# CEK TIPE DATA
str(data)
## tibble [782 × 58] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:782] 12.7 14.1 14.1 16.4 11.1 ...
## $ BMI : chr [1:782] "16.899999999999999" "31.9" "23.3" "20.6" ...
## $ Sex : chr [1:782] "female" "male" "female" "female" ...
## $ Height : num [1:782] 148 147 163 165 163 121 140 NA 131 174 ...
## $ Weight : num [1:782] 37 69.5 62 56 45 45 38.5 21.5 26.7 45.5 ...
## $ Length_of_Stay : num [1:782] 3 2 4 3 3 3 3 2 3 3 ...
## $ Management : chr [1:782] "conservative" "conservative" "conservative" "conservative" ...
## $ Severity : chr [1:782] "uncomplicated" "uncomplicated" "uncomplicated" "uncomplicated" ...
## $ Diagnosis_Presumptive : chr [1:782] "appendicitis" "appendicitis" "appendicitis" "appendicitis" ...
## $ Diagnosis : chr [1:782] "appendicitis" "no appendicitis" "no appendicitis" "no appendicitis" ...
## $ Alvarado_Score : num [1:782] 4 5 5 7 5 6 5 3 7 4 ...
## $ Paedriatic_Appendicitis_Score : num [1:782] 3 4 3 6 6 7 6 3 6 4 ...
## $ Appendix_on_US : chr [1:782] "yes" "no" "no" "no" ...
## $ Appendix_Diameter : num [1:782] 7.1 NA NA NA 7 NA NA NA 3.7 8 ...
## $ Migratory_Pain : chr [1:782] "no" "yes" "no" "yes" ...
## $ Lower_Right_Abd_Pain : chr [1:782] "yes" "yes" "yes" "yes" ...
## $ Contralateral_Rebound_Tenderness: chr [1:782] "yes" "yes" "yes" "no" ...
## $ Coughing_Pain : chr [1:782] "no" "no" "no" "no" ...
## $ Nausea : chr [1:782] "no" "no" "no" "yes" ...
## $ Loss_of_Appetite : chr [1:782] "yes" "yes" "no" "yes" ...
## $ Body_Temperature : num [1:782] 37 36.9 36.6 36 36.9 36.9 36.7 36.8 37.3 37.1 ...
## $ WBC_Count : num [1:782] 7.7 8.1 13.2 11.4 8.1 9.5 10 8 20.9 5.8 ...
## $ Neutrophil_Percentage : num [1:782] 68.2 64.8 74.8 63 44 71.4 69.1 79.6 76 47.2 ...
## $ Segmented_Neutrophils : num [1:782] NA NA NA NA NA NA NA NA NA NA ...
## $ Neutrophilia : chr [1:782] "no" "no" "no" "no" ...
## $ RBC_Count : num [1:782] 5.27 5.26 3.98 4.64 4.44 4.96 4.77 4.89 4.61 4.78 ...
## $ Hemoglobin : num [1:782] 14.8 15.7 11.4 13.6 12.6 12.5 12.7 12 13.4 12.9 ...
## $ RDW : num [1:782] 12.2 12.7 12.2 13.2 13.6 13.3 12.6 13.9 12 12.6 ...
## $ Thrombocyte_Count : num [1:782] 254 151 300 258 311 249 337 412 350 220 ...
## $ Ketones_in_Urine : chr [1:782] "++" "no" "no" "no" ...
## $ RBC_in_Urine : chr [1:782] "+" "no" "no" "no" ...
## $ WBC_in_Urine : chr [1:782] "no" "no" "no" "no" ...
## $ CRP : num [1:782] 0 3 3 0 0 63 9 0 20 0 ...
## $ Dysuria : chr [1:782] "no" "yes" "no" "yes" ...
## $ Stool : chr [1:782] "normal" "normal" "constipation" "normal" ...
## $ Peritonitis : chr [1:782] "no" "no" "no" "no" ...
## $ Psoas_Sign : chr [1:782] "yes" "yes" "yes" "yes" ...
## $ Ipsilateral_Rebound_Tenderness : chr [1:782] "no" "no" "no" "no" ...
## $ US_Performed : chr [1:782] "yes" "yes" "yes" "yes" ...
## $ US_Number : num [1:782] 882 883 884 886 887 888 889 890 891 893 ...
## $ Free_Fluids : chr [1:782] "no" "no" "no" "no" ...
## $ Appendix_Wall_Layers : chr [1:782] "intact" NA NA NA ...
## $ Target_Sign : chr [1:782] NA NA NA NA ...
## $ Appendicolith : chr [1:782] "suspected" NA NA NA ...
## $ Perfusion : chr [1:782] NA NA NA NA ...
## $ Perforation : chr [1:782] "no" NA NA NA ...
## $ Surrounding_Tissue_Reaction : chr [1:782] "yes" NA NA NA ...
## $ Appendicular_Abscess : chr [1:782] "no" NA NA NA ...
## $ Abscess_Location : chr [1:782] NA NA NA NA ...
## $ Pathological_Lymph_Nodes : chr [1:782] "yes" NA NA "yes" ...
## $ Lymph_Nodes_Location : chr [1:782] "reUB" NA NA "reUB" ...
## $ Bowel_Wall_Thickening : chr [1:782] NA NA NA NA ...
## $ Conglomerate_of_Bowel_Loops : chr [1:782] NA NA NA NA ...
## $ Ileus : chr [1:782] NA NA NA NA ...
## $ Coprostasis : chr [1:782] NA NA NA NA ...
## $ Meteorism : chr [1:782] NA "yes" "yes" NA ...
## $ Enteritis : chr [1:782] NA NA "yes" "yes" ...
## $ Gynecological_Findings : chr [1:782] NA NA NA NA ...
# MENGUBAH VARIABEL BMI DARI CHARACTER MENJADI NUMERIC
# karena BMI harus berupa angka untuk analisis
data$BMI <- as.numeric(data$BMI)
# MENGHAPUS BARIS YANG SEMUA NILAINYA NA
# baris tanpa informasi tidak bisa dipakai untuk modeling
data <- data[rowSums(is.na(data)) != ncol(data), ]
# MENGHAPUS VARIABEL DENGAN MISSING VALUE > 50%
# variabel dengan terlalu banyak NA biasanya tidak informatif
data <- data[, colMeans(is.na(data)) < 0.5]
data
## # A tibble: 781 × 40
## Age BMI Sex Height Weight Length_of_Stay Management Severity
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 12.7 16.9 female 148 37 3 conservative uncomplicated
## 2 14.1 31.9 male 147 69.5 2 conservative uncomplicated
## 3 14.1 23.3 female 163 62 4 conservative uncomplicated
## 4 16.4 20.6 female 165 56 3 conservative uncomplicated
## 5 11.1 16.9 female 163 45 3 conservative uncomplicated
## 6 11.0 30.7 male 121 45 3 conservative uncomplicated
## 7 8.98 19.4 female 140 38.5 3 conservative uncomplicated
## 8 7.06 NA female NA 21.5 2 conservative uncomplicated
## 9 7.9 15.7 male 131 26.7 3 conservative uncomplicated
## 10 14.3 14.9 male 174 45.5 3 conservative uncomplicated
## # ℹ 771 more rows
## # ℹ 32 more variables: Diagnosis_Presumptive <chr>, Diagnosis <chr>,
## # Alvarado_Score <dbl>, Paedriatic_Appendicitis_Score <dbl>,
## # Appendix_on_US <chr>, Appendix_Diameter <dbl>, Migratory_Pain <chr>,
## # Lower_Right_Abd_Pain <chr>, Contralateral_Rebound_Tenderness <chr>,
## # Coughing_Pain <chr>, Nausea <chr>, Loss_of_Appetite <chr>,
## # Body_Temperature <dbl>, WBC_Count <dbl>, Neutrophil_Percentage <dbl>, …
# MENGUBAH SEMUA VARIABEL KATEGORIK MENJADI FACTOR
data <- data %>%
mutate(across(where(is.character), as.factor))
# IMPUTASI MEDIAN DAN MODUS
# NUMERIK -> MEDIAN
# KATEGORIK -> MODUS
# Fungsi Modus
get_mode <- function(v){
uniqv <- unique(na.omit(v))
uniqv[which.max(tabulate(match(v, uniqv)))]
}
data <- data %>%
mutate(across(where(is.numeric),
~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
data <- data %>%
mutate(across(where(is.factor),
~ ifelse(is.na(.), get_mode(.), .)))
data
## # A tibble: 781 × 40
## Age BMI Sex Height Weight Length_of_Stay Management Severity
## <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int> <int>
## 1 12.7 16.9 1 148 37 3 1 2
## 2 14.1 31.9 2 147 69.5 2 1 2
## 3 14.1 23.3 1 163 62 4 1 2
## 4 16.4 20.6 1 165 56 3 1 2
## 5 11.1 16.9 1 163 45 3 1 2
## 6 11.0 30.7 2 121 45 3 1 2
## 7 8.98 19.4 1 140 38.5 3 1 2
## 8 7.06 18.1 1 150. 21.5 2 1 2
## 9 7.9 15.7 2 131 26.7 3 1 2
## 10 14.3 14.9 2 174 45.5 3 1 2
## # ℹ 771 more rows
## # ℹ 32 more variables: Diagnosis_Presumptive <int>, Diagnosis <int>,
## # Alvarado_Score <dbl>, Paedriatic_Appendicitis_Score <dbl>,
## # Appendix_on_US <int>, Appendix_Diameter <dbl>, Migratory_Pain <int>,
## # Lower_Right_Abd_Pain <int>, Contralateral_Rebound_Tenderness <int>,
## # Coughing_Pain <int>, Nausea <int>, Loss_of_Appetite <int>,
## # Body_Temperature <dbl>, WBC_Count <dbl>, Neutrophil_Percentage <dbl>, …
# MENGHAPUS VARIABEL YANG TIDAK DIGUNAKAN
data <- data[, !(names(data) %in% c(
"Management",
"Severity",
"US_Number",
"Length_of_Stay",
"Diagnosis_Presumptive"
))]
data
## # A tibble: 781 × 35
## Age BMI Sex Height Weight Diagnosis Alvarado_Score
## <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl>
## 1 12.7 16.9 1 148 37 1 4
## 2 14.1 31.9 2 147 69.5 2 5
## 3 14.1 23.3 1 163 62 2 5
## 4 16.4 20.6 1 165 56 2 7
## 5 11.1 16.9 1 163 45 1 5
## 6 11.0 30.7 2 121 45 2 6
## 7 8.98 19.4 1 140 38.5 2 5
## 8 7.06 18.1 1 150. 21.5 2 3
## 9 7.9 15.7 2 131 26.7 2 7
## 10 14.3 14.9 2 174 45.5 1 4
## # ℹ 771 more rows
## # ℹ 28 more variables: Paedriatic_Appendicitis_Score <dbl>,
## # Appendix_on_US <int>, Appendix_Diameter <dbl>, Migratory_Pain <int>,
## # Lower_Right_Abd_Pain <int>, Contralateral_Rebound_Tenderness <int>,
## # Coughing_Pain <int>, Nausea <int>, Loss_of_Appetite <int>,
## # Body_Temperature <dbl>, WBC_Count <dbl>, Neutrophil_Percentage <dbl>,
## # Neutrophilia <int>, RBC_Count <dbl>, Hemoglobin <dbl>, RDW <dbl>, …
library(FSelectorRcpp)
# Hitung Information Gain
ig <- information_gain(Diagnosis ~ ., data)
ig_sorted <- ig[order(-ig$importance), ]
ig_sorted
## attributes importance
## 9 Appendix_Diameter 0.29297069
## 8 Appendix_on_US 0.09733408
## 6 Alvarado_Score 0.09035710
## 27 CRP 0.08616099
## 17 WBC_Count 0.08316729
## 30 Peritonitis 0.07519595
## 7 Paedriatic_Appendicitis_Score 0.05794704
## 18 Neutrophil_Percentage 0.04664516
## 19 Neutrophilia 0.04158565
## 34 Free_Fluids 0.02187316
## 16 Body_Temperature 0.02025787
## 14 Nausea 0.01593093
## 12 Contralateral_Rebound_Tenderness 0.01497887
## 15 Loss_of_Appetite 0.01378416
## 1 Age 0.00000000
## 2 BMI 0.00000000
## 3 Sex 0.00000000
## 4 Height 0.00000000
## 5 Weight 0.00000000
## 10 Migratory_Pain 0.00000000
## 11 Lower_Right_Abd_Pain 0.00000000
## 13 Coughing_Pain 0.00000000
## 20 RBC_Count 0.00000000
## 21 Hemoglobin 0.00000000
## 22 RDW 0.00000000
## 23 Thrombocyte_Count 0.00000000
## 24 Ketones_in_Urine 0.00000000
## 25 RBC_in_Urine 0.00000000
## 26 WBC_in_Urine 0.00000000
## 28 Dysuria 0.00000000
## 29 Stool 0.00000000
## 31 Psoas_Sign 0.00000000
## 32 Ipsilateral_Rebound_Tenderness 0.00000000
## 33 US_Performed 0.00000000
top6_vars <- ig_sorted$attributes[1:10]
top6_vars
## [1] "Appendix_Diameter" "Appendix_on_US"
## [3] "Alvarado_Score" "CRP"
## [5] "WBC_Count" "Peritonitis"
## [7] "Paedriatic_Appendicitis_Score" "Neutrophil_Percentage"
## [9] "Neutrophilia" "Free_Fluids"
data_model <- data[, c("Diagnosis", top6_vars)]
# MEMBAGI DATA MENJADI TRAINING DAN TESTING
set.seed(123)
train_index <- sample(1:nrow(data_model), 0.8*nrow(data_model))
trainData <- data_model[train_index, ]
testData <- data_model[-train_index, ]
# MEMBUAT MODEL DECISION TREE
model_dt <- rpart(
Diagnosis ~ .,
data = trainData,
method = "class",
parms = list(split = "information")
)
rpart.plot(
model_dt,
type = 2,
extra = 104,
fallen.leaves = TRUE,
box.palette = c("#2ECC71","#E74C3C"),
shadow.col = "gray",
tweak = 1.1,
main = "Pohon Keputusan Diagnosis Apendisitis"
)

# Prediksi data training
pred_train <- predict(model_dt, trainData, type = "class")
# Confusion matrix training
print("DATA TRAINING")
## [1] "DATA TRAINING"
table(Predicted = pred_train, Actual = trainData$Diagnosis)
## Actual
## Predicted 1 2
## 1 341 18
## 2 28 237
# Accuracy training
mean(pred_train == trainData$Diagnosis)
## [1] 0.9262821
# Prediksi data testing
pred_test <- predict(model_dt, testData, type = "class")
# Confusion matrix testing
print("DATA TESTING")
## [1] "DATA TESTING"
table(Predicted = pred_test, Actual = testData$Diagnosis)
## Actual
## Predicted 1 2
## 1 83 6
## 2 12 56
# Accuracy testing
mean(pred_test == testData$Diagnosis)
## [1] 0.8853503