library(mlbench)
library(readxl)
library(dplyr)
library(ggplot2)
library(caret)
library(MASS)
library(class)
library(rpart)
library(rpart.plot)
library(e1071)
library(pROC)
library(ROCR)
library(corrplot)
library(reshape2)
library(ggcorrplot)
data_pdm <- read_excel("D:/Files/Anin's Docs/COLLEGE!/UNS/Semester 5/PDM/Tugas/2/gallstone-1/dataset-uci/dataset-uci.xlsx")
set.seed(123) # memastikan replikasi hasil
str(data_pdm)
## tibble [319 × 39] (S3: tbl_df/tbl/data.frame)
## $ Gallstone Status : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Age : num [1:319] 50 47 61 41 42 96 37 41 38 38 ...
## $ Gender : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Comorbidity : num [1:319] 0 1 0 0 0 0 0 0 0 0 ...
## $ Coronary Artery Disease (CAD) : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Hypothyroidism : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Hyperlipidemia : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Diabetes Mellitus (DM) : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
## $ Height : num [1:319] 185 176 171 168 178 155 185 176 186 171 ...
## $ Weight : num [1:319] 92.8 94.5 91.1 67.7 89.6 49 67.1 114 93.7 68.6 ...
## $ Body Mass Index (BMI) : num [1:319] 27.1 30.5 31.2 24 28.3 20.4 19.6 36.8 27.1 23.5 ...
## $ Total Body Water (TBW) : num [1:319] 52.9 43.1 47.2 41.4 51.4 34 41.8 57.6 51.9 39.5 ...
## $ Extracellular Water (ECW) : num [1:319] 21.2 19.5 20.1 17 20 15.7 17.1 23.5 20.9 16.6 ...
## $ Intracellular Water (ICW) : num [1:319] 31.7 23.6 27.1 24.4 31.4 18.3 24.7 34.1 31 22.9 ...
## $ Extracellular Fluid/Total Body Water (ECF/TBW): num [1:319] 40 45 43 41 39 46 41 41 40 42 ...
## $ Total Body Fat Ratio (TBFR) (%) : num [1:319] 19.2 32.8 27.3 15.8 20 6.3 11.3 31.8 21.5 19.2 ...
## $ Lean Mass (LM) (%) : num [1:319] 80.8 67.2 72.7 84.2 80 ...
## $ Body Protein Content (Protein) (%) : num [1:319] 18.9 16.7 16.4 16.9 16.8 ...
## $ Visceral Fat Rating (VFR) : num [1:319] 9 15 15 6 8 12 3 16 8 6 ...
## $ Bone Mass (BM) : num [1:319] 3.7 3.2 3.3 2.9 3.5 2.4 3 3.8 3.6 2.8 ...
## $ Muscle Mass (MM) : num [1:319] 71.4 60.3 62.9 54.1 68.2 43.5 56.5 73.9 70 52.6 ...
## $ Obesity (%) : num [1:319] 23.4 38.8 41.7 9 28.6 7.4 10.9 67.4 23.1 6.7 ...
## $ Total Fat Content (TFC) : num [1:319] 17.8 31 24.9 10.7 17.9 3.1 7.6 36.3 20.1 13.2 ...
## $ Visceral Fat Area (VFA) : num [1:319] 10.6 18.4 16.2 6.5 10.4 0.9 4.6 23.7 12.7 8.2 ...
## $ Visceral Muscle Area (VMA) (Kg) : num [1:319] 39.7 32.7 34 29.2 37.4 25.9 31.4 37.5 37.8 28.8 ...
## $ Hepatic Fat Accumulation (HFA) : num [1:319] 0 0 0 1 2 0 0 3 2 0 ...
## $ Glucose : num [1:319] 102 94 103 69 109 74 94 125 93 93 ...
## $ Total Cholesterol (TC) : num [1:319] 250 172 179 173 205 133 166 246 290 239 ...
## $ Low Density Lipoprotein (LDL) : num [1:319] 175 108 124 73 154 ...
## $ High Density Lipoprotein (HDL) : num [1:319] 40 43 43 59 30 30 40 34 33 43 ...
## $ Triglyceride : num [1:319] 134 103 69 53 326 65 60 464 679 129 ...
## $ Aspartat Aminotransferaz (AST) : num [1:319] 20 14 18 20 27 13 15 26 68 19 ...
## $ Alanin Aminotransferaz (ALT) : num [1:319] 22 13 14 12 54 13 14 28 102 34 ...
## $ Alkaline Phosphatase (ALP) : num [1:319] 87 46 66 34 71 60 72 69 70 75 ...
## $ Creatinine : num [1:319] 0.82 0.87 1.25 1.02 0.82 1.46 0.77 1.3 0.91 0.91 ...
## $ Glomerular Filtration Rate (GFR) : num [1:319] NA 107.1 65.5 94.1 112.5 ...
## $ C-Reactive Protein (CRP) : num [1:319] 0 0 0 0 0 0 0 0.11 1.57 0 ...
## $ Hemoglobin (HGB) : num [1:319] 16 14.4 16.2 15.4 16.8 11 13.8 16.5 16.5 16.6 ...
## $ Vitamin D : num [1:319] 33 25 30.2 35.4 40.6 45.8 20 24.5 22.7 15.6 ...
summary(data_pdm)
## Gallstone Status Age Gender Comorbidity
## Min. :0.0000 Min. :20.00 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:38.50 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :49.00 Median :0.0000 Median :0.0000
## Mean :0.4953 Mean :48.07 Mean :0.4922 Mean :0.3354
## 3rd Qu.:1.0000 3rd Qu.:56.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :96.00 Max. :1.0000 Max. :3.0000
##
## Coronary Artery Disease (CAD) Hypothyroidism Hyperlipidemia
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.03762 Mean :0.02821 Mean :0.02508
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
##
## Diabetes Mellitus (DM) Height Weight Body Mass Index (BMI)
## Min. :0.0000 Min. :145.0 Min. : 42.90 Min. :17.40
## 1st Qu.:0.0000 1st Qu.:159.5 1st Qu.: 69.60 1st Qu.:25.25
## Median :0.0000 Median :168.0 Median : 78.80 Median :28.30
## Mean :0.1348 Mean :167.2 Mean : 80.56 Mean :28.88
## 3rd Qu.:0.0000 3rd Qu.:175.0 3rd Qu.: 91.25 3rd Qu.:31.85
## Max. :1.0000 Max. :191.0 Max. :143.50 Max. :49.70
##
## Total Body Water (TBW) Extracellular Water (ECW) Intracellular Water (ICW)
## Min. :13.00 Min. : 9.00 Min. :13.80
## 1st Qu.:34.20 1st Qu.:14.80 1st Qu.:19.30
## Median :39.80 Median :17.10 Median :23.00
## Mean :40.59 Mean :17.07 Mean :23.63
## 3rd Qu.:47.00 3rd Qu.:19.40 3rd Qu.:27.55
## Max. :66.20 Max. :27.80 Max. :57.10
##
## Extracellular Fluid/Total Body Water (ECF/TBW) Total Body Fat Ratio (TBFR) (%)
## Min. :29.23 Min. : 6.30
## 1st Qu.:40.08 1st Qu.:22.02
## Median :42.00 Median :27.82
## Mean :42.21 Mean :28.27
## 3rd Qu.:44.00 3rd Qu.:34.81
## Max. :52.00 Max. :50.92
##
## Lean Mass (LM) (%) Body Protein Content (Protein) (%)
## Min. :48.99 Min. : 5.56
## 1st Qu.:65.17 1st Qu.:14.46
## Median :72.11 Median :15.87
## Mean :71.64 Mean :15.94
## 3rd Qu.:77.85 3rd Qu.:17.43
## Max. :93.67 Max. :24.81
##
## Visceral Fat Rating (VFR) Bone Mass (BM) Muscle Mass (MM) Obesity (%)
## Min. : 1.000 Min. :1.400 Min. : 4.70 Min. : 0.40
## 1st Qu.: 6.000 1st Qu.:2.400 1st Qu.:45.80 1st Qu.: 13.90
## Median : 9.000 Median :2.800 Median :53.90 Median : 25.60
## Mean : 9.078 Mean :2.803 Mean :54.27 Mean : 35.85
## 3rd Qu.:12.000 3rd Qu.:3.200 3rd Qu.:62.60 3rd Qu.: 41.75
## Max. :31.000 Max. :4.000 Max. :78.80 Max. :1954.00
##
## Total Fat Content (TFC) Visceral Fat Area (VFA)
## Min. : 3.10 Min. : 0.90
## 1st Qu.:17.00 1st Qu.: 8.57
## Median :22.60 Median :11.59
## Mean :23.49 Mean :12.17
## 3rd Qu.:28.55 3rd Qu.:15.10
## Max. :62.50 Max. :41.00
##
## Visceral Muscle Area (VMA) (Kg) Hepatic Fat Accumulation (HFA) Glucose
## Min. :18.90 Min. :0.00 Min. : 69.0
## 1st Qu.:27.25 1st Qu.:0.00 1st Qu.: 92.0
## Median :30.41 Median :1.00 Median : 98.0
## Mean :30.40 Mean :1.15 Mean :108.7
## 3rd Qu.:33.80 3rd Qu.:2.00 3rd Qu.:109.0
## Max. :41.10 Max. :4.00 Max. :575.0
##
## Total Cholesterol (TC) Low Density Lipoprotein (LDL)
## Min. : 60.0 Min. : 11.0
## 1st Qu.:172.0 1st Qu.:100.5
## Median :198.0 Median :122.0
## Mean :203.5 Mean :126.7
## 3rd Qu.:233.0 3rd Qu.:151.0
## Max. :360.0 Max. :293.0
##
## High Density Lipoprotein (HDL) Triglyceride Aspartat Aminotransferaz (AST)
## Min. : 25.00 Min. : 1.39 Min. : 8.00
## 1st Qu.: 40.00 1st Qu.: 83.00 1st Qu.: 15.00
## Median : 46.50 Median :119.00 Median : 18.00
## Mean : 49.48 Mean :144.50 Mean : 21.68
## 3rd Qu.: 56.00 3rd Qu.:172.00 3rd Qu.: 23.00
## Max. :273.00 Max. :838.00 Max. :195.00
##
## Alanin Aminotransferaz (ALT) Alkaline Phosphatase (ALP) Creatinine
## Min. : 3.00 Min. : 7.00 Min. :0.4600
## 1st Qu.: 14.25 1st Qu.: 58.00 1st Qu.:0.6500
## Median : 19.00 Median : 71.00 Median :0.7900
## Mean : 26.86 Mean : 73.11 Mean :0.8006
## 3rd Qu.: 30.00 3rd Qu.: 86.00 3rd Qu.:0.9200
## Max. :372.00 Max. :197.00 Max. :1.4600
##
## Glomerular Filtration Rate (GFR) C-Reactive Protein (CRP) Hemoglobin (HGB)
## Min. : 10.60 Min. : 0.000 Min. : 8.50
## 1st Qu.: 94.17 1st Qu.: 0.000 1st Qu.:13.30
## Median :104.00 Median : 0.215 Median :14.40
## Mean :100.78 Mean : 1.854 Mean :14.42
## 3rd Qu.:110.63 3rd Qu.: 1.615 3rd Qu.:15.70
## Max. :132.00 Max. :43.400 Max. :18.80
## NA's :1
## Vitamin D
## Min. : 3.50
## 1st Qu.:13.25
## Median :22.00
## Mean :21.40
## 3rd Qu.:28.06
## Max. :53.10
##
head(data_pdm)
## # A tibble: 6 × 39
## `Gallstone Status` Age Gender Comorbidity `Coronary Artery Disease (CAD)`
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 50 0 0 0
## 2 0 47 0 1 0
## 3 0 61 0 0 0
## 4 0 41 0 0 0
## 5 0 42 0 0 0
## 6 0 96 0 0 0
## # ℹ 34 more variables: Hypothyroidism <dbl>, Hyperlipidemia <dbl>,
## # `Diabetes Mellitus (DM)` <dbl>, Height <dbl>, Weight <dbl>,
## # `Body Mass Index (BMI)` <dbl>, `Total Body Water (TBW)` <dbl>,
## # `Extracellular Water (ECW)` <dbl>, `Intracellular Water (ICW)` <dbl>,
## # `Extracellular Fluid/Total Body Water (ECF/TBW)` <dbl>,
## # `Total Body Fat Ratio (TBFR) (%)` <dbl>, `Lean Mass (LM) (%)` <dbl>,
## # `Body Protein Content (Protein) (%)` <dbl>, …
colSums(is.na(data_pdm))
## Gallstone Status
## 0
## Age
## 0
## Gender
## 0
## Comorbidity
## 0
## Coronary Artery Disease (CAD)
## 0
## Hypothyroidism
## 0
## Hyperlipidemia
## 0
## Diabetes Mellitus (DM)
## 0
## Height
## 0
## Weight
## 0
## Body Mass Index (BMI)
## 0
## Total Body Water (TBW)
## 0
## Extracellular Water (ECW)
## 0
## Intracellular Water (ICW)
## 0
## Extracellular Fluid/Total Body Water (ECF/TBW)
## 0
## Total Body Fat Ratio (TBFR) (%)
## 0
## Lean Mass (LM) (%)
## 0
## Body Protein Content (Protein) (%)
## 0
## Visceral Fat Rating (VFR)
## 0
## Bone Mass (BM)
## 0
## Muscle Mass (MM)
## 0
## Obesity (%)
## 0
## Total Fat Content (TFC)
## 0
## Visceral Fat Area (VFA)
## 0
## Visceral Muscle Area (VMA) (Kg)
## 0
## Hepatic Fat Accumulation (HFA)
## 0
## Glucose
## 0
## Total Cholesterol (TC)
## 0
## Low Density Lipoprotein (LDL)
## 0
## High Density Lipoprotein (HDL)
## 0
## Triglyceride
## 0
## Aspartat Aminotransferaz (AST)
## 0
## Alanin Aminotransferaz (ALT)
## 0
## Alkaline Phosphatase (ALP)
## 0
## Creatinine
## 0
## Glomerular Filtration Rate (GFR)
## 1
## C-Reactive Protein (CRP)
## 0
## Hemoglobin (HGB)
## 0
## Vitamin D
## 0
num_vars <- dplyr::select_if(data_pdm, is.numeric)
corrplot(cor(num_vars), method = "color", tl.cex = 0.7)
table(data_pdm$`Gallstone Status`)
##
## 0 1
## 161 158
ggplot(data_pdm, aes(x = `Gallstone Status`)) +
geom_bar(fill = "#00BFC4") +
theme_minimal() +
labs(title = "Distribusi Kelas Target")
df <- data_pdm
df$y <- factor(df$`Gallstone Status`, levels = c(0,1), labels = c("No","Yes"))
X <- df[, !(names(df) %in% c("Gallstone Status", "y"))]
X$Gender <- as.numeric(factor(X$Gender))
X$Comorbidity <- as.numeric(factor(X$Comorbidity))
X$`Hepatic Fat Accumulation (HFA)` <- as.numeric(factor(X$`Hepatic Fat Accumulation (HFA)`))
df_final <- cbind(y = df$y, X)
set.seed(123)
index <- createDataPartition(df_final$y, p = 0.8, list = FALSE)
train <- df_final[index, ]
test <- df_final[-index, ]
pp <- preProcess(train[, -1], method = c("center", "scale"))
train_sc <- train
train_sc[, -1] <- predict(pp, train[, -1])
test_sc <- test
test_sc[, -1] <- predict(pp, test[, -1])
str(train_sc)
## 'data.frame': 256 obs. of 39 variables:
## $ y : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 0.166 -0.567 -0.486 3.915 -0.893 ...
## $ Gender : num -0.952 -0.952 -0.952 -0.952 -0.952 ...
## $ Comorbidity : num -0.647 -0.647 -0.647 -0.647 -0.647 ...
## $ Coronary Artery Disease (CAD) : num -0.155 -0.155 -0.155 -0.155 -0.155 ...
## $ Hypothyroidism : num -0.167 -0.167 -0.167 -0.167 -0.167 ...
## $ Hyperlipidemia : num -0.155 -0.155 -0.155 -0.155 -0.155 ...
## $ Diabetes Mellitus (DM) : num -0.417 -0.417 -0.417 -0.417 -0.417 ...
## $ Height : num 1.7557 0.0771 1.0645 -1.2065 1.7557 ...
## $ Weight : num 0.768 -0.841 0.563 -2.04 -0.879 ...
## $ Body Mass Index (BMI) : num -0.352 -0.945 -0.123 -1.633 -1.786 ...
## $ Total Body Water (TBW) : num 1.52 0.068 1.331 -0.866 0.118 ...
## $ Extracellular Water (ECW) : num 1.29605 -0.04168 0.91384 -0.45574 -0.00983 ...
## $ Intracellular Water (ICW) : num 1.4428 0.0966 1.3875 -1.0283 0.1519 ...
## $ Extracellular Fluid/Total Body Water (ECF/TBW): num -0.635 -0.331 -0.939 1.19 -0.331 ...
## $ Total Body Fat Ratio (TBFR) (%) : num -1.031 -1.43 -0.937 -2.547 -1.959 ...
## $ Lean Mass (LM) (%) : num 1.049 1.444 0.952 2.563 1.973 ...
## $ Body Protein Content (Protein) (%) : num 1.228 0.362 0.322 1.053 1.674 ...
## $ Visceral Fat Rating (VFR) : num -0.0154 -0.7099 -0.2469 0.6791 -1.4044 ...
## $ Bone Mass (BM) : num 1.734 0.168 1.342 -0.812 0.363 ...
## $ Muscle Mass (MM) : num 1.5828 -0.0522 1.2804 -1.054 0.1746 ...
## $ Obesity (%) : num -0.3104 -0.9921 -0.0642 -1.0678 -0.9021 ...
## $ Total Fat Content (TFC) : num -0.576 -1.314 -0.565 -2.105 -1.637 ...
## $ Visceral Fat Area (VFA) : num -0.281 -1.053 -0.319 -2.108 -1.411 ...
## $ Visceral Muscle Area (VMA) (Kg) : num 2.097 -0.316 1.568 -1.075 0.189 ...
## $ Hepatic Fat Accumulation (HFA) : num -1.105 -0.181 0.744 -1.105 -1.105 ...
## $ Glucose : num -0.1701 -0.8422 -0.0275 -0.7404 -0.333 ...
## $ Total Cholesterol (TC) : num 0.927 -0.7168 -0.0336 -1.5707 -0.8662 ...
## $ Low Density Lipoprotein (LDL) : num 1.186 -1.433 0.647 -0.971 -0.457 ...
## $ High Density Lipoprotein (HDL) : num -0.513 0.502 -1.047 -1.047 -0.513 ...
## $ Triglyceride : num -0.144 -0.928 1.714 -0.812 -0.861 ...
## $ Aspartat Aminotransferaz (AST) : num -0.0355 -0.0355 0.7912 -0.8622 -0.626 ...
## $ Alanin Aminotransferaz (ALT) : num -0.199 -0.78 1.661 -0.722 -0.664 ...
## $ Alkaline Phosphatase (ALP) : num 0.5569 -1.601 -0.0945 -0.5424 -0.0538 ...
## $ Creatinine : num 0.128 1.252 0.128 3.724 -0.153 ...
## $ Glomerular Filtration Rate (GFR) : num NA -0.391 0.657 -3.262 0.986 ...
## $ C-Reactive Protein (CRP) : num -0.375 -0.375 -0.375 -0.375 -0.375 ...
## $ Hemoglobin (HGB) : num 0.864 0.533 1.305 -1.892 -0.348 ...
## $ Vitamin D : num 1.121 1.356 1.864 2.371 -0.148 ...
levels(train_sc$y)
## [1] "No" "Yes"
levels(test_sc$y)
## [1] "No" "Yes"
tree_model <- rpart(y ~ ., data = train, method = "class")
rpart.plot(tree_model, type = 3, extra = 101, fallen.leaves = TRUE)
pred_train_class <- predict(tree_model, type = "class") # ok pakai data training
table(pred_train_class, train$y)
##
## pred_train_class No Yes
## No 114 28
## Yes 15 99
conf_tree_train <- caret::confusionMatrix(pred_train_class, train$y, positive = "Yes")
conf_tree_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 114 28
## Yes 15 99
##
## Accuracy : 0.832
## 95% CI : (0.7805, 0.8757)
## No Information Rate : 0.5039
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6638
##
## Mcnemar's Test P-Value : 0.06725
##
## Sensitivity : 0.7795
## Specificity : 0.8837
## Pos Pred Value : 0.8684
## Neg Pred Value : 0.8028
## Prevalence : 0.4961
## Detection Rate : 0.3867
## Detection Prevalence : 0.4453
## Balanced Accuracy : 0.8316
##
## 'Positive' Class : Yes
##
pred_test_class <- predict(tree_model, newdata = test, type = "class")
table(pred_test_class, test$y)
##
## pred_test_class No Yes
## No 27 11
## Yes 5 20
conf_tree_test <- caret::confusionMatrix(pred_test_class, test$y, positive = "Yes")
conf_tree_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 27 11
## Yes 5 20
##
## Accuracy : 0.746
## 95% CI : (0.6206, 0.8473)
## No Information Rate : 0.5079
## P-Value [Acc > NIR] : 9.638e-05
##
## Kappa : 0.4904
##
## Mcnemar's Test P-Value : 0.2113
##
## Sensitivity : 0.6452
## Specificity : 0.8438
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.7105
## Prevalence : 0.4921
## Detection Rate : 0.3175
## Detection Prevalence : 0.3968
## Balanced Accuracy : 0.7445
##
## 'Positive' Class : Yes
##
log_model <- glm(y ~ ., data = train, family = binomial)
prob_train <- predict(log_model, newdata = train, type = "response")
pred_train <- ifelse(prob_train > 0.5, "Yes", "No")
pred_train <- factor(pred_train, levels = levels(train$y))
conf_matrix_train <- table(Predicted = pred_train, Actual = train$y)
conf_matrix_train
## Actual
## Predicted No Yes
## No 115 22
## Yes 13 105
accuracy_train <- round(sum(diag(conf_matrix_train)) / sum(conf_matrix_train) * 100, 2)
accuracy_train
## [1] 86.27
cat(paste0("\nAkurasi data training sebesar ", accuracy_train, "%.\n"))
##
## Akurasi data training sebesar 86.27%.
conf_log_train <- caret::confusionMatrix(pred_train, train$y, positive = "Yes")
conf_log_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 115 22
## Yes 13 105
##
## Accuracy : 0.8627
## 95% CI : (0.8143, 0.9025)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7254
##
## Mcnemar's Test P-Value : 0.1763
##
## Sensitivity : 0.8268
## Specificity : 0.8984
## Pos Pred Value : 0.8898
## Neg Pred Value : 0.8394
## Prevalence : 0.4980
## Detection Rate : 0.4118
## Detection Prevalence : 0.4627
## Balanced Accuracy : 0.8626
##
## 'Positive' Class : Yes
##
prob_test <- predict(log_model, newdata = test, type = "response")
pred_test <- ifelse(prob_test > 0.5, "Yes", "No")
pred_test <- factor(pred_test, levels = levels(test$y))
conf_matrix_test <- table(Predicted = pred_test, Actual = test$y)
conf_matrix_test
## Actual
## Predicted No Yes
## No 30 11
## Yes 2 20
accuracy_test <- round(sum(diag(conf_matrix_test)) / sum(conf_matrix_test) * 100, 2)
accuracy_test
## [1] 79.37
cat(paste0("\nAkurasi data testing sebesar ", accuracy_test, "%.\n"))
##
## Akurasi data testing sebesar 79.37%.
conf_log_test <- caret::confusionMatrix(pred_test, test$y, positive = "Yes")
conf_log_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 30 11
## Yes 2 20
##
## Accuracy : 0.7937
## 95% CI : (0.673, 0.8853)
## No Information Rate : 0.5079
## P-Value [Acc > NIR] : 2.719e-06
##
## Kappa : 0.5853
##
## Mcnemar's Test P-Value : 0.0265
##
## Sensitivity : 0.6452
## Specificity : 0.9375
## Pos Pred Value : 0.9091
## Neg Pred Value : 0.7317
## Prevalence : 0.4921
## Detection Rate : 0.3175
## Detection Prevalence : 0.3492
## Balanced Accuracy : 0.7913
##
## 'Positive' Class : Yes
##
train_x <- dplyr::select(train_sc, -y)
train_x[is.na(train_x)] <- 0
test_x <- dplyr::select(test_sc, -y)
test_x[is.na(test_x)] <- 0
train_y <- train_sc$y
test_y <- test_sc$y
train_x <- as.data.frame(train_x)
train_y <- as.factor(train_y)
set.seed(123)
ctrl <- caret::trainControl(method = "cv", number = 10)
grid <- data.frame(k = seq(3, 31, 2))
knn_cv <- caret::train(
x = train_x, y = train_y,
method = "knn",
trControl = ctrl,
tuneGrid = grid,
metric = "Accuracy"
)
best_k <- knn_cv$bestTune$k
print(knn_cv)
## k-Nearest Neighbors
##
## 256 samples
## 38 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 230, 230, 230, 231, 231, 230, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 3 0.6561538 0.3093554
## 5 0.6569231 0.3120052
## 7 0.6755385 0.3491110
## 9 0.6758462 0.3492669
## 11 0.6521538 0.3007999
## 13 0.6524615 0.3012033
## 15 0.6526154 0.3019216
## 17 0.6449231 0.2856811
## 19 0.6486154 0.2919221
## 21 0.6290769 0.2536559
## 23 0.6253846 0.2469093
## 25 0.6098462 0.2151749
## 27 0.6098462 0.2158941
## 29 0.6329231 0.2612917
## 31 0.6249231 0.2453434
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
cat(paste0("\nK terbaik hasil CV = ", best_k, "\n"))
##
## K terbaik hasil CV = 9
pred_train_knn <- predict(knn_cv, newdata = train_x)
conf_knn_train <- caret::confusionMatrix(pred_train_knn, train_y, positive = "Yes")
conf_knn_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 111 51
## Yes 18 76
##
## Accuracy : 0.7305
## 95% CI : (0.6717, 0.7838)
## No Information Rate : 0.5039
## P-Value [Acc > NIR] : 1.193e-13
##
## Kappa : 0.4598
##
## Mcnemar's Test P-Value : 0.000117
##
## Sensitivity : 0.5984
## Specificity : 0.8605
## Pos Pred Value : 0.8085
## Neg Pred Value : 0.6852
## Prevalence : 0.4961
## Detection Rate : 0.2969
## Detection Prevalence : 0.3672
## Balanced Accuracy : 0.7294
##
## 'Positive' Class : Yes
##
pred_test_knn <- predict(knn_cv, newdata = test_x)
conf_knn_test <- caret::confusionMatrix(pred_test_knn, test_y, positive = "Yes")
conf_knn_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 28 19
## Yes 4 12
##
## Accuracy : 0.6349
## 95% CI : (0.504, 0.7527)
## No Information Rate : 0.5079
## P-Value [Acc > NIR] : 0.028824
##
## Kappa : 0.2641
##
## Mcnemar's Test P-Value : 0.003509
##
## Sensitivity : 0.3871
## Specificity : 0.8750
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.5957
## Prevalence : 0.4921
## Detection Rate : 0.1905
## Detection Prevalence : 0.2540
## Balanced Accuracy : 0.6310
##
## 'Positive' Class : Yes
##
conf_matrix_knn_test <- table(Predicted = pred_test_knn, Actual = test_y)
conf_matrix_knn_test
## Actual
## Predicted No Yes
## No 28 19
## Yes 4 12
accuracy_knn_test <- round(sum(diag(conf_matrix_knn_test)) / sum(conf_matrix_knn_test) * 100, 2)
cat(paste0("\nAkurasi data testing (KNN, k=", best_k, ") sebesar ", accuracy_knn_test, "%.\n"))
##
## Akurasi data testing (KNN, k=9) sebesar 63.49%.
m_tree <- get_metrics(conf_tree_test) m_log <- get_metrics(conf_log_test) m_knn <- get_metrics(conf_knn_test)
model_eval <- data.frame( Model = c(“Decision Tree”,“Logistic Regression”,“KNN”), Accuracy = c(m_tree[“Accuracy”], m_log[“Accuracy”], m_knn[“Accuracy”]), Precision = c(m_tree[“Precision”], m_log[“Precision”], m_knn[“Precision”]), Recall = c(m_tree[“Recall”], m_log[“Recall”], m_knn[“Recall”]), F1 = c(m_tree[“F1”], m_log[“F1”], m_knn[“F1”]) ) print(model_eval)
<h1 style="color: black; font-size:22px"> Feature Importance </h1>
```r
library(caret)
library(ggplot2)
library(tibble)
scaler <- caret::preProcess(train[, -1], method = c("center","scale"))
train_std <- train
train_std[, -1] <- predict(scaler, train[, -1])
log_model_std <- glm(y ~ ., data = train_std, family = binomial)
summary_log <- summary(log_model_std)
coefs <- as.data.frame(summary_log$coefficients)
colnames(coefs) <- c("Estimate", "Std.Error", "z.value", "p.value")
coefs <- coefs[-1, , drop = FALSE] # drop intercept
coefs$Importance <- abs(coefs$Estimate)
coefs$OR <- exp(coefs$Estimate) # odds ratio
coefs_sorted <- coefs[order(coefs$Importance, decreasing = TRUE), ]
coefs_sorted
## Estimate Std.Error
## Weight 3.30787737 3.7875910
## `Extracellular Water (ECW)` -3.15591628 2.5751484
## Hyperlipidemia 2.93303260 209.7408772
## `Intracellular Water (ICW)` 2.62233196 3.1696649
## `Bone Mass (BM)` -2.24252710 0.9687638
## `Obesity (%)` -1.95781607 0.8194581
## `C-Reactive Protein (CRP)` 1.89049116 0.6787019
## `Lean Mass (LM) (%)` 1.73570188 2.5717544
## `Total Body Fat Ratio (TBFR) (%)` 1.65007988 2.1172123
## `Total Fat Content (TFC)` -1.02899746 0.9486100
## Gender -1.02266152 0.6209531
## `Muscle Mass (MM)` -1.00546770 1.0755186
## `Vitamin D` -0.98098986 0.2392966
## `Total Body Water (TBW)` 0.97235599 2.2273507
## `Body Protein Content (Protein) (%)` -0.90442550 0.7854519
## Age 0.76297397 0.5031528
## `Visceral Fat Area (VFA)` 0.75238005 1.0067807
## `Hemoglobin (HGB)` -0.73842485 0.3289968
## `Diabetes Mellitus (DM)` 0.71398942 0.3201217
## `Body Mass Index (BMI)` 0.71318804 2.1632437
## `Aspartat Aminotransferaz (AST)` -0.65468588 0.3718906
## Hypothyroidism -0.62601903 0.5873089
## `Visceral Fat Rating (VFR)` -0.55532145 0.9622551
## `Extracellular Fluid/Total Body Water (ECF/TBW)` 0.54246097 1.2924876
## `Alanin Aminotransferaz (ALT)` 0.49471792 0.3624082
## `High Density Lipoprotein (HDL)` 0.48991779 0.4628680
## `Hepatic Fat Accumulation (HFA)` 0.39193430 0.2704924
## `Glomerular Filtration Rate (GFR)` 0.38524846 0.2664003
## Triglyceride -0.29530544 0.3499490
## Comorbidity -0.27798272 0.3559688
## `Low Density Lipoprotein (LDL)` -0.25949004 0.5216240
## `Total Cholesterol (TC)` 0.22864573 0.5639644
## `Coronary Artery Disease (CAD)` -0.20606698 0.2458566
## `Visceral Muscle Area (VMA) (Kg)` 0.18777388 0.3625015
## Glucose -0.11728419 0.2855388
## Creatinine 0.09729786 0.3099899
## `Alkaline Phosphatase (ALP)` 0.05982663 0.2386774
## Height 0.04838280 1.3674660
## z.value p.value
## Weight 0.87334598 3.824745e-01
## `Extracellular Water (ECW)` -1.22552795 2.203764e-01
## Hyperlipidemia 0.01398408 9.888427e-01
## `Intracellular Water (ICW)` 0.82732152 4.080548e-01
## `Bone Mass (BM)` -2.31483365 2.062204e-02
## `Obesity (%)` -2.38915944 1.688697e-02
## `C-Reactive Protein (CRP)` 2.78545135 5.345326e-03
## `Lean Mass (LM) (%)` 0.67490965 4.997332e-01
## `Total Body Fat Ratio (TBFR) (%)` 0.77936439 4.357651e-01
## `Total Fat Content (TFC)` -1.08474235 2.780358e-01
## Gender -1.64692241 9.957400e-02
## `Muscle Mass (MM)` -0.93486775 3.498565e-01
## `Vitamin D` -4.09947195 4.140938e-05
## `Total Body Water (TBW)` 0.43655272 6.624358e-01
## `Body Protein Content (Protein) (%)` -1.15147149 2.495383e-01
## Age 1.51638609 1.294218e-01
## `Visceral Fat Area (VFA)` 0.74731272 4.548748e-01
## `Hemoglobin (HGB)` -2.24447425 2.480190e-02
## `Diabetes Mellitus (DM)` 2.23036871 2.572297e-02
## `Body Mass Index (BMI)` 0.32968455 7.416383e-01
## `Aspartat Aminotransferaz (AST)` -1.76042611 7.833558e-02
## Hypothyroidism -1.06591106 2.864639e-01
## `Visceral Fat Rating (VFR)` -0.57710417 5.638691e-01
## `Extracellular Fluid/Total Body Water (ECF/TBW)` 0.41970303 6.747024e-01
## `Alanin Aminotransferaz (ALT)` 1.36508492 1.722264e-01
## `High Density Lipoprotein (HDL)` 1.05843963 2.898551e-01
## `Hepatic Fat Accumulation (HFA)` 1.44896623 1.473470e-01
## `Glomerular Filtration Rate (GFR)` 1.44612631 1.481418e-01
## Triglyceride -0.84385278 3.987517e-01
## Comorbidity -0.78091887 4.348502e-01
## `Low Density Lipoprotein (LDL)` -0.49746569 6.188607e-01
## `Total Cholesterol (TC)` 0.40542582 6.851645e-01
## `Coronary Artery Disease (CAD)` -0.83815921 4.019413e-01
## `Visceral Muscle Area (VMA) (Kg)` 0.51799473 6.044619e-01
## Glucose -0.41074687 6.812582e-01
## Creatinine 0.31387429 7.536165e-01
## `Alkaline Phosphatase (ALP)` 0.25065896 8.020778e-01
## Height 0.03538136 9.717757e-01
## Importance OR
## Weight 3.30787737 27.32705867
## `Extracellular Water (ECW)` 3.15591628 0.04259935
## Hyperlipidemia 2.93303260 18.78451008
## `Intracellular Water (ICW)` 2.62233196 13.76779215
## `Bone Mass (BM)` 2.24252710 0.10618981
## `Obesity (%)` 1.95781607 0.14116638
## `C-Reactive Protein (CRP)` 1.89049116 6.62262065
## `Lean Mass (LM) (%)` 1.73570188 5.67290813
## `Total Body Fat Ratio (TBFR) (%)` 1.65007988 5.20739577
## `Total Fat Content (TFC)` 1.02899746 0.35736505
## Gender 1.02266152 0.35963649
## `Muscle Mass (MM)` 1.00546770 0.36587348
## `Vitamin D` 0.98098986 0.37493978
## `Total Body Water (TBW)` 0.97235599 2.64416675
## `Body Protein Content (Protein) (%)` 0.90442550 0.40477436
## Age 0.76297397 2.14464485
## `Visceral Fat Area (VFA)` 0.75238005 2.12204459
## `Hemoglobin (HGB)` 0.73842485 0.47786604
## `Diabetes Mellitus (DM)` 0.71398942 2.04212191
## `Body Mass Index (BMI)` 0.71318804 2.04048605
## `Aspartat Aminotransferaz (AST)` 0.65468588 0.51960525
## Hypothyroidism 0.62601903 0.53471626
## `Visceral Fat Rating (VFR)` 0.55532145 0.57388776
## `Extracellular Fluid/Total Body Water (ECF/TBW)` 0.54246097 1.72023510
## `Alanin Aminotransferaz (ALT)` 0.49471792 1.64003556
## `High Density Lipoprotein (HDL)` 0.48991779 1.63218203
## `Hepatic Fat Accumulation (HFA)` 0.39193430 1.47984049
## `Glomerular Filtration Rate (GFR)` 0.38524846 1.46997950
## Triglyceride 0.29530544 0.74430421
## Comorbidity 0.27798272 0.75730990
## `Low Density Lipoprotein (LDL)` 0.25949004 0.77144489
## `Total Cholesterol (TC)` 0.22864573 1.25689667
## `Coronary Artery Disease (CAD)` 0.20606698 0.81377857
## `Visceral Muscle Area (VMA) (Kg)` 0.18777388 1.20656066
## Glucose 0.11728419 0.88933242
## Creatinine 0.09729786 1.10218863
## `Alkaline Phosphatase (ALP)` 0.05982663 1.06165247
## Height 0.04838280 1.04957236
cat("\nTop 10 fitur paling berpengaruh (berdasarkan |β|):\n")
##
## Top 10 fitur paling berpengaruh (berdasarkan |β|):
print(head(coefs_sorted[, c("Estimate","Std.Error","z.value","p.value","OR","Importance")], 10))
## Estimate Std.Error z.value p.value
## Weight 3.307877 3.7875910 0.87334598 0.382474525
## `Extracellular Water (ECW)` -3.155916 2.5751484 -1.22552795 0.220376365
## Hyperlipidemia 2.933033 209.7408772 0.01398408 0.988842684
## `Intracellular Water (ICW)` 2.622332 3.1696649 0.82732152 0.408054847
## `Bone Mass (BM)` -2.242527 0.9687638 -2.31483365 0.020622038
## `Obesity (%)` -1.957816 0.8194581 -2.38915944 0.016886972
## `C-Reactive Protein (CRP)` 1.890491 0.6787019 2.78545135 0.005345326
## `Lean Mass (LM) (%)` 1.735702 2.5717544 0.67490965 0.499733167
## `Total Body Fat Ratio (TBFR) (%)` 1.650080 2.1172123 0.77936439 0.435765094
## `Total Fat Content (TFC)` -1.028997 0.9486100 -1.08474235 0.278035784
## OR Importance
## Weight 27.32705867 3.307877
## `Extracellular Water (ECW)` 0.04259935 3.155916
## Hyperlipidemia 18.78451008 2.933033
## `Intracellular Water (ICW)` 13.76779215 2.622332
## `Bone Mass (BM)` 0.10618981 2.242527
## `Obesity (%)` 0.14116638 1.957816
## `C-Reactive Protein (CRP)` 6.62262065 1.890491
## `Lean Mass (LM) (%)` 5.67290813 1.735702
## `Total Body Fat Ratio (TBFR) (%)` 5.20739577 1.650080
## `Total Fat Content (TFC)` 0.35736505 1.028997
top_n <- min(10, nrow(coefs_sorted))
plot_df <- head(coefs_sorted, top_n)
plot_df$Feature <- rownames(plot_df)
ggplot(plot_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_col(fill = "#00BFC4") +
coord_flip() +
labs(
title = "Feature Importance (|β|) - Logistic Regression",
x = NULL, y = "|Koefisien (β)| - standardized"
) +
theme_minimal()
sig_features <- c("`Obesity (%)`",
"`Bone Mass (BM)`",
"`C-Reactive Protein (CRP)`",
"`Vitamin D`",
"`Hemoglobin (HGB)`",
"`Diabetes Mellitus (DM)`")
formula_sig <- as.formula(paste("y ~", paste(sig_features, collapse = " + ")))
log_model_sig <- glm(formula_sig, data = train, family = binomial)
prob_test_sig <- predict(log_model_sig, newdata = test, type = "response")
pred_test_sig <- ifelse(prob_test_sig > 0.5, "Yes", "No")
pred_test_sig <- factor(pred_test_sig, levels = levels(test$y))
conf_sig <- caret::confusionMatrix(pred_test_sig, test$y, positive = "Yes")
conf_sig
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 29 8
## Yes 3 23
##
## Accuracy : 0.8254
## 95% CI : (0.709, 0.9095)
## No Information Rate : 0.5079
## P-Value [Acc > NIR] : 1.604e-07
##
## Kappa : 0.6498
##
## Mcnemar's Test P-Value : 0.2278
##
## Sensitivity : 0.7419
## Specificity : 0.9062
## Pos Pred Value : 0.8846
## Neg Pred Value : 0.7838
## Prevalence : 0.4921
## Detection Rate : 0.3651
## Detection Prevalence : 0.4127
## Balanced Accuracy : 0.8241
##
## 'Positive' Class : Yes
##
compare_acc <- data.frame(
Model = c("Full Model (All Variables)", "Reduced Model (Significant Variables)"),
Accuracy = c(as.numeric(conf_log_test$overall["Accuracy"]),
as.numeric(conf_sig$overall["Accuracy"])),
F1 = c(as.numeric(conf_log_test$byClass["F1"]),
as.numeric(conf_sig$byClass["F1"])),
Precision = c(as.numeric(conf_log_test$byClass["Precision"]),
as.numeric(conf_sig$byClass["Precision"])),
Recall = c(as.numeric(conf_log_test$byClass["Recall"]),
as.numeric(conf_sig$byClass["Recall"]))
)
print(compare_acc)
## Model Accuracy F1 Precision Recall
## 1 Full Model (All Variables) 0.7936508 0.7547170 0.9090909 0.6451613
## 2 Reduced Model (Significant Variables) 0.8253968 0.8070175 0.8846154 0.7419355
ggplot(compare_acc, aes(x = Model, y = Accuracy, fill = Model)) +
geom_col(width = 0.6) +
geom_text(aes(label = round(Accuracy, 3)), vjust = -0.3, size = 4) +
theme_minimal() +
labs(
title = "Perbandingan Akurasi: Model Penuh vs Model Signifikan (Logistic Regression)",
y = "Accuracy", x = NULL
) +
scale_fill_manual(values = c("#00BFC4", "#F8766D"))
library(pROC)
library(ggplot2)
prob_test_sig <- predict(log_model_sig, newdata = test, type = "response")
roc_sig <- pROC::roc(
response = test$y, # target (factor: No / Yes)
predictor = prob_test_sig, # probabilitas prediksi
levels = c("No", "Yes"),
direction = "<"
)
auc_sig <- pROC::auc(roc_sig)
cat("AUC Model Signifikan :", round(as.numeric(auc_sig), 3), "\n")
## AUC Model Signifikan : 0.856
p <- pROC::ggroc(roc_sig, color = "#F8766D", size = 1.2) +
geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "gray40") +
labs(
title = "Kurva ROC - Model Logistik (Variabel Signifikan)",
subtitle = paste0("AUC = ", round(as.numeric(auc_sig), 3)),
x = "False Positive Rate (1 - Spesifisitas)",
y = "True Positive Rate (Sensitivitas)"
) +
theme_minimal()
print(p)