This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# -----------------------------------------
# ANALISIS GALLSTONE STATUS
# -----------------------------------------
# Load Library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ResourceSelection)
## ResourceSelection 0.3-6 2023-06-27
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(biotools)
## ---
## biotools version 4.3
# Read the raw dataset from the current working directory
data <- read.csv(file = "dataset-uci 1.csv")
# Sanitise the column names so each one is a syntactically valid R name
names(data) <- make.names(names(data))
# Show the first ten column names to check the ones used later
head(colnames(data), n = 10)
## [1] "Gallstone.Status" "Age"
## [3] "Gender" "Comorbidity"
## [5] "Coronary.Artery.Disease..CAD." "Hypothyroidism"
## [7] "Hyperlipidemia" "Diabetes.Mellitus..DM."
## [9] "Height" "Weight"
library(dplyr)
# Count the missing values in each column
data %>%
  is.na() %>%
  colSums()
## Gallstone.Status
## 0
## Age
## 0
## Gender
## 0
## Comorbidity
## 0
## Coronary.Artery.Disease..CAD.
## 0
## Hypothyroidism
## 0
## Hyperlipidemia
## 0
## Diabetes.Mellitus..DM.
## 0
## Height
## 0
## Weight
## 0
## Body.Mass.Index..BMI.
## 0
## Total.Body.Water..TBW.
## 0
## Extracellular.Water..ECW.
## 0
## Intracellular.Water..ICW.
## 0
## Extracellular.Fluid.Total.Body.Water..ECF.TBW.
## 0
## Total.Body.Fat.Ratio..TBFR.....
## 0
## Lean.Mass..LM.....
## 0
## Body.Protein.Content..Protein.....
## 0
## Visceral.Fat.Rating..VFR.
## 0
## Bone.Mass..BM.
## 0
## Muscle.Mass..MM.
## 0
## Obesity....
## 0
## Total.Fat.Content..TFC.
## 0
## Visceral.Fat.Area..VFA.
## 0
## Visceral.Muscle.Area..VMA...Kg.
## 0
## Hepatic.Fat.Accumulation..HFA.
## 0
## Glucose
## 0
## Total.Cholesterol..TC.
## 0
## Low.Density.Lipoprotein..LDL.
## 0
## High.Density.Lipoprotein..HDL.
## 0
## Triglyceride
## 0
## Aspartat.Aminotransferaz..AST.
## 0
## Alanin.Aminotransferaz..ALT.
## 0
## Alkaline.Phosphatase..ALP.
## 0
## Creatinine
## 0
## Glomerular.Filtration.Rate..GFR.
## 0
## C.Reactive.Protein..CRP.
## 0
## Hemoglobin..HGB.
## 0
## Vitamin.D
## 0
# Descriptive statistics (min, quartiles, mean, max) for the columns
data %>%
  summary()
## Gallstone.Status Age Gender Comorbidity
## Min. :0.0000 Min. :20.00 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:38.50 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :49.00 Median :0.0000 Median :0.0000
## Mean :0.4953 Mean :48.07 Mean :0.4922 Mean :0.3354
## 3rd Qu.:1.0000 3rd Qu.:56.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :96.00 Max. :1.0000 Max. :3.0000
## Coronary.Artery.Disease..CAD. Hypothyroidism Hyperlipidemia
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.03762 Mean :0.02821 Mean :0.02508
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
## Diabetes.Mellitus..DM. Height Weight Body.Mass.Index..BMI.
## Min. :0.0000 Min. :145.0 Min. : 42.90 Min. :17.40
## 1st Qu.:0.0000 1st Qu.:159.5 1st Qu.: 69.60 1st Qu.:25.25
## Median :0.0000 Median :168.0 Median : 78.80 Median :28.30
## Mean :0.1348 Mean :167.2 Mean : 80.56 Mean :28.88
## 3rd Qu.:0.0000 3rd Qu.:175.0 3rd Qu.: 91.25 3rd Qu.:31.85
## Max. :1.0000 Max. :191.0 Max. :143.50 Max. :49.70
## Total.Body.Water..TBW. Extracellular.Water..ECW. Intracellular.Water..ICW.
## Min. :13.00 Min. : 9.00 Min. :13.80
## 1st Qu.:34.20 1st Qu.:14.80 1st Qu.:19.30
## Median :39.80 Median :17.10 Median :23.00
## Mean :40.59 Mean :17.07 Mean :23.63
## 3rd Qu.:47.00 3rd Qu.:19.40 3rd Qu.:27.55
## Max. :66.20 Max. :27.80 Max. :57.10
## Extracellular.Fluid.Total.Body.Water..ECF.TBW. Total.Body.Fat.Ratio..TBFR.....
## Min. :29.23 Min. : 6.30
## 1st Qu.:40.08 1st Qu.:22.02
## Median :42.00 Median :27.82
## Mean :42.21 Mean :28.27
## 3rd Qu.:44.00 3rd Qu.:34.81
## Max. :52.00 Max. :50.92
## Lean.Mass..LM..... Body.Protein.Content..Protein.....
## Min. :48.99 Min. : 5.56
## 1st Qu.:65.17 1st Qu.:14.46
## Median :72.11 Median :15.87
## Mean :71.64 Mean :15.94
## 3rd Qu.:77.85 3rd Qu.:17.43
## Max. :93.67 Max. :24.81
## Visceral.Fat.Rating..VFR. Bone.Mass..BM. Muscle.Mass..MM. Obesity....
## Min. : 1.000 Min. :1.400 Min. : 4.70 Min. : 0.40
## 1st Qu.: 6.000 1st Qu.:2.400 1st Qu.:45.80 1st Qu.: 13.90
## Median : 9.000 Median :2.800 Median :53.90 Median : 25.60
## Mean : 9.078 Mean :2.803 Mean :54.27 Mean : 35.85
## 3rd Qu.:12.000 3rd Qu.:3.200 3rd Qu.:62.60 3rd Qu.: 41.75
## Max. :31.000 Max. :4.000 Max. :78.80 Max. :1954.00
## Total.Fat.Content..TFC. Visceral.Fat.Area..VFA.
## Min. : 3.10 Min. : 0.90
## 1st Qu.:17.00 1st Qu.: 8.57
## Median :22.60 Median :11.59
## Mean :23.49 Mean :12.17
## 3rd Qu.:28.55 3rd Qu.:15.10
## Max. :62.50 Max. :41.00
## Visceral.Muscle.Area..VMA...Kg. Hepatic.Fat.Accumulation..HFA. Glucose
## Min. :18.90 Min. :0.00 Min. : 69.0
## 1st Qu.:27.25 1st Qu.:0.00 1st Qu.: 92.0
## Median :30.41 Median :1.00 Median : 98.0
## Mean :30.40 Mean :1.15 Mean :108.7
## 3rd Qu.:33.80 3rd Qu.:2.00 3rd Qu.:109.0
## Max. :41.10 Max. :4.00 Max. :575.0
## Total.Cholesterol..TC. Low.Density.Lipoprotein..LDL.
## Min. : 60.0 Min. : 11.0
## 1st Qu.:172.0 1st Qu.:100.5
## Median :198.0 Median :122.0
## Mean :203.5 Mean :126.7
## 3rd Qu.:233.0 3rd Qu.:151.0
## Max. :360.0 Max. :293.0
## High.Density.Lipoprotein..HDL. Triglyceride Aspartat.Aminotransferaz..AST.
## Min. : 25.00 Min. : 1.39 Min. : 8.00
## 1st Qu.: 40.00 1st Qu.: 83.00 1st Qu.: 15.00
## Median : 46.50 Median :119.00 Median : 18.00
## Mean : 49.48 Mean :144.50 Mean : 21.68
## 3rd Qu.: 56.00 3rd Qu.:172.00 3rd Qu.: 23.00
## Max. :273.00 Max. :838.00 Max. :195.00
## Alanin.Aminotransferaz..ALT. Alkaline.Phosphatase..ALP. Creatinine
## Min. : 3.00 Min. : 7.00 Min. :0.4600
## 1st Qu.: 14.25 1st Qu.: 58.00 1st Qu.:0.6500
## Median : 19.00 Median : 71.00 Median :0.7900
## Mean : 26.86 Mean : 73.11 Mean :0.8006
## 3rd Qu.: 30.00 3rd Qu.: 86.00 3rd Qu.:0.9200
## Max. :372.00 Max. :197.00 Max. :1.4600
## Glomerular.Filtration.Rate..GFR. C.Reactive.Protein..CRP. Hemoglobin..HGB.
## Min. : 10.60 Min. : 0.000 Min. : 8.50
## 1st Qu.: 94.17 1st Qu.: 0.000 1st Qu.:13.30
## Median :104.00 Median : 0.215 Median :14.40
## Mean :100.82 Mean : 1.854 Mean :14.42
## 3rd Qu.:110.75 3rd Qu.: 1.615 3rd Qu.:15.70
## Max. :132.00 Max. :43.400 Max. :18.80
## Vitamin.D
## Min. : 3.50
## 1st Qu.:13.25
## Median :22.00
## Mean :21.40
## 3rd Qu.:28.06
## Max. :53.10
# Handle missing values: keep the outcome plus the four predictors, then
# drop every row that has an NA in any of those columns
data_clean <- na.omit(
  dplyr::select(
    data,
    Gallstone.Status, Age, Weight, Body.Mass.Index..BMI., Vitamin.D
  )
)
# Confirm the dimensions after cleaning
dim(data_clean)
## [1] 319 5
# `data_clean` (outcome + Age, Weight, BMI, Vitamin D with NA rows removed)
# has already been built by the select + na.omit pipeline earlier in this
# script, so that identical pipeline is not repeated here.
# Make sure the target is a factor so it is treated as a categorical outcome
data_clean$Gallstone.Status <- as.factor(data_clean$Gallstone.Status)
# Train/test split: 70% of the rows go to training, sampled within each
# outcome class; the seed makes the partition reproducible
set.seed(123)
index <- createDataPartition(
  y = data_clean$Gallstone.Status,
  p = 0.7,
  list = FALSE
)
train <- data_clean[index, , drop = FALSE]
test <- data_clean[-index, , drop = FALSE]
# Logistic regression of gallstone status on the four predictors,
# fitted on the training partition only
log_model <- glm(
  formula = Gallstone.Status ~ Age + Weight + Body.Mass.Index..BMI. + Vitamin.D,
  family = "binomial",
  data = train
)
summary(log_model)
##
## Call:
## glm(formula = Gallstone.Status ~ Age + Weight + Body.Mass.Index..BMI. +
## Vitamin.D, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.170401 1.053575 -0.162 0.872
## Age 0.020559 0.013073 1.573 0.116
## Weight -0.009263 0.015723 -0.589 0.556
## Body.Mass.Index..BMI. 0.048596 0.046343 1.049 0.294
## Vitamin.D -0.070198 0.015132 -4.639 3.5e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 310.51 on 223 degrees of freedom
## Residual deviance: 280.64 on 219 degrees of freedom
## AIC: 290.64
##
## Number of Fisher Scoring iterations: 4
# Variance inflation factors to check the predictors for multicollinearity
car::vif(log_model)
## Age Weight Body.Mass.Index..BMI.
## 1.183146 2.958613 2.982490
## Vitamin.D
## 1.030311
# Predicted probabilities on the held-out test set
log_prob <- predict(object = log_model, newdata = test, type = "response")
# Classify with a 0.5 cut-off: probability > 0.5 -> "1", otherwise "0"
log_class <- factor(
  log_prob > 0.5,
  levels = c(FALSE, TRUE),
  labels = c("0", "1")
)
# Confusion matrix of predicted vs. observed classes
confusionMatrix(data = log_class, reference = test$Gallstone.Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 19
## 1 17 28
##
## Accuracy : 0.6211
## 95% CI : (0.5157, 0.7186)
## No Information Rate : 0.5053
## P-Value [Acc > NIR] : 0.01527
##
## Kappa : 0.2417
##
## Mcnemar's Test P-Value : 0.86763
##
## Sensitivity : 0.6458
## Specificity : 0.5957
## Pos Pred Value : 0.6200
## Neg Pred Value : 0.6222
## Prevalence : 0.5053
## Detection Rate : 0.3263
## Detection Prevalence : 0.5263
## Balanced Accuracy : 0.6208
##
## 'Positive' Class : 0
##
# Hosmer-Lemeshow goodness-of-fit test on the test-set predictions.
# as.numeric() on the factor yields the level codes 1/2, so subtracting 1
# recovers the 0/1 outcome expected by the test.
hoslem.test(
  x = as.numeric(test$Gallstone.Status) - 1,
  y = log_prob,
  g = 10
)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: as.numeric(test$Gallstone.Status) - 1, log_prob
## X-squared = 12.646, df = 8, p-value = 0.1246
# ROC curve for the logistic model on the test set.
# The class order (control = "0", case = "1") and the comparison direction
# (controls < cases) are fixed explicitly rather than auto-detected, so the
# curve and AUC cannot silently flip direction on a different split or seed.
roc_obj <- roc(
  test$Gallstone.Status,
  log_prob,
  levels = c("0", "1"),
  direction = "<"
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the ROC curve, then report the area under it
plot(
  roc_obj,
  main = "ROC Curve - Logistic Regression",
  col = "blue"
)
pROC::auc(roc_obj)
## Area under the curve: 0.7141
# Linear discriminant analysis with the same predictors as the logistic model
lda_model <- lda(
  Gallstone.Status ~ Age + Weight + Body.Mass.Index..BMI. + Vitamin.D,
  data = train
)
# predict() returns a list; its `class` element holds the predicted labels
lda_pred <- predict(lda_model, newdata = test)
# Confusion matrix of LDA predictions vs. observed classes
confusionMatrix(
  data = lda_pred[["class"]],
  reference = test$Gallstone.Status
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 19
## 1 17 28
##
## Accuracy : 0.6211
## 95% CI : (0.5157, 0.7186)
## No Information Rate : 0.5053
## P-Value [Acc > NIR] : 0.01527
##
## Kappa : 0.2417
##
## Mcnemar's Test P-Value : 0.86763
##
## Sensitivity : 0.6458
## Specificity : 0.5957
## Pos Pred Value : 0.6200
## Neg Pred Value : 0.6222
## Prevalence : 0.5053
## Detection Rate : 0.3263
## Detection Prevalence : 0.5263
## Balanced Accuracy : 0.6208
##
## 'Positive' Class : 0
##
# Normality check (Shapiro-Wilk) for each predictor within each outcome class
vars <- c("Age", "Weight", "Body.Mass.Index..BMI.", "Vitamin.D")
for (v in vars) {
  print(paste("Shapiro test untuk", v))
  # Values are not auto-printed inside a for loop, so the by() result must be
  # printed explicitly; otherwise the test results are silently discarded.
  print(by(train[[v]], train$Gallstone.Status, shapiro.test))
}
## [1] "Shapiro test untuk Age"
## [1] "Shapiro test untuk Weight"
## [1] "Shapiro test untuk Body.Mass.Index..BMI."
## [1] "Shapiro test untuk Vitamin.D"
# Box's M test for homogeneity of the covariance matrices across the
# outcome classes, using the training predictors
boxM(
  data = train[, vars],
  grouping = train$Gallstone.Status
)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: train[, vars]
## Chi-Sq (approx.) = 5.7685, df = 10, p-value = 0.8343
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.