library(MASS)
library(brant)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data <- read.csv("ObesityDataSet_raw_and_data_sinthetic.csv")
head(data)
## Age Gender Height Weight CALC FAVC FCVC NCP SCC SMOKE CH2O
## 1 21 Female 1.62 64.0 no no 2 3 no no 2
## 2 21 Female 1.52 56.0 Sometimes no 3 3 yes yes 3
## 3 23 Male 1.80 77.0 Frequently no 2 3 no no 2
## 4 27 Male 1.80 87.0 Frequently no 3 3 no no 2
## 5 22 Male 1.78 89.8 Sometimes no 2 1 no no 2
## 6 29 Male 1.62 53.0 Sometimes yes 2 3 no no 2
## family_history_with_overweight FAF TUE CAEC MTRANS
## 1 yes 0 1 Sometimes Public_Transportation
## 2 yes 3 0 Sometimes Public_Transportation
## 3 yes 2 1 Sometimes Public_Transportation
## 4 no 2 0 Sometimes Walking
## 5 no 0 0 Sometimes Public_Transportation
## 6 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
str(data)
## 'data.frame': 2111 obs. of 17 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
summary(data)
## Age Gender Height Weight
## Min. :14.00 Length:2111 Min. :1.450 Min. : 39.00
## 1st Qu.:19.95 Class :character 1st Qu.:1.630 1st Qu.: 65.47
## Median :22.78 Mode :character Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## CALC FAVC FCVC NCP
## Length:2111 Length:2111 Min. :1.000 Min. :1.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:2.659
## Mode :character Mode :character Median :2.386 Median :3.000
## Mean :2.419 Mean :2.686
## 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.000
## SCC SMOKE CH2O
## Length:2111 Length:2111 Min. :1.000
## Class :character Class :character 1st Qu.:1.585
## Mode :character Mode :character Median :2.000
## Mean :2.008
## 3rd Qu.:2.477
## Max. :3.000
## family_history_with_overweight FAF TUE
## Length:2111 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:0.1245 1st Qu.:0.0000
## Mode :character Median :1.0000 Median :0.6253
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## CAEC MTRANS NObeyesdad
## Length:2111 Length:2111 Length:2111
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
colSums(is.na(data))
## Age Gender
## 0 0
## Height Weight
## 0 0
## CALC FAVC
## 0 0
## FCVC NCP
## 0 0
## SCC SMOKE
## 0 0
## CH2O family_history_with_overweight
## 0 0
## FAF TUE
## 0 0
## CAEC MTRANS
## 0 0
## NObeyesdad
## 0
nrow(data)
## [1] 2111
data <- unique(data)
nrow(data)
## [1] 2087
table(data$NObeyesdad)
##
## Insufficient_Weight Normal_Weight Obesity_Type_I Obesity_Type_II
## 267 282 351 297
## Obesity_Type_III Overweight_Level_I Overweight_Level_II
## 324 276 290
Visualsi Target
par(mar = c(10,4,4,2)) # tambah margin bawah
barplot(table(data$NObeyesdad),
main = "Distribusi Tingkat Obesitas",
col = "lightblue",
las = 2,
cex.names = 0.7) # kecilin font
numeric_cols <- c(
"FCVC",
"NCP",
"CH2O",
"FAF",
"TUE"
)
outlier_summary <- c()
for (col in numeric_cols) {
Q1 <- quantile(data[[col]], 0.25)
Q3 <- quantile(data[[col]], 0.75)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
outliers <- data[[col]][
data[[col]] < lower |
data[[col]] > upper
]
outlier_summary[col] <- length(outliers)
}
outlier_summary
## FCVC NCP CH2O FAF TUE
## 0 577 0 0 0
for (col in numeric_cols) {
boxplot(data[[col]],
main = paste("Boxplot", col),
col = "lightblue")
}
for (col in numeric_cols) {
Q1 <- quantile(data[[col]], 0.25)
Q3 <- quantile(data[[col]], 0.75)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
data[[col]][data[[col]] < lower] <- lower
data[[col]][data[[col]] > upper] <- upper
}
for (col in numeric_cols) {
cat(col, ": min =", min(data[[col]]),
"max =", max(data[[col]]), "\n")
}
## FCVC : min = 1 max = 3
## NCP : min = 2.243667 max = 3.453799
## CH2O : min = 1 max = 3
## FAF : min = 0 max = 3
## TUE : min = 0 max = 2
# ambil hanya kolom numerik
numeric_data <- data[, sapply(data, is.numeric)]
# cek dulu (biar aman)
str(numeric_data)
## 'data.frame': 2087 obs. of 8 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height: num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight: num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 2.24 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
# hitung korelasi (AMAN dari NA)
cor_matrix <- cor(numeric_data, use = "complete.obs")
# tampilkan
cor_matrix
## Age Height Weight FCVC NCP CH2O
## Age 1.00000000 -0.03174825 0.19816049 0.01357180 -0.09210744 -0.04405777
## Height -0.03174825 1.00000000 0.45746802 -0.04036257 0.19069750 0.22048705
## Weight 0.19816049 0.45746802 1.00000000 0.21657440 0.03755726 0.20382302
## FCVC 0.01357180 -0.04036257 0.21657440 1.00000000 0.03465198 0.08133170
## NCP -0.09210744 0.19069750 0.03755726 0.03465198 1.00000000 0.07353687
## CH2O -0.04405777 0.22048705 0.20382302 0.08133170 0.07353687 1.00000000
## FAF -0.14820151 0.29358397 -0.05649007 0.02200298 0.12694700 0.16530995
## TUE -0.30292652 0.04180763 -0.07935127 -0.10412758 0.02258690 0.02070441
## FAF TUE
## Age -0.14820151 -0.30292652
## Height 0.29358397 0.04180763
## Weight -0.05649007 -0.07935127
## FCVC 0.02200298 -0.10412758
## NCP 0.12694700 0.02258690
## CH2O 0.16530995 0.02070441
## FAF 1.00000000 0.05871604
## TUE 0.05871604 1.00000000
if (!require(corrplot)) install.packages("corrplot")
## Loading required package: corrplot
## corrplot 0.95 loaded
library(corrplot)
corrplot(cor_matrix,
method = "color",
type = "upper",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black")
Variabel NObeyesdad diubah menjadi ordered factor dengan urutan tingkat obesitas dari rendah hingga tinggi. Hal ini bertujuan agar model dapat memahami hubungan berurutan antar kategori dalam analisis Ordinal Logistic Regression.
data$NObeyesdad <- factor(data$NObeyesdad,
levels = c("Insufficient_Weight",
"Normal_Weight",
"Overweight_Level_I",
"Overweight_Level_II",
"Obesity_Type_I",
"Obesity_Type_II",
"Obesity_Type_III"),
ordered = TRUE)
Variabel kategorikal seperti Gender, FAVC, CAEC, SMOKE, SCC, CALC, MTRANS, dan family_history_with_overweight dikonversi menjadi factor agar dapat diproses dengan benar oleh model statistik
factor_cols <- c(
"Gender",
"family_history_with_overweight",
"FAVC",
"SMOKE",
"SCC",
"MTRANS"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)
data$CAEC <- factor(
data$CAEC,
levels = c("no", "Sometimes", "Frequently", "Always"),
ordered = TRUE
)
data$CALC <- factor(
data$CALC,
levels = c("no", "Sometimes", "Frequently", "Always"),
ordered = TRUE
)
Distribusi data pada setiap kategori NObeyesdad relatif seimbang, sehingga tidak terdapat dominasi kelas tertentu dan model dapat belajar dengan baik dari seluruh kategori
summary(data$NObeyesdad)
## Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
## 267 282 276 290
## Obesity_Type_I Obesity_Type_II Obesity_Type_III
## 351 297 324
Hasil pengecekan menunjukkan seluruh variabel telah memiliki tipe data yang sesuai, sehingga dataset siap digunakan untuk tahap pemodelan.
str(data)
## 'data.frame': 2087 obs. of 17 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ CALC : Ord.factor w/ 4 levels "no"<"Sometimes"<..: 1 2 3 3 2 2 2 2 3 1 ...
## $ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 2.24 ...
## $ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CAEC : Ord.factor w/ 4 levels "no"<"Sometimes"<..: 2 2 2 2 2 2 2 2 2 2 ...
## $ MTRANS : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
## $ NObeyesdad : Ord.factor w/ 7 levels "Insufficient_Weight"<..: 2 2 2 3 4 2 2 2 2 2 ...
Data dibagi menjadi data latih (training) dan data uji (testing) untuk memastikan model dapat dievaluasi pada data yang tidak digunakan saat pelatihan. Pembagian ini dilakukan secara acak dengan proporsi tertentu (70% data latih dan 30% data uji).
set.seed(123)
train_index <- sample(1:nrow(data), 0.7*nrow(data))
train <- data[train_index, ]
test <- data[-train_index, ]
num_vars <- c(
"Age",
"Height",
"Weight",
"FCVC",
"NCP",
"CH2O",
"FAF",
"TUE"
)
preproc <- preProcess(
train[, num_vars],
method = c("center", "scale")
)
train[, num_vars] <- predict(
preproc,
train[, num_vars]
)
test[, num_vars] <- predict(
preproc,
test[, num_vars]
)
Model dibangun menggunakan metode Ordinal Logistic Regression dengan pendekatan cumulative logit. Variabel dependen berupa tingkat obesitas yang bersifat ordinal, sedangkan variabel independen meliputi Weight, FAF, CH2O, dan Age. Estimasi parameter dilakukan menggunakan metode Maximum Likelihood.
sapply(train, class)
## $Age
## [1] "numeric"
##
## $Gender
## [1] "factor"
##
## $Height
## [1] "numeric"
##
## $Weight
## [1] "numeric"
##
## $CALC
## [1] "ordered" "factor"
##
## $FAVC
## [1] "factor"
##
## $FCVC
## [1] "numeric"
##
## $NCP
## [1] "numeric"
##
## $SCC
## [1] "factor"
##
## $SMOKE
## [1] "factor"
##
## $CH2O
## [1] "numeric"
##
## $family_history_with_overweight
## [1] "factor"
##
## $FAF
## [1] "numeric"
##
## $TUE
## [1] "numeric"
##
## $CAEC
## [1] "ordered" "factor"
##
## $MTRANS
## [1] "factor"
##
## $NObeyesdad
## [1] "ordered" "factor"
model <- polr(
NObeyesdad ~ Gender + Age + Weight +
family_history_with_overweight +
FAVC + FCVC + CH2O + FAF +
CAEC + MTRANS + CALC,
data = train,
Hess = TRUE
)
summary(model)
## Call:
## polr(formula = NObeyesdad ~ Gender + Age + Weight + family_history_with_overweight +
## FAVC + FCVC + CH2O + FAF + CAEC + MTRANS + CALC, data = train,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## GenderMale -3.54727 0.17398 -20.3887
## Age 0.54538 0.08077 6.7519
## Weight 7.43273 0.25302 29.3761
## family_history_with_overweightyes 0.37135 0.18965 1.9581
## FAVCyes -0.52647 0.19079 -2.7595
## FCVC 0.13141 0.06503 2.0209
## CH2O -0.25041 0.06402 -3.9113
## FAF -0.47514 0.06631 -7.1660
## CAEC.L -1.84429 0.37337 -4.9395
## CAEC.Q 0.86628 0.29268 2.9598
## CAEC.C 0.83071 0.18945 4.3849
## MTRANSBike 0.77989 1.10141 0.7081
## MTRANSMotorbike 2.44126 0.72273 3.3778
## MTRANSPublic_Transportation 1.18742 0.18780 6.3229
## MTRANSWalking 0.06679 0.39714 0.1682
## CALC.L 1.77506 1.49367 1.1884
## CALC.Q 2.05451 1.12315 1.8292
## CALC.C 0.77425 0.54101 1.4311
##
## Intercepts:
## Value Std. Error t value
## Insufficient_Weight|Normal_Weight -11.0359 0.7525 -14.6652
## Normal_Weight|Overweight_Level_I -7.0398 0.6990 -10.0713
## Overweight_Level_I|Overweight_Level_II -3.9538 0.6797 -5.8173
## Overweight_Level_II|Obesity_Type_I -1.1998 0.6733 -1.7819
## Obesity_Type_I|Obesity_Type_II 3.1836 0.6810 4.6750
## Obesity_Type_II|Obesity_Type_III 7.3190 0.7020 10.4252
##
## Residual Deviance: 2132.359
## AIC: 2180.359
# Menguji apakah asumsi proportional odds terpenuhi.
# Jika p-value > 0.05, asumsi terpenuhi dan model valid.
brant(model)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## --------------------------------------------------------------------
## Test for X2 df probability
## --------------------------------------------------------------------
## Omnibus 923.7 90 0
## GenderMale 46.02 5 0
## Age 45.34 5 0
## Weight 2.64 5 0.76
## family_history_with_overweightyes 11.74 5 0.04
## FAVCyes 35.91 5 0
## FCVC 4.33 5 0.5
## CH2O 6.18 5 0.29
## FAF 4.1 5 0.53
## CAEC.L 10.85 5 0.05
## CAEC.Q 13.25 5 0.02
## CAEC.C 1.97 5 0.85
## MTRANSBike 68.6 5 0
## MTRANSMotorbike -0.56 5 1
## MTRANSPublic_Transportation 17.48 5 0
## MTRANSWalking 10.91 5 0.05
## CALC.L 0 5 1
## CALC.Q 0 5 1
## CALC.C 0 5 1
## --------------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
## Warning in brant(model): 4236 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.
Uji signifikansi dilakukan untuk mengetahui apakah variabel independen memiliki pengaruh yang signifikan terhadap variabel dependen. Pengujian dilakukan menggunakan nilai p-value, dengan kriteria bahwa variabel dikatakan signifikan jika p-value < 0.05.
ctable <- coef(summary(model))
p_value <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
hasil <- cbind(ctable, "p value" = p_value)
hasil
## Value Std. Error t value
## GenderMale -3.54727486 0.17398224 -20.3887184
## Age 0.54538232 0.08077462 6.7519022
## Weight 7.43273219 0.25302011 29.3760537
## family_history_with_overweightyes 0.37135099 0.18965034 1.9580824
## FAVCyes -0.52647178 0.19078723 -2.7594707
## FCVC 0.13140760 0.06502528 2.0208694
## CH2O -0.25040827 0.06402202 -3.9112837
## FAF -0.47514187 0.06630511 -7.1659917
## CAEC.L -1.84428701 0.37337363 -4.9395214
## CAEC.Q 0.86628106 0.29268148 2.9598082
## CAEC.C 0.83070823 0.18944855 4.3848750
## MTRANSBike 0.77988958 1.10141394 0.7080804
## MTRANSMotorbike 2.44125950 0.72273123 3.3778248
## MTRANSPublic_Transportation 1.18741560 0.18779504 6.3229336
## MTRANSWalking 0.06678712 0.39714401 0.1681685
## CALC.L 1.77506094 1.49367060 1.1883885
## CALC.Q 2.05450673 1.12314992 1.8292364
## CALC.C 0.77424656 0.54100951 1.4311145
## Insufficient_Weight|Normal_Weight -11.03593202 0.75252688 -14.6651665
## Normal_Weight|Overweight_Level_I -7.03977526 0.69899266 -10.0713149
## Overweight_Level_I|Overweight_Level_II -3.95377444 0.67966258 -5.8172608
## Overweight_Level_II|Obesity_Type_I -1.19976857 0.67330348 -1.7819135
## Obesity_Type_I|Obesity_Type_II 3.18361089 0.68099252 4.6749572
## Obesity_Type_II|Obesity_Type_III 7.31896116 0.70204640 10.4251815
## p value
## GenderMale 2.105907e-92
## Age 1.459192e-11
## Weight 1.110923e-189
## family_history_with_overweightyes 5.022035e-02
## FAVCyes 5.789508e-03
## FCVC 4.329329e-02
## CH2O 9.180684e-05
## FAF 7.722543e-13
## CAEC.L 7.831456e-07
## CAEC.Q 3.078306e-03
## CAEC.C 1.160524e-05
## MTRANSBike 4.788954e-01
## MTRANSMotorbike 7.306160e-04
## MTRANSPublic_Transportation 2.566436e-10
## MTRANSWalking 8.664507e-01
## CALC.L 2.346804e-01
## CALC.Q 6.736420e-02
## CALC.C 1.523974e-01
## Insufficient_Weight|Normal_Weight 1.077525e-48
## Normal_Weight|Overweight_Level_I 7.398227e-24
## Overweight_Level_I|Overweight_Level_II 5.981978e-09
## Overweight_Level_II|Obesity_Type_I 7.476334e-02
## Obesity_Type_I|Obesity_Type_II 2.940149e-06
## Obesity_Type_II|Obesity_Type_III 1.902957e-25
Odds ratio dihitung untuk menginterpretasikan besarnya pengaruh variabel independen terhadap peluang perubahan kategori pada variabel dependen. Nilai ini diperoleh dengan melakukan eksponensial terhadap koefisien model.
odds_ratio <- exp(coef(model))
odds_ratio
## GenderMale Age
## 2.880303e-02 1.725268e+00
## Weight family_history_with_overweightyes
## 1.690420e+03 1.449692e+00
## FAVCyes FCVC
## 5.906854e-01 1.140433e+00
## CH2O FAF
## 7.784829e-01 6.217968e-01
## CAEC.L CAEC.Q
## 1.581380e-01 2.378051e+00
## CAEC.C MTRANSBike
## 2.294944e+00 2.181231e+00
## MTRANSMotorbike MTRANSPublic_Transportation
## 1.148750e+01 3.278597e+00
## MTRANSWalking CALC.L
## 1.069068e+00 5.900641e+00
## CALC.Q CALC.C
## 7.802988e+00 2.168957e+00
Variabel yang paling berpengaruh ditentukan berdasarkan besar nilai koefisien atau odds ratio. Variabel dengan nilai terbesar dianggap memiliki pengaruh paling dominan terhadap tingkat obesitas.
sort(abs(coef(model)), decreasing = TRUE)
## Weight GenderMale
## 7.43273219 3.54727486
## MTRANSMotorbike CALC.Q
## 2.44125950 2.05450673
## CAEC.L CALC.L
## 1.84428701 1.77506094
## MTRANSPublic_Transportation CAEC.Q
## 1.18741560 0.86628106
## CAEC.C MTRANSBike
## 0.83070823 0.77988958
## CALC.C Age
## 0.77424656 0.54538232
## FAVCyes FAF
## 0.52647178 0.47514187
## family_history_with_overweightyes CH2O
## 0.37135099 0.25040827
## FCVC MTRANSWalking
## 0.13140760 0.06678712
# Konversi keduanya ke character dulu sebelum dibandingkan
pred_class <- predict(model, test)
# Akurasi - fix error
mean(as.character(pred_class) == as.character(test$NObeyesdad))
## [1] 0.6443381
actual_num <- as.numeric(test$NObeyesdad)
pred_num <- as.numeric(factor(as.character(pred_class),
levels = levels(test$NObeyesdad)))
mean(abs(actual_num - pred_num))
## [1] 0.3572568
pred_prob <- predict(model, test, type = "probs")
head(pred_prob)
## Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
## 3 6.282075e-03 2.495784e-01 0.626855457 0.108895069
## 7 1.564267e-02 4.479643e-01 0.486189891 0.046848907
## 15 2.370660e-01 7.070686e-01 0.053169483 0.002523809
## 21 6.925176e-03 2.680578e-01 0.617515015 0.099890984
## 22 5.619729e-07 3.000209e-05 0.000638037 0.009729822
## 25 4.034994e-02 6.554079e-01 0.284656489 0.018315281
## Obesity_Type_I Obesity_Type_II Obesity_Type_III
## 3 0.0082833973 1.039060e-04 1.689392e-06
## 7 0.0033122079 4.133813e-05 6.720674e-07
## 15 0.0001699398 2.114190e-06 3.437074e-08
## 21 0.0075152696 9.419683e-05 1.531517e-06
## 22 0.4466405683 5.243110e-01 1.865003e-02
## 25 0.0012544811 1.562393e-05 2.540043e-07
confusionMatrix(
factor(as.character(pred_class), levels = levels(test$NObeyesdad)),
factor(as.character(test$NObeyesdad), levels = levels(test$NObeyesdad))
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 59 14 0
## Normal_Weight 12 48 9
## Overweight_Level_I 0 23 35
## Overweight_Level_II 0 0 35
## Obesity_Type_I 0 0 1
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 24 0 0
## Overweight_Level_II 37 17 0
## Obesity_Type_I 26 76 14
## Obesity_Type_II 0 10 75
## Obesity_Type_III 0 0 12
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 26
## Obesity_Type_III 74
##
## Overall Statistics
##
## Accuracy : 0.6443
## 95% CI : (0.6055, 0.6819)
## No Information Rate : 0.1643
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5838
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.8310 0.56471
## Specificity 0.9748 0.96125
## Pos Pred Value 0.8082 0.69565
## Neg Pred Value 0.9783 0.93369
## Prevalence 0.1132 0.13557
## Detection Rate 0.0941 0.07656
## Detection Prevalence 0.1164 0.11005
## Balanced Accuracy 0.9029 0.76298
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.43750 0.42529
## Specificity 0.91408 0.90370
## Pos Pred Value 0.42683 0.41573
## Neg Pred Value 0.91743 0.90706
## Prevalence 0.12759 0.13876
## Detection Rate 0.05582 0.05901
## Detection Prevalence 0.13078 0.14195
## Balanced Accuracy 0.67579 0.66450
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.7379 0.7426
## Specificity 0.9218 0.9316
## Pos Pred Value 0.6496 0.6757
## Neg Pred Value 0.9471 0.9496
## Prevalence 0.1643 0.1611
## Detection Rate 0.1212 0.1196
## Detection Prevalence 0.1866 0.1770
## Balanced Accuracy 0.8298 0.8371
## Class: Obesity_Type_III
## Sensitivity 0.7400
## Specificity 0.9772
## Pos Pred Value 0.8605
## Neg Pred Value 0.9519
## Prevalence 0.1595
## Detection Rate 0.1180
## Detection Prevalence 0.1372
## Balanced Accuracy 0.8586
comparison <- data.frame(
Actual = as.character(test$NObeyesdad),
Predicted = as.character(pred_class)
)
# Definisi urutan kategori
kategori_levels <- c("Insufficient_Weight", "Normal_Weight",
"Overweight_Level_I", "Overweight_Level_II",
"Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III")
# Label singkat biar tidak terlalu panjang di axis
kategori_labels <- c("Insuf.\nWeight", "Normal\nWeight",
"Overweight\nI", "Overweight\nII",
"Obesity\nI", "Obesity\nII", "Obesity\nIII")
# Warna per kategori
warna_kategori <- c(
"Insufficient_Weight" = "#4E79A7",
"Normal_Weight" = "#59A14F",
"Overweight_Level_I" = "#F28E2B",
"Overweight_Level_II" = "#E15759",
"Obesity_Type_I" = "#B07AA1",
"Obesity_Type_II" = "#FF9DA7",
"Obesity_Type_III" = "#9C755F"
)
# Konversi ke factor dengan urutan yang benar
comparison$Actual <- factor(comparison$Actual, levels = kategori_levels)
comparison$Predicted <- factor(comparison$Predicted, levels = kategori_levels)
ggplot(comparison, aes(x = Actual, fill = Predicted)) +
geom_bar(position = position_dodge(width = 0.85),
width = 0.75, color = "white", linewidth = 0.3) +
scale_fill_manual(values = warna_kategori,
labels = kategori_labels) +
scale_x_discrete(labels = kategori_labels) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
theme_minimal(base_size = 12) +
theme(
axis.text.x = element_text(size = 9, color = "gray30"),
axis.text.y = element_text(size = 9, color = "gray30"),
axis.title = element_text(size = 11, color = "gray20"),
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5, color = "gray50"),
legend.position = "right",
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 8),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "Perbandingan Kategori Aktual vs Prediksi",
subtitle = "Ordinal Logistic Regression — Data Test",
x = "Kategori Aktual",
y = "Jumlah Observasi",
fill = "Prediksi"
)
or_df <- data.frame(
Variable = names(exp(coef(model))),
OR = exp(coef(model))
)
or_df$Arah <- ifelse(or_df$OR > 1, "Meningkatkan Risiko", "Menurunkan Risiko")
ggplot(or_df, aes(x = reorder(Variable, OR), y = OR, fill = Arah)) +
geom_col(width = 0.7) +
geom_hline(yintercept = 1, linetype = "dashed",
color = "gray40", linewidth = 0.8) +
geom_text(aes(label = round(OR, 3)),
hjust = -0.2, size = 3.2) +
scale_fill_manual(values = c(
"Meningkatkan Risiko" = "#E15759",
"Menurunkan Risiko" = "#4E79A7"
)) +
coord_flip() +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
legend.position = "bottom",
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "Odds Ratio Variabel Prediktor",
x = "",
y = "Odds Ratio",
fill = "Arah Pengaruh"
)
heatmap_data <- comparison %>%
count(Actual, Predicted) %>%
mutate(
Actual = factor(Actual, levels = kategori_levels),
Predicted = factor(Predicted, levels = kategori_levels)
)
ggplot(heatmap_data, aes(x = Actual, y = Predicted, fill = n)) +
geom_tile(color = "white", linewidth = 0.8) +
geom_text(aes(label = n, color = n > 50),
size = 3.5, fontface = "bold") +
scale_fill_gradient(low = "#EBF5FB", high = "#1A5276") +
scale_color_manual(values = c("TRUE" = "white", "FALSE" = "gray20"),
guide = "none") +
scale_x_discrete(labels = kategori_labels) +
scale_y_discrete(labels = kategori_labels) +
theme_minimal(base_size = 11) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
axis.text.y = element_text(size = 9),
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5, color = "gray50"),
panel.grid = element_blank(),
plot.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "Heatmap Confusion Matrix",
subtitle = "Diagonal = prediksi benar | Warna lebih gelap = jumlah lebih banyak",
x = "Aktual",
y = "Prediksi",
fill = "Jumlah"
)