1 개요

이 문서는 R의 mlbench 패키지에 포함된 PimaIndiansDiabetes 데이터셋을 사용하여 네 가지 주요 분류 기법인 Logistic Regression, Decision Tree (CART, Classification and Regression Trees), Bagging, 그리고 Random Forest를 비교 분석합니다.

1.1 분석 목표

종속변수: 당뇨병 여부 (diabetes: neg/pos)
독립변수:
- glucose: 경구 포도당 내성 검사에서 2시간 후 혈장 포도당 농도
- mass: 체질량 지수(BMI, kg/m²)
- pregnant: 임신 횟수
- pressure: 이완기 혈압 (mm Hg)
- triceps: 삼두근 피부 두께 (mm)
- insulin: 2시간 후 혈청 인슐린 (mu U/ml)
- pedigree: 당뇨병 가족력 함수
- age: 나이 (년)
모델 비교:
- Logistic Regression: 선형 모델 기반 이진 분류
- CART: 의사결정나무 기반 분류
- Bagging: 부트스트랩 앙상블
- Random Forest: 전체 변수를 사용한 앙상블

2 환경 설정

2.1 패키지 로드

# 데이터 및 전처리
library(mlbench)     # PimaIndiansDiabetes 데이터셋
library(dplyr)       # 데이터 전처리

# 모델링
library(rpart)       # CART 모델
library(rpart.plot)  # 트리 시각화
library(partykit)    # 향상된 트리 시각화
library(ipred)       # Bagging
library(randomForest) # Random Forest

# 시각화
library(ggplot2)     # 데이터 시각화
library(gridExtra)   # 다중 플롯
library(showtext)    # 한글 폰트 지원
library(sysfonts)    # 시스템 폰트

# 모델 평가
library(caret)       # 모델 평가
library(pROC)        # ROC 분석

2.2 한글 폰트 설정

# Google Fonts에서 'Nanum Gothic' 다운로드 및 등록
font_add_google("Nanum Gothic", "nanum")
showtext_auto()  # 모든 플롯에 폰트 자동 적용

3 데이터 준비

3.1 데이터 불러오기

# PimaIndiansDiabetes 데이터셋 로드
data(PimaIndiansDiabetes)

# 데이터 구조 확인
str(PimaIndiansDiabetes)

## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...

# 상위 6개 행 미리보기
head(PimaIndiansDiabetes)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg

# 데이터 요약
summary(PimaIndiansDiabetes)

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age        diabetes 
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

# boxplot
boxplot(PimaIndiansDiabetes %>% select(-diabetes))

3.2 데이터 탐색적 시각화

# 1. 당뇨병 여부 분포
p1 <- ggplot(PimaIndiansDiabetes, aes(x = diabetes, fill = diabetes)) +
  geom_bar(alpha = 0.7) +
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5) +
  scale_fill_manual(values = c("neg" = "lightblue", "pos" = "lightcoral")) +
  labs(title = "당뇨병 여부 분포", x = "당뇨병", y = "환자 수") +
  theme_minimal(base_family = "nanum") +
  theme(legend.position = "none")

# 2. 주요 변수의 분포 (glucose)
p2 <- ggplot(PimaIndiansDiabetes, aes(x = glucose, fill = diabetes)) +
  geom_histogram(bins = 30, alpha = 0.6, position = "identity") +
  scale_fill_manual(values = c("neg" = "blue", "pos" = "red")) +
  labs(title = "포도당 농도 분포", x = "Glucose", y = "빈도") +
  theme_minimal(base_family = "nanum")

# 2. 주요 변수의 분포 (mass)
p3 <- ggplot(PimaIndiansDiabetes, aes(x = mass, fill = diabetes)) +
  geom_histogram(bins = 30, alpha = 0.6, position = "identity") +
  scale_fill_manual(values = c("neg" = "blue", "pos" = "red")) +
  labs(title = "BMI 분포", x = "BMI (mass)", y = "빈도") +
  theme_minimal(base_family = "nanum")

# 3. glucose와 mass의 산점도
p4 <- ggplot(PimaIndiansDiabetes, aes(x = glucose, y = mass, color = diabetes)) +
  geom_point(alpha = 0.6, size = 2) +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red"),
                     labels = c("Negative", "Positive")) +
  labs(title = "Glucose vs BMI (당뇨병 여부)", 
       x = "포도당 농도 (glucose)", 
       y = "BMI (mass)",
       color = "당뇨병") +
  theme_minimal(base_family = "nanum") +
  theme(legend.position = "top")

# 전체 배치
grid.arrange(p1, p2, p3, p4, 
             ncol = 2, nrow = 2,
             top = "PimaIndiansDiabetes 데이터 탐색")

# 4. 수치형 변수 간 상관계수 히트맵
library(reshape2)

# 수치형 변수만 선택
numeric_data <- PimaIndiansDiabetes %>%
  select(pregnant, glucose, pressure, triceps, insulin, mass, pedigree, age)

# 상관계수 계산
cor_matrix <- cor(numeric_data, use = "complete.obs")
print(cor_matrix)

##             pregnant    glucose   pressure     triceps     insulin       mass
## pregnant  1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309
## glucose   0.12945867 1.00000000 0.15258959  0.05732789  0.33135711 0.22107107
## pressure  0.14128198 0.15258959 1.00000000  0.20737054  0.08893338 0.28180529
## triceps  -0.08167177 0.05732789 0.20737054  1.00000000  0.43678257 0.39257320
## insulin  -0.07353461 0.33135711 0.08893338  0.43678257  1.00000000 0.19785906
## mass      0.01768309 0.22107107 0.28180529  0.39257320  0.19785906 1.00000000
## pedigree -0.03352267 0.13733730 0.04126495  0.18392757  0.18507093 0.14064695
## age       0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187
##             pedigree         age
## pregnant -0.03352267  0.54434123
## glucose   0.13733730  0.26351432
## pressure  0.04126495  0.23952795
## triceps   0.18392757 -0.11397026
## insulin   0.18507093 -0.04216295
## mass      0.14064695  0.03624187
## pedigree  1.00000000  0.03356131
## age       0.03356131  1.00000000

cor_melted <- melt(cor_matrix)

# 히트맵
ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), size = 3) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = 0, limit = c(-1, 1)) +
  labs(title = "변수 간 상관계수 히트맵",
       x = "", y = "", fill = "상관계수") +
  theme_minimal(base_family = "nanum") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(face = "bold", size = 14))

3.3 데이터 전처리

# diabetes 변수를 더 명확한 라벨로 변환
PimaIndiansDiabetes <- PimaIndiansDiabetes %>%
  mutate(
    diabetes_label = factor(diabetes, levels = c("neg", "pos"), 
                           labels = c("Negative", "Positive"))
  )

4 Logistic Regression 분석

4.1 Logistic Regression 개요

Logistic Regression은 이진(또는 다범주) 종속변수의 확률을 추정하는 통계적 분류 기법으로, 다음과 같은 특징을 가집니다:

선형 결정 경계: 독립변수의 선형 조합을 시그모이드(logit) 함수로 변환
계수 해석 가능: 각 변수의 로그오즈(log-odds) 변화량으로 효과 해석 가능

4.2 모델 적합

# Logistic Regression 모델 적합
logit_model <- glm(diabetes ~ pregnant + glucose + pressure + triceps + insulin + mass + pedigree + age, 
                   data = PimaIndiansDiabetes, 
                   family = binomial)

4.3 모델 결과

4.3.1 회귀계수 요약

# 모델 요약 통계
summary(logit_model)

## 
## Call:
## glm(formula = diabetes ~ pregnant + glucose + pressure + triceps + 
##     insulin + mass + pedigree + age, family = binomial, data = PimaIndiansDiabetes)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5

4.3.2 Odds Ratio

# odds ratio 계산: exp(β)
print(coef(logit_model))

##   (Intercept)      pregnant       glucose      pressure       triceps 
## -8.4046963669  0.1231822984  0.0351637146 -0.0132955469  0.0006189644 
##       insulin          mass      pedigree           age 
## -0.0011916990  0.0897009700  0.9451797406  0.0148690047

odds_ratios <- exp(coef(logit_model))
print(odds_ratios)

##  (Intercept)     pregnant      glucose     pressure      triceps      insulin 
## 0.0002238137 1.1310905981 1.0357892688 0.9867924485 1.0006191560 0.9988090108 
##         mass     pedigree          age 
## 1.0938471417 2.5732758592 1.0149800983

# 회귀계수와 odds ratio 요약
# 회귀계수와 odds ratio 요약
변수명 <- names(coef(logit_model))

coef_summary <- data.frame(
  변수 = 변수명,
  회귀계수 = round(coef(logit_model), 4),
  Odds_Ratio = round(odds_ratios, 4),
  해석 = c(
    "기준선 (Intercept)",
    sapply(2:length(변수명), function(i) {
      sprintf("%s이 1 증가할 때 사건이 일어날 odds가 %.4f배 %s.",
              변수명[i], 
              odds_ratios[i],
              ifelse(odds_ratios[i] > 1, "커진다", "작아진다"))
    })
  ),
  row.names = NULL
)

knitr::kable(coef_summary, 
             caption = "Logistic Regression 결과 요약",
             align = c("l", "r", "r", "l"))

Logistic Regression 결과 요약
변수	회귀계수	Odds_Ratio	해석
(Intercept)	-8.4047	0.0002	기준선 (Intercept)
pregnant	0.1232	1.1311	pregnant이 1 증가할 때 사건이 일어날 odds가 1.1311배 커진다.
glucose	0.0352	1.0358	glucose이 1 증가할 때 사건이 일어날 odds가 1.0358배 커진다.
pressure	-0.0133	0.9868	pressure이 1 증가할 때 사건이 일어날 odds가 0.9868배 작아진다.
triceps	0.0006	1.0006	triceps이 1 증가할 때 사건이 일어날 odds가 1.0006배 커진다.
insulin	-0.0012	0.9988	insulin이 1 증가할 때 사건이 일어날 odds가 0.9988배 작아진다.
mass	0.0897	1.0938	mass이 1 증가할 때 사건이 일어날 odds가 1.0938배 커진다.
pedigree	0.9452	2.5733	pedigree이 1 증가할 때 사건이 일어날 odds가 2.5733배 커진다.
age	0.0149	1.0150	age이 1 증가할 때 사건이 일어날 odds가 1.0150배 커진다.

4.3.3 [Special] Odds Ratio 시각화

# Odds Ratio 신뢰구간 시각화 함수
plot_odds_ratio <- function(model, conf_level = 0.95) {
  # 필요한 패키지
  library(ggplot2)
  
  # 회귀계수 신뢰구간 계산
  coef_ci <- confint(model, level = conf_level)
  
  # Odds Ratio와 신뢰구간으로 변환
  or_data <- data.frame(
    변수 = names(coef(model))[-1],  # Intercept 제외
    OR = exp(coef(model)[-1]),
    Lower = exp(coef_ci[-1, 1]),
    Upper = exp(coef_ci[-1, 2])
  )
  
  # 플롯 생성
  ggplot(or_data, aes(x = 변수, y = OR)) +
    geom_point(size = 3, color = "blue") +
    geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +
    geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
    coord_flip() +
    scale_y_continuous(trans = "log10") +
    labs(
      title = "Odds Ratio 및 95% 신뢰구간",
      x = "변수",
      y = "Odds Ratio (log scale)"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold"),
      axis.text = element_text(size = 10)
    )
}

# 사용 예시
plot_odds_ratio(logit_model)

4.4 Logistic Regression 모델 성능 평가

4.4.1 예측 및 Confusion Matrix

# 예측 확률 및 클래스
logit_pred_prob_single <- predict(logit_model, type = "response")
head(logit_pred_prob_single)

##          1          2          3          4          5          6 
## 0.72172655 0.04864161 0.79670208 0.04162486 0.90218390 0.14663156

logit_pred_class_single <- factor(ifelse(logit_pred_prob_single > 0.5, "pos", "neg"),
                                  levels = c("neg", "pos"))
head(logit_pred_class_single)

##   1   2   3   4   5   6 
## pos neg pos neg pos neg 
## Levels: neg pos

# Confusion Matrix
conf_logit_single <- confusionMatrix(data = logit_pred_class_single, reference= PimaIndiansDiabetes$diabetes, positive = "pos")
print(conf_logit_single)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 445 112
##        pos  55 156
##                                           
##                Accuracy : 0.7826          
##                  95% CI : (0.7517, 0.8112)
##     No Information Rate : 0.651           
##     P-Value [Acc > NIR] : 1.373e-15       
##                                           
##                   Kappa : 0.4966          
##                                           
##  Mcnemar's Test P-Value : 1.468e-05       
##                                           
##             Sensitivity : 0.5821          
##             Specificity : 0.8900          
##          Pos Pred Value : 0.7393          
##          Neg Pred Value : 0.7989          
##              Prevalence : 0.3490          
##          Detection Rate : 0.2031          
##    Detection Prevalence : 0.2747          
##       Balanced Accuracy : 0.7360          
##                                           
##        'Positive' Class : pos             
##

# threshold 변경
logit_pred_class_single2 <- factor(ifelse(logit_pred_prob_single > 0.9, "pos", "neg"),
                                  levels = c("neg", "pos"))
head(logit_pred_class_single2)

##   1   2   3   4   5   6 
## neg neg neg neg pos neg 
## Levels: neg pos

# Confusion Matrix
conf_logit_single2 <- confusionMatrix(data = logit_pred_class_single2, reference=PimaIndiansDiabetes$diabetes, positive = "pos")
print(conf_logit_single2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 496 245
##        pos   4  23
##                                           
##                Accuracy : 0.6758          
##                  95% CI : (0.6414, 0.7088)
##     No Information Rate : 0.651           
##     P-Value [Acc > NIR] : 0.08009         
##                                           
##                   Kappa : 0.0983          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.08582         
##             Specificity : 0.99200         
##          Pos Pred Value : 0.85185         
##          Neg Pred Value : 0.66937         
##              Prevalence : 0.34896         
##          Detection Rate : 0.02995         
##    Detection Prevalence : 0.03516         
##       Balanced Accuracy : 0.53891         
##                                           
##        'Positive' Class : pos             
##

4.4.2 ROC 곡선과 AUC

# ROC 객체 생성
roc_logit_single <- roc(PimaIndiansDiabetes$diabetes, logit_pred_prob_single)

# AUC 값
auc_logit_single <- auc(roc_logit_single)

# ROC 곡선 플롯
plot.roc(roc_logit_single,
     legacy.axes = TRUE,
     main = "Logistic Regression ROC 곡선",
     xlab = "1 - 특이도 (False Positive Rate)",
     ylab = "민감도 (True Positive Rate)",
     print.auc = TRUE,
     xaxs = "i",  # x축 범위 정확히 지정
     yaxs = "i")  # y축 범위 정확히 지정

4.5 [Special] 시각화

4.5.1 예측용 데이터 생성

# mass는 최소~최대 범위에서 100개 점 생성
# 다른 변수들은 평균값으로 고정
newdata_logit <- data.frame(
  pregnant = mean(PimaIndiansDiabetes$pregnant),
  glucose = mean(PimaIndiansDiabetes$glucose),
  pressure = mean(PimaIndiansDiabetes$pressure),
  triceps = mean(PimaIndiansDiabetes$triceps),
  insulin = mean(PimaIndiansDiabetes$insulin),
  mass = seq(min(PimaIndiansDiabetes$mass), 
             max(PimaIndiansDiabetes$mass), 
             length.out = 100),
  pedigree = mean(PimaIndiansDiabetes$pedigree),
  age = mean(PimaIndiansDiabetes$age)
)

# 예측 확률 계산
newdata_logit$pred_prob <- predict(logit_model, newdata = newdata_logit, type = "response")

# 첫 몇 개 행 확인
head(newdata_logit[c("mass", "pred_prob")])

##        mass  pred_prob
## 1 0.0000000 0.02318330
## 2 0.6777778 0.02460076
## 3 1.3555556 0.02610257
## 4 2.0333333 0.02769345
## 5 2.7111111 0.02937838
## 6 3.3888889 0.03116253

4.5.2 Logistic Regression 예측선

# tbl <- table(PimaIndiansDiabetes$diabetes_label)
# tbl_df <- data.frame(
#   당뇨병 = names(tbl),
#   개수 = as.numeric(tbl),
#   비율 = paste0(round(prop.table(tbl) * 100, 1), "%")
# )
# knitr::kable(tbl_df, caption = "당뇨병 여부별 환자 수 및 비율")

# 수치형 변환 (시각화용)
PimaIndiansDiabetes$diabetes_numeric <- as.numeric(PimaIndiansDiabetes$diabetes_label == "Positive")

# 시각화
p_logit <- ggplot(PimaIndiansDiabetes, aes(x = mass, y = diabetes_numeric)) +
  # 원자료 점(관측치) 표시
  geom_point(aes(color = diabetes_label), size = 3, alpha = 0.6) +
  
  # Logistic Regression 예측선
  geom_line(data = newdata_logit, aes(x = mass, y = pred_prob), 
            color = "black", linewidth = 1.3) +
  
  # 축 설정
  scale_x_continuous(labels = scales::comma_format(accuracy = 0.1)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), 
                     limits = c(0, 1)) +
  
  # 라벨 설정
  labs(
    title = "Logistic Regression: BMI에 따른 당뇨병 확률",
    subtitle = "다른 변수들은 평균값으로 고정",
    x = "BMI (kg/m²)",
    y = "당뇨병 양성일 확률",
    color = "당뇨병"
  ) +
  
  theme_minimal(base_family = "nanum") +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11),
    legend.position = "top"
  )

print(p_logit)

4.5.3 glucose 수준별 예측선

# glucose를 하위 25%, 중위 50%, 상위 75% 분위수로 나누기
glucose_levels <- quantile(PimaIndiansDiabetes$glucose, probs = c(0.25, 0.5, 0.75))

# 각 glucose 수준에 대한 예측 데이터 생성 (다른 변수들은 평균값으로 고정)
grid_logit <- expand.grid(
  glucose = as.numeric(glucose_levels),
  mass = seq(min(PimaIndiansDiabetes$mass), 
             max(PimaIndiansDiabetes$mass), 
             length.out = 120),
  # 다른 변수들은 평균값으로 고정
  pregnant = mean(PimaIndiansDiabetes$pregnant),
  pressure = mean(PimaIndiansDiabetes$pressure),
  triceps = mean(PimaIndiansDiabetes$triceps),
  insulin = mean(PimaIndiansDiabetes$insulin),
  pedigree = mean(PimaIndiansDiabetes$pedigree),
  age = mean(PimaIndiansDiabetes$age)
)

# 예측 확률 계산
grid_logit$pred_prob <- predict(logit_model, newdata = grid_logit, type = "response")
grid_logit$glucose_label <- paste0("glucose=", round(grid_logit$glucose, 0))

# glucose 수준별 예측선 시각화
p_logit_multi <- ggplot(PimaIndiansDiabetes, aes(x = mass, y = diabetes_numeric)) +
  geom_point(aes(color = diabetes_label), size = 2.8, alpha = 0.5) +
  geom_line(data = grid_logit, aes(x = mass, y = pred_prob, linetype = glucose_label),
            color = "black", linewidth = 1.1) +
  scale_x_continuous(labels = scales::comma_format(accuracy = 0.1)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), 
                     limits = c(0, 1)) +
  labs(
    title = "Logistic Regression: glucose 수준별 확률 곡선 (전체 변수 모델)",
    subtitle = "다른 변수들은 평균값으로 고정, BMI 변화에 따른 당뇨병 확률",
    x = "BMI (kg/m²)",
    y = "당뇨병 양성일 확률",
    color = "당뇨병", 
    linetype = "glucose 수준"
  ) +
  theme_minimal(base_family = "nanum") +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "top"
  )

print(p_logit_multi)

4.5.4 2차원 결정 영역 시각화

# 예측 격자 생성
glucose_range <- seq(min(PimaIndiansDiabetes$glucose), 
                    max(PimaIndiansDiabetes$glucose), 
                    length.out = 200)
mass_range <- seq(min(PimaIndiansDiabetes$mass), 
                 max(PimaIndiansDiabetes$mass), 
                 length.out = 200)

# 전체 변수를 포함한 격자 생성 (다른 변수들은 평균값으로 고정)
grid_logit <- expand.grid(
  glucose = glucose_range, 
  mass = mass_range,
  # 나머지 변수들은 평균값으로 고정
  pregnant = mean(PimaIndiansDiabetes$pregnant),
  pressure = mean(PimaIndiansDiabetes$pressure),
  triceps = mean(PimaIndiansDiabetes$triceps),
  insulin = mean(PimaIndiansDiabetes$insulin),
  pedigree = mean(PimaIndiansDiabetes$pedigree),
  age = mean(PimaIndiansDiabetes$age)
)

# 예측 확률 계산
grid_logit$pred_prob <- predict(logit_model, newdata = grid_logit, type = "response")
grid_logit$pred <- factor(ifelse(grid_logit$pred_prob > 0.5, "pos", "neg"),
                          levels = c("neg", "pos"))

# 파티션 플롯
p_logit_partition <- ggplot(PimaIndiansDiabetes, aes(x = glucose, y = mass)) +
  # 배경 영역 (결정 경계)
  geom_tile(data = grid_logit, aes(fill = pred), alpha = 0.3) +
  # 원본 데이터 포인트
  geom_point(aes(color = diabetes, shape = diabetes), size = 2.5, alpha = 0.7) +
  # 결정 경계선 강조 (0.5 확률 등고선)
  geom_contour(data = grid_logit, 
               aes(z = pred_prob), 
               breaks = 0.5,
               color = "black", 
               linewidth = 1, 
               alpha = 0.8) +
  # 색상 설정
  scale_fill_manual(values = c("neg" = "lightblue", "pos" = "lightcoral"),
                    name = "예측 결과",
                    labels = c("neg" = "Negative", "pos" = "Positive")) +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red"),
                     name = "실제 결과",
                     labels = c("neg" = "Negative", "pos" = "Positive")) +
  scale_shape_manual(values = c("neg" = 16, "pos" = 17),
                     guide = "none") +
  # 라벨
  labs(
    title = "Logistic Regression의 결정 경계 (전체 변수 모델)",
    subtitle = "다른 변수들은 평균값으로 고정, glucose-mass 평면에서의 결정 경계",
    x = "포도당 농도 (glucose)",
    y = "BMI (mass)"
  ) +
  theme_minimal(base_family = "nanum") +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11),
    legend.position = "right"
  )

print(p_logit_partition)

5 CART (의사결정나무) 분석

5.1 CART 개요

CART는 데이터의 특징 공간을 반복적으로 분할하여 예측을 수행하는 비선형 트리 기반 알고리즘으로, 다음과 같은 특징을 가집니다:

비선형 모델링: 변수 간 복잡한 관계와 상호작용 자동 포착
해석 용이성: 트리 구조를 통해 예측 규칙을 시각적으로 이해 가능
가지치기(Pruning): 복잡도 매개변수(CP)를 통해 과적합 방지

5.2 모델 적합

# CART 모델 적합 (전체 변수 사용)
set.seed(123)  # 재현성을 위한 시드 설정
# 1️⃣ Gini 기준 모델
cart_gini <- rpart(
  diabetes ~ pregnant + glucose + pressure + triceps + insulin + mass + pedigree + age,
  data = PimaIndiansDiabetes,
  method = "class",
  parms = list(split = "gini")
)

# 2️⃣ Entropy 기준 모델 (information gain)
cart_entropy <- rpart(
  diabetes ~ pregnant + glucose + pressure + triceps + insulin + mass + pedigree + age,
  data = PimaIndiansDiabetes,
  method = "class",
  parms = list(split = "information")
)

5.3 모델 결과

5.3.1 트리 구조 요약

# 트리 구조 출력
print(cart_gini)

## n= 768 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 768 268 neg (0.65104167 0.34895833)  
##     2) glucose< 127.5 485  94 neg (0.80618557 0.19381443)  
##       4) age< 28.5 271  23 neg (0.91512915 0.08487085) *
##       5) age>=28.5 214  71 neg (0.66822430 0.33177570)  
##        10) mass< 26.35 41   2 neg (0.95121951 0.04878049) *
##        11) mass>=26.35 173  69 neg (0.60115607 0.39884393)  
##          22) glucose< 99.5 55  10 neg (0.81818182 0.18181818) *
##          23) glucose>=99.5 118  59 neg (0.50000000 0.50000000)  
##            46) pedigree< 0.561 84  34 neg (0.59523810 0.40476190)  
##              92) pedigree< 0.2 21   4 neg (0.80952381 0.19047619) *
##              93) pedigree>=0.2 63  30 neg (0.52380952 0.47619048)  
##               186) pregnant>=1.5 52  21 neg (0.59615385 0.40384615)  
##                 372) pressure>=67 40  12 neg (0.70000000 0.30000000) *
##                 373) pressure< 67 12   3 pos (0.25000000 0.75000000) *
##               187) pregnant< 1.5 11   2 pos (0.18181818 0.81818182) *
##            47) pedigree>=0.561 34   9 pos (0.26470588 0.73529412) *
##     3) glucose>=127.5 283 109 pos (0.38515901 0.61484099)  
##       6) mass< 29.95 76  24 neg (0.68421053 0.31578947)  
##        12) glucose< 145.5 41   6 neg (0.85365854 0.14634146) *
##        13) glucose>=145.5 35  17 pos (0.48571429 0.51428571)  
##          26) insulin< 14.5 21   8 neg (0.61904762 0.38095238) *
##          27) insulin>=14.5 14   4 pos (0.28571429 0.71428571) *
##       7) mass>=29.95 207  57 pos (0.27536232 0.72463768)  
##        14) glucose< 157.5 115  45 pos (0.39130435 0.60869565)  
##          28) age< 30.5 50  23 neg (0.54000000 0.46000000)  
##            56) pressure>=61 40  13 neg (0.67500000 0.32500000)  
##             112) mass< 41.8 31   7 neg (0.77419355 0.22580645) *
##             113) mass>=41.8 9   3 pos (0.33333333 0.66666667) *
##            57) pressure< 61 10   0 pos (0.00000000 1.00000000) *
##          29) age>=30.5 65  18 pos (0.27692308 0.72307692) *
##        15) glucose>=157.5 92  12 pos (0.13043478 0.86956522) *

print(cart_entropy)

## n= 768 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 768 268 neg (0.65104167 0.34895833)  
##      2) glucose< 127.5 485  94 neg (0.80618557 0.19381443)  
##        4) age< 28.5 271  23 neg (0.91512915 0.08487085) *
##        5) age>=28.5 214  71 neg (0.66822430 0.33177570)  
##         10) mass< 26.35 41   2 neg (0.95121951 0.04878049) *
##         11) mass>=26.35 173  69 neg (0.60115607 0.39884393)  
##           22) glucose< 99.5 55  10 neg (0.81818182 0.18181818) *
##           23) glucose>=99.5 118  59 neg (0.50000000 0.50000000)  
##             46) pedigree< 0.561 84  34 neg (0.59523810 0.40476190)  
##               92) age>=54.5 7   0 neg (1.00000000 0.00000000) *
##               93) age< 54.5 77  34 neg (0.55844156 0.44155844)  
##                186) pedigree< 0.2 20   4 neg (0.80000000 0.20000000) *
##                187) pedigree>=0.2 57  27 pos (0.47368421 0.52631579)  
##                  374) pregnant>=1.5 46  21 neg (0.54347826 0.45652174)  
##                    748) insulin>=11 16   3 neg (0.81250000 0.18750000) *
##                    749) insulin< 11 30  12 pos (0.40000000 0.60000000)  
##                     1498) mass>=34.05 10   3 neg (0.70000000 0.30000000) *
##                     1499) mass< 34.05 20   5 pos (0.25000000 0.75000000) *
##                  375) pregnant< 1.5 11   2 pos (0.18181818 0.81818182) *
##             47) pedigree>=0.561 34   9 pos (0.26470588 0.73529412) *
##      3) glucose>=127.5 283 109 pos (0.38515901 0.61484099)  
##        6) mass< 29.95 76  24 neg (0.68421053 0.31578947)  
##         12) glucose< 145.5 41   6 neg (0.85365854 0.14634146) *
##         13) glucose>=145.5 35  17 pos (0.48571429 0.51428571)  
##           26) insulin< 14.5 21   8 neg (0.61904762 0.38095238) *
##           27) insulin>=14.5 14   4 pos (0.28571429 0.71428571) *
##        7) mass>=29.95 207  57 pos (0.27536232 0.72463768)  
##         14) glucose< 157.5 115  45 pos (0.39130435 0.60869565)  
##           28) pressure>=61 100  44 pos (0.44000000 0.56000000)  
##             56) age< 30.5 40  13 neg (0.67500000 0.32500000) *
##             57) age>=30.5 60  17 pos (0.28333333 0.71666667) *
##           29) pressure< 61 15   1 pos (0.06666667 0.93333333) *
##         15) glucose>=157.5 92  12 pos (0.13043478 0.86956522) *

5.4 트리 시각화

# rpart.plot을 사용한 트리 시각화
par(mfrow = c(1, 2))

rpart.plot(cart_gini,
  type = 4, under = TRUE, fallen.leaves = TRUE,
  main = "CART (Gini Index)",
  box.palette = c("lightgreen", "lightcoral")
)

rpart.plot(cart_entropy,
  type = 4, under = TRUE, fallen.leaves = TRUE,
  main = "CART (Entropy)",
  box.palette = c("lightgreen", "lightcoral")
)

5.5 CART 모델 가지치기 (Pruning)

의사결정나무는 훈련 데이터에 과적합되기 쉽습니다. Pruning(가지치기)을 통해 불필요한 가지를 제거하고 모델을 단순화하면 일반화 성능을 향상시킬 수 있습니다.

5.5.1 Complexity Parameter (CP) 테이블

# CP 테이블 확인
cp_table <- cart_gini$cptable
cp_df <- as.data.frame(cp_table)
cp_df$CP <- round(cp_df$CP, 4)
cp_df$xerror <- round(cp_df$xerror, 4)
cp_df$xstd <- round(cp_df$xstd, 4)

knitr::kable(cp_df,
             caption = "CART 모델의 Complexity Parameter (CP) 테이블",
             align = "c",
             col.names = c("CP", "분할 수", "상대 오차", "CV 오차", "CV 표준편차"))

CART 모델의 Complexity Parameter (CP) 테이블
CP	분할 수	상대 오차	CV 오차	CV 표준편차
0.2425	0	1.0000000	1.0000	0.0493
0.1045	1	0.7574627	0.7985	0.0464
0.0174	2	0.6529851	0.7090	0.0446
0.0149	5	0.6007463	0.7164	0.0448
0.0131	9	0.5410448	0.7127	0.0447
0.0112	12	0.4925373	0.7239	0.0449
0.0100	15	0.4589552	0.7276	0.0450

CP 테이블 해석:

CP: 트리 복잡도를 조절하는 값 작을수록 트리가 깊고 복잡해짐
상대 오차 (rel error): 훈련 데이터 기준 오차 작을수록 학습 데이터에 더 잘 맞음
CV 오차: 교차검증 오차 — 작을수록 새로운 데이터에 대한 예측 성능이 좋음
CV 표준편차: 교차검증 오차의 표준편차 — 작을수록 오차 변동이 적고 모델이 안정적임

5.5.2 CP에 따른 오차 시각화

# CP에 따른 CV 오차 플롯
plotcp(cart_gini)

해석: 점선 아래의 가장 왼쪽 점(가장 단순한 모델)을 선택하는 것이 일반적입니다

5.5.3 최적의 CP 값 선택

# 최소 CV 오차를 가진 CP 선택
best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]

# 1-SE 규칙: 최소 오차 + 1 표준편차 이내의 가장 단순한 모델
min_xerror <- min(cp_table[, "xerror"])
min_xstd <- cp_table[which.min(cp_table[, "xerror"]), "xstd"]
threshold <- min_xerror + min_xstd

# 1-SE 규칙으로 CP 선택
best_cp_1se <- cp_table[cp_table[, "xerror"] <= threshold, "CP"][1]
print(best_cp_1se)

##          3 
## 0.01741294

5.5.4 📚 1-SE 규칙 (One Standard Error Rule) 이해하기

1-SE 규칙이란?

최소 CV 오차에서 1 표준편차를 더한 값을 기준선으로 설정
이 기준선 아래에 있는 모델 중 가장 단순한 모델 선택
통계적으로 최소 오차 모델과 차이가 없으면서 더 단순한 모델을 선호

5.5.5 Pruned Tree 생성

# 1-SE 규칙으로 가지치기
cart_pruned <- rpart::prune(cart_gini, cp = best_cp_1se)

# 원본 트리와 pruned 트리 비교
cat("\n📌 원본 트리 노드 수:", nrow(cart_gini$frame), "\n")

## 
## 📌 원본 트리 노드 수: 31

cat("✂️  Pruned 트리 노드 수:", nrow(cart_pruned$frame), "\n")

## ✂️  Pruned 트리 노드 수: 5

5.5.6 Pruned Tree 시각화

# 원본 트리와 pruned 트리 비교
par(mfrow = c(1, 2))

# 원본 트리
rpart.plot(cart_gini,
  type = 4, under = TRUE, fallen.leaves = TRUE,
  main = "원본 CART 트리 (Gini)",
  box.palette = c("lightgreen", "lightcoral"),
  cex = 0.8
)

# Pruned 트리
rpart.plot(cart_pruned,
  type = 4, under = TRUE, fallen.leaves = TRUE,
  main = "Pruned CART 트리 (1-SE 규칙)",
  box.palette = c("lightgreen", "lightcoral"),
  cex = 0.8
)

par(mfrow = c(1, 1))

5.5.7 Pruned 모델 성능 평가

# 모델 예측
cart_pred_class_single <- predict(cart_gini, PimaIndiansDiabetes, type = "class")
cart_pred_prob_single <- predict(cart_gini, PimaIndiansDiabetes, type = "prob")[, "pos"]
pruned_pred_class <- predict(cart_pruned, PimaIndiansDiabetes, type = "class")
pruned_pred_prob <- predict(cart_pruned, PimaIndiansDiabetes, type = "prob")[, "pos"]

# Confusion Matrix
conf_cart_single <- confusionMatrix(data = cart_pred_class_single, reference = PimaIndiansDiabetes$diabetes, positive = "pos")
conf_pruned <- confusionMatrix(data = pruned_pred_class, reference = PimaIndiansDiabetes$diabetes, positive = "pos")

# 성능 지표 비교
comparison_df <- data.frame(
  지표 = c("노드 수",
          "정확도 (Accuracy)",
          "민감도 (Sensitivity)",
          "특이도 (Specificity)",
          "정밀도 (Precision)",
          "F1 Score"),
  원본_CART = c(
    nrow(cart_gini$frame),
    round(conf_cart_single$overall["Accuracy"], 3),
    round(conf_cart_single$byClass["Sensitivity"], 3),
    round(conf_cart_single$byClass["Specificity"], 3),
    round(conf_cart_single$byClass["Pos Pred Value"], 3),
    round(conf_cart_single$byClass["F1"], 3)
  ),
  Pruned_CART = c(
    nrow(cart_pruned$frame),
    round(conf_pruned$overall["Accuracy"], 3),
    round(conf_pruned$byClass["Sensitivity"], 3),
    round(conf_pruned$byClass["Specificity"], 3),
    round(conf_pruned$byClass["Pos Pred Value"], 3),
    round(conf_pruned$byClass["F1"], 3)
  ),
  해석 = c(
    "트리의 복잡도 (노드 개수)",
    "전체 예측 중 올바른 예측의 비율",
    "실제 Positive 중 Positive로 예측한 비율",
    "실제 Negative 중 Negative로 예측한 비율",
    "Positive로 예측한 것 중 실제 Positive인 비율",
    "정밀도와 재현율의 조화평균"
  )
)

knitr::kable(comparison_df,
             caption = "원본 트리 vs Pruned 트리 성능 비교",
             align = c("l", "r", "r", "l"),
             col.names = c("지표", "원본 CART", "Pruned CART", "해석"))

원본 트리 vs Pruned 트리 성능 비교
	지표	원본 CART	Pruned CART	해석
	노드 수	31.000	5.000	트리의 복잡도 (노드 개수)
Accuracy	정확도 (Accuracy)	0.840	0.772	전체 예측 중 올바른 예측의 비율
Sensitivity	민감도 (Sensitivity)	0.731	0.560	실제 Positive 중 Positive로 예측한 비율
Specificity	특이도 (Specificity)	0.898	0.886	실제 Negative 중 Negative로 예측한 비율
Pos Pred Value	정밀도 (Precision)	0.794	0.725	Positive로 예측한 것 중 실제 Positive인 비율
F1	F1 Score	0.761	0.632	정밀도와 재현율의 조화평균

5.5.8 ROC 곡선 비교

# ROC 객체 생성
roc_original <- roc(PimaIndiansDiabetes$diabetes, cart_pred_prob_single)
roc_pruned <- roc(PimaIndiansDiabetes$diabetes, pruned_pred_prob)

# ROC 곡선 플롯
plot(roc_original, 
     col = "#E41A1C", lwd = 2,
     main = "원본 CART vs Pruned CART ROC 곡선 비교",
     xlab = "1 - 특이도 (False Positive Rate)",
     ylab = "민감도 (True Positive Rate)",
     legacy.axes = TRUE)

plot(roc_pruned, 
     col = "#377EB8", lwd = 2, add = TRUE)

# 대각선
abline(a = 0, b = 1, lty = 2, col = "gray")

# 범례
legend("bottomright",
       legend = c(
         paste0("원본 CART (AUC = ", round(auc(roc_original), 3), ")"),
         paste0("Pruned CART (AUC = ", round(auc(roc_pruned), 3), ")")
       ),
       col = c("#E41A1C", "#377EB8"),
       lwd = 2,
       cex = 0.9,
       bg = "white")

grid()

Pruning의 장점:

1. 모델 단순화: 불필요한 가지를 제거하여 해석이 쉬워집니다

2. 과적합 방지: 훈련 데이터에 지나치게 맞추는 것을 방지합니다

3. 일반화 성능: 새로운 데이터에 대한 예측 성능이 향상될 수 있습니다

4. 계산 효율성: 노드가 적어 예측 속도가 빨라집니다

5.6 [Special] 파티션 플롯

# 예측 격자 생성
glucose_range <- seq(min(PimaIndiansDiabetes$glucose), 
                    max(PimaIndiansDiabetes$glucose), 
                    length.out = 200)
mass_range <- seq(min(PimaIndiansDiabetes$mass), 
                 max(PimaIndiansDiabetes$mass), 
                 length.out = 200)

# 전체 변수를 포함한 격자 생성 (다른 변수들은 평균값으로 고정)
grid_boundary <- expand.grid(
  glucose = glucose_range, 
  mass = mass_range,
  pregnant = mean(PimaIndiansDiabetes$pregnant),
  pressure = mean(PimaIndiansDiabetes$pressure),
  triceps = mean(PimaIndiansDiabetes$triceps),
  insulin = mean(PimaIndiansDiabetes$insulin),
  pedigree = mean(PimaIndiansDiabetes$pedigree),
  age = mean(PimaIndiansDiabetes$age)
)

# 원본 CART 예측
grid_boundary$pred_original <- predict(cart_gini, grid_boundary, type = "class")

# Pruned CART 예측
grid_boundary$pred_pruned <- predict(cart_pruned, grid_boundary, type = "class")

# 1. 원본 CART 결정 경계
p_cart_partition <- ggplot(PimaIndiansDiabetes, aes(x = glucose, y = mass)) +
  # 배경 영역
  geom_tile(data = grid_boundary, aes(fill = pred_original), alpha = 0.3) +
  # 데이터 포인트
  geom_point(aes(color = diabetes, shape = diabetes), size = 2, alpha = 0.6) +
  # 결정 경계선
  geom_contour(data = grid_boundary, 
               aes(z = as.numeric(pred_original)), 
               breaks = 1.5,
               color = "black", 
               linewidth = 1.2) +
  # 색상 설정
  scale_fill_manual(values = c("neg" = "lightblue", "pos" = "lightcoral"),
                    name = "예측") +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red"),
                     name = "실제") +
  scale_shape_manual(values = c("neg" = 16, "pos" = 17), guide = "none") +
  # 라벨
  labs(
    title = "원본 CART 트리",
    subtitle = paste0("노드 수: ", nrow(cart_gini$frame), 
                     " | AUC: ", round(auc(roc_original), 3)),
    x = "포도당 농도 (glucose)",
    y = "BMI (mass)"
  ) +
  theme_minimal(base_family = "nanum") +
  theme(
    plot.title = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(size = 10),
    legend.position = "right"
  )

# 2. Pruned CART 결정 경계
p_pruned_partition <- ggplot(PimaIndiansDiabetes, aes(x = glucose, y = mass)) +
  # 배경 영역
  geom_tile(data = grid_boundary, aes(fill = pred_pruned), alpha = 0.3) +
  # 데이터 포인트
  geom_point(aes(color = diabetes, shape = diabetes), size = 2, alpha = 0.6) +
  # 결정 경계선
  geom_contour(data = grid_boundary, 
               aes(z = as.numeric(pred_pruned)), 
               breaks = 1.5,
               color = "black", 
               linewidth = 1.2) +
  # 색상 설정
  scale_fill_manual(values = c("neg" = "lightgreen", "pos" = "lightyellow"),
                    name = "예측") +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red"),
                     name = "실제") +
  scale_shape_manual(values = c("neg" = 16, "pos" = 17), guide = "none") +
  # 라벨
  labs(
    title = "Pruned CART 트리",
    subtitle = paste0("노드 수: ", nrow(cart_pruned$frame), 
                     " | AUC: ", round(auc(roc_pruned), 3)),
    x = "포도당 농도 (glucose)",
    y = "BMI (mass)"
  ) +
  theme_minimal(base_family = "nanum") +
  theme(
    plot.title = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(size = 10),
    legend.position = "right"
  )

# 나란히 배치
library(gridExtra)
grid.arrange(p_cart_partition, p_pruned_partition, ncol = 2,
             top = "CART 결정 경계 비교: 원본 vs Pruned (Glucose-Mass 평면)")

6 Bagging (Bootstrap Aggregating) 분석

6.1 Bagging 개요

Bagging은 Bootstrap Aggregating의 약자로, 다음과 같은 특징을 가집니다:

분산 감소: 여러 모델의 평균을 통해 예측 분산을 줄임
과적합 방지: 단일 모델의 과적합 경향을 완화
병렬 처리: 각 모델을 독립적으로 학습 가능
OOB 평가: Out-of-Bag 샘플로 자연스러운 검증

6.1.1 Bagging이 특히 효과적인 경우

높은 분산을 가진 모델 (예: 의사결정나무)
불안정한 학습 알고리즘
비선형 관계가 복잡한 데이터

6.2 Bagging 모델 구축

6.2.1 수동 Bagging 구현

# Bagging을 위한 설정
set.seed(123)
n_trees <- 500  # 부트스트랩 샘플 수
n_samples <- nrow(PimaIndiansDiabetes)

# 개별 트리 저장을 위한 리스트
bagged_trees <- list()
oob_predictions <- matrix(NA, nrow = n_samples, ncol = n_trees)

# Bootstrap 샘플링과 트리 생성
for(i in 1:n_trees) {
  # Bootstrap 샘플 생성
  bootstrap_idx <- sample(1:n_samples, n_samples, replace = TRUE)
  bootstrap_data <- PimaIndiansDiabetes[bootstrap_idx, ]
  
  # OOB (Out-of-Bag) 샘플 인덱스
  oob_idx <- setdiff(1:n_samples, unique(bootstrap_idx))
  
  # 트리 학습 (더 깊은 트리 허용)
  tree <- rpart(diabetes ~ pregnant + glucose + pressure + triceps +
                  insulin + mass + pedigree + age,
                data = bootstrap_data,
                method = "class",
                parms = list(split = "gini"))
  
  bagged_trees[[i]] <- tree
  
  # OOB 예측
  if(length(oob_idx) > 0) {
    oob_pred <- predict(tree, PimaIndiansDiabetes[oob_idx, ], type = "prob")[, "pos"]
    oob_predictions[oob_idx, i] <- oob_pred
  }
}

# OOB 예측의 평균 계산 (각 샘플별)
oob_mean_pred <- rowMeans(oob_predictions, na.rm = TRUE)

# OOB 예측 클래스 결정
oob_pred_class <- factor(ifelse(oob_mean_pred > 0.5, "pos", "neg"),
                         levels = c("neg", "pos"))

# OOB Error 계산
oob_error <- mean(oob_pred_class != PimaIndiansDiabetes$diabetes, na.rm = TRUE)

cat("OOB Error Rate:", round(oob_error * 100, 2), "%\n")

## OOB Error Rate: 23.7 %

cat("OOB Accuracy:", round((1 - oob_error) * 100, 2), "%\n")

## OOB Accuracy: 76.3 %

6.2.2 패키지를 이용한 Bagging 구현

# Bagging 모델 구축
set.seed(123)
bagging_model <- bagging(
  diabetes ~ pregnant + glucose + pressure + triceps + insulin + mass + pedigree + age,
  data = PimaIndiansDiabetes,
  nbagg = 500,  # 부트스트랩 샘플 수
  coob = TRUE,  # OOB 에러 계산
  control = rpart.control(minsplit = 2, cp = 0)  # 더 깊은 트리 허용
)

# 모델 요약 정보
cat("OOB Error Rate:", round(bagging_model$err * 100, 2), "%\n")

## OOB Error Rate: 23.05 %

cat("OOB Accuracy:", round((1 - bagging_model$err) * 100, 2), "%\n")

## OOB Accuracy: 76.95 %

6.3 Bagging 모델 성능 평가

6.3.1 예측 및 Confusion Matrix

# === 패키지 Bagging 예측 (원본 데이터) ===
# 예측 클래스
bagging_pred_class <- predict(bagging_model, PimaIndiansDiabetes, type = "class")

# 예측 확률
bagging_pred_prob <- predict(bagging_model, PimaIndiansDiabetes, type = "prob")[, "pos"]

# Confusion Matrix
conf_bagging <- confusionMatrix(data = bagging_pred_class, reference = PimaIndiansDiabetes$diabetes, positive = "pos")

# Confusion Matrix 출력
conf_matrix_bagging <- conf_bagging$table
conf_matrix_bagging_df <- as.data.frame.matrix(conf_matrix_bagging)
conf_matrix_bagging_df <- rbind(conf_matrix_bagging_df, Total = colSums(conf_matrix_bagging_df))
conf_matrix_bagging_df <- cbind(conf_matrix_bagging_df, Total = rowSums(conf_matrix_bagging_df))

knitr::kable(conf_matrix_bagging_df,
             caption = "Bagging Confusion Matrix (500개 트리)",
             align = "c")

Bagging Confusion Matrix (500개 트리)
	neg	pos	Total
neg	500	0	500
pos	0	268	268
Total	500	268	768

6.3.2 주요 성능 지표

# OOB 에러율 (모델에서 직접 가져옴)
oob_error_rate <- bagging_model$err

# 성능 지표 추출
bagging_metrics <- data.frame(
  지표 = c("정확도 (Accuracy)",
          "민감도 (Sensitivity)",
          "특이도 (Specificity)",
          "정밀도 (Precision)",
          "F1 Score",
          "OOB 에러율"),
  값 = c(
    round(conf_bagging$overall["Accuracy"], 3),
    round(conf_bagging$byClass["Sensitivity"], 3),
    round(conf_bagging$byClass["Specificity"], 3),
    round(conf_bagging$byClass["Pos Pred Value"], 3),
    round(conf_bagging$byClass["F1"], 3),
    round(oob_error_rate, 3)
  ),
  해석 = c(
    "전체 예측 중 올바른 예측의 비율",
    "실제 Positive 중 Positive로 예측한 비율",
    "실제 Negative 중 Negative로 예측한 비율",
    "Positive로 예측한 것 중 실제 Positive인 비율",
    "정밀도와 재현율의 조화평균",
    "Out-of-Bag 샘플 기반 에러율"
  )
)

knitr::kable(bagging_metrics,
             caption = "Bagging 모델 성능 지표 (500개 트리)",
             align = c("l", "r", "l"))

Bagging 모델 성능 지표 (500개 트리)
	지표	값	해석
Accuracy	정확도 (Accuracy)	1.00	전체 예측 중 올바른 예측의 비율
Sensitivity	민감도 (Sensitivity)	1.00	실제 Positive 중 Positive로 예측한 비율
Specificity	특이도 (Specificity)	1.00	실제 Negative 중 Negative로 예측한 비율
Pos Pred Value	정밀도 (Precision)	1.00	Positive로 예측한 것 중 실제 Positive인 비율
F1	F1 Score	1.00	정밀도와 재현율의 조화평균
	OOB 에러율	0.23	Out-of-Bag 샘플 기반 에러율

6.3.3 ROC 곡선과 AUC

# ROC 객체 생성 (Bagging 예측 확률 사용)
roc_bagging <- roc(PimaIndiansDiabetes$diabetes, bagging_pred_prob)

# AUC 값
auc_bagging <- auc(roc_bagging)

# ROC 곡선 플롯
plot(roc_bagging,
     legacy.axes = TRUE,
     main = "Bagging ROC 곡선 (500개 트리)",
     xlab = "1 - 특이도 (False Positive Rate)",
     ylab = "민감도 (True Positive Rate)",
     print.auc = TRUE,
     xaxs = "i",  # x축 범위 정확히 지정
     yaxs = "i")  # y축 범위 정확히 지정

6.4 [Special] Bagging 특성 시각화

6.4.1 개별 트리 vs Bagging 앙상블

# 예측 격자 생성 (glucose-mass 평면, 다른 변수는 평균값으로 고정)
grid_viz <- expand.grid(
  glucose = seq(min(PimaIndiansDiabetes$glucose), 
                max(PimaIndiansDiabetes$glucose), 
                length.out = 150),
  mass = seq(min(PimaIndiansDiabetes$mass), 
            max(PimaIndiansDiabetes$mass), 
            length.out = 150),
  pregnant = mean(PimaIndiansDiabetes$pregnant),
  pressure = mean(PimaIndiansDiabetes$pressure),
  triceps = mean(PimaIndiansDiabetes$triceps),
  insulin = mean(PimaIndiansDiabetes$insulin),
  pedigree = mean(PimaIndiansDiabetes$pedigree),
  age = mean(PimaIndiansDiabetes$age)
)

# 1️⃣ 5개의 개별 Bootstrap 트리 학습
set.seed(456)
n_visual_trees <- 5
individual_trees <- list()
individual_preds <- list()

for(i in 1:n_visual_trees) {
  # Bootstrap 샘플링
  bootstrap_idx <- sample(1:n_samples, n_samples, replace = TRUE)
  
  # 개별 트리 학습
  tree <- rpart(diabetes ~ pregnant + glucose + pressure + triceps +
                  insulin + mass + pedigree + age,
                data = PimaIndiansDiabetes[bootstrap_idx, ],
                method = "class",
                parms = list(split = "gini"))
  
  individual_trees[[i]] <- tree
  individual_preds[[i]] <- predict(tree, grid_viz, type = "prob")[, "pos"]
}

# 2️⃣ 개별 트리 구조 시각화 (5개)
par(mfrow = c(2, 3), mar = c(2, 2, 3, 2))

for(i in 1:n_visual_trees) {
  rpart.plot(
    individual_trees[[i]],
    type = 4,
    extra = 104,
    under = TRUE,
    fallen.leaves = TRUE,
    main = paste("Bootstrap 트리", i, "구조"),
    box.palette = c("lightgreen", "lightcoral"),
    branch.lty = 3,
    shadow.col = "gray",
    cex.main = 1.0,
    cex = 0.65
  )
}

par(mfrow = c(1, 1))

# 3️⃣ 개별 트리 결정 경계 플롯
plots_individual <- list()

for(i in 1:n_visual_trees) {
  grid_temp <- grid_viz
  grid_temp$pred <- individual_preds[[i]]
  
  plots_individual[[i]] <- ggplot(grid_temp, aes(x = glucose, y = mass)) +
    geom_tile(aes(fill = pred), alpha = 0.8) +
    scale_fill_gradient2(
      low = "lightblue", 
      mid = "yellow", 
      high = "lightcoral",
      midpoint = 0.5, 
      limits = c(0, 1),
      name = "P(pos)"
    ) +
    geom_point(data = PimaIndiansDiabetes, 
               aes(color = diabetes), 
               size = 0.8, alpha = 0.4) +
    scale_color_manual(values = c("neg" = "blue", "pos" = "red")) +
    labs(title = paste("트리", i, "결정 경계"),
         x = "Glucose", y = "BMI") +
    theme_minimal(base_family = "nanum") +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 11, face = "bold"),
      axis.title = element_text(size = 9)
    )
}

# 4️⃣ Bagging 앙상블 예측 (전체 트리 사용)
bagging_pred_grid <- matrix(0, nrow = nrow(grid_viz), ncol = n_trees)

for(i in 1:n_trees) {
  bagging_pred_grid[, i] <- predict(bagged_trees[[i]], 
                                     grid_viz, 
                                     type = "prob")[, "pos"]
}

# 전체 트리 평균 확률
grid_viz$bagging_pred <- rowMeans(bagging_pred_grid)

# 5️⃣ Bagging 앙상블 플롯
p_bagging <- ggplot(grid_viz, aes(x = glucose, y = mass)) +
  geom_tile(aes(fill = bagging_pred), alpha = 0.8) +
  scale_fill_gradient2(
    low = "lightblue", 
    mid = "yellow", 
    high = "lightcoral",
    midpoint = 0.5, 
    limits = c(0, 1),
    name = "P(pos)"
  ) +
  geom_contour(aes(z = bagging_pred), 
               breaks = 0.5, 
               color = "black", 
               linewidth = 1.2) +
  geom_point(data = PimaIndiansDiabetes, 
             aes(color = diabetes, shape = diabetes), 
             size = 1.5, alpha = 0.6) +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red"),
                     name = "실제") +
  scale_shape_manual(values = c("neg" = 16, "pos" = 17),
                     guide = "none") +
  labs(
    title = paste0("Bagging 앙상블 (", n_trees, "개 트리)"),
    subtitle = "개별 트리보다 훨씬 부드러운 결정 경계",
    x = "포도당 농도 (glucose)",
    y = "BMI (mass)"
  ) +
  theme_minimal(base_family = "nanum") +
  theme(
    plot.title = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(size = 10),
    legend.position = "right"
  )

# 6️⃣ 결정 경계 레이아웃 구성
grid.arrange(
  arrangeGrob(grobs = plots_individual, ncol = 5),
  p_bagging,
  ncol = 1,
  heights = c(1.2, 2),
  top = textGrob(
    "개별 Bootstrap 트리 vs Bagging 앙상블 결정 경계 비교",
    gp = gpar(fontsize = 15, fontface = "bold")
  )
)

해석:

상단 (트리 구조): 5개의 개별 트리가 각기 다른 분할 규칙을 학습
중단 (개별 경계): 각 트리마다 불안정하고 서로 다른 결정 경계
하단 (앙상블): 500개 트리의 평균으로 부드럽고 안정적인 경계 생성

7 Random Forest 분석

7.1 Random Forest 개요

Random Forest는 Bagging + 변수 무작위 선택을 결합한 앙상블 기법으로, 다음과 같은 특징을 가집니다:

변수 무작위성: 각 분할에서 변수의 일부만 고려하여 트리 간 상관관계 감소
강력한 예측 성능: Bagging보다 더욱 개선된 일반화 성능
변수 중요도 제공: 예측에 기여하는 변수의 중요도 측정
과적합 방지 우수: 이중 무작위성으로 과적합 위험 최소화

7.1.1 Random Forest가 Bagging보다 우수한 이유

트리 간 상관관계 감소: 변수 무작위 선택으로 다양한 트리 생성
더 나은 일반화: 개별 트리는 약하지만 앙상블은 강력
자동 특성 선택: 중요하지 않은 변수의 영향 자동 감소

7.2 Random Forest 모델 구축

7.2.1 randomForestSRC 패키지를 이용한 모델링

library(randomForestSRC)

# Random Forest 모델 학습 (전체 변수 사용)
# diabetes_label과 diabetes_numeric은 시각화용 변수이므로 제외
set.seed(123)
rf_model <- rfsrc(
  diabetes ~ pregnant + glucose + pressure + triceps + insulin + mass + pedigree + age,  # 원래 8개 변수만 사용
  data = PimaIndiansDiabetes,
  ntree = 500,             # 트리 개수
  mtry = 3,                # 각 분할에서 고려할 변수 수 (sqrt(8) ≈ 3)
  importance = "permute"        # 변수 중요도 계산
)

# 모델 요약
print(rf_model)

##                          Sample size: 768
##            Frequency of class labels: neg=500, pos=268
##                      Number of trees: 500
##            Forest terminal node size: 1
##        Average no. of terminal nodes: 118.302
## No. of variables tried at each split: 3
##               Total no. of variables: 8
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 485
##                             Analysis: RF-C
##                               Family: class
##                       Splitting rule: gini *random*
##        Number of random split points: 10
##                     Imbalanced ratio: 1.8657
##                    (OOB) Brier score: 0.15871674
##         (OOB) Normalized Brier score: 0.63486697
##                            (OOB) AUC: 0.82818284
##                       (OOB) Log-loss: 0.48237908
##                         (OOB) PR-AUC: 0.70099478
##                         (OOB) G-mean: 0.73428816
##    (OOB) Requested performance error: 0.22526042, 0.15, 0.36567164
## 
## Confusion matrix:
## 
##           predicted
##   observed neg pos class.error
##        neg 425  75      0.1500
##        pos 100 168      0.3731
## 
##       (OOB) Misclassification rate: 0.2278646
## 
## Random-classifier baselines (uniform):
##    Brier: 0.25   Normalized Brier: 1   Log-loss: 0.69314718

# OOB 에러율
oob_error_rf <- rf_model$err.rate[nrow(rf_model$err.rate), "all"]

7.3 Random Forest 모델 성능 평가

7.3.1 예측 및 Confusion Matrix

# 예측 클래스 및 확률
rf_pred_result <- predict(rf_model, PimaIndiansDiabetes)
rf_pred_class <- rf_pred_result$class
rf_pred_prob <- rf_pred_result$predicted[, "pos"]

# Confusion Matrix
conf_rf <- confusionMatrix(data = rf_pred_class, reference = PimaIndiansDiabetes$diabetes, positive = "pos")

# Confusion Matrix 출력
conf_matrix_rf <- conf_rf$table
conf_matrix_rf_df <- as.data.frame.matrix(conf_matrix_rf)
conf_matrix_rf_df <- rbind(conf_matrix_rf_df, Total = colSums(conf_matrix_rf_df))
conf_matrix_rf_df <- cbind(conf_matrix_rf_df, Total = rowSums(conf_matrix_rf_df))

knitr::kable(conf_matrix_rf_df,
             caption = "Random Forest Confusion Matrix (전체 8개 변수)",
             align = "c")

Random Forest Confusion Matrix (전체 8개 변수)
	neg	pos	Total
neg	500	0	500
pos	0	268	268
Total	500	268	768

7.3.2 주요 성능 지표

# 성능 지표 추출
rf_metrics <- data.frame(
  지표 = c("정확도 (Accuracy)",
          "민감도 (Sensitivity)",
          "특이도 (Specificity)",
          "정밀도 (Precision)",
          "F1 Score",
          "OOB 에러율"),
  값 = c(
    round(conf_rf$overall["Accuracy"], 3),
    round(conf_rf$byClass["Sensitivity"], 3),
    round(conf_rf$byClass["Specificity"], 3),
    round(conf_rf$byClass["Pos Pred Value"], 3),
    round(conf_rf$byClass["F1"], 3),
    round(oob_error_rf, 3)
  ),
  해석 = c(
    "전체 예측 중 올바른 예측의 비율",
    "실제 Positive 중 Positive로 예측한 비율",
    "실제 Negative 중 Negative로 예측한 비율",
    "Positive로 예측한 것 중 실제 Positive인 비율",
    "정밀도와 재현율의 조화평균",
    "Out-of-Bag 샘플 기반 에러율"
  )
)

knitr::kable(rf_metrics,
             caption = "Random Forest 모델 성능 지표 (전체 8개 변수)",
             align = c("l", "r", "l"))

Random Forest 모델 성능 지표 (전체 8개 변수)
	지표	값	해석
Accuracy	정확도 (Accuracy)	1.000	전체 예측 중 올바른 예측의 비율
Sensitivity	민감도 (Sensitivity)	1.000	실제 Positive 중 Positive로 예측한 비율
Specificity	특이도 (Specificity)	1.000	실제 Negative 중 Negative로 예측한 비율
Pos Pred Value	정밀도 (Precision)	1.000	Positive로 예측한 것 중 실제 Positive인 비율
F1	F1 Score	1.000	정밀도와 재현율의 조화평균
all	OOB 에러율	0.225	Out-of-Bag 샘플 기반 에러율

7.3.3 ROC 곡선과 AUC

# ROC 객체 생성
roc_rf <- roc(PimaIndiansDiabetes$diabetes, rf_pred_prob)

# AUC 값
auc_rf <- auc(roc_rf)

# ROC 곡선 플롯
plot(roc_rf,
     col = "purple",
     lwd = 2,
     main = "Random Forest ROC 곡선 (전체 8개 변수)",
     xlab = "1 - 특이도 (False Positive Rate)",
     ylab = "민감도 (True Positive Rate)",
     print.auc = TRUE,
     print.auc.x = 0.5,
     print.auc.y = 0.4)

7.4 변수 중요도 (Feature Importance) 분석

Random Forest는 각 변수가 예측에 얼마나 기여하는지를 측정하는 변수 중요도(Variable Importance)를 자동으로 계산합니다. 이는 모델 해석과 변수 선택에 중요한 정보를 제공합니다.

7.4.1 변수 중요도 개념

변수 중요도 계산 방법 (Permutation Importance):
- 각 변수의 값을 무작위로 섞었을 때 예측 정확도가 얼마나 감소하는지 측정
- 중요도가 높을수록 해당 변수가 예측에 큰 영향을 미침
- OOB(Out-of-Bag) 샘플을 사용하여 계산되므로 과적합 없이 신뢰할 수 있는 지표

7.4.2 변수 중요도 추출

# 변수 중요도 추출
vimp_rf <- rf_model$importance

# 데이터프레임으로 변환 및 정렬
vimp_df <- data.frame(
  변수 = rownames(vimp_rf),
  중요도 = vimp_rf[, 1],
  순위 = rank(-vimp_rf[, 1])
) %>%
  arrange(desc(중요도)) %>%
  mutate(
    중요도 = round(중요도, 4)
  )

# 변수 중요도 테이블
knitr::kable(vimp_df,
             caption = "Random Forest 변수 중요도",
             col.names = c("변수", "중요도", "순위"),
             align = c("l", "r", "r", "r", "r"))

Random Forest 변수 중요도
	변수	중요도	순위
glucose	glucose	0.0877	1
mass	mass	0.0281	2
age	age	0.0255	3
pregnant	pregnant	0.0185	4
pedigree	pedigree	0.0082	5
insulin	insulin	0.0059	6
pressure	pressure	0.0056	7
triceps	triceps	-0.0010	8

7.5 [Special] Random Forest 특성 시각화

Random Forest의 개별 트리를 추출하여 시각화합니다. 각 트리는 부트스트랩 샘플과 변수 무작위 선택을 통해 생성됩니다.

7.5.1 Random Forest 개별 트리 vs 앙상블

# 예측 격자 생성 (glucose-mass 평면, 다른 변수는 평균값으로 고정)
grid_rf <- expand.grid(
  glucose = seq(min(PimaIndiansDiabetes$glucose), 
                max(PimaIndiansDiabetes$glucose), 
                length.out = 150),
  mass = seq(min(PimaIndiansDiabetes$mass), 
            max(PimaIndiansDiabetes$mass), 
            length.out = 150),
  pregnant = mean(PimaIndiansDiabetes$pregnant),
  pressure = mean(PimaIndiansDiabetes$pressure),
  triceps = mean(PimaIndiansDiabetes$triceps),
  insulin = mean(PimaIndiansDiabetes$insulin),
  pedigree = mean(PimaIndiansDiabetes$pedigree),
  age = mean(PimaIndiansDiabetes$age)
)

# 1️⃣ 개별 트리 구조 정보 수집
tree_info <- data.frame(
  Tree = integer(),
  Total_Nodes = integer(),
  Terminal_Nodes = integer(),
  Depth_Levels = integer()
)

for(i in 1:5) {
  tree_obj <- get.tree(rf_model, tree.id = i, show.plots = FALSE)
  
  tree_info <- rbind(tree_info, data.frame(
    Tree = i,
    Total_Nodes = tree_obj$totalCount,
    Terminal_Nodes = tree_obj$leafCount,
    Depth_Levels = length(tree_obj$height)
  ))
}

knitr::kable(tree_info, 
             caption = "Random Forest 개별 트리 구조 정보",
             col.names = c("트리 번호", "총 노드 수", "터미널 노드 수", "트리 깊이"),
             align = "c")

Random Forest 개별 트리 구조 정보
트리 번호	총 노드 수	터미널 노드 수	트리 깊이
1	249	125	1
2	237	119	1
3	249	125	1
4	225	113	1
5	263	132	1

# 2️⃣ 개별 트리 5개의 예측
individual_rf_preds <- list()

for(i in 1:5) {
  # i번째 트리만 사용하여 예측
  temp_pred <- predict(rf_model, 
                       newdata = grid_rf,
                       get.tree = i)
  
  # "pos" 클래스 확률 추출
  individual_rf_preds[[i]] <- temp_pred$predicted[, "pos"]
}

# 3️⃣ Random Forest 앙상블 예측 (전체 500개 트리)
rf_ensemble_result <- predict(rf_model, grid_rf)
grid_rf$rf_pred <- rf_ensemble_result$predicted[, "pos"]
grid_rf$rf_class <- rf_ensemble_result$class

# 4️⃣ 개별 트리 결정 경계 플롯 (5개)
plots_rf_individual <- list()

for(i in 1:5) {
  grid_temp <- data.frame(
    glucose = grid_rf$glucose,
    mass = grid_rf$mass,
    pred = individual_rf_preds[[i]]
  )
  
  plots_rf_individual[[i]] <- ggplot(grid_temp, aes(x = glucose, y = mass)) +
    geom_tile(aes(fill = pred), alpha = 0.8) +
    scale_fill_gradient2(
      low = "lightblue", 
      mid = "yellow", 
      high = "lightcoral",
      midpoint = 0.5, 
      limits = c(0, 1),
      name = "P(pos)"
    ) +
    geom_point(data = PimaIndiansDiabetes, 
               aes(color = diabetes), 
               size = 0.8, alpha = 0.4) +
    scale_color_manual(values = c("neg" = "blue", "pos" = "red")) +
    labs(title = paste("트리", i)) +
    theme_minimal(base_family = "nanum") +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 11, face = "bold"),
      axis.title = element_text(size = 9)
    )
}

# 5️⃣ Random Forest 앙상블 플롯
p_rf_ensemble <- ggplot(PimaIndiansDiabetes, aes(x = glucose, y = mass)) +
  geom_tile(data = grid_rf, aes(fill = rf_pred), alpha = 0.8) +
  scale_fill_gradient2(
    low = "lightblue", 
    mid = "yellow", 
    high = "lightcoral",
    midpoint = 0.5, 
    limits = c(0, 1),
    name = "P(pos)"
  ) +
  geom_contour(data = grid_rf, 
               aes(z = rf_pred), 
               breaks = 0.5, 
               color = "black", 
               linewidth = 1.2) +
  geom_point(aes(color = diabetes, shape = diabetes), 
             size = 1.5, alpha = 0.6) +
  scale_color_manual(values = c("neg" = "blue", "pos" = "red"),
                     name = "실제") +
  scale_shape_manual(values = c("neg" = 16, "pos" = 17),
                     guide = "none") +
  labs(
    title = "Random Forest 앙상블 (500개 트리)",
    subtitle = "전체 변수 사용, glucose-mass 평면에 투영",
    x = "포도당 농도 (glucose)",
    y = "BMI (mass)"
  ) +
  theme_minimal(base_family = "nanum") +
  theme(
    plot.title = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(size = 10),
    legend.position = "right"
  )

# 6️⃣ 전체 레이아웃 구성
grid.arrange(
  arrangeGrob(grobs = plots_rf_individual, ncol = 5),
  p_rf_ensemble,
  ncol = 1,
  heights = c(1.2, 2),
  top = textGrob(
    "Random Forest: 개별 트리 vs 앙상블 결정 경계 비교",
    gp = gpar(fontsize = 15, fontface = "bold")
  )
)