필요한 패키지 로드

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(e1071)    # naiveBayes, svm
library(rpart)    # decision tree
library(pROC)     # ROC curve & AUC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

1. 데이터 불러오기

# Ask the user to pick the input file interactively, then parse it as a
# tab-separated table, leaving text columns as character vectors.
df <- read.delim(
  file.choose(),
  sep = "\t",
  stringsAsFactors = FALSE
)

2. 의미 기반 수치 매핑 정의

# Numeric codes for the manner labels (1 = "sto" ... 7 = "vow").
sonority_map <- setNames(
  as.numeric(1:7),
  c("sto", "aff", "fri", "nas", "liq", "glide", "vow")
)
# Numeric codes for the place labels (1 = "lab" ... 6 = "glo").
ant_map <- setNames(
  as.numeric(1:6),
  c("lab", "den", "alv", "pal", "vel", "glo")
)

# Translate category labels into their numeric codes.
#   col: vector of labels (character or factor).
#   map: named numeric vector; names are the labels, values are the codes.
# Returns an unnamed numeric vector as long as `col`; labels that are NA or
# not present among the names of `map` come back as NA.
map_to_numeric <- function(col, map) {
  keys <- as.character(col)
  codes <- unname(map[keys])
  as.numeric(codes)
}

3. 매핑 적용

# Recode the manner_* columns into son_* and the place_* columns into ant_*
# for each of the four slots (o1, c1, o2, c2). All son_* columns are added
# first, then all ant_* columns.
segment_slots <- c("o1", "c1", "o2", "c2")
for (slot in segment_slots) {
  df[[paste0("son_", slot)]] <-
    map_to_numeric(df[[paste0("manner_", slot)]], sonority_map)
}
for (slot in segment_slots) {
  df[[paste0("ant_", slot)]] <-
    map_to_numeric(df[[paste0("place_", slot)]], ant_map)
}

4. 종성(받침) 유무 자질 추가

# Coda indicators: 1 when the coda column holds a non-empty value, 0 when it
# is NA or the empty string.
coda1_present <- !is.na(df$coda1) & df$coda1 != ""
coda2_present <- !is.na(df$coda2) & df$coda2 != ""
df$has_coda1 <- as.numeric(coda1_present)
df$has_coda2 <- as.numeric(coda2_present)

5. 타겟 변수 변환

# Recode the target: -1 becomes "female" and 1 becomes "male". Any other value
# is not among the listed levels and therefore becomes NA.
gender_levels <- c(-1, 1)
gender_labels <- c("female", "male")
df$gen <- factor(df$gen, levels = gender_levels, labels = gender_labels)

6. 사용할 변수 목록 정의 및 결측치 처리

# Predictor columns: the eight recoded son_* / ant_* features plus the two
# coda indicators.
feature_slots <- c("o1", "c1", "o2", "c2")
numeric_vars <- c(
  paste0("son_", feature_slots),
  paste0("ant_", feature_slots),
  "has_coda1", "has_coda2"
)
# Keep only the name column, the predictors and the target, in that order.
keep_cols <- c("name", numeric_vars, "gen")
df <- df[, keep_cols]
# Replace missing predictor values with 0.
df[numeric_vars] <- lapply(
  df[numeric_vars],
  function(column) replace(column, is.na(column), 0)
)

7. 데이터 섞고 훈련/테스트 나누기

# Draw a reproducible random sample of row indices (70% of the row count) for
# training; every row not drawn goes to the test set.
set.seed(423)
n_rows <- nrow(df)
train_idx <- sample(n_rows, 0.7 * n_rows)
train <- df[train_idx, ]
test  <- df[-train_idx, ]

8. 랜덤 포레스트 모델 훈련

# Fix the RNG state so the fitted forest is reproducible.
set.seed(12335)
# Fit a random forest predicting `gen` from all remaining columns. train[, -1]
# drops the first column (expected to be `name`); importance = TRUE requests
# that variable-importance measures be computed with the fit.
rf.model <- randomForest(gen ~ ., data = train[, -1], importance = TRUE)

9. 테스트셋 예측 및 정확도 평가

# Predict a class for every test row, then report the confusion matrix and the
# share of rows whose predicted class equals the true label.
rf.pred <- predict(rf.model, newdata = test)
cat("혼동 행렬:\n")
## 혼동 행렬:
confusion <- table(Actual = test$gen, Predicted = rf.pred)
print(confusion)
##         Predicted
## Actual   female male
##   female     65   20
##   male       22   73
is_correct <- test$gen == rf.pred
cat("정확도:", mean(is_correct), "\n")
## 정확도: 0.7666667

10. 변수 중요도 시각화

# Open a 7 x 5.5 graphics device for the importance plot. windows() only
# exists on Windows builds of R, so on any other platform fall back to
# dev.new(), which opens that platform's default device.
if (.Platform$OS.type == "windows") {
  windows(width = 7.0, height = 5.5)
} else {
  dev.new(width = 7.0, height = 5.5)
}
# Dot chart of the importance measures of the fitted forest.
varImpPlot(rf.model, pch = 21, color = "black", bg = "skyblue", pt.cex = 1.2,
           main = "Variable Importance for Gender Classification")

11. 변수 중요도 수치 출력

# Show the variable-importance measures stored in the fitted forest
# (one row per predictor).
importance(rf.model)
##              female         male MeanDecreaseAccuracy MeanDecreaseGini
## son_o1     1.215303  3.520040032             3.429776        12.655799
## son_c1    10.856095  9.361074297            13.574262         7.179208
## son_o2    13.608355  4.134018065            15.446366        13.767343
## son_c2     8.007691 -0.000652372             7.812457         6.137578
## ant_o1     2.493161 -0.514151155             1.441628        11.784716
## ant_c1    14.201539  8.596552749            17.307349        11.635796
## ant_o2    14.023216  6.838597086            16.841089        13.469869
## ant_c2    13.040192  8.707958112            17.795482        11.645930
## has_coda1 12.975882  8.043582101            14.084430         5.718991
## has_coda2  7.105410  0.986055462             7.108927         3.551888

12. 모델 훈련

(1) Random Forest

# Refit the forest used in the model comparison. Random forest fitting is
# randomised, so the RNG is re-seeded (same seed as the first fit) to make
# this refit, and every score derived from it, repeatable from run to run.
set.seed(12335)
rf.model <- randomForest(gen ~ ., data = train[, -1], importance = TRUE)
# Column "male" of the class-probability matrix: predicted probability that a
# test row is "male".
rf.prob <- predict(rf.model, newdata = test, type = "prob")[, "male"]

(2) Naive Bayes

# Fit Naive Bayes on the training data without its first column, then keep the
# "male" column of the raw (per-class probability) predictions for the test set.
nb.model <- naiveBayes(gen ~ ., data = train[, -1])
nb.raw <- predict(nb.model, newdata = test, type = "raw")
nb.prob <- nb.raw[, "male"]

(3) Decision Tree

# Fit a classification tree on the training data without its first column,
# then keep the "male" column of the class-probability predictions.
dt.model <- rpart(gen ~ ., data = train[, -1], method = "class")
dt.class.prob <- predict(dt.model, newdata = test, type = "prob")
dt.prob <- dt.class.prob[, "male"]

(4) SVM

# Fit an SVM with probability estimates enabled. The per-class probabilities
# are attached to the prediction as the "probabilities" attribute; keep its
# "male" column.
svm.model <- svm(gen ~ ., data = train[, -1], probability = TRUE)
svm.pred <- predict(svm.model, newdata = test, probability = TRUE)
svm.class.prob <- attr(svm.pred, "probabilities")
svm.prob <- svm.class.prob[, "male"]

13. ROC & AUC 계산

# Build one ROC curve per model from its predicted probability of "male".
# levels and direction are fixed explicitly instead of being auto-detected:
# "female" is the control group, "male" the case group, and cases are expected
# to score higher than controls. With automatic direction, a model that ranks
# worse than chance would be silently flipped and reported with an AUC above
# 0.5, which would bias the model comparison.
roc_for_male <- function(prob) {
  roc(response = test$gen, predictor = prob,
      levels = c("female", "male"), direction = "<")
}
rf.roc <- roc_for_male(rf.prob)
nb.roc <- roc_for_male(nb.prob)
dt.roc <- roc_for_male(dt.prob)
svm.roc <- roc_for_male(svm.prob)

14. ROC 곡선 그리기

# Overlay the four ROC curves in a single plot and label each one in the
# legend with its AUC rounded to three decimals.
roc_curves <- list(
  "Random Forest" = rf.roc,
  "Naive Bayes" = nb.roc,
  "Decision Tree" = dt.roc,
  "SVM" = svm.roc
)
roc_colors <- c("blue", "red", "green", "purple")
# The first curve creates the plot; the remaining ones are added on top.
plot(roc_curves[[1]], col = roc_colors[1], lwd = 2, main = "ROC Curve")
for (i in seq_along(roc_curves)[-1]) {
  plot(roc_curves[[i]], col = roc_colors[i], add = TRUE, lwd = 2)
}
legend_text <- vapply(
  names(roc_curves),
  function(model_name) {
    paste(model_name, "(AUC =", round(auc(roc_curves[[model_name]]), 3), ")")
  },
  character(1),
  USE.NAMES = FALSE
)
legend("bottomright", legend = legend_text, col = roc_colors, lwd = 2)

15. AUC 출력

# Print each model's AUC rounded to four decimals, one line per model.
cat("AUC Scores:\n")
auc_rows <- list(
  list(label = "Random Forest:", curve = rf.roc),
  list(label = "Naive Bayes  :", curve = nb.roc),
  list(label = "Decision Tree:", curve = dt.roc),
  list(label = "SVM          :", curve = svm.roc)
)
for (row in auc_rows) {
  cat(row$label, round(auc(row$curve), 4), "\n")
}
## AUC Scores:
## Random Forest: 0.7941
## Naive Bayes  : 0.7415
## Decision Tree: 0.7292
## SVM          : 0.7688

랜덤 포레스트의 AUC가 네 모델 중 가장 높아, 성능이 가장 우수한 것으로 나타남

16. 전형적인 남자, 여자 이름 5개 뽑기

가장 확률 높은 ‘남자’ 예측

# Rank the test rows by predicted probability of "male", highest first, and
# show the five most confident ones. head() is used rather than [1:5] so that
# a test set with fewer than five rows yields only the rows that exist instead
# of being padded with NA rows.
male_order <- order(-rf.prob)
top_male <- head(test[male_order, ], 5)
cat("\n▶ 확률 높은 남자 예측 상위 5개:\n")
## 
## ▶ 확률 높은 남자 예측 상위 5개:
print(top_male[, c("name", "gen")])
##     name  gen
## 139 상현 male
## 144 승훈 male
## 152 성훈 male
## 220 승환 male
## 119 승윤 male
# The matching probabilities, in the same order as the rows above.
print(head(rf.prob[male_order], 5))
##   139   144   152   220   119 
## 0.998 0.998 0.998 0.998 0.988

가장 확률 높은 ‘여자’ 예측

# Score the test set again and store the predictions next to the data.
rf.prob <- predict(rf.model, newdata = test, type = "prob")[, "male"]  # probability of "male"
rf.pred.class <- predict(rf.model, newdata = test)  # predicted class label
female.prob <- 1 - rf.prob  # complement of the "male" probability
test$pred_class <- rf.pred.class
test$prob_male <- rf.prob
test$prob_female <- female.prob
# Among the rows predicted "female", keep those with the highest "female"
# probability. head() is used rather than [1:5] so that fewer than five
# predicted-female rows are returned as-is instead of being padded with NA rows.
top_female <- test[test$pred_class == "female", ]
top_female <- head(top_female[order(-top_female$prob_female), ], 5)
cat("\n▶ 확률 높은 여자 예측 상위 5개:\n")
## 
## ▶ 확률 높은 여자 예측 상위 5개:
print(top_female[, c("name", "gen", "pred_class", "prob_female")])
##     name    gen pred_class prob_female
## 360 나윤 female     female       1.000
## 158 하윤   male     female       0.998
## 537 효원 female     female       0.998
## 29  시윤   male     female       0.992
## 132 시완   male     female       0.992