Load the required packages
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(e1071) # naiveBayes, svm
library(rpart) # decision tree
library(pROC) # ROC curve & AUC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
1. Load the data
df <- read.delim(file.choose(), sep = "\t", stringsAsFactors = FALSE) # pick the tab-delimited file interactively
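A quick look at the structure confirms the load; the steps below assume columns such as manner_o1 … manner_c2, place_o1 … place_c2, coda1, coda2, name, and gen (a minimal sketch):
str(df) # check column names and types before the feature engineering below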
2. Define meaning-based numeric mappings
# Sonority scale: stop (least sonorous) through vowel (most sonorous)
sonority_map <- c("sto"=1, "aff"=2, "fri"=3, "nas"=4, "liq"=5, "glide"=6, "vow"=7)
# Anteriority scale: labial (front-most) through glottal (back-most)
ant_map <- c("lab"=1, "den"=2, "alv"=3, "pal"=4, "vel"=5, "glo"=6)
# Look up each level in the map; levels not in the map become NA
map_to_numeric <- function(col, map) { as.numeric(map[as.character(col)]) }
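Applied to a toy vector (a hypothetical example, not from the data), the helper returns the scale values and NA for any level absent from the map; step 6 later replaces such NAs with 0:
map_to_numeric(c("sto", "vow", "xyz"), sonority_map) # "xyz" has no mapping
## [1]  1  7 NA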
3. Apply the mappings
# Sonority of the onset (o) and coda (c) consonants of syllables 1 and 2
df$son_o1 <- map_to_numeric(df$manner_o1, sonority_map)
df$son_c1 <- map_to_numeric(df$manner_c1, sonority_map)
df$son_o2 <- map_to_numeric(df$manner_o2, sonority_map)
df$son_c2 <- map_to_numeric(df$manner_c2, sonority_map)
# Anteriority (place of articulation) of the same four positions
df$ant_o1 <- map_to_numeric(df$place_o1, ant_map)
df$ant_c1 <- map_to_numeric(df$place_c1, ant_map)
df$ant_o2 <- map_to_numeric(df$place_o2, ant_map)
df$ant_c2 <- map_to_numeric(df$place_c2, ant_map)
4. Add coda (batchim) presence features
df$has_coda1 <- ifelse(is.na(df$coda1) | df$coda1 == "", 0, 1)
df$has_coda2 <- ifelse(is.na(df$coda2) | df$coda2 == "", 0, 1)
5. Recode the target variable
df$gen <- factor(df$gen, levels = c(-1, 1), labels = c("female", "male"))
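Any gen value other than -1 or 1 would silently become NA in this conversion, so a quick check of the label counts is worthwhile (a minimal sketch; output not shown):
table(df$gen, useNA = "ifany") # expect only female and male, no NA labels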
6. Define the feature list and handle missing values
numeric_vars <- c("son_o1", "son_c1", "son_o2", "son_c2",
"ant_o1", "ant_c1", "ant_o2", "ant_c2",
"has_coda1", "has_coda2")
df <- df[, c("name", numeric_vars, "gen")]
df[numeric_vars][is.na(df[numeric_vars])] <- 0 # absent segments became NA above; code them as 0, below the bottom of both scales
7. Shuffle the data and split into train/test sets
set.seed(423)
train_idx <- sample(nrow(df), 0.7 * nrow(df)) # random 70% train / 30% test split
train <- df[train_idx, ]
test <- df[-train_idx, ]
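Because the split is random rather than stratified by gen, it is worth confirming that both classes are reasonably balanced in each partition (a minimal sketch; output not shown):
prop.table(table(train$gen)) # class shares in the training set
prop.table(table(test$gen)) # class shares in the test set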
8. Train the random forest model
set.seed(12335)
rf.model <- randomForest(gen ~ ., data = train[, -1], importance = TRUE) # drop the name column
9. Predict on the test set and evaluate accuracy
rf.pred <- predict(rf.model, newdata = test)
cat("혼동 행렬:\n")
## 혼동 행렬:
print(table(Actual = test$gen, Predicted = rf.pred))
## Predicted
## Actual female male
## female 65 20
## male 22 73
cat("정확도:", mean(test$gen == rf.pred), "\n")
## 정확도: 0.7666667
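Accuracy alone can mask asymmetric errors; per-class precision and recall can be read off the same confusion matrix (a minimal sketch, treating male as the positive class):
cm <- table(Actual = test$gen, Predicted = rf.pred)
precision <- cm["male", "male"] / sum(cm[, "male"]) # 73 / (20 + 73)
recall <- cm["male", "male"] / sum(cm["male", ]) # 73 / (22 + 73)
cat("Precision:", round(precision, 4), "Recall:", round(recall, 4), "\n")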
10. Visualize variable importance
windows(width = 7.0, height = 5.5) # Windows-only graphics device; use quartz() or x11() on other platforms
varImpPlot(rf.model, pch = 21, color = "black", bg = "skyblue", pt.cex = 1.2,
main = "Variable Importance for Gender Classification")
11. Print the variable importance values
importance(rf.model)
## female male MeanDecreaseAccuracy MeanDecreaseGini
## son_o1 1.215303 3.520040032 3.429776 12.655799
## son_c1 10.856095 9.361074297 13.574262 7.179208
## son_o2 13.608355 4.134018065 15.446366 13.767343
## son_c2 8.007691 -0.000652372 7.812457 6.137578
## ant_o1 2.493161 -0.514151155 1.441628 11.784716
## ant_c1 14.201539 8.596552749 17.307349 11.635796
## ant_o2 14.023216 6.838597086 16.841089 13.469869
## ant_c2 13.040192 8.707958112 17.795482 11.645930
## has_coda1 12.975882 8.043582101 14.084430 5.718991
## has_coda2 7.105410 0.986055462 7.108927 3.551888
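To read the table at a glance, sort it by mean decrease in accuracy; on this run that puts ant_c2, ant_c1, and ant_o2 at the top (a minimal sketch):
imp <- importance(rf.model)
imp[order(-imp[, "MeanDecreaseAccuracy"]), ] # most informative features first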
12. Train the four models
(1) Random Forest
rf.model <- randomForest(gen ~ ., data = train[, -1], importance = TRUE) # refit without a seed, so it differs slightly from step 8
rf.prob <- predict(rf.model, newdata = test, type = "prob")[, "male"] # P(male)
(2) Naive Bayes
nb.model <- naiveBayes(gen ~ ., data = train[, -1])
nb.prob <- predict(nb.model, newdata = test, type = "raw")[, "male"] # posterior P(male)
(3) Decision Tree
dt.model <- rpart(gen ~ ., data = train[, -1], method = "class")
dt.prob <- predict(dt.model, newdata = test, type = "prob")[, "male"] # leaf-node class proportions
(4) SVM
svm.model <- svm(gen ~ ., data = train[, -1], probability = TRUE) # fit an extra Platt-scaling model for probabilities
svm.pred <- predict(svm.model, newdata = test, probability = TRUE)
svm.prob <- attr(svm.pred, "probabilities")[, "male"] # probabilities are stored as an attribute
13. Compute ROC & AUC
rf.roc <- roc(test$gen, rf.prob)
## Setting levels: control = female, case = male
## Setting direction: controls < cases
nb.roc <- roc(test$gen, nb.prob)
## Setting levels: control = female, case = male
## Setting direction: controls < cases
dt.roc <- roc(test$gen, dt.prob)
## Setting levels: control = female, case = male
## Setting direction: controls < cases
svm.roc <- roc(test$gen, svm.prob)
## Setting levels: control = female, case = male
## Setting direction: controls < cases
14. Plot the ROC curves
plot(rf.roc, col = "blue", lwd = 2, main = "ROC Curve")
plot(nb.roc, col = "red", add = TRUE, lwd = 2)
plot(dt.roc, col = "green", add = TRUE, lwd = 2)
plot(svm.roc, col = "purple", add = TRUE, lwd = 2)
legend("bottomright", legend = c(
paste("Random Forest (AUC =", round(auc(rf.roc), 3), ")"),
paste("Naive Bayes (AUC =", round(auc(nb.roc), 3), ")"),
paste("Decision Tree (AUC =", round(auc(dt.roc), 3), ")"),
paste("SVM (AUC =", round(auc(svm.roc), 3), ")")
), col = c("blue", "red", "green", "purple"), lwd = 2)

15. Print the AUC values
cat("AUC Scores:\n")
## AUC Scores:
cat("Random Forest:", round(auc(rf.roc), 4), "\n")
## Random Forest: 0.7941
cat("Naive Bayes :", round(auc(nb.roc), 4), "\n")
## Naive Bayes : 0.7415
cat("Decision Tree:", round(auc(dt.roc), 4), "\n")
## Decision Tree: 0.7292
cat("SVM :", round(auc(svm.roc), 4), "\n")
## SVM : 0.7688
The random forest achieves the highest AUC of the four models.
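To check whether the random forest's edge over the runner-up SVM is more than split noise, pROC's DeLong test compares the two paired ROC curves (a minimal sketch; the verdict depends on this particular split):
roc.test(rf.roc, svm.roc, method = "delong") # paired test on the same test set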
16. Extract the five most prototypical male and female names
Highest-probability ‘male’ predictions
top_male <- test[order(-rf.prob), ][1:5, ] # five test rows with the highest P(male)
cat("\n▶ 확률 높은 남자 예측 상위 5개:\n")
##
## ▶ 확률 높은 남자 예측 상위 5개:
print(top_male[, c("name", "gen")])
## name gen
## 139 상현 male
## 144 승훈 male
## 152 성훈 male
## 220 승환 male
## 119 승윤 male
print(rf.prob[order(-rf.prob)][1:5])
## 139 144 152 220 119
## 0.998 0.998 0.998 0.998 0.988
Highest-probability ‘female’ predictions
rf.prob <- predict(rf.model, newdata = test, type = "prob")[, "male"] # probability of being male
rf.pred.class <- predict(rf.model, newdata = test) # predicted class
female.prob <- 1 - rf.prob
test$pred_class <- rf.pred.class
test$prob_male <- rf.prob
test$prob_female <- female.prob
top_female <- test[test$pred_class == "female", ] # keep rows predicted female
top_female <- top_female[order(-top_female$prob_female), ][1:5, ] # top 5 by P(female)
cat("\n▶ 확률 높은 여자 예측 상위 5개:\n")
##
## ▶ 확률 높은 여자 예측 상위 5개:
print(top_female[, c("name", "gen", "pred_class", "prob_female")])
## name gen pred_class prob_female
## 360 나윤 female female 1.000
## 158 하윤 male female 0.998
## 537 효원 female female 0.998
## 29 시윤 male female 0.992
## 132 시완 male female 0.992
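Note that three of these five ‘female’ predictions (하윤, 시윤, 시완) are actually male names. Restricting to correctly classified rows gives prototypes in a stricter sense (a minimal sketch; output not shown):
correct_female <- test[test$pred_class == "female" & test$gen == "female", ]
correct_female <- correct_female[order(-correct_female$prob_female), ][1:5, ]
print(correct_female[, c("name", "prob_female")])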