Load the required packages
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(nnet)
Load the data
df <- read.csv("C:/Users/chosun/Downloads/surnames.csv")
Keep only the four most common nationalities
top_nationalities <- names(sort(table(df$nationality), decreasing = TRUE))[1:4]
df_filtered <- df[df$nationality %in% top_nationalities, ]
The four most common nationalities are English, Russian, Arabic, and Japanese.
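As a quick check, the class counts behind this selection can be printed directly (a minimal sketch using the same df):
# Frequency of each nationality, largest first; the top four drive the filter above
head(sort(table(df$nationality), decreasing = TRUE), 4)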
Convert the label to a factor
df_filtered$nationality <- factor(df_filtered$nationality)
Surname length
df_filtered$name_length <- nchar(as.character(df_filtered$surname))
Extract the first and last letters
df_filtered$first_letter <- substr(df_filtered$surname, 1, 1)
df_filtered$last_letter <- substr(df_filtered$surname, nchar(as.character(df_filtered$surname)), nchar(as.character(df_filtered$surname)))
Convert the character columns to factors
df_filtered$first_letter <- factor(df_filtered$first_letter)
df_filtered$last_letter <- factor(df_filtered$last_letter)
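A quick look at the engineered features helps confirm they were derived as intended (a minimal sketch; the column names are those created above):
# Inspect the derived features alongside the raw surname
head(df_filtered[, c("surname", "name_length", "first_letter", "last_letter")])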
One-hot encoding
library(caret)
dummy_vars <- dummyVars(~ first_letter + last_letter, data = df_filtered)
dummy_df <- data.frame(predict(dummy_vars, newdata = df_filtered))
Append the one-hot encoded variables
df_final <- cbind(df_filtered, dummy_df)
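Since dummyVars creates one column per observed letter level, the width of the encoded feature space is worth confirming (a minimal sketch):
# Number of rows and dummy columns produced by the one-hot encoding
dim(dummy_df)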
Split the data into training and test sets
set.seed(123)
train_index <- createDataPartition(df_final$nationality, p = 0.8, list = FALSE)
train_data <- df_final[train_index, ]
test_data <- df_final[-train_index, ]
Store the labels
train_labels <- train_data$nationality
test_labels <- test_data$nationality
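Because createDataPartition samples within each class, the class proportions should be nearly identical in the two sets; this can be verified (a minimal sketch):
# Class balance after the stratified split
round(prop.table(table(train_labels)), 3)
round(prop.table(table(test_labels)), 3)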
Naive Bayes
nb_model <- naiveBayes(nationality ~ ., data = train_data[, !names(train_data) %in% c("surname", "first_letter", "last_letter")])
nb_pred <- predict(nb_model, test_data, type = "raw")
nb_class <- predict(nb_model, test_data)
Decision Tree
dt_model <- rpart(nationality ~ ., data = train_data[, !names(train_data) %in% c("surname", "first_letter", "last_letter")])
dt_pred <- predict(dt_model, test_data, type = "prob")
dt_class <- predict(dt_model, test_data, type = "class")
Random Forest
rf_model <- randomForest(nationality ~ ., data = train_data[, !names(train_data) %in% c("surname", "first_letter", "last_letter")])
rf_pred <- predict(rf_model, test_data, type = "prob")
rf_class <- predict(rf_model, test_data)
SVM
svm_model <- svm(nationality ~ ., data = train_data[, !names(train_data) %in% c("surname", "first_letter", "last_letter")],
                 probability = TRUE)
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'last_letter.D' constant. Cannot scale data.
svm_pred <- attr(predict(svm_model, test_data, probability = TRUE), "probabilities")
svm_class <- predict(svm_model, test_data)
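The scaling warning above arises because one dummy column (last_letter.D) is constant within the training split. One way to guard against this is to drop zero-variance predictors before fitting; a minimal sketch using caret's nearZeroVar (an assumed pre-processing step, not part of the original pipeline):
# Identify and drop zero-variance columns before refitting the SVM (hypothetical)
feature_cols <- !names(train_data) %in% c("surname", "first_letter", "last_letter")
train_sub <- train_data[, feature_cols]
zv <- nearZeroVar(train_sub, saveMetrics = TRUE)
train_sub <- train_sub[, !zv$zeroVar]
svm_model_clean <- svm(nationality ~ ., data = train_sub, probability = TRUE)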
Compute ROC/AUC
nb_roc <- multiclass.roc(test_labels, nb_pred)
dt_roc <- multiclass.roc(test_labels, dt_pred)
rf_roc <- multiclass.roc(test_labels, rf_pred)
svm_roc <- multiclass.roc(test_labels, svm_pred)
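multiclass.roc accepts the true labels together with the full probability matrix and reports the multiclass AUC of Hand and Till (2001). Individual one-vs-rest curves can also be inspected per class; a minimal sketch for the random forest (assumes a "Russian" column in rf_pred):
# One-vs-rest ROC curve for the Russian class
rus_roc <- roc(response = as.numeric(test_labels == "Russian"),
               predictor = rf_pred[, "Russian"])
plot(rus_roc)
auc(rus_roc)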
Print the AUCs
cat("AUC (Naive Bayes):", auc(nb_roc), "\n")
## AUC (Naive Bayes): 0.7488466
cat("AUC (Decision Tree):", auc(dt_roc), "\n")
## AUC (Decision Tree): 0.8422539
cat("AUC (Random Forest):", auc(rf_roc), "\n")
## AUC (Random Forest): 0.9592929
cat("AUC (SVM):", auc(svm_roc), "\n")
## AUC (SVM): 0.9314125
Random forest achieves the best multiclass AUC at 0.9593, followed by SVM (0.9314),
the decision tree (0.8423), and naive Bayes (0.7488).
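For a side-by-side view, the four AUCs can be collected into a single table (a minimal sketch reusing the roc objects above):
# Summarise the multiclass AUCs in one data frame, best model first
auc_table <- data.frame(
  model = c("Naive Bayes", "Decision Tree", "Random Forest", "SVM"),
  auc = c(auc(nb_roc), auc(dt_roc), auc(rf_roc), auc(svm_roc))
)
auc_table[order(-auc_table$auc), ]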
Accuracy evaluation
cat("\nAccuracy (Naive Bayes):\n")
##
## Accuracy (Naive Bayes):
print(confusionMatrix(nb_class, test_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Arabic English Japanese Russian
## Arabic 86 210 10 171
## English 0 1 0 0
## Japanese 234 376 145 295
## Russian 0 7 0 8
##
## Overall Statistics
##
## Accuracy : 0.1555
## 95% CI : (0.1378, 0.1746)
## No Information Rate : 0.385
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.023
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: Arabic Class: English Class: Japanese
## Sensitivity 0.26875 0.0016835 0.93548
## Specificity 0.68029 1.0000000 0.34798
## Pos Pred Value 0.18029 1.0000000 0.13810
## Neg Pred Value 0.78049 0.6154345 0.97972
## Prevalence 0.20739 0.3849644 0.10045
## Detection Rate 0.05574 0.0006481 0.09397
## Detection Prevalence 0.30914 0.0006481 0.68049
## Balanced Accuracy 0.47452 0.5008418 0.64173
## Class: Russian
## Sensitivity 0.016878
## Specificity 0.993452
## Pos Pred Value 0.533333
## Neg Pred Value 0.695026
## Prevalence 0.307194
## Detection Rate 0.005185
## Detection Prevalence 0.009721
## Balanced Accuracy 0.505165
cat("\nAccuracy (Decision Tree):\n")
##
## Accuracy (Decision Tree):
print(confusionMatrix(dt_class, test_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Arabic English Japanese Russian
## Arabic 90 24 37 9
## English 197 465 9 90
## Japanese 24 6 103 34
## Russian 9 99 6 341
##
## Overall Statistics
##
## Accuracy : 0.6474
## 95% CI : (0.623, 0.6713)
## No Information Rate : 0.385
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4869
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Arabic Class: English Class: Japanese
## Sensitivity 0.28125 0.7828 0.66452
## Specificity 0.94276 0.6881 0.95389
## Pos Pred Value 0.56250 0.6110 0.61677
## Neg Pred Value 0.83369 0.8350 0.96221
## Prevalence 0.20739 0.3850 0.10045
## Detection Rate 0.05833 0.3014 0.06675
## Detection Prevalence 0.10369 0.4932 0.10823
## Balanced Accuracy 0.61201 0.7355 0.80920
## Class: Russian
## Sensitivity 0.7194
## Specificity 0.8934
## Pos Pred Value 0.7495
## Neg Pred Value 0.8778
## Prevalence 0.3072
## Detection Rate 0.2210
## Detection Prevalence 0.2949
## Balanced Accuracy 0.8064
cat("\nAccuracy (Random Forest):\n")
##
## Accuracy (Random Forest):
print(confusionMatrix(rf_class, test_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Arabic English Japanese Russian
## Arabic 296 55 10 15
## English 16 482 9 82
## Japanese 8 6 128 18
## Russian 0 51 8 359
##
## Overall Statistics
##
## Accuracy : 0.8198
## 95% CI : (0.7997, 0.8387)
## No Information Rate : 0.385
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7458
##
## Mcnemar's Test P-Value : 1.021e-08
##
## Statistics by Class:
##
## Class: Arabic Class: English Class: Japanese
## Sensitivity 0.9250 0.8114 0.82581
## Specificity 0.9346 0.8872 0.97695
## Pos Pred Value 0.7872 0.8183 0.80000
## Neg Pred Value 0.9794 0.8826 0.98048
## Prevalence 0.2074 0.3850 0.10045
## Detection Rate 0.1918 0.3124 0.08296
## Detection Prevalence 0.2437 0.3817 0.10369
## Balanced Accuracy 0.9298 0.8493 0.90138
## Class: Russian
## Sensitivity 0.7574
## Specificity 0.9448
## Pos Pred Value 0.8589
## Neg Pred Value 0.8978
## Prevalence 0.3072
## Detection Rate 0.2327
## Detection Prevalence 0.2709
## Balanced Accuracy 0.8511
cat("\nAccuracy (SVM):\n")
##
## Accuracy (SVM):
print(confusionMatrix(svm_class, test_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Arabic English Japanese Russian
## Arabic 214 53 18 19
## English 88 506 10 105
## Japanese 18 4 113 15
## Russian 0 31 14 335
##
## Overall Statistics
##
## Accuracy : 0.757
## 95% CI : (0.7348, 0.7782)
## No Information Rate : 0.385
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6512
##
## Mcnemar's Test P-Value : 3.141e-13
##
## Statistics by Class:
##
## Class: Arabic Class: English Class: Japanese
## Sensitivity 0.6687 0.8519 0.72903
## Specificity 0.9264 0.7861 0.97334
## Pos Pred Value 0.7039 0.7137 0.75333
## Neg Pred Value 0.9144 0.8945 0.96985
## Prevalence 0.2074 0.3850 0.10045
## Detection Rate 0.1387 0.3279 0.07323
## Detection Prevalence 0.1970 0.4595 0.09721
## Balanced Accuracy 0.7976 0.8190 0.85119
## Class: Russian
## Sensitivity 0.7068
## Specificity 0.9579
## Pos Pred Value 0.8816
## Neg Pred Value 0.8805
## Prevalence 0.3072
## Detection Rate 0.2171
## Detection Prevalence 0.2463
## Balanced Accuracy 0.8323
Naive Bayes reaches an accuracy of only 0.1555, below the no-information rate of 0.385,
so despite its moderate AUC it classifies worse than always predicting the majority class.
The decision tree reaches 0.6474, random forest 0.8198, and SVM 0.757.
Accuracy therefore ranks the models in the same order as the AUC results:
random forest first, then SVM, the decision tree, and naive Bayes.
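The same numbers can be pulled out programmatically, since confusionMatrix stores overall accuracy in its $overall slot (a minimal sketch):
# Collect overall accuracy for each model and sort, best first
preds <- list(NaiveBayes = nb_class, DecisionTree = dt_class,
              RandomForest = rf_class, SVM = svm_class)
acc <- sapply(preds, function(p) confusionMatrix(p, test_labels)$overall["Accuracy"])
sort(acc, decreasing = TRUE)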
Extract the highest predicted probability and its class
get_top_predictions <- function(pred_probs, test_data, model_name) {
  # Highest predicted probability for each observation and its class label
  max_probs <- apply(pred_probs, 1, max)
  predicted_labels <- colnames(pred_probs)[apply(pred_probs, 1, which.max)]
  # Indices of the five most confident predictions
  top_indices <- order(max_probs, decreasing = TRUE)[1:5]
  top_df <- data.frame(
    surname = test_data$surname[top_indices],
    predicted_nationality = predicted_labels[top_indices],
    max_probability = round(max_probs[top_indices], 3),
    model = model_name
  )
  return(top_df)
}
Obtain the predicted probabilities
nb_probs <- predict(nb_model, newdata = test_data, type = "raw")
dt_probs <- predict(dt_model, newdata = test_data, type = "prob")
rf_probs <- predict(rf_model, newdata = test_data, type = "prob")
svm_probs <- attr(predict(svm_model, newdata = test_data, probability = TRUE), "probabilities")
Extract the top five predictions per model
nb_top5 <- get_top_predictions(nb_probs, test_data, "Naive Bayes")
dt_top5 <- get_top_predictions(dt_probs, test_data, "Decision Tree")
rf_top5 <- get_top_predictions(rf_probs, test_data, "Random Forest")
svm_top5 <- get_top_predictions(svm_probs, test_data, "SVM")
Combine and print the results
top5_all <- rbind(nb_top5, dt_top5, rf_top5, svm_top5)
print(top5_all)
## surname predicted_nationality max_probability model
## 1 Jeffries Arabic 1.000 Naive Bayes
## 2 Mushanaokoji Japanese 1.000 Naive Bayes
## 3 Zherebko Japanese 1.000 Naive Bayes
## 4 Tuma Japanese 1.000 Naive Bayes
## 5 Miyahara Japanese 1.000 Naive Bayes
## 68 Abukhov Russian 1.000 Decision Tree
## 85 Davletkildeev Russian 1.000 Decision Tree
## 250 Jupikov Russian 1.000 Decision Tree
## 477 Zheltouhov Russian 1.000 Decision Tree
## 482 Pribylov Russian 1.000 Decision Tree
## 851 Davletkildeev Russian 1.000 Random Forest
## 2501 Jupikov Russian 1.000 Random Forest
## 4771 Zheltouhov Russian 1.000 Random Forest
## 496 Mnatsakanov Russian 1.000 Random Forest
## 583 Haziahmetov Russian 1.000 Random Forest
## 4772 Zheltouhov Russian 0.997 SVM
## 10240 Zhimailov Russian 0.995 SVM
## 7757 Valiakhmetov Russian 0.995 SVM
## 1242 Valentinov Russian 0.995 SVM
## 6909 Vaskovtsev Russian 0.995 SVM
Note that naive Bayes assigns maximal confidence even to likely misclassifications
(e.g., the apparently Russian-looking Zherebko predicted as Japanese), consistent
with its low accuracy. Turning to the letter patterns themselves:
Japanese surnames tend to begin with letters such as O, T, and K and to end
softly in vowels such as a, o, and i.
Arabic surnames often begin with letters such as A, R, S, and K and tend to
end in y, l, or i.
English surnames often begin with W, L, B, or S and end in letters such as
d, n, y, and e, but their forms are so varied that they are the hardest to predict.
Russian surnames show distinctive suffix patterns such as -ov, -in, and -a at
the end of the name and often finish in hard consonants, which makes them
comparatively easy to predict.
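These letter patterns can be checked against the data itself, for example by tabulating the most common last letters within each nationality (a minimal sketch):
# Five most frequent last letters per nationality, as proportions
for (nat in levels(df_filtered$nationality)) {
  cat(nat, "\n")
  tab <- prop.table(table(df_filtered$last_letter[df_filtered$nationality == nat]))
  print(round(sort(tab, decreasing = TRUE)[1:5], 3))
}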