성으로 국적 알아내기

필요한 패키지 로드

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(e1071)
library(rpart)
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(nnet)

데이터 불러오기

# Read the surname table from a CSV file on the author's machine.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
surnames_csv <- "C:/Users/chosun/Downloads/surnames.csv"
df <- read.csv(file = surnames_csv)

상위 4개 국적만 추출

# Keep only the rows belonging to the four most frequent nationalities.
# head() is used instead of [1:4]: with fewer than four distinct
# nationalities, [1:4] pads the selection with NA, and `%in%` would then
# also keep rows whose nationality is NA.
nationality_counts <- sort(table(df$nationality), decreasing = TRUE)
top_nationalities <- head(names(nationality_counts), 4)
df_filtered <- df[df$nationality %in% top_nationalities, ]

상위 4개 국적은 English, Russian, Arabic, Japanese이다.

factor 변환

# Re-encode the target column as a factor built from the filtered rows only.
df_filtered[["nationality"]] <- factor(df_filtered[["nationality"]])

이름 길이

# Number of characters in each surname.
surname_chr <- as.character(df_filtered$surname)
df_filtered$name_length <- nchar(surname_chr)

첫 글자, 마지막 글자 추출

# Extract the first and the last character of every surname.
# The character count is computed once and reused as both the start and the
# stop position for the last letter.
surname_text <- as.character(df_filtered$surname)
surname_len <- nchar(surname_text)
df_filtered$first_letter <- substr(surname_text, 1, 1)
df_filtered$last_letter <- substr(surname_text, surname_len, surname_len)

문자형 factor로 변환

# Convert both letter columns from character to factor.
letter_cols <- c("first_letter", "last_letter")
df_filtered[letter_cols] <- lapply(df_filtered[letter_cols], factor)

one-hot encoding

# One-hot encode the first and last letter factors with caret::dummyVars.
# The encoder is built from the full filtered data set, i.e. before the
# train/test split performed further down.
library(caret)
dummy_vars <- dummyVars(
  formula = ~ first_letter + last_letter,
  data = df_filtered
)
letter_matrix <- predict(dummy_vars, newdata = df_filtered)
dummy_df <- data.frame(letter_matrix)

one-hot 인코딩된 변수 추가

# Append the one-hot columns to the filtered data, column-wise.
# check.names = FALSE leaves the column names untouched, as cbind() does for
# data frames.
df_final <- data.frame(df_filtered, dummy_df, check.names = FALSE)

데이터 분할

# 80/20 train/test split on nationality, with a fixed seed for repeatability.
set.seed(123)
train_index <- createDataPartition(
  y = df_final$nationality,
  p = 0.8,
  list = FALSE
)
# createDataPartition(list = FALSE) returns a one-column matrix; flatten it
# to a plain vector before using it as a row selector.
train_rows <- as.vector(train_index)
train_data <- df_final[train_rows, ]
test_data <- df_final[-train_rows, ]

레이블 저장

# Keep the true class labels of both partitions as standalone vectors.
train_labels <- train_data[["nationality"]]
test_labels <- test_data[["nationality"]]

Naive Bayes

# Naive Bayes: fit on every column except the raw surname and the two letter
# factors (their one-hot columns are used instead), then predict class
# probabilities ("raw") and hard class labels for the test rows.
# NOTE(review): the one-hot columns are numeric 0/1; e1071::naiveBayes
# presumably fits a Gaussian to each numeric predictor, which may be a poor
# fit for such columns -- confirm against the e1071 documentation.
nb_excluded <- c("surname", "first_letter", "last_letter")
nb_train <- train_data[, !(names(train_data) %in% nb_excluded)]
nb_model <- naiveBayes(nationality ~ ., data = nb_train)
nb_pred <- predict(nb_model, newdata = test_data, type = "raw")
nb_class <- predict(nb_model, newdata = test_data)

Decision Tree

# Decision tree (rpart): same predictor set as the other models, i.e. all
# columns except surname, first_letter and last_letter.
dt_excluded <- c("surname", "first_letter", "last_letter")
dt_keep <- is.na(match(names(train_data), dt_excluded))
dt_train <- train_data[, dt_keep]
dt_model <- rpart(nationality ~ ., data = dt_train)
# Class-probability matrix and hard class labels for the test rows.
dt_pred <- predict(dt_model, newdata = test_data, type = "prob")
dt_class <- predict(dt_model, newdata = test_data, type = "class")

Random Forest

# Random forest: trained on all columns except surname, first_letter and
# last_letter, with randomForest's default settings.
rf_excluded <- c("surname", "first_letter", "last_letter")
rf_train <- train_data[, !(names(train_data) %in% rf_excluded)]
rf_model <- randomForest(nationality ~ ., data = rf_train)
# Class-probability matrix and hard class labels for the test rows.
rf_pred <- predict(rf_model, newdata = test_data, type = "prob")
rf_class <- predict(rf_model, newdata = test_data)

SVM

# SVM (e1071): trained on all columns except surname, first_letter and
# last_letter. probability = TRUE is required so that predict() can return
# class probabilities later.
svm_excluded <- c("surname", "first_letter", "last_letter")
svm_train <- train_data[, !(names(train_data) %in% svm_excluded)]
svm_model <- svm(nationality ~ ., data = svm_train, probability = TRUE)

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'last_letter.D' constant. Cannot scale data.

# The probability matrix is attached to the prediction as the
# "probabilities" attribute; the hard class labels come from a plain predict().
svm_prob_pred <- predict(svm_model, newdata = test_data, probability = TRUE)
svm_pred <- attr(svm_prob_pred, "probabilities")
svm_class <- predict(svm_model, newdata = test_data)

ROC/AUC 계산

# Multiclass ROC objects (pROC) for each model, built from the true test
# labels and the corresponding class-probability matrix.
nb_roc <- multiclass.roc(response = test_labels, predictor = nb_pred)
dt_roc <- multiclass.roc(response = test_labels, predictor = dt_pred)
rf_roc <- multiclass.roc(response = test_labels, predictor = rf_pred)
svm_roc <- multiclass.roc(response = test_labels, predictor = svm_pred)

AUC 출력

# Print the multiclass AUC of the Naive Bayes model.
nb_auc <- auc(nb_roc)
cat("AUC (Naive Bayes):", nb_auc, "\n")

## AUC (Naive Bayes): 0.7488466

# Print the multiclass AUC of the decision tree model.
dt_auc <- auc(dt_roc)
cat("AUC (Decision Tree):", dt_auc, "\n")

## AUC (Decision Tree): 0.8422539

# Print the multiclass AUC of the random forest model.
rf_auc <- auc(rf_roc)
cat("AUC (Random Forest):", rf_auc, "\n")

## AUC (Random Forest): 0.9592929

# Print the multiclass AUC of the SVM model.
svm_auc <- auc(svm_roc)
cat("AUC (SVM):", svm_auc, "\n")

## AUC (SVM): 0.9314125

랜덤포레스트의 AUC가 0.9593으로 가장 우수한 것으로 나타났다. 그다음으로는 SVM(0.9314), 의사결정나무(0.8423), 나이브베이즈(0.7488) 순으로 성능이 우수하다.

정확도 평가

# Section header for the Naive Bayes confusion matrix.
cat("\nAccuracy (Naive Bayes):\n", sep = "")

## 
## Accuracy (Naive Bayes):

# Confusion matrix and accuracy statistics: Naive Bayes vs. true test labels.
nb_cm <- confusionMatrix(data = nb_class, reference = test_labels)
print(nb_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Arabic English Japanese Russian
##   Arabic       86     210       10     171
##   English       0       1        0       0
##   Japanese    234     376      145     295
##   Russian       0       7        0       8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.1555          
##                  95% CI : (0.1378, 0.1746)
##     No Information Rate : 0.385           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.023           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: Arabic Class: English Class: Japanese
## Sensitivity                0.26875      0.0016835         0.93548
## Specificity                0.68029      1.0000000         0.34798
## Pos Pred Value             0.18029      1.0000000         0.13810
## Neg Pred Value             0.78049      0.6154345         0.97972
## Prevalence                 0.20739      0.3849644         0.10045
## Detection Rate             0.05574      0.0006481         0.09397
## Detection Prevalence       0.30914      0.0006481         0.68049
## Balanced Accuracy          0.47452      0.5008418         0.64173
##                      Class: Russian
## Sensitivity                0.016878
## Specificity                0.993452
## Pos Pred Value             0.533333
## Neg Pred Value             0.695026
## Prevalence                 0.307194
## Detection Rate             0.005185
## Detection Prevalence       0.009721
## Balanced Accuracy          0.505165

# Section header for the decision tree confusion matrix.
cat("\nAccuracy (Decision Tree):\n", sep = "")

## 
## Accuracy (Decision Tree):

# Confusion matrix and accuracy statistics: decision tree vs. true test labels.
dt_cm <- confusionMatrix(data = dt_class, reference = test_labels)
print(dt_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Arabic English Japanese Russian
##   Arabic       90      24       37       9
##   English     197     465        9      90
##   Japanese     24       6      103      34
##   Russian       9      99        6     341
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6474         
##                  95% CI : (0.623, 0.6713)
##     No Information Rate : 0.385          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.4869         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Arabic Class: English Class: Japanese
## Sensitivity                0.28125         0.7828         0.66452
## Specificity                0.94276         0.6881         0.95389
## Pos Pred Value             0.56250         0.6110         0.61677
## Neg Pred Value             0.83369         0.8350         0.96221
## Prevalence                 0.20739         0.3850         0.10045
## Detection Rate             0.05833         0.3014         0.06675
## Detection Prevalence       0.10369         0.4932         0.10823
## Balanced Accuracy          0.61201         0.7355         0.80920
##                      Class: Russian
## Sensitivity                  0.7194
## Specificity                  0.8934
## Pos Pred Value               0.7495
## Neg Pred Value               0.8778
## Prevalence                   0.3072
## Detection Rate               0.2210
## Detection Prevalence         0.2949
## Balanced Accuracy            0.8064

# Section header for the random forest confusion matrix.
cat("\nAccuracy (Random Forest):\n", sep = "")

## 
## Accuracy (Random Forest):

# Confusion matrix and accuracy statistics: random forest vs. true test labels.
rf_cm <- confusionMatrix(data = rf_class, reference = test_labels)
print(rf_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Arabic English Japanese Russian
##   Arabic      296      55       10      15
##   English      16     482        9      82
##   Japanese      8       6      128      18
##   Russian       0      51        8     359
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8198          
##                  95% CI : (0.7997, 0.8387)
##     No Information Rate : 0.385           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7458          
##                                           
##  Mcnemar's Test P-Value : 1.021e-08       
## 
## Statistics by Class:
## 
##                      Class: Arabic Class: English Class: Japanese
## Sensitivity                 0.9250         0.8114         0.82581
## Specificity                 0.9346         0.8872         0.97695
## Pos Pred Value              0.7872         0.8183         0.80000
## Neg Pred Value              0.9794         0.8826         0.98048
## Prevalence                  0.2074         0.3850         0.10045
## Detection Rate              0.1918         0.3124         0.08296
## Detection Prevalence        0.2437         0.3817         0.10369
## Balanced Accuracy           0.9298         0.8493         0.90138
##                      Class: Russian
## Sensitivity                  0.7574
## Specificity                  0.9448
## Pos Pred Value               0.8589
## Neg Pred Value               0.8978
## Prevalence                   0.3072
## Detection Rate               0.2327
## Detection Prevalence         0.2709
## Balanced Accuracy            0.8511

# Section header for the SVM confusion matrix.
cat("\nAccuracy (SVM):\n", sep = "")

## 
## Accuracy (SVM):

# Confusion matrix and accuracy statistics: SVM vs. true test labels.
svm_cm <- confusionMatrix(data = svm_class, reference = test_labels)
print(svm_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Arabic English Japanese Russian
##   Arabic      214      53       18      19
##   English      88     506       10     105
##   Japanese     18       4      113      15
##   Russian       0      31       14     335
## 
## Overall Statistics
##                                           
##                Accuracy : 0.757           
##                  95% CI : (0.7348, 0.7782)
##     No Information Rate : 0.385           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6512          
##                                           
##  Mcnemar's Test P-Value : 3.141e-13       
## 
## Statistics by Class:
## 
##                      Class: Arabic Class: English Class: Japanese
## Sensitivity                 0.6687         0.8519         0.72903
## Specificity                 0.9264         0.7861         0.97334
## Pos Pred Value              0.7039         0.7137         0.75333
## Neg Pred Value              0.9144         0.8945         0.96985
## Prevalence                  0.2074         0.3850         0.10045
## Detection Rate              0.1387         0.3279         0.07323
## Detection Prevalence        0.1970         0.4595         0.09721
## Balanced Accuracy           0.7976         0.8190         0.85119
##                      Class: Russian
## Sensitivity                  0.7068
## Specificity                  0.9579
## Pos Pred Value               0.8816
## Neg Pred Value               0.8805
## Prevalence                   0.3072
## Detection Rate               0.2171
## Detection Prevalence         0.2463
## Balanced Accuracy            0.8323

나이브베이즈 정확도는 0.156이고, 의사결정나무 정확도는 0.647, 랜덤포레스트 정확도는 0.820, SVM 정확도는 0.757이다.

정확도 역시 랜덤 포레스트가 가장 높게 나왔고 그 다음으로 SVM, 의사결정나무, 나이브베이즈 순으로 AUC 결과와 동일한 순서이다.

가장 높은 확률과 해당 클래스 추출

# Return the test rows whose top class probability is highest.
#
# Args:
#   pred_probs: matrix-like of class probabilities, one row per test row and
#     one named column per class.
#   test_data: data frame with a `surname` column, row-aligned with
#     `pred_probs`.
#   model_name: label stored in the `model` column of the result.
#   n: maximum number of rows to return (default 5, the original behaviour).
#
# Returns:
#   A data frame with the columns surname, predicted_nationality,
#   max_probability (rounded to 3 decimals) and model, ordered by decreasing
#   max_probability. It has at most `n` rows.
get_top_predictions <- function(pred_probs, test_data, model_name, n = 5) {
  stopifnot(is.numeric(n), length(n) == 1, !is.na(n), n >= 0)

  max_probs <- apply(pred_probs, 1, max)
  predicted_labels <- colnames(pred_probs)[apply(pred_probs, 1, which.max)]
  # head() caps the selection at the number of available rows. Indexing with
  # [1:5] padded short inputs with NA and produced all-NA result rows.
  top_indices <- head(order(max_probs, decreasing = TRUE), n)

  data.frame(
    surname = test_data$surname[top_indices],
    predicted_nationality = predicted_labels[top_indices],
    max_probability = round(max_probs[top_indices], 3),
    model = rep(model_name, length(top_indices))
  )
}

예측 확률 얻기

# Class-probability matrices for the top-5 table.
# These are exactly the matrices already computed for the ROC analysis
# (nb_pred, dt_pred, rf_pred, svm_pred); they are reused instead of running
# the four predictions a second time, which also guarantees that the table is
# derived from the same probabilities as the AUC values.
nb_probs <- nb_pred
dt_probs <- dt_pred
rf_probs <- rf_pred
svm_probs <- svm_pred

상위 5개씩 추출

# Five most confident test predictions for each model.
nb_top5 <- get_top_predictions(
  pred_probs = nb_probs, test_data = test_data, model_name = "Naive Bayes"
)
dt_top5 <- get_top_predictions(
  pred_probs = dt_probs, test_data = test_data, model_name = "Decision Tree"
)
rf_top5 <- get_top_predictions(
  pred_probs = rf_probs, test_data = test_data, model_name = "Random Forest"
)
svm_top5 <- get_top_predictions(
  pred_probs = svm_probs, test_data = test_data, model_name = "SVM"
)

결과 통합 및 출력

# Stack the four per-model tables (in model order) and print the result.
top5_tables <- list(nb_top5, dt_top5, rf_top5, svm_top5)
top5_all <- do.call(rbind, top5_tables)
print(top5_all)

##             surname predicted_nationality max_probability         model
## 1          Jeffries                Arabic           1.000   Naive Bayes
## 2      Mushanaokoji              Japanese           1.000   Naive Bayes
## 3          Zherebko              Japanese           1.000   Naive Bayes
## 4              Tuma              Japanese           1.000   Naive Bayes
## 5          Miyahara              Japanese           1.000   Naive Bayes
## 68          Abukhov               Russian           1.000 Decision Tree
## 85    Davletkildeev               Russian           1.000 Decision Tree
## 250         Jupikov               Russian           1.000 Decision Tree
## 477      Zheltouhov               Russian           1.000 Decision Tree
## 482        Pribylov               Russian           1.000 Decision Tree
## 851   Davletkildeev               Russian           1.000 Random Forest
## 2501        Jupikov               Russian           1.000 Random Forest
## 4771     Zheltouhov               Russian           1.000 Random Forest
## 496     Mnatsakanov               Russian           1.000 Random Forest
## 583     Haziahmetov               Russian           1.000 Random Forest
## 4772     Zheltouhov               Russian           0.997           SVM
## 10240     Zhimailov               Russian           0.995           SVM
## 7757   Valiakhmetov               Russian           0.995           SVM
## 1242     Valentinov               Russian           0.995           SVM
## 6909     Vaskovtsev               Russian           0.995           SVM

성으로 국적 알아내기

류가은

2025-05-31

필요한 패키지 로드

데이터 불러오기

상위 4개 국적만 추출

상위 4개 국적은 English, Russian, Arabic, Japanese이다.

factor 변환

이름 길이

첫 글자, 마지막 글자 추출

문자형 factor로 변환

one-hot encoding

one-hot 인코딩된 변수 추가

데이터 분할

레이블 저장

Naive Bayes

Decision Tree

Random Forest

SVM

ROC/AUC 계산

AUC 출력

랜덤포레스트의 AUC가 0.9593으로 가장 우수한 것으로 나타났다. 그다음으로는 SVM(0.9314), 의사결정나무(0.8423), 나이브베이즈(0.7488) 순으로 성능이 우수하다.

정확도 평가

나이브베이즈 정확도는 0.156이고, 의사결정나무 정확도는 0.647, 랜덤포레스트 정확도는 0.820, SVM 정확도는 0.757이다.

정확도 역시 랜덤 포레스트가 가장 높게 나왔고 그 다음으로 SVM, 의사결정나무, 나이브베이즈 순으로 AUC 결과와 동일한 순서이다.

가장 높은 확률과 해당 클래스 추출

예측 확률 얻기

상위 5개씩 추출

결과 통합 및 출력

일본 성은 첫 글자가 O, T, K와 같은 글자로 시작하고 끝 글자는 a, o, i 같은 모음으로 부드럽게 끝나는 특징이 있다.

아랍 성은 첫 글자가 A, R, S, K와 같은 글자로 시작하고 끝 글자는 y, l, i로 끝나는 경향이 많다.

영어권 성은 W, L, B, S로 시작하고 d, n, y, e와 같은 글자로 끝나는 경향이 많지만 형태가 다양하여 예측 난이도가 높다.

러시아 성은 끝부분에 -ov, -in, -a와 같은 접미사 패턴이 뚜렷하게 나타나며 강한 자음으로 끝나는 경향이 있어 예측하기 쉽다.