Data

# Import the class roster from the Excel workbook, reading only cells B1:H618.
class_roll <- read_excel(
  path = "./data/class_roll_260309.xlsx",
  range = "B1:H618"
)
# Inspect the column names and types of the imported table.
str(class_roll)
## tibble [617 × 7] (S3: tbl_df/tbl/data.frame)
##  $ 학과(전공) : chr [1:617] "경영학과" "법학과" "빅데이터전공" "철학전공" ...
##  $ 단과대학   : chr [1:617] "경영대학" "사회과학대학" "정보과학대학" "인문대학" ...
##  $ 학번       : chr [1:617] "20142940" "20172707" "20185278" "20191116" ...
##  $ 이름       : chr [1:617] "한화윤" "김민규" "이은호" "윤소은" ...
##  $ 역할       : chr [1:617] "학생" "학생" "학생" "학생" ...
##  $ 이메일 주소: chr [1:617] "yoonbly1004@gmail.com" "coc5006@naver.com" "eunho9932@gmail.com" "yse2sy@gmail.com" ...
##  $ 휴대 전화  : chr [1:617] "01030622943" "01067991443" "01090184407" "01043177405" ...
# Swap the original column headers for short ASCII names, in column order.
class_roll <- setNames(
  class_roll,
  c("dept", "college", "id", "name", "status", "email", "cell_no")
)
# Confirm the renamed columns.
str(class_roll)
## tibble [617 × 7] (S3: tbl_df/tbl/data.frame)
##  $ dept   : chr [1:617] "경영학과" "법학과" "빅데이터전공" "철학전공" ...
##  $ college: chr [1:617] "경영대학" "사회과학대학" "정보과학대학" "인문대학" ...
##  $ id     : chr [1:617] "20142940" "20172707" "20185278" "20191116" ...
##  $ name   : chr [1:617] "한화윤" "김민규" "이은호" "윤소은" ...
##  $ status : chr [1:617] "학생" "학생" "학생" "학생" ...
##  $ email  : chr [1:617] "yoonbly1004@gmail.com" "coc5006@naver.com" "eunho9932@gmail.com" "yse2sy@gmail.com" ...
##  $ cell_no: chr [1:617] "01030622943" "01067991443" "01090184407" "01043177405" ...
# saveRDS(class_roll, file = "class_roll_250905.RDS")

Helper Functions

# Chi-square statistic for the cross-tabulation of `data$group` against a
# grouping vector.
#   data      : object with a `group` column
#   group_var : vector of category labels, cross-tabulated with data$group
# Returns the test statistic as a bare (unnamed) numeric value.
calculate_chi_square <- function(data, group_var) {
  crosstab <- table(data$group, group_var)
  test_result <- chisq.test(crosstab, simulate.p.value = FALSE)
  # The first component of the test result is the statistic; drop its name.
  unname(test_result[[1]])
}

# Chi-square statistic for group vs. student-ID prefix category.
# The first four characters of `data$id` are used as the category
# (presumably the entry year -- confirm); prefixes that are numerically
# 2020 or lower are pooled under the single label "20".
calc_id_chi_square <- function(data) {
  id_prefix <- substr(data$id, 1, 4)
  data$id_2 <- ifelse(as.numeric(id_prefix) <= 2020, "20", id_prefix)
  calculate_chi_square(data, data$id_2)
}

# Chi-square statistic for group vs. e-mail service provider.
#   data : object with a character `email` column and a `group` column
# The provider is the text between "@" and the first "." of the address,
# compared case-insensitively (domain names are not case-sensitive).
# Providers other than naver and gmail, and addresses without a usable
# domain, are pooled into the third level.
calc_email_chi_square <- function(data) {
  # Part after "@"; NA when the address has no "@".
  domain <- vapply(
    strsplit(data$email, "@", fixed = TRUE),
    function(parts) parts[2],
    character(1)
  )
  # Part of the domain before its first dot, normalised to lower case so
  # that e.g. "Gmail.com" is counted as gmail.
  isp <- vapply(
    strsplit(domain, "[.]", fixed = FALSE),
    function(parts) parts[1],
    character(1)
  )
  isp <- tolower(isp)

  # Group into naver, gmail, and everything else.
  email_group <- factor(
    ifelse(isp %in% c("naver", "gmail"), isp, "기타서비스"),
    levels = c("naver", "gmail", "기타서비스")
  )

  calculate_chi_square(data, email_group)
}

# Chi-square statistic for group vs. the thousand-wide bin of characters
# 8-11 of the phone number.
#   data : object with a character `cell_no` column and a `group` column
# Bins are labelled "0000~0999", "1000~1999", ..., "9000~9999".
calc_phone_chi_square <- function(data) {
  cut_label <- paste(paste0(0:9, "000"), paste0(0:9, "999"), sep = "~")

  last_four <- as.numeric(substr(data$cell_no, start = 8, stop = 11))

  # right = FALSE makes the bins [0, 1000), [1000, 2000), ... so they match
  # the labels; with cut()'s default right-closed bins, 0 would become NA
  # and 1000 would be filed under "0000~0999".
  phone_group <- cut(
    last_four,
    breaks = seq(0, 10000, by = 1000),
    labels = cut_label,
    right = FALSE
  )

  calculate_chi_square(data, phone_group)
}

# Chi-square statistic for group vs. surname category.
# The category is the first character of `data$name`; the five surnames
# listed below keep their own level and every other value is pooled into
# the last level.
calc_lastname_chi_square <- function(data) {
  major_surnames <- c("김", "이", "박", "최", "정")
  first_char <- substring(data$name, first = 1, last = 1)

  is_major <- first_char %in% major_surnames
  surname_group <- factor(
    ifelse(is_major, first_char, "기타"),
    levels = c(major_surnames, "기타")
  )

  calculate_chi_square(data, surname_group)
}

# Chi-square statistic for group vs. college, using `data$college` as is.
calc_college_chi_square <- function(data) {
  college_labels <- data$college
  calculate_chi_square(data = data, group_var = college_labels)
}

# Randomly split a roster into "Red" and "Black" groups for seed `k` and
# measure how balanced the split is across five categorical variables.
#   k    : value passed to set.seed(); note this resets the session's RNG state
#   data : roster to split; defaults to the global `class_roll`, so existing
#          calls red_and_black(k) behave as before. It must provide the
#          columns read by the calc_*_chi_square helpers.
# Returns a list with Xsum (sum of the five chi-square statistics) and the
# individual statistics X1 (id), X2 (email), X3 (phone), X4 (surname),
# X5 (college).
red_and_black <- function(k, data = class_roll) {
  set.seed(k)
  n_rows <- nrow(data)
  # Shuffle the row indices and split by parity: remainder 0 is "Red",
  # remainder 1 is "Black". seq_len() stays correct for a zero-row roster.
  data$group <- factor(
    sample(seq_len(n_rows)) %% 2,
    levels = c(0, 1),
    labels = c("Red", "Black")
  )

  # Calculate Chi-square for each category
  X1 <- calc_id_chi_square(data)
  X2 <- calc_email_chi_square(data)
  X3 <- calc_phone_chi_square(data)
  X4 <- calc_lastname_chi_square(data)
  X5 <- calc_college_chi_square(data)

  # Sum of all Chi-square statistics
  Xsum <- X1 + X2 + X3 + X4 + X5

  # Return the sum and individual components (for detailed analysis)
  list(Xsum = Xsum, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5)
}

Search for Best Configuration

#> Windows version
library(parallel)
M1 <- 1
M2 <- 1000

# Create a cluster on all but one of the detected cores. detectCores() can
# return 1 or NA, so fall back to a single worker in that case.
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))

# Run the search inside tryCatch(finally = ) so the worker processes are
# always shut down, even when setup or one of the tasks fails.
results <- tryCatch(
  {
    # Load the required packages on every node
    clusterEvalQ(cl, {
      library(dplyr)
      library(magrittr)
    })

    # Ship the data and all functions the tasks need to every node
    clusterExport(cl, c("class_roll", "calculate_chi_square",
                        "calc_id_chi_square", "calc_email_chi_square",
                        "calc_phone_chi_square", "calc_lastname_chi_square",
                        "calc_college_chi_square", "red_and_black"))

    # Evaluate red_and_black() for every seed in M1:M2
    parLapply(cl, M1:M2, red_and_black)
  },
  finally = stopCluster(cl)
)

# Extract Xsum from the results (vapply guarantees a numeric vector)
Xsum <- vapply(results, function(res) res$Xsum, numeric(1))

# Xsum_list <- sapply(M1:M2, red_and_black)
# Xsum <- Xsum_list$Xsum
# Label each total with its seed, then keep the seed(s) with the smallest total
names(Xsum) <- M1:M2
Xmin <- names(Xsum[which(Xsum == min(Xsum))])
Xmin
#> Mac or Linux Version
library(parallel)
M1 <- 1
M2 <- 1000000

# Use all but one of the detected cores. mclapply() requires mc.cores >= 1
# and detectCores() can return 1 or NA, so never go below one.
num_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)

# Parallelized version of running red_and_black over multiple iterations
results <- mclapply(M1:M2, red_and_black, mc.cores = num_cores)

# mclapply() stores a failed task as a "try-error" object instead of
# stopping; report that clearly rather than failing later on `res$Xsum`.
failed <- vapply(results, inherits, logical(1), what = "try-error")
if (any(failed)) {
  stop(sum(failed), " of ", length(results),
       " red_and_black() runs failed; first error: ",
       as.character(results[[which(failed)[1]]]),
       call. = FALSE)
}

# Extract Xsum from the results (vapply guarantees a numeric vector)
Xsum <- vapply(results, function(res) res$Xsum, numeric(1))

# Xsum_list <- sapply(M1:M2, red_and_black)
# Xsum <- Xsum_list$Xsum
# Label each total with its seed, then keep the seed(s) with the smallest total
names(Xsum) <- M1:M2
Xmin <- names(Xsum[which(Xsum == min(Xsum))])
data.frame("Xmin" = Xmin) %>%
  pander
Xmin
90316

–>

Summary

기초통계값

Min. 1st Qu. Median Mean 3rd Qu. Max.
7.1 26.36 31.38 32.05 37.01 85.15
SD
8

최소값을 주는 초기값

Xmin
90316

Plot

Report

학번

  2020 이전 2021 2022 2023 2024 2025 2026
Red 16 25 16 36 43 152 20
Black 17 24 19 39 45 147 18
Chi-square Statistic and P-value
Statistic P-value
0.66 0.9953

e-mail 서비스업체

  네이버 구글 기타서비스
Red 185 112 11
Black 171 123 15
Chi-square Statistic and P-value
Statistic P-value
1.68 0.4319

전화번호의 분포

  0000~0999 1000~1999 2000~2999 3000~3999 4000~4999 5000~5999 6000~6999 7000~7999 8000~8999 9000~9999
Red 32 35 28 29 37 23 36 34 27 27
Black 37 36 27 24 36 29 34 27 32 27
Chi-square Statistic and P-value
Statistic P-value
2.85 0.9698

성씨 분포

  김 이 박 최 정 기타
Red 65 45 24 17 11 146
Black 73 41 25 16 13 141
Chi-square Statistic and P-value
Statistic P-value
0.95 0.9663

단과대학

  간호 경영 글로벌융합 미디어 미래융합 반도체/디스플레이 사회과학 의과 인문 자연과학 정보과학
Red 8 41 6 25 6 5 47 42 32 45 51
Black 8 42 7 25 6 8 45 41 30 44 53
Chi-square Statistic and P-value
Statistic P-value
0.95 0.9665