Data

# Load the class roster from the Excel workbook. The requested range B1:H611
# spans one header row plus 610 data rows across seven columns.
class_roll <- read_excel(
  "./data/class_roll_250225.xlsx",
  range = "B1:H611"
)
# Inspect the imported structure (the sheet's headers are in Korean).
str(class_roll)

# Swap the sheet headers for short English names, in sheet order:
# department (major), college, student ID, name, enrollment role,
# e-mail address, mobile phone number.
names(class_roll) <- c(
  "dept", "college", "id", "name", "status", "email", "cell_no"
)
# Confirm the rename took effect.
str(class_roll)

# Persist the renamed roster as an RDS file in the working directory.
saveRDS(class_roll, file = "class_roll_250225.RDS")

Helper Functions

# Chi-square statistic for the cross-tabulation of `data$group` against a
# categorical variable.
#
# data:      data frame that carries a `group` column.
# group_var: vector of category labels, one entry per row of `data`.
# Returns the test statistic as a plain number with its name stripped.
calculate_chi_square <- function(data, group_var) {
  crosstab <- table(data$group, group_var)
  # Asymptotic test requested explicitly (no Monte Carlo simulation).
  test_result <- chisq.test(crosstab, simulate.p.value = FALSE)
  # Element 1 of the result returned by chisq.test() is the statistic.
  unname(test_result[[1]])
}

# Chi-square statistic of group versus the year prefix of the student ID.
#
# data: data frame with `id` (character) and `group` columns.
# Returns the statistic produced by calculate_chi_square().
calc_id_chi_square <- function(data) {
  # The first four characters of the ID are read as a year
  # (presumably the admission year -- confirm against the roster).
  year_chr <- substr(data$id, 1, 4)
  year_num <- as.numeric(year_chr)
  # Years up to 2018 and from 2025 onward are pooled into one category;
  # every other year keeps its own four-character label.
  is_pooled <- year_num <= 2018 | year_num >= 2025
  data$id_2 <- ifelse(is_pooled, "18 or 25", year_chr)
  calculate_chi_square(data, data$id_2)
}

# Chi-square statistic of group versus e-mail service provider.
#
# data: data frame with `email` (character) and `group` columns.
# Returns the statistic produced by calculate_chi_square().
calc_email_chi_square <- function(data) {
  # Second piece after splitting on "@" -- the domain for a well-formed address.
  address_parts <- strsplit(data$email, "@", fixed = TRUE)
  domain <- sapply(address_parts, function(piece) piece[2])

  # Provider name = the domain up to its first dot.
  domain_parts <- strsplit(domain, "[.]", fixed = FALSE)
  isp <- sapply(domain_parts, function(piece) piece[1])

  # Keep naver and gmail as their own categories; pool everything else.
  is_major <- isp %in% c("naver", "gmail")
  email_group <- factor(
    ifelse(is_major, isp, "기타서비스"),
    levels = c("naver", "gmail", "기타서비스")
  )

  calculate_chi_square(data, email_group)
}

# Chi-square statistic of group versus a 10-bin grouping of phone digits.
#
# data: data frame with `cell_no` (character) and `group` columns.
#       Characters 8-11 of `cell_no` are used; for the 11-digit numbers in
#       the roster these are the last four digits (assumes no separators --
#       confirm if the input format changes).
# Returns the statistic produced by calculate_chi_square().
calc_phone_chi_square <- function(data) {
  # Labels "0000~0999", "1000~1999", ..., "9000~9999".
  cut_label <- paste(paste0(0:9, "000"), paste0(0:9, "999"), sep = "~")

  last_four <- as.numeric(substr(data$cell_no, start = 8, stop = 11))

  # right = FALSE makes the bins [0, 1000), [1000, 2000), ..., so they agree
  # with the labels. With cut()'s default right-closed bins, 0000 would turn
  # into NA and 1000, 2000, ... would fall into the bin below their label.
  phone_group <- cut(
    last_four,
    breaks = seq(0, 10000, by = 1000),
    labels = cut_label,
    right = FALSE
  )

  calculate_chi_square(data, phone_group)
}

# Chi-square statistic of group versus family name.
#
# data: data frame with `name` (character) and `group` columns.
# Returns the statistic produced by calculate_chi_square().
calc_lastname_chi_square <- function(data) {
  # The first character of the full name is taken as the family name.
  f_name <- substring(data$name, first = 1, last = 1)

  # Five listed surnames keep their own category; all others are pooled.
  is_listed <- f_name %in% c("김", "이", "박", "최", "정")
  last_name_group <- factor(
    ifelse(is_listed, f_name, "기타"),
    levels = c("김", "이", "박", "최", "정", "기타")
  )

  calculate_chi_square(data, last_name_group)
}

# Chi-square statistic of group versus college.
#
# data: data frame with `college` and `group` columns.
# Returns the statistic produced by calculate_chi_square(); the college
# labels are used as-is, with no regrouping.
calc_college_chi_square <- function(data) {
  college_labels <- data$college
  calculate_chi_square(data = data, group_var = college_labels)
}

# Split the roster into "Red" and "Black" groups using seed `k` and score
# the split by five chi-square statistics.
#
# k: value passed to set.seed() before the random permutation is drawn.
# Reads `class_roll` from the enclosing environment; only a local copy is
# modified. Note that set.seed() changes the session's RNG state.
# Returns a list: Xsum (sum of the five statistics) followed by X1..X5
# (student ID, e-mail provider, phone digits, family name, college).
red_and_black <- function(k) {
  set.seed(k)
  roster <- class_roll
  n_students <- nrow(roster)

  # Randomly permute 1..N and split by parity: even -> "Red", odd -> "Black".
  parity <- sample(1:n_students) %% 2
  roster$group <- factor(parity, levels = c(0, 1), labels = c("Red", "Black"))

  # One chi-square statistic per roster attribute.
  chi_id <- calc_id_chi_square(roster)
  chi_email <- calc_email_chi_square(roster)
  chi_phone <- calc_phone_chi_square(roster)
  chi_surname <- calc_lastname_chi_square(roster)
  chi_college <- calc_college_chi_square(roster)

  # Overall score: smaller means the two groups look more alike.
  chi_total <- chi_id + chi_email + chi_phone + chi_surname + chi_college

  list(
    Xsum = chi_total,
    X1 = chi_id,
    X2 = chi_email,
    X3 = chi_phone,
    X4 = chi_surname,
    X5 = chi_college
  )
}

Search for Best Configuration

#> Windows version
library(parallel)
# Range of seeds to try.
M1 <- 1
M2 <- 100

# Create a cluster with all but one core; fall back to a single worker when
# detectCores() reports one core or NA.
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))

# Run the search inside tryCatch() so the cluster is always shut down, even
# if loading packages, exporting objects, or a worker task fails.
results <- tryCatch(
  {
    # Load the required packages on every node.
    clusterEvalQ(cl, {
      library(dplyr)
      library(magrittr)
    })

    # Ship the roster and all helper functions to every node.
    clusterExport(cl, c("class_roll", "calculate_chi_square",
                        "calc_id_chi_square", "calc_email_chi_square",
                        "calc_phone_chi_square", "calc_lastname_chi_square",
                        "calc_college_chi_square", "red_and_black"))

    # Evaluate red_and_black() for every seed in parallel.
    parLapply(cl, M1:M2, red_and_black)
  },
  finally = stopCluster(cl)
)

# Extract Xsum from the results; vapply() enforces one number per seed.
Xsum <- vapply(results, function(res) res$Xsum, numeric(1))

# Serial alternative: results <- lapply(M1:M2, red_and_black)
names(Xsum) <- M1:M2
# Seed(s) that attain the smallest total statistic (ties are all kept).
Xmin <- names(Xsum)[which(Xsum == min(Xsum))]
Xmin
#> Mac or Linux Version
library(parallel)
# Range of seeds to try.
M1 <- 1
M2 <- 1000

# Use all but one core; fall back to a single core when detectCores()
# reports one core or NA (mclapply() requires mc.cores >= 1).
num_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)

# Forked parallel evaluation of red_and_black() over every seed.
results <- mclapply(M1:M2, red_and_black, mc.cores = num_cores)

# Extract Xsum from the results; vapply() enforces one number per seed, so a
# failed worker result surfaces as an error here.
Xsum <- vapply(results, function(res) res$Xsum, numeric(1))

# Serial alternative: results <- lapply(M1:M2, red_and_black)
names(Xsum) <- M1:M2
# Seed(s) that attain the smallest total statistic (ties are all kept).
Xmin <- names(Xsum)[which(Xsum == min(Xsum))]
data.frame("Xmin" = Xmin) %>%
  pander
Xmin
296

–>

Summary

기초통계값

Min. 1st Qu. Median Mean 3rd Qu. Max.
12.75 25.9 30.87 31.6 35.95 73.42
SD
7.86

최소값을 주는 초기값

Xmin
296

Plot

Report

학번

  18이전과 25 2019 2020 2021 2022 2023 2024
Red 9 11 28 33 56 25 143
Black 7 11 21 37 52 26 151
Chi-square Statistic and P-value
Statistic P-value
1.86 0.9318

e-mail 서비스업체

  네이버 구글 기타서비스
Red 212 84 9
Black 204 92 9
Chi-square Statistic and P-value
Statistic P-value
0.52 0.772

전화번호의 분포

  0000~0999 1000~1999 2000~2999 3000~3999 4000~4999 5000~5999 6000~6999 7000~7999 8000~8999 9000~9999
Red 30 37 25 34 34 27 25 34 23 36
Black 41 38 21 27 32 30 26 32 25 33
Chi-square Statistic and P-value
Statistic P-value
3.38 0.9473

성씨 분포

  김 이 박 최 정 기타
Red 70 44 27 20 16 128
Black 67 39 29 19 15 136
Chi-square Statistic and P-value
Statistic P-value
0.74 0.9808

단과대학

  간호 경영 글로벌융합 미디어 미래융합 반도체/디스플레이 사회과학A 사회과학B 인문 자연과학 정보과학
Red 16 41 14 25 8 6 24 30 44 48 49
Black 16 35 10 28 11 10 35 27 39 53 41
Chi-square Statistic and P-value
Statistic P-value
6.25 0.7936