Data
# Read the class roster from the Excel workbook. Only the cell range B1:H611
# is imported (seven columns, first row used as the header).
class_roll <- read_excel(
  path = "./data/class_roll_250225.xlsx",
  range = "B1:H611"
)
# Show the structure of the freshly imported roster.
str(class_roll)
## tibble [610 × 7] (S3: tbl_df/tbl/data.frame)
## $ 학과(전공) : chr [1:610] "간호학과" "간호학과" "간호학과" "간호학과" ...
## $ 단과대학 : chr [1:610] "간호대학" "간호대학" "간호대학" "간호대학" ...
## $ 학번 : chr [1:610] "20196285" "20226281" "20236228" "20236281" ...
## $ 이름 : chr [1:610] "조연서" "임승언" "김재원" "이진우" ...
## $ 역할 : chr [1:610] "학생" "군휴학생" "학생" "학생" ...
## $ 이메일 주소: chr [1:610] "999justin@naver.com" "lsa3173@naver.com" "jaenn103@naver.com" "dragon2003n@naver.com" ...
## $ 휴대 전화 : chr [1:610] "01092431659" "01035834684" "01089051911" "01047158287" ...
# Replace the original column headers with short ASCII names, by position.
class_roll <- setNames(
  class_roll,
  c("dept", "college", "id", "name", "status", "email", "cell_no")
)
# Show the structure again to confirm the new column names.
str(class_roll)
## tibble [610 × 7] (S3: tbl_df/tbl/data.frame)
## $ dept : chr [1:610] "간호학과" "간호학과" "간호학과" "간호학과" ...
## $ college: chr [1:610] "간호대학" "간호대학" "간호대학" "간호대학" ...
## $ id : chr [1:610] "20196285" "20226281" "20236228" "20236281" ...
## $ name : chr [1:610] "조연서" "임승언" "김재원" "이진우" ...
## $ status : chr [1:610] "학생" "군휴학생" "학생" "학생" ...
## $ email : chr [1:610] "999justin@naver.com" "lsa3173@naver.com" "jaenn103@naver.com" "dragon2003n@naver.com" ...
## $ cell_no: chr [1:610] "01092431659" "01035834684" "01089051911" "01047158287" ...
# Persist the renamed roster as an RDS file in the working directory.
saveRDS(object = class_roll, file = "class_roll_250225.RDS")
Helper Functions
# Chi-square statistic for the cross-tabulation of `data$group` against
# `group_var`.
#
# data      : data frame with a `group` column.
# group_var : vector (one value per row of `data`) to tabulate against group.
# Returns the test statistic as a bare, unnamed number.
calculate_chi_square <- function(data, group_var) {
  # Contingency table: rows are groups, columns are levels of group_var.
  crosstab <- table(data$group, group_var)
  test_result <- chisq.test(crosstab, simulate.p.value = FALSE)
  # The first component of the test result is the statistic; strip its name.
  unname(test_result[[1]])
}
# Chi-square statistic for group vs. the year prefix of the student ID.
#
# data : data frame with `id` (character student IDs) and `group` columns.
# The first four characters of the ID are used as the category; prefixes
# numerically <= 2018 or >= 2025 are pooled under the label "18 or 25".
calc_id_chi_square <- function(data) {
  year_prefix <- substr(data$id, 1, 4)
  year_value <- as.numeric(year_prefix)
  is_pooled <- year_value <= 2018 | year_value >= 2025
  data$id_2 <- ifelse(is_pooled, "18 or 25", year_prefix)
  calculate_chi_square(data, data$id_2)
}
# Chi-square statistic for group vs. e-mail service provider.
#
# data : data frame with `email` (character) and `group` columns.
# The provider is the text between "@" and the first following "."; anything
# other than "naver" or "gmail" is pooled into a single catch-all level.
calc_email_chi_square <- function(data) {
  # Second piece after splitting on "@" is the domain.
  address_parts <- strsplit(data$email, "@", fixed = TRUE)
  domains <- vapply(address_parts, `[`, character(1), 2)
  # First piece of the domain, split on literal dots, is the provider name.
  domain_parts <- strsplit(domains, "[.]", fixed = FALSE)
  isp <- vapply(domain_parts, `[`, character(1), 1)
  # Keep naver and gmail as-is; pool every other provider.
  is_major <- isp %in% c("naver", "gmail")
  email_group <- factor(
    ifelse(is_major, isp, "기타서비스"),
    levels = c("naver", "gmail", "기타서비스")
  )
  calculate_chi_square(data, email_group)
}
# Chi-square statistic for group vs. the last four digits of the phone number.
#
# data : data frame with `cell_no` (character) and `group` columns.
# Characters 8-11 of `cell_no` are read as a number and binned into ten
# classes labelled "0000~0999", "1000~1999", ..., "9000~9999".
# NOTE(review): positions 8-11 are the last four digits only for 11-character
# numbers without separators - confirm the roster always has that format.
calc_phone_chi_square <- function(data) {
  cut_label <- paste(paste0(0:9, "000"), paste0(0:9, "999"), sep = "~")
  last_four <- as.numeric(substr(data$cell_no, start = 8, stop = 11))
  # right = FALSE makes the bins [0, 1000), [1000, 2000), ... so they agree
  # with the labels. With cut()'s default right-closed bins, 0 became NA and
  # exact thousands fell into the bin below the one their label names.
  phone_group <- cut(
    last_four,
    breaks = seq(0, 10000, by = 1000),
    labels = cut_label,
    right = FALSE
  )
  calculate_chi_square(data, phone_group)
}
# Chi-square statistic for group vs. family name.
#
# data : data frame with `name` (character) and `group` columns.
# The first character of each name is taken as the family name; every name
# outside the five listed surnames is pooled into a catch-all level.
calc_lastname_chi_square <- function(data) {
  major_surnames <- c("김", "이", "박", "최", "정")
  surname <- substring(data$name, first = 1, last = 1)
  # Overwrite every surname that is not one of the listed five.
  surname[!(surname %in% major_surnames)] <- "기타"
  last_name_group <- factor(
    surname,
    levels = c("김", "이", "박", "최", "정", "기타")
  )
  calculate_chi_square(data, last_name_group)
}
# Chi-square statistic for group vs. college.
#
# data : data frame with `college` and `group` columns. The college column is
# used as the category without any regrouping.
calc_college_chi_square <- function(data) {
  college_var <- data$college
  calculate_chi_square(data, college_var)
}
# Randomly split a roster into "Red" and "Black" groups using seed `k`, and
# measure how evenly the split balances five roster attributes.
#
# k    : seed passed to set.seed(); the same k always gives the same split.
# data : roster data frame with id, email, cell_no, name and college columns.
#        Defaults to the global `class_roll`, so existing one-argument calls
#        (including parLapply/mclapply over seeds) behave as before.
# Returns a list with the five chi-square statistics (X1..X5) and their sum
# (Xsum); a smaller Xsum means a more balanced split.
# Side effect: resets the random number generator state via set.seed(k).
red_and_black <- function(k, data = class_roll) {
  set.seed(k)
  N <- nrow(data)
  # Shuffle the row numbers and split by parity: even -> "Red", odd -> "Black".
  # seq_len() keeps this correct for an empty roster, where 1:N would be 1:0.
  data$group <- factor(
    sample(seq_len(N)) %% 2,
    levels = c(0, 1),
    labels = c("Red", "Black")
  )
  # Calculate Chi-square for each category
  X1 <- calc_id_chi_square(data)
  X2 <- calc_email_chi_square(data)
  X3 <- calc_phone_chi_square(data)
  X4 <- calc_lastname_chi_square(data)
  X5 <- calc_college_chi_square(data)
  # Sum of all Chi-square statistics
  Xsum <- X1 + X2 + X3 + X4 + X5
  # Return the sum and individual components (for detailed analysis)
  list(Xsum = Xsum, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5)
}
Search for Best Configuration
#> Windows version
library(parallel)
# Inclusive range of seeds to try.
M1 <- 1
M2 <- 100
# Create a local cluster that leaves one core free. detectCores() can return
# NA or 1, so fall back to one worker rather than requesting zero.
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
# tryCatch(finally = ) stops the workers even when setup or a task fails, so
# no worker processes are left behind after an error.
results <- tryCatch(
  {
    # Load the packages the helper functions need on every worker.
    clusterEvalQ(cl, {
      library(dplyr)
      library(magrittr)
    })
    # Send the roster and all helper functions to every worker.
    clusterExport(cl, c("class_roll", "calculate_chi_square",
                        "calc_id_chi_square", "calc_email_chi_square",
                        "calc_phone_chi_square", "calc_lastname_chi_square",
                        "calc_college_chi_square", "red_and_black"))
    # Evaluate red_and_black() for every seed in parallel.
    parLapply(cl, M1:M2, red_and_black)
  },
  finally = stopCluster(cl)
)
# Extract Xsum from the results (vapply guarantees a numeric vector).
Xsum <- vapply(results, function(res) res$Xsum, numeric(1))
# Xsum_list <- sapply(M1:M2, red_and_black)
# Xsum <- Xsum_list$Xsum
names(Xsum) <- M1:M2
# Seed(s) whose split has the smallest total chi-square statistic.
Xmin <- names(Xsum[which(Xsum == min(Xsum))])
Xmin
#> Mac or Linux Version
library(parallel)
# Inclusive range of seeds to try.
M1 <- 1
M2 <- 1000
# Number of worker processes: all cores but one. detectCores() can return NA
# or 1, and mclapply() needs mc.cores >= 1, so never go below one.
num_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
# Parallelized version of running red_and_black over multiple iterations
results <- mclapply(M1:M2, red_and_black, mc.cores = num_cores)
# mclapply() reports a failing task as a "try-error" element instead of
# stopping, so check explicitly and name the seeds that failed.
failed <- vapply(results, inherits, logical(1), what = "try-error")
if (any(failed)) {
  stop(
    "red_and_black() failed for seed(s): ",
    paste((M1:M2)[failed], collapse = ", "),
    call. = FALSE
  )
}
# Extract Xsum from the results (vapply guarantees a numeric vector).
Xsum <- vapply(results, function(res) res$Xsum, numeric(1))
# Xsum_list <- sapply(M1:M2, red_and_black)
# Xsum <- Xsum_list$Xsum
names(Xsum) <- M1:M2
# Seed(s) whose split has the smallest total chi-square statistic.
Xmin <- names(Xsum[which(Xsum == min(Xsum))])
data.frame("Xmin" = Xmin) %>%
  pander
–>
Summary
기초통계값
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| 12.75 | 25.9 | 30.87 | 31.6 | 35.95 | 73.42 |
Plot


Report
학번
|  |  |  |  |  |  |  |  |
|---|---|---|---|---|---|---|---|
| Red | 9 | 11 | 28 | 33 | 56 | 25 | 143 |
| Black | 7 | 11 | 21 | 37 | 52 | 26 | 151 |
Chi-square Statistic and P-value
| Chi-square statistic | P-value |
|---|---|
| 1.86 | 0.9318 |
e-mail 서비스업체
|  | naver | gmail | 기타서비스 |
|---|---|---|---|
| Red | 212 | 84 | 9 |
| Black | 204 | 92 | 9 |
Chi-square Statistic and P-value
| Chi-square statistic | P-value |
|---|---|
| 0.52 | 0.772 |
전화번호의 분포
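|  | 0000~0999 | 1000~1999 | 2000~2999 | 3000~3999 | 4000~4999 | 5000~5999 | 6000~6999 | 7000~7999 | 8000~8999 | 9000~9999 |
|---|---|---|---|---|---|---|---|---|---|---|
| Red | 30 | 37 | 25 | 34 | 34 | 27 | 25 | 34 | 23 | 36 |
| Black | 41 | 38 | 21 | 27 | 32 | 30 | 26 | 32 | 25 | 33 |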
Chi-square Statistic and P-value
| Chi-square statistic | P-value |
|---|---|
| 3.38 | 0.9473 |
성씨 분포
|  | 김 | 이 | 박 | 최 | 정 | 기타 |
|---|---|---|---|---|---|---|
| Red | 70 | 44 | 27 | 20 | 16 | 128 |
| Black | 67 | 39 | 29 | 19 | 15 | 136 |
Chi-square Statistic and P-value
| Chi-square statistic | P-value |
|---|---|
| 0.74 | 0.9808 |
단과대학
|  |  |  |  |  |  |  |  |  |  |  |  |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Red | 16 | 41 | 14 | 25 | 8 | 6 | 24 | 30 | 44 | 48 | 49 |
| Black | 16 | 35 | 10 | 28 | 11 | 10 | 35 | 27 | 39 | 53 | 41 |
Chi-square Statistic and P-value
| Chi-square statistic | P-value |
|---|---|
| 6.25 | 0.2824 |