Data

Search for Best Configuration

# Range of candidate seeds to search (inclusive on both ends).
M1 <- 5000001
M2 <- 6000000
# Evaluate red_and_black() (defined outside this section) once per seed.
# vapply() with numeric(1) insists that every call yields exactly one number,
# so a misbehaving red_and_black() fails here instead of silently turning
# Xsum into a list or a matrix.
Xsum <- vapply(M1:M2, red_and_black, numeric(1))
# Label each result with the seed that produced it.
names(Xsum) <- M1:M2
# Five-number summary plus mean of the searched values, to 2 decimals.
round(summary(Xsum), 2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.46   16.36   20.34   21.02   24.96   68.49
# Standard deviation of the searched values, to 2 decimals.
round(sd(Xsum), 2)
## [1] 6.5
# Show every seed (as the element name) whose value is at most 5,
# rounded to 2 decimals.
round(Xsum[which(Xsum <= 5)], 2)
## 5023986 5029376 5036672 5043325 5049253 5056638 5061779 5065837 5070227 5076848 
##    3.99    4.87    3.73    4.91    4.61    4.92    4.64    4.91    4.98    4.67 
## 5078080 5095784 5099632 5108502 5111948 5112036 5124404 5129956 5148507 5150927 
##    4.54    4.50    4.81    3.35    2.46    3.81    4.89    3.18    3.68    4.26 
## 5156803 5161560 5165525 5178466 5178965 5183035 5193609 5194853 5199988 5204467 
##    4.54    4.73    4.63    4.89    4.86    4.40    4.31    4.82    3.43    4.70 
## 5223208 5245890 5250984 5267506 5270997 5276313 5278469 5280600 5290155 5300765 
##    4.72    4.64    4.56    4.85    3.80    4.66    4.28    4.79    4.59    4.76 
## 5303412 5326267 5328070 5333931 5343626 5343966 5344569 5346198 5348214 5363389 
##    3.62    4.35    4.93    4.62    4.49    3.52    4.94    4.73    4.52    4.51 
## 5369770 5371101 5379366 5379728 5384124 5395050 5402079 5405902 5412397 5418571 
##    4.50    4.84    4.57    4.41    4.74    4.31    3.97    4.99    4.62    4.88 
## 5433959 5445596 5455762 5463234 5466902 5472209 5480642 5485659 5488109 5496177 
##    3.93    4.89    4.19    4.06    4.75    3.48    4.63    4.15    4.62    4.30 
## 5499112 5510824 5517944 5529352 5539606 5546283 5548860 5550010 5556401 5577651 
##    4.07    4.90    4.80    4.96    3.93    3.13    4.49    4.51    4.35    4.21 
## 5616590 5620789 5622613 5653367 5653841 5663482 5678147 5697201 5704823 5721179 
##    4.69    2.68    4.00    4.71    4.65    4.13    4.59    4.79    4.24    4.29 
## 5731995 5742347 5745639 5753612 5770079 5787525 5800294 5823857 5831449 5838735 
##    4.14    4.80    4.69    4.21    4.09    4.87    3.92    4.13    4.40    3.91 
## 5860879 5862108 5863394 5886377 5900888 5915354 5925770 5931315 5938551 5945665 
##    4.09    4.87    4.47    4.48    4.07    4.51    4.95    4.84    3.22    4.94 
## 5957789 5959812 5965440 5970645 5974242 5975208 5975966 5979197 5991800 
##    4.81    4.64    4.66    4.97    4.82    4.74    4.89    3.62    4.30
# Seed (kept as text, since it comes from names(Xsum)) with the smallest value.
# which.min() always yields a single position: the first minimum when several
# seeds tie, and NA entries are skipped. Xmin is later used as one seed, so it
# must never be a vector of length other than one.
Xmin <- names(Xsum)[which.min(Xsum)]
Xmin
## [1] "5111948"

Plot

# Grid on which the reference chi-square density is drawn in both figures.
x <- seq(0, 50, by = 0.1)

# Figure 1: histogram of Xsum on the density scale with the
# chi-square(21) density overlaid in red.
hist(
  Xsum,
  probability = TRUE,
  nclass = 30,
  xlim = c(0, 50),
  ylim = c(0, 0.065)
)
lines(x, dchisq(x, df = 21), col = "red")
legend(
  "topright",
  inset = 0.05,
  legend = c("Xsum", "Chi-square(21)"),
  col = c("black", "red"),
  lty = 1
)

# Figure 2: kernel density estimate of Xsum with the same reference curve.
plot(
  density(Xsum),
  xlim = c(0, 50),
  ylim = c(0, 0.065),
  main = "Density Estimation of Xsum"
)
lines(x, dchisq(x, df = 21), col = "red")
legend(
  "topright",
  inset = 0.05,
  legend = c("Xsum", "Chi-square(21)"),
  col = c("black", "red"),
  lty = 1
)

Randomization

# Re-create the assignment for the selected seed.
set.seed(Xmin)
N <- nrow(class_roll)
# Shuffle 1..N and split on parity: remainder 0 is labelled "Red" and
# remainder 1 is labelled "Black".
class_roll$group <- factor(
  sample(1:N) %% 2,
  levels = c(0, 1),
  labels = c("Red", "Black")
)
# Evaluate red_and_black() for the selected seed and print the result.
red_and_black(Xmin)
## [1] 2.463863

학번

# Collapse ids that compare <= 2016 into the single value "2016";
# every other id is kept as it is.
class_roll$id_2 <- ifelse(class_roll$id <= 2016, "2016", class_roll$id)
# Cross-tabulate group against the first four characters of id_2.
tbl1 <- table(class_roll$group, substr(class_roll$id_2, 1, 4))
# The column labels are fixed, so the table must have exactly seven columns.
colnames(tbl1) <- c("2016 이전", 2017:2022)
pander(tbl1)
  2016 이전 2017 2018 2019 2020 2021 2022
Red 15 30 56 66 40 67 232
Black 15 28 56 65 44 69 230
# Chi-square statistic of tbl1 (the first component of the test result).
X1min <- chisq.test(tbl1, simulate.p.value = TRUE)$statistic
X1min
## X-squared 
## 0.3041582

학번 홀짝

# Odd/even id: remainder 1 is labelled "홀" and remainder 0 is labelled "짝".
# The factor is then cross-tabulated against group.
tbl2 <- factor(
  as.numeric(class_roll$id) %% 2,
  levels = c(1, 0),
  labels = c("홀", "짝")
) %>%
  table(class_roll$group, .)
pander(tbl2)
 
Red 271 235
Black 267 240
# Chi-square statistic of tbl2 (the first component of the test result).
X2min <- chisq.test(tbl2, simulate.p.value = TRUE)$statistic
X2min
##  X-squared 
## 0.08138427

학적 상태

# Cross-tabulate group (rows) against the status column (columns).
tbl3 <- class_roll$status %>% table(class_roll$group, .)
pander(tbl3)
  학생 휴학
Red 464 42
Black 465 42
# Chi-square statistic of tbl3 (the first component of the test result).
X3min <- chisq.test(tbl3, simulate.p.value = TRUE)$statistic
X3min
##    X-squared 
## 8.925952e-05

e-mail 서비스업체

# Take the second "@"-separated piece of each e-mail address, flag whether
# it equals "naver.com", and cross-tabulate that flag against group.
tbl4 <- class_roll$email %>%
  strsplit(split = "@", fixed = TRUE) %>%
  vapply(function(parts) parts[2], character(1)) %>%
  {
    ifelse(. == "naver.com", "네이버", "기타서비스")
  } %>%
  factor(levels = c("네이버", "기타서비스")) %>%
  table(class_roll$group, .)
pander(tbl4)
  네이버 기타서비스
Red 404 102
Black 408 99
# Chi-square statistic of tbl4 (the first component of the test result).
X4min <- chisq.test(tbl4, simulate.p.value = TRUE)$statistic
X4min
##  X-squared 
## 0.06349345

전화번호의 분포

# Bin labels "0000~0999", "1000~1999", ..., "9000~9999".
cut_label <- paste(paste0(0:9, "000"), paste0(0:9, "999"),
                   sep = "~")
# Characters 8-11 of cell_no are converted to a number and binned by
# thousands, then cross-tabulated against group.
# NOTE(review): assumes those four characters are the trailing digits of the
# phone number - confirm against the actual cell_no format.
# right = FALSE makes the bins [0, 1000), [1000, 2000), ..., so they agree
# with the labels. With cut()'s default right-closed bins a value of 0 would
# become NA and 1000 would be counted under "0000~0999".
tbl5 <- class_roll$cell_no %>%
  substr(start = 8, stop = 11) %>%
  as.numeric() %>%
  cut(breaks = seq(0, 10000, by = 1000),
      labels = cut_label,
      right = FALSE) %>%
  table(class_roll$group, .)
tbl5 %>%
  pander
  0000~0999 1000~1999 2000~2999 3000~3999 4000~4999 5000~5999 6000~6999 7000~7999 8000~8999 9000~9999
Red 49 46 46 47 61 55 55 54 42 51
Black 44 46 46 53 56 50 61 60 43 48
# Chi-square statistic of tbl5 (the first component of the test result).
X5min <- chisq.test(tbl5, simulate.p.value = TRUE)$statistic
X5min
## X-squared 
##   1.80841

성씨 분포

# First character of each name.
f_name <- substring(class_roll$name, first = 1, last = 1)
# Keep "김", "이" and "박" as their own categories, lump everything else
# into "기타", and cross-tabulate against group.
tbl6 <- ifelse(f_name %in% c("김", "이", "박"), f_name, "기타") %>%
  factor(levels = c("김", "이", "박", "기타")) %>%
  table(class_roll$group, .)
pander(tbl6)
  김 이 박 기타
Red 119 71 40 276
Black 115 71 38 283
# Chi-square statistic of tbl6 (the first component of the test result).
X6min <- chisq.test(tbl6, simulate.p.value = TRUE)$statistic
X6min
## X-squared 
## 0.2063277

Sum of Chi-Squares

# Add the six statistics left to right, in the same order as the sections
# above; the result keeps the name carried by X1min.
Xsum_min <- Reduce(`+`, list(X1min, X2min, X3min, X4min, X5min, X6min))
Xsum_min
## X-squared 
##  2.463863