Data

Search for Best Configuration

# Try every seed from M1 to M2: randomly split class_roll into "Red"/"Black"
# and record what red_and_black() reports for that split ($Xsum and $Values).
M1 <- 1
M2 <- 10000
# The number of rows does not change inside the loop, so compute it once.
N <- nrow(class_roll)
n_seeds <- length(M1:M2)
# Preallocate the results instead of growing them with c()/rbind(), which
# copies the whole accumulated object on every iteration.
Xsum <- numeric(n_seeds)
Values_list <- vector("list", n_seeds)
for (k in M1:M2) {
  set.seed(k)
  # Random permutation of 1..N reduced modulo 2: remainder 0 is labelled
  # "Red", remainder 1 is labelled "Black".
  class_roll$group <- factor(
    sample(seq_len(N)) %% 2,
    levels = c(0, 1),
    labels = c("Red", "Black")
  )
  # Evaluate the split once and reuse both components.
  # NOTE(review): assumes red_and_black() returns the same $Xsum and $Values
  # when called repeatedly on the same class_roll -- confirm.
  res <- red_and_black(class_roll)
  idx <- k - M1 + 1
  Xsum[idx] <- res$Xsum
  Values_list[[idx]] <- res$Values
}
# One row per seed, one column per component of $Values.
Values_mat <- do.call(rbind, Values_list)
# Name the six columns X1..X6 and print their pairwise correlations,
# rounded to four decimals.
colnames(Values_mat) <- paste0("X", seq_len(6))
# Optional diagnostics, left disabled:
# Values_mat
# pairs(Values_mat)
round(cor(Values_mat), digits = 4)
##         X1      X2      X3      X4      X5      X6
## X1  1.0000 -0.0131  0.0492  0.0061  0.0166  0.0184
## X2 -0.0131  1.0000  0.0060  0.0029  0.0027  0.0226
## X3  0.0492  0.0060  1.0000 -0.0079  0.0101 -0.0166
## X4  0.0061  0.0029 -0.0079  1.0000 -0.0051  0.0028
## X5  0.0166  0.0027  0.0101 -0.0051  1.0000 -0.0053
## X6  0.0184  0.0226 -0.0166  0.0028 -0.0053  1.0000
# Label every entry of Xsum with the seed that produced it, then print the
# summary statistics rounded to two decimals.
Xsum <- setNames(Xsum, M1:M2)
round(summary(Xsum), digits = 2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.35   16.40   20.37   21.05   25.00   56.01
# Standard deviation of Xsum, rounded to two decimals.
round(sd(Xsum), digits = 2)
## [1] 6.52
# Entries of Xsum that are at most 10, rounded to two decimals; the element
# names of Xsum are carried through to the printed result.
round(Xsum[which(Xsum <= 10)], digits = 2)
##   17   89  105  109  250  271  390  552  570  648  723  729  786  795  803  872 
## 7.16 8.84 8.03 8.95 9.82 9.24 9.32 9.68 6.61 7.53 9.34 9.85 9.90 9.99 9.71 5.45 
##  885  900 1094 1116 1206 1226 1273 1287 1293 1331 1336 1421 1427 1541 1573 1593 
## 9.03 8.35 9.58 8.71 8.60 9.41 7.80 9.12 7.95 9.68 8.97 7.96 9.92 8.45 8.06 9.57 
## 1664 1681 1746 1778 1821 1964 2093 2288 2295 2322 2398 2429 2499 2592 2744 2777 
## 8.99 9.32 7.71 9.84 8.06 9.47 9.15 9.86 8.72 9.31 9.25 8.73 7.77 9.43 8.71 9.54 
## 2804 2832 2893 2992 3052 3071 3137 3170 3206 3337 3338 3474 3524 3539 3580 3630 
## 7.06 5.77 9.31 8.89 9.17 7.64 8.55 9.66 7.78 5.41 9.29 8.11 8.41 9.91 8.96 6.38 
## 3638 3685 3774 3784 3807 3849 3854 3941 3966 4021 4096 4099 4121 4125 4164 4181 
## 9.38 8.34 6.89 8.57 7.33 9.71 6.68 9.10 8.88 9.74 9.46 5.88 8.16 9.33 9.97 8.34 
## 4208 4236 4257 4320 4329 4599 4614 4892 4896 4927 4971 5049 5107 5253 5362 5482 
## 8.25 6.15 7.79 7.85 8.10 9.74 7.80 9.66 9.63 9.56 8.75 9.65 5.45 7.87 6.25 7.47 
## 5488 5515 5551 5575 5699 5728 5898 5931 5975 5977 6173 6175 6182 6188 6212 6260 
## 9.78 7.27 9.65 7.22 8.92 9.91 8.96 9.57 7.74 8.44 9.87 8.97 9.50 8.82 9.62 7.68 
## 6276 6279 6387 6413 6437 6444 6519 6536 6544 6693 6781 6788 6901 6979 6991 7026 
## 8.18 9.37 8.13 6.33 8.26 7.38 9.24 9.59 9.79 7.53 7.75 9.90 9.87 8.90 5.92 8.20 
## 7098 7161 7242 7338 7362 7366 7394 7414 7466 7494 7561 7571 7585 7730 7797 7819 
## 8.00 8.00 8.72 5.96 9.91 9.45 9.59 8.25 8.67 7.49 9.75 7.11 9.78 9.67 9.20 7.14 
## 7858 7866 7884 7993 8047 8065 8116 8163 8230 8351 8356 8367 8372 8464 8472 8498 
## 8.75 9.70 9.74 7.65 9.01 9.36 7.61 8.01 9.61 8.45 9.02 8.60 7.72 6.89 9.85 7.97 
## 8516 8545 8591 8597 8612 8659 8696 8719 8723 8883 8901 8958 9020 9031 9085 9195 
## 7.95 8.69 8.91 9.82 9.18 9.80 9.18 9.02 9.97 8.29 9.76 8.21 8.59 9.77 9.51 9.22 
## 9215 9304 9326 9397 9413 9428 9493 9504 9576 9580 9606 9629 9650 9663 9785 9831 
## 9.48 7.28 7.48 8.50 9.21 9.13 9.75 8.77 9.22 7.43 7.88 7.88 9.75 8.99 8.82 4.35 
## 9894 9908 
## 9.43 8.14
# Name(s) of the entry or entries of Xsum equal to its minimum.
Xmin <- names(Xsum)[which(Xsum == min(Xsum))]
Xmin
## [1] "9831"

Plot

# Grid on which the reference chi-square density is evaluated.
x <- seq(0, 50, by = 0.1)

# Adds the chi-square density with 21 degrees of freedom (in red) and a
# legend to whichever plot is currently open.
add_chisq_overlay <- function() {
  lines(x, dchisq(x, df = 21), col = "red")
  invisible(
    legend(
      "topright",
      inset = 0.05,
      legend = c("Xsum", "Chi-square(21)"),
      col = c("black", "red"),
      lty = 1
    )
  )
}

# Histogram of Xsum on the density scale with the reference curve.
hist(
  Xsum,
  probability = TRUE,
  nclass = 30,
  xlim = c(0, 50),
  ylim = c(0, 0.065)
)
add_chisq_overlay()

# Kernel density estimate of Xsum with the same reference curve.
plot(density(Xsum), xlim = c(0, 50), main = "Density Estimation of Xsum")
add_chisq_overlay()

Randomization

# Re-create the split produced by the seed stored in Xmin and print what
# red_and_black() reports for it.
# Xmin comes from names(), so it is a character vector: convert it to an
# integer explicitly instead of relying on set.seed() to coerce it, and take
# the first entry explicitly in case several seeds tie for the minimum.
best_seed <- as.integer(Xmin[1])
set.seed(best_seed)
N <- nrow(class_roll)
# Random permutation of 1..N reduced modulo 2: remainder 0 is labelled
# "Red", remainder 1 is labelled "Black".
class_roll$group <- factor(
  sample(seq_len(N)) %% 2,
  levels = c(0, 1),
  labels = c("Red", "Black")
)
red_and_black(class_roll)
## $Values
## [1] 1.46856184 0.31845373 0.66474820 0.04853833 1.23474583 0.61395926
## 
## $Xsum
## [1] 4.349007

학번

# Ids that compare as <= 2015 are replaced by the string "2015"; all other
# ids are kept unchanged.
class_roll$id_2 <- ifelse(class_roll$id <= 2015, "2015", class_roll$id)
# Cross-tabulate group against the first four characters of id_2 and give
# the columns display labels.
tbl1 <- table(class_roll$group, substr(class_roll$id_2, 1, 4))
colnames(tbl1) <- c("2015 이전", 2016:2021)
pander(tbl1)
  2015 이전 2016 2017 2018 2019 2020 2021
Red 19 26 30 56 20 48 109
Black 18 32 25 52 18 50 113
# Chi-square statistic of tbl1 (the first component of the test result).
X1min <- chisq.test(tbl1, simulate.p.value = TRUE)[["statistic"]]
X1min
## X-squared 
##  1.468562

학번 홀짝

# Cross-tabulate group against the parity of the numeric id:
# remainder 1 (odd) is labelled "홀", remainder 0 (even) is labelled "짝".
tbl2 <- table(
  class_roll$group,
  factor(
    as.numeric(class_roll$id) %% 2,
    levels = c(1, 0),
    labels = c("홀", "짝")
  )
)
pander(tbl2)
  홀 짝
Red 146 162
Black 153 155
# Chi-square statistic of tbl2 (the first component of the test result).
X2min <- chisq.test(tbl2, simulate.p.value = TRUE)[["statistic"]]
X2min
## X-squared 
## 0.3184537

학적 상태

# Cross-tabulate group against the status column.
tbl3 <- table(class_roll$group, class_roll$status)
pander(tbl3)
  학생 휴학
Red 275 33
Black 281 27
# Chi-square statistic of tbl3 (the first component of the test result).
X3min <- chisq.test(tbl3, simulate.p.value = TRUE)[["statistic"]]
X3min
## X-squared 
## 0.6647482

e-mail 서비스업체

# Second piece of each e-mail address after splitting on a literal "@".
email_domain <- vapply(
  strsplit(class_roll$email, "@", fixed = TRUE),
  `[`,
  character(1),
  2
)
# Cross-tabulate group against "naver.com" versus every other domain.
tbl4 <- table(
  class_roll$group,
  factor(
    ifelse(email_domain == "naver.com", "네이버", "기타서비스"),
    levels = c("네이버", "기타서비스")
  )
)
pander(tbl4)
  네이버 기타서비스
Red 260 48
Black 258 50
# Chi-square statistic of tbl4 (the first component of the test result).
X4min <- chisq.test(tbl4, simulate.p.value = TRUE)[["statistic"]]
X4min
##  X-squared 
## 0.04853833

전화번호의 분포

# Labels for ten bins of width 1000: "0000~0999", "1000~1999", ...,
# "9000~9999".
cut_label <- paste(paste0(0:9, "000"), paste0(0:9, "999"), sep = "~")
# Characters 8 to 11 of cell_no, read as a number.
# NOTE(review): assumes these are the last four digits of the phone number --
# confirm against the cell_no format.
last_digits <- as.numeric(substr(class_roll$cell_no, start = 8, stop = 11))
# right = FALSE makes the bins [0, 1000), [1000, 2000), ..., [9000, 10000),
# which is what the labels describe. With cut()'s default right-closed
# intervals a value of 0 would become NA and 1000 would be counted under
# "0000~0999".
tbl5 <- table(
  class_roll$group,
  cut(
    last_digits,
    breaks = seq(0, 10000, by = 1000),
    labels = cut_label,
    right = FALSE
  )
)
pander(tbl5)
  0000~0999 1000~1999 2000~2999 3000~3999 4000~4999 5000~5999 6000~6999 7000~7999 8000~8999 9000~9999
Red 23 31 34 28 25 31 25 36 40 35
Black 24 31 34 25 26 33 30 38 36 31
# Chi-square statistic of tbl5 (the first component of the test result).
X5min <- chisq.test(tbl5, simulate.p.value = TRUE)[["statistic"]]
X5min
## X-squared 
##  1.234746

성씨 분포

# First character of each name.
f_name <- substring(class_roll$name, first = 1, last = 1)
# Cross-tabulate group against that character, keeping "김", "이" and "박"
# as their own categories and pooling everything else under "기타".
tbl6 <- table(
  class_roll$group,
  factor(
    ifelse(f_name %in% c("김", "이", "박"), f_name, "기타"),
    levels = c("김", "이", "박", "기타")
  )
)
pander(tbl6)
  김 이 박 기타
Red 63 51 24 170
Black 65 44 25 174
# Chi-square statistic of tbl6 (the first component of the test result).
X6min <- chisq.test(tbl6, simulate.p.value = TRUE)[["statistic"]]
X6min
## X-squared 
## 0.6139593

Sum of Chi-Squares

# Add the six per-table statistics from left to right.
Xsum_min <- Reduce(`+`, list(X1min, X2min, X3min, X4min, X5min, X6min))
Xsum_min
## X-squared 
##  4.349007