Data

Search for Best Configuration

# Candidate seeds: every integer from M1 to M2 is scored with red_and_black().
M1 <- 1000001
M2 <- 2000000
# vapply() enforces exactly one numeric value per seed, so an unexpected
# return shape from red_and_black() stops here instead of silently turning
# Xsum into a list or a matrix.
Xsum <- vapply(M1:M2, red_and_black, numeric(1))
# Label each score with the seed that produced it.
names(Xsum) <- M1:M2
# Min / quartiles / mean / max of the scores, rounded to two decimals.
Xsum %>%
  summary %>%
  round(2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.41   16.36   20.35   21.02   24.95   69.91
# Standard deviation of the scores, rounded to two decimals.
round(sd(Xsum), digits = 2)
## [1] 6.5
# Scores of at most 5, shown with their seed names and rounded to two
# decimals. which() turns the logical mask into positions, so NA comparisons
# are dropped rather than propagated into the subset.
round(Xsum[which(Xsum <= 5)], digits = 2)
## 1000531 1002641 1010872 1011832 1015250 1016384 1019881 1026185 1029162 1072260 
##    4.74    3.22    4.29    3.62    4.80    4.16    4.76    3.98    4.19    2.41 
## 1075152 1080303 1081004 1086899 1094467 1098386 1114372 1119120 1133105 1142000 
##    4.67    4.97    4.66    4.78    4.72    4.55    4.35    4.42    3.75    4.86 
## 1143028 1144048 1159631 1162840 1168348 1170496 1181296 1185517 1186704 1189797 
##    4.72    3.71    4.99    4.34    4.55    4.60    4.89    4.81    4.43    4.97 
## 1193650 1194159 1196676 1210773 1235972 1239033 1241312 1244844 1245070 1246166 
##    4.37    4.09    4.82    4.31    3.41    4.78    3.67    3.96    4.65    4.28 
## 1264663 1269117 1274767 1277015 1277391 1278239 1281257 1287212 1293870 1300671 
##    4.64    4.58    4.62    4.98    4.85    4.28    3.97    3.30    4.95    4.38 
## 1304821 1308300 1311007 1314152 1314536 1348487 1351993 1352761 1356548 1366025 
##    3.59    4.76    4.79    4.14    4.79    4.95    3.49    4.92    2.74    4.21 
## 1379547 1381513 1384602 1386343 1403197 1404542 1415007 1418081 1421554 1432843 
##    4.86    4.61    4.52    5.00    4.29    4.43    4.70    4.61    4.10    4.52 
## 1438879 1442231 1446039 1446953 1450132 1463932 1468249 1473410 1477441 1484286 
##    4.91    3.89    3.97    4.19    4.33    4.90    3.66    3.84    4.65    4.78 
## 1485222 1522273 1531415 1533386 1541529 1545369 1566616 1568051 1572683 1579681 
##    4.93    4.25    3.98    4.75    4.51    4.78    4.72    4.16    4.88    4.61 
## 1596021 1597066 1600580 1613987 1638338 1647342 1651666 1669980 1686911 1691633 
##    4.52    4.66    4.01    4.41    4.86    4.50    4.96    3.21    4.79    4.53 
## 1691728 1697462 1709587 1710001 1710640 1716877 1730015 1730076 1768564 1770939 
##    4.80    4.85    3.75    4.32    4.38    4.96    3.95    4.10    4.46    4.36 
## 1791409 1807078 1833969 1843930 1846135 1852432 1862096 1864778 1866058 1866762 
##    4.96    4.25    4.98    4.47    4.55    4.71    3.85    4.29    4.32    4.92 
## 1868036 1892824 1896061 1899446 1903496 1917659 1925504 1928458 1930156 1933680 
##    4.75    4.90    4.81    4.98    4.29    3.52    4.71    4.12    4.21    4.81 
## 1951293 1952544 1959928 1965224 1966455 1969621 1994752 
##    4.40    4.45    4.53    3.97    4.38    3.98    4.87
# Seed with the smallest score, as a character string taken from the names
# of Xsum. which.min() always yields a single position (the first one when
# several seeds tie, ignoring NA), so Xmin is a length-one value that can be
# handed to set.seed().
Xmin <- names(which.min(Xsum))
Xmin
## [1] "1072260"

Plot

# Histogram of the scores on the density scale, with the chi-square(21)
# density drawn on top for comparison.
hist(
  Xsum,
  probability = TRUE,
  breaks = 30,
  xlim = c(0, 50),
  ylim = c(0, 0.065)
)
# Grid on which the reference density is evaluated.
x <- seq(0, 50, by = 0.1)
lines(x, dchisq(x, df = 21), col = "red")
legend(
  "topright",
  inset = 0.05,
  legend = c("Xsum", "Chi-square(21)"),
  col = c("black", "red"),
  lty = 1
)

# Same comparison, using a kernel density estimate instead of a histogram.
plot(
  density(Xsum),
  xlim = c(0, 50),
  ylim = c(0, 0.065),
  main = "Density Estimation of Xsum"
)
lines(x, dchisq(x, df = 21), col = "red")
legend(
  "topright",
  inset = 0.05,
  legend = c("Xsum", "Chi-square(21)"),
  col = c("black", "red"),
  lty = 1
)

Randomization

# Xmin is a character string (it comes from names()), so convert it
# explicitly instead of relying on set.seed() to coerce it.
set.seed(as.integer(Xmin))
N <- nrow(class_roll)
# Shuffle the row positions and split them by parity: even -> "Red",
# odd -> "Black". seq_len() stays well-defined when N is 0, where 1:N
# would count down as c(1, 0).
class_roll$group <- factor(
  sample(seq_len(N)) %% 2,
  levels = c(0, 1),
  labels = c("Red", "Black")
)
# Evaluate red_and_black() for the same seed; this is expected to reproduce
# the minimum score -- confirm against the definition of red_and_black().
red_and_black(Xmin)
## [1] 2.413016

학번

# IDs that compare <= 2016 are collapsed into the single label "2016";
# every other ID is kept unchanged.
class_roll$id_2 <- ifelse(class_roll$id <= 2016, "2016", class_roll$id)
# Cross-tabulate group against the first four characters of the recoded ID.
tbl1 <- table(class_roll$group, substr(class_roll$id_2, 1, 4))
# NOTE(review): the column labels assume exactly seven categories
# (the collapsed one plus 2017-2022) -- confirm against the data.
colnames(tbl1) <- c("2016 이전", 2017:2022)
pander(tbl1)
  2016 이전 2017 2018 2019 2020 2021 2022
Red 15 31 58 64 41 69 228
Black 15 27 54 67 43 67 234
# Chi-square statistic of tbl1 (the `statistic` component of the test
# result); the p-value is obtained by simulation but is not used here.
X1min <- chisq.test(tbl1, simulate.p.value = TRUE)$statistic
X1min
## X-squared 
## 0.6413879

학번 홀짝

# Cross-tabulate group against the parity of the numeric student ID
# (remainder 1 -> "홀", remainder 0 -> "짝").
# dnn labels the second dimension "." -- the name table() derives from a
# magrittr placeholder argument -- so the dimnames of tbl2 are unchanged.
tbl2 <- table(
  class_roll$group,
  factor(
    as.numeric(class_roll$id) %% 2,
    levels = c(1, 0),
    labels = c("홀", "짝")
  ),
  dnn = c("", ".")
)
pander(tbl2)
  홀 짝
Red 271 235
Black 267 240
# Chi-square statistic of tbl2. Per the chisq.test() documentation, no
# continuity correction is applied when simulate.p.value = TRUE, which
# matters for a 2 x 2 table.
X2min <- chisq.test(tbl2, simulate.p.value = TRUE)$statistic
X2min
##  X-squared 
## 0.08138427

학적 상태

# Cross-tabulate group against enrolment status.
# dnn labels the second dimension "." -- the name table() derives from a
# magrittr placeholder argument -- so the dimnames of tbl3 are unchanged.
tbl3 <- table(class_roll$group, class_roll$status, dnn = c("", "."))
pander(tbl3)
  학생 휴학
Red 463 43
Black 466 41
# Chi-square statistic of tbl3, taken from the `statistic` component of the
# test result (p-value simulated, not used).
X3min <- chisq.test(tbl3, simulate.p.value = TRUE)$statistic
X3min
##  X-squared 
## 0.05631977

e-mail 서비스업체

# Cross-tabulate group against the e-mail provider: the text after the first
# "@" is compared with "naver.com" and everything else is pooled.
# vapply() pins the extracted domain to one character value per address, so
# the result is always a character vector (sapply() could return a list).
tbl4 <- class_roll$email %>%
  strsplit("@", fixed = TRUE) %>%
  vapply(`[`, character(1), 2) %>%
  `==`("naver.com") %>%
  ifelse("네이버", "기타서비스") %>%
  factor(levels = c("네이버", "기타서비스")) %>%
  table(class_roll$group, .)
tbl4 %>%
  pander
  네이버 기타서비스
Red 403 103
Black 409 98
# Chi-square statistic of tbl4, taken from the `statistic` component of the
# test result (p-value simulated, not used).
X4min <- chisq.test(tbl4, simulate.p.value = TRUE)$statistic
X4min
## X-squared 
## 0.1677261

전화번호의 분포

# Bin labels "0000~0999", "1000~1999", ..., "9000~9999".
cut_label <- paste(paste0(0:9, "000"), paste0(0:9, "999"),
                   sep = "~")
# Cross-tabulate group against the bin of characters 8-11 of the phone
# number, read as a number.
# right = FALSE makes the bins [0, 1000), [1000, 2000), ... so that they
# match the labels: with cut()'s default right-closed intervals, 0 would
# become NA and exact thousands (1000, 2000, ...) would fall into the bin
# below the one their label names.
# NOTE(review): if red_and_black() bins phone numbers itself, it should use
# the same right = FALSE setting -- confirm against its definition.
tbl5 <- class_roll$cell_no %>%
  substr(start = 8, stop = 11) %>%
  as.numeric %>%
  cut(labels = cut_label,
      breaks = seq(0, 10000, by = 1000),
      right = FALSE) %>%
  table(class_roll$group, .)
tbl5 %>%
  pander
  0000~0999 1000~1999 2000~2999 3000~3999 4000~4999 5000~5999 6000~6999 7000~7999 8000~8999 9000~9999
Red 50 47 44 49 59 51 56 59 43 48
Black 43 45 48 51 58 54 60 55 42 51
# Chi-square statistic of tbl5, taken from the `statistic` component of the
# test result (p-value simulated, not used).
X5min <- chisq.test(tbl5, simulate.p.value = TRUE)$statistic
X5min
## X-squared 
##  1.258504

성씨 분포

# First character of each name.
f_name <- substring(class_roll$name, first = 1, last = 1)
# Cross-tabulate group against that character, keeping "김", "이" and "박"
# as their own categories and pooling everything else under "기타".
# dnn labels the second dimension "." -- the name table() derives from a
# magrittr placeholder argument -- so the dimnames of tbl6 are unchanged.
tbl6 <- table(
  class_roll$group,
  factor(
    ifelse(f_name %in% c("김", "이", "박"), f_name, "기타"),
    levels = c("김", "이", "박", "기타")
  ),
  dnn = c("", ".")
)
pander(tbl6)
  김 이 박 기타
Red 117 69 38 282
Black 117 73 40 277
# Chi-square statistic of tbl6, taken from the `statistic` component of the
# test result (p-value simulated, not used).
X6min <- chisq.test(tbl6, simulate.p.value = TRUE)$statistic
X6min
## X-squared 
## 0.2076939

Sum of Chi_Squares

# Total of the six chi-square statistics, added left to right so the result
# keeps the "X-squared" name carried by the first term.
Xsum_min <- Reduce(`+`, list(X1min, X2min, X3min, X4min, X5min, X6min))
Xsum_min
## X-squared 
##  2.413016