Data
Search for Best Configuration
# Range of candidate seeds to search: every integer from M1 through M2.
M1 <- 5000001
M2 <- 6000000
# Call red_and_black() (defined elsewhere) once per candidate seed.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# plain numeric vector; it stops with an error if any call returns something
# other than a single number, instead of silently producing a list or matrix.
# NOTE(review): each value is presumably the summed chi-square statistic of
# the split generated by that seed -- confirm against red_and_black().
Xsum <- vapply(M1:M2, red_and_black, numeric(1))
# Label every value with the seed that produced it.
names(Xsum) <- M1:M2
# Summary statistics of the search, rounded to two decimals.
round(summary(Xsum), 2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.46 16.36 20.34 21.02 24.96 68.49
# Standard deviation of Xsum over all candidate seeds, to two decimals.
round(sd(Xsum), digits = 2)
## [1] 6.5
# List the seeds whose value is at most 5, rounded to two decimals.
# which() converts the logical mask into positions (NA comparisons are
# skipped); subsetting Xsum by position keeps the seed names.
round(Xsum[which(Xsum <= 5)], digits = 2)
## 5023986 5029376 5036672 5043325 5049253 5056638 5061779 5065837 5070227 5076848
## 3.99 4.87 3.73 4.91 4.61 4.92 4.64 4.91 4.98 4.67
## 5078080 5095784 5099632 5108502 5111948 5112036 5124404 5129956 5148507 5150927
## 4.54 4.50 4.81 3.35 2.46 3.81 4.89 3.18 3.68 4.26
## 5156803 5161560 5165525 5178466 5178965 5183035 5193609 5194853 5199988 5204467
## 4.54 4.73 4.63 4.89 4.86 4.40 4.31 4.82 3.43 4.70
## 5223208 5245890 5250984 5267506 5270997 5276313 5278469 5280600 5290155 5300765
## 4.72 4.64 4.56 4.85 3.80 4.66 4.28 4.79 4.59 4.76
## 5303412 5326267 5328070 5333931 5343626 5343966 5344569 5346198 5348214 5363389
## 3.62 4.35 4.93 4.62 4.49 3.52 4.94 4.73 4.52 4.51
## 5369770 5371101 5379366 5379728 5384124 5395050 5402079 5405902 5412397 5418571
## 4.50 4.84 4.57 4.41 4.74 4.31 3.97 4.99 4.62 4.88
## 5433959 5445596 5455762 5463234 5466902 5472209 5480642 5485659 5488109 5496177
## 3.93 4.89 4.19 4.06 4.75 3.48 4.63 4.15 4.62 4.30
## 5499112 5510824 5517944 5529352 5539606 5546283 5548860 5550010 5556401 5577651
## 4.07 4.90 4.80 4.96 3.93 3.13 4.49 4.51 4.35 4.21
## 5616590 5620789 5622613 5653367 5653841 5663482 5678147 5697201 5704823 5721179
## 4.69 2.68 4.00 4.71 4.65 4.13 4.59 4.79 4.24 4.29
## 5731995 5742347 5745639 5753612 5770079 5787525 5800294 5823857 5831449 5838735
## 4.14 4.80 4.69 4.21 4.09 4.87 3.92 4.13 4.40 3.91
## 5860879 5862108 5863394 5886377 5900888 5915354 5925770 5931315 5938551 5945665
## 4.09 4.87 4.47 4.48 4.07 4.51 4.95 4.84 3.22 4.94
## 5957789 5959812 5965440 5970645 5974242 5975208 5975966 5979197 5991800
## 4.81 4.64 4.66 4.97 4.82 4.74 4.89 3.62 4.30
# Name (i.e. the seed, as a character string) of the smallest element of Xsum.
# which.min() returns only the first minimum and ignores NA, so Xmin is always
# a single seed -- the value later handed to set.seed(), which expects one
# value. The previous `which(Xsum == min(Xsum))` form returned several names
# when seeds tied for the minimum and nothing at all if Xsum contained an NA.
Xmin <- names(which.min(Xsum))
Xmin
## [1] "5111948"
Plot
# Histogram of Xsum on the density scale, with a chi-square(21) density
# drawn in red on top for comparison.
# NOTE(review): df = 21 is presumably the total degrees of freedom of the six
# contingency tables that are summed -- confirm.
hist(
  Xsum,
  probability = TRUE,
  nclass = 30,
  xlim = c(0, 50),
  ylim = c(0, 0.065)
)
# Evaluation grid for the reference density; reused by the second plot below.
x <- seq(from = 0, to = 50, by = 0.1)
lines(x, dchisq(x, df = 21), col = "red")
legend(
  "topright",
  inset = 0.05,
  legend = c("Xsum", "Chi-square(21)"),
  col = c("black", "red"),
  lty = 1
)

# Same comparison using a kernel density estimate instead of a histogram.
plot(
  density(Xsum),
  xlim = c(0, 50),
  ylim = c(0, 0.065),
  main = "Density Estimation of Xsum"
)
lines(x, dchisq(x, df = 21), col = "red")
legend(
  "topright",
  inset = 0.05,
  legend = c("Xsum", "Chi-square(21)"),
  col = c("black", "red"),
  lty = 1
)

Randomization
# Reproduce the selected split: seed the RNG with Xmin, shuffle the row
# indices 1..N, and assign each row a group by the parity of its shuffled
# index (0 -> "Red", 1 -> "Black").
set.seed(Xmin)
N <- nrow(class_roll)
class_roll$group <- factor(
  sample(1:N) %% 2,
  levels = c(0, 1),
  labels = c("Red", "Black")
)
# Value reported by red_and_black() (defined elsewhere) for the chosen seed.
red_and_black(Xmin)
## [1] 2.463863
학번
# Replace every id that compares as <= 2016 with the single label "2016";
# all other ids are kept unchanged.
# NOTE(review): this looks like it relies on `id` being a character vector so
# that the comparison is done as strings -- confirm the column type.
class_roll$id_2 <- ifelse(class_roll$id <= 2016, "2016", class_roll$id)
# Cross-tabulate group against the first four characters of id_2.
tbl1 <- table(class_roll$group, substr(class_roll$id_2, 1, 4))
# The column labels are hard-coded, so the table must have exactly seven columns.
colnames(tbl1) <- c("2016 이전", 2017:2022)
pander(tbl1)
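|       | 2016 이전 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 |
|:------|----------:|-----:|-----:|-----:|-----:|-----:|-----:|
| Red   | 15 | 30 | 56 | 66 | 40 | 67 | 232 |
| Black | 15 | 28 | 56 | 65 | 44 | 69 | 230 |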
# Keep only the chi-square statistic (the first component of the test result)
# for the group-by-year table.
X1min <- chisq.test(tbl1, simulate.p.value = TRUE)$statistic
X1min
## X-squared
## 0.3041582
학번 홀짝
# Cross-tabulate group against the parity of the numeric id
# (remainder 1 -> "홀", remainder 0 -> "짝").
tbl2 <- factor(
  as.numeric(class_roll$id) %% 2,
  levels = c(1, 0),
  labels = c("홀", "짝")
) %>%
  table(class_roll$group, .)
pander(tbl2)
|       | 홀  | 짝  |
|:------|----:|----:|
| Red   | 271 | 235 |
| Black | 267 | 240 |
# Chi-square statistic for the group-by-parity table. With
# simulate.p.value = TRUE, chisq.test() applies no continuity correction to
# this 2 x 2 table.
X2min <- chisq.test(tbl2, simulate.p.value = TRUE)$statistic
X2min
## X-squared
## 0.08138427
학적 상태
# Cross-tabulate group against the status column.
tbl3 <- class_roll$status %>%
  table(class_roll$group, .)
pander(tbl3)
# Only the chi-square statistic of the test result is kept.
X3min <- chisq.test(tbl3, simulate.p.value = TRUE)$statistic
X3min
## X-squared
## 8.925952e-05
e-mail 서비스업체
# Cross-tabulate group against the e-mail provider: the part after "@" is
# compared with "naver.com" and mapped to "네이버" or "기타서비스".
# vapply() replaces sapply() so the extracted domains are always a character
# vector (NA where an address has no "@"), whatever the input looks like.
tbl4 <- class_roll$email %>%
  strsplit("@", fixed = TRUE) %>%
  vapply(`[`, character(1), 2) %>%
  `==`("naver.com") %>%
  ifelse("네이버", "기타서비스") %>%
  factor(levels = c("네이버", "기타서비스")) %>%
  table(class_roll$group, .)
pander(tbl4)
# Chi-square statistic only. With simulate.p.value = TRUE, chisq.test()
# applies no continuity correction to this 2 x 2 table.
X4min <- chisq.test(tbl4, simulate.p.value = TRUE)$statistic
X4min
## X-squared
## 0.06349345
전화번호의 분포
# Bin labels "0000~0999", "1000~1999", ..., "9000~9999".
cut_label <- sprintf("%d000~%d999", 0:9, 0:9)
# Cross-tabulate group against characters 8-11 of cell_no, converted to a
# number and binned into ten ranges of width 1000 labelled by cut_label.
# right = FALSE makes the bins [0, 1000), [1000, 2000), ..., [9000, 10000) so
# that they agree with the labels. With cut()'s default right-closed bins,
# 0 fell outside every bin (NA, dropped from the table) and exact thousands
# such as 1000 were counted under the label of the bin below ("0000~0999").
# as.numeric() is vectorised, so no per-element sapply() is needed.
# NOTE(review): red_and_black() is not visible here; if it bins phone numbers
# the same way it should get the same right = FALSE fix so the totals agree.
tbl5 <- class_roll$cell_no %>%
  substr(start = 8, stop = 11) %>%
  as.numeric() %>%
  cut(
    breaks = seq(0, 10000, by = 1000),
    labels = cut_label,
    right = FALSE
  ) %>%
  table(class_roll$group, .)
pander(tbl5)
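|       | 0000~0999 | 1000~1999 | 2000~2999 | 3000~3999 | 4000~4999 | 5000~5999 | 6000~6999 | 7000~7999 | 8000~8999 | 9000~9999 |
|:------|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|
| Red   | 49 | 46 | 46 | 47 | 61 | 55 | 55 | 54 | 42 | 51 |
| Black | 44 | 46 | 46 | 53 | 56 | 50 | 61 | 60 | 43 | 48 |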
# Chi-square statistic (first component of the test result) for the
# group-by-phone-number-range table.
X5min <- chisq.test(tbl5, simulate.p.value = TRUE)$statistic
X5min
## X-squared
## 1.80841
성씨 분포
# First character of each name.
f_name <- substring(class_roll$name, first = 1, last = 1)
# Cross-tabulate group against that character, keeping "김", "이" and "박" as
# their own categories and lumping everything else into "기타".
tbl6 <- ifelse(f_name %in% c("김", "이", "박"), f_name, "기타") %>%
  factor(levels = c("김", "이", "박", "기타")) %>%
  table(class_roll$group, .)
pander(tbl6)
|       | 김  | 이 | 박 | 기타 |
|:------|----:|---:|---:|-----:|
| Red   | 119 | 71 | 40 | 276 |
| Black | 115 | 71 | 38 | 283 |
# Chi-square statistic (first component of the test result) for the
# group-by-first-character table.
X6min <- chisq.test(tbl6, simulate.p.value = TRUE)$statistic
X6min
## X-squared
## 0.2063277
Sum of Chi_Squares
# Add the six chi-square statistics from left to right; `+` keeps the
# "X-squared" name carried by the first statistic.
Xsum_min <- Reduce(`+`, list(X1min, X2min, X3min, X4min, X5min, X6min))
Xsum_min
## X-squared
## 2.463863