Data
Search for Best Configuration
# Range of candidate seeds to scan (inclusive).
M1 <- 7000001
M2 <- 8000000
# Evaluate red_and_black() for every seed in the range.
# red_and_black() is defined outside this excerpt; it is presumably the summed
# chi-square statistic of the randomization produced by that seed -- confirm.
# vapply() (instead of sapply()) guarantees a plain double vector with exactly
# one value per seed and fails loudly if the helper ever returns another shape.
Xsum <- vapply(M1:M2, red_and_black, numeric(1))
# Label each statistic with the seed that produced it.
names(Xsum) <- M1:M2
# Five-number summary plus mean, rounded for display.
Xsum %>%
  summary %>%
  round(2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.89 16.36 20.37 21.03 24.98 82.98
# Standard deviation of the per-seed statistics, rounded to two decimals.
round(sd(Xsum), 2)
## [1] 6.5
# Show every seed whose statistic is at most 5 (names are the seeds),
# rounded to two decimals.
round(Xsum[which(Xsum <= 5)], 2)
## 7001946 7008403 7016585 7025745 7041842 7048960 7074640 7081298 7085202 7095301
## 4.66 4.89 4.72 4.65 4.07 3.68 4.43 4.76 4.90 4.40
## 7096276 7115963 7125585 7127156 7129340 7129702 7130537 7145744 7157678 7160035
## 4.30 4.30 4.89 3.85 4.58 4.64 4.81 4.74 4.71 4.74
## 7166896 7167677 7171752 7175371 7176150 7176185 7183879 7184722 7185426 7192253
## 3.53 4.06 4.52 4.49 4.96 4.11 4.72 3.39 3.41 4.89
## 7204797 7207568 7208619 7210405 7229166 7230186 7230813 7231563 7238784 7252399
## 4.77 4.62 3.99 4.92 3.69 4.89 4.48 3.49 4.57 4.46
## 7255211 7264192 7264864 7266888 7278005 7292810 7298653 7313190 7317888 7318987
## 4.27 4.45 4.29 4.02 4.95 4.97 4.57 3.34 4.01 4.02
## 7320944 7334077 7342253 7344773 7363013 7368837 7372308 7378116 7383472 7395274
## 4.09 3.83 3.28 4.90 4.98 4.98 3.51 4.90 4.01 3.97
## 7396583 7417245 7436360 7438753 7441999 7442694 7446338 7461952 7464986 7471670
## 4.75 4.68 4.05 4.98 4.71 4.73 4.05 3.08 3.47 4.45
## 7471882 7474479 7477581 7479257 7480958 7483900 7490375 7521830 7524851 7525062
## 4.57 3.69 4.79 4.94 4.17 4.84 4.81 4.96 4.59 4.09
## 7528997 7534618 7537971 7577509 7582088 7589048 7589396 7595931 7597023 7598298
## 4.71 3.56 4.83 4.44 5.00 4.79 4.91 4.94 4.95 4.20
## 7598769 7610001 7616663 7621237 7627352 7652947 7654738 7654806 7655882 7663763
## 4.72 4.39 4.17 3.91 4.77 4.95 4.59 4.26 4.12 4.56
## 7671276 7672225 7678652 7680378 7681775 7684991 7687516 7691009 7691825 7696928
## 4.77 4.84 4.53 4.66 4.95 4.13 3.97 4.98 4.97 4.54
## 7705846 7708543 7711808 7714596 7716669 7720051 7723433 7733526 7734024 7740426
## 4.76 3.59 4.00 4.37 4.89 4.83 4.81 4.75 4.80 4.40
## 7742840 7745721 7756945 7770087 7771078 7785653 7786043 7786759 7795926 7811064
## 3.59 3.42 4.53 4.68 4.85 4.75 4.35 4.93 4.72 4.99
## 7815898 7824130 7831160 7844009 7870613 7879033 7881375 7897077 7898866 7909658
## 4.86 4.89 3.27 4.93 4.89 4.91 4.84 4.87 4.95 4.67
## 7926512 7948757 7982106 7991513 7991974
## 2.89 4.44 4.66 3.12 4.58
# Seed (as a character name) with the smallest statistic.
# which.min() returns the first index of the minimum, so Xmin is always a
# single value even if several seeds tie; the previous `== min()` comparison
# could yield several names, which is not a usable seed.
Xmin <- names(Xsum)[which.min(Xsum)]
Xmin
## [1] "7926512"
Plot
# Histogram of the per-seed statistics on the density scale.
# Argument names are spelled out in full: `probability` instead of the
# partial match `prob`, and `breaks` instead of the S-compatibility alias
# `nclass` (equivalent for a single number).
hist(Xsum, probability = TRUE, breaks = 30,
     xlim = c(0, 50), ylim = c(0, 0.065))
# Grid for the reference curve; reused by the density plot below.
x <- seq(0, 50, by = 0.1)
# Overlay the chi-square density with 21 degrees of freedom for comparison.
lines(x, dchisq(x, df = 21), col = "red")
legend("topright", inset = 0.05,
       legend = c("Xsum", "Chi-square(21)"),
       col = c("black", "red"), lty = 1)

# Kernel density estimate of the same statistics, with the same reference curve.
plot(density(Xsum), xlim = c(0, 50), ylim = c(0, 0.065),
     main = "Density Estimation of Xsum")
lines(x, dchisq(x, df = 21), col = "red")
legend("topright", inset = 0.05,
       legend = c("Xsum", "Chi-square(21)"),
       col = c("black", "red"), lty = 1)

Randomization
# Re-create the randomization for the best seed found above.
# Xmin is a character string (it comes from names()), so convert it
# explicitly instead of relying on set.seed()'s implicit coercion.
set.seed(as.integer(Xmin))
N <- nrow(class_roll)
# Shuffle the row indices and split by parity: remainder 0 -> "Red",
# remainder 1 -> "Black". seq_len() is safe for an empty roll, and the
# parentheses make explicit that the remainder is taken before piping
# (previously this relied on %% and %>% sharing precedence).
class_roll$group <-
  (sample(seq_len(N)) %% 2) %>%
  factor(levels = c(0, 1), labels = c("Red", "Black"))
# Statistic reported by the helper for the same seed (defined elsewhere).
red_and_black(Xmin)
## [1] 2.890813
학번
# Collapse every id that compares <= 2016 into the single label "2016";
# all other ids are kept as they are.
# NOTE(review): the comparison semantics depend on the type of `id`
# (numeric vs. character) -- confirm against the data.
class_roll$id_2 <- ifelse(class_roll$id <= 2016, "2016", class_roll$id)
# Cross-tabulate group against the first four characters of id_2.
tbl1 <- table(class_roll$group, substr(class_roll$id_2, 1, 4))
colnames(tbl1) <- c("2016 이전", 2017:2022)
pander(tbl1)
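|           | 2016 이전 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 |
|:---------:|:---------:|:----:|:----:|:----:|:----:|:----:|:----:|
| **Red**   | 15 | 30 | 56 | 69 | 43 | 67 | 226 |
| **Black** | 15 | 28 | 56 | 62 | 41 | 69 | 236 |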
# Chi-square statistic for group vs. id prefix. Only the statistic (the first
# component of the test result) is kept; simulate.p.value is left as it was.
X1min <- chisq.test(tbl1, simulate.p.value = TRUE)$statistic
X1min
## X-squared
## 0.7355059
학번 홀짝
# Parity of the numeric id: remainder 1 is labelled "홀", remainder 0 "짝".
# The result is cross-tabulated against the assigned group.
tbl2 <- (as.numeric(class_roll$id) %% 2) %>%
  factor(levels = c(1, 0), labels = c("홀", "짝")) %>%
  table(class_roll$group, .)
pander(tbl2)
|           | 홀  | 짝  |
|:---------:|:---:|:---:|
| **Red**   | 271 | 235 |
| **Black** | 267 | 240 |
# Chi-square statistic for group vs. id parity.
# simulate.p.value = TRUE is kept: per the chisq.test() documentation no
# continuity correction is applied in that mode, which matters for a
# two-by-two table like this one.
X2min <- chisq.test(tbl2, simulate.p.value = TRUE)$statistic
X2min
## X-squared
## 0.08138427
학적 상태
# Cross-tabulate the assigned group against the `status` column.
tbl3 <- class_roll$status %>%
  table(class_roll$group, .)
pander(tbl3)
# Keep only the chi-square statistic (first component of the test result).
X3min <- chisq.test(tbl3, simulate.p.value = TRUE)$statistic
X3min
## X-squared
## 0.04770835
e-mail 서비스업체
# Classify each e-mail address by its domain: "네이버" when the part after
# the first "@" is exactly "naver.com", otherwise "기타서비스".
# vapply() (instead of sapply()) guarantees one character value per address
# regardless of the input; an address without "@" yields NA, as before.
tbl4 <- class_roll$email %>%
  strsplit("@", fixed = TRUE) %>%
  vapply(`[`, character(1), 2) %>%
  `==`("naver.com") %>%
  ifelse("네이버", "기타서비스") %>%
  factor(levels = c("네이버", "기타서비스")) %>%
  table(class_roll$group, .)
tbl4 %>%
  pander
# Keep only the chi-square statistic (first component of the test result).
X4min <- tbl4 %>%
  chisq.test(simulate.p.value = TRUE) %>%
  `[[`(1)
X4min
## X-squared
## 0.1430955
전화번호의 분포
# Bin labels "0000~0999", "1000~1999", ..., "9000~9999".
cut_label <- paste(paste0(0:9, "000"), paste0(0:9, "999"),
                   sep = "~")
# Take characters 8-11 of the phone number as a number and bin it by
# thousands, then cross-tabulate against the assigned group.
# right = FALSE makes the bins [0, 1000), [1000, 2000), ... so they match the
# labels. With the default right-closed bins, 0 fell outside every bin (NA)
# and exact thousands (1000, 2000, ...) were counted one bin too low.
# as.numeric() is already vectorized, so no sapply() wrapper is needed.
# NOTE(review): if red_and_black() bins phone numbers with the old
# right-closed breaks, update it too so Xsum_min keeps matching its output.
tbl5 <- class_roll$cell_no %>%
  substr(start = 8, stop = 11) %>%
  as.numeric() %>%
  cut(labels = cut_label,
      breaks = seq(0, 10000, by = 1000),
      right = FALSE) %>%
  table(class_roll$group, .)
tbl5 %>%
  pander
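|           | 0000~0999 | 1000~1999 | 2000~2999 | 3000~3999 | 4000~4999 | 5000~5999 | 6000~6999 | 7000~7999 | 8000~8999 | 9000~9999 |
|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
| **Red**   | 46 | 43 | 44 | 52 | 56 | 53 | 58 | 58 | 45 | 51 |
| **Black** | 47 | 49 | 48 | 48 | 61 | 52 | 58 | 56 | 40 | 48 |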
# Chi-square statistic for group vs. phone-number bin; only the statistic
# (first component of the test result) is kept.
X5min <- chisq.test(tbl5, simulate.p.value = TRUE)$statistic
X5min
## X-squared
## 1.378298
성씨 분포
# First character of each name.
f_name <- substring(class_roll$name, first = 1, last = 1)
# Keep "김", "이" and "박" as their own categories and pool everything else
# into "기타", then cross-tabulate against the assigned group.
tbl6 <- ifelse(f_name %in% c("김", "이", "박"), f_name, "기타") %>%
  factor(levels = c("김", "이", "박", "기타")) %>%
  table(class_roll$group, .)
pander(tbl6)
|           | 김  | 이 | 박 | 기타 |
|:---------:|:---:|:--:|:--:|:----:|
| **Red**   | 117 | 72 | 36 | 281 |
| **Black** | 117 | 70 | 42 | 278 |
# Chi-square statistic for group vs. first-character category; only the
# statistic (first component of the test result) is kept.
X6min <- chisq.test(tbl6, simulate.p.value = TRUE)$statistic
X6min
## X-squared
## 0.504821
Sum of Chi_Squares
# Total of the six chi-square statistics, accumulated left to right so the
# result keeps the name carried by the first term.
Xsum_min <- Reduce(`+`, list(X1min, X2min, X3min, X4min, X5min, X6min))
Xsum_min
## X-squared
## 2.890813