Data
Search for Best Configuration
# Seed search: for every seed in M1..M2, split the class roll at random into
# "Red"/"Black" groups and record the statistics returned by red_and_black()
# (defined outside this section).
M1 <- 1
M2 <- 10000
# The row count does not change inside the loop (only the `group` column is
# rewritten), so compute it once.
N <- nrow(class_roll)
n_seeds <- length(M1:M2)
# Preallocate the result containers instead of growing them with c()/rbind()
# on every iteration.
Xsum <- numeric(n_seeds)
Values_list <- vector("list", n_seeds)
for (k in M1:M2) {
  set.seed(k)
  # Random permutation of the row indices; the parity decides the group.
  class_roll$group <-
    (sample(seq_len(N)) %% 2) %>%
    factor(levels = c(0, 1), labels = c("Red", "Black"))
  # Evaluate the split once and reuse the result for both summaries.
  # NOTE(review): assumes red_and_black() returns the same $Xsum/$Values when
  # called repeatedly on the same data -- confirm against its definition.
  res <- red_and_black(class_roll)
  i <- k - M1 + 1
  Xsum[i] <- res$Xsum
  Values_list[[i]] <- res$Values
}
# One row per seed, one column per component statistic.
Values_mat <- do.call(rbind, Values_list)
colnames(Values_mat) <- paste0("X", seq_len(ncol(Values_mat)))
# Values_mat
# pairs(Values_mat)
# Pairwise correlations between the columns of Values_mat, to four decimals.
round(cor(Values_mat), digits = 4)
## X1 X2 X3 X4 X5 X6
## X1 1.0000 -0.0131 0.0492 0.0061 0.0166 0.0184
## X2 -0.0131 1.0000 0.0060 0.0029 0.0027 0.0226
## X3 0.0492 0.0060 1.0000 -0.0079 0.0101 -0.0166
## X4 0.0061 0.0029 -0.0079 1.0000 -0.0051 0.0028
## X5 0.0166 0.0027 0.0101 -0.0051 1.0000 -0.0053
## X6 0.0184 0.0226 -0.0166 0.0028 -0.0053 1.0000
# Label every total with the seed that produced it, then print its
# distribution summary rounded to two decimals.
names(Xsum) <- M1:M2
round(summary(Xsum), digits = 2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.35 16.40 20.37 21.05 25.00 56.01
# Standard deviation of the totals, rounded to two decimals.
round(sd(Xsum), digits = 2)
## [1] 6.52
# Show the totals that are at most 10, keeping their seed labels.
round(Xsum[which(Xsum <= 10)], digits = 2)
## 17 89 105 109 250 271 390 552 570 648 723 729 786 795 803 872
## 7.16 8.84 8.03 8.95 9.82 9.24 9.32 9.68 6.61 7.53 9.34 9.85 9.90 9.99 9.71 5.45
## 885 900 1094 1116 1206 1226 1273 1287 1293 1331 1336 1421 1427 1541 1573 1593
## 9.03 8.35 9.58 8.71 8.60 9.41 7.80 9.12 7.95 9.68 8.97 7.96 9.92 8.45 8.06 9.57
## 1664 1681 1746 1778 1821 1964 2093 2288 2295 2322 2398 2429 2499 2592 2744 2777
## 8.99 9.32 7.71 9.84 8.06 9.47 9.15 9.86 8.72 9.31 9.25 8.73 7.77 9.43 8.71 9.54
## 2804 2832 2893 2992 3052 3071 3137 3170 3206 3337 3338 3474 3524 3539 3580 3630
## 7.06 5.77 9.31 8.89 9.17 7.64 8.55 9.66 7.78 5.41 9.29 8.11 8.41 9.91 8.96 6.38
## 3638 3685 3774 3784 3807 3849 3854 3941 3966 4021 4096 4099 4121 4125 4164 4181
## 9.38 8.34 6.89 8.57 7.33 9.71 6.68 9.10 8.88 9.74 9.46 5.88 8.16 9.33 9.97 8.34
## 4208 4236 4257 4320 4329 4599 4614 4892 4896 4927 4971 5049 5107 5253 5362 5482
## 8.25 6.15 7.79 7.85 8.10 9.74 7.80 9.66 9.63 9.56 8.75 9.65 5.45 7.87 6.25 7.47
## 5488 5515 5551 5575 5699 5728 5898 5931 5975 5977 6173 6175 6182 6188 6212 6260
## 9.78 7.27 9.65 7.22 8.92 9.91 8.96 9.57 7.74 8.44 9.87 8.97 9.50 8.82 9.62 7.68
## 6276 6279 6387 6413 6437 6444 6519 6536 6544 6693 6781 6788 6901 6979 6991 7026
## 8.18 9.37 8.13 6.33 8.26 7.38 9.24 9.59 9.79 7.53 7.75 9.90 9.87 8.90 5.92 8.20
## 7098 7161 7242 7338 7362 7366 7394 7414 7466 7494 7561 7571 7585 7730 7797 7819
## 8.00 8.00 8.72 5.96 9.91 9.45 9.59 8.25 8.67 7.49 9.75 7.11 9.78 9.67 9.20 7.14
## 7858 7866 7884 7993 8047 8065 8116 8163 8230 8351 8356 8367 8372 8464 8472 8498
## 8.75 9.70 9.74 7.65 9.01 9.36 7.61 8.01 9.61 8.45 9.02 8.60 7.72 6.89 9.85 7.97
## 8516 8545 8591 8597 8612 8659 8696 8719 8723 8883 8901 8958 9020 9031 9085 9195
## 7.95 8.69 8.91 9.82 9.18 9.80 9.18 9.02 9.97 8.29 9.76 8.21 8.59 9.77 9.51 9.22
## 9215 9304 9326 9397 9413 9428 9493 9504 9576 9580 9606 9629 9650 9663 9785 9831
## 9.48 7.28 7.48 8.50 9.21 9.13 9.75 8.77 9.22 7.43 7.88 7.88 9.75 8.99 8.82 4.35
## 9894 9908
## 9.43 8.14
# Seed label (character) of the smallest total. which.min() always yields a
# single index -- the first one on ties -- so Xmin is safe to use as one seed.
Xmin <- names(Xsum)[which.min(Xsum)]
Xmin
## [1] "9831"
Plot
# Grid on which the reference chi-square density is drawn.
x <- seq(0, 50, by = 0.1)

# Add the chi-square(21) density curve and a legend to the current plot.
overlay_chisq <- function() {
  lines(x, dchisq(x, df = 21), col = "red")
  legend(
    "topright",
    inset = 0.05,
    legend = c("Xsum", "Chi-square(21)"),
    col = c("black", "red"),
    lty = 1
  )
}

# Density-scaled histogram of the totals with the reference curve on top.
hist(
  Xsum,
  probability = TRUE,
  nclass = 30,
  xlim = c(0, 50),
  ylim = c(0, 0.065)
)
overlay_chisq()

# Kernel density estimate of the totals with the same reference curve.
plot(density(Xsum), xlim = c(0, 50), main = "Density Estimation of Xsum")
overlay_chisq()

Randomization
# Re-create the split that produced the smallest total.
# Xmin holds seed labels as character strings (it comes from names()); convert
# explicitly instead of relying on set.seed()'s implicit coercion, and use the
# first entry only in case several seeds tied for the minimum.
set.seed(as.integer(Xmin[1]))
N <- nrow(class_roll)
# Random permutation of the row indices; the parity decides the group.
class_roll$group <-
  (sample(seq_len(N)) %% 2) %>%
  factor(levels = c(0, 1), labels = c("Red", "Black"))
# Print the component statistics and their sum for this split.
red_and_black(class_roll)
## $Values
## [1] 1.46856184 0.31845373 0.66474820 0.04853833 1.23474583 0.61395926
##
## $Xsum
## [1] 4.349007
학번
# Recode every id that compares <= 2015 to "2015"; other ids are kept as is.
class_roll$id_2 <- ifelse(class_roll$id <= 2015, "2015", class_roll$id)
# Cross-tabulate group against the first four characters of the recoded id
# (presumably the entry year -- confirm against the data).
tbl1 <- table(class_roll$group, substr(class_roll$id_2, 1, 4))
colnames(tbl1) <- c("2015 이전", 2016:2021)
pander(tbl1)
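|       | 2015 이전 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 |
|-------|-----------|------|------|------|------|------|------|
| Red   | 19        | 26   | 30   | 56   | 20   | 48   | 109  |
| Black | 18        | 32   | 25   | 52   | 18   | 50   | 113  |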
# Chi-square statistic of tbl1. The test is run with a simulated p-value, but
# only the statistic (first element of the result) is kept.
X1min <- chisq.test(tbl1, simulate.p.value = TRUE)$statistic
X1min
## X-squared
## 1.468562
학번 홀짝
# Cross-tabulate group against the parity of the numeric id
# (remainder 1 is labelled "홀", remainder 0 is labelled "짝").
tbl2 <- factor(
  as.numeric(class_roll$id) %% 2,
  levels = c(1, 0),
  labels = c("홀", "짝")
) %>%
  table(class_roll$group, .)
pander(tbl2)
|       | 홀  | 짝  |
|-------|-----|-----|
| Red   | 146 | 162 |
| Black | 153 | 155 |
# Chi-square statistic of tbl2 (simulated p-value requested, statistic kept).
X2min <- chisq.test(tbl2, simulate.p.value = TRUE)[["statistic"]]
X2min
## X-squared
## 0.3184537
학적 상태
# Cross-tabulate group against the status column and display the table.
tbl3 <- class_roll$status %>%
  table(class_roll$group, .)
pander(tbl3)
# Chi-square statistic of tbl3 (simulated p-value requested, statistic kept).
X3min <- chisq.test(tbl3, simulate.p.value = TRUE)$statistic
X3min
## X-squared
## 0.6647482
e-mail 서비스업체
# Cross-tabulate group against the e-mail provider: "네이버" when the part
# after "@" is exactly "naver.com", "기타서비스" otherwise.
tbl4 <- class_roll$email %>%
  strsplit("@", fixed = TRUE) %>%
  # vapply() guarantees a character vector, whereas sapply() changes its
  # return type with the shape of the input.
  vapply(`[`, character(1), 2) %>%
  `==`("naver.com") %>%
  ifelse("네이버", "기타서비스") %>%
  factor(levels = c("네이버", "기타서비스")) %>%
  table(class_roll$group, .)
tbl4 %>%
  pander
# Chi-square statistic of tbl4 (simulated p-value requested, statistic kept).
X4min <- tbl4 %>%
  chisq.test(simulate.p.value = TRUE) %>%
  `[[`(1)
X4min
## X-squared
## 0.04853833
전화번호의 분포
# Bin labels "0000~0999", "1000~1999", ..., "9000~9999".
cut_label <- paste(paste0(0:9, "000"), paste0(0:9, "999"),
                   sep = "~")
# Cross-tabulate group against the bin of characters 8-11 of cell_no, read as
# a number.
# right = FALSE makes the bins [0, 1000), [1000, 2000), ..., so they match the
# labels. With cut()'s default right-closed bins, 0 would become NA and 1000,
# 2000, ... would be counted under the label of the bin below.
# NOTE(review): red_and_black() is not visible here; if it bins cell_no in the
# same way it should get the same fix so the statistics stay in agreement.
tbl5 <- class_roll$cell_no %>%
  substr(start = 8, stop = 11) %>%
  as.numeric() %>%
  cut(labels = cut_label,
      breaks = seq(0, 10000, by = 1000),
      right = FALSE) %>%
  table(class_roll$group, .)
tbl5 %>%
  pander
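|       | 0000~0999 | 1000~1999 | 2000~2999 | 3000~3999 | 4000~4999 | 5000~5999 | 6000~6999 | 7000~7999 | 8000~8999 | 9000~9999 |
|-------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
| Red   | 23        | 31        | 34        | 28        | 25        | 31        | 25        | 36        | 40        | 35        |
| Black | 24        | 31        | 34        | 25        | 26        | 33        | 30        | 38        | 36        | 31        |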
# Chi-square statistic of tbl5 (simulated p-value requested, statistic kept).
X5min <- chisq.test(tbl5, simulate.p.value = TRUE)$statistic
X5min
## X-squared
## 1.234746
성씨 분포
# First character of each name.
f_name <- substr(class_roll$name, 1, 1)
# Cross-tabulate group against that character, keeping "김", "이" and "박"
# and pooling every other value under "기타".
tbl6 <- ifelse(f_name %in% c("김", "이", "박"), f_name, "기타") %>%
  factor(levels = c("김", "이", "박", "기타")) %>%
  table(class_roll$group, .)
pander(tbl6)
|       | 김 | 이 | 박 | 기타 |
|-------|----|----|----|------|
| Red   | 63 | 51 | 24 | 170  |
| Black | 65 | 44 | 25 | 174  |
# Chi-square statistic of tbl6 (simulated p-value requested, statistic kept).
X6min <- chisq.test(tbl6, simulate.p.value = TRUE)[["statistic"]]
X6min
## X-squared
## 0.6139593
Sum of Chi_Squares
# Total of the six statistics, added left to right so the result keeps the
# name carried by the first term.
Xsum_min <- Reduce(`+`, list(X1min, X2min, X3min, X4min, X5min, X6min))
Xsum_min
## X-squared
## 4.349007