厚労省調査より検討

東大の過去5年の男女別の志願者数と合格者数。
(Source: http://www.mext.go.jp/…/afield…/2018/09/10/1409128_002_1.pdf

a=matrix(c(314,70,384,84,16,100,27,23,319,64,383,80,21,101,25,33,320,68,388,86,14,100,27,21,331,60,391,84,16,100,25,27,324,69,393,83,17,100,26,25,327,69,396,84,17,101),8,)
## Warning in matrix(c(314, 70, 384, 84, 16, 100, 27, 23, 319, 64, 383, 80, :
## データ長 [46] が行数 [8] を整数で割った、もしくは掛けた値ではありません
dat=t(a)
dat=dat[,c(1,2,4,5)]
row.names(dat) = c("H30","H29","H28","H27","H26","H25")
colnames(dat) = c("M_cand","F_cand","M_suc","F_suc")

clude_dat = apply(dat,sum,MARGIN = 2)
clude_mat=matrix(clude_dat,2,2)
# フィッシャーの正確検定で合格率の有意差を検討
fisher.test(clude_mat)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  clude_mat
## p-value = 0.9033
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.7595872 1.2443186
## sample estimates:
## odds ratio 
##  0.9752324

過去のデータから合格率に有意差はなし。

シミュレーションによる検討

シミュレーション関数を定義
sim(): simulation 関数 変更可能パラメターは以下の通り
-n:男女各人口 -M_sample_p: 男性の大学進学率(受験のフィールドにあがる割合)
-F_sample_p: 女性の大学進学率(受験のフィールドにあがる割合)
-M_p: 男性が東大への出願を決定する閾値(受験人口中のパーセンタイルで指定)
-F_p: 女性が東大への出願を決定する閾値(受験人口中のパーセンタイルで指定)
-top: 合格定員
戻り値
-Fem_P: 女性の入学者割合
-suc_P: 男女の合格者割合

sim_main:シミュレーションの実行関数
戻り値: 全シミュレーション中の男性合格率/女性合格率 が1以上の割合

sim <- function(n=2000,M_sample_p = 0.556,F_sample_p =0.482, M_p = 0.75,F_p=0.80, top = 50){
  Male <- rnorm(n = n,mean = 50,sd=25)
  Fem <- rnorm(n = n,mean = 50,sd=25)
  Male <- sample(Male,size = n*M_sample_p)
  Fem_sam <- sample(Fem,size = n*F_sample_p)
  pop = c(Male,Fem_sam)
  QM=quantile(pop,probs = M_p)
  QF_S= quantile(pop,probs = F_p)
  
  M_U_QM= Male[Male>=QM]
  F_U_QFS= Fem_sam[Fem_sam>=QF_S]
  
  M_df = data.frame(val = M_U_QM)
  M_df$sex <- "M"
  F_df = data.frame(val = F_U_QFS)
  F_df$sex <- "F"
  
  library(dplyr)
  df <- rbind.data.frame(M_df,F_df)
  df <- df %>%
    dplyr::arrange(order = desc(val))
  df$suc <- 0
  df$suc[1:top] <- 1
  
  test <- fisher.test(table(df$sex,df$suc))
  F_suc_p=prop.table(table(df$sex,df$suc),margin = 1)[1,2]
  M_suc_p=prop.table(table(df$sex,df$suc),margin = 1)[2,2]
  
  df=df[1:top,]
  
  
  tab=table(df[,"sex"])
  return(list(Fem_P= tab[1]/(tab[2]+tab[1]),
              Suc_P=c(M_suc_p,F_suc_p)))
}

sim_main<- function(B = 2000,n=20000,M_sample_p = 0.556,F_sample_p =0.482, M_p = 0.75,F_p=0.80, top = 100){
  library(doParallel)
  cl <- makeCluster(detectCores())
  registerDoParallel(cl)
  vec_2<- foreach(i = 1:B, .combine=c, .export = "sim") %dopar% {
    res=sim(n=n,M_sample_p=M_sample_p,F_sample_p=F_sample_p, M_p =  M_p, F_p=F_p ,top= top)
    res[[2]][1]/res[[2]][2]
  }
  stopCluster(cl)
    # for(i in 1:B){
    # res=sim(n=n,M_sample_p=M_sample_p,F_sample_p=F_sample_p, M_p =  M_p, F_p=F_p ,top= top)
    # vec_1[i]=res[[1]][1]
    # vec_2[i]=res[[2]][1]/res[[2]][2]
    # }

  #hist(vec_1,breaks = 15)
  #hist(vec_2,breaks = 30,xlim=c(0,1.5))
  return(prop.table(table(vec_2 >=1.00)))

}

シミュレーション(B=2000)

女性の大学進学率を変えてシミュレート

sim_main(M_sample_p = 0.556,F_sample_p =0.482,top= 100)
## Warning: package 'doParallel' was built under R version 3.5.3
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## 
##  FALSE   TRUE 
## 0.8595 0.1405
sim_main(M_sample_p = 0.556,F_sample_p =0.556,top= 100)
## 
##  FALSE   TRUE 
## 0.8655 0.1345
sim_main(M_sample_p = 0.556,F_sample_p =0.600,top= 100)
## 
##  FALSE   TRUE 
## 0.8745 0.1255

女性の浪人忌避傾向を変えてシミュレート

#
sim_main(M_p=0.75,F_p=0.75,top= 100)
## 
##  FALSE   TRUE 
## 0.4955 0.5045
sim_main(M_p=0.75,F_p=0.80,top= 100)
## 
## FALSE  TRUE 
## 0.876 0.124
sim_main(M_p=0.75,F_p=0.85,top= 100)
## 
## FALSE  TRUE 
## 0.995 0.005

もう少し高いレベルじゃないと男女ともに出願しないと仮定

東大医学部の合格偏差値は76.8 くらい?

pnorm(q=70,mean=50,sd =10) = 0.9963189

pnorm(q=76.8,mean=50,sd =10) = 0.9963189

sim_main(n=550000,M_p=pnorm(q=75,mean=50,sd =10),F_p=pnorm(q=75,mean=50,sd =10),top= 100)
## 
##  FALSE   TRUE 
## 0.4945 0.5055
sim_main(n=550000,M_p=pnorm(q=75,mean=50,sd =10),F_p=pnorm(q=76,mean=50,sd =10),top= 100)
## 
##  FALSE   TRUE 
## 0.9295 0.0705
sim_main(n=550000,M_p=pnorm(q=75,mean=50,sd =10),F_p=pnorm(q=77,mean=50,sd =10),top= 100)
## 
## FALSE  TRUE 
## 0.999 0.001
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17763)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Japanese_Japan.932  LC_CTYPE=Japanese_Japan.932   
## [3] LC_MONETARY=Japanese_Japan.932 LC_NUMERIC=C                  
## [5] LC_TIME=Japanese_Japan.932    
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] doParallel_1.0.14 iterators_1.0.10  foreach_1.4.4    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.18     codetools_0.2-15 digest_0.6.18    rprojroot_1.3-2 
##  [5] backports_1.1.3  magrittr_1.5     evaluate_0.11    stringi_1.2.4   
##  [9] rmarkdown_1.10   tools_3.5.1      stringr_1.3.1    yaml_2.2.0      
## [13] compiler_3.5.1   htmltools_0.3.6  knitr_1.20