Created at 2017-11-18 01:39:36.

1 ラブライブ! サンシャイン!!

ラブライブ! サンシャイン!! とは，9人の女の子が出てくるアニメである．9人の女の子たちがラブライブを目指して頑張る姿を描いている．

1.1 果南の出番がない

しかしながら，2016年に放送されたアニメ第1期では，松浦果南のセリフがまったくない回があった．中の人である諏訪ななかも以下のようなツイートを残している．

まもなく第6話です🐬🌊
今日は出るかなん？？？
宜しくお願いします〜💚 https://t.co/eQsni4IODM
— 諏訪ななかofficial (@suwananaka) 2016年8月6日

今週も果南は……… #lovelive_sunshine
— 諏訪ななかofficial (@suwananaka) 2016年8月13日

1.2 果南のセリフがこんなに少ないわけがない

果南のセリフ回数を実際にアニメを視聴してカウントしたが，3,5,7話でセリフが一度もなかった．

library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

txt <- "
       高海千歌 桜内梨子 渡辺曜 津島善子 国木田花丸 黒澤ルビィ 松浦果南 黒澤ダイヤ 小原鞠莉
第1話       133       34     48       13         10          8        8         17        1
第2話       115       65     39        2          7         10        5         18        0
第3話        94       68     68        3          5          9        0         11       27
第4話        59       30     33        1         58         77        9         12        9
第5話       102       87     20       65         35         26        0          8        3
第6話        63       44     31       21         18         39        3         24       19
第7話        73       50     21       28         27         40        0         14        4
第8話        55       34     23       16         20         16       13         19       13
第9話        46       21     24       12         10         23       49         24       44
第10話       64       36     29       24         15         27       20         41       22
第11話       53       19     72        9         14         12       16         18       27
第12話       75       28     30       10         12         24       26         31       15
第13話       63       28     23       18         16         18       15         19       15
"
dat <- read.table(text=txt)
# 各メンバーのイメージカラー
cols <- c("orange", "pink", "skyblue", "grey", "yellow3", "deeppink2", "lightseagreen", "red", "purple")
f <- list.files(pattern="[0-9].jpg")
library(jpeg)
# 適当なところから png を取ってきておく。
pics <- vector("list", 9)
j <- 0
for(i in seq(f)){
    pics[[i]] <- readJPEG(f[i], native=TRUE)
}
ra <- 1 #原点に近いところが潰れるので拡大したかったけど、等倍でやった。
xy0 <- sapply(pics, dim)[1:2, ]/3 #pixel
rownames(xy0) <- c("height", "width")
s0 <- 0.00135 #拡大縮小率
par(mar=c(5, 5, 4, 6), las=1, cex.lab=1.5)
y <- t(dat/rowSums(dat))
colnames(y) <- seq(ncol(y))
b <- barplot(y, col=cols, xlab="話数", ylab="全セリフに占める割合")
p <- par()$usr
px <- p[2] - p[1]
py <- p[4] - p[3]
#text(p[2]-0.7, cumsum(y[,ncol(y)])-y[,ncol(y)]/2, rownames(y), pos=4, xpd=TRUE, col=cols, font=2)
text(b, p[4], rowSums(dat), pos=3, xpd=TRUE)
mtext("総セリフ回数", 3, line=2, cex=1.5)
lay0 <- cbind(par()$usr[2], c(cumsum(c(0, y[-9,13])) + cumsum(y[,13]))/2)
for(j in seq(pics)){
  xleft=lay0[j, 1]*ra - xy0[2, j]/2*s0*px/py
  ybottom=lay0[j, 2]*ra - xy0[1, j]/2*s0*py
  xright=lay0[j, 1]*ra + xy0[2, j]/2*s0*px/py
  ytop=lay0[j, 2]*ra + xy0[1, j]/2*s0*py
  rasterImage(image=pics[[j]], xleft=xleft, ybottom=ybottom, xright=xright, ytop=ytop, xpd=TRUE)
}
text(xright, cumsum(y[,ncol(y)])-y[,ncol(y)]/2, rownames(y), pos=4, xpd=TRUE, col=cols, font=2)

2 モデル

メンバーたちのセリフカウントデータから，各メンバーのセリフ回数を推定する．

推定には以下のモデルを想定する．

メンバー内で占める割合 \(\theta_i, \displaystyle\sum_{i}^9\theta_i=1\) を用意し，セリフ回数は多項分布から得られると仮定する．
セリフ回数 \(N_i\) を決めるパラメータ\(\mu_i\) を決め，セリフ回数は負の二項分布から得られると仮定する．

2.1 メンバー内で占める割合

メンバー\(i=\{1,\dots,9\}\) が各話 \(t=\{1,\dots,13\}\) でsimplex \(0\leq \theta_{i,t} \leq 1, \displaystyle\sum_{i}^9\theta_{i,t}=1\) ずつ，セリフを専有するとする．

セリフ回数\(N_{i,t}\) は，各確率\(\theta_{i,t}\) に応じて多項分布\(\textrm{multinomial}(\theta_{i,t})\) からサンプリングされるとすると，コードは以下のようになる．

model01 <- "
data{
  int<lower=0> N;                       # キャラ人数
  int<lower=0> V;                       # 話数
  int<lower=0, upper=1000> serif[V, N]; # セリフの数
}
parameters{
  simplex[N] theta[V];                  # 総和1の得票率
  real<lower=0, upper=1000> lambda;     # 1話あたりのセリフ総数の平均
}
model{
  for(v in 1:V){
    sum(serif[v,]) ~ poisson(lambda);
    serif[v,] ~ multinomial(theta[v]);  # 多項分布からサンプリング
  }
}
generated quantities{
  int s[V, N];                          # セリフ数
  int<lower=10> s_all[V];               # 1話ごとの総セリフ数
  for(v in 1:V){
    s_all[v] <- poisson_rng(lambda);
    s[v,] <- multinomial_rng(theta[v], s_all[v]);
  }
}
"

standata <- list(N=ncol(dat), V=nrow(dat), serif=dat)
stanmodel01 <- stan_model(model_code=model01)
fit01 <- sampling(stanmodel01, data=standata, chains=4, warmup=1000, iter=2000, seed=1234)

2.2 セリフ回数

各話のセリフ回数は適当な分布からサンプリングされると仮定する．

正規分布でもよいが，各話ごとにスポットの当てられるキャラがいると考えると，そのキャラはその回だとセリフ数が増えるし，逆にあまりスポットが当たらない場合はセリフ数が減る．この増減は同じ確率ではなさそうだと思えば，左右非対称だったり，裾が重い分布を使ってみてもいいかもしれない．ここでは特に理由もなく負の二項分布を採用してみる．

Ns <- 0:25
Nm <- 10
d1 <- dpois(Ns, Nm)
d2 <- dnbinom(Ns, size=max(Ns), mu=Nm)
dcols <- c("blue", "green3")
par(mar=c(5, 5, 2, 2), las=1, cex.lab=1.5)
plot(Ns, d1, type="o", xlab="セリフ回数", ylab="確率密度", col=dcols[1], pch=15, lwd=3)
lines(Ns, d2, col=dcols[2], type="o", pch=15, lwd=3)
legend("topright", legend=c("ポアソン分布", "負の二項分布"), col=dcols, pch=15, cex=1.6)

Rstan では負の二項分布を使うときに，scale paraemter \(\alpha, \beta\) を使う．

\(N_i\sim \textrm{neg_binomial}(\alpha_i, \beta_i)\)

ここで，平均セリフ回数\(\mu_i\) は，\(\mu_i=\frac{\alpha_i}{\beta_i}\) となる．

model02 <- "
data{
  int<lower=0> N;
  int<lower=0> V;
  int<lower=0, upper=1000> serif[V, N];
}
parameters{
  vector<lower=0, upper=1000>[N] alpha; # 負の二項分布の平均
  vector<lower=0, upper=1000>[N] beta;  # 負の二項分布の分散
  real<lower=0, upper=1000> lambda;
}
model{
  for(v in 1:V){
    sum(serif[v,]) ~ poisson(lambda);
    serif[v,] ~ neg_binomial(alpha, beta);
  }
}
generated quantities{
  int s[V, N];
  int<lower=10> s_all[V];
  for(v in 1:V){
    s_all[v] <- poisson_rng(lambda);
    for(n in 1:N){
      s[v, n] <- neg_binomial_rng(alpha[n], beta[n]);
    }
  }
}
"

stanmodel02 <- stan_model(model_code=model02)
fit02 <- sampling(stanmodel02, data=standata, chains=4, warmup=1000, iter=2000, seed=1234)

3 結果

3.1 モデル1 で果南が最下位の確率


0	0	0

6.2	0.45	0

82.55	0.025	10.775

82.55% もの確率で，果南のセリフの回数がメンバー内で最下位となる．

3.2 各話で果南が最下位になる確率

各話ごとでの最下位になる確率でも，果南は3,5,6,7話で非常に高かった．アニメ開始の3話までは導入部分であり，千歌を中心とした2年生メンバーばかりがセリフを発していた．8話あたりから果南の休学話が出てきたようなきがして，このあたりから3年生組のセリフが多かった．後半では1年生組のセリフが少なくなり，花丸のセリフ回数が最下位になる可能性が高くなっていた．

3.3 モデル2 で果南が最下位になる確率


0	0	0

9.6	1.8	0

64.2	0.275	24.125

モデル2 では果南のセリフ回数が最下位になる確率は64.2% となっており，モデル1 よりかは緩和されていた．

# 負の二項分布モデル
y <- ex$alpha/ex$beta
colnames(y) <- colnames(dat)
par(mar=c(5, 4, 3, 6), las=1)
plot(0, xlab="", ylab="", main="負の二項分布モデル", las=1, pch=16, type="n", xlim=c(0, max(y)), ylim=c(0.8, ncol(dat)+0.4), yaxt="n")
abline(h=seq(ncol(dat)), lty=3, col=grey(0.4))
for(i in seq(ncol(dat))){
  vioplot2(y[,i], at=i, col=cols[i], horizontal=TRUE, add=TRUE)
}
points(unlist(dat), rep(1:ncol(dat), each=nrow(dat))-0.2, pch=16, col=rep(cols, each=nrow(dat)))
p <- par()$usr
px <- p[2] - p[1]
py <- p[4] - p[3]
text(p[2], seq(ncol(dat)), colnames(dat), pos=4, xpd=TRUE, col=cols, font=2)
mtext("平均セリフ回数", 1, line=2.5, cex=1.5)
lay0 <- cbind(p[1], seq(nrow(res)))
s0 <- 0.018
for(j in seq(pics)){
  xleft=lay0[j, 1]*ra - xy0[2, j]*s0*px/py
  ybottom=lay0[j, 2]*ra - xy0[1, j]/2*s0
  xright=lay0[j, 1]*ra# + xy0[2, j]*s0*px/py
  ytop=lay0[j, 2]*ra + xy0[1, j]/2*s0
  rasterImage(image=pics[[j]], xleft=xleft, ybottom=ybottom, xright=xright, ytop=ytop, xpd=TRUE)
}

千歌は主人公であるため，セリフ回数が非常に多かった．他の2年生組もセリフ回数は多かった．

果南は他のメンバーたちと比べて，セリフ回数の分布は0 に寄っているため，最下位にはなりやすい．ここで注目したいのが，善子は意外とセリフ回数が少なく，上の表でも最下位になる確率が9.6% あることだった．

4 まとめ

果南のセリフは少なかった．

2年生以外は基本的に少なかった．

善子（ヨハネ）が意外と少なかった．

俺の果南のセリフがこんなに少ないわけがない

@Med_KU