library(tidyverse)
See https://github.com/benpeloquin7/uid_rsa_conversations/blob/master/models/simulation.py for modeling framework – an extension of Levy (2018) / an alternative framework to the fixed point analysis.
Negative correlation arises in populations of RSA agents but not non-RSA agents.
fp_rsa_100 <- "../outputs/agent-100-full.csv" # not permuted conversation partners
fp_no_rsa_100 <- "../outputs/agent-100-no-rsa-full.csv" # not permuted conversation partners
fp_rsa_10 <- "../outputs/agent-10-full.csv" # 10 agents
fp_no_rsa_10 <- "../outputs/agent-10-no-rsa-full.csv"
fp_rsa_6 <- "../outputs/agent-6-full.csv" # 6 agents
fp_no_rsa_6 <- "../outputs/agent-6-no-rsa-full.csv"
fp_rsa_4 <- "../outputs/agent-4-full.csv" # 4 agents
fp_no_rsa_4 <- "../outputs/agent-4-no-rsa-full.csv"
fp_rsa_2 <- "../outputs/agent-2-full.csv" # 2 agents
fp_no_rsa_2 <- "../outputs/agent-2-no-rsa-full.csv"
fp_rsa_1 <- "../outputs/agent-1-full.csv" # 1 agents
fp_no_rsa_1 <- "../outputs/agent-1-no-rsa-full.csv"
df <- rbind(
# read.csv(fp_rsa_100) %>% mutate(use_rsa=TRUE),
# read.csv(fp_no_rsa_100) %>% mutate(use_rsa=FALSE),
read.csv(fp_rsa_10) %>% mutate(use_rsa=TRUE),
read.csv(fp_no_rsa_10) %>% mutate(use_rsa=FALSE),
read.csv(fp_rsa_6) %>% mutate(use_rsa=TRUE),
read.csv(fp_no_rsa_6) %>% mutate(use_rsa=FALSE),
read.csv(fp_rsa_4) %>% mutate(use_rsa=TRUE),
read.csv(fp_no_rsa_4) %>% mutate(use_rsa=FALSE),
read.csv(fp_rsa_2) %>% mutate(use_rsa=TRUE),
read.csv(fp_no_rsa_2) %>% mutate(use_rsa=FALSE),
read.csv(fp_rsa_1) %>% mutate(use_rsa=TRUE),
read.csv(fp_no_rsa_1) %>% mutate(use_rsa=FALSE)
)
df_pop <- df %>%
filter(id=='pop')
## Warning: package 'bindrcpp' was built under R version 3.4.4
df_no_pop <- df %>%
filter(id!='pop')
Preprocessing
df_stable <- df_pop[!is.na(df_pop$r), ]
# Get max rounds by id, sim_id
max_rounds <- df_stable %>%
group_by(use_rsa, n_agents, id, sim_id) %>%
summarise(max_round=max(round))
get_max_round <- function(use_rsa_, n_agents_, id_, sim_id_) {
subset(max_rounds, (use_rsa==use_rsa_ & n_agents==n_agents_ & id==id_ & sim_id==sim_id_))$max_round
}
# Attempt with mapply -- note this is slow running with we don't restrict to id==pop (for larger speaker populations).
max_rounds_list <- mapply(get_max_round, df_stable$use_rsa, df_stable$n_agents, df_stable$id, df_stable$sim_id)
# Attempt with purrr:pmap
# max_rounds_list <- df_stable %>%
# select(use_rsa, n_agents, id, sim_id) %>%
# purrr::pmap(., function(use_rsa, n_agents, id, sim_id) {get_max_round(use_rsa, n_agents, id, sim_id)}) %>%
# unlist()
assertthat::are_equal(length(max_rounds_list), nrow(df_stable))
## [1] TRUE
# Assign max round
df_stable$max_round <- max_rounds_list
df_stable <- df_stable %>%
mutate(stable=max_round==n_rounds-1,
is_final_round=max_round==round)
r Trajectoriesdf_stable %>%
filter(id=='pop') %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=round, y=r, lty=use_rsa, col=as.factor(sim_id))) +
geom_line(alpha=0.6) +
ylim(-1, 1) +
theme_classic() +
theme(legend.position='none') +
facet_wrap(n_agents~use_rsa, nrow=5)
that-rate Trajectoriesdf_stable %>%
filter(id=='pop') %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=round, y=that_rate, col=as.factor(sim_id))) +
geom_line(alpha=0.6) +
ylim(0, 1) +
theme_classic() +
theme(legend.position='none') +
facet_wrap(n_agents~use_rsa, nrow=5)
that-rate distrdf_stable %>%
filter(is_final_round, id=='pop') %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=that_rate,y=..density..)) +
geom_histogram(bins=32) +
scale_x_continuous(limits=c(-0.05,1.05)) +
ylab("Probability density") +
xlab(expression(paste("Marginal frequency of optional marker ", t))) +
theme_classic() +
facet_wrap(n_agents~use_rsa, nrow=5)
## Warning: Removed 10 rows containing missing values (geom_bar).
r distrdf_stable %>%
filter(is_final_round, id=='pop') %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=r, y=..density.., fill=as.factor(n_agents))) +
geom_vline(xintercept=0.0, lty=2, alpha=0.3) +
geom_histogram(bins=42, alpha=0.9) +
scale_x_continuous(limits=c(-1.05,1.05)) +
ylab("Probability density") +
xlab("Pearson correlation between\nphrase onset & t probabilities") +
theme_classic() +
facet_grid(n_agents~use_rsa)
this is actually really interesting – the effect is not a fun
df_stable %>%
filter(is_final_round) %>%
group_by(use_rsa, n_agents) %>%
summarise(mean_r=mean(r),
sd_r=sd(r),
n=n(),
ci_hi=mean_r+qnorm(0.975)*sd_r/sqrt(n),
ci_lo=mean_r+qnorm(0.025)*sd_r/sqrt(n)) %>%
ggplot(aes(x=as.factor(n_agents), y=mean_r, fill=use_rsa)) +
geom_bar(stat='identity', position='dodge') +
geom_errorbar(aes(ymin=ci_lo, ymax=ci_hi), width=0.2, position=position_dodge(0.9)) +
ylim(-0.3, 0.1) +
theme_classic()
df_stable %>%
filter(is_final_round) %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=as.factor(n_agents), y=r, col=use_rsa, fill=use_rsa)) +
geom_hline(yintercept=0.0, lty=2, alpha=0.2) +
geom_point(aes(col=use_rsa), alpha=0.4) +
# geom_boxplot(alpha=0) +
geom_violin(alpha=0.3) +
coord_flip() +
theme_classic() +
facet_wrap(~use_rsa)
Average lexicon entropy decreases across RSA conversations.
df_pop %>%
group_by(use_rsa, n_agents, round) %>%
summarise(mean_entropy=mean(entropy),
sd_entropy=sd(entropy),
n=n(),
ci_hi=mean_entropy+qnorm(0.975)/sqrt(n),
ci_lo=mean_entropy+qnorm(0.025)/sqrt(n)) %>%
ungroup() %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=round, y=mean_entropy, col=use_rsa)) +
geom_line() +
geom_errorbar(aes(ymin=ci_lo, ymax=ci_hi), alpha=0.3) +
theme_classic() +
facet_wrap(~n_agents)
df_stable %>%
filter(is_final_round) %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=as.factor(n_agents), y=entropy, fill=use_rsa)) +
geom_boxplot() +
theme_classic()
Decerease in lexicon entropy.
df_stable %>%
filter(is_final_round, id=='pop') %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
select(use_rsa, n_agents, sim_id, entropy) %>%
spread(use_rsa, entropy) %>%
mutate(diff=rsa-`no-rsa`) %>%
group_by(n_agents) %>%
ggplot(aes(x=as.factor(n_agents), y=diff, fill=as.factor(n_agents))) +
geom_hline(yintercept=0, lty=2, alpha=0.4, col='red') +
geom_violin(aes(fill=as.factor(n_agents)), alpha=0.4, trim=FALSE) +
geom_point(alpha=0.5) +
coord_flip() +
theme_classic()
## Warning: Removed 8 rows containing non-finite values (stat_ydensity).
## Warning: Removed 8 rows containing missing values (geom_point).
Due to lower overall entropy individual speakers lexicons converge more closely, perhaps as a function of populations size.
df_no_pop %>%
group_by(use_rsa, n_agents, round) %>%
summarise(mean_KL=mean(KL),
sd_KL=sd(KL),
n=n(),
ci_hi=mean_KL+qnorm(0.975)/sqrt(n),
ci_lo=mean_KL+qnorm(0.025)/sqrt(n)) %>%
ungroup() %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=round, y=mean_KL, col=as.factor(n_agents))) +
geom_line() +
# geom_errorbar(aes(ymin=ci_lo, ymax=ci_hi)) +
ylim(0, 0.3) +
theme_classic() +
facet_wrap(~use_rsa)
B-prob example (one simulation). What should aggregate statistic be?
The population converges on a shared distribution over relative clauses (worth thinking as need-probabilities).
df_no_pop %>%
filter(sim_id %in% c(10)) %>%
ggplot(aes(x=round, y=current_B_prob, col=as.factor(paste0(id, '-', sim_id)))) +
geom_line() +
theme_classic() +
facet_grid(use_rsa~n_agents)
t-prob example (one simulation). What should aggregate statistic be?
However, there is still lots of individual-level variation in terms of optinal marking for RSA agents…
df_no_pop %>%
filter(sim_id %in% c(10)) %>%
ggplot(aes(x=round, y=current_t_prob, col=as.factor(paste0(id, '-', sim_id)))) +
geom_line() +
theme_classic() +
facet_grid(use_rsa~n_agents)
Variance within a population optional-t usage increases for RSA agents.
How to interpret this? As the conversations proceed the within population variance of optional-t usage increases for RSA speakers and declines for non-rsa speakers.
df_no_pop %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
group_by(use_rsa, n_agents, id, round) %>%
summarise(mean_current_t_probs=mean(current_t_prob),
sd_current_t_probs=sd(current_t_prob)^2) %>%
group_by(use_rsa, n_agents, round) %>%
summarise(mean_sd_current_t_probs=mean(sd_current_t_probs),
sd_sd_current_t_probs=sd(sd_current_t_probs),
n=n(),
ci_lo=mean_sd_current_t_probs+qnorm(0.025)*sd_sd_current_t_probs/sqrt(n),
ci_hi=mean_sd_current_t_probs+qnorm(0.975)*sd_sd_current_t_probs/sqrt(n)) %>%
ggplot(aes(x=round, y=mean_sd_current_t_probs, col=as.factor(n_agents))) +
geom_errorbar(aes(ymin=ci_lo, ymax=ci_hi), alpha=0.4) +
geom_line() +
theme_classic() +
facet_grid(use_rsa~., scales="free")
## Warning: Removed 200 rows containing missing values (geom_errorbar).
Note the facet numbers are not meaningful… should show this another way.
sims <- sample(unique(df_stable$sim_id), min(length(unique(df_stable$sim_id)), 16))
df_stable %>%
filter(sim_id %in% sims, id=='pop') %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=round, y=that_rate, lty=use_rsa, col=as.factor(n_agents))) +
geom_line(alpha=0.9) +
ylim(0, 1) +
ggtitle("Sample that-rate trajectories") +
facet_wrap(~sim_id) +
theme_classic()
df_stable %>%
filter(sim_id %in% sims, id=='pop') %>%
mutate(use_rsa=factor(ifelse(use_rsa, 'rsa', 'no-rsa'), levels=c('rsa', 'no-rsa'))) %>%
ggplot(aes(x=round, y=r, lty=use_rsa, col=as.factor(n_agents))) +
geom_line(alpha=0.9) +
ylim(-1, 1) +
ggtitle("Sample r trajectories") +
facet_wrap(~sim_id) +
theme_classic()
summary(lm(r~as.factor(n_agents)+use_rsa, data=df_stable))
##
## Call:
## lm(formula = r ~ as.factor(n_agents) + use_rsa, data = df_stable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86563 -0.10495 0.00929 0.12117 0.89316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.038476 0.001552 24.79 <2e-16 ***
## as.factor(n_agents)2 -0.023040 0.002020 -11.40 <2e-16 ***
## as.factor(n_agents)4 -0.030702 0.001999 -15.36 <2e-16 ***
## as.factor(n_agents)6 -0.048918 0.001995 -24.52 <2e-16 ***
## as.factor(n_agents)10 -0.039181 0.001990 -19.69 <2e-16 ***
## use_rsaTRUE -0.171624 0.001244 -137.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1912 on 94903 degrees of freedom
## Multiple R-squared: 0.1745, Adjusted R-squared: 0.1744
## F-statistic: 4012 on 5 and 94903 DF, p-value: < 2.2e-16
# R. Levy's preprocessing
# dat$stable <- with(dat,thatrate > 0.001 & thatrate < 0.999)
# dat <- subset(dat, ! (k==1.0 & c==0.0))
# dat.summary <- dat %>% group_by(k,c) %>%
# dplyr:::summarise(stable=mean(stable),r=mean(r))
# This preprocessing is only valid for stable optionality plot
df_preprocessed <- df %>%
filter(al_round, id!='pop') %>%
filter(k != 1.0, c != 0.0) %>%
group_by(id, k, c) %>%
summarise(stable=mean(is_stable), r=mean(r))
stables <- df %>%
filter(is_final_round) %>%
mutate(stable=ifelse(that_rate > 0.1 & that_rate < 0.9, TRUE, FALSE)) %>%
filter(stable)
stables <- stables$sim_id
# Plot all trajectories
df_preprocessed %>%
filter(id!='pop') %>%
ggplot(aes(k,c)) +
geom_tile(aes(fill=r),colour="white") +
labs(y=expression(paste("String length cost parameter ", c)), fill="stable\noptionality\nrate") +
theme_classic() +
scale_x_continuous(name=expression(paste("Nonuniformity penalization parameter ",k)),
breaks=seq(0,2,by=0.2))