rm(list = ls())
setwd('/Users/benpeloquin/Desktop/Projects/Pragmods/models/')
source("matrices.R")
source("agents.R")
library(rrrsa)
library(dplyr)
library(tidyr)
library(ggplot2)
library(parallel)
library(stats)
Helper functions for data pre-processing and combining (from sims.R).
#####
##### normalizeBy()
##### -------------
##### normalize matrix by rows or cols, maintaining dimnames
#####
normalizeBy <- function(m, rows = TRUE) {
colNames <- colnames(m)
rowNames <- rownames(m)
normedM <- apply(m, MARGIN = ifelse(rows, 1, 2), rrrsa::rsa.normVec)
## apply() over MARGIN = 1 returns the transpose, so flip back for row-normalization
if (rows) normedM <- t(normedM)
colnames(normedM) <- colNames
rownames(normedM) <- rowNames
normedM
}
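## Quick usage sketch (toy matrix, not experiment data; assumes rrrsa::rsa.normVec
## normalizes a vector to sum to 1, as used above): with rows = FALSE each column
## should sum to 1 and dimnames should be preserved.
toyM <- matrix(1:6, nrow = 2, dimnames = list(c("o1", "o2"), c("q1", "q2", "q3")))
colSums(normalizeBy(toyM, rows = FALSE))  ## q1 q2 q3 -> 1 1 1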
#####
##### inGlobalEnv()
##### -------------
##### boolean object presence in global env
#####
inGlobalEnv <- function(item) {
item %in% ls(envir = .GlobalEnv)
}
#####
##### getLiteralSemantics()
##### ---------------------
##### maps (matrix name, object, query) to a literal-semantics value
##### from the normalized matrices
##### called in mutate() within a dplyr chain
#####
getLiteralSemantics <- function(matrixName, object, query) {
if (!(inGlobalEnv("simpleM") | inGlobalEnv("complexM") | inGlobalEnv("oddmanM") | inGlobalEnv("twinsM"))) {
stop("Missing normalized matrices, please run data pre-processing block")
}
if (matrixName == "simple") {
simpleM[as.character(object), as.character(query)]
} else if (matrixName == "complex") {
complexM[as.character(object), as.character(query)]
} else if (matrixName == "oddman") {
oddmanM[as.character(object), as.character(query)]
} else if (matrixName == "twins") {
twinsM[as.character(object), as.character(query)]
} else {
stop("Error in getLiteralSemantics()")
}
}
#####
##### completeQueries()
##### ---------------------
##### add "query" words for each expt
##### this is necessary for running with rrrsa::runDf
#####
completeQueries <- function(d) {
## store data
newD <- d
currMatrix <- unique(as.character(d$matrix))
## meta-data
allQueries <- c("glasses", "hat", "mustache")
allObjects <- c("foil", "logical", "target")
## The current query
currQuery <- unique(as.character(d$query))
neededQueries <- setdiff(allQueries, currQuery)
## Add in new queries
for (q in neededQueries) {
df <- data.frame(cond = as.vector(d$cond),
expt = as.vector(d$expt),
matrix = as.vector(d$matrix),
prior = as.vector(d$prior),
query = rep(q, 3),
object = as.vector(d$object),
count = rep(NA, 3),
p = rep(NA, 3),
n = as.vector(d$n),
cil = rep(NA, 3),
cih = rep(NA, 3),
priorType = as.vector(d$priorType),
priorValue = as.vector(d$priorValue),
grouper = as.vector(d$grouper))
newD <- plyr::rbind.fill(newD, df)
if (currMatrix == "simple") return(newD)
}
newD
}
#####
##### convertListToDF()
##### -----------------
##### Convert a list where each element represents
##### data for a different grouping variable
##### useful with simulations run below
#####
convertListToDF <- function(li, rows = TRUE) {
df <- data.frame()
nItems <- seq(1, length(li))
if (rows) {
for (i in nItems) {
df <- rbind(df, as.data.frame(li[[i]]))
}
} else {
df <- data.frame(matrix(nrow = length(li[[1]]), ncol = 1))
for (i in nItems) {
df <- cbind(df, as.data.frame(li[[i]]))
}
return(df[, -1])
}
df
}
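## Quick sketch (toy list, not simulation output): rows = TRUE stacks the list
## elements row-wise into a single data frame.
convertListToDF(list(data.frame(x = 1:2), data.frame(x = 3:4)))  ## 4 rows, one column x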
#####
##### rrrsa_ready_df()
##### -------------
##### process data for use with rrrsa
##### NOTE :: this is not a generic fn
##### and makes certain assumptions about column naming
##### (eg. colnames -> object, expt, n, matrix)
#####
rrrsa_ready_df <- function(d, foil_name = "foil", logical_name = "logical", target_name = "target") {
## Initial pre-processing before adding in new cols and then adding literal semantics
processedD <- d %>%
## Gather priors - we're only interested in the prior for the current object (use the
## standard-evaluation gather_() so column names can be passed as arguments, which keeps
## this compatible with the bootstrapped priors)
gather_("priorType", "priorValue", c(foil_name, logical_name, target_name))
processedD <- processedD %>%
## Filter out any mismatches between current object and priorType
## Do this rowwise with grepl() so it also works with the bootstrap naming conventions
rowwise %>%
filter(grepl(object, priorType)) %>%
rowwise %>%
## Grouping variable concatenates experiment type and sample size (expt-n)
mutate(grouper = paste0(as.character(expt), "-", as.character(n)))
fullData <- plyr::ddply(processedD, .variables = ("grouper"), .fun = completeQueries) %>%
rowwise %>%
## Each row gets a unique literal semantics from the getLiteralSemantics() helper
mutate(speaker.p = getLiteralSemantics(matrix, query = query, object = object)) %>%
arrange(expt, n)
fullData
}
set.seed(1)
ALPHAS <- seq(0, 5, by = 0.1)
DEPTHS <- seq(0, 5)
## This is `d` df from line 128 in sims.Rmd
d <- read.csv("data/models.csv")
## Renamed, normalized matrices from matrices.R
simpleM <- normalizeBy(simple, rows = FALSE)
complexM <- normalizeBy(complex, rows = FALSE)
twinsM <- normalizeBy(twins, rows = FALSE)
oddmanM <- normalizeBy(oddman, rows = FALSE)
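## Spot check (a sketch; assumes the object x query layout indexed by
## getLiteralSemantics() above, i.e. rows are objects, columns are query words):
## after column-normalization every column should sum to 1.
colSums(simpleM)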
NOTE: To work with rrrsa::rsa.runDf, each group needs rows with literal semantics for all of the query words (even the ones that weren't actually queried). Look at group 'baserate-63': only 'glasses' is queried, but we need to add the literal semantics for 'hat' and 'mustache' as well.
fullData <- rrrsa_ready_df(d)
## Pre-sort by grouper
fullData <- fullData %>% arrange(grouper)
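## Sanity check (a sketch): per the note above, group 'baserate-63' should now
## contain rows for all three query words, with NA counts for the padded-in queries.
fullData %>%
filter(grouper == "baserate-63") %>%
select(grouper, query, object, count, speaker.p) %>%
arrange(query, object)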
sims_prior_temp <- lapply(DEPTHS, function(depth) {
plyr::ddply(fullData, .variables = c("grouper"), rsa.runDf,
quantityVarName = "object", semanticsVarName = "speaker.p",
itemVarName = "query", priorsVarName = "priorValue", depth = depth, usePriorEveryRecurse = FALSE) %>%
rowwise %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
depth = depth) %>%
filter(tuningData == "keep")
})
sims_prior <- convertListToDF(sims_prior_temp)
sims_prior %>%
filter(depth < 3) %>%
ggplot(., aes(x = preds, y = p, col = expt, pch = object)) +
geom_pointrange(aes(ymin = cil, ymax = cih)) +
xlim(c(0,1)) + ylim(c(0,1)) +
facet_wrap(~depth) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2) +
ggtitle("Priors model sims, alpha = 1")
ggplot(sims_prior, aes(x = preds, y = p, col = expt, pch = object)) +
geom_pointrange(aes(ymin = cil, ymax = cih)) +
xlim(c(0,1)) + ylim(c(0,1)) +
facet_wrap(~depth) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2) +
ggtitle("Priors model sims, alpha = 1")
sims_prior %>%
group_by(depth) %>%
summarise("cor" = cor(p, preds)) %>%
mutate("alpha" = 1)
## Need to identify indices we want for performance comparisons
fullData <- fullData %>%
rowwise %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"))
keepIndices <- c("keep", "throwout")
compareIndices <- which(fullData$tuningData == "keep")
## Parallelize fitting
priorFit <- mclapply(DEPTHS, function(i) { ## parallelize over depths
rsa.tuneDepthAlpha(fullData, quantityVarName = "object",
semanticsVarName = "speaker.p", itemVarName = "query",
priorsVarName = "priorValue", groupName = "grouper",
compareDataName = "p", compareIndices = compareIndices,
depth = i, alphas = ALPHAS, usePriorEveryRecurse = FALSE)
},
mc.cores = detectCores())
aggregateTuneData <- convertListToDF(priorFit)
## Get best model performance by alpha, depth
maxCors <- aggregateTuneData %>%
group_by(depth) %>%
summarize(maxCor = max(cor),
alpha = alpha[which.max(cor)])
aggregateTuneData_plot <- aggregateTuneData %>%
mutate(Recursive_depth = as.factor(depth))
## Plot tuning (Plot adapted for personal_site)
ggplot(aggregateTuneData_plot, aes(x=alpha, y=cor, col=Recursive_depth)) +
geom_point(alpha=0.6, size=4) +
geom_vline(xintercept=maxCors$alpha, col="grey", lty="dotdash")+
ylab("Correlation") +
scale_x_continuous(breaks = 0:5, "Rationality (alpha level)") +
ggthemes::theme_solarized() +
guides(col = guide_legend(nrow = 1, byrow = FALSE)) +
theme(legend.position = "bottom")
depths_param_data <- maxCors %>%
arrange(depth)
best_prior_models_fitted_alpha <- convertListToDF(lapply(DEPTHS, function(depth) {
plyr::ddply(fullData, .variables = c("grouper"), rsa.runDf,
quantityVarName = "object", semanticsVarName = "speaker.p",
priors = "priorValue", usePriorEveryRecurse = FALSE,
itemVarName = "query", depth = depths_param_data$depth[depth + 1], alpha = depths_param_data$alpha[depth + 1]) %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
depth = depth) %>%
filter(tuningData == "keep")
}))
## Quick check
## cor(best_prior_models_fitted_alpha[best_prior_models_fitted_alpha$depth == 1, ]$p, best_prior_models_fitted_alpha[best_prior_models_fitted_alpha$depth == 1, ]$preds)
print("Model performance for recursive depth 0-5 with fitted alphas")
## [1] "Model performance for recursive depth 0-5 with fitted alphas"
maxCors ## performance data
## Source: local data frame [6 x 3]
##
## depth maxCor alpha
## (dbl) (dbl) (dbl)
## 1 0 0.7357369 0.0
## 2 1 0.9453030 3.1
## 3 2 0.9522229 1.4
## 4 3 0.9527394 1.1
## 5 4 0.9523711 1.0
## 6 5 0.9528916 1.0
## plots
ggplot(best_prior_models_fitted_alpha, aes(x = preds, y = p, col = expt, pch = object)) +
geom_pointrange(aes(ymin = cil, ymax = cih)) +
xlim(c(0,1)) + ylim(c(0,1)) +
facet_wrap(~depth, labeller = label_both) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2) +
ggtitle("Model performance for recursive depth 0-5 with fitted alphas")
sims_NoPrior_temp <- lapply(DEPTHS, function(depth) {
plyr::ddply(fullData, .variables = c("grouper"), rsa.runDf,
quantityVarName = "object", semanticsVarName = "speaker.p",
itemVarName = "query", depth = depth) %>%
rowwise %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
depth = depth) %>%
filter(tuningData == "keep")
})
sims_NoPrior <- convertListToDF(sims_NoPrior_temp)
sims_NoPrior %>%
filter(depth < 3) %>%
ggplot(., aes(x = preds, y = p, col = expt, pch = object)) +
geom_pointrange(aes(ymin = cil, ymax = cih)) +
xlim(c(0,1)) + ylim(c(0,1)) +
facet_wrap(~depth) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2)
ggplot(sims_NoPrior, aes(x = preds, y = p, col = expt, pch = object)) +
geom_pointrange(aes(ymin = cil, ymax = cih)) +
xlim(c(0,1)) + ylim(c(0,1)) +
facet_wrap(~depth) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2)
sims_NoPrior %>%
group_by(depth) %>%
summarise("cor" = cor(p, preds)) %>%
mutate("alpha" = 1)
## Check that we have fullData and compareIndices for tuning
if (!(inGlobalEnv("fullData") & inGlobalEnv("compareIndices"))) {
fullData <- fullData %>%
rowwise %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"))
keepIndices <- c("keep", "throwout")
compareIndices <- which(fullData$tuningData == "keep")
}
## Parallelize fitting
noPriorFit <- mclapply(DEPTHS, function(i) { ## parallelize over depths
rsa.tuneDepthAlpha(fullData, quantityVarName = "object",
semanticsVarName = "speaker.p", itemVarName = "query",
compareIndices = compareIndices, compareDataName = "p",
groupName = "grouper",
depth = i, alphas = ALPHAS)
},
mc.cores = detectCores())
aggregateTuneData_noPriors <- convertListToDF(noPriorFit)
## Get best model performance by alpha, depth
maxCors_noPriors <- aggregateTuneData_noPriors %>%
group_by(depth) %>%
summarize(maxCor = max(cor),
alpha = alpha[which.max(cor)])
## Plot tuning
ggplot(aggregateTuneData_noPriors, aes(x = alpha, y = cor, col = as.factor(depth))) +
geom_point() +
ylim(0.75, 1) +
geom_vline(xintercept = maxCors_noPriors$alpha, lty = 4, col = "grey50") +
scale_x_continuous(breaks = 0:5, "alpha") +
ggtitle("Model performance - no priors")
## Warning: Removed 51 rows containing missing values (geom_point).
depths_param_data <- maxCors_noPriors %>%
arrange(depth)
best_noPrior_models_fitted_alpha <- convertListToDF(lapply(DEPTHS, function(depth) { ## parallelize over depths
plyr::ddply(fullData, .variables = c("grouper"), rsa.runDf,
quantityVarName = "object", semanticsVarName = "speaker.p",
usePriorEveryRecurse = FALSE, itemVarName = "query",
depth = depths_param_data$depth[depth + 1], alpha = depths_param_data$alpha[depth + 1]) %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
depth = depth) %>%
filter(tuningData == "keep")
}))
## Quick check
## cor(best_noPrior_models_fitted_alpha[best_noPrior_models_fitted_alpha$depth == 1, ]$p, best_noPrior_models_fitted_alpha[best_noPrior_models_fitted_alpha$depth == 1, ]$preds)
print("Model performance for recursive depth 0-5 with fitted alphas no priors")
## [1] "Model performance for recursive depth 0-5 with fitted alphas no priors"
maxCors_noPriors ## performance data
## Source: local data frame [6 x 3]
##
## depth maxCor alpha
## (dbl) (dbl) (dbl)
## 1 0 0.7357369 0.0
## 2 1 0.9457362 1.5
## 3 2 0.9513002 0.9
## 4 3 0.9520095 0.8
## 5 4 0.9520683 0.7
## 6 5 0.9521724 0.7
## plots
ggplot(best_noPrior_models_fitted_alpha, aes(x = preds, y = p, col = expt, pch = object)) +
geom_pointrange(aes(ymin = cil, ymax = cih)) +
xlim(c(0,1)) + ylim(c(0,1)) +
facet_wrap(~depth, labeller = label_both) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2) +
ggtitle("No-prior model performance for recursive depth 0-5 with fitted alphas")
## combine priors and uniform
sims_prior <- sims_prior %>%
mutate(prior = "prior")
sims_NoPrior <- sims_NoPrior %>%
mutate(prior = "uniform")
allData <- rbind(sims_prior, sims_NoPrior) %>%
mutate(model = paste0("L", depth)) %>%
arrange(prior, query, object, matrix, cond, expt, model)
## Compare with MFs data
mfD <- read.csv("data/modelRuns.csv") %>%
arrange(prior, query, object, matrix, cond, expt, model)
## Add in MF preds check correlation
allData$mdPreds <- mfD$prediction
allData %>%
ggplot(aes(x = preds, y = mdPreds, fill = matrix, size = depth)) +
xlab("rrrsa predictions") +
ylab("MF predictions") +
geom_point(shape = 21, alpha = 0.30)
# ggsave("pragmods.png", path = "~/Desktop/", width=7, height=3)
Helpers for calculating bootstrapped CIs.
#####
##### se()
##### ----------------------
##### given a vector of bootstrap estimates, calculate the std error (sd of the bootstrap distribution) :: sqrt(var(v))
#####
se <- function(v) {
sqrt(var(v))
}
#####
##### ci()
##### ----------------------
##### given a mean, sd, and n, return the mean and normal-approximation error for a confidence interval of width `width`
#####
ci <- function(mu, sd, n, width = 0.95) {
interval <- (1 - width) / 2
error <- (qnorm(1 - interval) * sd) / sqrt(n)
return(c("bstrp_mu" = mu, "bstrp_err" = error))
}
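## Worked sketch of the normal-approximation interval above (made-up numbers):
## error = qnorm(0.975) * sd / sqrt(n) = 1.96 * 0.05 / sqrt(100) ~= 0.0098
ci(mu = 0.6, sd = 0.05, n = 100)  ## bstrp_mu = 0.60, bstrp_err ~= 0.0098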
#####
##### get_se()
##### ----------------------
##### given a df of bootstrapped estimates return std error
#####
get_se <- function(boot_ests) {
apply(as.matrix(boot_ests), 1, se)
}
#####
##### get_ci()
##### ----------------------
##### given a df of bootstrapped estimates return `width` confidence intervals
#####
get_ci <- function(boot_ests, width = 0.95) {
apply(as.matrix(boot_ests), 1, FUN = function(ests) {
mu <- mean(ests)
sd <- sd(ests)
n <- length(ests)
return(ci(mu, sd, n, width))
})
}
#####
##### boot_vec()
##### -------------
##### multinomial bootstrap
##### given a vector of counts, return n bootstrap
##### estimates of proportions
#####
boot_vec <- function(countsVec, n = 1) {
t(normalizeBy(rmultinom(n, sum(countsVec), countsVec), rows = FALSE))
}
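## Sketch with made-up counts: each of the n = 2 rows is one multinomial resample
## converted to proportions (rows sum to 1).
boot_vec(c(foil = 5, logical = 10, target = 25), n = 2)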
#####
##### pragmods_priors_boot()
##### ----------------------
##### custom pragmods wrapper around boot_vec()
##### used with mapply() to create new prior estimates
#####
pragmods_priors_boot <- function(c1, c2, c3, n = 1) {
boot_vec(c("foil_cnts" = c1, "logical_cnts" = c2, "target_cnts" = c3), n)
}
#####
##### add_booted_priors()
##### ----------------------
##### given a df with prior counts for 'foil', 'logical' and 'target'
##### return a new df with n = 1 bootstrap estimates of the priors
##### NOTE :: like pragmods_priors_boot this is not a generic function -
##### it assumes the naming conventions established here ('foil_cnts', 'logical_cnts', 'target_cnts')
#####
add_booted_priors <- function(df) {
res <- convertListToDF(with(df, mapply(pragmods_priors_boot, foil_cnts, logical_cnts, target_cnts, SIMPLIFY = FALSE)))
names(res) <- c("foil_bt", "logical_bt", "target_bt")
cbind(df, res)
}
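## Sketch with made-up counts (column naming follows the convention noted above):
## adds foil_bt / logical_bt / target_bt columns, one bootstrap draw of proportions per row.
toy_counts <- data.frame(foil_cnts = 5, logical_cnts = 10, target_cnts = 25)
add_booted_priors(toy_counts)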
#####
##### run_boot_sims()
##### ---------------------
##### Run `n_sims` simulations with `alpha`, `depth`
##### Expects a df `d_prior_counts` that contains prior counts data
#####
run_boot_sims <- function(d_prior_counts, alpha = 1, depth = 1, n_sims = 100, verbose = TRUE) {
## Display run
if (verbose) {
cat(paste("Running ", n_sims, " simulations with alpha = ", alpha, ""))
}
## Store n bootstrapped simulations
bstrap_sims <- mclapply(seq(1, n_sims), FUN = function(i) {
## Convert to rrrsa df
d_booted_priors <-
rrrsa_ready_df(add_booted_priors(d_prior_counts),
foil_name = "foil_bt", logical_name = "logical_bt", target_name = "target_bt")
## Run rrrsa
res <- plyr::ddply(d_booted_priors, .variables = c("grouper"), rsa.runDf,
quantityVarName = "object", semanticsVarName = "speaker.p",
itemVarName = "query", alpha = alpha, depth = depth,
priorsVarName = "priorValue", usePriorEveryRecurse = FALSE) %>%
rowwise %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
depth = depth) %>%
filter(tuningData == "keep")
## Return predictions
res$preds
},
mc.cores = detectCores())
## Convert simulations to df
pred_sims <- convertListToDF(bstrap_sims, rows = FALSE)
pred_sims
}
## Get prior counts data (from sims.R)
priorCounts <- read.csv("data/prior_counts.csv")
names(priorCounts)[1:3] <- c("foil_cnts", "logical_cnts", "target_cnts")
## Merge with our original data set
d_prior_counts <- merge(d, priorCounts, by = "prior", all = TRUE)
Let's look at the bootstrapped estimates at a single level of recursion. The next two code blocks are useful for verifying that we're measuring the correct quantities.
depth <- 4
d5_bstrap_sims <- run_boot_sims(d_prior_counts, depth = depth, n_sims = 100, verbose = FALSE)
d5_bstrap_sims$simNum <- seq(1, nrow(d5_bstrap_sims))
d5_bstrap_sims_long <- d5_bstrap_sims %>%
gather(itNum, val, -simNum) %>%
select(simNum, val)
## How do the sims look (approximately normally distributed)?
d5_bstrap_sims_long %>%
filter(val != 1 & val != 0 & val != 0.5) %>%
ggplot(aes(val, fill = simNum)) +
geom_density() +
facet_wrap(~simNum)
d5_prior_model_sims <- sims_prior[which(sims_prior$depth == depth), ]
errs <- data.frame()
for (sim_row in seq(1, nrow(d5_bstrap_sims))) {
## Get diff with our estimate
diffs <- d5_prior_model_sims$preds[sim_row] - unlist(d5_bstrap_sims[sim_row, ])
errs <- rbind(errs, diffs)
}
cis <- data.frame(t(apply(errs, MARGIN = 1, FUN = quantile, c(0.05, 0.95))))
names(cis) <- c('low', 'hi')
d5_bstrap_cis <- cbind(d5_prior_model_sims, cis)
## Convert the error quantiles into CI bounds around the point prediction (note the hi/low flip: the lower bound subtracts the upper error quantile)
d5_bstrap_cis <- d5_bstrap_cis %>%
mutate(ci_low = preds - hi,
ci_hi = preds - low)
ggplot(d5_bstrap_cis, aes(x = preds, y = p, col = expt, pch = object)) +
geom_errorbarh(aes(xmin = ci_low, xmax = ci_hi, height = 0.05)) +
geom_errorbar(aes(ymin = cil, ymax = cih, width = 0.05)) +
xlim(c(0,1)) + ylim(c(0,1)) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
facet_wrap(~depth) +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2) +
ggtitle(paste0("Priors model, depth = ", as.character(depth), ", bootatrapped 95% CIs"))
## Run simulations for each depth
## Load saved sims from a previous run with set.seed(1), if available
if (file.exists("sims_prior_bstrap_errs.csv")) {
sims_prior_bstrap_errs <- read.csv("sims_prior_bstrap_errs.csv")
}
## Run the code below if we don't have saved previous sims
## -------------------------------------------------------
if (!inGlobalEnv("sims_prior_bstrap_errs")) {
depth_sims <- mclapply(DEPTHS, FUN = function(depth) {
run_boot_sims(d_prior_counts, depth = depth, n_sims = 500, verbose = FALSE)
}, mc.cores = detectCores())
## For each simulation level
cis <- data.frame() ## df to hold CIs
for (depth in seq(1, length(depth_sims))) {
## Current depth data
curr_data <- data.frame(depth_sims[depth])
errs <- data.frame() ## Hold ci diffs
## For each simulation (row) in the current data frame
for (sim_row in seq(1, nrow(curr_data))) {
## Get diff with our estimate
diffs <- sims_prior[which(sims_prior$depth == depth - 1), ]$preds[sim_row] - unlist(curr_data[sim_row, ])
errs <- rbind(errs, diffs)
}
cis <- rbind(cis, data.frame(t(apply(errs, MARGIN = 1, FUN = quantile, c(0.05, 0.95)))))
}
names(cis) <- c('low', 'hi')
sims_prior_bstrap_errs <- cbind(sims_prior, cis)
## Convert the error quantiles into CI bounds around the point prediction (note the hi/low flip)
sims_prior_bstrap_errs <- sims_prior_bstrap_errs %>%
mutate(ci_low = preds - hi,
ci_hi = preds - low)
}
# write.csv(sims_prior_bstrap_errs, "sims_prior_bstrap_errs.csv")
ggplot(sims_prior_bstrap_errs, aes(x = preds, y = p, col = expt, pch = object)) +
geom_errorbarh(aes(xmin = ci_low, xmax = ci_hi, height = 0.05)) +
geom_errorbar(aes(ymin = cil, ymax = cih, width = 0.05)) +
xlim(c(0,1)) + ylim(c(0,1)) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
facet_wrap(~depth) +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2) +
ggtitle("Priors model, bootstrapped (500 sims) 95% CIs")
sims_prior_bstrap_errs %>%
mutate(ci_width = hi - low,
type = paste0(cond, "-", expt)) %>%
ggplot(aes(x = depth, y = ci_width, col = type)) +
geom_point()
depth_tests <- c("complex", "oddman", "simple", "twins")
priors_tests <- c("baserate", "color", "lang")
full_data_type <- fullData %>%
mutate(type = ifelse(expt %in% depth_tests, "depth_test", "priors_test"))
full_data_priors_tests <- full_data_type %>% filter(type == "priors_test")
full_data_depth_tests <- full_data_type %>% filter(type == "depth_test")
full_data_priors_tests <- full_data_priors_tests %>% arrange(grouper)
full_data_priors_tests <- full_data_priors_tests %>%
rowwise %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"))
keepIndices <- c("keep", "throwout")
compareIndices <- which(full_data_priors_tests$tuningData == "keep")
prior_d <- plyr::ddply(full_data_priors_tests, .variables = c("grouper"), .fun = rsa.runDf,
quantityVarName = "object", semanticsVarName = "speaker.p",
itemVarName = "query", priorsVarName = "priorValue",
usePriorEveryRecurse = FALSE) %>%
filter(tuningData == "keep")
noPrior_d <- plyr::ddply(full_data_priors_tests, .variables = c("grouper"), .fun = rsa.runDf,
quantityVarName = "object", semanticsVarName = "speaker.p",
itemVarName = "query",
usePriorEveryRecurse = FALSE) %>%
filter(tuningData == "keep")
join_priors_noPriors <- cbind(prior_d, noPrior_d$preds)
names(join_priors_noPriors)[length(names(join_priors_noPriors))] <- 'noPrior_preds'
join_priors_noPriors <- join_priors_noPriors %>%
gather(prior_type, value, c(preds, noPrior_preds))
ggplot(join_priors_noPriors, aes(x = value, y = p, col = prior_type)) +
geom_pointrange(aes(ymin = cil, ymax = cih)) +
xlim(c(0,1)) + ylim(c(0,1)) +
facet_wrap(~prior) +
ylab("Proportion choosing target") +
xlab("Model Predictions")
# geom_smooth(method="lm", aes(group=1)) +
# geom_abline(slope = 1, intercept = 0, lty=2) +
# ggtitle("Model performance for recursive depth 0-5 with fitted alphas")
## Parallelize fitting
priorFit <- mclapply(DEPTHS, function(i) { ## parallelize over depths
rsa.tuneDepthAlpha(full_data_priors_tests, quantityVarName = "object",
semanticsVarName = "speaker.p", itemVarName = "query",
priorsVarName = "priorValue", groupName = "grouper",
compareDataName = "p", compareIndices = compareIndices,
depth = i, alphas = ALPHAS, usePriorEveryRecurse = FALSE)
},
mc.cores = detectCores())
aggregateTuneData_priors <- convertListToDF(priorFit)
aggregateTuneData_priors %>%
group_by(depth) %>%
summarize(maxCor = max(cor),
alpha = alpha[which.max(cor)])
## Source: local data frame [6 x 3]
##
## depth maxCor alpha
## (dbl) (dbl) (dbl)
## 1 0 0.6312373 0.0
## 2 1 0.9470066 3.8
## 3 2 0.9469632 1.6
## 4 3 0.9465052 1.3
## 5 4 0.9469024 1.1
## 6 5 0.9459132 1.1
noPriorFit <- mclapply(DEPTHS, function(i) { ## parallelize over depths
rsa.tuneDepthAlpha(full_data_priors_tests, quantityVarName = "object",
semanticsVarName = "speaker.p", itemVarName = "query",
groupName = "grouper",
compareDataName = "p", compareIndices = compareIndices,
depth = i, alphas = ALPHAS, usePriorEveryRecurse = FALSE)
},
mc.cores = detectCores())
aggregateTuneData_noPriors <- convertListToDF(noPriorFit)
aggregateTuneData_noPriors %>%
group_by(depth) %>%
summarize(maxCor = max(cor),
alpha = alpha[which.max(cor)])
## Source: local data frame [6 x 3]
##
## depth maxCor alpha
## (dbl) (dbl) (dbl)
## 1 0 0.6312373 0.0
## 2 1 0.9489123 1.9
## 3 2 0.9488210 1.0
## 4 3 0.9488951 0.8
## 5 4 0.9488517 0.8
## 6 5 0.9487172 0.8
combined_tuning <- left_join(aggregateTuneData_priors, aggregateTuneData_noPriors, by = c("depth", "alpha"))
names(combined_tuning) <- c("priors", "depth", "alpha", "noPriors")
combined_tuning <- combined_tuning %>%
gather(type, correlation, -c(depth, alpha))
ggplot(combined_tuning, aes(x = alpha, y = correlation, col = as.factor(type))) +
geom_point() +
facet_wrap(~depth) +
ggtitle("Impact of priors on model preds with and without priors")
full_data_depth_tests <- full_data_depth_tests %>% arrange(grouper)
full_data_depth_tests <- full_data_depth_tests %>%
rowwise %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"))
keepIndices <- c("keep", "throwout")
compareIndices <- which(full_data_depth_tests$tuningData == "keep")
sims_prior_temp <- lapply(seq(0, 2), function(depth) {
plyr::ddply(full_data_depth_tests, .variables = c("grouper"), rsa.runDf,
quantityVarName = "object", semanticsVarName = "speaker.p",
itemVarName = "query", priorsVarName = "priorValue", depth = depth, usePriorEveryRecurse = FALSE) %>%
rowwise %>%
mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
depth = depth) %>%
filter(tuningData == "keep")
})
sims_prior_temp <- convertListToDF(sims_prior_temp)
sims_prior_temp %>% filter(preds != 0, preds != 1) %>%
ggplot(aes(x = preds, y = p, col = expt, pch = object)) +
geom_pointrange(aes(ymin = cil, ymax = cih)) +
xlim(c(0,1)) + ylim(c(0,1)) +
facet_wrap(expt~depth, labeller = label_both, ncol = 3) +
ylab("Proportion choosing target") +
xlab("Model Predictions") +
geom_smooth(method="lm", aes(group=1)) +
geom_abline(slope = 1, intercept = 0, lty=2) +
ggtitle("Model performance for recursive depth 0-2 with fitted alphas")
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in qt((1 - level)/2, df): NaNs produced
sims_prior_temp %>%
group_by(expt, depth) %>%
summarise(correlation = cor(preds, p))
## Source: local data frame [12 x 3]
## Groups: expt [?]
##
## expt depth correlation
## (fctr) (int) (dbl)
## 1 complex 0 0.8385561
## 2 complex 1 0.9611770
## 3 complex 2 0.9763472
## 4 oddman 0 0.9497125
## 5 oddman 1 0.9763249
## 6 oddman 2 0.9763249
## 7 simple 0 0.9001967
## 8 simple 1 0.9517313
## 9 simple 2 0.9898410
## 10 twins 0 0.9761129
## 11 twins 1 0.9970806
## 12 twins 2 0.9875595
ggplot(allData, aes(x = preds, y = p, col = prior)) +
geom_point(shape = 21, alpha = 0.85) +
facet_grid(model ~ cond)
allData %>%
filter(grouper == 'baserate-72' | grouper == 'color-47') %>%
ggplot(aes(x = preds, y = p, col = prior, shape = as.factor(object))) +
geom_point(alpha = 0.5, size = 4) +
facet_grid(model ~ grouper) +
geom_abline(slope = 1, intercept = 0, lty=2)
It almost looks like the target and logical objects could be swapped… But with more recursion, the priors model begins to improve…
temp_d <- sims_prior %>% filter(grouper == 'baserate-72' | grouper == 'color-47') %>%
arrange(cond, expt, matrix, prior, object, count, p)
temp_d$unif_preds <- sims_NoPrior %>% filter(grouper == 'baserate-72' | grouper == 'color-47') %>%
arrange(cond, expt, matrix, prior, object, count, p) %>%
mutate(unif_preds = preds) %>%
select(unif_preds) %>%
unlist()
temp_d <- temp_d %>%
mutate(delta = preds - unif_preds) %>%
arrange(-abs(delta))
## uniform vs priors
ggplot(temp_d, aes(x = unif_preds, y = preds, col = factor(grouper), size = factor(depth), shape = as.factor(object))) +
geom_point() +
geom_abline(slope = 1, intercept = 0, lty=2)
## Take a look at the data
temp_d
Still looks like these might have been swapped…
allData %>%
filter(grouper == 'baserate-72' | grouper == 'color-47') %>%
mutate(delta = p - preds) %>%
arrange(delta, model, grouper) %>%
select(cond, delta, p, preds, grouper, query, object, model, priorType, priorValue)
Save this for future generations of RSA pirates.
############################################################################
#### SAVE - ORIGINAL DATA PROCESSING
####
processedD <- d %>%
## Gather priors - we're only interested in prior for current object
gather(priorType, priorValue, c(foil, logical, target)) %>%
## Filter out any mismatches between current object and priorType
filter(object == priorType) %>%
rowwise %>%
## Grouping variable concatenates experiment type and sample size (expt-n)
mutate(grouper = paste0(as.character(expt), "-", as.character(n)))
fullData <- plyr::ddply(processedD, .variables = ("grouper"), .fun = completeQueries) %>%
rowwise %>%
## Each row gets a unique literal semantics from the getLiteralSemantics() helper
mutate(speaker.p = getLiteralSemantics(matrix, query = query, object = object)) %>%
arrange(expt, n)
####
#### SAVE - ORIGINAL DATA PROCESSING
############################################################################
# sims_prior %>%
# mutate(errorH = cih - p,
# errorL = p - cil) %>%
# ggplot(., aes(x = preds, y = p, col = expt, pch = object)) +
# geom_errorbarh(aes(xmin = preds - errorL, xmax = preds + errorH)) +
# geom_errorbar(aes(ymin = cil, ymax = cih)) +
# xlim(c(0,1)) + ylim(c(0,1)) +
# facet_wrap(~depth) +
# ylab("Proportion choosing target") +
# xlab("Model Predictions") +
# geom_smooth(method="lm", aes(group=1)) +
# geom_abline(slope = 1, intercept = 0, lty=2)