Prelim set-up
Data pre-processing
Simulations WITH priors (recursive depth 0-5)
Simulations WITHOUT priors (recursive depth 0-5)
Explicit comparisons with MF sims.Rmd
- Combine priors and unif priors
- Compare with MF data
Propagating errors into model
- Bootstrap se helpers
- Boostrapped runs
Analysis by experiment intent (study priors vs recursive depth)
- Data for prior vs depth expt types
  - What are effects of priors on prior oriented expts
  - Prior vs noPrior model preds
- How does recursion help holding alpha and priors constant?
Old analysis

Prelim set-up

rm(list = ls())

setwd('/Users/benpeloquin/Desktop/Projects/Pragmods/models/')
source("matrices.R")
source("agents.R")
library(rrrsa)
library(dplyr)
library(tidyr)
library(ggplot2)
library(parallel)
library(stats)

Helpers

Helper functions of data pre-processing and combining (from sims.R).

#####
##### normalizeBy()
##### -------------
##### normaze matrix by rows or cols, maintain naming
#####
normalizeBy <- function(m, rows = TRUE) {
  colNames <- colnames(m)
  rowNames <- rownames(m)
  normedM <- apply(m, MARGIN = ifelse(rows, 1, 2), rrrsa::rsa.normVec)
  colnames(normedM) <- colNames
  rownames(normedM) <- rowNames
  normedM
}

#####
##### inGlobalEnv()
##### -------------
##### boolean object presence in global env
#####
inGlobalEnv <- function(item) {
  item %in% ls(envir = .GlobalEnv)
}

#####
##### getLiteralSemantics()
##### ---------------------
##### literal semantics mapper between 
##### normalized matrices and dataframe
##### called in mutate() within dplyr chain
#####
getLiteralSemantics <- function(matrixName, object, query) {
  if (!(inGlobalEnv("simpleM") | inGlobalEnv("complexM") | inGlobalEnv("oddmanM") | inGlobalEnv("twinsM"))) {
    stop("Missing normalized matrices, please run data pre-processing block")
  }
  
  if (matrixName == "simple") {
    simpleM[as.character(object), as.character(query)]
  } else if (matrixName == "complex") {
    complexM[as.character(object), as.character(query)]
  } else if (matrixName == "oddman") {
    oddmanM[as.character(object), as.character(query)]
  } else if (matrixName == "twins") {
    twinsM[as.character(object), as.character(query)]
  } else {
    stop("Error in getLiteralSemantics()")
  }
}

#####
##### completeQueries()
##### ---------------------
##### add "query" words for each expt
##### this is necessary for running with rrrsa::runDf
#####
completeQueries <- function(d) {
  ## store data
  newD <- d
  currMatrix <- unique(as.character(d$matrix))
  
  ## meta-data
  allQueries <- c("glasses", "hat", "mustache")
  allObjects <- c("foil", "logical", "target")
  
  ## The current query
  currQuery <- unique(as.character(d$query))
  neededQueries <- setdiff(allQueries, currQuery)
  
  ## Add in new queries
  for (q in neededQueries) {
    df <- data.frame(cond = as.vector(d$cond),
                     expt = as.vector(d$expt),
                     matrix = as.vector(d$matrix),
                     prior = as.vector(d$prior),
                     query = rep(q, 3),
                     object = as.vector(d$object),
                     count = rep(NA, 3),
                     p = rep(NA, 3),
                     n = as.vector(d$n),
                     cil = rep(NA, 3), 
                     cih = rep(NA, 3),
                     priorType = as.vector(d$priorType),
                     priorValue = as.vector(d$priorValue),
                     grouper = as.vector(d$grouper))
    newD <- plyr::rbind.fill(newD, df)
    if (currMatrix == "simple") return(newD)
  }
  newD
}
# undebug(completeQueries)
# debug(completeQueries)
# completeQueries(baserate_63)

#####
##### convertListToDf()
##### -----------------
##### Convert a list where each element represents
##### data for a different grouping variable
##### useful with simulations run below
#####
convertListToDF <- function(li, rows = TRUE) {
  df <- data.frame()
  nItems <- seq(1, length(li))
  if (rows) {
    for (i in nItems) {
      df <- rbind(df, as.data.frame(li[[i]]))
    }
  } else {
    df <- data.frame(matrix(nrow = length(li[[1]]), ncol = 1))
    for (i in nItems) {
      df <- cbind(df, as.data.frame(li[[i]]))
    }
    return(df[, -1])
  }
  df
}

#####
##### rrrsa_ready_d
##### -------------
##### process data for use with rrrsa
##### NOTE :: this is not a generic fn
##### an makes certain assumptions about column naming
##### (eg. colnames -> object, expt, n, matrix)
#####
rrrsa_ready_df <- function(d, foil_name = "foil", logical_name = "logical", target_name = "target") {
  ## Initial pre-processing before adding in new cols and then adding literal semantics
  processedD <- d %>%
    ## Gather priors - we're only interested in prior for current object (need to use NSE to make
    ## compatible with boostrapped priors)
    gather_("priorType", "priorValue", c(foil_name, logical_name, target_name))
  processedD <- processedD %>%
    ## Filter out any mismatches between current object and priorType
    ## Do rowwise with grep so that we can use with boostrap naming conventions
    rowwise %>%
    filter(grepl(object, priorType)) %>%
    rowwise %>%
    ## Grouping variable concatenates experiment type and sample size (expt-n)
    mutate(grouper = paste0(as.character(expt), "-", as.character(n)))
  
  fullData <- plyr::ddply(processedD, .variables = ("grouper"), .fun = completeQueries) %>%
    rowwise %>%
    ## Each row gets a unique literal semantics from getLiteraSemantics() helper
    mutate(speaker.p = getLiteralSemantics(matrix, query = query, object = object)) %>%
    arrange(expt, n)
  
  fullData
}

Global data / settings

set.seed(1)
ALPHAS <- seq(0, 5, by = 0.1)
DEPTHS <- seq(0, 5)

Read in data

## This is `d` df from line 128 in sims.Rmd
d <- read.csv("data/models.csv")

Matrices from matrices.R

## Renamed, normalized matrices from matrices.R
simpleM <- normalizeBy(simple, rows = FALSE)
complexM <- normalizeBy(complex, rows = FALSE)
twinsM <- normalizeBy(twins, rows = FALSE)
oddmanM <- normalizeBy(oddman, rows = FALSE)

Data pre-processing

NOTE:: To work with rrrsa::runDf we need each matrix to include the literal semantics of all the queries (even if they weren’t actually queried) Look at group ‘baserate-63’: only ‘glasses’ is queried, but we need to add the literal semantics for ‘hat’ and ‘mustache’ as well.

fullData <- rrrsa_ready_df(d)

Simulations WITH priors (recursive depth 0-5)

Recursive depth sims

## Pre-sort by grouper
fullData <- fullData %>% arrange(grouper)
sims_prior_temp <- lapply(DEPTHS, function(depth) {
  plyr::ddply(fullData, .variables = c("grouper"), rsa.runDf,
            quantityVarName = "object", semanticsVarName = "speaker.p",
            itemVarName = "query", priorsVarName = "priorValue", depth = depth, usePriorEveryRecurse = FALSE) %>%
  rowwise %>%
  mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
         depth = depth) %>%
  filter(tuningData == "keep")
})
sims_prior <- convertListToDF(sims_prior_temp)

Simulation plots

Recursive depth 0-2

sims_prior %>%
  filter(depth < 3) %>%
  ggplot(., aes(x = preds, y = p, col = expt, pch = object)) +
    geom_pointrange(aes(ymin = cil, ymax = cih)) + 
    xlim(c(0,1)) + ylim(c(0,1)) + 
    facet_wrap(~depth) +
    ylab("Proportion choosing target") + 
    xlab("Model Predictions") + 
    geom_smooth(method="lm", aes(group=1)) + 
    geom_abline(slope = 1, intercept = 0, lty=2) +
    ggtitle("Priors model sims, alpha = 1")

Recursive depth 0-5

ggplot(sims_prior, aes(x = preds, y = p, col = expt, pch = object)) +
  geom_pointrange(aes(ymin = cil, ymax = cih)) + 
  xlim(c(0,1)) + ylim(c(0,1)) + 
  facet_wrap(~depth) +
  ylab("Proportion choosing target") + 
  xlab("Model Predictions") + 
  geom_smooth(method="lm", aes(group=1)) + 
  geom_abline(slope = 1, intercept = 0, lty=2) +
  ggtitle("Priors model sims, alpha = 1")

Correlations

sims_prior %>%
  group_by(depth) %>%
  summarise("cor" = cor(p, preds)) %>%
  mutate("alpha" = 1)

Fitting alpha

## Need to identify indices we want for performance comparisons
fullData <- fullData %>%
  rowwise %>%
  mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"))
keepIndices <- c("keep", "throwout")
compareIndices  <- which(fullData$tuningData == "keep")

## Parallelize fitting
priorFit <- mclapply(DEPTHS, function(i) { ## parallelize over depths
  rsa.tuneDepthAlpha(fullData, quantityVarName = "object",
                     semanticsVarName = "speaker.p", itemVarName = "query",
                     priorsVarName = "priorValue", groupName = "grouper",
                     compareDataName = "p", compareIndices = compareIndices,
                     depth = i, alphas = ALPHAS, usePriorEveryRecurse = FALSE)
  },
  mc.cores = detectCores())
aggregateTuneData <- convertListToDF(priorFit)

Model performance

## Get best model performance by alpha, depth
maxCors <- aggregateTuneData %>%
  group_by(depth) %>%
  summarize(maxCor = max(cor),
            alpha = alpha[which.max(cor)])

Plot performance tuning

aggregateTuneData_plot <- aggregateTuneData %>%
  mutate(Recursive_depth = as.factor(depth))

## Plot tuning (Plot adapted for personal_site)
ggplot(aggregateTuneData_plot, aes(x=alpha, y=cor, col=Recursive_depth)) +
  geom_point(alpha=0.6, size=4) +
  geom_vline(xintercept=maxCors$alpha, col="grey", lty="dotdash")+
  ylab("Correlation") +
  scale_x_continuous(breaks = round(seq(0, 6, by = 0.1)), "Rationality (alpha level)") +
  ggthemes::theme_solarized() +
  guides(fill=guide_legend(nrow=1,byrow=FALSE, position="bottom"))

Best priors model performance with fitted alphas

depths_param_data <- maxCors %>%
  arrange(depth)
best_prior_models_fitted_alpha <- convertListToDF(lapply(DEPTHS, function(depth) {
  plyr::ddply(fullData, .variables = c("grouper"), rsa.runDf,
              quantityVarName = "object", semanticsVarName = "speaker.p",
              priors = "priorValue", usePriorEveryRecurse = FALSE,
              itemVarName = "query", depth = depths_param_data$depth[depth + 1], alpha = depths_param_data$alpha[depth + 1]) %>%
    mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
           depth = depth) %>%
    filter(tuningData == "keep")
  }))
## Quick check
## cor(best_prior_models_fitted_alpha[best_prior_models_fitted_alpha$depth == 1, ]$p, best_prior_models_fitted_alpha[best_prior_models_fitted_alpha$depth == 1, ]$preds)

Plot best model performance with fitted alphas

print("Model performance for recursive depth 0-5 with fitted alphas")

## [1] "Model performance for recursive depth 0-5 with fitted alphas"

maxCors ## performance data

## Source: local data frame [6 x 3]
## 
##   depth    maxCor alpha
##   (dbl)     (dbl) (dbl)
## 1     0 0.7357369   0.0
## 2     1 0.9453030   3.1
## 3     2 0.9522229   1.4
## 4     3 0.9527394   1.1
## 5     4 0.9523711   1.0
## 6     5 0.9528916   1.0

## plots
ggplot(best_prior_models_fitted_alpha, aes(x = preds, y = p, col = expt, pch = object)) +
  geom_pointrange(aes(ymin = cil, ymax = cih)) + 
  xlim(c(0,1)) + ylim(c(0,1)) + 
  facet_wrap(~depth, labeller = label_both) +
  ylab("Proportion choosing target") + 
  xlab("Model Predictions") + 
  geom_smooth(method="lm", aes(group=1)) + 
  geom_abline(slope = 1, intercept = 0, lty=2) +
  ggtitle("Model performance for recursive depth 0-5 with fitted alphas")

Simulations WITHOUT priors (recursive depth 0-5)

Recursive depth sims

sims_NoPrior_temp <- lapply(DEPTHS, function(depth) {
  plyr::ddply(fullData, .variables = c("grouper"), rsa.runDf,
            quantityVarName = "object", semanticsVarName = "speaker.p",
            itemVarName = "query", depth = depth) %>%
  rowwise %>%
  mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
         depth = depth) %>%
  filter(tuningData == "keep")
})
sims_NoPrior <- convertListToDF(sims_NoPrior_temp)

Simulation plots

Recursive depth 0-2

sims_NoPrior %>%
  filter(depth < 3) %>%
  ggplot(., aes(x = preds, y = p, col = expt, pch = object)) +
    geom_pointrange(aes(ymin = cil, ymax = cih)) + 
    xlim(c(0,1)) + ylim(c(0,1)) + 
    facet_wrap(~depth) +
    ylab("Proportion choosing target") + 
    xlab("Model Predictions") + 
    geom_smooth(method="lm", aes(group=1)) + 
    geom_abline(slope = 1, intercept = 0, lty=2)

Recursive depth 0-5

ggplot(sims_NoPrior, aes(x = preds, y = p, col = expt, pch = object)) +
  geom_pointrange(aes(ymin = cil, ymax = cih)) + 
  xlim(c(0,1)) + ylim(c(0,1)) + 
  facet_wrap(~depth) +
  ylab("Proportion choosing target") + 
  xlab("Model Predictions") + 
  geom_smooth(method="lm", aes(group=1)) + 
  geom_abline(slope = 1, intercept = 0, lty=2)

Correlations

sims_NoPrior %>%
  group_by(depth) %>%
  summarise("cor" = cor(p, preds)) %>%
  mutate("alpha" = 1)

Fitting alpha

## Check that we have fullData and compareIndices for tuning
if (!(inGlobalEnv("fullData") & inGlobalEnv("compareIndices"))) {
  fullData <- fullData %>%
  rowwise %>%
  mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"))
  keepIndices <- c("keep", "throwout")
  compareIndices  <- which(fullData$tuningData == "keep")  
}

## Parallelize fitting
noPriorFit <- mclapply(DEPTHS, function(i) { ## parallelize over depths
  rsa.tuneDepthAlpha(fullData, quantityVarName = "object",
                     semanticsVarName = "speaker.p", itemVarName = "query",
                     compareIndices = compareIndices, compareDataName = "p",
                     groupName = "grouper",
                     depth = i, alphas = ALPHAS)
  },
  mc.cores = detectCores())
aggregateTuneData_noPriors <- convertListToDF(noPriorFit)

Model performance

## Get best model performance by alpha, depth
maxCors_noPriors <- aggregateTuneData_noPriors %>%
  group_by(depth) %>%
  summarize(maxCor = max(cor),
            alpha = alpha[which.max(cor)])

Plot performance tuning

## Plot tuning
ggplot(aggregateTuneData_noPriors, aes(x = alpha, y = cor, col = as.factor(depth))) +
  geom_point() +
  ylim(0.75, 1) +
  geom_vline(xintercept = maxCors$alpha, lty = 4, col = "grey50") +
  scale_x_continuous(breaks = round(seq(1, 6, by = 0.1)), "alpha") +
  ggtitle("Model performance - no priors")

## Warning: Removed 51 rows containing missing values (geom_point).

Best NO-priors model performance with fitted alphas

depths_param_data <- maxCors_noPriors %>%
  arrange(depth)
best_noPrior_models_fitted_alpha <- convertListToDF(lapply(DEPTHS, function(depth) { ## parallelize over depths
  plyr::ddply(fullData, .variables = c("grouper"), rsa.runDf,
              quantityVarName = "object", semanticsVarName = "speaker.p",
              usePriorEveryRecurse = FALSE, itemVarName = "query",
              depth = depths_param_data$depth[depth + 1], alpha = depths_param_data$alpha[depth + 1]) %>%
    mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
           depth = depth) %>%
    filter(tuningData == "keep")
  }))
## Quick check
## cor(best_noPrior_models_fitted_alpha[best_noPrior_models_fitted_alpha$depth == 1, ]$p, best_noPrior_models_fitted_alpha[best_noPrior_models_fitted_alpha$depth == 1, ]$preds)

Plot best No priors model performance with fitted alphas

print("Model performance for recursive depth 0-5 with fitted alphas no priors")

## [1] "Model performance for recursive depth 0-5 with fitted alphas no priors"

maxCors_noPriors ## performance data

## Source: local data frame [6 x 3]
## 
##   depth    maxCor alpha
##   (dbl)     (dbl) (dbl)
## 1     0 0.7357369   0.0
## 2     1 0.9457362   1.5
## 3     2 0.9513002   0.9
## 4     3 0.9520095   0.8
## 5     4 0.9520683   0.7
## 6     5 0.9521724   0.7

## plots
ggplot(best_noPrior_models_fitted_alpha, aes(x = preds, y = p, col = expt, pch = object)) +
  geom_pointrange(aes(ymin = cil, ymax = cih)) + 
  xlim(c(0,1)) + ylim(c(0,1)) + 
  facet_wrap(~depth, labeller = label_both) +
  ylab("Proportion choosing target") + 
  xlab("Model Predictions") + 
  geom_smooth(method="lm", aes(group=1)) + 
  geom_abline(slope = 1, intercept = 0, lty=2) +
  ggtitle("No-prior model performance for recursive depth 0-5 with fitted alphas")

Explicit comparisons with MF sims.Rmd

Combine priors and unif priors

## combine priors and uniform
sims_prior <- sims_prior %>%
  mutate(prior = "prior")
sims_NoPrior <- sims_NoPrior %>%
  mutate(prior = "uniform")
allData <- rbind(sims_prior, sims_NoPrior) %>%
  mutate(model = paste0("L", depth)) %>%
  arrange(prior, query, object, matrix, cond, expt, model)

Compare with MF data

## Compare with MFs data
mfD <- read.csv("data/modelRuns.csv") %>%
  arrange(prior, query, object, matrix, cond, expt, model)

## Add in MF preds check correlation
allData$mdPreds <- mfD$prediction
allData %>%
  ggplot(aes(x = preds, y = mdPreds, fill = matrix, size = depth)) +
    xlab("rrrsa predictions") +
    ylab("MF predictions") +
    geom_point(shape = 21, alpha = 0.30)

# ggsave("pragmods.png", path = "~/Desktop/", width=7, height=3)

Propagating errors into model

Bootstrap se helpers

Helpers - calculating bootstrapepd CIs

#####
##### se()
##### ----------------------
##### given a numeric vector calcuate std error :: sqrt(var(v))
#####
se <- function(v) {
  sqrt(var(v))
}

#####
##### ci()
##### ----------------------
##### given a numeric vector calculate confidence intervals of width, `width`
#####
ci <- function(mu, sd, n, width = 0.95) {
  interval <- (1 - width) / 2
  error <- (qnorm(1 - interval) * sd) / sqrt(n)
  return(c("bstrp_mu" = mu, "bstrp_err" = error))
}

#####
##### get_se()
##### ----------------------
##### given a df of bootstrapped estimates return std error
#####
get_se <- function(boot_ests) {
  apply(as.matrix(boot_ests), 1, se)  
}

#####
##### get_ci()
##### ----------------------
##### given a df of bootstrapped estimates return `width` confidence intervals
#####
get_ci <- function(boot_ests, width = 0.95) {
  apply(as.matrix(boot_ests), 1, FUN = function(ests) {
    mu <- mean(ests)
    sd <- sd(ests)
    n <- length(ests)
    return(ci(mu, sd, n, width))
  })
}

#####
##### boot_vec()
##### -------------
##### multinomial bootstrap
##### given vector of counts return n boostrap 
##### estimates of proportions
#####
boot_vec <- function(countsVec, n = 1) {
  t(normalizeBy(rmultinom(n, sum(countsVec), countsVec), rows = FALSE))
}
#####
##### pragmods_priors_boot()
##### ----------------------
##### custom pragmods wrapper to boot_priors()
##### used with mapply() to create new prior estimates
#####
pragmods_priors_boot <- function(c1, c2, c3, n = 1) {
  boot_vec(c("foil_cnts" = c1, "logical_cnts" = c2, "target_cnts" = c3), n)
}

#####
##### add_booted_priors()
##### ----------------------
##### given a df with prior counts for 'foil', 'logical' and 'target'
##### return new df with n = 1 boostrap estimates of priors
##### NOTE :: like pragmods_priors_boot this is not a generic function -
##### assumes naming convnetions established here ('foil_cnts', 'logical_cnts', 'target_cnts')
#####
add_booted_priors <- function(df) {
  res <- convertListToDF(with(df, mapply(pragmods_priors_boot, foil_cnts, logical_cnts, target_cnts, SIMPLIFY = FALSE)))
  names(res) <- c("foil_bt", "logical_bt", "target_bt")
  cbind(df, res)  
}

#####
##### run_boot_sims()
##### ---------------------
##### Run `n_sims` simulations with `alhpa`, `depth` 
##### Expects a df `d_prior_counts` that contains prior counts data
#####
run_boot_sims <- function(d_prior_counts, alpha = 1, depth = 1, n_sims = 100, verbose = TRUE) {
  ## Display run
  if (verbose) {
    cat(paste("Running ", n_sims, " simulations with alpha = ", alpha, ""))
  }
  
  ## Store n boostrapped simulations 
  bstrap_sims <- mclapply(seq(1, n_sims), FUN = function(i) {
    ## Convert to rrrsa df
    d_booted_priors <-
      rrrsa_ready_df(add_booted_priors(d_prior_counts),
                     foil_name = "foil_bt", logical_name = "logical_bt", target_name = "target_bt")
    ## Run rrrsa
    res <- plyr::ddply(d_booted_priors, .variables = c("grouper"), rsa.runDf,
              quantityVarName = "object", semanticsVarName = "speaker.p",
              itemVarName = "query", alpha = alpha, depth = depth,
              priorsVarName = "priorValue", usePriorEveryRecurse = FALSE) %>%
    rowwise %>%
    mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
           depth = depth) %>%
    filter(tuningData == "keep")
    ## Return predictions
    res$preds
  }, 
  mc.cores = detectCores())
  
  ## Convert simulations to df
  pred_sims <- convertListToDF(bstrap_sims, rows = FALSE)
  pred_sims
}

Boostrapped runs

Prior counts data

## Get prior counts data (from sims.R)
priorCounts <- read.csv("data/prior_counts.csv")
names(priorCounts)[1:3] <- c("foil_cnts", "logical_cnts", "target_cnts")
## Merge with our original data set
d_prior_counts <- merge(d, priorCounts, by = "prior", all = TRUE)

Boostrapped data at depth == 5

Let’s lookat at bootstrapped levels at a single level of recursion. These next two code blocks are useful for verifying that were measuring the correct quantitites.

depth <- 4
d5_bstrap_sims <- run_boot_sims(d_prior_counts, depth = depth, n_sims = 100, verbose = FALSE)

Plot d5 strap sims

d5_bstrap_sims$simNum <- seq(1, nrow(d5_bstrap_sims))
d5_bstrap_sims_long <- d5_bstrap_sims %>%
  gather(itNum, val, -simNum) %>%
  select(simNum, val)

## How do the sims look (apprx normally distr)?
d5_bstrap_sims_long %>%
  filter(val != 1 & val != 0 & val != 0.5) %>%
  ggplot(aes(val, fill = simNum)) +
    geom_density() +
    facet_wrap(~simNum)

Calc d5 bstrap cis

d5_prior_model_sims <- sims_prior[which(sims_prior$depth == depth), ]
errs <- data.frame()
for (sim_row in seq(1, nrow(d5_bstrap_sims))) {
    ## Get diff with our estimate
    diffs <- d5_prior_model_sims$preds[sim_row] - unlist(d5_bstrap_sims[sim_row, ])
    errs <- rbind(errs, diffs)
}
cis <- data.frame(t(apply(errs, MARGIN = 1, FUN = quantile, c(0.05, 0.95))))
names(cis) <- c('low', 'hi')

d5_bstrap_cis <- cbind(d5_prior_model_sims, cis)
## Need to add bounds to CIs
d5_bstrap_cis <- d5_bstrap_cis %>%
  mutate(ci_low = preds - hi,
         ci_hi = preds - low)

Plot d5 bootstrap CIs

ggplot(d5_bstrap_cis, aes(x = preds, y = p, col = expt, pch = object)) +
  geom_errorbarh(aes(xmin = ci_hi, xmax = ci_low, height = 0.05)) +
  geom_errorbar(aes(ymin = cil, ymax = cih, width = 0.05)) + 
  xlim(c(0,1)) + ylim(c(0,1)) + 
  ylab("Proportion choosing target") + 
  xlab("Model Predictions") + 
  facet_wrap(~depth) +
  geom_smooth(method="lm", aes(group=1)) + 
  geom_abline(slope = 1, intercept = 0, lty=2) +
  ggtitle(paste0("Priors model, depth = ", as.character(depth), ", bootatrapped 95% CIs"))

Boostrapped data - all DEPTHS and calculate CIs

## Run simulations for each depth
sims_prior_bstrap_errs <- read.csv("sims_prior_bstrap_errs.csv") ## From previous run with set.seed(1)

## Run below code if we don't have saved previous sims
## ---------------------------------------------------
if (!inGlobalEnv("sims_prior_bstrap_errs")) {
  depth_sims <- mclapply(DEPTHS, FUN = function(depth) {
      run_boot_sims(d_prior_counts, depth = depth, n_sims = 500, verbose = FALSE)
      }, mc.cores = detectCores()) 
   
  ## For each simulation level
  cis <- data.frame() ## df to hold CIs
  
  # fn <- function() {
  for (depth in seq(1, length(depth_sims))) {
    ## Current depth data
    curr_data <- data.frame(depth_sims[depth])
    errs <- data.frame() ## Hold ci diffs
    ## For each simulation (row) in the current data frame
    for (sim_row in seq(1, nrow(curr_data))) {
      ## Get diff with our estimate
      diffs <-  sims_prior[which(sims_prior$depth == depth - 1), ]$preds[sim_row] - unlist(curr_data[sim_row, ])
      errs <- rbind(errs, diffs)
    }
    cis <- rbind(cis, data.frame(t(apply(errs, MARGIN = 1, FUN = quantile, c(0.05, 0.95)))))
  }
  # }
  # debug(fn)
  # undebug(fn)
  # cis <- fn()

  names(cis) <- c('low', 'hi')
  sims_prior_bstrap_errs <- cbind(sims_prior, cis)
  ## Need to add bounds to CIs
  sims_prior_bstrap_errs <- sims_prior_bstrap_errs %>%
    mutate(ci_low = preds - hi,
           ci_hi = preds - low)
}
# write.csv(sims_prior_bstrap_errs, "sims_prior_bstrap_errs.csv")

Plot with boostrapped errs - all depths

ggplot(sims_prior_bstrap_errs, aes(x = preds, y = p, col = expt, pch = object)) +
  geom_errorbarh(aes(xmin = ci_hi, xmax = ci_low, height = 0.05)) +
  geom_errorbar(aes(ymin = cil, ymax = cih, width = 0.05)) + 
  xlim(c(0,1)) + ylim(c(0,1)) + 
  ylab("Proportion choosing target") + 
  xlab("Model Predictions") + 
  facet_wrap(~depth) +
  geom_smooth(method="lm", aes(group=1)) + 
  geom_abline(slope = 1, intercept = 0, lty=2) +
  ggtitle("Priors model, bootstrapped (500 sims) 95% CIs")

Why do we see more variance with higher recursive depth?

sims_prior_bstrap_errs %>%
  mutate(ci_width = hi - low,
         type = paste0(cond, "-", expt)) %>%
  ggplot(aes(x = depth, y = ci_width, col = type)) +
  geom_point()

Analysis by experiment intent (study priors vs recursive depth)

Data for prior vs depth expt types

depth_tests <- c("complex", "oddman", "simple", "twins")
priors_tests <- c("baserate", "color", "lang")
full_data_type <- fullData %>%
  mutate(type = ifelse(expt %in% depth_tests, "depth_test", "priors_test"))
full_data_priors_tests <- full_data_type %>% filter(type == "priors_test")
full_data_depth_tests <- full_data_type %>% filter(type == "depth_test")

What are effects of priors on prior oriented expts

full_data_priors_tests <- full_data_priors_tests %>% arrange(grouper)
full_data_priors_tests <- full_data_priors_tests %>%
  rowwise %>%
  mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"))
keepIndices <- c("keep", "throwout")
compareIndices  <- which(full_data_priors_tests$tuningData == "keep")

Prior vs noPrior model preds

prior_d <- plyr::ddply(full_data_priors_tests, .variables = c("grouper"), .fun = rsa.runDf,
            quantityVarName = "object", semanticsVarName = "speaker.p",
            itemVarName = "query", priorsVarName = "priorValue",
            usePriorEveryRecurse = FALSE) %>%
  filter(tuningData == "keep")

noPrior_d <- plyr::ddply(full_data_priors_tests, .variables = c("grouper"), .fun = rsa.runDf,
            quantityVarName = "object", semanticsVarName = "speaker.p",
            itemVarName = "query",
            usePriorEveryRecurse = FALSE) %>%
  filter(tuningData == "keep")


join_priors_noPriors <- cbind(prior_d, noPrior_d$preds)
names(join_priors_noPriors)[length(names(join_priors_noPriors))] <- 'noPrior_preds'
join_priors_noPriors <- join_priors_noPriors %>%
  gather(prior_type, value, c(preds, noPrior_preds))


ggplot(join_priors_noPriors, aes(x = value, y = p, col = prior_type)) +
  geom_pointrange(aes(ymin = cil, ymax = cih)) + 
  xlim(c(0,1)) + ylim(c(0,1)) + 
  facet_wrap(~prior) +
  ylab("Proportion choosing target") + 
  xlab("Model Predictions")

#   geom_smooth(method="lm", aes(group=1)) + 
#   geom_abline(slope = 1, intercept = 0, lty=2) +
#   ggtitle("Model performance for recursive depth 0-5 with fitted alphas")

Priors

## Parallelize fitting
priorFit <- mclapply(DEPTHS, function(i) { ## parallelize over depths
  rsa.tuneDepthAlpha(full_data_priors_tests, quantityVarName = "object",
                     semanticsVarName = "speaker.p", itemVarName = "query",
                     priorsVarName = "priorValue", groupName = "grouper",
                     compareDataName = "p", compareIndices = compareIndices,
                     depth = i, alphas = ALPHAS, usePriorEveryRecurse = FALSE)
  },
  mc.cores = detectCores())

aggregateTuneData_priors <- convertListToDF(priorFit)
aggregateTuneData_priors %>%
  group_by(depth) %>%
  summarize(maxCor = max(cor),
            alpha = alpha[which.max(cor)])

## Source: local data frame [6 x 3]
## 
##   depth    maxCor alpha
##   (dbl)     (dbl) (dbl)
## 1     0 0.6312373   0.0
## 2     1 0.9470066   3.8
## 3     2 0.9469632   1.6
## 4     3 0.9465052   1.3
## 5     4 0.9469024   1.1
## 6     5 0.9459132   1.1

No Priors

noPriorFit <- mclapply(DEPTHS, function(i) { ## parallelize over depths
  rsa.tuneDepthAlpha(full_data_priors_tests, quantityVarName = "object",
                     semanticsVarName = "speaker.p", itemVarName = "query",
                     groupName = "grouper",
                     compareDataName = "p", compareIndices = compareIndices,
                     depth = i, alphas = ALPHAS, usePriorEveryRecurse = FALSE)
  },
  mc.cores = detectCores())

aggregateTuneData_noPriors <- convertListToDF(noPriorFit)
aggregateTuneData_noPriors %>%
  group_by(depth) %>%
  summarize(maxCor = max(cor),
            alpha = alpha[which.max(cor)])

## Source: local data frame [6 x 3]
## 
##   depth    maxCor alpha
##   (dbl)     (dbl) (dbl)
## 1     0 0.6312373   0.0
## 2     1 0.9489123   1.9
## 3     2 0.9488210   1.0
## 4     3 0.9488951   0.8
## 5     4 0.9488517   0.8
## 6     5 0.9487172   0.8

How does alpha fitting respond to depth and priors

combined_tuning <- left_join(aggregateTuneData_priors, aggregateTuneData_noPriors, by = c("depth", "alpha"))
names(combined_tuning) <- c("priors", "depth", "alpha", "noPriors")
combined_tuning <- combined_tuning %>%
  gather(type, correlation, -c(depth, alpha))
ggplot(combined_tuning, aes(x = alpha, y = correlation, col = as.factor(type))) +
  geom_point() +
  facet_wrap(~depth) +
  ggtitle("Impact of priors on model preds with and without priors")

How does recursion help holding alpha and priors constant?

full_data_depth_tests <- full_data_depth_tests %>% arrange(grouper)
full_data_depth_tests <- full_data_depth_tests %>%
  rowwise %>%
  mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"))
keepIndices <- c("keep", "throwout")
compareIndices  <- which(full_data_depth_tests$tuningData == "keep")

sims_prior_temp <- lapply(seq(0, 2), function(depth) {
  plyr::ddply(full_data_depth_tests, .variables = c("grouper"), rsa.runDf,
            quantityVarName = "object", semanticsVarName = "speaker.p",
            itemVarName = "query", priorsVarName = "priorValue", depth = depth, usePriorEveryRecurse = FALSE) %>%
  rowwise %>%
  mutate(tuningData = ifelse(!is.na(count), "keep", "throwout"),
         depth = depth) %>%
  filter(tuningData == "keep")
})
sims_prior_temp <- convertListToDF(sims_prior_temp)
sims_prior_temp %>% filter(preds != 0, preds != 1) %>%
  ggplot(aes(x = preds, y = p, col = expt, pch = object)) +
  geom_pointrange(aes(ymin = cil, ymax = cih)) + 
  xlim(c(0,1)) + ylim(c(0,1)) + 
  facet_wrap(expt~depth, labeller = label_both, ncol = 3) +
  ylab("Proportion choosing target") + 
  xlab("Model Predictions") + 
  geom_smooth(method="lm", aes(group=1)) + 
  geom_abline(slope = 1, intercept = 0, lty=2) +
  ggtitle("Model performance for recursive depth 0-2 with fitted alphas")

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

sims_prior_temp %>%
  group_by(expt, depth) %>%
  summarise(correlation = cor(preds, p))

## Source: local data frame [12 x 3]
## Groups: expt [?]
## 
##       expt depth correlation
##     (fctr) (int)       (dbl)
## 1  complex     0   0.8385561
## 2  complex     1   0.9611770
## 3  complex     2   0.9763472
## 4   oddman     0   0.9497125
## 5   oddman     1   0.9763249
## 6   oddman     2   0.9763249
## 7   simple     0   0.9001967
## 8   simple     1   0.9517313
## 9   simple     2   0.9898410
## 10   twins     0   0.9761129
## 11   twins     1   0.9970806
## 12   twins     2   0.9875595

Old analysis

Plot preds by experiment

ggplot(allData, aes(x = preds, y = p, col = prior)) +
  geom_point(shape = 21, alpha = 0.85) +
  facet_grid(model ~ cond)

Plot prior’s wonkiness

allData %>%
  filter(grouper == 'baserate-72' | grouper == 'color-47') %>%
  ggplot(aes(x = preds, y = p, col = prior, shape = as.factor(object))) +
    geom_point(alpha = 0.5, size = 4) +
    facet_grid(model ~ grouper) +
    geom_abline(slope = 1, intercept = 0, lty=2)

Almost looks like target and logical could be swapped… But with more recursions, the priors-model begins to improve…

Wonky priors vs unif priors

temp_d <- sims_prior %>% filter(grouper == 'baserate-72' | grouper == 'color-47') %>%
  arrange(cond, expt, matrix, prior, object, count, p)

temp_d$unif_preds <- sims_NoPrior %>% filter(grouper == 'baserate-72' | grouper == 'color-47') %>%
  arrange(cond, expt, matrix, prior, object, count, p) %>%
  mutate(unif_preds = preds) %>%
  select(unif_preds) %>%
  unlist()

temp_d <- temp_d %>%
  mutate(delta = preds - unif_preds) %>%
  arrange(-abs(delta))

## uniform vs priors
ggplot(temp_d, aes(x = unif_preds, y = preds, col = factor(grouper), size = factor(depth), shape = as.factor(object))) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, lty=2)

## Take a look at the data
temp_d

Still looks like these might have been swapped…

Wonky priors data

allData %>%
  filter(grouper == 'baserate-72' | grouper == 'color-47') %>%
  mutate(delta = p - preds) %>%
  arrange(delta, model, grouper) %>%
  select(cond, delta, p, preds, grouper, query, object, model, priorType, priorValue)

Save this for future generations of rsa pirates

############################################################################
#### SAVE - ORIGINAL DATA PROCESSING
####
processedD <- d %>%
  ## Gather priors - we're only interested in prior for current object
  gather(priorType, priorValue, c(foil, logical, target)) %>%
  ## Filter out any mismatches between current object and priorType
  filter(object == priorType) %>%
  rowwise %>%
  ## Grouping variable concatenates experiment type and sample size (expt-n)
  mutate(grouper = paste0(as.character(expt), "-", as.character(n)))

fullData <- plyr::ddply(processedD, .variables = ("grouper"), .fun = completeQueries) %>%
  rowwise %>%
  ## Each row gets a unique literal semantics from getLiteraSemantics() helper
  mutate(speaker.p = getLiteralSemantics(matrix, query = query, object = object)) %>%
  arrange(expt, n)
####
#### SAVE - ORIGINAL DATA PROCESSING
############################################################################

# sims_prior %>%
#   mutate(errorH = cih - p,
#          errorL = p - cil) %>%
#   ggplot(., aes(x = preds, y = p, col = expt, pch = object)) +
#     geom_errorbarh(aes(xmin = preds - errorL, xmax = preds + errorH)) +
#     geom_errorbar(aes(ymin = cil, ymax = cih)) + 
#     xlim(c(0,1)) + ylim(c(0,1)) + 
#     facet_wrap(~depth) +
#     ylab("Proportion choosing target") + 
#     xlab("Model Predictions") + 
#     geom_smooth(method="lm", aes(group=1)) + 
#     geom_abline(slope = 1, intercept = 0, lty=2)

Model Simulations

Ben Peloquin

April 7, 2016

Prelim set-up

Helpers

Global data / settings

Read in data

Matrices from matrices.R

Data pre-processing

Simulations WITH priors (recursive depth 0-5)

Recursive depth sims

Simulation plots

Recursive depth 0-2

Recursive depth 0-5

Correlations

Fitting alpha

Model performance

Plot performance tuning

Best priors model performance with fitted alphas

Plot best model performance with fitted alphas

Simulations WITHOUT priors (recursive depth 0-5)

Recursive depth sims

Simulation plots

Recursive depth 0-2

Recursive depth 0-5

Correlations

Fitting alpha

Model performance

Plot performance tuning

Best NO-priors model performance with fitted alphas

Plot best No priors model performance with fitted alphas

Explicit comparisons with MF sims.Rmd

Combine priors and unif priors

Compare with MF data

Propagating errors into model

Bootstrap se helpers

Boostrapped runs

Prior counts data

Boostrapped data at depth == 5

Plot d5 strap sims

Calc d5 bstrap cis

Plot d5 bootstrap CIs

Boostrapped data - all DEPTHS and calculate CIs

Plot with boostrapped errs - all depths

Why do we see more variance with higher recursive depth?

Analysis by experiment intent (study priors vs recursive depth)

Data for prior vs depth expt types

What are effects of priors on prior oriented expts

Prior vs noPrior model preds

Priors

No Priors

How does alpha fitting respond to depth and priors

How does recursion help holding alpha and priors constant?

Old analysis

Plot preds by experiment

Plot prior’s wonkiness

Wonky priors vs unif priors

Wonky priors data