polgrice- speaker

# rm(list = ls())
library(jsonlite)
library(ggplot2)
library(tidyr)
library(dplyr)
library(binom)
library(bootstrap)
library(langcog)
source("/Users/ericang/Documents/Research/polgrice_GIT/experiment/data_analysis/helper/useful.R")

raw.data.path <- "/Users/ericang/Documents/Research/polgrice_GIT/experiment/exp_versions/22_S_production_dropdown_v2/production-results/"

## READ IN ALL PARTICIPANT FILES
# `pattern` is a regular expression, not a shell glob, so anchor on a literal
# ".json" suffix instead of relying on "*.json" happening to match.
files <- dir(raw.data.path, pattern = "\\.json$")

# Parse one result file into a data frame and tag every row with the
# participant id, taken from the first six characters of the file name.
read_participant <- function(file.name) {
  json_lines <- readLines(paste0(raw.data.path, file.name))
  json_str <- paste(json_lines, collapse = "")
  # Remove a comma that directly precedes a closing brace before parsing.
  json_str <- gsub(",}", "}", json_str, fixed = TRUE)
  jso <- jsonlite::fromJSON(json_str)
  # Drop the `people` field before flattening to a data frame.
  jso$answers$data$people <- NULL
  jso1 <- data.frame(jso)
  jso1$subid <- substring(file.name, 1, 6)
  jso1
}

# Read everything first and bind once, rather than growing a data frame
# with rbind() on every iteration.
all.data <- do.call(rbind, lapply(files, read_participant))

# With no input files do.call() yields NULL; keep the empty-data-frame result
# the script has always produced in that case.
if (is.null(all.data)) {
  all.data <- data.frame()
}

Filter out participants and clean up.

# Keep the participant id plus the per-trial response fields used below.
d0 <- all.data %>%
  select(
    subid,
    answers.data.cond,
    answers.data.order,
    answers.data.domain,
    answers.data.goal,
    answers.data.state,
    answers.data.posneg,
    answers.data.keyword
  )

# Subset `responses` to one condition and relabel its two response fields.
# factor(x, labels = ...) attaches the labels to the sorted unique raw values,
# so each condition passes its labels in the order matching that sort.
# NOTE(review): presumably the label order mirrors how the responses were
# coded in each condition -- verify against the experiment code.
label_condition <- function(responses, cond_id, posneg_labels, keyword_labels) {
  responses %>%
    filter(answers.data.cond == cond_id) %>%
    mutate(positivity = factor(answers.data.posneg, labels = posneg_labels),
           utterance = factor(answers.data.keyword, labels = keyword_labels))
}

d1 <- label_condition(
  d0, 1,
  posneg_labels = c("no_neg", "neg"),
  keyword_labels = c("terrible", "bad", "okay", "good", "amazing")
)

# Condition 2 is the only one where blank responses are removed before
# relabelling.
d2 <- label_condition(
  filter(d0, answers.data.posneg != "" & answers.data.keyword != ""), 2,
  posneg_labels = c("no_neg", "neg"),
  keyword_labels = c("amazing", "good", "okay", "bad", "terrible")
)

d3 <- label_condition(
  d0, 3,
  posneg_labels = c("neg", "no_neg"),
  keyword_labels = c("terrible", "bad", "okay", "good", "amazing")
)

d4 <- label_condition(
  d0, 4,
  posneg_labels = c("neg", "no_neg"),
  keyword_labels = c("amazing", "good", "okay", "bad", "terrible")
)

# Stack the four conditions, rename the raw fields to analysis names and fix
# the column types: goal gets an explicit level order and utterance becomes an
# ordered factor running from "terrible" to "amazing".
d <- rbind(d1, d2, d3, d4) %>%
  transmute(
    subid = as.factor(subid),
    trial = as.numeric(answers.data.order),
    goal = factor(answers.data.goal,
                  levels = c("informative", "social", "both")),
    true_state = as.factor(answers.data.state),
    positivity = as.factor(positivity),
    utterance = ordered(utterance,
                        levels = c("terrible", "bad", "okay", "good", "amazing"))
  )

# ms <- d %>%
#   select(subid, goal, true_state, positivity, utterance) %>%
#   mutate(positivity = factor(positivity, labels = c("not", "yes"))) %>%
#   mutate(utterance = paste(positivity, utterance, sep="_")) %>%
#   mutate(true_state = substr(true_state, 6, 6)) %>%
#   mutate(subid = factor(subid, labels = c(1:61))) %>%
#   select(subid, goal, true_state, utterance)

# Trials with both response components present; the two summaries below
# share this subset instead of repeating the filter.
# NOTE(review): the source of the NA responses is still unexplained.
d_complete <- d %>%
  filter(!is.na(positivity), !is.na(utterance))

# Denominator: number of complete responses per true_state x goal cell.
ms2 <- d_complete %>%
  group_by(true_state, goal) %>%
  summarise(n.total = n())

# Numerator: number of responses per cell for each positivity x utterance
# combination.
ms3 <- d_complete %>%
  group_by(true_state, goal, positivity, utterance) %>%
  summarize(n = n())

# Proportion of each response within its cell, with a Bayesian binomial
# interval from binom.bayes(). The join keys are spelled out so the join does
# not depend on whichever column names the two tables happen to share, and
# the result is ungrouped so no grouping leaks into later steps.
ms <- left_join(ms2, ms3, by = c("true_state", "goal")) %>%
  group_by(true_state, goal, positivity, utterance) %>%
  summarize(mean = n / n.total,
            ci_lower = binom.bayes(n, n.total)$lower,
            ci_upper = binom.bayes(n, n.total)$upper) %>%
  ungroup()

## Joining, by = c("true_state", "goal")

# Placeholder rows (all-NA statistics) for every combination of factor levels,
# so each facet carries the same set of positivity x utterance groups when the
# bars are dodged. ggplot drops the NA rows at draw time with a warning.
ms_fake <- cbind(
  expand.grid(true_state = levels(ms$true_state),
              goal = levels(ms$goal),
              positivity = levels(ms$positivity),
              utterance = levels(ms$utterance)),
  mean = NA,
  ci_lower = NA,
  ci_upper = NA
)

ms.all <- rbind(data.frame(ms), data.frame(ms_fake))

# Proportion of each utterance by polarity, one panel per goal x true state.
ggplot(data = ms.all, aes(x = positivity, y = mean, fill = utterance)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  facet_grid(goal ~ true_state) +
  xlab("no neg (it was ~) vs neg (it wasn't ~) ") +
  # `mean` is n / n.total, i.e. a proportion, not a raw count.
  ylab("proportion") +
  ggtitle("What would the speaker say given their goals?") +
  # Bars and error bars have different default widths, so the dodge width is
  # given explicitly (0.9, the default bar width) to centre each error bar on
  # its bar.
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = 0.9)) +
  # Dashed reference line at 0.1.
  # NOTE(review): presumably chance level for 2 polarities x 5 words -- confirm.
  geom_hline(yintercept = .1, lty = 2)

## Warning: Removed 150 rows containing missing values (geom_bar).

## Warning: Removed 150 rows containing missing values (geom_errorbar).

# Trials with both response components present; shared by the two summaries
# below instead of repeating the filter.
# NOTE(review): the source of the NA responses is still unexplained.
d_complete <- d %>%
  filter(!is.na(positivity), !is.na(utterance))

# Denominator: number of complete responses per true_state x goal cell.
ms2 <- d_complete %>%
  group_by(true_state, goal) %>%
  summarise(n.total = n())

# Numerator: number of responses per cell for each polarity, collapsing over
# the specific word chosen.
ms <- d_complete %>%
  group_by(true_state, goal, positivity) %>%
  summarise(n = n())

# Proportion of each polarity within its cell, with a Bayesian binomial
# interval from binom.bayes(). Join keys are explicit so the join does not
# depend on whichever column names the two tables happen to share, and the
# result is ungrouped so no grouping leaks into later steps.
ms.all <- left_join(ms2, ms, by = c("true_state", "goal")) %>%
  group_by(true_state, goal, positivity) %>%
  summarize(mean = n / n.total,
            ci_lower = binom.bayes(n, n.total)$lower,
            ci_upper = binom.bayes(n, n.total)$upper) %>%
  ungroup()

## Joining, by = c("true_state", "goal")

# Proportion of each polarity across true states, one panel per goal.
# Lines and error bars use the same explicit dodge width: position_dodge()
# without a width warns ("Width not defined") and would offset the lines
# differently from their error bars.
ggplot(data = ms.all,
       aes(x = true_state, y = mean, col = positivity, group = positivity)) +
  # geom_bar(stat="identity", position=position_dodge()) +
  geom_line(stat = "identity", position = position_dodge(width = .1)) +
  facet_grid(goal ~ .) +
  # xlab("neg (it wasn't ~) vs. no neg (it was ~)") +
  ylab("proportion ") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = .1))

## Warning: Width not defined. Set with `position_dodge(width = ?)`

# Recode for the mixed-effects logistic regression:
#   positivity -> 1 when the response is "neg", 0 when it is "no_neg"; named
#                 explicitly here instead of relying on factor level order
#                 (NA responses stay NA)
#   true_state -> the digit in position 6 of the state label
#                 NOTE(review): assumes every state label carries its numeric
#                 level as the 6th character -- verify against the raw data.
ms_glmer <- d %>%
  mutate(positivity = as.numeric(positivity == "neg")) %>%
  mutate(true_state = as.numeric(substr(true_state, 6, 6)))
  # mutate(goal = factor(goal, levels = c("both", "informative", "social")))
  # filter(true_state < 3)

# Probability of a negated response as a function of true state, goal and
# their interaction, with a random intercept per participant. The random
# effect is added with `+`, the standard way to attach it to the fixed-effect
# terms.
# NOTE(review): glmer() comes from lme4, which is not attached by the
# library() calls visible in this file -- presumably loaded by the sourced
# helper; confirm.
summary(glmer(positivity ~ true_state * goal + (1 | subid),
              data = ms_glmer, family = binomial))

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: positivity ~ true_state * goal * (1 | subid)
##    Data: ms_glmer
## 
##      AIC      BIC   logLik deviance df.resid 
##   2516.5   2558.5  -1251.2   2502.5     2978 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8051 -0.4654 -0.3011 -0.0962 11.5285 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  subid  (Intercept) 0.09787  0.3128  
## Number of obs: 2985, groups:  subid, 182
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.41112    0.18075  -2.275 0.022933 *  
## true_state            -0.39464    0.06243  -6.322 2.59e-10 ***
## goalsocial             0.96826    0.25603   3.782 0.000156 ***
## goalboth               2.88105    0.28428  10.134  < 2e-16 ***
## true_state:goalsocial -0.38391    0.09643  -3.981 6.86e-05 ***
## true_state:goalboth   -1.04098    0.11536  -9.023  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) tr_stt golscl golbth tr_stt:gls
## true_state  -0.870                                
## goalsocial  -0.694  0.614                         
## goalboth    -0.630  0.548  0.445                  
## tr_stt:glsc  0.568 -0.643 -0.880 -0.364           
## tr_stt:glbt  0.478 -0.534 -0.338 -0.894  0.353

# Combine polarity and word into one response label ("<positivity>_<word>")
# and convert the state label to the digit in position 6.
# Note that paste() turns missing values into the literal string "NA", so a
# trial with a missing response ends up with its own factor level instead of
# being dropped.
ms_glmer2 <- d %>%
  mutate(utterance = as.factor(paste(positivity, utterance, sep = "_")),
         true_state = as.numeric(substr(true_state, 6, 6)))

# NOTE(review): `utterance` is a factor with more than two levels. Under
# family = binomial a factor response is read as "first level = failure, any
# other level = success", so this model only contrasts the first level with
# all the others. Confirm that is the intended comparison, or switch to a
# model for a multi-category outcome.
# The random intercept is added with `+`, the standard way to attach it to
# the fixed-effect terms.
summary(glmer(utterance ~ true_state * goal + (1 | subid),
              data = ms_glmer2, family = binomial))

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: utterance ~ true_state * goal * (1 | subid)
##    Data: ms_glmer2
## 
##      AIC      BIC   logLik deviance df.resid 
##    438.5    480.5   -212.2    424.5     2978 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -34.846   0.046   0.063   0.098   0.534 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  subid  (Intercept) 2.098    1.448   
## Number of obs: 2985, groups:  subid, 182
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             4.6780     0.7246   6.456 1.08e-10 ***
## true_state              0.2210     0.2143   1.031 0.302459    
## goalsocial             -1.2629     0.8679  -1.455 0.145635    
## goalboth               -2.8548     0.7982  -3.577 0.000348 ***
## true_state:goalsocial   0.5870     0.3604   1.629 0.103410    
## true_state:goalboth     1.0960     0.3629   3.020 0.002528 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) tr_stt golscl golbth tr_stt:gls
## true_state  -0.761                                
## goalsocial  -0.644  0.641                         
## goalboth    -0.738  0.696  0.576                  
## tr_stt:glsc  0.468 -0.594 -0.860 -0.418           
## tr_stt:glbt  0.494 -0.589 -0.382 -0.852  0.354

polgrice- speaker

Erica Yoon

January 5 2017