# rm(list = ls())
library(jsonlite)
library(ggplot2)
library(tidyr)
library(dplyr)
library(binom)
library(bootstrap)
library(langcog)
# Shared helper functions for this analysis (absolute, machine-specific path).
source("/Users/ericang/Documents/Research/polgrice_GIT/experiment/data_analysis/helper/useful.R")
# Directory containing the raw JSON result files. The trailing slash matters:
# file names are appended to this string directly below.
raw.data.path <- "/Users/ericang/Documents/Research/polgrice_GIT/experiment/exp_versions/22_S_production_dropdown_v2/production-results/"
## READ IN FILES
# `pattern` is a regular expression, not a glob, so anchor on the extension;
# the previous "*.json" also matched names that merely contain "json".
files <- dir(raw.data.path, pattern = "\\.json$")
# Collect one data frame per file and bind them once at the end instead of
# growing `all.data` with rbind() on every iteration.
per.file <- vector("list", length(files))
for (i in seq_along(files)) {
  file.name <- files[[i]]
  json_file <- readLines(paste0(raw.data.path, file.name))
  json_file_str <- paste(json_file, collapse = "")
  # Drop a comma that sits directly before a closing brace (presumably a
  # trailing comma that would otherwise make the JSON invalid).
  json_file_str <- gsub(",}", "}", json_file_str, fixed = TRUE)
  jso <- jsonlite::fromJSON(json_file_str)
  # Remove the `people` entry before flattening the parsed object.
  jso$answers$data$people <- NULL
  jso1 <- data.frame(jso)
  # The first six characters of the file name are used as the participant id.
  jso1$subid <- substring(file.name, 1, 6)
  per.file[[i]] <- jso1
}
# With no input files, fall back to the empty data frame the loop started from.
all.data <- if (length(per.file) > 0) do.call(rbind, per.file) else data.frame()
## Filter out participants and clean up.
# Keep the participant id plus the trial-level answer fields used below.
d0 <- select(
  all.data,
  subid, answers.data.cond, answers.data.order, answers.data.domain,
  answers.data.goal, answers.data.state, answers.data.posneg,
  answers.data.keyword
)

# Label sets for recoding the raw responses. factor(x, labels = ...) assigns
# labels positionally to the sorted unique raw values, so each condition needs
# the label order that matches its own raw coding.
# NOTE(review): the raw codes are not visible here -- confirm each ordering
# against the experiment's dropdown layout for that condition.
posneg.plain.first <- c("no_neg", "neg")
posneg.neg.first <- c("neg", "no_neg")
keyword.low.first <- c("terrible", "bad", "okay", "good", "amazing")
keyword.high.first <- c("amazing", "good", "okay", "bad", "terrible")

# Condition 1
d1 <- d0 %>%
  filter(answers.data.cond == 1) %>%
  mutate(
    positivity = factor(answers.data.posneg, labels = posneg.plain.first),
    utterance = factor(answers.data.keyword, labels = keyword.low.first)
  )

# Condition 2: the only condition where blank responses are removed first.
# NOTE(review): confirm the other conditions cannot contain blank responses;
# an extra raw value would make the fixed-length `labels` in factor() fail.
d2 <- d0 %>%
  filter(
    answers.data.cond == 2,
    answers.data.posneg != "" & answers.data.keyword != ""
  ) %>%
  mutate(
    positivity = factor(answers.data.posneg, labels = posneg.plain.first),
    utterance = factor(answers.data.keyword, labels = keyword.high.first)
  )

# Condition 3
d3 <- d0 %>%
  filter(answers.data.cond == 3) %>%
  mutate(
    positivity = factor(answers.data.posneg, labels = posneg.neg.first),
    utterance = factor(answers.data.keyword, labels = keyword.low.first)
  )

# Condition 4
d4 <- d0 %>%
  filter(answers.data.cond == 4) %>%
  mutate(
    positivity = factor(answers.data.posneg, labels = posneg.neg.first),
    utterance = factor(answers.data.keyword, labels = keyword.high.first)
  )

# Stack the four conditions, give the kept columns short names, and fix types:
# `goal` gets an explicit level order and `utterance` becomes an ordered factor
# running from "terrible" to "amazing".
d <- rbind(d1, d2, d3, d4) %>%
  select(
    subid,
    trial = answers.data.order,
    goal = answers.data.goal,
    true_state = answers.data.state,
    positivity,
    utterance
  ) %>%
  mutate(
    subid = as.factor(subid),
    trial = as.numeric(trial),
    positivity = as.factor(positivity),
    true_state = as.factor(true_state),
    goal = factor(goal, levels = c("informative", "social", "both")),
    utterance = ordered(as.factor(utterance), levels = keyword.low.first)
  )
# ms <- d %>%
# select(subid, goal, true_state, positivity, utterance) %>%
# mutate(positivity = factor(positivity, labels = c("not", "yes"))) %>%
# mutate(utterance = paste(positivity, utterance, sep="_")) %>%
# mutate(true_state = substr(true_state, 6, 6)) %>%
# mutate(subid = factor(subid, labels = c(1:61))) %>%
# select(subid, goal, true_state, utterance)
# Denominator: number of complete responses in each true_state x goal cell.
ms2 <- d %>%
  filter(!is.na(positivity), !is.na(utterance)) %>% # why is there NA?
  group_by(true_state, goal) %>%
  summarise(n.total = n())
# Numerator: number of responses for each positivity x utterance option
# within each true_state x goal cell.
ms3 <- d %>%
  filter(!is.na(positivity), !is.na(utterance)) %>% # why is there NA?
  group_by(true_state, goal, positivity, utterance) %>%
  summarise(n = n())
# Proportion of each option per cell, with interval bounds from binom.bayes().
# Join keys are spelled out so the join does not depend on column-name
# guessing.
ms <- left_join(ms2, ms3, by = c("true_state", "goal")) %>%
  group_by(true_state, goal, positivity, utterance) %>%
  summarise(mean = n / n.total,
            ci_lower = binom.bayes(n, n.total)$lower,
            ci_upper = binom.bayes(n, n.total)$upper)
# One all-NA row for every factor combination, so options that were never
# chosen are still present in the plotted data (this is what triggers the
# "Removed ... rows" warnings recorded below).
ms_fake <- cbind(
  expand.grid(true_state = levels(ms$true_state),
              goal = levels(ms$goal),
              positivity = levels(ms$positivity),
              utterance = levels(ms$utterance)),
  mean = NA, ci_lower = NA, ci_upper = NA
)
ms.all <- rbind(data.frame(ms), data.frame(ms_fake))
ggplot(data = ms.all, aes(x = positivity, y = mean, fill = utterance)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  facet_grid(goal ~ true_state) +
  xlab("no neg (it was ~) vs neg (it wasn't ~) ") +
  # y is n / n.total, i.e. a proportion rather than a raw count.
  ylab("proportion") +
  ggtitle("What would the speaker say given their goals?") +
  # Error bars are narrower than bars, so the dodge width must be given
  # explicitly (matching the bars) for them to sit on top of their bars.
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = 0.9)) +
  geom_hline(yintercept = .1, lty = 2)
## Warning: Removed 150 rows containing missing values (geom_bar).
## Warning: Removed 150 rows containing missing values (geom_errorbar).
# Denominator: number of complete responses in each true_state x goal cell.
ms2 <- d %>%
  filter(!is.na(positivity), !is.na(utterance)) %>% # why is there NA?
  group_by(true_state, goal) %>%
  summarise(n.total = n())
# Numerator: responses per positivity value within each true_state x goal cell.
ms <- d %>%
  filter(!is.na(positivity), !is.na(utterance)) %>% # why is there NA?
  group_by(true_state, goal, positivity) %>%
  summarise(n = n())
# Proportion of each positivity value per cell, with interval bounds from
# binom.bayes(). Join keys are spelled out explicitly.
ms.all <- left_join(ms2, ms, by = c("true_state", "goal")) %>%
  group_by(true_state, goal, positivity) %>%
  summarise(mean = n / n.total,
            ci_lower = binom.bayes(n, n.total)$lower,
            ci_upper = binom.bayes(n, n.total)$upper)
ggplot(data = ms.all,
       aes(x = true_state, y = mean, col = positivity, group = positivity)) +
  # geom_bar(stat="identity", position=position_dodge()) +
  # Lines and error bars share one explicit dodge width so they are offset by
  # the same amount; a width-less position_dodge() on the line also raised a
  # "Width not defined" warning.
  geom_line(position = position_dodge(width = .1)) +
  facet_grid(goal ~ .) +
  # xlab("neg (it wasn't ~) vs. no neg (it was ~)") +
  ylab("proportion ") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = .1))
# Model data for the negation analysis.
# - positivity: 1 when the response is "neg", 0 otherwise. The coding is by
#   label rather than by factor level position, so it does not depend on the
#   order in which levels happened to be created.
# - true_state: the 6th character of the state label, converted to a number.
#   NOTE(review): presumably the numeric part of the state name -- confirm.
ms_glmer <- d %>%
  mutate(positivity = as.numeric(positivity == "neg")) %>%
  mutate(true_state = as.numeric(substr(true_state, 6, 6)))
# mutate(goal = factor(goal, levels = c("both", "informative", "social")))
# filter(true_state < 3)
# Logistic mixed model: true_state x goal fixed effects plus a by-participant
# random intercept. The random term is added with `+` (the conventional form);
# the recorded output below shows the same fixed-effect and random-effect
# structure as the earlier `* (1|subid)` spelling. glmer() is namespaced
# because lme4 is not loaded by any visible library() call.
summary(lme4::glmer(data = ms_glmer,
                    positivity ~ true_state * goal + (1 | subid),
                    family = binomial))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: positivity ~ true_state * goal + (1 | subid)
## Data: ms_glmer
##
## AIC BIC logLik deviance df.resid
## 2516.5 2558.5 -1251.2 2502.5 2978
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8051 -0.4654 -0.3011 -0.0962 11.5285
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid (Intercept) 0.09787 0.3128
## Number of obs: 2985, groups: subid, 182
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.41112 0.18075 -2.275 0.022933 *
## true_state -0.39464 0.06243 -6.322 2.59e-10 ***
## goalsocial 0.96826 0.25603 3.782 0.000156 ***
## goalboth 2.88105 0.28428 10.134 < 2e-16 ***
## true_state:goalsocial -0.38391 0.09643 -3.981 6.86e-05 ***
## true_state:goalboth -1.04098 0.11536 -9.023 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) tr_stt golscl golbth tr_stt:gls
## true_state -0.870
## goalsocial -0.694 0.614
## goalboth -0.630 0.548 0.445
## tr_stt:glsc 0.568 -0.643 -0.880 -0.364
## tr_stt:glbt 0.478 -0.534 -0.338 -0.894 0.353
# Model data with the full response as outcome: `utterance` becomes the
# combined "<positivity>_<utterance>" label, and true_state is the 6th
# character of the state label converted to a number.
# NOTE(review): paste() turns NA into the text "NA", so rows with a missing
# positivity or utterance would become their own "NA_NA" level instead of
# being dropped -- confirm no such rows reach this point.
ms_glmer2 <- d %>%
  mutate(utterance = as.factor(paste(positivity, utterance, sep = "_"))) %>%
  mutate(true_state = as.numeric(substr(true_state, 6, 6)))
# NOTE(review): `utterance` is a multi-level factor, but family = binomial
# treats a factor response as "first level = failure, any other level =
# success". As written, this model therefore compares the alphabetically first
# combined label against all other responses; it is not a multinomial model
# over the response options. Confirm this is the intended analysis.
# The random intercept is added with `+` (the conventional form), and glmer()
# is namespaced because lme4 is not loaded by any visible library() call.
summary(lme4::glmer(data = ms_glmer2,
                    utterance ~ true_state * goal + (1 | subid),
                    family = binomial))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: utterance ~ true_state * goal + (1 | subid)
## Data: ms_glmer2
##
## AIC BIC logLik deviance df.resid
## 438.5 480.5 -212.2 424.5 2978
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -34.846 0.046 0.063 0.098 0.534
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid (Intercept) 2.098 1.448
## Number of obs: 2985, groups: subid, 182
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.6780 0.7246 6.456 1.08e-10 ***
## true_state 0.2210 0.2143 1.031 0.302459
## goalsocial -1.2629 0.8679 -1.455 0.145635
## goalboth -2.8548 0.7982 -3.577 0.000348 ***
## true_state:goalsocial 0.5870 0.3604 1.629 0.103410
## true_state:goalboth 1.0960 0.3629 3.020 0.002528 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) tr_stt golscl golbth tr_stt:gls
## true_state -0.761
## goalsocial -0.644 0.641
## goalboth -0.738 0.696 0.576
## tr_stt:glsc 0.468 -0.594 -0.860 -0.418
## tr_stt:glbt 0.494 -0.589 -0.382 -0.852 0.354