# rm(list = ls())
library(jsonlite)
library(ggplot2)
library(tidyr)
library(dplyr)
library(binom)
library(bootstrap)
library(langcog)
source("/Users/ericang/Documents/Research/polgrice_GIT/experiment/data_analysis/helper/useful.R")

raw.data.path <- "/Users/ericang/Documents/Research/polgrice_GIT/experiment/exp_versions/20_S_production_dropdown/production-results/"

## READ IN FILES
## Every file in raw.data.path whose name ends in ".json" is parsed and
## flattened into a data frame; the per-file data frames are then stacked into
## all.data.
##
## `pattern` is a regular expression, not a shell glob, so the extension is
## matched explicitly with "\\.json$" (a glob-style "*.json" would also accept
## names that merely contain "json").
files <- dir(raw.data.path, pattern = "\\.json$")

if (length(files) == 0) {
  warning("No .json result files found in ", raw.data.path, call. = FALSE)
}

## Collect one data frame per file and bind once at the end, instead of
## growing all.data with rbind() on every iteration.
subject.data <- vector("list", length(files))

for (i in seq_along(files)) {
  file.name <- files[i]

  ## Read the whole file and collapse it into a single JSON string.
  json_file <- readLines(paste0(raw.data.path, file.name))
  json_file_str <- paste(json_file, collapse = "")

  ## Remove a comma that directly precedes a closing brace (",}") before
  ## parsing.
  json_file_str <- gsub(",}", "}", json_file_str, fixed = TRUE)
  jso <- jsonlite::fromJSON(json_file_str)

  ## Drop the `people` field before flattening.
  ## NOTE(review): presumably it does not fit the one-row-per-trial layout --
  ## confirm against the raw files.
  jso$answers$data$people <- NULL
  jso1 <- data.frame(jso)

  ## The subject id is the first six characters of the file name.
  jso1$subid <- substring(file.name, 1, 6)

  subject.data[[i]] <- jso1
}

## Stack all subjects; fall back to an empty data frame when nothing was read.
all.data <- if (length(subject.data) > 0) {
  do.call(rbind, subject.data)
} else {
  data.frame()
}

## Filter out participants and clean up.

## Keep the subject id and the per-trial response fields only.
d0 <- select(
  all.data,
  subid,
  answers.data.cond,
  answers.data.order,
  answers.data.domain,
  answers.data.goal,
  answers.data.state,
  answers.data.posneg,
  answers.data.keyword
)

## Utterance labels in the two orders used across conditions.
scale.ascending <- c("terrible", "bad", "okay", "good", "amazing")
scale.descending <- rev(scale.ascending)

## Add `positivity` and `utterance` factors to one condition's trials.
## factor(x, labels = ...) assigns the labels to the sorted unique raw values,
## so the order of the label vectors decides which raw value gets which label.
label.responses <- function(trials, posneg.labels, keyword.labels) {
  mutate(
    trials,
    positivity = factor(answers.data.posneg, labels = posneg.labels),
    utterance = factor(answers.data.keyword, labels = keyword.labels)
  )
}

## Conditions 1-4 differ only in the label order for positivity and utterance.
d1 <- label.responses(
  filter(d0, answers.data.cond == 1),
  c("no_neg", "neg"),
  scale.ascending
)

## Condition 2 additionally drops trials with a blank posneg or keyword value.
d2 <- label.responses(
  filter(d0,
         answers.data.cond == 2,
         answers.data.posneg != "",
         answers.data.keyword != ""),
  c("no_neg", "neg"),
  scale.descending
)

d3 <- label.responses(
  filter(d0, answers.data.cond == 3),
  c("neg", "no_neg"),
  scale.ascending
)

d4 <- label.responses(
  filter(d0, answers.data.cond == 4),
  c("neg", "no_neg"),
  scale.descending
)

## Stack the four conditions, give the columns their analysis names and types,
## and keep only the analysis columns (answers.data.domain is not carried
## forward). `utterance` ends up as an ordered factor from "terrible" to
## "amazing".
d <- rbind(d1, d2, d3, d4) %>%
  mutate(
    subid = as.factor(subid),
    trial = as.numeric(answers.data.order),
    goal = factor(answers.data.goal,
                  levels = c("informative", "social", "both", "no_goal")),
    true_state = as.factor(answers.data.state),
    positivity = as.factor(positivity),
    utterance = ordered(as.factor(utterance), levels = scale.ascending)
  ) %>%
  select(subid, trial, goal, true_state, positivity, utterance)

# ms <- d %>%
#   select(subid, goal, true_state, positivity, utterance) %>%
#   mutate(positivity = factor(positivity, labels = c("not", "yes"))) %>%
#   mutate(utterance = paste(positivity, utterance, sep="_")) %>%
#   mutate(true_state = substr(true_state, 6, 6)) %>%
#   mutate(subid = factor(subid, labels = c(1:61))) %>%
#   select(subid, goal, true_state, utterance)
## Proportion of each positivity x utterance response within every
## true_state x goal cell, with binom.bayes() interval bounds, plotted as
## dodged bars faceted by goal (rows) and true_state (columns).
##
## NOTE(review): rows with NA positivity or utterance are dropped below; where
## those NAs come from is still an open question.

## Denominator: number of usable responses per true_state x goal cell.
ms2 <- d %>%
  filter(!is.na(positivity), !is.na(utterance)) %>%
  group_by(true_state, goal) %>%
  summarise(n.total = n())

## Numerator: number of responses per positivity x utterance within each cell.
ms3 <- d %>%
  filter(!is.na(positivity), !is.na(utterance)) %>%
  group_by(true_state, goal, positivity, utterance) %>%
  summarise(n = n())

## Join keys are spelled out rather than left to the implicit natural join.
ms <- left_join(ms2, ms3, by = c("true_state", "goal")) %>%
  group_by(true_state, goal, positivity, utterance) %>%
  summarise(mean = n / n.total,
            ci_lower = binom.bayes(n, n.total)$lower,
            ci_upper = binom.bayes(n, n.total)$upper)

## Placeholder rows (all NA) for every factor-level combination, so that each
## facet contains every positivity x utterance pair and the dodged bars keep a
## consistent position and width.
ms_fake <- cbind(
  expand.grid(true_state = levels(ms$true_state),
              goal = levels(ms$goal),
              positivity = levels(ms$positivity),
              utterance = levels(ms$utterance)),
  mean = NA,
  ci_lower = NA,
  ci_upper = NA
)

ms.all <- rbind(data.frame(ms), data.frame(ms_fake))

## `mean` is n / n.total, so the y axis is a proportion, not a count.
## ggplot2 warns about removed rows with missing values for geom_bar and
## geom_errorbar; those are the NA placeholder rows from ms_fake.
ggplot(data = ms.all, aes(x = positivity, y = mean, fill = utterance)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  facet_grid(goal ~ true_state) +
  xlab("no neg (it was ~) vs neg (it wasn't ~) ") +
  ylab("proportion of responses") +
  ggtitle("What would the speaker say given their goals?") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), position = "dodge") +
  ## Dashed reference line at 0.1.
  ## NOTE(review): presumably chance level for 2 x 5 response options -- confirm.
  geom_hline(yintercept = .1, lty = 2)

## Proportion of each positivity response (collapsing over utterance) within
## every true_state x goal cell, with binom.bayes() interval bounds, plotted as
## bars faceted by goal (rows) and true_state (columns).
##
## NOTE(review): rows with NA positivity or utterance are dropped below; where
## those NAs come from is still an open question.

## Denominator: number of usable responses per true_state x goal cell.
ms2 <- d %>%
  filter(!is.na(positivity), !is.na(utterance)) %>%
  group_by(true_state, goal) %>%
  summarise(n.total = n())

## Numerator: number of responses per positivity value within each cell.
ms <- d %>%
  filter(!is.na(positivity), !is.na(utterance)) %>%
  group_by(true_state, goal, positivity) %>%
  summarise(n = n())

## Join keys are spelled out rather than left to the implicit natural join.
ms.all <- left_join(ms2, ms, by = c("true_state", "goal")) %>%
  group_by(true_state, goal, positivity) %>%
  summarise(mean = n / n.total,
            ci_lower = binom.bayes(n, n.total)$lower,
            ci_upper = binom.bayes(n, n.total)$upper)

## `mean` is n / n.total, so the y axis is a proportion, not a count.
ggplot(data = ms.all, aes(x = positivity, y = mean, fill = positivity)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  facet_grid(goal ~ true_state) +
  xlab("no neg (it was ~) vs neg (it wasn't ~) ") +
  ylab("proportion of responses") +
  ggtitle("What would the speaker say given their goals?") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), position = "dodge") +
  ## Dashed reference line at 0.5.
  ## NOTE(review): presumably chance level for the two positivity options --
  ## confirm.
  geom_hline(yintercept = .5, lty = 2)