# rm(list = ls())
library(jsonlite)
library(ggplot2)
library(tidyr)
library(dplyr)
library(binom)
library(bootstrap)
library(langcog)
source("/Users/ericang/Documents/Research/polgrice_GIT/experiment/data_analysis/helper/useful.R")

raw.data.path <- "/Users/ericang/Documents/Research/polgrice_GIT/experiment/exp_versions/17_S_production/production-results/"

## Read every JSON result file in `raw.data.path` and stack the per-file
## data frames into `all.data`, tagging each with a `subid` taken from the
## first six characters of the file name.
##
## `pattern` is a regular expression, not a shell glob, so match the
## extension explicitly.
files <- dir(raw.data.path, pattern = "\\.json$")

## Collect one data frame per file and bind once after the loop; calling
## rbind() inside the loop re-copies all accumulated rows on every iteration.
per.file <- vector("list", length(files))

for (file.idx in seq_along(files)) {
  file.name <- files[file.idx]

  ## Collapse the file to a single string and strip any ",}" sequence
  ## (a trailing comma before a closing brace is not valid JSON).
  json_file <- readLines(paste0(raw.data.path, file.name))
  json_file_str <- paste(json_file, collapse = "")
  json_file_str <- gsub(",}", "}", json_file_str, fixed = TRUE)
  jso <- jsonlite::fromJSON(json_file_str)

  jso$answers$data$people <- NULL

  ## Spread the first ten elements of `utterance` into the separate fields
  ## utterance1 .. utterance10, then drop the original field.
  utterances <- jso$answers$data$utterance
  for (i in 1:10) {
    jso$answers$data[[paste0("utterance", i)]] <- utterances[i]
  }
  jso$answers$data$utterance <- NULL

  jso1 <- data.frame(jso)
  jso1$subid <- substring(file.name, 1, 6)

  per.file[[file.idx]] <- jso1
}

## do.call(rbind, list()) would return NULL, so keep an empty data frame
## when no files were found.
all.data <- if (length(per.file) > 0) {
  do.call(rbind, per.file)
} else {
  data.frame()
}

## Filter out participants and clean up.

## Utterance labels in long format: one row per participant x utterance slot.
## Only the first row per participant is kept before reshaping.
## gather() may warn that attributes are not identical across the measure
## columns (e.g. factors with different levels); the attributes are dropped.
rearr <- all.data %>%
  select(subid, num_range("answers.data.utterance", 1:10)) %>%
  distinct(subid, .keep_all = TRUE) %>%
  gather(utterance_order, utterance,
         num_range("answers.data.utterance", 1:10)) %>%
  ## "answers.data.utterance" is 22 characters long, so the slot number
  ## (1-10) sits in characters 23-24 of the former column name.
  mutate(utterance_order = as.numeric(substr(utterance_order, 23, 24)))

## Ratings in long format: one row per original row x goalProb column.
## The goalProb columns are numbered 0-9; adding 1 lines them up with the
## utterance slots 1-10 in `rearr`.
d <- all.data %>%
  select(subid, answers.data.order, answers.data.knowledge,
         answers.data.domain, answers.data.goal, answers.data.state,
         num_range("answers.data.goalProb", 0:9)) %>%
  gather(utterance_order, utterance_prob,
         num_range("answers.data.goalProb", 0:9)) %>%
  mutate(utterance_order = as.numeric(substr(utterance_order, 22, 22)) + 1)

## Attach the utterance label to each rating. The join keys are spelled out
## so that a column added to either table later cannot silently change them.
d <- left_join(d, rearr, by = c("subid", "utterance_order")) %>%
  mutate(goal = answers.data.goal,
         trial = answers.data.order,
         true_state = answers.data.state,
         ## Utterances containing "yes" are coded as positive, all others
         ## as negative.
         positivity = factor(as.numeric(grepl("yes", utterance)),
                             levels = c(1, 0),
                             labels = c("positive", "negative")),
         ## Drop the first four characters, keeping the rest of the label.
         utterance = substring(utterance, 5)) %>%
  select(subid, trial, goal, true_state, positivity, utterance, utterance_prob)

d$subid <- as.factor(d$subid)
## as.numeric() on a factor returns the internal level codes rather than the
## values, so go through the level labels if `trial` arrives as a factor.
d$trial <- if (is.factor(d$trial)) {
  as.numeric(levels(d$trial))[d$trial]
} else {
  as.numeric(d$trial)
}
d$true_state <- as.factor(d$true_state)
d$goal <- as.factor(d$goal)
d$utterance_prob <- as.numeric(d$utterance_prob)
d$utterance <- ordered(d$utterance, levels = c("terrible", "bad", "okay", "good", "amazing"))

## Mean rating for each true_state x goal x positivity x utterance cell.
## Ratings are averaged within participant first; multi_boot_standard() then
## supplies the `mean`, `ci_lower` and `ci_upper` columns used below.
## (A previous assignment to `ms` was dropped: it was overwritten right away
## and its factor(subid, labels = 1:30) call fails unless there are exactly
## 30 participants.)
# goal_prob ~ true_state + utterance + goal
ms <- d %>%
  group_by(true_state, goal, positivity, utterance, subid) %>%
  summarize(utterance_prob = mean(utterance_prob, na.rm = TRUE)) %>%
  group_by(true_state, goal, positivity, utterance) %>%
  multi_boot_standard(column = "utterance_prob") %>%
  mutate(utterance_prob = mean)

## Bars and error bars need the same explicit dodge width to line up;
## 0.9 is the default bar width.
ggplot(data = ms, aes(x = positivity, y = utterance_prob, fill = utterance)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  facet_grid(goal ~ true_state) +
  xlab("neg (it wasn't ~) vs. no neg (it was ~)") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = 0.9))

## Mean rating for each true_state x goal x positivity cell, collapsing over
## utterances. Ratings are averaged within participant first;
## multi_boot_standard() then supplies `mean`, `ci_lower` and `ci_upper`.
## Note: `ms2` is reassigned further down with a different summary.
ms2 <- d %>%
  group_by(true_state, goal, positivity, subid) %>%
  summarize(utterance_prob = mean(utterance_prob, na.rm = TRUE)) %>%
  group_by(true_state, goal, positivity) %>%
  multi_boot_standard(column = "utterance_prob") %>%
  mutate(utterance_prob = mean)

## Lines and error bars share one explicit dodge width so they stay aligned
## (an unspecified width triggers "Width not defined" and misplaces the
## lines). The x axis shows the true state; polarity is the colour.
ggplot(data = ms2, aes(x = true_state, y = utterance_prob,
                       col = positivity, group = positivity)) +
  geom_line(position = position_dodge(width = .1)) +
  facet_grid(goal ~ .) +
  xlab("true state") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = .1))

## Rank the polarity x adjective utterances within each true_state x goal
## cell, from the highest to the lowest `utterance_prob`.
## `utterance_whole` is an ordered label such as "pos_good" / "neg_bad",
## with all "pos_" labels before all "neg_" labels.
adjectives.desc <- c("amazing", "good", "okay", "bad", "terrible")

ms2 <- ms %>%
  mutate(
    utterance_whole = ordered(
      paste0(substr(positivity, 1, 3), "_", utterance),
      levels = c(paste0("pos_", adjectives.desc),
                 paste0("neg_", adjectives.desc))
    )
  ) %>%
  arrange(true_state, goal, desc(utterance_prob)) %>%
  group_by(true_state, goal) %>%
  mutate(rank = row_number()) %>%
  select(-mean)

## Bar chart of `utterance_prob` against within-cell rank, one panel per
## goal x true_state combination, coloured by the full utterance label.
ggplot(ms2, aes(rank, utterance_prob, fill = utterance_whole)) +
  geom_col(position = position_dodge()) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge()) +
  facet_grid(goal ~ true_state) +
  scale_fill_brewer(palette = "RdYlGn") +
  scale_x_continuous(breaks = 1:10) +
  labs(x = "rank within state x goal",
       y = "probability of saying")

## Dot plot of `utterance_prob` (x) against within-cell rank (y, reversed so
## rank 1 is at the top), one panel per goal x true_state combination.
## Each point is labelled with its utterance; the x range extends past 1 to
## leave room for the nudged text labels.
ggplot(data = ms2, aes(y = rank, x = utterance_prob,
                       col = utterance_whole, label = utterance_whole)) +
  scale_color_brewer(palette = "RdYlGn") +
  geom_text(check_overlap = TRUE, size = 2.5, hjust = 0, nudge_x = 0.2) +
  geom_point() +
  facet_grid(goal ~ true_state) +
  ## `limits` is spelled out in full rather than relying on partial
  ## argument matching of `lim`.
  scale_y_reverse(limits = c(10, 0), breaks = 1:10) +
  scale_x_continuous(limits = c(0, 1.8), breaks = c(.25, .50, .75, 1)) +
  ylab("rank within state x goal") +
  xlab("probability of saying") +
  geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper))