# rm(list = ls())
library(jsonlite)
library(ggplot2)
library(tidyr)
library(dplyr)
library(binom)
library(bootstrap)
library(langcog)
source("/Users/ericang/Documents/Research/polgrice_GIT/experiment/data_analysis/helper/useful.R")
raw.data.path <- "/Users/ericang/Documents/Research/polgrice_GIT/experiment/exp_versions/17_S_production/production-results/"
## READ IN FILES
## Each JSON result file in raw.data.path is parsed into one data frame; the
## per-file frames are collected in a list and stacked into all.data at the end.
## `pattern` is a regular expression (not a glob), so anchor on the extension.
files <- dir(raw.data.path, pattern = "\\.json$")
per.file <- vector("list", length(files))
for (file.idx in seq_along(files)) {
  file.name <- files[file.idx]
  ## raw.data.path ends in "/", so plain concatenation gives the full path
  json_file <- readLines(paste0(raw.data.path, file.name))
  json_file_str <- paste(json_file, collapse = "")
  ## remove a comma that directly precedes a closing brace before parsing
  json_file_str <- gsub(",}", "}", json_file_str, fixed = TRUE)
  jso <- jsonlite::fromJSON(json_file_str)
  jso$answers$data$people <- NULL
  ## copy entries 1..10 of `utterance` into utterance1..utterance10, then drop
  ## the original field so data.frame() sees only the numbered fields
  for (utt.idx in 1:10) {
    jso$answers$data[[paste0("utterance", utt.idx)]] <-
      jso$answers$data$utterance[utt.idx]
  }
  jso$answers$data$utterance <- NULL
  jso1 <- data.frame(jso)
  ## subject id is the first six characters of the file name
  jso1$subid <- substring(file.name, 1, 6)
  per.file[[file.idx]] <- jso1
}
## Bind once instead of growing all.data on every iteration. The leading empty
## data frame keeps all.data a data.frame even when no files were found.
all.data <- do.call(rbind, c(list(data.frame()), per.file))
## Filter out participants and clean up.
## Utterance lookup table: keep the first row found for each subject, then
## stack the ten answers.data.utterance<N> columns into long format, with
## utterance_order holding the numeric suffix N.
rearr <- all.data %>%
  select(subid, num_range("answers.data.utterance", 1:10)) %>%
  distinct(subid, .keep_all = TRUE) %>%
  gather(
    key = utterance_order,
    value = utterance,
    num_range("answers.data.utterance", 1:10)
  ) %>%
  mutate(
    ## strip the fixed column-name prefix, leaving only the slot number
    utterance_order = as.numeric(
      sub("answers.data.utterance", "", utterance_order, fixed = TRUE)
    )
  )
## Warning: attributes are not identical across measure variables; they will
## be dropped
## Long-format probabilities: one row per trial row and goalProb column.
## The goalProb columns are numbered 0..9; utterance_order is shifted by one
## so that it runs 1..10.
d <- all.data %>%
  select(
    subid,
    answers.data.order,
    answers.data.knowledge,
    answers.data.domain,
    answers.data.goal,
    answers.data.state,
    num_range("answers.data.goalProb", 0:9)
  ) %>%
  gather(
    key = utterance_order,
    value = utterance_prob,
    num_range("answers.data.goalProb", 0:9)
  ) %>%
  mutate(
    utterance_order = as.numeric(
      sub("answers.data.goalProb", "", utterance_order, fixed = TRUE)
    ) + 1
  )
## Attach the utterance text to each probability row, give the raw
## answers.data.* columns short names, and split the raw utterance string
## into a polarity factor and the remaining word.
## The join keys are spelled out so the join cannot silently change if either
## table gains another shared column name.
d <- left_join(d, rearr, by = c("subid", "utterance_order")) %>%
  mutate(
    goal = answers.data.goal,
    trial = answers.data.order,
    true_state = answers.data.state,
    ## "positive" when the raw utterance string contains "yes", else "negative"
    positivity = factor(as.numeric(grepl("yes", utterance)),
                        levels = c(1, 0),
                        labels = c("positive", "negative")),
    ## keep the string from its 5th character on
    ## (presumably this removes a 4-character polarity prefix; confirm)
    utterance = substring(utterance, 5)
  ) %>%
  select(subid, trial, goal, true_state, positivity, utterance, utterance_prob)
## Fix column types: identifiers and conditions become factors, trial and
## probability become numeric, and utterance becomes an ordered factor
## running from "terrible" up to "amazing".
d <- d %>%
  mutate(
    subid = as.factor(subid),
    trial = as.numeric(trial),
    positivity = as.factor(positivity),
    true_state = as.factor(true_state),
    goal = as.factor(goal),
    utterance_prob = as.numeric(utterance_prob),
    utterance = ordered(
      utterance,
      levels = c("terrible", "bad", "okay", "good", "amazing")
    )
  )
## Per-row table with true_state reduced to its 6th character and subjects
## renumbered 1..k.
## NOTE(review): `ms` is reassigned by the very next statement in this script,
## so this result looks unused; confirm before relying on it.
ms <- d %>%
  select(subid, goal, true_state, utterance, utterance_prob) %>%
  mutate(true_state = substr(true_state, 6, 6)) %>%
  ## Renumber subjects through their factor codes. The previous
  ## `labels = c(1:30)` raised an error whenever the data did not contain
  ## exactly 30 distinct subjects; with 30 subjects the result is the same.
  mutate(subid = factor(as.integer(factor(subid))))
# goal_prob ~ true_state + utterance + goal
## Two-stage summary: first average utterance_prob within each subject for
## every true_state x goal x positivity x utterance cell, then pass the
## per-subject means to multi_boot_standard() per cell.
## (multi_boot_standard comes from langcog; the code below reads its `mean`,
## `ci_lower` and `ci_upper` columns.)
ms <- d %>%
  group_by(true_state, goal, positivity, utterance, subid) %>%
  summarise(utterance_prob = mean(utterance_prob, na.rm = TRUE)) %>%
  group_by(true_state, goal, positivity, utterance) %>%
  multi_boot_standard(column = "utterance_prob") %>%
  ## expose the bootstrapped mean under the original column name
  mutate(utterance_prob = mean)
## Joining, by = c("true_state", "goal", "positivity", "utterance")
## Bar chart of mean utterance probability by polarity, one bar per utterance
## word, faceted by goal (rows) and true state (columns).
## Bars and error bars share an explicit dodge width so each error bar is
## centred on its bar even when the two geoms have different default widths.
ggplot(data = ms, aes(x = positivity, y = utterance_prob, fill = utterance)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  facet_grid(goal ~ true_state) +
  xlab("neg (it wasn't ~) vs. no neg (it was ~)") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = 0.9))
## Same two-stage summary as for `ms`, but collapsed over utterance word:
## per-subject means for each true_state x goal x positivity cell, then
## multi_boot_standard() per cell.
ms2 <- d %>%
  group_by(true_state, goal, positivity, subid) %>%
  summarise(utterance_prob = mean(utterance_prob, na.rm = TRUE)) %>%
  group_by(true_state, goal, positivity) %>%
  multi_boot_standard(column = "utterance_prob") %>%
  ## expose the bootstrapped mean under the original column name
  mutate(utterance_prob = mean)
## Joining, by = c("true_state", "goal", "positivity")
## Line plot of mean utterance probability across true states, one line per
## polarity, faceted by goal.
## The lines and their error bars use the same explicit dodge width; with no
## width on the line layer ggplot2 warned "Width not defined" and the two
## layers were not offset consistently.
ggplot(data = ms2,
       aes(x = true_state, y = utterance_prob,
           col = positivity, group = positivity)) +
  # geom_bar(stat="identity", position=position_dodge()) +
  geom_line(stat = "identity", position = position_dodge(width = .1)) +
  facet_grid(goal ~ .) +
  ## x is true_state here; the old label described polarity (copied from the
  ## previous plot)
  xlab("true state") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = .1))
## Rank the polarity x word utterances inside every true_state x goal cell.
## utterance_whole joins the first three letters of positivity ("pos"/"neg")
## to the word; its levels run from "amazing" down to "terrible" within each
## polarity (the reverse word order is the alternative that was tried).
## Rows are sorted by descending probability, so rank 1 is the most probable
## utterance in its cell.
ms2 <- ms %>%
  mutate(
    utterance_whole = factor(
      paste(substr(positivity, 1, 3), utterance, sep = "_"),
      levels = c(
        "pos_amazing", "pos_good", "pos_okay", "pos_bad", "pos_terrible",
        "neg_amazing", "neg_good", "neg_okay", "neg_bad", "neg_terrible"
      ),
      ordered = TRUE
    )
  ) %>%
  arrange(true_state, goal, desc(utterance_prob)) %>%
  group_by(true_state, goal) %>%
  mutate(rank = row_number()) %>%
  ## the bootstrapped mean already lives in utterance_prob
  select(-mean)
## Bar chart of utterance probability against rank, coloured by the full
## utterance, faceted by goal (rows) and true state (columns).
ggplot(ms2, aes(x = rank, y = utterance_prob, fill = utterance_whole)) +
  scale_fill_brewer(palette = "RdYlGn") +
  geom_bar(stat = "identity", position = "dodge") +
  facet_grid(goal ~ true_state) +
  scale_x_continuous(breaks = 1:10) +
  labs(x = "rank within state x goal", y = "probability of saying") +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), position = "dodge")
## Dot plot of the ranked utterances: probability on x, rank on a reversed
## y axis, each point labelled with its utterance, plus horizontal error bars.
ggplot(data = ms2,
       aes(y = rank, x = utterance_prob,
           col = utterance_whole, label = utterance_whole)) +
  scale_color_brewer(palette = "RdYlGn") +
  geom_text(check_overlap = TRUE, size = 2.5, hjust = 0, nudge_x = 0.2) +
  geom_point(stat = "identity") +
  facet_grid(goal ~ true_state) +
  ## `limits` is spelled out in full; the abbreviated `lim` only worked
  ## through partial argument matching
  scale_y_reverse(limits = c(10, 0), breaks = 1:10) +
  ## the x range extends past 1, presumably to leave room for the nudged
  ## text labels; confirm
  scale_x_continuous(limits = c(0, 1.8), breaks = c(.25, .50, .75, 1)) +
  ylab("rank within state x goal") +
  xlab("probability of saying") +
  geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper))