library(tidyverse)
library(lubridate)
library(zoo)
library(here)
library(ggthemes)
library(knitr)
library(tidyboot)

theme_set(theme_classic(base_size = 16))

# Collect every transcript CSV under finished/, skipping paths that mention
# "progress" or "unfinished" and the three 22.01.T1 splits named in the last
# filter. `pattern` is a regular expression, not a glob, so a literal ".csv"
# suffix is matched explicitly.
files <- list.files(here("finished"), recursive = TRUE, full.names = TRUE,
                    pattern = "\\.csv$") %>%
  enframe(name = NULL, value = "file") %>%
  filter(!str_detect(file, "progress"), !str_detect(file, "unfinished")) %>%
  # fixed() keeps the dots in the ids literal instead of regex wildcards
  filter(!str_detect(file, fixed("22.01.T1_0")),
         !str_detect(file, fixed("22.01.T1_1")),
         !str_detect(file, fixed("22.01.T1_2")))


# Read one transcript CSV with every column as character and record the
# source file's base name (without the ".csv" extension) in `video`.
#
# file: path to a single transcript CSV.
# Returns a tibble whose columns are all character, plus `video`.
read_transcript <- function(file) {
  # Read the text as-is instead of guessing types and coercing back to
  # character, which can rewrite values (e.g. 100000 becomes "1e+05").
  transcript <- suppressMessages(
    read_csv(file, col_types = cols(.default = col_character()))
  )

  transcript %>%
    mutate(video = sub("\\.csv$", "", basename(file)))
}

# Stack all transcripts into one table, split the file-derived `video` id into
# subject / session / video / split_units, split `parent.trialnum` into line
# and person, and convert the timing columns.
data <- map_df(files$file, read_transcript) %>%
  # Presumably an unnamed sixth CSV column: readr 1.x calls it X6, readr >= 2.0
  # calls it ...6. Drop whichever is present so the script runs on both.
  select(-matches("^(X|\\.\\.\\.)6$", ignore.case = FALSE)) %>%
  separate(video, into = c("subject", "session", "video", "split_units")) %>%
  separate(parent.trialnum, into = c("line", "person"), sep = " ") %>%
  rename(ordinal = parent.ordinal,
         onset = parent.onset,
         offset = parent.offset) %>%
  mutate_at(c("split_units", "onset", "offset", "ordinal"), as.numeric) %>%
  # assumes onset/offset are recorded in milliseconds -- TODO confirm
  mutate_at(c("onset", "offset"), milliseconds)
 
# broken <- data %>%
#   filter(str_detect(parent.ordinal, "HEAD"))
# Timing table for the video splits. `vid` is broken into the same
# subject / session / video keys used by the transcripts, and the start and
# end times become lubridate periods via seconds().
splits <- here("split_sessions.csv") %>%
  read_csv() %>%
  separate(vid, into = c("subject", "session", "video")) %>%
  mutate(start_time = seconds(start_time),
         end_time = seconds(end_time))

# Attach each split's start time to its utterances, shift onset/offset by it,
# and derive the speaker label and the age label from the session number.
tidy_data <- data %>%
  left_join(splits, by = c("subject", "session", "video", "split_units")) %>%
  # add the two periods, then collapse to a plain number
  # (presumably seconds -- confirm against lubridate's as.numeric for Period)
  mutate(onset = as.numeric(onset + start_time),
         offset = as.numeric(offset + start_time)) %>%
  arrange(subject, session, video, split_units, ordinal) %>%
  filter(!is.na(session)) %>%
  mutate(
    # "p_chat" rows are the parent; every other non-missing value is the child
    person = if_else(person == "p_chat", "parent", "child"),
    session = as.numeric(session),
    # session 1 maps to 14 months, each later session adds 4 months
    age = paste0(14 + (session - 1) * 4, " mo.")
  )
# Number of distinct subjects with data at each age.
tidy_data %>%
  distinct(age, subject) %>%
  count(age) %>%
  kable()
age n
14 mo. 27
18 mo. 20
22 mo. 21
26 mo. 17
30 mo. 7
34 mo. 7
38 mo. 5
42 mo. 4
46 mo. 2
50 mo. 1
58 mo. 1
# Subject-by-age coverage grid: 1 where a subject has data at that age,
# 0 where it does not.
wide_table <- tidy_data %>%
  distinct(age, subject) %>%
  mutate(exists = 1) %>%
  spread(key = age, value = exists, fill = 0)

wide_table %>%
  kable()
subject 14 mo. 18 mo. 22 mo. 26 mo. 30 mo. 34 mo. 38 mo. 42 mo. 46 mo. 50 mo. 58 mo.
110 1 1 1 1 0 0 0 0 0 0 0
123 1 1 1 1 1 1 0 0 0 0 0
124 1 1 1 0 1 1 0 0 0 0 0
126 1 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 1 1 0 0 0
22 1 1 1 1 1 1 1 1 1 1 1
24 1 1 1 1 1 1 1 1 0 0 0
25 1 1 1 1 0 0 0 0 0 0 0
27 1 1 1 0 0 0 0 0 0 0 0
28 1 0 0 0 0 0 0 0 0 0 0
29 1 1 1 0 0 0 0 0 0 0 0
30 1 0 0 0 0 0 0 0 0 0 0
33 1 0 0 0 0 0 0 0 0 0 0
39 1 1 1 1 1 1 1 1 1 0 0
40 1 0 0 0 0 0 0 0 0 0 0
42 0 1 1 1 1 1 1 0 0 0 0
43 1 1 1 1 0 0 0 0 0 0 0
44 1 1 1 1 0 0 0 0 0 0 0
45 1 1 1 1 0 0 0 0 0 0 0
46 1 0 0 0 0 0 0 0 0 0 0
47 1 1 1 1 0 0 0 0 0 0 0
48 1 1 1 1 0 0 0 0 0 0 0
49 1 1 1 1 0 1 0 0 0 0 0
50 1 1 1 0 0 0 0 0 0 0 0
58 1 1 1 1 0 0 0 0 0 0 0
59 1 1 1 1 0 0 0 0 0 0 0
61 1 0 0 0 0 0 0 0 0 0 0
76 1 0 0 0 0 0 0 0 0 0 0
80 1 1 1 1 1 0 0 0 0 0 0
91 0 0 1 1 0 0 0 0 0 0 0
# Rank subjects by how many of the 14-30 mo. sessions have data and keep the
# 20 with the best coverage.
potential_kids <- wide_table %>%
  select(subject, `14 mo.`, `18 mo.`, `22 mo.`, `26 mo.`, `30 mo.`) %>%
  # `+` is already vectorised, so rowwise() is not needed. Leaving it out also
  # keeps slice() acting on the whole table: on a rowwise data frame
  # (dplyr >= 1.0) every row is its own group, so slice(1:20) would keep all
  # rows instead of the top 20.
  mutate(total = `14 mo.` + `18 mo.` + `22 mo.` + `26 mo.` + `30 mo.`) %>%
  arrange(desc(total)) %>%
  slice(1:20)

kable(potential_kids)

# Subject / session pairs among the selected subjects that have no data yet.
potential_kids %>%
  select(-total) %>%
  gather(session, coded, -subject) %>%
  filter(coded == 0) %>%
  select(-coded)

#write_csv
# Speaker transitions: within each subject x age, pair every utterance with
# the one before it, then keep rows where the speaker changed and the gap
# between the previous offset and this onset is at most 5 in absolute value.
# The result stays grouped by subject and age.
turns <- tidy_data %>%
  group_by(subject, age) %>%
  mutate(
    lag_person = lag(person),
    # This column shares its name with lag(); calls to lag() below still
    # resolve to the function.
    lag = onset - lag(offset),
    # TRUE when the previous utterance contains a question mark
    lag_question = str_detect(lag(parent.outcome), "\\?")
  ) %>%
  filter(
    !is.na(lag_person) & !is.na(person) & person != lag_person &
      abs(lag) <= 5
  )
# Median gap for each age x speaker, used for the solid vertical markers.
medians <- turns %>%
  group_by(age, person) %>%
  summarise(lag = median(lag))

# Density of turn gaps per speaker, one panel per age. Solid lines mark the
# medians, the dashed line marks a gap of zero, and the rug shows single turns.
ggplot(data = turns,
       mapping = aes(x = lag, color = person, label = person)) +
  facet_grid(age ~ .) +
  geom_vline(
    data = medians,
    mapping = aes(xintercept = lag, color = person),
    show.legend = FALSE
  ) +
  geom_vline(
    mapping = aes(xintercept = 0),
    linetype = "dashed",
    show.legend = FALSE
  ) +
  geom_density() +
  scale_color_ptol() +
  geom_rug(alpha = .3, show.legend = FALSE) +
  theme(
    legend.position = c(.2, .85),
    legend.title = element_blank()
  )

# Median gap and its approximate spread for each age x speaker x response
# type (whether the preceding utterance contained a question mark). Rows with
# age "58 mo." are excluded.
split_medians <- turns %>%
  filter(age != "58 mo.") %>%
  ungroup() %>%
  mutate(lag_question = factor(lag_question,
                               labels = c("other response",
                                          "question response"))) %>%
  group_by(age, person, lag_question) %>%
  summarise(median = median(lag),
            # Standard error of the mean: sd() already uses the n - 1
            # denominator, so divide by sqrt(n) rather than sqrt(n - 1).
            se = sd(lag, na.rm = TRUE) / sqrt(sum(!is.na(lag))))


# Median gap by age for each speaker, coloured by response type; the ranges
# span median +/- 1.96 * se.
split_medians %>%
  ungroup() %>%
  # the first two characters of a label such as "14 mo." are the age in months
  mutate(age = as.numeric(str_sub(as.character(age), 1, 2))) %>%
  ggplot(aes(x = age, y = median, color = lag_question)) +
  facet_wrap(~ person) +
  geom_pointrange(
    aes(ymin = median - 1.96 * se, ymax = median + 1.96 * se),
    position = position_dodge(.5)
  ) +
  scale_color_ptol()