loading and cleaning data

# preprocess
# rm(list=ls())
library(ggplot2)
library(tidyverse)
## ── Attaching packages ──────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  1.4.2     ✔ purrr   0.2.4
## ✔ tidyr   0.7.2     ✔ dplyr   0.7.4
## ✔ readr   1.1.1     ✔ stringr 1.3.1
## ✔ tibble  1.4.2     ✔ forcats 0.2.0
## Warning: package 'stringr' was built under R version 3.4.4
## ── Conflicts ─────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(langcog)
## 
## Attaching package: 'langcog'
## The following object is masked from 'package:base':
## 
##     scale
library(ggthemes)
## 
## Attaching package: 'ggthemes'
## The following objects are masked from 'package:langcog':
## 
##     scale_color_solarized, scale_colour_solarized,
##     scale_fill_solarized
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
library(here)
## here() starts at /Users/ejyoon/Documents/Documents/Research/polcon_local/03_data_analysis/scripts
library(brms)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.4.4
## Loading 'brms' package (version 2.0.1). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## Run theme_set(theme_default()) to use the default bayesplot theme.
## 
## Attaching package: 'brms'
## The following object is masked from 'package:lme4':
## 
##     ngrps
d1 <- read.csv(here("../data/processed_data/polcon_v3.csv")) %>%
  gather(var, question_type, qtype1:qtype2) %>%
  mutate(trial = trial_num) %>%
  mutate(var = substr(var, 6, 6)) %>%
  select(subid, order, var, trial, question_type)
## Warning: attributes are not identical across measure variables;
## they will be dropped
d2 <- read.csv(here("../data/processed_data/polcon_v3.csv")) %>%
  gather(var, correct, correct1:correct2) %>%
  mutate(trial = trial_num) %>%
  mutate(var = substr(var, 8, 8)) %>%
  select(subid, order, trial, var, correct)

d1 <- left_join(d1, d2) %>%
  mutate(trial = as.factor(as.character(trial))) %>%
  mutate(question_type = as.factor(ifelse(order == "2" & var == "2" & trial == "5", "comp", question_type)))
## Joining, by = c("subid", "order", "var", "trial")
# order sheet with trial info
order <- read.csv(here("../data/info/v1/v1_order.csv")) %>%
  mutate(trial = substr(trial, 6,9)) %>%
  separate(trial, c("trial", "var"))

d1_1 <- left_join(filter(d1, order == "1"), 
                  select(order, trial, var, request_type_ord1)) %>%
  mutate(request_type = request_type_ord1) %>%
  select(subid, order, trial, request_type,question_type, correct)
## Joining, by = c("var", "trial")
## Warning: Column `trial` joining factor and character vector, coercing into
## character vector
d1_2 <- left_join(filter(d1, order == "2"), 
                  select(order, trial, var, request_type_ord2)) %>%
  mutate(request_type = request_type_ord2) %>%
  select(subid, order, trial, request_type,question_type, correct)
## Joining, by = c("var", "trial")
## Warning: Column `trial` joining factor and character vector, coercing into
## character vector
d1 <- rbind(d1_1, d1_2)
#     mutate(valence = ifelse(question_type %in% c("polite","nice"), 
#                           "positive", "negative"), 
#          adjective = ifelse(question_type %in% c("nice","mean"), "easy","hard")) 

# log sheet with subj info
log <- read.csv(here("../data/info/v3/subject_log_v3.csv"))

d1 <- left_join(d1, log)
## Joining, by = "subid"
## Warning: Column `subid` joining factors with different levels, coercing to
## character vector
# filter by exclusion criteria
d1 <- d1 %>% 
  filter(keepdrop != "drop", age >= 3 | age <= 4)
  
# make factors
d1 <- d1 %>%
  mutate(subid = as.factor(subid),
         order = as.factor(order),
         question_type = as.factor(question_type),
         gender = as.factor(gender))

# how many before filtering by practice?
subj <- d1 %>%
    distinct(subid, .keep_all=TRUE) %>%
    mutate(age_fct = floor(age)) %>%
    group_by(age_fct) %>%
    summarise(count = n())
subj
## # A tibble: 2 x 2
##   age_fct count
##     <dbl> <int>
## 1       3    22
## 2       4    27
# drop those who failed practice trials
subj_failed_practice <- filter(d1, (trial == "0" & correct == "0"))
d1 <- d1 %>%
  filter(!subid %in% subj_failed_practice$subid) %>%
  filter(trial != "0")

# change question_type
# v3 had just polite vs. rude, but this was not reflected in the logging system 

d1 <- d1 %>%
  mutate(question_type = fct_recode(question_type,
                                    "polite" = "nice",
                                    "rude" = "mean")) %>%
  # make age grouping
  mutate(age_fct = floor(age)) %>%
  filter(question_type != "comp") %>%
  select(-keepdrop, -initials, -dot, -dob)

write.csv(d1,paste("../data/processed_data/polcon_v3_ready_for_ana.csv",sep=""),
           row.names=FALSE) 

subj <- d1 %>%
    distinct(subid, .keep_all=TRUE) %>%
    group_by(age_fct) %>%
    summarise(count = n())
subj
## # A tibble: 2 x 2
##   age_fct count
##     <dbl> <int>
## 1       3    21
## 2       4    25
d1 %>%
    distinct(subid, .keep_all=TRUE) %>%
    group_by(age) %>%
    ggplot(., aes(x=age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d1 <- d1 %>%
  mutate(question_type = fct_relevel(question_type, "polite"))

plots

correct ~ question type x age

ms <- d1 %>%
  mutate(correct = as.numeric(as.character(correct))) %>%
  group_by(age_fct, question_type, subid) %>%
  summarize(
    correct = mean(correct, na.rm=TRUE)
  ) %>%
  group_by(age_fct, question_type) %>%
  multi_boot_standard(col = "correct", na.rm=TRUE) %>%
  ungroup() %>%
  mutate(correct = mean,
         age_fct = factor(age_fct, labels=c("3-yr-olds", "4-yr-olds")))
p <- ggplot(subset(ms, correct!="NA" & (question_type != "play with" & question_type != "compliance")), 
            aes(x=age_fct, y=correct, fill=age_fct))
p + 
  geom_bar(position=position_dodge(), stat = "identity") +
#   geom_errorbar(position=position_dodge(.9), aes(ymin=ci_lower,ymax=ci_upper,width=.1)) +
  facet_grid(.~question_type) +
  xlab("Age") +
  ylab("Proportion correct") + 
  ggtitle("Correct answers on questions \"Who was being more _____ ?\"") +
  scale_fill_solarized(guide = "none") +
  geom_errorbar(position=position_dodge(.9), aes(ymin=ci_lower,ymax=ci_upper,width=.1)) +
    geom_hline(yintercept=.50,lty=4) + 
  theme_few()

correct ~ marker type x age

ms <- d1 %>%
  filter(question_type != "comp"& question_type != "play") %>%
  mutate(correct = as.numeric(as.character(correct))) %>%
  group_by(age_fct, request_type, subid) %>%
  summarize(
    correct = mean(correct, na.rm=TRUE)
  ) %>%
  group_by(age_fct, request_type) %>%
  multi_boot_standard(col = "correct", na.rm=TRUE) %>%
  ungroup() %>%
  mutate(correct = mean,
         age_fct = factor(age_fct, labels=c("3-yr-olds", "4-yr-olds")),
         request_type = recode(request_type, 
                               idq = "can you~",
                               please_idq = "can you please~"))
p <- ggplot(subset(ms, correct!="NA"), 
            aes(x=age_fct, y=correct, fill=age_fct))
p + 
  geom_bar(position=position_dodge(), stat = "identity") +
#   geom_errorbar(position=position_dodge(.9), aes(ymin=ci_lower,ymax=ci_upper,width=.1)) +
  facet_grid(.~request_type) +
  xlab("Age") +
  ylab("Proportion correct") + 
  ggtitle("Correct answers on questions \"Who was being more _____ ?\"") +
  scale_fill_solarized(guide = "none") +
  geom_errorbar(position=position_dodge(.9), aes(ymin=ci_lower,ymax=ci_upper,width=.1)) +
    geom_hline(yintercept=.50,lty=4) + 
  theme_few()

key plot: correct ~ question type x marker type x age

ms <- d1 %>%
  filter(question_type != "play" & question_type != "comp") %>%
  mutate(correct = as.numeric(as.character(correct))) %>%
  group_by(age_fct, question_type, request_type, subid) %>%
  summarize(
    correct = mean(correct, na.rm=TRUE)
  ) %>%
  group_by(age_fct, question_type, request_type) %>%
  multi_boot_standard(col = "correct", na.rm=TRUE) %>%
  ungroup() %>%
  mutate(correct = mean,
         # question_type = factor(question_type, levels=c("polite", "nice", "rude", "mean", "play", "comp"))) %>%
  # mutate(question_type = factor(question_type, labels=c("polite", "nice", "rude", "mean", "play with", "compliance")),
         age_fct = factor(age_fct, labels=c("3-yr-olds", "4-yr-olds")),
         request_type = recode(request_type, 
                               idq = "can you~",
                               please_idq = "can you please~"))
p <- ggplot(filter(ms, !is.na(correct)), 
            aes(x=age_fct, y=correct, fill=age_fct))
p + 
  geom_bar(position=position_dodge(), stat = "identity") +
  # geom_jitter(data=d1, aes(x=request_type, y=correct, fill=request_type)) +
  facet_grid(question_type~request_type) +
  xlab("Age") +
  ylab("Proportion correct") + 
  ggtitle("Correct answers on questions \"Who was being more _____ ?\"") +
  scale_fill_solarized(guide = "none") +
  geom_errorbar(position=position_dodge(.9), aes(ymin=ci_lower,ymax=ci_upper,width=.1)) +
  geom_hline(yintercept=.50,lty=4) +
  theme_few()

correct ~ question type x marker type x age, jitter

each point represents a participant.

p <- ggplot(d1 %>% filter(correct!="NA") %>%
              mutate(request_type = recode(request_type, 
                               idq = "can you~",
                               please_idq = "can you please~")), 
            aes(x=age, y=correct, col=age))
p + 
  # geom_violin(position = position_dodge(width = 1), scale = "width", size = 1,
              # draw_quantiles = c(0.5), alpha = 0.3) +
  geom_jitter() +
  geom_smooth() +
#   geom_errorbar(position=position_dodge(.9), aes(ymin=ci_lower,ymax=ci_upper,width=.1)) +
  facet_grid(question_type~request_type) +
  xlab("Age") +
  ylab("Proportion correct") + 
  ggtitle("Correct answers on questions \"Who was being more _____ ?\"") +
  scale_fill_solarized(guide = "none") +
  # geom_errorbar(position=position_dodge(.9), aes(ymin=ci_lower,ymax=ci_upper,width=.1)) +
    geom_hline(yintercept=.50,lty=4) + 
  theme_few()
## `geom_smooth()` using method = 'loess'

# stats

# brm_p <- brm(data=d1 %>% mutate(question_type = fct_relevel(question_type, "polite")) %>%               
#                mutate(age = scale(age)), family = "bernoulli", correct ~ age * request_type * question_type + (request_type + question_type | subid))
# save(brm_p, file=here("../scripts/polcon_brm_agescale.Rds"))
load(here("../scripts/polcon_brm_agescale.Rds"))
summary(brm_p)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: correct ~ age * request_type * question_type + (request_type + question_type | subid) 
##    Data: d1 %>% mutate(question_type = fct_relevel(question (Number of observations: 548) 
## Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1; 
##          total post-warmup samples = 4000
##     ICs: LOO = NA; WAIC = NA; R2 = NA
##  
## Group-Level Effects: 
## ~subid (Number of levels: 46) 
##                                                Estimate Est.Error l-95% CI
## sd(Intercept)                                      0.68      0.32     0.07
## sd(request_typeplease)                             0.68      0.45     0.03
## sd(request_typeplease_idq)                         1.31      0.59     0.16
## sd(question_typerude)                              1.21      0.42     0.38
## cor(Intercept,request_typeplease)                  0.12      0.41    -0.69
## cor(Intercept,request_typeplease_idq)              0.34      0.36    -0.45
## cor(request_typeplease,request_typeplease_idq)     0.15      0.40    -0.66
## cor(Intercept,question_typerude)                  -0.31      0.35    -0.84
## cor(request_typeplease,question_typerude)         -0.19      0.41    -0.85
## cor(request_typeplease_idq,question_typerude)     -0.08      0.36    -0.74
##                                                u-95% CI Eff.Sample Rhat
## sd(Intercept)                                      1.34        648 1.01
## sd(request_typeplease)                             1.66        847 1.00
## sd(request_typeplease_idq)                         2.56        798 1.01
## sd(question_typerude)                              2.03        862 1.00
## cor(Intercept,request_typeplease)                  0.83       2037 1.00
## cor(Intercept,request_typeplease_idq)              0.89        767 1.00
## cor(request_typeplease,request_typeplease_idq)     0.82       1093 1.00
## cor(Intercept,question_typerude)                   0.51        895 1.00
## cor(request_typeplease,question_typerude)          0.67        604 1.00
## cor(request_typeplease_idq,question_typerude)      0.64       1007 1.01
## 
## Population-Level Effects: 
##                                              Estimate Est.Error l-95% CI
## Intercept                                        0.40      0.25    -0.09
## age                                              0.24      0.25    -0.26
## request_typeplease                               1.18      0.41     0.41
## request_typeplease_idq                           1.71      0.56     0.74
## question_typerude                                0.73      0.40    -0.03
## age:request_typeplease                          -0.00      0.37    -0.75
## age:request_typeplease_idq                       1.37      0.57     0.35
## age:question_typerude                           -0.23      0.40    -1.02
## request_typeplease:question_typerude            -1.78      0.53    -2.84
## request_typeplease_idq:question_typerude        -1.31      0.61    -2.51
## age:request_typeplease:question_typerude         1.06      0.53     0.01
## age:request_typeplease_idq:question_typerude    -0.45      0.64    -1.73
##                                              u-95% CI Eff.Sample Rhat
## Intercept                                        0.92       3249 1.00
## age                                              0.74       2394 1.00
## request_typeplease                               2.03       2465 1.00
## request_typeplease_idq                           2.94       1684 1.00
## question_typerude                                1.50       2464 1.00
## age:request_typeplease                           0.73       2562 1.00
## age:request_typeplease_idq                       2.62       1805 1.00
## age:question_typerude                            0.53       2275 1.00
## request_typeplease:question_typerude            -0.79       2376 1.00
## request_typeplease_idq:question_typerude        -0.15       2459 1.00
## age:request_typeplease:question_typerude         2.11       2578 1.00
## age:request_typeplease_idq:question_typerude     0.74       2148 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
glm_p <- glmer(data=d1 %>% mutate(question_type = fct_relevel(question_type, "polite")) %>%               
               mutate(age = scale(age)), family = "binomial", correct ~ age * request_type * question_type + (1 | subid))
summary(glm_p)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: correct ~ age * request_type * question_type + (1 | subid)
##    Data: 
## d1 %>% mutate(question_type = fct_relevel(question_type, "polite")) %>%  
##     mutate(age = scale(age))
## 
##      AIC      BIC   logLik deviance df.resid 
##    637.5    693.5   -305.8    611.5      535 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.8676 -0.7956  0.4319  0.6222  3.0943 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  subid  (Intercept) 0.435    0.6596  
## Number of obs: 548, groups:  subid, 46
## 
## Fixed effects:
##                                              Estimate Std. Error z value
## (Intercept)                                   0.38813    0.24360   1.593
## age                                           0.23151    0.24378   0.950
## request_typeplease                            1.01057    0.34638   2.917
## request_typeplease_idq                        1.20513    0.38659   3.117
## question_typerude                             0.60818    0.32864   1.851
## age:request_typeplease                       -0.02038    0.33800  -0.060
## age:request_typeplease_idq                    0.93664    0.39298   2.383
## age:question_typerude                        -0.24581    0.32421  -0.758
## request_typeplease:question_typerude         -1.56831    0.48799  -3.214
## request_typeplease_idq:question_typerude     -1.12285    0.52043  -2.158
## age:request_typeplease:question_typerude      0.93555    0.49022   1.908
## age:request_typeplease_idq:question_typerude -0.29345    0.52211  -0.562
##                                              Pr(>|z|)   
## (Intercept)                                   0.11109   
## age                                           0.34228   
## request_typeplease                            0.00353 **
## request_typeplease_idq                        0.00182 **
## question_typerude                             0.06422 . 
## age:request_typeplease                        0.95192   
## age:request_typeplease_idq                    0.01715 * 
## age:question_typerude                         0.44835   
## request_typeplease:question_typerude          0.00131 **
## request_typeplease_idq:question_typerude      0.03096 * 
## age:request_typeplease:question_typerude      0.05634 . 
## age:request_typeplease_idq:question_typerude  0.57408   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age    rqst_t rqst__ qstn_t ag:rq_ ag:r__ ag:qs_ rqs_:_
## age          0.010                                                        
## rqst_typpls -0.583 -0.003                                                 
## rqst_typpl_ -0.523 -0.004  0.376                                          
## qustn_typrd -0.617 -0.005  0.442  0.393                                   
## ag:rqst_typ -0.006 -0.605  0.057  0.006  0.004                            
## ag:rqst_ty_ -0.004 -0.518  0.007  0.291  0.005  0.377                     
## ag:qstn_typ -0.008 -0.632  0.002  0.004  0.005  0.457  0.390              
## rqst_typp:_  0.411  0.001 -0.714 -0.267 -0.678 -0.038 -0.003  0.000       
## rqst_typ_:_  0.389  0.003 -0.279 -0.741 -0.631 -0.003 -0.215 -0.003  0.427
## ag:rqst_t:_  0.008  0.420 -0.032 -0.002  0.001 -0.691 -0.258 -0.664  0.041
## ag:rqst__:_  0.005  0.392 -0.001 -0.217 -0.003 -0.284 -0.749 -0.621 -0.001
##             rq__:_ ag:_:_
## age                      
## rqst_typpls              
## rqst_typpl_              
## qustn_typrd              
## ag:rqst_typ              
## ag:rqst_ty_              
## ag:qstn_typ              
## rqst_typp:_              
## rqst_typ_:_              
## ag:rqst_t:_  0.000       
## ag:rqst__:_  0.202  0.412