library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(here)

## here() starts at /Users/caoanjie/Desktop/projects/CCRR_analysis/study_2

library(lme4)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(lmerTest)

## 
## Attaching package: 'lmerTest'

## The following object is masked from 'package:lme4':
## 
##     lmer

## The following object is masked from 'package:stats':
## 
##     step

library(effectsize)

tidy_d <- read_csv(here("data/4_processed/with_human_coded_main.csv"))

## Warning: One or more parsing issues, see `problems()` for details

## Rows: 41284 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): subject, culture, task_name, task_info, trial_info, resp_type, resp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Change Detection

Model

Change detection (linear regression): log(reaction_time) ~ culture * type_of_change + (type_of_change | subject) + (culture | picture)

cd_df <- tidy_d %>% 
  filter(task_name == "CD") %>% 
  mutate(reaction_time = as.numeric(resp), 
         type_of_change = task_info,
         picture = trial_info) %>% 
  select(culture, subject,reaction_time,type_of_change, picture) %>% 
  filter(!is.na(reaction_time))

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

#cd_model <- lmer(log(reaction_time) ~ culture * type_of_change + (type_of_change | subject) + (culture | picture),  data = cd_df)

#cd_model <- lmer(log(reaction_time) ~ culture * type_of_change + (type_of_change | subject) ,  data = cd_df)

cd_model <- lmer(log(reaction_time) ~ culture * type_of_change + (1 | subject) ,  data = cd_df)

summary(cd_model)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log(reaction_time) ~ culture * type_of_change + (1 | subject)
##    Data: cd_df
## 
## REML criterion at convergence: 25977.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.8360 -0.6794 -0.0535  0.6175  4.4740 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  subject  (Intercept) 0.2218   0.4710  
##  Residual             0.4732   0.6879  
## Number of obs: 11857, groups:  subject, 468
## 
## Fixed effects:
##                                Estimate Std. Error        df t value Pr(>|t|)
## (Intercept)                   8.796e+00  3.849e-02 5.192e+02 228.513  < 2e-16
## cultureUS                     1.001e-01  4.859e-02 5.231e+02   2.060 0.039892
## type_of_changefocal           7.093e-02  2.064e-02 1.142e+04   3.436 0.000592
## cultureUS:type_of_changefocal 3.844e-02  2.628e-02 1.143e+04   1.463 0.143551
##                                  
## (Intercept)                   ***
## cultureUS                     *  
## type_of_changefocal           ***
## cultureUS:type_of_changefocal    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cltrUS typ_f_
## cultureUS   -0.792              
## typ_f_chngf -0.245  0.194       
## cltrUS:ty__  0.192 -0.246 -0.785

Visualization

raw_CD <- tidy_d %>% 
  filter(task_name == "CD") %>% 
  mutate(log_rt = log(as.numeric(resp)))

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

ggplot(data = raw_CD, 
       aes(y = log_rt, x = culture, color = culture)) +
geom_point(alpha = .2, position = position_jitter(width = .1)) + 
  stat_summary(fun.data = "mean_cl_boot", color = "black") +
scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("log RT (ms)") + 
xlab("") +
theme_classic() + 
labs(title = "Change Detection") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8))+
  facet_wrap(~task_info)

## Warning: Removed 80 rows containing non-finite values (stat_summary).

## Warning: Removed 80 rows containing missing values (geom_point).

by item exploration

cn_coded <- read_csv(here("data/annotated/CN/CD_wr.csv"))

## Rows: 5190 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): subject, culture, prompt, response, coder
## dbl (1): correct
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

us_coded <- read_csv(here("data/annotated/US/CD_wr.csv"))

## Rows: 8880 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): subject, culture, prompt, response, coder
## dbl (1): correct
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

coded_df <- bind_rows(cn_coded, us_coded)

coded_df %>% 
  ggplot(aes(x = culture, y = correct)) + 
  stat_summary(fun.data = "mean_cl_boot") + 
  facet_wrap(~prompt)

# so it's clear that there are some big item level variations

# what if we took out all the ones with cultural differences in accuracy 

c <- coded_df %>% 
 nest_by(prompt) 


c$culture_pred <- lapply(c$data, 
       function(d){
         (glm(correct ~ culture , family = binomial, data = d) %>% 
           summary())$coefficients %>% 
           as.data.frame() %>% 
           rownames_to_column() %>% filter(rowname == "cultureUS") %>% pull(`Pr(>|z|)`)
       })

c %>% 
  select(prompt, culture_pred) %>% 
  unnest(culture_pred) %>% 
  left_join(coded_df, by = "prompt") %>% 
  filter(culture_pred < 0.05) %>% 
  ggplot(aes(x = culture, y = correct)) + 
  
  geom_text(aes(label = round(culture_pred, 2))) + 
  stat_summary(fun.data = "mean_cl_boot") +
  facet_wrap(~prompt)

cultural_diff_prompt <- c %>% 
  filter(culture_pred < 0.05) %>% 
  pull(prompt)

raw_CD %>% 
  filter(!(trial_info %in% cultural_diff_prompt)) %>% 
ggplot(
       aes(y = log_rt, x = culture, color = culture)) +
geom_point(alpha = .2, position = position_jitter(width = .1)) + 
  stat_summary(fun.data = "mean_cl_boot", color = "black") +
scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("log RT (ms)") + 
xlab("") +
theme_classic() + 
labs(title = "Change Detection") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8))+
  facet_wrap(~task_info)

## Warning: Removed 50 rows containing non-finite values (stat_summary).

## Warning: Removed 50 rows containing missing values (geom_point).

cd_model_pruned <- lmer(log_rt ~ culture * task_info + (1 | subject) ,  data = raw_CD %>% filter(!(trial_info %in% cultural_diff_prompt)) )

summary(cd_model_pruned)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log_rt ~ culture * task_info + (1 | subject)
##    Data: raw_CD %>% filter(!(trial_info %in% cultural_diff_prompt))
## 
## REML criterion at convergence: 17507.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.1402 -0.6796 -0.0764  0.5986  4.1977 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  subject  (Intercept) 0.2089   0.4570  
##  Residual             0.4839   0.6957  
## Number of obs: 7820, groups:  subject, 468
## 
## Fixed effects:
##                            Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)                 8.71410    0.03853  530.91174 226.170  < 2e-16 ***
## cultureUS                   0.11891    0.04863  533.59586   2.445 0.014795 *  
## task_infofocal              0.10255    0.02656 7389.18837   3.860 0.000114 ***
## cultureUS:task_infofocal   -0.04226    0.03367 7394.70876  -1.255 0.209456    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cltrUS tsk_nf
## cultureUS   -0.792              
## task_inffcl -0.266  0.211       
## cltrUS:tsk_  0.210 -0.266 -0.789

Free Description

Model

Free description: (logistic regression) first_mention ~ culture + (1 | subject) + (culture | picture)

mention_df <- tidy_d %>% 
  filter(task_name == "FD", resp_type == "first_mention_focal") %>% 
    mutate(first_mention = as.factor(case_when(
    resp == "1" ~ "focal",
    resp == "0" ~ "background")), 
    scene = trial_info) %>% 
  select(-resp, -task_info, -resp_type, -trial_info)

mention_model <- glmer(first_mention ~ culture + (1 | subject)+(culture | scene), family = binomial, data = mention_df)


#mention_model <- glmer(first_mention ~ culture + (1 | subject) , 
#                   family = binomial, data = mention_df)

mention_model

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: first_mention ~ culture + (1 | subject) + (culture | scene)
##    Data: mention_df
##       AIC       BIC    logLik  deviance  df.resid 
##  2082.526  2119.103 -1035.263  2070.526      3276 
## Random effects:
##  Groups  Name        Std.Dev. Corr 
##  subject (Intercept) 1.5592        
##  scene   (Intercept) 1.0659        
##          cultureUS   0.4595   -0.70
## Number of obs: 3282, groups:  subject, 469; scene, 7
## Fixed Effects:
## (Intercept)    cultureUS  
##       1.117        2.994

summary(mention_model)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: first_mention ~ culture + (1 | subject) + (culture | scene)
##    Data: mention_df
## 
##      AIC      BIC   logLik deviance df.resid 
##   2082.5   2119.1  -1035.3   2070.5     3276 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.7794  0.0901  0.1366  0.3162  3.2930 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr 
##  subject (Intercept) 2.4310   1.5592        
##  scene   (Intercept) 1.1362   1.0659        
##          cultureUS   0.2112   0.4595   -0.70
## Number of obs: 3282, groups:  subject, 469; scene, 7
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.1167     0.4290   2.603  0.00924 ** 
## cultureUS     2.9942     0.3042   9.843  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## cultureUS -0.514

summary(mention_model)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: first_mention ~ culture + (1 | subject) + (culture | scene)
##    Data: mention_df
## 
##      AIC      BIC   logLik deviance df.resid 
##   2082.5   2119.1  -1035.3   2070.5     3276 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.7794  0.0901  0.1366  0.3162  3.2930 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr 
##  subject (Intercept) 2.4310   1.5592        
##  scene   (Intercept) 1.1362   1.0659        
##          cultureUS   0.2112   0.4595   -0.70
## Number of obs: 3282, groups:  subject, 469; scene, 7
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.1167     0.4290   2.603  0.00924 ** 
## cultureUS     2.9942     0.3042   9.843  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## cultureUS -0.514

Visualization

FD_raw <- tidy_d %>% 
  filter(task_name == "FD") %>% 
  group_by(subject, culture) %>%
  summarise(first_mention = mean(as.numeric(resp)))

## `summarise()` has grouped output by 'subject'. You can override using the
## `.groups` argument.

FD_raw %>% 
  ggplot(aes(x = culture, y = first_mention, color = culture)) + 
  geom_point(alpha = .2, position = position_jitter(width = .1)) + 
  stat_summary(fun.data = "mean_cl_boot", color = "black") + 
  scale_y_continuous(breaks = seq(0,1,0.5), 
                     labels = {function(x) paste0(as.character(x*100),"%")})+
scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("proportion first mention focal") + 
xlab("") +
theme_classic() + 
labs(title = "Free description") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8))

Causal attribution

Model

Causal attribution (linear regression): rating ~ culture * attribution_type + (attribution_type | subject) + (culture | item)

ca_df <- tidy_d %>% 
  filter(task_name == "CA") %>% 
  mutate(attribution_type = task_info, 
         item = trial_info,
         rating = as.numeric(resp)) %>% 
  select(culture, subject,attribution_type,item, rating)

ca_model <- lmer(rating ~ culture * attribution_type + (attribution_type | subject) + (culture | item),  data = ca_df)

summary(ca_model)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: rating ~ culture * attribution_type + (attribution_type | subject) +  
##     (culture | item)
##    Data: ca_df
## 
## REML criterion at convergence: 17076.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.8871 -0.6313 -0.0996  0.5959  3.5181 
## 
## Random effects:
##  Groups   Name                        Variance Std.Dev. Corr 
##  subject  (Intercept)                 0.94212  0.9706        
##           attribution_typesituational 1.04324  1.0214   -0.72
##  item     (Intercept)                 0.12027  0.3468        
##           cultureUS                   0.08851  0.2975   0.30 
##  Residual                             1.77337  1.3317        
## Number of obs: 4703, groups:  subject, 409; item, 12
## 
## Fixed effects:
##                                       Estimate Std. Error       df t value
## (Intercept)                            3.16954    0.17526 18.14969  18.085
## cultureUS                             -0.07967    0.17218 25.45322  -0.463
## attribution_typesituational           -0.21294    0.23357 14.34566  -0.912
## cultureUS:attribution_typesituational -1.32982    0.22290 17.96075  -5.966
##                                       Pr(>|t|)    
## (Intercept)                           4.66e-13 ***
## cultureUS                                0.648    
## attribution_typesituational              0.377    
## cultureUS:attribution_typesituational 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cltrUS attrb_
## cultureUS   -0.180              
## attrbtn_typ -0.702  0.086       
## cltrUS:ttr_  0.089 -0.701 -0.077

Visualization

raw_CA <- tidy_d %>% 
  filter(task_name == "CA") %>% 
  mutate(resp = as.numeric(resp) + 1)

ggplot(data = raw_CA, 
       aes(y = resp, x = culture, color = culture)) +
 geom_point(alpha = .2, position = position_jitter(width = .1)) + 
  stat_summary(fun.data = "mean_cl_boot", color = "black")  +
scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("LS rating (1-7)") + 
xlab("") +
theme_classic() + 
labs(title = "Causal Attribution") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8)) + 
  facet_wrap(~task_info)

## Warning: Removed 361 rows containing non-finite values (stat_summary).

## Warning: Removed 361 rows containing missing values (geom_point).

Symbolic Self Inflation

Model

Symbolic self-inflation (linear regression): percent_inflation ~ culture

si_df <- tidy_d %>% 
  filter(task_name == "SSI") %>% 
  filter(resp_type == "task_score_ratio") %>% 
  mutate(score = as.numeric(resp)) %>% 
  select(-resp, -task_info, -trial_info, -resp_type)

si_model <- glm(score ~ culture, family=gaussian, data = si_df)

si_model

## 
## Call:  glm(formula = score ~ culture, family = gaussian, data = si_df)
## 
## Coefficients:
## (Intercept)    cultureUS  
##      1.4392      -0.1268  
## 
## Degrees of Freedom: 416 Total (i.e. Null);  415 Residual
## Null Deviance:       142.4 
## Residual Deviance: 140.9     AIC: 736.8

summary(si_model)

## 
## Call:
## glm(formula = score ~ culture, family = gaussian, data = si_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1877  -0.3675  -0.1396   0.2042   3.1062  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.43921    0.04591  31.345   <2e-16 ***
## cultureUS   -0.12678    0.05860  -2.163   0.0311 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3394127)
## 
##     Null deviance: 142.44  on 416  degrees of freedom
## Residual deviance: 140.86  on 415  degrees of freedom
## AIC: 736.81
## 
## Number of Fisher Scoring iterations: 2

Visualization

SSI_raw <- tidy_d %>% 
  filter(task_name == "SSI") %>% 
  mutate(resp = as.numeric(resp)) 


ggplot(data = SSI_raw %>% filter(resp_type == "task_score_ratio"), 
       aes(y = resp, x = culture, color = culture)) +

geom_point(aes(y = resp, color = culture), 
           position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  stat_summary(fun.data = "mean_cl_boot", color = "black")  +

scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("") + 
xlab("") +
theme_classic() + 
labs(title = "Symbolic Self Inflation") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8))

Triads

Model

Taxonomic/thematic similarity task: (logistic regression) choice ~ culture + (1 | subject) + (culture | item)

TD_catch_failed <- tidy_d %>% 
  filter(task_name == "TD") %>% 
  filter(task_info == "catch") %>% 
  filter(resp = FALSE) %>% 
  pull(subject)

triads_d <- tidy_d %>% 
    filter(!subject %in% TD_catch_failed) %>% 
  filter(task_name == "TD") %>% 
  mutate(choice = as.factor(resp), 
         item = trial_info) %>% 
  select(choice, culture, subject, item)
 

triads_model <- glmer(choice ~ culture + (1 | subject) + (culture | item), family = binomial, data = triads_d)
 
summary(triads_model)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: choice ~ culture + (1 | subject) + (culture | item)
##    Data: triads_d
## 
##      AIC      BIC   logLik deviance df.resid 
##   6370.2   6412.1  -3179.1   6358.2     7967 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -9.7870 -0.3425  0.1279  0.4031 13.3032 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr
##  subject (Intercept) 3.0046   1.7334       
##  item    (Intercept) 5.5174   2.3489       
##          cultureUS   0.4674   0.6836   1.00
## Number of obs: 7973, groups:  subject, 469; item, 60
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.1052     0.4598  -0.229   0.8191  
## cultureUS     1.9036     0.8782   2.168   0.0302 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## cultureUS -0.524

Visualization

TD_catch_failed <- tidy_d %>% 
  filter(task_name == "TD") %>% 
  filter(task_info == "catch") %>% 
  filter(resp = FALSE) %>% 
  pull(subject)

TD_raw <- tidy_d %>% 
  filter(!subject %in% TD_catch_failed) %>% 
  filter(task_name == "TD") %>% 
  group_by(subject, culture) %>%
  summarise(tax_match = mean(as.logical(resp)))

## `summarise()` has grouped output by 'subject'. You can override using the
## `.groups` argument.

ggplot(data = TD_raw, 
       aes(y = tax_match, x = culture, color = culture)) +
geom_point(aes(y = tax_match, color = culture), 
           position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  stat_summary(fun.data = "mean_cl_boot", color = "black")  +
scale_y_continuous(breaks = seq(0,1,0.5), 
                     labels = {function(x) paste0(as.character(x*100),"%")})+
scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("Triads taxonomic match") + 
xlab("") +
theme_classic() + 
labs(title = "Triads") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8))

Semantic Intuition

Model

Semantic intuition: (logistic regression) choice ~ culture + (1 | subject) + (culture | item)

SeI_df <- tidy_d %>% 
  filter(task_name == "SeI") %>% 
  filter(task_info == "critical") %>% 
  mutate(
    choice_causal = case_when(
      resp == "causal_historical" ~ TRUE, 
      resp == "descriptivist" ~ FALSE
    )
  ) %>% 
  mutate(item = trial_info) %>% 
  select(choice_causal, culture, subject, item)

#SeI_model <- glmer(choice_causal ~ culture + (1 | subject) + (culture | item), family = binomial, data = SeI_df)

SeI_model <- glmer(choice_causal ~ culture + (1 | subject) , family = binomial, data = SeI_df, 
                   control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))

summary(SeI_model)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: choice_causal ~ culture + (1 | subject)
##    Data: SeI_df
## Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
##      AIC      BIC   logLik deviance df.resid 
##   1086.3   1100.8   -540.1   1080.3      935 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2532 -0.4073  0.2405  0.3685  0.9718 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  subject (Intercept) 7.242    2.691   
## Number of obs: 938, groups:  subject, 469
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.2645     0.2771   0.955     0.34    
## cultureUS     1.7941     0.3988   4.499 6.83e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## cultureUS -0.636

Visualization

SeI_raw <- tidy_d %>% 
  filter(task_name == "SeI") %>% 
  filter(task_info == "critical") %>% 
  mutate(causal_historical_choice = case_when(
    resp == "causal_historical" ~ TRUE,
    resp == "descriptivist" ~ FALSE
  )) %>% 
  group_by(subject, culture) %>%
  summarise(causal_historical_resp = mean(causal_historical_choice))

## `summarise()` has grouped output by 'subject'. You can override using the
## `.groups` argument.

ggplot(data = SeI_raw, 
       aes(y = causal_historical_resp, x = culture, color = culture)) +
geom_point(aes(y = causal_historical_resp, color = culture), 
           position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  stat_summary(fun.data = "mean_cl_boot", color = "black")  +
scale_y_continuous(breaks = seq(0,1,0.5), 
                     labels = {function(x) paste0(as.character(x*100),"%")})+
scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("Causal Historical Choice") + 
xlab("") +
theme_classic() + 
labs(title = "Semantic Intuition") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8))

Ambiguous RMTS

Model

Ambiguous RMTS (logistic regression): choice ~ culture + (trial_num | subject)

rmts_df <- tidy_d %>% 
  filter(task_name == "RMTS") %>% 
  mutate(choice = as.factor(case_when(
    resp == "1" ~ "rel",
    resp == "0" ~ "obj"))
         ) %>%
  group_by(subject) %>% 
  mutate(trial_num = as.factor(row_number())) %>% 
  select(-resp, -task_info, -trial_info, -resp_type)
# model 0: not converging 
#rmts_model <- glmer(choice ~ culture + (trial_num | subject), family = binomial, data = rmts_df)

# model 1: 
rmts_model <- glmer(choice ~ culture + (1 | subject), family = binomial, data = rmts_df)

#rmts_model
summary(rmts_model)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: choice ~ culture + (1 | subject)
##    Data: rmts_df
## 
##      AIC      BIC   logLik deviance df.resid 
##   1721.6   1738.2   -857.8   1715.6     1873 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6016 -0.1880 -0.1848  0.2487  1.7378 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  subject (Intercept) 16.58    4.071   
## Number of obs: 1876, groups:  subject, 469
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -1.1874     0.3890  -3.052  0.00227 **
## cultureUS     0.1067     0.4842   0.220  0.82562   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## cultureUS -0.770

Visualization

RMTS_raw <- tidy_d %>% 
  filter(task_name == "RMTS") %>% 
  group_by(subject, culture) %>%
  summarise(relational_choice = mean(as.numeric(resp)))

## `summarise()` has grouped output by 'subject'. You can override using the
## `.groups` argument.

ggplot(data = RMTS_raw, 
       aes(y = relational_choice, x = culture, color = culture)) +
geom_point(aes(y = relational_choice, color = culture), 
           position = position_jitter(width = .15), size = .5, alpha = 0.8) +
stat_summary(fun.data = "mean_cl_boot", color = "black") + 
scale_y_continuous(breaks = seq(0,1,0.5), 
                     labels = {function(x) paste0(as.character(x*100),"%")})+
scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("Proportion relational choice") + 
xlab("") +
theme_classic() + 
labs(title = "Ambiguous RMTS") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8))

Ravens

Model

Raven (logistic regression): acc ~ culture + (1 | subject) + (1 | trial)

rv_df <- tidy_d %>% 
  filter(task_name == "RV") %>% 
  mutate(acc = as.numeric(resp)) %>% 
  group_by(subject) %>% 
  mutate(trial = as.factor(row_number())) %>% 
  select(-resp, -task_info, -trial_info, -resp_type)

rv_model <- glmer(acc ~ culture + (1 | subject) + (culture | trial), family = binomial, data = rv_df)

rv_model

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: acc ~ culture + (1 | subject) + (culture | trial)
##    Data: rv_df
##       AIC       BIC    logLik  deviance  df.resid 
##  5470.205  5510.018 -2729.103  5458.205      5622 
## Random effects:
##  Groups  Name        Std.Dev. Corr 
##  subject (Intercept) 1.689         
##  trial   (Intercept) 1.336         
##          cultureUS   0.568    -0.11
## Number of obs: 5628, groups:  subject, 469; trial, 12
## Fixed Effects:
## (Intercept)    cultureUS  
##       1.734       -1.786

summary(rv_model)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: acc ~ culture + (1 | subject) + (culture | trial)
##    Data: rv_df
## 
##      AIC      BIC   logLik deviance df.resid 
##   5470.2   5510.0  -2729.1   5458.2     5622 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -8.2272 -0.4963  0.1863  0.4771 10.8082 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr 
##  subject (Intercept) 2.8515   1.689         
##  trial   (Intercept) 1.7855   1.336         
##          cultureUS   0.3226   0.568    -0.11
## Number of obs: 5628, groups:  subject, 469; trial, 12
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.7343     0.4140   4.189 2.80e-05 ***
## cultureUS    -1.7862     0.2472  -7.226 4.99e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr)
## cultureUS -0.290

Visualization

RV_raw <- tidy_d %>% 
  filter(task_name == "RV") %>% 
  group_by(subject, culture) %>%
  summarise(RV_resp = mean(as.numeric(resp)))

## `summarise()` has grouped output by 'subject'. You can override using the
## `.groups` argument.

ggplot(data = RV_raw, 
       aes(y = RV_resp, x = culture, color = culture)) +
geom_point(aes(y = RV_resp, color = culture), 
           position = position_jitter(width = .15), size = .5, alpha = 0.8) +
stat_summary(fun.data = "mean_cl_boot", color = "black") + 
scale_y_continuous(breaks = seq(0,1,0.5), 
                     labels = {function(x) paste0(as.character(x*100),"%")})+
scale_color_manual(values = c("red", "blue"))+
scale_fill_manual(values = c("red", "blue"))+
guides(fill = "none") +
guides(color = "none") +
ylab("Ravens proportion correct") + 
xlab("") +
theme_classic() + 
labs(title = "Ravens") +
theme(plot.title = element_text(hjust = 0.5, size = 8), 
      plot.subtitle = element_text(hjust = 0.5, size = 6), 
      text = element_text(size=8))

Meta plots

Partial pooling DIY

helper function

https://stats.stackexchange.com/questions/257985/how-can-i-derive-effect-sizes-in-lme4-and-describe-the-magnitude-of-fixed-effect

get_d_ci_df <- function(d, main_df, task_name_str){
  
  subject_count_df <- main_df %>% 
    filter(task_name == task_name_str) %>% 
    distinct(subject, culture) %>% 
    group_by(culture) %>% 
    count()
  
  var_d <- (sum(subject_count_df$n)/prod(subject_count_df$n)) + 
    (d^2) / (2 * (sum(subject_count_df$n)))
  
  ci_lower <- d - 1.96 * sqrt(var_d)
  ci_upper <- d + 1.96 * sqrt(var_d)
  
  d_ci_df <- tibble(task_name = task_name_str, 
                    d = d, 
                    d_ci_lower = ci_lower, 
                    d_ci_upper = ci_upper)
    
  return (d_ci_df)
}

get_d_partial_pooling <- function(main_df, model, task_name_str){
  
  # task_name_str 
  interaction_effect_task <- c("CD", "CA", "EBB")

  # get variance 
  varcor_df <- summary(model)$varcor %>% 
               as.data.frame() %>% 
               # not entirely clear what this means 
               # but the doc says: second variable (NA for variance parameters)
               filter(is.na(var2)) 
  
  # calculate the pooled variance 
  pooled_var <- sqrt(sum(varcor_df$vcov))
  
  # get estimate for fixed effect
  fixed_effect_df <- summary(model)$coefficients %>% 
    as.data.frame() %>% rownames_to_column("term_name") 
  
  # branching because some cultural effects are main and some are interactions
  if(!(task_name_str %in% interaction_effect_task)){
    estimate <- fixed_effect_df %>% 
      filter(term_name == "cultureUS") %>% 
      pull(Estimate)
  }else{
    if(task_name_str == "EBB"){
       estimate <- fixed_effect_df %>% 
      filter(grepl("cultureUS:contextNC:size_diff", term_name)) %>% 
      pull(Estimate)
    }else{
      estimate <- fixed_effect_df %>% 
      filter(grepl("cultureUS:", term_name)) %>% 
      pull(Estimate)
    }
    
    
  }
  
  d <- estimate / pooled_var
  d_ci_df <- get_d_ci_df(d, main_df, task_name_str)
  
  return (d_ci_df)
  
}

get_d_no_pooling <- function(main_df, task_name_str){
  # needs to make sure it's us - cn to keep it consistent 
  # in complete yet 
  
  if(task_name_str == "SSI"){

    task_df <- main_df %>% 
     filter(resp_type == "task_score_ratio") %>% 
      filter(task_name == task_name_str) %>% 
      mutate(resp = as.numeric(resp)) 

  }else if(task_name_str == "CP"){
    task_df <- main_df %>% 
      filter(task_name == task_name_str)
    
  }else if(task_name_str == "SI"){
    task_df <- main_df %>% 
      filter(task_name == task_name_str) %>% 
      filter(resp_type == "inflation_score_ratio")
  }else if(task_name_str == "CA"){
    # for study 1
    task_df <- main_df %>% 
      # first is for s1, then for s2
      filter(grepl("situation", resp_type) | 
             grepl("situation", trial_info))
  }else if(task_name_str == "HZ"){
    task_df <- main_df %>% 
      filter(task_name == "HZ", resp_type == "hz_height")
  }
  
  
    mean_df <- task_df %>% 
          group_by(culture) %>% 
          summarise(mean_resp = mean(resp, na.rm = TRUE))
    sd <- sd(task_df$resp, na.rm = TRUE)
    
    d = (filter(mean_df, culture == "US")$mean_resp - filter(mean_df, culture == "CN")$mean_resp) / sd
    
    d_var = get_d_ci_df(d, main_df, task_name_str)
    return (d_var)
  
}

Stduy 1 ES

s1_d <- read_csv(here("data/s1_tidy_main.csv"))

## New names:
## * `` -> ...1

## Rows: 37595 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): subject, culture, task_name, task_info, trial_info, resp_type
## dbl (2): ...1, resp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# RMTS 
rmts_df <- s1_d %>% 
  filter(task_name == "RMTS") %>% 
  mutate(choice = as.factor(case_when(
    resp == "1" ~ "rel",
    resp == "0" ~ "obj"))
         ) %>%
  group_by(subject) %>% 
  mutate(trial_num = as.factor(row_number())) %>% 
  select(-resp, -task_info, -trial_info, -resp_type)
# model 0: not converging 
#rmts_model <- glmer(choice ~ culture + (trial_num | subject), family = binomial, data = rmts_df)

# model 1: 
s1_rmts_model <- glmer(choice ~ culture + (1 | subject), family = binomial, data = rmts_df)



# RV
rv_df <- s1_d %>% 
  filter(task_name == "RV") %>% 
  mutate(acc = as.numeric(resp)) %>% 
  group_by(subject) %>% 
  mutate(trial = as.factor(row_number())) %>% 
  select(-resp, -task_info, -trial_info, -resp_type)

s1_rv_model <- glmer(acc ~ culture + (1 | subject) + (culture | trial), family = binomial, data = rv_df)


# CP
cp_df <- s1_d %>% 
  filter(task_name == "CP") %>% 
  mutate(choice = as.factor(case_when(
    resp == "1" ~ "uniq",
    resp == "0" ~ "non_uniq"))
         ) %>% 
  select(-resp, -task_info, -trial_info, -resp_type)

cp_model <- glm(choice ~ culture, 
                   family=binomial(link="logit"),
                  data = cp_df)

# SSI 
ssi_df <- s1_d %>% 
  filter(task_name == "SI") %>% 
  filter(resp_type == "inflation_score_ratio") %>% 
  mutate(score = resp) %>% 
  select(-resp, -task_info, -trial_info, -resp_type)

s1_ssi_model <- glm(score ~ culture, family=gaussian, data = ssi_df)


# CA

ca_df <- s1_d %>% 
  filter(task_name == "CA") %>% 
  mutate(attrib_num = as.numeric(resp),
         attrib_binary = replace(attrib_num, attrib_num > 1, 1),
         trial = trial_info,
         attrib_type = factor(resp_type)) %>% 
  select(-resp, -task_info)

#ca_model <- glmer(attrib_num ~ attrib_type * culture + (attrib_type | subject) + (culture | trial), family=poisson, data = ca_df, control=glmerControl(optimizer="bobyqa"))
#boundary (singular) fit: see ?isSingular

#ca_model_binary <- glmer(attrib_binary ~ attrib_type * culture + (attrib_type | subject) + (culture | trial), family=binomial, data = ca_df, control=glmerControl(optimizer="bobyqa"))
#boundary (singular) fit: see ?isSingular

#ca_model1 <- glmer(attrib_num ~ attrib_type * culture + (attrib_type | subject) + (1 | trial), family=poisson, data = ca_df, control=glmerControl(optimizer="bobyqa"))
#boundary (singular) fit: see ?isSingular

#ca_model2 <- glmer(attrib_num ~ attrib_type * culture + (1 | subject) + (1 | trial), family=poisson, data = ca_df, control=glmerControl(optimizer="bobyqa"))
#boundary (singular) fit: see ?isSingular

#ca_model3 <- glmer(attrib_num ~ attrib_type * culture + (1 | subject), family=poisson, data = ca_df, control=glmerControl(optimizer="bobyqa"))
#boundary (singular) fit: see ?isSingular

s1_ca_model <- glm(attrib_num ~ attrib_type * culture, family=poisson, data = ca_df)



# HZ
HZ_height_df <- s1_d %>% 
  filter(task_name == "HZ", resp_type == "hz_height") %>% 
  mutate(
    height = resp
         ) %>% 
  select(-resp, -task_info, -trial_info)

HZ_height_model <- lm(height ~ culture, 
                      data = HZ_height_df)


ebb_df <- s1_d %>% 
  filter(task_name == "EBB", task_info != "HELPFUL") %>% 
  mutate(correct = as.factor(case_when(
    resp == "1" ~ "correct",
    resp == "0" ~ "incorrect")), 
         context = task_info, 
         size_diff = as.numeric(trial_info), 
         ) %>% 
  select(-resp, -task_info, -trial_info)
  
#full model
#ebb_model <- glmer(correct ~ culture * context * size_diff + (size_diff * context | subject), family = binomial, data = ebb_df, control=glmerControl(optimizer="bobyqa"))
#convergence code 1 from bobyqa: bobyqa -- maximum number of function evaluations exceeded
#boundary (singular) fit: see ?isSingular

#model 2 (if full does not converge)
#ebb_model <- glmer(correct ~ culture * context * size_diff + (context | subject), family = binomial, data = ebb_df, control=glmerControl(optimizer="bobyqa"))
#boundary (singular) fit: see ?isSingular


#model 3
ebb_model <- glmer(correct ~ culture * context * size_diff + ( 1 | subject), family = binomial, data = ebb_df, control=glmerControl(optimizer="bobyqa"))


# FD

s1_mention_df <- s1_d %>% 
  filter(task_name == "FD", resp_type == "first_mention_focal") %>% 
    mutate(first_mention = as.factor(case_when(
    resp == "1" ~ "focal",
    resp == "0" ~ "background")), 
    scene = trial_info) %>% 
  select(-resp, -task_info, -resp_type, -trial_info)

#mention_model <- glmer(first_mention ~ culture + (1 | subject)+(culture | scene), family = binomial, data = mention_df)
#Error: number of observations (=1146) < number of random effects (=1148) for term (scene | subject); the random-effects parameters are probably unidentifiable

s1_mention_model <- glmer(first_mention ~ culture + (1 | subject) , 
                   family = binomial, data = s1_mention_df)

Calculating ES

s1_df_pp <- bind_rows(
  get_d_partial_pooling(s1_d, s1_rmts_model, "RMTS"), 
  get_d_partial_pooling(s1_d, s1_rv_model, "RV"),
  get_d_no_pooling(s1_d, "CP"),
  get_d_no_pooling(s1_d, "SI"),
  get_d_no_pooling(s1_d, "CA"),
  get_d_no_pooling(s1_d, "HZ"),
  get_d_partial_pooling(s1_d, ebb_model, "EBB"),
  get_d_partial_pooling(s1_d, s1_mention_model, "FD")
) %>% 
  mutate(study_num = "Study 1")


s2_df_pp <- bind_rows(
  get_d_partial_pooling(tidy_d, cd_model, "CD"), 
  get_d_partial_pooling(tidy_d, mention_model, "FD"),
  get_d_partial_pooling(tidy_d, ca_model, "CA") , 
  get_d_no_pooling(tidy_d,  "SSI") , 
  get_d_partial_pooling(tidy_d, triads_model, "TD"), 
  get_d_partial_pooling(tidy_d, SeI_model, "SeI"), 
  get_d_partial_pooling(tidy_d, rmts_model, "RMTS"), 
  get_d_partial_pooling(tidy_d, rv_model, "RV")) %>% 
  mutate(study_num = "Study 2")


d_df <- bind_rows(s1_df_pp, s2_df_pp) %>% 
  mutate(
    task_name_print = case_when(
        grepl("RMTS", task_name) ~ "RMTS", 
        grepl("RV", task_name) ~ "Ravens", 
        grepl("SI", task_name) | grepl("SSI", task_name) ~ "Symbolic Self Inflation", 
        grepl("FD", task_name) ~ "Free Description", 
        grepl("CP", task_name) ~ "Uniqueness Preference", 
        grepl("CA", task_name) & study_num == "Study 1" ~ "Children Causal Attribution", 
        grepl("CA", task_name) & study_num == "Study 2" ~ "Adult Causal Attribution", 
        grepl("HZ", task_name) ~ "Horizon", 
        grepl("EBB", task_name) ~ "Ebbinghaus Illusion", 
        grepl("CD", task_name) ~ "Change Detection", 
        grepl("TD", task_name) ~ "Triads", 
        grepl("SeI", task_name) ~ "Semantic Intuition"
    )) %>% 
  mutate(significance =  !(d_ci_lower < 0 & d_ci_upper > 0), 
         advantage = case_when(
           d > 0 & significance ~ "US", 
           d < 0 & significance ~ "CN", 
           TRUE ~ "No Difference"
         ))

d_df %>% 
  arrange(-d) %>% 
  ggplot(aes(x = reorder(task_name_print,d), y = d, 
             ymin = d_ci_lower, ymax = d_ci_upper, 
             color = advantage, 
             shape = significance)) + 
  geom_hline(yintercept = 0, lty = 2) + 
  geom_pointrange(aes( 
    x = reorder(task_name_print,d), y = d, 
             ymin = d_ci_lower, ymax = d_ci_upper, group = study_num
    ), position = position_dodge(width = .6)) + 
   scale_shape_manual(values = c(1, 16), guide = "none") + 
  scale_color_manual(values = c("red", "gray", "blue")) + 
  coord_flip() + 
  theme_classic() + 
  ylab("US-CN difference") + 
  xlab("") + 
  theme(legend.title = element_blank())

03_prereged_model

anjie

6/2/2022

Change Detection

Model

Visualization

by item exploration

Free Description

Model

Visualization

Causal attribution

Model

Visualization

Symbolic Self Inflation

Model

Visualization

Triads

Model

Visualization

Semantic Intuition

Model

Visualization

Ambiguous RMTS

Model

Visualization

Ravens

Model

Visualization

Meta plots

Partial pooling DIY

helper function

Stduy 1 ES

Calculating ES