# Input files for study 1c; here() builds each path from the project root.
IAT_PATH <- here("data/study1c/AIID_subset_exploratory.csv")
ES_PATH <- here("data/study1c/bnc_vs_coca_es_5.csv")
FREQ_PATH <- here("data/study1c/iat_word_freq_difference_5.csv")
STIM_PATH <- here("analyses/study1c/iat_stuff/category_stimuli.csv")

# Language effect sizes. The raw file is long (one effect_size per
# model_source); it is spread to one column per model so that the BNC-trained
# and COCA-trained models can be differenced (BNC minus COCA) for each
# embedding type.
es_lang_raw <- read_csv(ES_PATH)
es_lang_tidy <- es_lang_raw %>%
  select(-test) %>%
  spread(key = model_source, value = effect_size) %>%
  mutate(
    fasttext_5_diff =
      trained_bnc_fasttext_5.csv - trained_coca_fasttext_5.csv
  ) %>%
  mutate(
    w2v_5_diff = trained_bnc_w2v_5.csv - trained_coca_w2v_5.csv
  ) %>%
  # Keep only the identifier and the two difference scores; bias_type is
  # renamed to `domain` so it lines up with the behavioral data.
  select(domain = bias_type, contains("diff"))
raw_exp <- read_csv(IAT_PATH)
# Keep the columns we care about and drop rows with NAs.
exp_filtered <- raw_exp %>%
  # Columns 1 and 5 are selected by position, so their names are not visible
  # here. NOTE(review): one of them is presumably `task_order`, which the
  # residual model further down relies on -- confirm against the CSV header.
  select(1, 5, D, residence, sex, age, block_order, domain,
         education, exclude_iat) %>%
  # Align the capitalization of this one domain label with the spelling used
  # in the language effect-size table.
  mutate(domain = if_else(domain == "Determinism - Free will",
                          "Determinism - Free Will",
                          domain)) %>%
  mutate_if(is.character, as.factor) %>%
  # Drop sessions flagged for exclusion, then keep only domains that also
  # have a language effect size.
  filter(!exclude_iat) %>%
  filter(domain %in% es_lang_tidy$domain) %>%
  drop_na()

# Restrict to US and UK residents. The exclusion flag has already been applied
# when building exp_filtered, so the column is no longer needed.
exp_filtered_countries <- exp_filtered %>%
  select(-exclude_iat) %>%
  filter(residence %in% c("us", "uk"))

# Residualize D on the order and demographic covariates. The model is fit on
# the same rows it is applied to; add_residuals() appends a `resid` column.
resid_es <- add_residuals(
  exp_filtered_countries,
  lm(D ~ task_order + sex + age + block_order + education,
     data = exp_filtered_countries)
)

# Mean residualized D for each domain within each country, reshaped to one
# column per country. behavioral_diff is the UK mean minus the US mean.
es_iat_tidy <- resid_es %>%
  group_by(residence, domain) %>%
  summarize(resid = mean(resid)) %>%
  # summarize() only removes the last grouping level, so the result would
  # still be grouped by residence; drop the grouping explicitly before
  # reshaping on that very column.
  ungroup() %>%
  spread(residence, resid) %>%
  mutate(behavioral_diff = uk - us)
# Get category-attribute pairs.
# The first data row of the stimulus file is discarded and the 14 columns are
# given explicit names in a single step.
stim <- setNames(
  read_csv(STIM_PATH)[-1, ],
  c("domain", "cat_id", "cat_label",
    "S1", "S2", "S3", "S4", "S5", "S6", "S7",
    "S8", "S9", "evaluative_label", "notes")
)
# Domain label for each of the leading stimulus rows: every non-missing domain
# value is repeated twice, in file order.
# NOTE(review): this presumes each domain occupies exactly two consecutive
# rows with the label filled in on only one of them -- confirm against the
# stimulus file.
rep_domain <- rep(stim$domain[!is.na(stim$domain)], each = 2)

# The labels are attached positionally, so there must be at least as many
# stimulus rows as labels; fail with a clear message otherwise.
stopifnot(length(rep_domain) <= nrow(stim))

domain_eval_set <- stim %>%
  # Take exactly as many leading rows as there are labels instead of relying
  # on a hard-coded row count that must be kept in sync with the file.
  slice(seq_along(rep_domain)) %>%
  select(evaluative_label) %>%
  mutate(domain = rep_domain) %>%
  filter(!is.na(evaluative_label)) %>%
  # NOTE(review): str_replace() substitutes only the FIRST space in each
  # label. If labels can contain several spaces, str_replace_all() may be
  # what is intended -- verify before changing, since the label may be used
  # as a key elsewhere.
  mutate(evaluative_label = str_replace(evaluative_label, " ", "_"))

Merge everything together

# Word-frequency difference table, merged into full_df below.
freq_data <- read_csv(FREQ_PATH)

# Number of UK rows per domain, ordered from the smallest count upwards.
uk_counts <- exp_filtered_countries %>%
  filter(residence == "uk") %>%
  count(domain) %>%
  arrange(n) %>%
  rename(uk_counts = n)

# Assemble the analysis table. Every join uses the natural key, i.e. whatever
# columns the two tables share. The first two are full joins so that no row
# from the language, behavioral or frequency tables is lost; the last two only
# annotate the rows already present.
full_df <- es_lang_tidy %>%
  full_join(es_iat_tidy) %>%
  full_join(freq_data) %>%
  left_join(domain_eval_set) %>%
  left_join(uk_counts)

Including all 31 domains:

# Behavioral difference against the fastText language difference. Each row of
# full_df is drawn as its domain label, with a linear fit overlaid.
ggplot(full_df, aes(x = fasttext_5_diff, y = behavioral_diff)) +
  geom_text(aes(label = domain), size = 2) +
  geom_smooth(method = "lm") +
  theme_classic()

# Linear model: behavioral difference predicted by the fastText difference
# together with the median frequency difference.
summary(
  lm(behavioral_diff ~ fasttext_5_diff + median_freq_diff, data = full_df)
)
## 
## Call:
## lm(formula = behavioral_diff ~ fasttext_5_diff + median_freq_diff, 
##     data = full_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60645 -0.11424 -0.00011  0.09439  0.39204 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      0.068861   0.063787   1.080   0.2896  
## fasttext_5_diff  0.098098   0.055849   1.756   0.0899 .
## median_freq_diff 0.001461   0.001292   1.131   0.2677  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.211 on 28 degrees of freedom
## Multiple R-squared:  0.1318, Adjusted R-squared:  0.0698 
## F-statistic: 2.126 on 2 and 28 DF,  p-value: 0.1382
# Linear model: behavioral difference predicted by the fastText difference
# alone.
summary(
  lm(behavioral_diff ~ fasttext_5_diff, data = full_df)
)
## 
## Call:
## lm(formula = behavioral_diff ~ fasttext_5_diff, data = full_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59364 -0.11602 -0.00025  0.09301  0.38803 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      0.01084    0.03809   0.285   0.7780  
## fasttext_5_diff  0.09624    0.05609   1.716   0.0969 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2121 on 29 degrees of freedom
## Multiple R-squared:  0.09216,    Adjusted R-squared:  0.06085 
## F-statistic: 2.944 on 1 and 29 DF,  p-value: 0.09687