# Input file locations, all resolved with here().
# IAT data; read into raw_exp below.
IAT_PATH <- here("data/study1c/AIID_subset_exploratory.csv")
# Effect sizes per model_source; read into es_lang_raw below.
ES_PATH <- here("data/study1c/bnc_vs_coca_es.csv")
# Word-frequency differences; read into freq_data below.
FREQ_PATH <- here("data/study1c/iat_word_freq_difference.csv")
# Category stimuli sheet; read into stim below.
STIM_PATH <- here("analyses/study1c/iat_stuff/category_stimuli.csv")
es_lang_raw <- read_csv(ES_PATH)

# Give every model_source its own effect_size column, then compute the
# BNC-minus-COCA difference for each embedding model.
es_lang_tidy <- es_lang_raw %>%
  select(-test) %>%
  spread(key = model_source, value = effect_size) %>%
  mutate(
    fasttext_5_diff =
      `trained_bnc_fasttext_5.csv` - `trained_coca_fasttext_5.csv`,
    fasttext_10_diff =
      `trained_bnc_fasttext_10.csv` - `trained_coca_fasttext_10.csv`,
    w2v_5_diff =
      `trained_bnc_w2v_5.csv` - `trained_coca_w2v_5.csv`
  ) %>%
  # keep bias_type (renamed to domain) plus every column containing "diff"
  select(domain = bias_type, contains("diff"))
raw_exp <- read_csv(IAT_PATH)

# Keep the columns we care about and drop NAs.
# Columns 1 and 5 are selected by position; the remaining ones by name.
exp_filtered <- raw_exp %>%
  select(
    1, 5, D, residence, sex, age, block_order, domain,
    education, exclude_iat
  ) %>%
  # align the capitalisation of this one domain label
  mutate(
    domain = if_else(
      domain == "Determinism - Free will",
      "Determinism - Free Will",
      domain
    )
  ) %>%
  mutate_if(is.character, as.factor) %>%
  # drop rows flagged by exclude_iat
  filter(!exclude_iat) %>%
  # keep only domains that also appear in es_lang_tidy
  filter(domain %in% es_lang_tidy$domain) %>%
  drop_na()

# Keep only rows whose residence is "us" or "uk"; the exclude_iat flag has
# already been applied upstream, so the column is dropped here.
exp_filtered_countries <- select(
  filter(exp_filtered, residence == "us" | residence == "uk"),
  -exclude_iat
)

# Regress D on the covariates and attach the model residuals to the data
# (read back as the `resid` column in the next step).
# NOTE(review): task_order is not selected by name upstream; presumably it is
# one of the columns picked by position (1 or 5) - confirm.
resid_es <- add_residuals(
  exp_filtered_countries,
  lm(
    D ~ task_order + sex + age + block_order + education,
    data = exp_filtered_countries
  )
)

# Mean residual per residence x domain, reshaped to one row per domain with a
# column per residence, then the UK-minus-US difference.
es_iat_tidy <- resid_es %>%
  group_by(residence, domain) %>%
  summarise(resid = mean(resid)) %>%
  spread(key = residence, value = resid) %>%
  mutate(behavioral_diff = uk - us)
# Get category-attribute pairs: read the stimuli sheet, discard its first
# data row, and give the 14 columns explicit names.
stim <- setNames(
  read_csv(STIM_PATH)[-1, ],
  c(
    "domain", "cat_id", "cat_label",
    paste0("S", 1:9),
    "evaluative_label", "notes"
  )
)
# Every non-missing domain label, each repeated twice in place.
rep_domain <- rep(stim$domain[!is.na(stim$domain)], each = 2)

# Pair each evaluative label with its domain.
# The leading rows of stim are matched one-to-one with rep_domain, so the
# number of rows kept is taken from rep_domain itself rather than a
# hard-coded count that has to be kept in sync with the stimuli file.
domain_eval_set <- stim %>%
  slice(seq_along(rep_domain)) %>%
  select(evaluative_label) %>%
  mutate(domain = rep_domain) %>%
  # rows without an evaluative label carry no pairing information
  filter(!is.na(evaluative_label)) %>%
  # only the first space in each label is replaced
  mutate(evaluative_label = str_replace(evaluative_label, " ", "_"))

Merge everything together

freq_data <- read_csv(FREQ_PATH)

# Number of UK rows per domain, sorted from smallest to largest.
uk_counts <- exp_filtered_countries %>%
  count(residence, domain) %>%
  filter(residence == "uk") %>%
  arrange(n) %>%
  select(domain, uk_counts = n)

# Combine language effect sizes, behavioral differences and frequency data
# (full joins), then attach evaluative labels and UK counts (left joins).
# All joins use the columns the two tables have in common.
full_df <- full_join(es_lang_tidy, es_iat_tidy) %>%
  full_join(freq_data) %>%
  left_join(domain_eval_set) %>%
  left_join(uk_counts)

INCLUDING ALL 21 DOMAINS:

# Scatter of behavioral vs. language difference, one text label per domain,
# with a linear fit overlaid.
ggplot(full_df, aes(x = fasttext_10_diff, y = behavioral_diff)) +
  geom_text(aes(label = domain), size = 2) +
  geom_smooth(method = "lm") +
  theme_classic()

# lm: behavioral difference on language difference and median frequency
# difference
summary(
  lm(behavioral_diff ~ fasttext_10_diff + median_freq_diff, data = full_df)
)
## 
## Call:
## lm(formula = behavioral_diff ~ fasttext_10_diff + median_freq_diff, 
##     data = full_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28042 -0.13514  0.01845  0.14360  0.33178 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)       0.13449    0.09725   1.383    0.184
## fasttext_10_diff  0.04193    0.04415   0.950    0.355
## median_freq_diff  0.18974    0.17632   1.076    0.296
## 
## Residual standard error: 0.1983 on 18 degrees of freedom
## Multiple R-squared:  0.0845, Adjusted R-squared:  -0.01722 
## F-statistic: 0.8307 on 2 and 18 DF,  p-value: 0.4518
# Domains left out of the second plot and model.
TARG_DOMAINS <- c(
  "Lawyers - Politicians",
  "Organized Labor - Management",
  "Avoiding - Approaching",
  "Determinism - Free Will"
)

EXCLUDING: Lawyers - Politicians, Organized Labor - Management, Avoiding - Approaching, Determinism - Free Will

# Same scatter as before, restricted to domains not listed in TARG_DOMAINS.
ggplot(
  filter(full_df, !(domain %in% TARG_DOMAINS)),
  aes(x = fasttext_10_diff, y = behavioral_diff)
) +
  geom_text(aes(label = domain), size = 2) +
  geom_smooth(method = "lm") +
  theme_classic()

# lm on the restricted set; the data expression is written inline so the
# printed Call shows the filter that was applied
summary(
  lm(
    behavioral_diff ~ fasttext_10_diff + median_freq_diff,
    data = full_df %>% filter(!(domain %in% TARG_DOMAINS))
  )
)
## 
## Call:
## lm(formula = behavioral_diff ~ fasttext_10_diff + median_freq_diff, 
##     data = full_df %>% filter(!(domain %in% TARG_DOMAINS)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31969 -0.16099  0.01536  0.15635  0.30669 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)       0.14689    0.13525   1.086    0.296
## fasttext_10_diff  0.03345    0.04899   0.683    0.506
## median_freq_diff  0.15742    0.25776   0.611    0.551
## 
## Residual standard error: 0.2097 on 14 degrees of freedom
## Multiple R-squared:  0.04451,    Adjusted R-squared:  -0.09199 
## F-statistic: 0.3261 on 2 and 14 DF,  p-value: 0.7271