# Input files, located with here() and given one path component per argument.
IAT_PATH <- here("data", "study1c", "AIID_subset_exploratory.csv")
ES_PATH <- here("data", "study1c", "bnc_vs_coca_es.csv")
FREQ_PATH <- here("data", "study1c", "iat_word_freq_difference.csv")
STIM_PATH <- here("analyses", "study1c", "iat_stuff", "category_stimuli.csv")
# Language-based effect sizes: reshape to one column per model source, then
# take the BNC-minus-COCA difference for each of the three models.
es_lang_raw <- read_csv(ES_PATH)

es_lang_tidy <- es_lang_raw %>%
  select(-test) %>%
  spread(key = model_source, value = effect_size) %>%
  mutate(
    fasttext_5_diff =
      trained_bnc_fasttext_5.csv - trained_coca_fasttext_5.csv,
    fasttext_10_diff =
      trained_bnc_fasttext_10.csv - trained_coca_fasttext_10.csv,
    w2v_5_diff =
      trained_bnc_w2v_5.csv - trained_coca_w2v_5.csv
  ) %>%
  # Keep the bias type (exposed as `domain`) plus every "diff" column.
  select(domain = bias_type, contains("diff"))
raw_exp <- read_csv(IAT_PATH)

# Keep the columns we care about and drop rows with NAs.
# NOTE(review): columns 1 and 5 are picked by position; the covariate model
# fitted later uses `task_order`, which is presumably one of them -- confirm
# against the CSV header.
exp_filtered <- raw_exp %>%
  select(
    1, 5, D, residence, sex, age, block_order, domain,
    education, exclude_iat
  ) %>%
  # Harmonise the one domain label whose capitalisation differs.
  mutate(
    domain = case_when(
      domain == "Determinism - Free will" ~ "Determinism - Free Will",
      TRUE ~ domain
    )
  ) %>%
  mutate_if(is.character, as.factor) %>%
  # Only domains that also have a language effect size ...
  filter(domain %in% es_lang_tidy$domain) %>%
  # ... and only sessions not flagged for exclusion.
  filter(!exclude_iat) %>%
  drop_na()

# Restrict to US and UK residents; the exclusion flag is no longer needed.
exp_filtered_countries <- exp_filtered %>%
  select(-exclude_iat) %>%
  filter(residence %in% c("us", "uk"))
# Residualise D on the participant/task covariates; add_residuals() appends
# the residuals as a `resid` column.
resid_es <- add_residuals(
  exp_filtered_countries,
  lm(
    D ~ task_order + sex + age + block_order + education,
    data = exp_filtered_countries
  )
)

# Mean residual per residence and domain, one column per residence, and the
# UK-minus-US difference as the behavioural measure.
es_iat_tidy <- resid_es %>%
  group_by(residence, domain) %>%
  summarize(resid = mean(resid)) %>%
  spread(key = residence, value = resid) %>%
  mutate(behavioral_diff = uk - us)
# Get category-attribute pairs.
# The first data row of the stimulus sheet is dropped and the 14 columns are
# renamed by position.
stim <- read_csv(STIM_PATH)[-1, ]
names(stim) <- c(
  "domain", "cat_id", "cat_label",
  "S1", "S2", "S3", "S4", "S5", "S6", "S7",
  "S8", "S9", "evaluative_label", "notes"
)

# Every non-missing domain label, repeated twice so it can be aligned with
# two consecutive rows of `stim`.
# NOTE(review): this assumes each domain occupies exactly two adjacent rows
# of the sheet -- confirm against the CSV.
rep_domain <- stim %>%
  filter(!is.na(domain)) %>%
  pull(domain) %>%
  rep(each = 2)

# The number of rows to label is derived from `rep_domain` instead of being
# hard-coded (previously `slice(1:190)`), so the script keeps working when
# domains are added to or removed from the sheet. Fail early with a clear
# message if the sheet is shorter than the labels require.
stopifnot(nrow(stim) >= length(rep_domain))

domain_eval_set <- stim %>%
  slice(seq_along(rep_domain)) %>%
  select(evaluative_label) %>%
  mutate(domain = rep_domain) %>%
  filter(!is.na(evaluative_label)) %>%
  # Only the first space in each label is replaced with an underscore.
  mutate(evaluative_label = str_replace(evaluative_label, " ", "_"))
Merge everything together
freq_data <- read_csv(FREQ_PATH)

# Number of rows per domain among UK residents, in ascending order of count.
uk_counts <- exp_filtered_countries %>%
  filter(residence == "uk") %>%
  count(domain) %>%
  arrange(n) %>%
  rename(uk_counts = n)

# Combine the language and behavioural effect sizes with the word-frequency
# data (full joins), then attach the evaluative labels and the UK counts
# (left joins). All joins use the columns the tables have in common.
full_df <- es_lang_tidy %>%
  full_join(es_iat_tidy) %>%
  full_join(freq_data) %>%
  left_join(domain_eval_set) %>%
  left_join(uk_counts)
INCLUDING ALL 21 DOMAINS:
# Behavioural difference against the fastText (10) language difference,
# one text label per domain, with a linear fit overlaid.
ggplot(full_df, aes(x = fasttext_10_diff, y = behavioral_diff)) +
  geom_text(aes(label = domain), size = 2) +
  geom_smooth(method = "lm") +
  theme_classic()

# Linear model with the median word-frequency difference as a second predictor.
summary(
  lm(behavioral_diff ~ fasttext_10_diff + median_freq_diff, data = full_df)
)
##
## Call:
## lm(formula = behavioral_diff ~ fasttext_10_diff + median_freq_diff,
## data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28042 -0.13514 0.01845 0.14360 0.33178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13449 0.09725 1.383 0.184
## fasttext_10_diff 0.04193 0.04415 0.950 0.355
## median_freq_diff 0.18974 0.17632 1.076 0.296
##
## Residual standard error: 0.1983 on 18 degrees of freedom
## Multiple R-squared: 0.0845, Adjusted R-squared: -0.01722
## F-statistic: 0.8307 on 2 and 18 DF, p-value: 0.4518
# Domains left out of the second round of the analysis.
TARG_DOMAINS <- c(
  "Lawyers - Politicians",
  "Organized Labor - Management",
  "Avoiding - Approaching",
  "Determinism - Free Will"
)
EXCLUDING: Lawyers - Politicians, Organized Labor - Management, Avoiding - Approaching, Determinism - Free Will
# Same plot as for the full set, with the TARG_DOMAINS rows removed.
ggplot(
  filter(full_df, !(domain %in% TARG_DOMAINS)),
  aes(x = fasttext_10_diff, y = behavioral_diff)
) +
  geom_text(aes(label = domain), size = 2) +
  geom_smooth(method = "lm") +
  theme_classic()

# Same linear model, fitted without the TARG_DOMAINS rows.
summary(
  lm(
    behavioral_diff ~ fasttext_10_diff + median_freq_diff,
    data = full_df %>% filter(!(domain %in% TARG_DOMAINS))
  )
)
##
## Call:
## lm(formula = behavioral_diff ~ fasttext_10_diff + median_freq_diff,
## data = full_df %>% filter(!(domain %in% TARG_DOMAINS)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31969 -0.16099 0.01536 0.15635 0.30669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.14689 0.13525 1.086 0.296
## fasttext_10_diff 0.03345 0.04899 0.683 0.506
## median_freq_diff 0.15742 0.25776 0.611 0.551
##
## Residual standard error: 0.2097 on 14 degrees of freedom
## Multiple R-squared: 0.04451, Adjusted R-squared: -0.09199
## F-statistic: 0.3261 on 2 and 14 DF, p-value: 0.7271