# Input file locations, built with here().
# IAT data set (loaded below as raw_exp).
IAT_PATH <- here("data/study1c/AIID_subset_exploratory.csv")
# Language-model effect sizes (loaded below as es_lang_raw).
ES_PATH <- here("data/study1c/bnc_vs_coca_es_5.csv")
# Word-frequency data (loaded below as freq_data).
FREQ_PATH <- here("data/study1c/iat_word_freq_difference_5.csv")
# Category stimuli table (loaded below as stim).
STIM_PATH <- here("analyses/study1c/iat_stuff/category_stimuli.csv")
# Load the language-model effect sizes.
es_lang_raw <- read_csv(ES_PATH)

# Reshape to one column per model_source value, then compute, for each
# embedding model, the BNC-trained effect size minus the COCA-trained one.
# Only the bias type (renamed to `domain`) and the difference columns are kept.
es_lang_tidy <- es_lang_raw %>%
  select(-test) %>%
  spread(key = model_source, value = effect_size) %>%
  mutate(
    fasttext_5_diff =
      `trained_bnc_fasttext_5.csv` - `trained_coca_fasttext_5.csv`,
    w2v_5_diff = `trained_bnc_w2v_5.csv` - `trained_coca_w2v_5.csv`
  ) %>%
  select(domain = bias_type, contains("diff"))
raw_exp <- read_csv(IAT_PATH)

# Get the columns we care about and drop NAs.
# NOTE(review): columns 1 and 5 are selected by position, so this depends on
# the column order of the input file -- confirm which variables they are.
exp_filtered <- raw_exp %>%
  select(
    1, 5, D, residence, sex, age, block_order, domain,
    education, exclude_iat
  ) %>%
  # Harmonise the capitalisation of this one domain label before it is
  # matched against es_lang_tidy$domain in the filter below.
  mutate(
    domain = if_else(
      domain == "Determinism - Free will",
      "Determinism - Free Will",
      domain
    )
  ) %>%
  mutate_if(is.character, as.factor) %>%
  # Keep domains that have a language effect size and rows not flagged
  # by exclude_iat.
  filter(domain %in% es_lang_tidy$domain, !exclude_iat) %>%
  drop_na()
# Restrict to rows with residence "us" or "uk"; the exclusion flag is no
# longer needed once excluded rows have been filtered out.
exp_filtered_countries <- exp_filtered %>%
  select(-exclude_iat) %>%
  filter(residence %in% c("us", "uk"))

# Add a `resid` column: residuals of D after regressing on the covariates.
# NOTE(review): assumes exp_filtered_countries has a task_order column
# (presumably one of the positionally selected columns) -- confirm.
resid_es <- add_residuals(
  exp_filtered_countries,
  lm(
    D ~ task_order + sex + age + block_order + education,
    data = exp_filtered_countries
  )
)

# Mean residual per residence x domain, spread to one column per residence,
# and the UK-minus-US difference per domain.
es_iat_tidy <- resid_es %>%
  group_by(residence, domain) %>%
  summarize(resid = mean(resid)) %>%
  spread(key = residence, value = resid) %>%
  mutate(behavioral_diff = uk - us)
# Build the domain / evaluative-label lookup from the stimuli table.
# The first row after the header is dropped and the columns are renamed.
stim <- read_csv(STIM_PATH)[-1, ]
stim <- setNames(
  stim,
  c(
    "domain", "cat_id", "cat_label",
    "S1", "S2", "S3", "S4", "S5", "S6", "S7",
    "S8", "S9", "evaluative_label", "notes"
  )
)

# Every non-missing domain name, repeated twice in a row.
# NOTE(review): presumably each domain occupies two consecutive rows and is
# named only on one of them -- confirm against the CSV.
rep_domain <- rep(stim$domain[!is.na(stim$domain)], each = 2)

# Pair the first 190 rows' evaluative labels with their domain, replace the
# first space in each label with "_", and drop rows without a label.
# NOTE(review): 190 is hard-coded and must equal length(rep_domain).
domain_eval_set <- stim %>%
  slice(1:190) %>%
  transmute(
    evaluative_label = str_replace(evaluative_label, " ", "_"),
    domain = rep_domain
  ) %>%
  filter(!is.na(evaluative_label))
Merge everything together
freq_data <- read_csv(FREQ_PATH)

# Number of UK rows per domain, sorted ascending by count.
uk_counts <- exp_filtered_countries %>%
  filter(residence == "uk") %>%
  count(domain) %>%
  arrange(n) %>%
  rename(uk_counts = n)

# Combine language effect sizes, behavioral effect sizes and frequency data
# (keeping all rows from each), then attach the evaluative labels and UK
# sample sizes. The joins use whatever columns the tables share.
full_df <- es_lang_tidy %>%
  full_join(es_iat_tidy) %>%
  full_join(freq_data) %>%
  left_join(domain_eval_set) %>%
  left_join(uk_counts)
INCLUDING ALL 31:
# Scatter of behavioral difference against the fastText difference, with
# each point drawn as its domain name and a linear fit overlaid.
ggplot(
  data = full_df,
  mapping = aes(x = fasttext_5_diff, y = behavioral_diff)
) +
  geom_text(mapping = aes(label = domain), size = 2) +
  geom_smooth(method = "lm") +
  theme_classic()
# Linear model: behavioral difference predicted by the fastText difference,
# with median_freq_diff as an additional predictor.
summary(
  lm(behavioral_diff ~ fasttext_5_diff + median_freq_diff, full_df)
)
##
## Call:
## lm(formula = behavioral_diff ~ fasttext_5_diff + median_freq_diff,
## data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60645 -0.11424 -0.00011 0.09439 0.39204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.068861 0.063787 1.080 0.2896
## fasttext_5_diff 0.098098 0.055849 1.756 0.0899 .
## median_freq_diff 0.001461 0.001292 1.131 0.2677
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.211 on 28 degrees of freedom
## Multiple R-squared: 0.1318, Adjusted R-squared: 0.0698
## F-statistic: 2.126 on 2 and 28 DF, p-value: 0.1382
# Linear model: behavioral difference predicted by the fastText difference
# alone.
summary(
  lm(behavioral_diff ~ fasttext_5_diff, full_df)
)
##
## Call:
## lm(formula = behavioral_diff ~ fasttext_5_diff, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59364 -0.11602 -0.00025 0.09301 0.38803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.01084 0.03809 0.285 0.7780
## fasttext_5_diff 0.09624 0.05609 1.716 0.0969 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2121 on 29 degrees of freedom
## Multiple R-squared: 0.09216, Adjusted R-squared: 0.06085
## F-statistic: 2.944 on 1 and 29 DF, p-value: 0.09687