d_raw <- gs_title("conceptviz_1_data") %>%
gs_read()
d <- map_df(d_raw$data, fromJSON, simplifyDataFrame = TRUE) %>%
bind_cols(subj_id = d_raw$subj_id) %>%
select(subj_id, everything())
d_tidy <- d %>%
gather(variable, value, -subj_id) %>%
separate(variable, c("variable", "trial_num"), sep = "_T") %>%
spread(variable, value) %>%
mutate_at(c("trial_num", "haus_sim", "rating", "RT", "trial_ID", "haus_bin"), as.numeric) %>%
mutate_at(c("category", "drawing_key_id_1", "drawing_key_id_2","trial_type"), as.factor)
d_tidy %>%
filter(trial_type == "attention_check") %>%
select(subj_id, rating) %>%
count(rating) %>%
ggplot(aes(x = rating, y = n/200)) +
geom_bar(stat = "identity") +
ylab("proportion responses")+
ggtitle("prop. attention checks") +
theme_minimal()
mean_attention_check_ratings = d_tidy %>%
filter(trial_type == "attention_check") %>%
group_by(subj_id) %>%
summarize(mean = mean(rating)) %>%
filter(mean < mean(.$mean))
Mostly 1s as correct.
d_tidy_crit <- d_tidy %>%
filter(subj_id %in% mean_attention_check_ratings$subj_id) %>%
filter(trial_type == "critical_trial")
means_ratings_by_bin <- d_tidy_crit %>%
group_by(haus_bin, category) %>%
multi_boot_standard(column = "rating", na.rm = TRUE)
ggplot(means_ratings_by_bin, aes(x = haus_bin, y = mean, group = category, color = category)) +
geom_pointrange(aes(ymin = summary_ci_lower, ymax = summary_ci_upper)) +
geom_smooth(method = "lm") +
ylab("mean similarity rating") +
xlab("hausdorfff Distance Decile") +
ggtitle("Similarity Ratings by hausdorff Decile") +
scale_x_continuous(breaks = 1:10) +
theme_minimal()
d_tidy_crit %>%
ggplot(aes(x = haus_sim, y = rating, group = category, color = category)) +
geom_smooth(method = "lm") +
ylab("mean similarity rating") +
xlab("hausdorfff Distance") +
ggtitle("Similarity Ratings by hausdorff Distance") +
theme_minimal()
d_tidy_crit %>%
group_by(category) %>%
do(tidy(cor.test(.$rating, .$haus_sim))) %>%
select(-parameter, -method, -alternative) %>%
kable()
| category | estimate | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|
| bread | 0.1941247 | 8.506932 | 0 | 0.1498770 | 0.2375963 |
| tree | 0.1469115 | 6.381305 | 0 | 0.1019964 | 0.1912287 |
There’s more variability in hausdorfff distance for bread, relative to tree, which explains the different slopes in the decile plot.
tree_all_measures <- read_csv(paste0("../../../data/hausdorff_similarities/pair_sim_drawings/pair_lists/balanced_lists/tree_all_measures.csv")) %>%
mutate_at(c("key_id_1", "key_id_2", "country_code_1","country_code_2"), as.factor) %>%
select(key_id_1, key_id_2, hd_sim, mhd_sim, cosine) %>%
mutate(category = "tree")
bread_all_measures <- read_csv(paste0("../../../data/hausdorff_similarities/pair_sim_drawings/pair_lists/balanced_lists/bread_all_measures.csv")) %>%
mutate_at(c("key_id_1", "key_id_2", "country_code_1","country_code_2"), as.factor) %>%
select(key_id_1, key_id_2, hd_sim, mhd_sim, cosine) %>%
mutate(category = "bread")
all_measures <- d_tidy_crit %>%
rename(key_id_1 = drawing_key_id_1,
key_id_2 = drawing_key_id_2) %>%
left_join(bind_rows(tree_all_measures, bread_all_measures), by = c("key_id_1", "key_id_2", "category")) %>%
select(category, rating, haus_sim, mhd_sim, cosine) %>%
rename(hd_sim = haus_sim)
all_measures %>%
gather("measure", "value", 3:5) %>%
ggplot(aes(x = value, y = rating, group = category, color = category)) +
geom_smooth(method = "lm") +
facet_grid(. ~ measure, scales = "free") +
theme_minimal()
all_measures %>%
gather("measure", "value", 3:5) %>%
group_by(category, measure) %>%
do(tidy(cor.test(.$rating, .$value))) %>%
select(-parameter, -method, -alternative) %>%
kable()
| category | measure | estimate | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| bread | cosine | -0.1465610 | -6.369195 | 0e+00 | -0.1908598 | -0.1016664 |
| bread | hd_sim | 0.1941247 | 8.506932 | 0e+00 | 0.1498770 | 0.2375963 |
| bread | mhd_sim | 0.1636560 | 7.131454 | 0e+00 | 0.1189697 | 0.2076806 |
| tree | cosine | -0.2067347 | -9.078501 | 0e+00 | -0.2499766 | -0.1626699 |
| tree | hd_sim | 0.1469115 | 6.381305 | 0e+00 | 0.1019964 | 0.1912287 |
| tree | mhd_sim | 0.1140205 | 4.931059 | 9e-07 | 0.0687798 | 0.1587932 |
Hausdorff Distance is better than Modified hausdorff Distance (MHD). For bread, MHD > cosine; For tree; MHD = cosine (but there’s less variability).
The correlations on the continous measures are modest, but I suspect that’s because there’s a ceiling on the correlation because we’re using a likert scale.