Conceptviz 1 analyses

Read in data from google sheet

d_raw <- gs_title("conceptviz_1_data") %>%
            gs_read()

d <- map_df(d_raw$data, fromJSON, simplifyDataFrame = TRUE) %>%
  bind_cols(subj_id = d_raw$subj_id) %>%
  select(subj_id, everything())

Tidy data

d_tidy <- d %>%
  gather(variable, value, -subj_id) %>%
  separate(variable, c("variable", "trial_num"), sep = "_T") %>%
  spread(variable, value) %>%
  mutate_at(c("trial_num", "haus_sim", "rating", "RT", "trial_ID",  "haus_bin"), as.numeric) %>%
  mutate_at(c("category", "drawing_key_id_1", "drawing_key_id_2","trial_type"), as.factor)

Attention Checks

d_tidy %>%
  filter(trial_type == "attention_check") %>%
  select(subj_id, rating) %>%
  count(rating) %>%
  ggplot(aes(x = rating, y = n/200)) +
  geom_bar(stat = "identity") +
  ylab("proportion responses")+
  ggtitle("prop. attention checks") +
  theme_minimal()

mean_attention_check_ratings = d_tidy %>%
  filter(trial_type == "attention_check") %>%
  group_by(subj_id) %>% 
  summarize(mean = mean(rating)) %>%
  filter(mean < mean(.$mean))

Mostly 1s as correct.

hausdorff by decile and ratings

d_tidy_crit <- d_tidy %>%
  filter(subj_id %in% mean_attention_check_ratings$subj_id) %>%
  filter(trial_type == "critical_trial") 

means_ratings_by_bin <- d_tidy_crit %>%
  group_by(haus_bin, category) %>%
  multi_boot_standard(column = "rating", na.rm = TRUE)

ggplot(means_ratings_by_bin, aes(x = haus_bin, y = mean, group = category, color = category)) +
  geom_pointrange(aes(ymin = summary_ci_lower, ymax = summary_ci_upper)) +
  geom_smooth(method = "lm") +
  ylab("mean similarity rating") +
  xlab("hausdorfff Distance Decile") +
  ggtitle("Similarity Ratings by hausdorff Decile") +
  scale_x_continuous(breaks = 1:10) +
  theme_minimal()

hausdorff continous and ratings

d_tidy_crit %>%
  ggplot(aes(x = haus_sim, y = rating, group = category, color = category)) +
  geom_smooth(method = "lm") +
  ylab("mean similarity rating") +
  xlab("hausdorfff Distance") +
  ggtitle("Similarity Ratings by hausdorff Distance") +
  theme_minimal()

d_tidy_crit %>%
  group_by(category) %>%
  do(tidy(cor.test(.$rating, .$haus_sim))) %>%
  select(-parameter, -method, -alternative) %>%
  kable()

category	estimate	statistic	p.value	conf.low	conf.high
bread	0.1941247	8.506932	0	0.1498770	0.2375963
tree	0.1469115	6.381305	0	0.1019964	0.1912287

There’s more variability in hausdorfff distance for bread, relative to tree, which explains the different slopes in the decile plot.

Hausdorff distance, Modified Hausdorff Distance, and Imagenet-NN cosine

tree_all_measures <- read_csv(paste0("../../../data/hausdorff_similarities/pair_sim_drawings/pair_lists/balanced_lists/tree_all_measures.csv")) %>%
    mutate_at(c("key_id_1", "key_id_2", "country_code_1","country_code_2"), as.factor) %>%
  select(key_id_1, key_id_2, hd_sim, mhd_sim, cosine) %>%
  mutate(category = "tree")

bread_all_measures <- read_csv(paste0("../../../data/hausdorff_similarities/pair_sim_drawings/pair_lists/balanced_lists/bread_all_measures.csv")) %>%
    mutate_at(c("key_id_1", "key_id_2", "country_code_1","country_code_2"), as.factor)  %>%
  select(key_id_1, key_id_2, hd_sim, mhd_sim, cosine)  %>%
  mutate(category = "bread")

all_measures <- d_tidy_crit %>%
  rename(key_id_1 = drawing_key_id_1,
         key_id_2 = drawing_key_id_2) %>%
  left_join(bind_rows(tree_all_measures, bread_all_measures), by = c("key_id_1", "key_id_2", "category")) %>%
  select(category, rating, haus_sim, mhd_sim, cosine) %>%
  rename(hd_sim = haus_sim)

all_measures %>%
  gather("measure", "value", 3:5) %>%
  ggplot(aes(x = value, y = rating, group = category, color = category)) +
  geom_smooth(method = "lm") +
  facet_grid(. ~ measure, scales = "free") +
  theme_minimal()

all_measures %>%
  gather("measure", "value", 3:5) %>%
  group_by(category, measure) %>%
  do(tidy(cor.test(.$rating, .$value))) %>%
  select(-parameter, -method, -alternative) %>%
  kable()

category	measure	estimate	statistic	p.value	conf.low	conf.high
bread	cosine	-0.1465610	-6.369195	0e+00	-0.1908598	-0.1016664
bread	hd_sim	0.1941247	8.506932	0e+00	0.1498770	0.2375963
bread	mhd_sim	0.1636560	7.131454	0e+00	0.1189697	0.2076806
tree	cosine	-0.2067347	-9.078501	0e+00	-0.2499766	-0.1626699
tree	hd_sim	0.1469115	6.381305	0e+00	0.1019964	0.1912287
tree	mhd_sim	0.1140205	4.931059	9e-07	0.0687798	0.1587932

Hausdorff Distance is better than Modified hausdorff Distance (MHD). For bread, MHD > cosine; For tree; MHD = cosine (but there’s less variability).

The correlations on the continous measures are modest, but I suspect that’s because there’s a ceiling on the correlation because we’re using a likert scale.