Read in cue and drawing data

# Load the cue list, keeping only the cue word and its concreteness bin.
cues <- select(read_csv("data/extreme_cues.csv"), cue, conc.bin)

# Read every summary CSV under data/summaries/ and stack them into a single
# data frame.
# `pattern` is a regular expression, not a shell glob: the original "*.csv"
# would also match names such as "notes.csv.bak" or "xcsv", so anchor on the
# literal ".csv" extension instead.
d <- list.files("data/summaries/", pattern = "\\.csv$") %>%
  paste0("data/summaries/", .) %>%
  purrr::map(read_csv) %>%
  bind_rows() %>%
  # Drops the 6th and 7th columns by position.
  # NOTE(review): confirm which columns these are; selecting by name would be
  # more robust if the summary file layout ever changes.
  select(-6, -7)

# Countries whose rows are labelled "english" in the language-status variable.
english_countries <- c(
  "Australia", "Canada", "United Kingdom", "United States"
)

# Attach each word's concreteness bin, label every row's country as
# English-speaking or not, and store the concreteness bin as a factor.
d <- d %>%
  left_join(cues, by = c("word" = "cue")) %>%
  mutate(
    lang = ifelse(country %in% english_countries, "english", "non_english"),
    conc.bin = as.factor(conc.bin)
  )

Summary stats

Number of drawings per country

# Sum the `n` column within each country, sort countries from most to fewest
# total drawings, and print the result as a table.
d %>%
  group_by(country) %>%
  summarize(`total drawings` = sum(n)) %>%
  arrange(desc(`total drawings`)) %>%
  kable()
country total drawings
United States 7167069
United Kingdom 1175243
Canada 590448
Germany 505734
Russia 390447
Australia 378977
Brazil 284915
Sweden 280228
Finland 277797
Czech Republic 256153
Italy 249158
Poland 208624
France 205840
Thailand 194595
Korea, South 194400
Philippines 188080
Saudi Arabia 173614
Hungary 127225
Netherlands 101929
Romania 43095
Indonesia 41776
Ukraine 31615
Japan 30288
Slovakia 29402
India 28749
Vietnam 24747
United Arab Emirates 14564
Turkey 12426
Bulgaria 9021
China, Republic of (Taiwan) 8183
Croatia 6665
Malaysia 4818
Ireland 4592
Serbia 3225
Norway 3157
New Zealand 3111

Number of cues per country

Some countries are missing items because we only include an item for a country if it had at least 1500 participants.

# Item-count threshold used when selecting countries for the analyses.
MIN_NUM_ITEMS <- 40

# TODO: 3 items are still missing: "bird", "rhinoceros", "tiger"
# Count the rows of `d` per country (presumably one row per country x item),
# sorted from most to fewest. The count column comes out as `nn` because `d`
# already contains a column called `n`.
item.counts <- d %>%
  ungroup() %>%
  count(country) %>%
  arrange(desc(nn))

kable(item.counts)
country nn
Australia 106
Canada 106
Germany 106
Russia 106
United Kingdom 106
United States 106
Brazil 105
Finland 105
Sweden 105
Czech Republic 102
Italy 102
Poland 99
Philippines 92
France 90
Thailand 88
Korea, South 87
Saudi Arabia 82
Hungary 58
Netherlands 49
Indonesia 18
Romania 17
Ukraine 17
India 15
Japan 15
Slovakia 15
Vietnam 13
United Arab Emirates 8
Turkey 7
Bulgaria 5
China, Republic of (Taiwan) 5
Croatia 4
Ireland 3
Malaysia 3
New Zealand 2
Norway 2
Serbia 2
# Countries with at least MIN_NUM_ITEMS items. The comparison is inclusive
# (`>=`) so that it matches both the constant's name and the stated rule that
# analyses include countries with at least that many items; the original strict
# `>` would have dropped a country with exactly MIN_NUM_ITEMS items.
big.countries <- item.counts %>%
  filter(nn >= MIN_NUM_ITEMS)

# Restrict the data to those countries for everything that follows.
d <- d %>%
  filter(country %in% big.countries$country)

The analyses below only include countries with at least 40 items.

Histograms of DVs

Take the log of mean lengths. These histograms are at the level of language status x item.

# Add the natural log of `mean_lengths` as a new column.
d <- mutate(d, log_mean_lengths = log(mean_lengths))

# Reshape two measure columns to long format and draw one histogram panel per
# measure x language status, each with its own axis scales.
# NOTE(review): columns 5 and 8 are selected by position - presumably
# n_strokes and log_mean_lengths; confirm against the column order of `d`.
d %>%
  gather(measure, value, c(5, 8)) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_wrap(measure ~ lang, scales = "free") +
  theme_bw()

Main analyses

DVs: mean length and number of strokes

By language status

# Boxplots of each measure by language status, one free-scaled panel per
# measure. The legend is hidden because the x axis already shows `lang`.
# NOTE(review): columns 5 and 8 are selected by position - presumably
# n_strokes and log_mean_lengths; confirm against the column order of `d`.
d %>%
  gather(measure, value, c(5, 8)) %>%
  ggplot(aes(x = lang, y = value, fill = lang)) +
  geom_boxplot() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")

No differences here.

By language status and concreteness

# Long-format data: one row per original row and measure, with missing values
# removed (length data is missing for "circle" for some reason).
# NOTE(review): columns 5 and 8 are selected by position - presumably
# n_strokes and log_mean_lengths; confirm against the column order of `d`.
d.f <- filter(
  gather(d, measure, value, c(5, 8)),
  !is.na(value)
)

# d.f %>%
#   count(country, lang, measure) %>%
#   filter(measure %in% c('log_mean_lengths',  "n_strokes")) %>%
#   as.data.frame()

# Per measure x concreteness bin x language status summary of `value`.
# multi_boot_standard() supplies the `mean`, `ci_lower` and `ci_upper` columns
# used in the plot below (presumably bootstrapped intervals - confirm).
summaries <- d.f %>%
  group_by(measure, conc.bin, lang) %>%
  multi_boot_standard(column = "value", na.rm = TRUE) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes"))

# Means with interval bars, dodged by concreteness bin, one panel per measure.
# (A bar version was tried previously:
#  geom_bar(position = "dodge", stat = "identity").)
ggplot(
  summaries,
  aes(
    x = lang, y = mean,
    fill = conc.bin, color = conc.bin, group = conc.bin
  )
) +
  geom_pointrange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(.9),
    size = .4
  ) +
  facet_wrap(~measure, scales = "free") +
  theme_bw()

Two effects here: (1) English speakers have longer strokes overall and (2) abstract words have longer strokes for both English and non-English speakers.

By-item

Mean log length by item.

# Mean of the log mean-length values for each word, keeping the word's
# concreteness bin alongside it.
item.lengths <- d.f %>%
  filter(measure == "log_mean_lengths") %>%
  group_by(conc.bin, word) %>%
  summarize(mean = mean(value))

# Bar per word, ordered by increasing mean and filled by concreteness bin.
# Word labels are rotated so that they stay legible.
ggplot(
  item.lengths,
  aes(x = reorder(word, mean), y = mean, fill = conc.bin)
) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("word") +
  ylab("mean stroke length") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

By-country

number of strokes

Note that these color scales are centered on the median across countries (red is above the median and green is below).

# The coordinates were originally produced with the calls below and cached to
# disk; the cached file is read instead of geocoding again.
# geo_codes = geocode(unique(d$country)) %>%
#             cbind(unique(d$country))
# write_csv(geo_codes, "geo_codes.csv")
geo_codes <- read_csv("data/geo_codes.csv")

# Mean of each measure per country, joined to that country's coordinates
# (`lon` / `lat`, matched on the `countries` column of geo_codes).
country.summaries <- d.f %>%
  group_by(measure, country) %>%
  summarize(mean = mean(value, na.rm = TRUE)) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes")) %>%
  left_join(geo_codes, by = c("country" = "countries"))

strokes.d <- filter(country.summaries, measure == "n_strokes")

# World map with one point per country, colored by its mean number of strokes;
# the diverging color scale is centered on the median of the country means.
ggplot(strokes.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  geom_point(aes(x = lon, y = lat, color = mean), size = 3) +
  scale_color_gradient2(
    low = "green", mid = "white", high = "red",
    midpoint = median(strokes.d$mean, na.rm = TRUE)
  ) +
  ggtitle("n_strokes") +
  mapTheme

mean stroke length

# Per-country means of the log mean-length measure.
lengths.d <- filter(country.summaries, measure == "log_mean_lengths")

# World map with one point per country, colored by that mean; the diverging
# color scale is centered on the median of the country means.
ggplot(lengths.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  geom_point(aes(x = lon, y = lat, color = mean), size = 3) +
  scale_color_gradient2(
    low = "green", mid = "white", high = "red",
    midpoint = median(lengths.d$mean, na.rm = TRUE)
  ) +
  ggtitle("mean_lengths") +
  mapTheme

By-country concreteness differences

What countries show the biggest difference between abstract and concrete cues?

number of strokes

# Per-country contrast between the two concreteness bins for each measure.
# After spread(), the bin means sit in columns `1` and `6`
# (NOTE(review): presumably 1 = abstract and 6 = concrete - confirm against
# data/extreme_cues.csv).
country.summaries.conc <- d.f %>%
  group_by(measure, country, conc.bin) %>%
  summarize(mean = mean(value, na.rm = TRUE)) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes")) %>%
  left_join(geo_codes, by = c("country" = "countries")) %>%
  spread(conc.bin, mean) %>%
  # Normalized difference: (a - b) / (a + b). The numerator must be
  # parenthesized; without the parentheses, division binds tighter than
  # subtraction and the expression evaluates to a - b / (a + b).
  mutate(dif = (`1` - `6`) / (`1` + `6`)) %>%
  # Drop countries that lack a mean for either bin.
  filter(!is.na(dif))

n_strokes_dif.d <- country.summaries.conc %>%
  filter(measure == "n_strokes")

# World map with one point per country, colored by the normalized difference
# in number of strokes; the color scale is centered on the median difference.
ggplot(n_strokes_dif.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  ggtitle("difference in n_strokes between abstract and concrete") +
  geom_point(aes(x = lon, y = lat, color = dif), size = 3) +
  scale_color_gradient2(
    midpoint = median(n_strokes_dif.d$dif, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  mapTheme

mean stroke length

# Per-country concreteness contrast (`dif`) for the log mean-length measure.
mean_lengths_dif.d <- filter(
  country.summaries.conc,
  measure == "log_mean_lengths"
)

# World map with one point per country, colored by `dif`; the diverging color
# scale is centered on the median `dif` across countries.
ggplot(mean_lengths_dif.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  geom_point(aes(x = lon, y = lat, color = dif), size = 3) +
  scale_color_gradient2(
    low = "green", mid = "white", high = "red",
    midpoint = median(mean_lengths_dif.d$dif, na.rm = TRUE)
  ) +
  ggtitle("difference in mean lengths between abstract and concrete") +
  mapTheme