Read in cue and drawing data
# Concreteness bin for each cue word; only the two columns used below are kept.
cues <- read_csv("data/extreme_cues.csv") %>%
  select(cue, conc.bin)

# Read every summary file and stack them into a single data frame.
# `pattern` is a regular expression (not a shell glob), so it is anchored on a
# literal ".csv" suffix; `full.names = TRUE` returns paths that can be read
# directly, so no manual path pasting is needed.
d <- list.files("data/summaries", pattern = "\\.csv$", full.names = TRUE) %>%
  purrr::map(read_csv) %>%
  bind_rows() %>%
  select(-6, -7) # drops the 6th and 7th columns by position

# Countries coded as "english" in the `lang` variable below.
english_countries <- c(
  "Australia",
  "Canada",
  "United Kingdom",
  "United States"
)

# Attach the concreteness bin to each row (matching `word` to `cue`), derive
# language status from the country, and treat the bin as a factor.
d <- d %>%
  left_join(cues, by = c("word" = "cue")) %>%
  mutate(
    lang = ifelse(country %in% english_countries, "english", "non_english"),
    conc.bin = as.factor(conc.bin)
  )
Summary stats
Number of participants per country
# Sum the per-row count `n` within each country, sort from largest to
# smallest total, and render the result as a table.
d %>%
  group_by(country) %>%
  summarise(`total drawings` = sum(n)) %>%
  arrange(desc(`total drawings`)) %>%
  kable()
| United States |
7167069 |
| United Kingdom |
1175243 |
| Canada |
590448 |
| Germany |
505734 |
| Russia |
390447 |
| Australia |
378977 |
| Brazil |
284915 |
| Sweden |
280228 |
| Finland |
277797 |
| Czech Republic |
256153 |
| Italy |
249158 |
| Poland |
208624 |
| France |
205840 |
| Thailand |
194595 |
| Korea, South |
194400 |
| Philippines |
188080 |
| Saudi Arabia |
173614 |
| Hungary |
127225 |
| Netherlands |
101929 |
| Romania |
43095 |
| Indonesia |
41776 |
| Ukraine |
31615 |
| Japan |
30288 |
| Slovakia |
29402 |
| India |
28749 |
| Vietnam |
24747 |
| United Arab Emirates |
14564 |
| Turkey |
12426 |
| Bulgaria |
9021 |
| China, Republic of (Taiwan) |
8183 |
| Croatia |
6665 |
| Malaysia |
4818 |
| Ireland |
4592 |
| Serbia |
3225 |
| Norway |
3157 |
| New Zealand |
3111 |
Number of cues per country
The missing items are because we only include an item for a country if there were at least 1500 participants.
# Threshold on the number of items per country, used further down to decide
# which countries enter the analyses.
MIN_NUM_ITEMS <- 40

# Missing 3 items (to do): "bird" "rhinoceros" "tiger"
# Number of rows per country, largest first. The count column is referenced
# as `nn` here and in later code.
item.counts <- d %>%
  ungroup() %>%
  count(country) %>%
  arrange(desc(nn))

item.counts %>%
  kable()
| Australia |
106 |
| Canada |
106 |
| Germany |
106 |
| Russia |
106 |
| United Kingdom |
106 |
| United States |
106 |
| Brazil |
105 |
| Finland |
105 |
| Sweden |
105 |
| Czech Republic |
102 |
| Italy |
102 |
| Poland |
99 |
| Philippines |
92 |
| France |
90 |
| Thailand |
88 |
| Korea, South |
87 |
| Saudi Arabia |
82 |
| Hungary |
58 |
| Netherlands |
49 |
| Indonesia |
18 |
| Romania |
17 |
| Ukraine |
17 |
| India |
15 |
| Japan |
15 |
| Slovakia |
15 |
| Vietnam |
13 |
| United Arab Emirates |
8 |
| Turkey |
7 |
| Bulgaria |
5 |
| China, Republic of (Taiwan) |
5 |
| Croatia |
4 |
| Ireland |
3 |
| Malaysia |
3 |
| New Zealand |
2 |
| Norway |
2 |
| Serbia |
2 |
# Keep only countries that have at least MIN_NUM_ITEMS items. The comparison
# is inclusive (`>=`) so that a country with exactly the minimum number of
# items is retained, matching the "at least" wording of the threshold.
big.countries <- item.counts %>%
  filter(nn >= MIN_NUM_ITEMS)

# Restrict the main data frame to those countries.
d <- d %>%
  filter(country %in% big.countries$country)
The analyses below only include countries with at least 40 items.
Histograms of DVs
Take the log of mean lengths. These histograms are at the level of language status x item.
# Add the natural log of `mean_lengths` as a new column.
d <- d %>%
  mutate(log_mean_lengths = log(mean_lengths))

# Stack columns 5 and 8 (selected by position) into measure/value pairs and
# draw one histogram per measure x language-status panel, each panel with its
# own axis scales.
# NOTE(review): positions 5 and 8 are presumably n_strokes and
# log_mean_lengths -- confirm against the column order of `d`.
d %>%
  gather(measure, value, c(5, 8)) %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(measure ~ lang, scales = "free") +
  theme_bw()

Main analyses
DVs: mean length and number of strokes
By language status
# Boxplots of each measure (columns 5 and 8, selected by position) split by
# language status. Each measure gets its own panel and scales; the legend is
# hidden because the fill colour only repeats the x axis.
d %>%
  gather(measure, value, c(5, 8)) %>%
  ggplot(mapping = aes(x = lang, y = value, fill = lang)) +
  geom_boxplot() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")

No differences here.
By language status and concreteness
# Long-format data: columns 5 and 8 (selected by position) become
# measure/value pairs, and rows with a missing value are dropped
# (length data for "circle" is missing for some reason).
d.f <- d %>%
  gather(measure, value, c(5, 8)) %>%
  filter(!is.na(value))

# Disabled diagnostic: number of rows per country / language status / measure.
# d.f %>%
#   count(country, lang, measure) %>%
#   filter(measure %in% c('log_mean_lengths', "n_strokes")) %>%
#   as.data.frame()

# Summarise `value` within each measure x concreteness bin x language status
# cell. The plot below reads the `mean`, `ci_lower` and `ci_upper` columns
# from the result.
# NOTE(review): multi_boot_standard is defined outside this file; presumably
# it returns a bootstrapped mean with confidence limits -- confirm.
summaries <- d.f %>%
  group_by(measure, conc.bin, lang) %>%
  multi_boot_standard(column = "value", na.rm = TRUE) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes"))

# Point estimate with interval for each cell, with concreteness bins dodged
# side by side within each language status.
ggplot(
  summaries,
  aes(x = lang, y = mean, fill = conc.bin, color = conc.bin, group = conc.bin)
) +
  facet_wrap(~measure, scales = "free") +
  # geom_bar(position = "dodge", stat = "identity") +
  geom_pointrange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(.9),
    size = .4
  ) +
  theme_bw()

Two effects here: (1) English speakers have longer strokes overall and (2) abstract words have longer strokes for both English and non-English speakers.
By-item
Mean log length by item.
# Average log mean length for every word, keeping its concreteness bin.
item.lengths <- d.f %>%
  filter(measure == "log_mean_lengths") %>%
  group_by(conc.bin, word) %>%
  summarise(mean = mean(value))

# One bar per word, ordered by its mean and coloured by concreteness bin;
# the word labels are rotated so they stay legible.
ggplot(
  item.lengths,
  aes(x = reorder(word, mean), y = mean, fill = conc.bin)
) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "word", y = "mean stroke length")

By-country
number of strokes
Note that these color scales are centered on the median (red is above the median and green is below).
# Disabled code that looks up coordinates for each country and writes them to
# a CSV. NOTE(review): presumably this produced the file read below -- confirm.
# geo_codes = geocode(unique(d$country)) %>%
#   cbind(unique(d$country))
# write_csv(geo_codes, "geo_codes.csv")
geo_codes <- read_csv("data/geo_codes.csv")

# Mean value per measure and country for the two measures of interest, with
# the coordinates joined on (matching `country` to `countries`).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
country.summaries <- d.f %>%
  group_by(measure, country) %>%
  summarize(mean = mean(value, na.rm = TRUE)) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes")) %>%
  left_join(geo_codes, by = c("country" = "countries"))

# Country means for number of strokes only.
strokes.d <- country.summaries %>%
  filter(measure == "n_strokes")

# World map with one point per country, coloured by its mean; the colour
# scale is centered (white) on the median of the country means.
# `mapTheme` is defined outside this section.
ggplot(strokes.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  ggtitle("n_strokes") +
  geom_point(aes(x = lon, y = lat, color = mean), size = 3) +
  scale_color_gradient2(
    midpoint = median(strokes.d$mean, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  mapTheme

mean stroke length
# Country means for log mean length only.
lengths.d <- country.summaries %>%
  filter(measure == "log_mean_lengths")

# World map with one point per country, coloured by its mean; the colour
# scale is centered (white) on the median of the country means.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. `mapTheme` is defined outside this section.
ggplot(lengths.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  ggtitle("mean_lengths") +
  geom_point(aes(x = lon, y = lat, color = mean), size = 3) +
  scale_color_gradient2(
    midpoint = median(lengths.d$mean, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  mapTheme

By-country concreteness differences
What countries show the biggest difference between abstract and concrete cues?
number of strokes
# Mean value per measure, country and concreteness bin, with coordinates
# joined on, then spread so that each bin's mean is its own column (`1`, `6`).
# `dif` is the normalized difference between the two bins:
#   (`1` - `6`) / (`1` + `6`)
# The numerator is parenthesised explicitly: without the parentheses, `/`
# binds tighter than `-` and the expression evaluates to
# `1` - (`6` / (`1` + `6`)), which is not a normalized difference.
# Countries lacking a value for either bin yield NA and are dropped.
country.summaries.conc <- d.f %>%
  group_by(measure, country, conc.bin) %>%
  summarize(mean = mean(value, na.rm = TRUE)) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes")) %>%
  left_join(geo_codes, by = c("country" = "countries")) %>%
  spread(conc.bin, mean) %>%
  mutate(dif = (`1` - `6`) / (`1` + `6`)) %>% # normalize difference
  filter(!is.na(dif))

# Differences for number of strokes only.
n_strokes_dif.d <- country.summaries.conc %>%
  filter(measure == "n_strokes")

# World map with one point per country, coloured by its normalized
# difference; the colour scale is centered (white) on the median difference.
# `mapTheme` is defined outside this section.
ggplot(n_strokes_dif.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  ggtitle("difference in n_strokes between abstract and concrete") +
  geom_point(aes(x = lon, y = lat, color = dif), size = 3) +
  scale_color_gradient2(
    midpoint = median(n_strokes_dif.d$dif, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  mapTheme

mean stroke length
# Per-country concreteness differences for log mean length only.
mean_lengths_dif.d <- country.summaries.conc %>%
  filter(measure == "log_mean_lengths")

# World map with one point per country, coloured by its `dif` value; the
# colour scale is centered (white) on the median difference.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. `mapTheme` is defined outside this section.
ggplot(mean_lengths_dif.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  ggtitle("difference in mean lengths between abstract and concrete") +
  geom_point(aes(x = lon, y = lat, color = dif), size = 3) +
  scale_color_gradient2(
    midpoint = median(mean_lengths_dif.d$dif, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  mapTheme
