Summary of drawing analyses

Summary stats
Main analyses
- By language status
- By language status and concreteness
By-item
By-country
- number of strokes
- mean stroke length
By-country concreteness differences
- number of strokes
- mean stroke length

Read in cue and drawing data

# Cue list: keep only each cue word and its concreteness bin.
cues <- select(read_csv("data/extreme_cues.csv"), cue, conc.bin)

# Read every summary CSV under data/summaries/ and stack them into one
# data frame.
# `pattern` is a regular expression, not a shell glob, so match the literal
# ".csv" extension at the end of the file name; `full.names = TRUE` returns
# paths that can be passed straight to read_csv().
d <- list.files("data/summaries", pattern = "\\.csv$", full.names = TRUE) %>%
  purrr::map(read_csv) %>%
  bind_rows() %>%
  select(-6, -7) # drop the 6th and 7th columns (selected by position)

# Countries whose rows are labelled "english" when language status is coded.
english_countries <- c(
  "Australia", "Canada", "United Kingdom", "United States"
)

# Attach each word's concreteness bin from the cue list, then code language
# status from the country and store the concreteness bin as a factor.
d <- d %>%
  left_join(cues, by = c("word" = "cue")) %>%
  mutate(
    lang = ifelse(country %in% english_countries, "english", "non_english"),
    conc.bin = as.factor(conc.bin)
  )

Summary stats

Number of participants per country

# Sum the per-row count `n` within each country and print the totals,
# largest first.
d %>%
  group_by(country) %>%
  summarize(`total drawings` = sum(n)) %>%
  arrange(desc(`total drawings`)) %>%
  kable()

country	total drawings
United States	7167069
United Kingdom	1175243
Canada	590448
Germany	505734
Russia	390447
Australia	378977
Brazil	284915
Sweden	280228
Finland	277797
Czech Republic	256153
Italy	249158
Poland	208624
France	205840
Thailand	194595
Korea, South	194400
Philippines	188080
Saudi Arabia	173614
Hungary	127225
Netherlands	101929
Romania	43095
Indonesia	41776
Ukraine	31615
Japan	30288
Slovakia	29402
India	28749
Vietnam	24747
United Arab Emirates	14564
Turkey	12426
Bulgaria	9021
China, Republic of (Taiwan)	8183
Croatia	6665
Malaysia	4818
Ireland	4592
Serbia	3225
Norway	3157
New Zealand	3111

Number of cues per country

The missing items are because we only include an item for a country if there were at least 1500 participants.

# Item-count threshold used below to decide which countries are kept.
MIN_NUM_ITEMS <- 40

#  Missing 3 items (to do): "bird"   "rhinoceros" "tiger" 
# Count the rows of `d` per country, sorted from most to fewest. The count
# column is referred to as `nn` (presumably because `d` already has a
# column named `n`).
item.counts <- arrange(count(ungroup(d), country), desc(nn))

kable(item.counts)

country	nn
Australia	106
Canada	106
Germany	106
Russia	106
United Kingdom	106
United States	106
Brazil	105
Finland	105
Sweden	105
Czech Republic	102
Italy	102
Poland	99
Philippines	92
France	90
Thailand	88
Korea, South	87
Saudi Arabia	82
Hungary	58
Netherlands	49
Indonesia	18
Romania	17
Ukraine	17
India	15
Japan	15
Slovakia	15
Vietnam	13
United Arab Emirates	8
Turkey	7
Bulgaria	5
China, Republic of (Taiwan)	5
Croatia	4
Ireland	3
Malaysia	3
New Zealand	2
Norway	2
Serbia	2

# Keep only countries with at least MIN_NUM_ITEMS items. The bound is
# inclusive so that a country with exactly MIN_NUM_ITEMS items is retained,
# matching the "at least" wording used in the report text.
big.countries <- item.counts %>%
  filter(nn >= MIN_NUM_ITEMS)

# Restrict the main data frame to those countries.
d <- d %>%
  filter(country %in% big.countries$country)

The analyses below only include countries with at least 40 items.

Histograms of DVs

Take the log of mean lengths. These histograms are at the level of language status x item.

# Add the log of the mean stroke length as a new column.
d <- mutate(d, log_mean_lengths = log(mean_lengths))

# Stack columns 5 and 8 (selected by position) into measure/value pairs and
# draw one histogram per measure x language-status panel.
d %>%
  gather(measure, value, c(5, 8)) %>%
  ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(measure ~ lang, scales = "free") +
    theme_bw()

Main analyses

DVs: mean stroke length and number of strokes

By language status

# Boxplots of each measure (columns 5 and 8, selected by position) split by
# language status; the legend is hidden because the x axis already shows it.
d %>%
  gather(measure, value, c(5, 8)) %>%
  ggplot(aes(x = lang, y = value, fill = lang)) +
    geom_boxplot() +
    facet_wrap(~measure, scales = "free") +
    theme_bw() +
    theme(legend.position = "none")

No differences here.

By language status and concreteness

# Long-format data: one row per measure/value pair, with missing values
# removed (missing length data for "circle" for some reason).
d.f <- filter(gather(d, measure, value, c(5, 8)), !is.na(value))

#d.f %>%
#  count(country, lang, measure) %>%
#  filter(measure %in% c('log_mean_lengths',  "n_strokes")) %>%
#  as.data.frame()

# Bootstrapped summary of `value` for every measure x concreteness bin x
# language-status cell, restricted to the two measures of interest.
summaries <- d.f %>%
  group_by(measure, conc.bin, lang) %>%
  multi_boot_standard(column = "value", na.rm = TRUE) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes"))

# Cell means with their interval bounds, dodged by concreteness bin, one
# panel per measure.
ggplot(
  summaries,
  aes(x = lang, y = mean, fill = conc.bin, color = conc.bin, group = conc.bin)
) +
  #geom_bar(position = "dodge", stat = "identity") +
  geom_pointrange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(.9),
    size = .4
  ) +
  facet_wrap(~measure, scales = "free") +
  theme_bw()

Two effects here: (1) English speakers have longer strokes overall and (2) abstract words have longer strokes for both English and non-English speakers.

By-item

Mean log length by item.

# Mean of the log stroke length for each word within its concreteness bin.
item.lengths <- d.f %>%
  filter(measure == "log_mean_lengths") %>%
  group_by(conc.bin, word) %>%
  summarize(mean = mean(value))

# Bar per word, ordered by that mean and filled by concreteness bin; the
# word labels are rotated so they stay legible.
ggplot(
  item.lengths,
  aes(x = reorder(word, mean), y = mean, fill = conc.bin)
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "word", y = "mean stroke length") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

By-country

number of strokes

Note that these color scales are centered on the median across countries (red is above the median and green is below).

#geo_codes = geocode(unique(d$country)) %>%
#            cbind(unique(d$country))

#write_csv(geo_codes, "geo_codes.csv")
# Country coordinates are read from a saved file rather than re-queried.
geo_codes <- read_csv("data/geo_codes.csv")

# Mean of each measure per country, joined to that country's coordinates.
country.summaries <- d.f %>%
  group_by(measure, country) %>%
  summarize(mean = mean(value, na.rm = TRUE)) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes")) %>%
  left_join(geo_codes, by = c("country" = "countries"))

strokes.d <- filter(country.summaries, measure == "n_strokes")

# World map with one point per country, coloured by its mean number of
# strokes; the diverging scale is centred on the median across countries.
ggplot(strokes.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  geom_point(aes(x = lon, y = lat, color = mean), size = 3) +
  scale_color_gradient2(
    midpoint = median(strokes.d$mean, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  ggtitle("n_strokes") +
  mapTheme

mean stroke length

# Per-country means of the log stroke length only.
lengths.d <- filter(country.summaries, measure == "log_mean_lengths")

# Same map layout as for n_strokes: one point per country, with the
# diverging colour scale centred on the median across countries.
ggplot(lengths.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  geom_point(aes(x = lon, y = lat, color = mean), size = 3) +
  scale_color_gradient2(
    midpoint = median(lengths.d$mean, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  ggtitle("mean_lengths") +
  mapTheme

By-country concreteness differences

What countries show the biggest difference between abstract and concrete cues?

number of strokes

# Per-country mean of each measure within each concreteness bin, spread so
# the bins ("1" and "6") become columns, then reduced to a normalized
# difference between the two bins.
country.summaries.conc <- d.f %>%
  group_by(measure, country, conc.bin) %>%
  summarize(mean = mean(value, na.rm = TRUE)) %>%
  filter(measure %in% c("log_mean_lengths", "n_strokes")) %>%
  left_join(geo_codes, by = c("country" = "countries")) %>%
  spread(conc.bin, mean) %>%
  # Normalized difference: the numerator must be parenthesized, otherwise
  # `/` binds tighter than `-` and this computes `1` - (`6` / (`1` + `6`)).
  mutate(dif = (`1` - `6`) / (`1` + `6`)) %>%
  # Drop countries that lack a mean for either bin.
  filter(!is.na(dif))

n_strokes_dif.d <- country.summaries.conc %>%
  filter(measure == "n_strokes")

# World map with one point per country, coloured by the normalized
# difference in number of strokes; the diverging scale is centred on the
# median difference across countries.
ggplot(n_strokes_dif.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  ggtitle("difference in n_strokes between abstract and concrete") +
  geom_point(aes(x = lon, y = lat, color = dif), size = 3) +
  scale_color_gradient2(
    midpoint = median(n_strokes_dif.d$dif, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  mapTheme

mean stroke length

# Per-country concreteness differences for the log stroke length only.
mean_lengths_dif.d <- filter(
  country.summaries.conc,
  measure == "log_mean_lengths"
)

# One point per country coloured by `dif`, with the diverging scale centred
# on the median difference across countries.
ggplot(mean_lengths_dif.d) +
  borders("world", colour = "gray50", fill = "gray50") +
  geom_point(aes(x = lon, y = lat, color = dif), size = 3) +
  scale_color_gradient2(
    midpoint = median(mean_lengths_dif.d$dif, na.rm = TRUE),
    low = "green", mid = "white", high = "red"
  ) +
  ggtitle("difference in mean lengths between abstract and concrete") +
  mapTheme

Summary of drawing analyses

Molly Lewis

2017-06-02

Summary stats

Number of participants per country

Number of cues per country

Histograms of DVs

Main analyses

By language status

By language status and concreteness

By-item

By-country

number of strokes

mean stroke length

By-country concreteness differences

number of strokes

mean stroke length