The Quick Draw Dataset is a collection of 50 million drawings across 345 categories. Using the Brysbaert concretneess norms (Brysbaert, Warriner, & Kuperman, 2013), I selected the subset of drawing cues that were in the top and bottom 6-tile of concreteness (n = 109). I then download the “simplified” datasets for this set of cues from: https://console.cloud.google.com/storage/browser/quickdraw_dataset/full/simplified. The drawings have been pre-scaled so they are all 255 x 255. The simplified versions also do not have time information. I then munged jsons to be in long form and saved them as a feather file (munge_jsons.R).

Sanity check

This is what the drawings look like: (random sampling of 30 drawings for 1 cue)

ITEM <- "tree"
N_DRAWINGS <- 30

ex <- read_feather(paste0("../../data/raw_data/feathers/", ITEM, ".txt"))

ex %>%
  filter(key_id %in% sample(unique(ex$key_id), N_DRAWINGS)) %>% # sample N
  ggplot(aes(x = x, y = -y, color = country, alpha = recognized)) +
  ggtitle(ITEM)+
  facet_wrap(~key_id) +
  geom_point() +
  geom_path(aes(group = stroke_num)) +
  theme_bw()

Summary stats

# Read in summary drawing data
d <- list.files("../../data/summary_data/") %>%
  purrr::map(function(x) paste0("../../data/summary_data/", x)) %>% 
  purrr::map(read_feather) %>% 
  bind_rows() 

Number of participants per country

d %>% 
  group_by(country) %>%
  summarize(`total drawings` = sum(n)) %>%
  arrange(-`total drawings`) %>%
  kable()
country total drawings
United States 7354604
United Kingdom 1206724
Canada 606436
Germany 519593
Russia 403044
Australia 389381
Brazil 292649
Sweden 288390
Finland 285992
Czech Republic 264192
Italy 257576
Poland 214918
France 213295
Thailand 201155
Korea, South 200041
Philippines 191936
Saudi Arabia 176679
Hungary 133319
Netherlands 105578
Romania 44903
Indonesia 43701
Ukraine 31615
Slovakia 30905
Japan 30288
India 28749
Vietnam 24747
United Arab Emirates 14564
Turkey 12426
Bulgaria 9021
China, Republic of (Taiwan) 8183
Croatia 6665
Malaysia 4818
Ireland 4592
Serbia 3225
Norway 3157
New Zealand 3111

Number of cues per country

The missing items are because we only include an item for a country if there were at least 1500 participants.

MIN_NUM_ITEMS <- 40

item_counts = d %>%
  ungroup() %>%
  count(country) %>%
  arrange(-`nn`)
  
kable(item_counts)
country nn
Australia 109
Canada 109
Germany 109
Russia 109
United Kingdom 109
United States 109
Brazil 108
Finland 108
Sweden 108
Czech Republic 105
Italy 105
Poland 102
Philippines 94
France 93
Thailand 91
Korea, South 90
Saudi Arabia 84
Hungary 61
Netherlands 51
Indonesia 19
Romania 18
Ukraine 17
Slovakia 16
India 15
Japan 15
Vietnam 13
United Arab Emirates 8
Turkey 7
Bulgaria 5
China, Republic of (Taiwan) 5
Croatia 4
Ireland 3
Malaysia 3
New Zealand 2
Norway 2
Serbia 2
big.countries <- item_counts %>%
                    filter(nn > MIN_NUM_ITEMS)

d %<>% filter(country %in% big.countries$country)

The analyses below only include countries with at least 40 items.

Number of participants per cue

d %>% 
  group_by(word) %>%
  summarize(`total drawings` = sum(n)) %>%
  arrange(-`total drawings`) %>%
  kable()
word total drawings
snowman 273640
calendar 261155
marker 257343
banana 248786
hand 237823
yoga 224341
cooler 222212
knee 215582
penguin 202765
flashlight 194767
chair 181568
firetruck 178000
sink 169971
rhinoceros 152320
pig 149708
parrot 148635
coffee cup 147835
mermaid 144518
horse 142527
camouflage 142184
hospital 137223
spreadsheet 136986
van 136949
finger 136852
octagon 130872
garden 130189
helicopter 129700
dumbbell 129682
fireplace 129368
frog 128844
snorkel 127870
lightning 125547
lantern 124373
angel 121919
octopus 121836
passport 120982
roller coaster 118940
flower 117868
apple 117528
tree 116255
line 116240
tornado 116084
boomerang 115468
brain 115220
hexagon 115203
house 111259
hurricane 110601
star 109673
fan 109429
radio 109103
drill 108517
mouth 107905
carrot 107721
bird 107042
fish 107040
cookie 105203
pizza 103801
cactus 103509
microwave 103441
stairs 102538
hamburger 102491
cup 101882
camera 101218
elephant 101061
mailbox 100998
rainbow 100926
elbow 100228
compass 100183
vase 99766
giraffe 99242
ladder 98722
axe 98585
toothbrush 98500
wine bottle 98235
traffic light 98068
dragon 98007
square 97933
circle 97690
ear 97024
ice cream 96983
umbrella 96968
leaf 96930
strawberry 96884
baseball bat 96543
tiger 95664
snake 95617
triangle 95379
stereo 95189
binoculars 95084
bat 94412
palm tree 93798
eraser 93525
hot tub 93285
sea turtle 93170
cloud 93142
picture frame 93064
lighter 92794
clock 92761
pillow 92216
couch 92203
saw 91760
tractor 91330
saxophone 91129
raccoon 90901
zigzag 90827
panda 90044
basket 89670
squiggle 87545
bed 85466

By-country summaries

Below is the distribution over items within each country for a number of different measures. The measures are: stroke length, number of strokes, proportion recognized by google.

Stroke length

Joy plot

ggplot(d, aes(x = mean_lengths, y = reorder(country, mean_lengths))) + 
  geom_joy(scale = 2, aes(color = country), alpha = .3, size = 1) +
  theme_joy(font_size = 13, grid = T) + 
  theme(axis.title.y = element_blank(),
        legend.position = "none")

Map

geo_codes <- read_csv("../../data/supplementary_data/geo_codes.csv")

d %<>% left_join(geo_codes, by = c("country" = "countries"))

d %>%
  group_by(country, lat, lon) %>%
  summarize(mean_lengths = mean(mean_lengths)) %>%
  ggplot() +     
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
                 color = mean_lengths),size = 3) +
  scale_color_gradient2(midpoint = median(d$mean_lengths, na.rm = T), 
                          low = "green", mid = "white", high = "red") +
  mapTheme

Num Strokes

Joy plot

ggplot(d, aes(x = mean_n_strokes, y = reorder(country, mean_n_strokes))) + 
  geom_joy(scale = 3, aes(color = country), alpha = .3, size = 1) +
  theme_joy(font_size = 13, grid = T) + 
  theme(axis.title.y = element_blank(),
        legend.position = "none")

Map

d %>%
  group_by(country, lat, lon) %>%
  summarize(mean_n_strokes = mean(mean_n_strokes)) %>%
  ggplot() +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
                 color = mean_n_strokes),size = 3) +
    scale_color_gradient2(midpoint = median(d$mean_n_strokes, na.rm = T), 
                          low = "green", mid = "white", high = "red") +
  mapTheme

Prop recognized

Joy plot

ggplot(d, aes(x = prop_recognized, y = reorder(country, prop_recognized))) + 
  geom_joy(scale = 3, aes(color = country), alpha = .3, size = 1) +
  theme_joy(font_size = 13, grid = T) + 
  theme(axis.title.y = element_blank(),
        legend.position = "none")

Map

d %>%
  group_by(country, lat, lon) %>%
  summarize(prop_recognized = mean(prop_recognized)) %>%
  ggplot() +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat,
                 color = prop_recognized),size = 3) +
    scale_color_gradient2(midpoint = median(d$prop_recognized, na.rm = T), 
                          low = "green", mid = "white", high = "red") +
  mapTheme

By-item summaries

Below is the distribution over countries for each item for a number of different measures. The measures are: stroke length, number of strokes, and proportion recognized by google.

Stroke length

ggplot(d, aes(x = mean_lengths, y = reorder(word, mean_lengths))) + 
  geom_joy(scale = 3, aes(color = word), alpha = .3, size = 1) +
  theme_joy(font_size = 13, grid = T) + 
  theme(axis.title.y = element_blank(),
        legend.position = "none")

Num Strokes

ggplot(d, aes(x = mean_n_strokes, y = reorder(word, mean_n_strokes))) + 
  geom_joy(scale = 3, aes(color = word), alpha = .3, size = 1) +
  theme_joy(font_size = 13, grid = T) + 
  theme(axis.title.y = element_blank(),
        legend.position = "none")

Prop recognized

ggplot(d, aes(x = prop_recognized, y = reorder(word, prop_recognized))) + 
  geom_joy(scale = 3, aes(color = word), alpha = .3, size = 1) +
  theme_joy(font_size = 13, grid = T) + 
  theme(axis.title.y = element_blank(),
        legend.position = "none")