The Quick Draw Dataset is a collection of 50 million drawings across 345 categories. Using the Brysbaert concretneess norms (Brysbaert, Warriner, & Kuperman, 2013), I selected the subset of drawing cues that were in the top and bottom 6-tile of concreteness (n = 109). I then download the “simplified” datasets for this set of cues from: https://console.cloud.google.com/storage/browser/quickdraw_dataset/full/simplified. The drawings have been pre-scaled so they are all 255 x 255. The simplified versions also do not have time information. I then munged jsons to be in long form and saved them as a feather file (munge_jsons.R).
This is what the drawings look like: (random sampling of 30 drawings for 1 cue)
ITEM <- "tree"
N_DRAWINGS <- 30
ex <- read_feather(paste0("../../data/raw_data/feathers/", ITEM, ".txt"))
ex %>%
filter(key_id %in% sample(unique(ex$key_id), N_DRAWINGS)) %>% # sample N
ggplot(aes(x = x, y = -y, color = country, alpha = recognized)) +
ggtitle(ITEM)+
facet_wrap(~key_id) +
geom_point() +
geom_path(aes(group = stroke_num)) +
theme_bw()
# Read in summary drawing data
d <- list.files("../../data/summary_data/") %>%
purrr::map(function(x) paste0("../../data/summary_data/", x)) %>%
purrr::map(read_feather) %>%
bind_rows()
d %>%
group_by(country) %>%
summarize(`total drawings` = sum(n)) %>%
arrange(-`total drawings`) %>%
kable()
| country | total drawings |
|---|---|
| United States | 7354604 |
| United Kingdom | 1206724 |
| Canada | 606436 |
| Germany | 519593 |
| Russia | 403044 |
| Australia | 389381 |
| Brazil | 292649 |
| Sweden | 288390 |
| Finland | 285992 |
| Czech Republic | 264192 |
| Italy | 257576 |
| Poland | 214918 |
| France | 213295 |
| Thailand | 201155 |
| Korea, South | 200041 |
| Philippines | 191936 |
| Saudi Arabia | 176679 |
| Hungary | 133319 |
| Netherlands | 105578 |
| Romania | 44903 |
| Indonesia | 43701 |
| Ukraine | 31615 |
| Slovakia | 30905 |
| Japan | 30288 |
| India | 28749 |
| Vietnam | 24747 |
| United Arab Emirates | 14564 |
| Turkey | 12426 |
| Bulgaria | 9021 |
| China, Republic of (Taiwan) | 8183 |
| Croatia | 6665 |
| Malaysia | 4818 |
| Ireland | 4592 |
| Serbia | 3225 |
| Norway | 3157 |
| New Zealand | 3111 |
The missing items are because we only include an item for a country if there were at least 1500 participants.
MIN_NUM_ITEMS <- 40
item_counts = d %>%
ungroup() %>%
count(country) %>%
arrange(-`nn`)
kable(item_counts)
| country | nn |
|---|---|
| Australia | 109 |
| Canada | 109 |
| Germany | 109 |
| Russia | 109 |
| United Kingdom | 109 |
| United States | 109 |
| Brazil | 108 |
| Finland | 108 |
| Sweden | 108 |
| Czech Republic | 105 |
| Italy | 105 |
| Poland | 102 |
| Philippines | 94 |
| France | 93 |
| Thailand | 91 |
| Korea, South | 90 |
| Saudi Arabia | 84 |
| Hungary | 61 |
| Netherlands | 51 |
| Indonesia | 19 |
| Romania | 18 |
| Ukraine | 17 |
| Slovakia | 16 |
| India | 15 |
| Japan | 15 |
| Vietnam | 13 |
| United Arab Emirates | 8 |
| Turkey | 7 |
| Bulgaria | 5 |
| China, Republic of (Taiwan) | 5 |
| Croatia | 4 |
| Ireland | 3 |
| Malaysia | 3 |
| New Zealand | 2 |
| Norway | 2 |
| Serbia | 2 |
big.countries <- item_counts %>%
filter(nn > MIN_NUM_ITEMS)
d %<>% filter(country %in% big.countries$country)
The analyses below only include countries with at least 40 items.
d %>%
group_by(word) %>%
summarize(`total drawings` = sum(n)) %>%
arrange(-`total drawings`) %>%
kable()
| word | total drawings |
|---|---|
| snowman | 273640 |
| calendar | 261155 |
| marker | 257343 |
| banana | 248786 |
| hand | 237823 |
| yoga | 224341 |
| cooler | 222212 |
| knee | 215582 |
| penguin | 202765 |
| flashlight | 194767 |
| chair | 181568 |
| firetruck | 178000 |
| sink | 169971 |
| rhinoceros | 152320 |
| pig | 149708 |
| parrot | 148635 |
| coffee cup | 147835 |
| mermaid | 144518 |
| horse | 142527 |
| camouflage | 142184 |
| hospital | 137223 |
| spreadsheet | 136986 |
| van | 136949 |
| finger | 136852 |
| octagon | 130872 |
| garden | 130189 |
| helicopter | 129700 |
| dumbbell | 129682 |
| fireplace | 129368 |
| frog | 128844 |
| snorkel | 127870 |
| lightning | 125547 |
| lantern | 124373 |
| angel | 121919 |
| octopus | 121836 |
| passport | 120982 |
| roller coaster | 118940 |
| flower | 117868 |
| apple | 117528 |
| tree | 116255 |
| line | 116240 |
| tornado | 116084 |
| boomerang | 115468 |
| brain | 115220 |
| hexagon | 115203 |
| house | 111259 |
| hurricane | 110601 |
| star | 109673 |
| fan | 109429 |
| radio | 109103 |
| drill | 108517 |
| mouth | 107905 |
| carrot | 107721 |
| bird | 107042 |
| fish | 107040 |
| cookie | 105203 |
| pizza | 103801 |
| cactus | 103509 |
| microwave | 103441 |
| stairs | 102538 |
| hamburger | 102491 |
| cup | 101882 |
| camera | 101218 |
| elephant | 101061 |
| mailbox | 100998 |
| rainbow | 100926 |
| elbow | 100228 |
| compass | 100183 |
| vase | 99766 |
| giraffe | 99242 |
| ladder | 98722 |
| axe | 98585 |
| toothbrush | 98500 |
| wine bottle | 98235 |
| traffic light | 98068 |
| dragon | 98007 |
| square | 97933 |
| circle | 97690 |
| ear | 97024 |
| ice cream | 96983 |
| umbrella | 96968 |
| leaf | 96930 |
| strawberry | 96884 |
| baseball bat | 96543 |
| tiger | 95664 |
| snake | 95617 |
| triangle | 95379 |
| stereo | 95189 |
| binoculars | 95084 |
| bat | 94412 |
| palm tree | 93798 |
| eraser | 93525 |
| hot tub | 93285 |
| sea turtle | 93170 |
| cloud | 93142 |
| picture frame | 93064 |
| lighter | 92794 |
| clock | 92761 |
| pillow | 92216 |
| couch | 92203 |
| saw | 91760 |
| tractor | 91330 |
| saxophone | 91129 |
| raccoon | 90901 |
| zigzag | 90827 |
| panda | 90044 |
| basket | 89670 |
| squiggle | 87545 |
| bed | 85466 |
Below is the distribution over items within each country for a number of different measures. The measures are: stroke length, number of strokes, proportion recognized by google.
ggplot(d, aes(x = mean_lengths, y = reorder(country, mean_lengths))) +
geom_joy(scale = 2, aes(color = country), alpha = .3, size = 1) +
theme_joy(font_size = 13, grid = T) +
theme(axis.title.y = element_blank(),
legend.position = "none")
geo_codes <- read_csv("../../data/supplementary_data/geo_codes.csv")
d %<>% left_join(geo_codes, by = c("country" = "countries"))
d %>%
group_by(country, lat, lon) %>%
summarize(mean_lengths = mean(mean_lengths)) %>%
ggplot() +
borders("world", colour="gray50", fill="gray50") +
geom_point(aes(x = lon, y=lat,
color = mean_lengths),size = 3) +
scale_color_gradient2(midpoint = median(d$mean_lengths, na.rm = T),
low = "green", mid = "white", high = "red") +
mapTheme
ggplot(d, aes(x = mean_n_strokes, y = reorder(country, mean_n_strokes))) +
geom_joy(scale = 3, aes(color = country), alpha = .3, size = 1) +
theme_joy(font_size = 13, grid = T) +
theme(axis.title.y = element_blank(),
legend.position = "none")
d %>%
group_by(country, lat, lon) %>%
summarize(mean_n_strokes = mean(mean_n_strokes)) %>%
ggplot() +
borders("world", colour="gray50", fill="gray50") +
geom_point(aes(x = lon, y=lat,
color = mean_n_strokes),size = 3) +
scale_color_gradient2(midpoint = median(d$mean_n_strokes, na.rm = T),
low = "green", mid = "white", high = "red") +
mapTheme
ggplot(d, aes(x = prop_recognized, y = reorder(country, prop_recognized))) +
geom_joy(scale = 3, aes(color = country), alpha = .3, size = 1) +
theme_joy(font_size = 13, grid = T) +
theme(axis.title.y = element_blank(),
legend.position = "none")
d %>%
group_by(country, lat, lon) %>%
summarize(prop_recognized = mean(prop_recognized)) %>%
ggplot() +
borders("world", colour="gray50", fill="gray50") +
geom_point(aes(x = lon, y=lat,
color = prop_recognized),size = 3) +
scale_color_gradient2(midpoint = median(d$prop_recognized, na.rm = T),
low = "green", mid = "white", high = "red") +
mapTheme
Below is the distribution over countries for each item for a number of different measures. The measures are: stroke length, number of strokes, and proportion recognized by google.
ggplot(d, aes(x = mean_lengths, y = reorder(word, mean_lengths))) +
geom_joy(scale = 3, aes(color = word), alpha = .3, size = 1) +
theme_joy(font_size = 13, grid = T) +
theme(axis.title.y = element_blank(),
legend.position = "none")
ggplot(d, aes(x = mean_n_strokes, y = reorder(word, mean_n_strokes))) +
geom_joy(scale = 3, aes(color = word), alpha = .3, size = 1) +
theme_joy(font_size = 13, grid = T) +
theme(axis.title.y = element_blank(),
legend.position = "none")
ggplot(d, aes(x = prop_recognized, y = reorder(word, prop_recognized))) +
geom_joy(scale = 3, aes(color = word), alpha = .3, size = 1) +
theme_joy(font_size = 13, grid = T) +
theme(axis.title.y = element_blank(),
legend.position = "none")