Exploring user behavior on Snapshots at Sea

library(data.table)
library(dplyr)
library(dtplyr)
library(ggplot2)
library(treemap)
library(stringr)
library(magrittr)

Define functions

#' Summarize classifications per user.
#'
#' Copies the column named by `col_name` into a `user` column, collapses every
#' user whose name contains "not-logged-in" into a single "not-logged-in"
#' user, and counts the rows (classifications) for each user.
#'
#' @param dataset A data frame (or data.table); each row is counted as one
#'   classification.
#' @param col_name Name of the column in `dataset` that identifies the user,
#'   given as a single string.
#' @return A table sorted by descending `num_classifications` with columns
#'   `rowname` (row position after sorting, as character; used as the treemap
#'   index), `user`, and `num_classifications`.
summarize_user_classifications <- function(dataset, col_name) {
  # Fail early with a clear message: a missing column would otherwise make
  # `dataset[[col_name]]` NULL and surface as an obscure error further down.
  if (!is.character(col_name) || length(col_name) != 1L ||
      !col_name %in% names(dataset)) {
    stop("`col_name` must name a single column of `dataset`", call. = FALSE)
  }

  dataset$user <- dataset[[col_name]]

  dataset %>%
    # Pool all anonymous sessions under one "not-logged-in" pseudo-user.
    mutate(
      user = ifelse(
        str_detect(user, pattern = "not-logged-in"),
        "not-logged-in",
        user
      )
    ) %>%
    group_by(user) %>%
    summarise(num_classifications = n()) %>%
    # Most active users first.
    arrange(-num_classifications) %>%
    # Replaces the deprecated dplyr::add_rownames(); tibble is a dependency of
    # dplyr, so the namespaced call needs no extra library() line.
    tibble::rownames_to_column(var = "rowname")
}

Read in and summarize user data: in this case, Snapshots at Sea

# Read only the first eight columns of the classifications export.
classifications <- fread(
  input = "../Data/snapshots-at-sea-classifications.csv",
  select = seq_len(8)
)

Read 0.0% of 1085163 rows
Read 14.7% of 1085163 rows
Read 28.6% of 1085163 rows
Read 41.5% of 1085163 rows
Read 56.2% of 1085163 rows
Read 71.0% of 1085163 rows
Read 83.9% of 1085163 rows
Read 94.9% of 1085163 rows
Read 1085163 rows and 8 (of 14) columns from 1.150 GB file in 00:00:14
# Per-user classification counts, using the user_name column as the user id.
demo_dat <- classifications %>%
  summarize_user_classifications("user_name")
Deprecated, use tibble::rownames_to_column() instead.
# Show the per-user summary (sorted by descending classification count).
print(demo_dat)

Look at user classification activity

# The plots read demo_dat, the per-user summary built earlier; the notebook
# never defines an object called `dat`.

# Density of classifications per user on a linear x axis.
ggplot(data = demo_dat, aes(x = num_classifications)) +
  geom_density()

# Same density on a log10 x axis.
ggplot(data = demo_dat, aes(x = num_classifications)) +
  geom_density() +
  scale_x_log10()

# Histogram with 100 bins on a log10 x axis.
ggplot(data = demo_dat, aes(x = num_classifications)) +
  geom_histogram(bins = 100) +
  scale_x_log10()

Looking at the distribution for just the users with 100 or fewer classifications (and then 25 or fewer)

# The plots read demo_dat, the per-user summary built earlier; the notebook
# never defines an object called `dat`.

# Users with at most 100 classifications: number of users per count.
ggplot(
  data = filter(demo_dat, num_classifications <= 100),
  aes(x = num_classifications)
) +
  geom_bar()

# Same view restricted to users with at most 25 classifications.
ggplot(
  data = filter(demo_dat, num_classifications <= 25),
  aes(x = num_classifications)
) +
  geom_bar()

# One tile per user, indexed by its rowname and sized by classification count.
treemap(
  dtf = demo_dat,
  index = "rowname",
  vSize = "num_classifications",
  border.lwds = .1,
  fontsize.labels = 4
)

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCiMgRXhwbG9yaW5nIHVzZXIgYmVoYXZpb3Igb24gU25hcHNob3RzIGF0IFNlYQoKYGBge3IsIHJlc3VsdHMgPSAiaGlkZSJ9CmxpYnJhcnkoZGF0YS50YWJsZSkKbGlicmFyeShkcGx5cikKbGlicmFyeShkdHBseXIpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeSh0cmVlbWFwKQpsaWJyYXJ5KHN0cmluZ3IpCmxpYnJhcnkobWFncml0dHIpCgpgYGAKCgpEZWZpbmUgZnVuY3Rpb25zCmBgYHtyLCByZXN1bHRzID0gImhpZGUifQoKI3N1bW1hcml6ZSBpbnB1dCBmaWxlOiBncm91cCBieSB1c2VyIGFuZCBjb3VudCBjbGFzc2lmaWNhdGlvbnMgCnN1bW1hcml6ZV91c2VyX2NsYXNzaWZpY2F0aW9ucyA8LSBmdW5jdGlvbihkYXRhc2V0LCBjb2xfbmFtZSl7CiAgICAgZGF0YXNldCR1c2VyIDwtIGRhdGFzZXRbW2NvbF9uYW1lXV0KICAgICBkYXRhX291dCA8LSBkYXRhc2V0ICU+JSAKICAgICAgICAgIG11dGF0ZSguLCB1c2VyID0gaWZlbHNlKHN0cl9kZXRlY3QodXNlciwgcGF0dGVybiA9ICJub3QtbG9nZ2VkLWluIiksICJub3QtbG9nZ2VkLWluIiwgdXNlcikpICU+JQogICAgICAgICAgZ3JvdXBfYnkoLiwgdXNlcikgJT4lCiAgICAgICAgICBzdW1tYXJpc2UoLiwgbnVtX2NsYXNzaWZpY2F0aW9ucyA9IG4oKSkgJT4lICNjb3VudCB0aGUgbnVtYmVyIG9mIGNsYXNzaWZpY2F0aW9ucyBwZXIgdXNlcgogICAgICAgICAgYXJyYW5nZSguLCAtbnVtX2NsYXNzaWZpY2F0aW9ucykgJT4lICNkb24ndCBrbm93IHdoeSBkZXNjKG51bV9jbGFzc2lmaWNhdGlvbnMpIGlzbid0IHdvcmtpbmcKICAgICAgICAgIGFkZF9yb3duYW1lcygpICNpbmRleCBmb3IgdGhlIHRyZWVtYXAKICAgICByZXR1cm4oZGF0YV9vdXQpCn0KYGBgCgoKUmVhZCBpbiBhbmQgc3VtbWFyaXplIHVzZXIgZGF0YTogaW4gdGhpcyBjYXNlLCBzbmFwc2hvdHMgYXQgc2VhCmBgYHtyfQoKY2xhc3NpZmljYXRpb25zIDwtIGZyZWFkKCIuLi9EYXRhL3NuYXBzaG90cy1hdC1zZWEtY2xhc3NpZmljYXRpb25zLmNzdiIsIHNlbGVjdCA9IDE6OCkKCmRlbW9fZGF0IDwtIHN1bW1hcml6ZV91c2VyX2NsYXNzaWZpY2F0aW9ucyhjbGFzc2lmaWNhdGlvbnMsICJ1c2VyX25hbWUiKQoKcHJpbnQoZGVtb19kYXQpCgpgYGAKCkxvb2sgYXQgdXNlciBjbGFzc2lmaWNhdGlvbiBhY3Rpdml0eQpgYGB7cn0KZ2dwbG90KGRhdGEgPSBkYXQsIGFlcyh4PW51bV9jbGFzc2lmaWNhdGlvbnMpKSArIGdlb21fZGVuc2l0eSgpCgpgYGAKYGBge3J9CmdncGxvdChkYXRhID0gZGF0LCBhZXMoeD1udW1fY2xhc3NpZmljYXRpb25zKSkgKyBnZW9tX2RlbnNpdHkoKSArIHNjYWxlX3hfbG9nMTAoKQoKYGBgCgpgYGB7cn0KZ2dwbG90KGRhdGEgPSBkYXQsIGFlcyh4PW51bV9jbGFzc2lmaWNhdGlvbnMpKSArIGdlb21faGlzdG9ncmFtKGJpbnMgPSAxMDApICsgc2NhbGVfeF9sb2cxMCgpCgpgYGAKCkxvb2tpbmcgYXQgdGhlIGRp
c3RyaWJ1dGlvbiBmb3IganVzdCB0aGUgbG93ZXN0IDEwMCBjbGFzc2lmaWVycwpgYGB7cn0KZ2dwbG90KGRhdGEgPSBmaWx0ZXIoZGF0LCBudW1fY2xhc3NpZmljYXRpb25zIDw9IDEwMCksIGFlcyh4PW51bV9jbGFzc2lmaWNhdGlvbnMpKSArIGdlb21fYmFyKCkgCgpgYGAKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGZpbHRlcihkYXQsIG51bV9jbGFzc2lmaWNhdGlvbnMgPD0gMjUpLCBhZXMoeD1udW1fY2xhc3NpZmljYXRpb25zKSkgKyBnZW9tX2JhcigpIAoKYGBgCgpgYGB7cn0KdHJlZW1hcChkdGYgPSBkYXQsIGluZGV4ID0gInJvd25hbWUiLCB2U2l6ZSA9ICJudW1fY2xhc3NpZmljYXRpb25zIiwgYm9yZGVyLmx3ZHMgPSAuMSwgZm9udHNpemUubGFiZWxzID0gNCkKYGBgCgoK