La totalité des données provient de l’export des données Slack (.zip) disponible ici : https://zqsd.slack.com/services/export
# Load the user directory of the Slack export as a data frame.
# flatten = TRUE asks jsonlite to flatten nested data frames into plain columns.
allUsers <- as.data.frame(fromJSON("input/users.json", flatten = TRUE))
On ne garde que les users non supprimés et on exclut les bots. On ne conserve que les colonnes id, name, real_name et color.
# Columns kept for the rest of the analysis
keeps <- c("id", "name", "real_name", "color")
# Keep users that are neither deleted nor bots.
# which() drops positions where the condition is NA (missing `deleted` or
# `is_bot`); a bare logical index would turn those into all-NA rows.
allUsers <- allUsers[which(!allUsers$deleted & !allUsers$is_bot), keeps]
head(allUsers)
## id name real_name color
## 1 U03JKQTDX logs Logs 4bbe2e
## 2 U03JMNC9J paf Julien 9f69e7
## 3 U03JMQH0A b3nz3n B3nZ3n e7392d
## 4 U03JN1BGE qn7o Anto Mrb 3c989f
## 5 U03JSCX10 gawel Ga ' ' l 674b1b
## 6 U03JSENHL roux Roux e96699
On enrichit les données pour qu’elles soient directement exploitables et on extrait les données de couleur qui seront utilisées plus tard dans tous les charts
# Prefix each colour code with "#" so it is a valid hex colour.
# Missing colours stay NA instead of becoming the bogus string "#NA".
allUsers$color <- ifelse(
  is.na(allUsers$color),
  NA_character_,
  paste0("#", allUsers$color)
)
# Users that actually have a colour
colors <- allUsers[!is.na(allUsers$color), c("name", "color")]
# Named vector (user name -> colour) reused by the charts' manual fill scales
plotColors <- colors$color
names(plotColors) <- colors$name
head(plotColors)
## logs paf b3nz3n qn7o gawel roux
## "#4bbe2e" "#9f69e7" "#e7392d" "#3c989f" "#674b1b" "#e96699"
On commence par lister tous les fichiers présents dans le dossier general
# Full paths of every file in the "general" channel folder, sorted by path
files <- sort(list.files(path = "input/general", full.names = TRUE))
# Number of files found
length(files)
## [1] 431
On extrait ensuite les données de chaque fichier (un par jour, depuis 2015). Pour ce faire, on définit une fonction qui prend en entrée le nom du fichier et retourne un data frame contenant les données de ce jour.
#' Read one exported message file and keep the columns used by the reports.
#'
#' @param filename Path to a JSON file of the Slack export.
#' @return A data frame holding whichever of the wanted columns exist in the
#'   file; an empty data frame when the file contains no messages.
extractFileContent <- function(filename) {
  wanted <- c(
    "client_msg_id", "type", "text", "user", "ts", "reply_users", "reactions"
  )
  messages <- fromJSON(filename, flatten = TRUE)
  # An empty JSON array is parsed as an empty list, which select() cannot
  # handle; return an empty data frame so the caller can still bind the results.
  if (length(messages) == 0) {
    return(data.frame())
  }
  # any_of() silently skips columns absent from this file, whereas the
  # superseded one_of() warns for each of them.
  messages %>%
    select(any_of(wanted)) %>%
    as.data.frame()
}
On applique ensuite cette fonction sur toute la liste des fichiers
# Read every file into a list of data frames.
# lapply() always returns a list; sapply() may simplify the result into a
# list-matrix when all data frames have the same number of columns.
allMessages <- lapply(files, extractFileContent)
# Keep the file path as element name, as sapply() did
names(allMessages) <- files
On enrichit/manipule ensuite les données
# Stack the per-file data frames into one table; fill = TRUE pads the columns
# that are missing from some files.
allMessages <- rbindlist(allMessages, fill = TRUE)
# Convert the timestamps to date-times (as_datetime() reads numbers as seconds
# since the Unix epoch and returns UTC by default)
allMessages$ts <- as_datetime(as.integer(allMessages$ts))
# Add a column holding the user name looked up from the user ID
allMessages$username <- allUsers$name[match(allMessages$user, allUsers$id)]
# Keep only messages written by the users retained during user extraction
allMessages <- allMessages[allMessages$user %in% allUsers$id, ]
# Keep only the messages of 2021. The interval is half-open so that
# 31 December is included, and the bounds are built in UTC like `ts` instead of
# being parsed from strings in the session's local time zone.
allMessages <- allMessages[
  allMessages$ts >= as_datetime("2021-01-01") &
    allMessages$ts < as_datetime("2022-01-01"),
]
# Messages whose reply_users entry is not NULL, with their author
threads <- allMessages[
  allMessages$reply_users != "NULL",
  c("user", "reply_users")
]
# One row per (message author, replying user) pair
threads <- tidyr::unnest(threads, cols = reply_users)
# Count the rows of each pair; the result stays grouped by `user`.
# NOTE(review): n() counts pairs, not individual reply messages — confirm this
# is the intended measure.
threads <- threads %>%
  group_by(user, reply_users) %>%
  summarise(count_replies = n())
## `summarise()` has grouped output by 'user'. You can override using the `.groups` argument.
# Replace user IDs by user names on both sides of each pair
threads$user <- allUsers$name[match(threads$user, allUsers$id)]
threads$reply_users <- allUsers$name[match(threads$reply_users, allUsers$id)]
# Keep the selected thread authors and only pairs with more than 5 replies
threads <- threads[
  threads$user %in% c(
    "b3nz3n", "logs", "qn7o", "roux", "rylou", "tony", "vv", "woookash"
  ) & threads$count_replies > 5,
]
# Alluvial diagram: first axis = `user`, second axis = `reply_users`,
# flow size = `count_replies`, flows coloured by `user` with plotColors.
ggplot(
  data = threads,
  mapping = aes(axis1 = user, axis2 = reply_users, y = count_replies)
) +
  geom_alluvium(
    mapping = aes(fill = user),
    alpha = 0.7,
    width = 1 / 10,
    curve_type = "sigmoid"
  ) +
  geom_stratum(width = 1 / 10) +
  # Label each stratum with its own name
  geom_text(
    mapping = aes(label = paste(after_stat(stratum))),
    stat = "stratum"
  ) +
  scale_x_discrete(
    expand = c(0.1, 0.1),
    limits = c("user", "reply_users")
  ) +
  scale_fill_manual(values = plotColors) +
  ggtitle("Nombre de messages envoyés par \"reply_users\" dans un thread créé par \"user\"") +
  theme(legend.position = "none")
## Rapport ZQSD002: Utilisation des emojis par user
# Messages whose reactions entry is not NULL, with their author.
# NOTE(review): `user` comes from the message row, so the totals below are
# per message author — confirm this is the intended "usage per user".
reactions <- allMessages[
  allMessages$reactions != "NULL",
  c("user", "reactions")
]
# One row per (message, reaction) pair
reactions <- tidyr::unnest(reactions, cols = reactions)
reactions <- reactions[, c("user", "name", "count")]
# Sum the reaction counts per user and emoji; the result stays grouped by `user`
reactions <- reactions %>%
  group_by(user, name) %>%
  summarise(count_reaction = sum(count))
## `summarise()` has grouped output by 'user'. You can override using the `.groups` argument.
# Replace user IDs by user names
reactions$user <- allUsers$name[match(reactions$user, allUsers$id)]
# Total count per emoji, then the 10 largest totals; without an explicit
# weight, top_n() ranks by the last column, i.e. the summed count.
top10emojii <- reactions %>%
  group_by(name) %>%
  summarise(sum(count_reaction)) %>%
  top_n(n = 10)
## Selecting by sum(count_reaction)
# Restrict the per-user counts to the emojis of the top-10 table
reactions <- reactions[reactions$name %in% top10emojii$name, ]
# Stacked bars: one bar per emoji, split by user and coloured with plotColors
ggplot(
  data = reactions,
  mapping = aes(x = name, y = count_reaction, fill = user)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = plotColors) +
  labs(
    title = "Emojii usage per user",
    x = "emojii",
    y = "Number of uses"
  ) +
  # Vertical emoji names on the x axis
  theme(axis.text.x = element_text(angle = -90, hjust = 0))
# Emoji names used as joke scores, listed from 0 to 10 and then reversed so
# the highest score comes first. The score for 8 is the emoji "eight"; the
# former spelling "height" never matched any reaction.
jokeScores <- rev(c(
  "zero", "one", "two", "three", "four", "five",
  "six", "seven", "eight", "nine", "keycap_ten"
))
# One colour per score, listed in the same 0-to-10 order before reversal
jokeColors <- rev(c(
  "#FF0000", "#dd776e", "#e2886c", "#e79a69", "#ecac67", "#e9b861",
  "#f5ce62", "#d4c86a", "#b0be6e", "#94bd77", "#73b87e"
))
# Named vector (emoji name -> colour) for the manual fill scale
names(jokeColors) <- jokeScores
# Rebuild the per-user, per-emoji totals from allMessages.
# Rows are messages whose reactions entry is not NULL; `user` is taken from
# the message row.
reactions <- allMessages[
  allMessages$reactions != "NULL",
  c("user", "reactions")
]
# Expand the nested reactions to one row per (message, reaction)
reactions <- tidyr::unnest(reactions, cols = reactions)
# Sum the counts per user and emoji; the result stays grouped by `user`
reactions <- reactions[, c("user", "name", "count")] %>%
  group_by(user, name) %>%
  summarise(count_reaction = sum(count))
## `summarise()` has grouped output by 'user'. You can override using the `.groups` argument.
# Replace user IDs by user names
reactions$user <- allUsers$name[match(reactions$user, allUsers$id)]
# Keep only the emojis that are used as joke scores
reactions <- reactions[reactions$name %in% jokeScores, ]
# Ordered factor so the scores follow the order of jokeColors
reactions$name <- factor(
  reactions$name,
  levels = names(jokeColors),
  ordered = TRUE
)
# Stacked bars: one bar per user, split by score emoji and coloured with
# jokeColors. The former `order = name` aesthetic was dropped because ggplot2
# ignores it; the stacking order comes from the ordered factor `name`.
ggplot(
  data = reactions,
  mapping = aes(x = user, y = count_reaction, fill = name)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual("", values = jokeColors) +
  # The title names 2021 because allMessages is restricted to that year
  ggtitle("Les plus rigolos? en 2021") +
  xlab("username") +
  ylab("score") +
  # Vertical user names on the x axis
  theme(axis.text.x = element_text(angle = -90, hjust = 0))