DQLab Data Mentoring ini diadakan oleh DQLab
- tanggal 21 Juli 2020 pukul 19.00 - 20.30 WIB
- bersama Muhammad Aswan Syahputra
Harap untuk memperhatikan ethics dalam menggunakan data.
Silakan izin terlebih dahulu jika ingin menggunakan data.
Baca selengkapnya mengenai cara unduh data WhatsApp di sini
Di sini kita akan menggunakan GitHub milik Mas Aswan Syahputra.
#library("remotes")
#usethis::use_course("aswansyahputra/wafun")
#devtools::install_deps()
Bisa di-comment (#) saja jikalau sudah terinstall semua.
library(readr)
library(dplyr)
library(tibble)
library(tidyr)
library(stringr)
library(lubridate)
library(syn)
library(textfeatures)
library(emo)
devtools::load_all()
Di sini, saya menggunakan data yang sudah tersedia dari Kaggle.
# Load the exported chat log as a character vector, one element per line.
wachats_raw <- read_lines(file = "data-raw/wachats_kaggle.txt")

# Compact overview of the raw vector (type, length, first values).
glimpse(wachats_raw)
chr [1:764] "25/06/2015, 01:42 - <U+200E>Vishnu Gaud created this group" ...
# Print the first few raw chat lines.
head(wachats_raw)
[1] "25/06/2015, 01:42 - <U+200E>Vishnu Gaud created this group"
[2] "25/06/2015, 01:42 - <U+200E>You were added"
[3] "18/12/2016, 01:57 - Shahain: <Media omitted>"
[4] "21/12/2016, 21:54 - Pankaj Sinha: <Media omitted>"
[5] "21/12/2016, 21:57 - Shahain: Wow"
[6] "21/12/2016, 22:48 - Sakshi: <Media omitted>"
# Parse the raw chat lines into one row per message with the columns
# chatid, datetime, author, text and n_lines.
wachats <-
  wachats_raw %>%
  enframe(name = NULL, value = "content") %>%
  # Split on " - " only where it directly follows a "dd/mm/yyyy, hh:mm"
  # timestamp (look-behind), rather than on every " - " in the line.
  # Lines that do not start with a timestamp yield a single piece;
  # `fill = "left"` puts it in `content` and leaves `datetime` as NA.
  separate(
    content,
    into = c("datetime", "content"),
    sep = "(?<=\\d{2}/\\d{2}/\\d{4}, \\d{2}:\\d{2}) - ",
    fill = "left"
  ) %>%
  # Give every timestamped line its row number as message id ...
  mutate(
    chatid = if_else(!is.na(datetime), row_number(), NA_integer_)
  ) %>%
  # ... and carry id and timestamp down onto the untimestamped lines that
  # follow, so they are grouped with the message they continue.
  fill(chatid, datetime) %>%
  group_by(chatid) %>%
  summarise(
    datetime = unique(datetime),
    content = paste0(content, collapse = "\n"),
    n_lines = n()
  ) %>%
  ungroup() %>%
  # Keep only rows that can actually be split into "author: text". The
  # pattern matches the separator used below, so a row containing a colon
  # without a following space no longer slips through with `text = NA`.
  filter(str_detect(content, ": ")) %>%
  # Split at the first ": " only; later occurrences stay in `text`.
  separate(
    content,
    into = c("author", "text"),
    sep = ": ",
    extra = "merge"
  ) %>%
  # Timestamps are written day/month/year hour:minute.
  mutate(
    datetime = dmy_hm(datetime)
  )
# Show the column types and leading values of the parsed chats.
glimpse(wachats)
Rows: 372
Columns: 5
$ chatid <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 19, 20, 21, 2...
$ datetime <dttm> 2016-12-18 01:57:00, 2016-12-21 21:54:00, 2016-12-21 21:5...
$ author <chr> "Shahain", "Pankaj Sinha", "Shahain", "Sakshi", "Sakshi", ...
$ text <chr> "<Media omitted>", "<Media omitted>", "Wow", "<Media omitt...
$ n_lines <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 41, ...
# Print the parsed chats table.
wachats
# A tibble: 372 x 5
chatid datetime author text n_lines
<int> <dttm> <chr> <chr> <int>
1 3 2016-12-18 01:57:00 Shahain "<Media omitted>" 1
2 4 2016-12-21 21:54:00 Pankaj S~ "<Media omitted>" 1
3 5 2016-12-21 21:57:00 Shahain "Wow" 1
4 6 2016-12-21 22:48:00 Sakshi "<Media omitted>" 1
5 7 2016-12-21 22:49:00 Sakshi "<Media omitted>" 1
6 8 2016-12-21 22:50:00 Neha Wip~ "Awsum\U0001f600\U0001f600\U000~ 1
7 9 2016-12-21 22:51:00 Sakshi "\U0001f648" 1
8 10 2016-12-21 22:57:00 Ganguly "\U0001f642\U0001f642\U0001f44d~ 1
9 11 2016-12-21 23:28:00 Vishnu G~ "Waste out of wealth \U0001f602" 1
10 12 2016-12-21 23:48:00 Venu Wip~ "Fancy dress competition?" 1
# ... with 362 more rows
# Derive per-message features from the parsed chats: timing, media/emoji
# indicators and a set of text counts. The column order produced here is
# relied upon by later range selections, so it must not be rearranged.
wachats_features <-
  wachats %>%
  # Timing: hour of day and numeric weekday (week starting on day 1).
  mutate(
    hour = hour(datetime),
    day = wday(datetime, week_start = 1)
  ) %>%
  # Media placeholder and emoji indicators.
  mutate(
    any_media = str_detect(text, "<Media omitted>"),
    any_emoji = emo::ji_detect(text),
    emoji = emo::ji_extract_all(text),
    n_emojis = emo::ji_count(text)
  ) %>%
  # Text counts. The n_*() helpers are not defined in this file; presumably
  # they come from the package loaded with devtools::load_all() -- confirm.
  mutate(
    n_chars = nchar(text),
    n_words = n_words(text),
    n_nonasciis = n_nonasciis(text),
    n_digits = n_digits(text),
    n_hashtags = n_hashtags(text),
    n_mentions = n_mentions(text),
    n_commas = n_commas(text),
    n_periods = n_periods(text),
    n_exclaims = n_exclaims(text),
    n_newlines = n_newlines(text),
    n_caps = n_caps(text),
    n_lowers = n_lowers(text),
    n_urls = n_urls(text),
    n_puncts = n_puncts(text)
  ) %>%
  # Put n_lines next to the other n_* columns.
  relocate(n_lines, .before = n_emojis) %>%
  # A message that is exactly the media placeholder has no real text, so
  # all of its n_* counts are blanked out.
  mutate(
    across(
      starts_with("n_"),
      ~ if_else(text == "<Media omitted>", NA_integer_, .x)
    )
  )
# Show the column types and leading values of the feature table.
glimpse(wachats_features)
Rows: 372
Columns: 25
$ chatid <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 19, 20, 21...
$ datetime <dttm> 2016-12-18 01:57:00, 2016-12-21 21:54:00, 2016-12-21 2...
$ author <chr> "Shahain", "Pankaj Sinha", "Shahain", "Sakshi", "Sakshi...
$ text <chr> "<Media omitted>", "<Media omitted>", "Wow", "<Media om...
$ hour <int> 1, 21, 21, 22, 22, 22, 22, 22, 23, 23, 0, 0, 0, 6, 12, ...
$ day <dbl> 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 6...
$ any_media <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FAL...
$ any_emoji <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TR...
$ emoji <list> [<>, <>, <>, <>, <>, <"\U0001f600", "\U0001f600", "\U0...
$ n_lines <int> NA, NA, 1, NA, NA, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, NA, 1,...
$ n_emojis <int> NA, NA, 0, NA, NA, 4, 1, 4, 1, 0, 3, 0, 0, 4, 6, NA, 1,...
$ n_chars <int> NA, NA, 3, NA, NA, 11, 1, 6, 21, 24, 3, 37, 115, 4, 52,...
$ n_words <int> NA, NA, 1, NA, NA, 1, 1, 1, 5, 3, 1, 4, 21, 1, 8, NA, 1...
$ n_nonasciis <int> NA, NA, 0, NA, NA, 24, 4, 24, 4, 0, 12, 0, 0, 16, 24, N...
$ n_digits <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_hashtags <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_mentions <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_commas <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_periods <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, NA, 0,...
$ n_exclaims <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_newlines <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, NA, 0,...
$ n_caps <int> NA, NA, 1, NA, NA, 1, 0, 0, 1, 1, 0, 1, 3, 0, 2, NA, 0,...
$ n_lowers <int> NA, NA, 2, NA, NA, 4, 0, 0, 15, 20, 0, 33, 89, 0, 35, N...
$ n_urls <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_puncts <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, NA, 0,...
# Print the feature table.
wachats_features
# A tibble: 372 x 25
chatid datetime author text hour day any_media any_emoji emoji
<int> <dttm> <chr> <chr> <int> <dbl> <lgl> <lgl> <lis>
1 3 2016-12-18 01:57:00 Shaha~ "<Me~ 1 7 TRUE FALSE <chr~
2 4 2016-12-21 21:54:00 Panka~ "<Me~ 21 3 TRUE FALSE <chr~
3 5 2016-12-21 21:57:00 Shaha~ "Wow" 21 3 FALSE FALSE <chr~
4 6 2016-12-21 22:48:00 Sakshi "<Me~ 22 3 TRUE FALSE <chr~
5 7 2016-12-21 22:49:00 Sakshi "<Me~ 22 3 TRUE FALSE <chr~
6 8 2016-12-21 22:50:00 Neha ~ "Aws~ 22 3 FALSE TRUE <chr~
7 9 2016-12-21 22:51:00 Sakshi "\U0~ 22 3 FALSE TRUE <chr~
8 10 2016-12-21 22:57:00 Gangu~ "\U0~ 22 3 FALSE TRUE <chr~
9 11 2016-12-21 23:28:00 Vishn~ "Was~ 23 3 FALSE TRUE <chr~
10 12 2016-12-21 23:48:00 Venu ~ "Fan~ 23 3 FALSE FALSE <chr~
# ... with 362 more rows, and 16 more variables: n_lines <int>, n_emojis <int>,
# n_chars <int>, n_words <int>, n_nonasciis <int>, n_digits <int>,
# n_hashtags <int>, n_mentions <int>, n_commas <int>, n_periods <int>,
# n_exclaims <int>, n_newlines <int>, n_caps <int>, n_lowers <int>,
# n_urls <int>, n_puncts <int>
library(dplyr)
library(lubridate)
library(tidyr)
library(forcats)
library(ggplot2)
library(ggalt)
library(hrbrthemes)
library(wordcloud2)
# Make the dark hrbrthemes theme the default for every plot that follows:
# no grid lines, axis ticks shown, title aligned to the whole plot and the
# legend placed underneath.
theme_set(
  theme_ft_rc(base_family = font_tw, grid = FALSE, ticks = TRUE) +
    theme(
      legend.position = "bottom",
      plot.title.position = "plot"
    )
)
# wachats_features %>%
# count(
# date = as.Date(datetime)
# ) %>%
# ggplot(aes(date, n)) +
# geom_line()
# Line chart of the number of messages per calendar date; the counting is
# done by the "count" stat on the date of each message.
ggplot(wachats_features, aes(x = as.Date(datetime))) +
  geom_line(colour = ft_cols$red, stat = "count") +
  labs(
    title = "How many chats are sent per day?",
    caption = "Viz: Muhammad Aswan Syahputra",
    x = NULL,
    y = "# chats"
  )
# Authors with the highest message counts (top 4 by n; ties are kept by
# slice_max()'s default, so more than four rows are possible).
mostactive <-
  wachats_features %>%
  count(author) %>%
  slice_max(n = 4, order_by = n)

# Dot plot of message counts per hour of day for the most active authors.
wachats_features %>%
  # Join explicitly on `author`: `mostactive` also carries the count column
  # `n`, and an implicit natural join would silently start using any other
  # column name the two tables happen to share.
  semi_join(mostactive, by = "author") %>%
  count(
    author,
    hour
  ) %>%
  # Order authors on the y axis by their total number of messages.
  mutate(
    author = fct_reorder(author, n, sum, na.rm = TRUE)
  ) %>%
  ggplot(aes(hour, author, colour = n)) +
  geom_point(size = 8, alpha = 0.7, show.legend = FALSE) +
  scale_x_continuous(breaks = 0:23) +
  # Log2 colour scale so a few very busy hours do not wash out the rest.
  scale_colour_viridis_c(option = "inferno", trans = "log2") +
  labs(
    x = NULL,
    y = NULL,
    title = "At what time the most active user are online?",
    subtitle = "Lighter colour denotes high number of chats",
    caption = "Viz: Muhammad Aswan Syahputra"
  )
# Authors to compare in the weekday chart.
friends <- c(
  "Shahain",
  "Pankaj Sinha",
  "Sahil Phatania"
)

# Polar bar chart: average number of messages per active week, by weekday
# and author.
wachats_features %>%
  filter(author %in% friends) %>%
  mutate(
    # Identify each calendar week by its start date. A bare week-of-year
    # number would merge the same week number from different years and
    # overstate the per-week average.
    week = floor_date(datetime, unit = "week"),
    wday = wday(datetime, label = TRUE)
  ) %>%
  group_by(
    wday,
    author
  ) %>%
  # Messages divided by the number of distinct weeks in which this author
  # wrote on this weekday; drop the grouping so nothing leaks downstream.
  summarise(
    n = n() / n_distinct(week),
    .groups = "drop"
  ) %>%
  ggplot(aes(wday, n, fill = author)) +
  geom_col(colour = NA, position = "dodge") +
  scale_fill_ft() +
  labs(
    x = NULL,
    y = "# chats/week",
    fill = NULL,
    title = "How often my friends and I \nsend chats to our group?",
    caption = "Viz: Muhammad Aswan Syahputra"
  ) +
  coord_polar()
# Per-author emoji usage: share and count of messages with and without
# emoji, plus the total number of messages.
wachats_emoji_stats <-
  wachats_features %>%
  group_by(author) %>%
  summarise(
    emoji_yes = mean(any_emoji, na.rm = TRUE),
    # Counts ignore missing values just like the proportion above, so a
    # single NA no longer turns an author's counts into NA.
    n_emoji_yes = sum(any_emoji, na.rm = TRUE),
    emoji_no = 1 - emoji_yes,
    n_emoji_no = sum(!any_emoji, na.rm = TRUE),
    n_chats = n()
  )
# Authors who used emoji in every message. `emoji_yes` is a computed
# double, so compare with a tolerance instead of exact equality.
wachats_emoji_stats %>%
  filter(near(emoji_yes, 1))
# A tibble: 2 x 6
author emoji_yes n_emoji_yes emoji_no n_emoji_no n_chats
<chr> <dbl> <int> <dbl> <int> <int>
1 Shweta 1 3 0 0 3
2 Yogesh Raghavan 1 3 0 0 3
# Authors who never used emoji. `emoji_no` is a computed double, so compare
# with a tolerance instead of exact equality.
wachats_emoji_stats %>%
  filter(near(emoji_no, 1))
# A tibble: 3 x 6
author emoji_yes n_emoji_yes emoji_no n_emoji_no n_chats
<chr> <dbl> <int> <dbl> <int> <int>
1 <U+202A>+91 97360 22813<U+202C> 0 0 1 1 1
2 Shradha 0 0 1 2 2
3 Venu Wipro 0 0 1 6 6
# Authors who used emoji in exactly half of their messages. `emoji_no` is a
# computed double, so compare with a tolerance instead of exact equality.
wachats_emoji_stats %>%
  filter(near(emoji_no, 0.5))
# A tibble: 1 x 6
author emoji_yes n_emoji_yes emoji_no n_emoji_no n_chats
<chr> <dbl> <int> <dbl> <int> <int>
1 Ganguly 0.5 2 0.5 2 4
# Lollipop chart of the log2 ratio between messages with and without emoji,
# for the five most extreme authors on each side of zero.
wachats_emoji_stats %>%
  # Drop authors whose ratio would be infinite (always / never use emoji)
  # or exactly zero (half and half). The proportions are computed doubles,
  # so they are compared with a tolerance.
  filter(
    !near(emoji_yes, 1),
    !near(emoji_no, 1),
    !near(emoji_no, 0.5)
  ) %>%
  mutate(
    log_ratio = log2(emoji_yes / emoji_no)
  ) %>%
  group_by(
    sign = sign(log_ratio)
  ) %>%
  # The argument is spelled `with_ties`; the former `with.ties` did not
  # match it, so ties were not being dropped as intended.
  slice_max(
    order_by = abs(log_ratio),
    n = 5,
    with_ties = FALSE
  ) %>%
  ungroup() %>%
  mutate(
    author = fct_reorder(author, log_ratio),
    # Discrete colour scale below is keyed by the strings "-1" and "1".
    sign = as.character(sign)
  ) %>%
  ggplot(aes(log_ratio, author, colour = sign)) +
  geom_lollipop(horizontal = TRUE, point.size = 5, show.legend = FALSE) +
  geom_vline(xintercept = 0, linetype = "dashed", colour = ft_cols$slate) +
  # White disc on the zero line that carries each author's message count.
  geom_point(
    x = 0,
    size = 7,
    fill = ft_cols$white,
    pch = 21,
    show.legend = FALSE
  ) +
  geom_text(
    aes(label = n_chats),
    x = 0,
    family = "Roboto Condensed",
    size = 3,
    show.legend = FALSE
  ) +
  scale_x_continuous(breaks = -2:2) +
  # One colour per sign; the stray unnamed third value was removed.
  scale_colour_manual(values = c(
    "-1" = ft_cols$red,
    "1" = ft_cols$green
  )) +
  labs(
    x = "Log2 ratio of using emoji in chats",
    y = NULL,
    title = "How often ones using or not using emoji in their chats?",
    subtitle = "One positive point means that emojis are used twice as much",
    caption = "Viz: Muhammad Aswan Syahputra"
  )
# Word cloud of emoji frequencies across all messages that contain emoji.
wachats_features %>%
  filter(any_emoji) %>%
  # One row per individual emoji occurrence.
  unnest_longer(col = emoji) %>%
  count(emoji, sort = TRUE) %>%
  wordcloud2(
    backgroundColor = "#1e1e1e",
    fontFamily = "Roboto Condensed"
  )
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
library(ggplot2)
library(hrbrthemes)
library(FactoMineR)
library(janitor)
# Build the per-author matrix used for PCA: number of messages plus the
# mean of every timing, indicator and count feature, with the author name
# moved into the row names.
wachats_prep <-
  wachats_features %>%
  filter(!is.na(author)) %>%
  group_by(author) %>%
  summarise(
    n_chats = n(),
    # Range selections follow the column order of wachats_features; the
    # list column `emoji` and `n_lines` sit between the two ranges and are
    # therefore left out.
    across(
      c(hour:any_emoji, n_emojis:n_puncts),
      ~ mean(.x, na.rm = TRUE)
    )
  ) %>%
  column_to_rownames(var = "author")
# Show the column types and leading values of the per-author summary.
glimpse(wachats_prep)
Rows: 17
Columns: 20
$ n_chats <int> 1, 4, 7, 13, 29, 13, 26, 33, 46, 21, 10, 74, 2, 3, 6, 8...
$ hour <dbl> 23.000000, 10.250000, 5.285714, 11.923077, 13.793103, 1...
$ day <dbl> 3.000000, 3.500000, 3.285714, 3.769231, 3.482759, 3.615...
$ any_media <dbl> 0.00000000, 0.00000000, 0.00000000, 0.15384615, 0.00000...
$ any_emoji <dbl> 0.0000000, 0.5000000, 0.5714286, 0.4615385, 0.7931034, ...
$ n_emojis <dbl> 0.0000000, 1.5000000, 0.8571429, 9.2727273, 2.7586207, ...
$ n_chars <dbl> 23.00000, 20.25000, 29.00000, 191.00000, 43.86207, 23.3...
$ n_words <dbl> 4.000000, 3.500000, 5.714286, 29.000000, 8.965517, 4.15...
$ n_nonasciis <dbl> 0.0000000, 8.0000000, 3.4285714, 41.0909091, 56.5862069...
$ n_digits <dbl> 0.0000000, 0.0000000, 0.2857143, 1.1818182, 0.4482759, ...
$ n_hashtags <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0...
$ n_mentions <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000...
$ n_commas <dbl> 0.00000000, 0.00000000, 0.00000000, 0.90909091, 0.17241...
$ n_periods <dbl> 2.0000000, 0.5000000, 0.4285714, 2.3636364, 1.7586207, ...
$ n_exclaims <dbl> 0.0000000, 0.0000000, 0.4285714, 0.0000000, 0.0000000, ...
$ n_newlines <dbl> 0.0000000, 0.0000000, 0.1428571, 5.5454545, 1.0000000, ...
$ n_caps <dbl> 2.0000000, 1.2500000, 2.1428571, 24.9090909, 1.1379310,...
$ n_lowers <dbl> 16.000000, 14.000000, 20.142857, 98.909091, 13.413793, ...
$ n_urls <dbl> 0.00000000, 0.00000000, 0.00000000, 0.09090909, 0.00000...
$ n_puncts <dbl> 0.0000000, 0.0000000, 0.0000000, 6.0000000, 0.4827586, ...
# List the authors, which are stored as row names.
rownames(wachats_prep)
[1] "<U+202A>+91 97360 22813<U+202C>" "Ganguly" "Kranthi" "Kushbhu"
[5] "Mukti Sharma" "Nauty's phone" "Neha Wipro" "Pankaj Sinha"
[9] "Preeti" "Sahil Phatania" "Sakshi" "Shahain"
[13] "Shradha" "Shweta" "Venu Wipro" "Vishnu Gaud"
[17] "Yogesh Raghavan"
# Principal component analysis on the per-author feature means.
wachats_pca <- PCA(X = wachats_prep)

# Individuals (authors) map and variables map; plot() dispatches to the PCA
# method for this object.
plot(wachats_pca, choix = "ind")
plot(wachats_pca, choix = "var")

# Hierarchical clustering on the PCA result. Per the FactoMineR docs,
# nb.clust = -1 cuts the tree automatically at the suggested level.
wachats_cluster <- HCPC(res = wachats_pca, nb.clust = -1, graph = FALSE)

# Dendrogram, 3D map and flat factor map of the clusters; plot() dispatches
# to the HCPC method for this object.
plot(wachats_cluster, choice = "tree")
plot(wachats_cluster, choice = "3D.map")
plot(wachats_cluster, choice = "map", draw.tree = FALSE)
# Stack the per-cluster descriptions of quantitative variables into one
# tibble with a `cluster` label and a `descriptor` (variable name) column.
wachats_cluster_descriptors <-
  pluck(wachats_cluster, "desc.var", "quanti") %>%
  map_dfr(
    # Keep the row names as a column and normalise the column names.
    function(cluster_desc) {
      clean_names(as_tibble(cluster_desc, rownames = "descriptor"))
    },
    .id = "cluster"
  ) %>%
  mutate(cluster = paste("Cluster", cluster))
# Heatmap of the `v_test` column for every cluster/descriptor pair.
wachats_cluster_descriptors %>%
  ggplot(aes(x = cluster, y = descriptor, fill = v_test)) +
  geom_tile() +
  # Diverging palette with a wide, thin colour bar.
  scale_fill_distiller(
    breaks = -3:3,
    direction = 1,
    palette = "Spectral",
    guide = guide_colorbar(barheight = 0.8, barwidth = 20)
  ) +
  labs(
    title = "What features that describe a cluster?",
    subtitle = "Positive v-test value signifies that the average value is higher than in the overall data",
    fill = "V-test",
    x = NULL,
    y = NULL
  ) +
  # Theme applied explicitly to this plot, with the same settings as the
  # theme_set() call earlier in the file.
  theme_ft_rc(base_family = font_tw, grid = FALSE, ticks = TRUE) +
  theme(
    legend.position = "bottom",
    plot.title.position = "plot"
  )