DQLab Data Mentoring ini diadakan oleh DQLab
- tanggal 21 Juli 2020 pukul 19.00 - 20.30 WIB
- bersama Muhammad Aswan Syahputra

Unduh Data WhatsApp

Harap memperhatikan etika dalam menggunakan data.
Silakan izin terlebih dahulu jika ingin menggunakan data.

  1. Buka pesan pribadi atau grup
  2. Klik titik tiga di kanan atas
  3. Klik opsi “More”
  4. Klik opsi “Export chat”
  5. Pilih apakah ingin “export with media” atau “without media”

Baca selengkapnya mengenai cara unduh data WhatsApp di sini

Install Library dan Dependencies-nya

Di sini kita akan menggunakan repositori GitHub milik Mas Aswan Syahputra.

#library("remotes")
#usethis::use_course("aswansyahputra/wafun")
#devtools::install_deps()

Bisa di-comment (#) saja jikalau sudah terinstall semua.

Preprocessing

Unggah Pustaka

library(readr)
library(dplyr)
library(tibble)
library(tidyr)
library(stringr)
library(lubridate)
library(syn)
library(textfeatures)
library(emo)
devtools::load_all()

Unggah Data

Di sini, saya menggunakan data yang sudah tersedia dari Kaggle.

# Read the exported chat as a character vector, one element per line.
wachats_raw <- read_lines(file = "data-raw/wachats_kaggle.txt")

glimpse(wachats_raw)
 chr [1:764] "25/06/2015, 01:42 - <U+200E>Vishnu Gaud created this group" ...
head(wachats_raw)
[1] "25/06/2015, 01:42 - <U+200E>Vishnu Gaud created this group"
[2] "25/06/2015, 01:42 - <U+200E>You were added"        
[3] "18/12/2016, 01:57 - Shahain: <Media omitted>"      
[4] "21/12/2016, 21:54 - Pankaj Sinha: <Media omitted>" 
[5] "21/12/2016, 21:57 - Shahain: Wow"                  
[6] "21/12/2016, 22:48 - Sakshi: <Media omitted>"       

Mengatur Format

# Turn the raw export lines into one row per chat message, with a parsed
# timestamp, the author and the message text.
wachats <-
  wachats_raw %>%
  enframe(name = NULL, value = "content") %>%
  # Split "dd/mm/yyyy, hh:mm - rest" at the first separator only. Lines without
  # a timestamp (continuations of a multi-line message) get an NA datetime
  # because of fill = "left"; extra = "merge" keeps the remainder intact even
  # if the message itself quotes another timestamped line.
  separate(
    content,
    into = c("datetime", "content"),
    sep = "(?<=\\d{2}/\\d{2}/\\d{4}, \\d{2}:\\d{2}) - ",
    extra = "merge",
    fill = "left"
  ) %>%
  # A row that carries a timestamp starts a new message; its row number is
  # used as the message id.
  mutate(
    chatid = if_else(!is.na(datetime), row_number(), NA_integer_)
  ) %>%
  # Propagate id and timestamp down onto the continuation lines.
  fill(chatid, datetime) %>%
  group_by(chatid) %>%
  summarise(
    datetime = first(datetime),
    content = paste0(content, collapse = "\n"),
    n_lines = n()
  ) %>%
  ungroup() %>%
  # Keep only "author: text" entries. Testing for the same ": " delimiter that
  # the split below uses drops system notices that merely contain a colon
  # (for example a time), which would otherwise end up with NA text.
  filter(str_detect(content, ": ")) %>%
  separate(
    content,
    into = c("author", "text"),
    sep = ": ",
    extra = "merge"
  ) %>%
  mutate(
    datetime = dmy_hm(datetime)
  )

glimpse(wachats)
Rows: 372
Columns: 5
$ chatid   <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 19, 20, 21, 2...
$ datetime <dttm> 2016-12-18 01:57:00, 2016-12-21 21:54:00, 2016-12-21 21:5...
$ author   <chr> "Shahain", "Pankaj Sinha", "Shahain", "Sakshi", "Sakshi", ...
$ text     <chr> "<Media omitted>", "<Media omitted>", "Wow", "<Media omitt...
$ n_lines  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 41, ...
wachats
# A tibble: 372 x 5
   chatid datetime            author    text                             n_lines
    <int> <dttm>              <chr>     <chr>                              <int>
 1      3 2016-12-18 01:57:00 Shahain   "<Media omitted>"                      1
 2      4 2016-12-21 21:54:00 Pankaj S~ "<Media omitted>"                      1
 3      5 2016-12-21 21:57:00 Shahain   "Wow"                                  1
 4      6 2016-12-21 22:48:00 Sakshi    "<Media omitted>"                      1
 5      7 2016-12-21 22:49:00 Sakshi    "<Media omitted>"                      1
 6      8 2016-12-21 22:50:00 Neha Wip~ "Awsum\U0001f600\U0001f600\U000~       1
 7      9 2016-12-21 22:51:00 Sakshi    "\U0001f648"                           1
 8     10 2016-12-21 22:57:00 Ganguly   "\U0001f642\U0001f642\U0001f44d~       1
 9     11 2016-12-21 23:28:00 Vishnu G~ "Waste out of wealth \U0001f602"       1
10     12 2016-12-21 23:48:00 Venu Wip~ "Fancy dress competition?"             1
# ... with 362 more rows

Membersihkan Data

# Derive per-message features: time of day, weekday, media/emoji flags and a
# set of text-shape counters (all prefixed with "n_").
wachats_features <-
  wachats %>%
  # Time and content flags. week_start = 1 numbers the weekdays from Monday.
  mutate(
    hour = hour(datetime),
    day = wday(datetime, week_start = 1),
    any_media = str_detect(text, "<Media omitted>"),
    any_emoji = emo::ji_detect(text),
    emoji = emo::ji_extract_all(text)
  ) %>%
  # Counters computed on the message text.
  mutate(
    n_emojis = emo::ji_count(text),
    n_chars = nchar(text),
    n_words = n_words(text),
    n_nonasciis = n_nonasciis(text),
    n_digits = n_digits(text),
    n_hashtags = n_hashtags(text),
    n_mentions = n_mentions(text),
    n_commas = n_commas(text),
    n_periods = n_periods(text),
    n_exclaims = n_exclaims(text),
    n_newlines = n_newlines(text),
    n_caps = n_caps(text),
    n_lowers = n_lowers(text),
    n_urls = n_urls(text),
    n_puncts = n_puncts(text)
  ) %>%
  # Group n_lines with the other counters.
  relocate(n_lines, .before = n_emojis) %>%
  # A media placeholder is not real text, so its counters are set to NA.
  mutate(
    across(
      .cols = starts_with("n_"),
      .fns = function(counter) {
        if_else(text == "<Media omitted>", NA_integer_, counter)
      }
    )
  )

glimpse(wachats_features)
Rows: 372
Columns: 25
$ chatid      <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 19, 20, 21...
$ datetime    <dttm> 2016-12-18 01:57:00, 2016-12-21 21:54:00, 2016-12-21 2...
$ author      <chr> "Shahain", "Pankaj Sinha", "Shahain", "Sakshi", "Sakshi...
$ text        <chr> "<Media omitted>", "<Media omitted>", "Wow", "<Media om...
$ hour        <int> 1, 21, 21, 22, 22, 22, 22, 22, 23, 23, 0, 0, 0, 6, 12, ...
$ day         <dbl> 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 6...
$ any_media   <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FAL...
$ any_emoji   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TR...
$ emoji       <list> [<>, <>, <>, <>, <>, <"\U0001f600", "\U0001f600", "\U0...
$ n_lines     <int> NA, NA, 1, NA, NA, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, NA, 1,...
$ n_emojis    <int> NA, NA, 0, NA, NA, 4, 1, 4, 1, 0, 3, 0, 0, 4, 6, NA, 1,...
$ n_chars     <int> NA, NA, 3, NA, NA, 11, 1, 6, 21, 24, 3, 37, 115, 4, 52,...
$ n_words     <int> NA, NA, 1, NA, NA, 1, 1, 1, 5, 3, 1, 4, 21, 1, 8, NA, 1...
$ n_nonasciis <int> NA, NA, 0, NA, NA, 24, 4, 24, 4, 0, 12, 0, 0, 16, 24, N...
$ n_digits    <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_hashtags  <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_mentions  <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_commas    <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_periods   <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, NA, 0,...
$ n_exclaims  <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_newlines  <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, NA, 0,...
$ n_caps      <int> NA, NA, 1, NA, NA, 1, 0, 0, 1, 1, 0, 1, 3, 0, 2, NA, 0,...
$ n_lowers    <int> NA, NA, 2, NA, NA, 4, 0, 0, 15, 20, 0, 33, 89, 0, 35, N...
$ n_urls      <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,...
$ n_puncts    <int> NA, NA, 0, NA, NA, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, NA, 0,...
wachats_features
# A tibble: 372 x 25
   chatid datetime            author text   hour   day any_media any_emoji emoji
    <int> <dttm>              <chr>  <chr> <int> <dbl> <lgl>     <lgl>     <lis>
 1      3 2016-12-18 01:57:00 Shaha~ "<Me~     1     7 TRUE      FALSE     <chr~
 2      4 2016-12-21 21:54:00 Panka~ "<Me~    21     3 TRUE      FALSE     <chr~
 3      5 2016-12-21 21:57:00 Shaha~ "Wow"    21     3 FALSE     FALSE     <chr~
 4      6 2016-12-21 22:48:00 Sakshi "<Me~    22     3 TRUE      FALSE     <chr~
 5      7 2016-12-21 22:49:00 Sakshi "<Me~    22     3 TRUE      FALSE     <chr~
 6      8 2016-12-21 22:50:00 Neha ~ "Aws~    22     3 FALSE     TRUE      <chr~
 7      9 2016-12-21 22:51:00 Sakshi "\U0~    22     3 FALSE     TRUE      <chr~
 8     10 2016-12-21 22:57:00 Gangu~ "\U0~    22     3 FALSE     TRUE      <chr~
 9     11 2016-12-21 23:28:00 Vishn~ "Was~    23     3 FALSE     TRUE      <chr~
10     12 2016-12-21 23:48:00 Venu ~ "Fan~    23     3 FALSE     FALSE     <chr~
# ... with 362 more rows, and 16 more variables: n_lines <int>, n_emojis <int>,
#   n_chars <int>, n_words <int>, n_nonasciis <int>, n_digits <int>,
#   n_hashtags <int>, n_mentions <int>, n_commas <int>, n_periods <int>,
#   n_exclaims <int>, n_newlines <int>, n_caps <int>, n_lowers <int>,
#   n_urls <int>, n_puncts <int>

Visualization

Unggah Pustaka

library(dplyr)
library(lubridate)
library(tidyr)
library(forcats)
library(ggplot2)
library(ggalt)
library(hrbrthemes)
library(wordcloud2)

Mengatur Tema

# Session-wide ggplot2 theme: the hrbrthemes dark theme without grid lines,
# with the title aligned to the whole plot and the legend below the panel.
theme_set(
  theme_ft_rc(base_family = font_tw, grid = FALSE, ticks = TRUE) +
    theme(legend.position = "bottom", plot.title.position = "plot")
)

Jumlah Pesan Harian

# wachats_features %>%
#   count(
#     date = as.Date(datetime)
#   ) %>%
#   ggplot(aes(date, n)) +
#   geom_line()

# Line chart of the number of messages per calendar day; stat = "count" does
# the per-day tally inside the geom.
ggplot(wachats_features, aes(x = as.Date(datetime))) +
  geom_line(colour = ft_cols$red, stat = "count") +
  labs(
    title = "How many chats are sent per day?",
    caption = "Viz: Muhammad Aswan Syahputra",
    x = NULL,
    y = "# chats"
  )

Tren Waktu Pengiriman Pesan

# Authors ranked by number of messages; keep the top four. slice_max() keeps
# ties by default, so more than four rows are possible.
mostactive <-
  wachats_features %>%
  count(author) %>%
  slice_max(order_by = n, n = 4)

# Messages per author and hour of day, restricted to the most active authors.
wachats_features %>%
  # Name the join key explicitly instead of relying on a natural join over
  # whatever columns the two tables happen to share.
  semi_join(mostactive, by = "author") %>%
  count(
    author,
    hour
  ) %>%
  mutate(
    # Order the authors on the y axis by their total number of messages.
    author = fct_reorder(author, n, sum, na.rm = TRUE)
  ) %>%
  ggplot(aes(hour, author, colour = n)) +
  geom_point(size = 8, alpha = 0.7, show.legend = FALSE) +
  scale_x_continuous(breaks = 0:23) +
  scale_colour_viridis_c(option = "inferno", trans = "log2") +
  labs(
    x = NULL,
    y = NULL,
    title = "At what time the most active user are online?",
    subtitle = "Lighter colour denotes high number of chats",
    caption = "Viz: Muhammad Aswan Syahputra"
  )

Orang yang Paling Aktif

# Authors to compare in the weekday activity chart.
friends <- c(
  "Shahain",
  "Pankaj Sinha",
  "Sahil Phatania"
)

# Average number of messages per active week, by weekday and author.
wachats_features %>%
  filter(author %in% friends) %>%
  mutate(
    # Identify a week by its start date. A bare week-of-year number would merge
    # the same week of different years and inflate the per-week average.
    week = floor_date(datetime, unit = "week"),
    wday = wday(datetime, label = TRUE)
  ) %>%
  group_by(
    wday,
    author
  ) %>%
  summarise(
    n = n() / n_distinct(week),
    .groups = "drop"
  ) %>%
  ggplot(aes(wday, n, fill = author)) +
  geom_col(colour = NA, position = "dodge") +
  scale_fill_ft() +
  labs(
    x = NULL,
    y = "# chats/week",
    fill = NULL,
    title = "How often my friends and I \nsend chats to our group?",
    caption = "Viz: Muhammad Aswan Syahputra"
  ) +
  coord_polar()

Penggunaan Emoji Terbanyak

# Per-author emoji usage: share and count of messages with and without emoji.
# Missing flags are ignored in every statistic so that the counts stay
# consistent with the proportions.
wachats_emoji_stats <-
  wachats_features %>%
  group_by(author) %>%
  summarise(
    emoji_yes = mean(any_emoji, na.rm = TRUE),
    n_emoji_yes = sum(any_emoji, na.rm = TRUE),
    emoji_no = 1 - emoji_yes,
    n_emoji_no = sum(!any_emoji, na.rm = TRUE),
    n_chats = n()
  )

# Authors who used emoji in every message. near() compares the proportion with
# a tolerance instead of exact floating-point equality.
wachats_emoji_stats %>%
  filter(near(emoji_yes, 1))
# A tibble: 2 x 6
  author          emoji_yes n_emoji_yes emoji_no n_emoji_no n_chats
  <chr>               <dbl>       <int>    <dbl>      <int>   <int>
1 Shweta                  1           3        0          0       3
2 Yogesh Raghavan         1           3        0          0       3
# Authors who never used emoji. near() compares the proportion with a
# tolerance instead of exact floating-point equality.
wachats_emoji_stats %>%
  filter(near(emoji_no, 1))
# A tibble: 3 x 6
  author          emoji_yes n_emoji_yes emoji_no n_emoji_no n_chats
  <chr>               <dbl>       <int>    <dbl>      <int>   <int>
1 <U+202A>+91 97360 22813<U+202C>         0           0        1          1       1
2 Shradha                 0           0        1          2       2
3 Venu Wipro              0           0        1          6       6
# Authors who used emoji in exactly half of their messages. near() compares
# the proportion with a tolerance instead of exact floating-point equality.
wachats_emoji_stats %>%
  filter(near(emoji_no, 0.5))
# A tibble: 1 x 6
  author  emoji_yes n_emoji_yes emoji_no n_emoji_no n_chats
  <chr>       <dbl>       <int>    <dbl>      <int>   <int>
1 Ganguly       0.5           2      0.5          2       4
# Lollipop chart of the log2 ratio between messages with and without emoji,
# showing up to five authors with the strongest preference in each direction.
wachats_emoji_stats %>%
  # Drop authors whose ratio would be infinite (always / never) or exactly
  # zero (half and half); proportions are compared with a tolerance.
  filter(
    !near(emoji_yes, 1),
    !near(emoji_no, 1),
    !near(emoji_no, 0.5)
  ) %>%
  mutate(
    log_ratio = log2(emoji_yes / emoji_no)
  ) %>%
  group_by(
    sign = sign(log_ratio)
  ) %>%
  slice_max(
    order_by = abs(log_ratio),
    n = 5,
    with.ties = FALSE
  ) %>%
  ungroup() %>%
  mutate(
    author = fct_reorder(author, log_ratio),
    # Discrete colour scale keyed by "-1" / "1".
    sign = as.character(sign)
  ) %>%
  ggplot(aes(log_ratio, author, colour = sign)) +
  geom_lollipop(horizontal = TRUE, point.size = 5, show.legend = FALSE) +
  geom_vline(xintercept = 0, linetype = "dashed", colour = ft_cols$slate) +
  # White disc on the zero line that carries the author's message count.
  geom_point(x = 0, size = 7, fill = ft_cols$white, pch = 21, show.legend = FALSE) +
  geom_text(aes(label = n_chats), x = 0, family = "Roboto Condensed", size = 3, show.legend = FALSE) +
  scale_x_continuous(breaks = -2:2) +
  # One colour per sign; the vector is fully named so every value maps to a
  # level.
  scale_colour_manual(values = c(
    "-1" = ft_cols$red,
    "1" = ft_cols$green
  )) +
  labs(
    x = "Log2 ratio of using emoji in chats",
    y = NULL,
    title = "How often ones using or not using emoji in their chats?",
    subtitle = "One positive point means that emojis are used twice as much",
    caption = "Viz: Muhammad Aswan Syahputra"
  )

Dan yang Tidak Kalah Seru, WordCloud Emoji

# Word cloud of emoji frequencies: one row per emoji occurrence, tallied and
# sorted before being handed to wordcloud2().
wachats_features %>%
  filter(any_emoji) %>%
  unnest_longer(col = emoji) %>%
  count(emoji, sort = TRUE) %>%
  wordcloud2(backgroundColor = "#1e1e1e", fontFamily = "Roboto Condensed")

Profiling

Unggah Pustaka

library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
library(ggplot2)
library(hrbrthemes)
library(FactoMineR)
library(janitor)

Mengatur Format Data untuk Pemodelan

# One row per author: number of messages plus the mean of every numeric or
# logical feature. Authors become row names for the modelling steps.
wachats_prep <-
  wachats_features %>%
  drop_na(author) %>%
  group_by(author) %>%
  summarise(
    n_chats = n(),
    across(
      .cols = c(hour:any_emoji, n_emojis:n_puncts),
      .fns = function(feature) mean(feature, na.rm = TRUE)
    )
  ) %>%
  column_to_rownames(var = "author")

glimpse(wachats_prep)
Rows: 17
Columns: 20
$ n_chats     <int> 1, 4, 7, 13, 29, 13, 26, 33, 46, 21, 10, 74, 2, 3, 6, 8...
$ hour        <dbl> 23.000000, 10.250000, 5.285714, 11.923077, 13.793103, 1...
$ day         <dbl> 3.000000, 3.500000, 3.285714, 3.769231, 3.482759, 3.615...
$ any_media   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.15384615, 0.00000...
$ any_emoji   <dbl> 0.0000000, 0.5000000, 0.5714286, 0.4615385, 0.7931034, ...
$ n_emojis    <dbl> 0.0000000, 1.5000000, 0.8571429, 9.2727273, 2.7586207, ...
$ n_chars     <dbl> 23.00000, 20.25000, 29.00000, 191.00000, 43.86207, 23.3...
$ n_words     <dbl> 4.000000, 3.500000, 5.714286, 29.000000, 8.965517, 4.15...
$ n_nonasciis <dbl> 0.0000000, 8.0000000, 3.4285714, 41.0909091, 56.5862069...
$ n_digits    <dbl> 0.0000000, 0.0000000, 0.2857143, 1.1818182, 0.4482759, ...
$ n_hashtags  <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0...
$ n_mentions  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000...
$ n_commas    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.90909091, 0.17241...
$ n_periods   <dbl> 2.0000000, 0.5000000, 0.4285714, 2.3636364, 1.7586207, ...
$ n_exclaims  <dbl> 0.0000000, 0.0000000, 0.4285714, 0.0000000, 0.0000000, ...
$ n_newlines  <dbl> 0.0000000, 0.0000000, 0.1428571, 5.5454545, 1.0000000, ...
$ n_caps      <dbl> 2.0000000, 1.2500000, 2.1428571, 24.9090909, 1.1379310,...
$ n_lowers    <dbl> 16.000000, 14.000000, 20.142857, 98.909091, 13.413793, ...
$ n_urls      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.09090909, 0.00000...
$ n_puncts    <dbl> 0.0000000, 0.0000000, 0.0000000, 6.0000000, 0.4827586, ...
rownames(wachats_prep)
 [1] "<U+202A>+91 97360 22813<U+202C>" "Ganguly"         "Kranthi"         "Kushbhu"        
 [5] "Mukti Sharma"    "Nauty's phone"   "Neha Wipro"      "Pankaj Sinha"   
 [9] "Preeti"          "Sahil Phatania"  "Sakshi"          "Shahain"        
[13] "Shradha"         "Shweta"          "Venu Wipro"      "Vishnu Gaud"    
[17] "Yogesh Raghavan"

Principal Component Analysis (PCA)

# Fit the PCA on the per-author feature table. graph = FALSE suppresses the
# plots PCA() would draw on its own, since both are drawn explicitly below.
wachats_pca <-
  PCA(wachats_prep, graph = FALSE)

# Individuals (authors) on the principal plane.
plot.PCA(wachats_pca, choix = "ind")

# Variables (features) on the correlation circle.
plot.PCA(wachats_pca, choix = "var")

Hierarchical Clustering on Principal Components (HCPC)

# Hierarchical clustering on the PCA result, with no interactive or automatic
# plotting; the three views are drawn explicitly afterwards.
wachats_cluster <-
  HCPC(wachats_pca, graph = FALSE, nb.clust = -1)

# Dendrogram.
plot(wachats_cluster, choice = "tree")

# Dendrogram drawn over the factor map.
plot(wachats_cluster, choice = "3D.map")

# Factor map coloured by cluster, without the tree overlay.
plot(wachats_cluster, choice = "map", draw.tree = FALSE)

Menampilkan Klaster

# Stack the per-cluster tables of quantitative descriptors into one tibble,
# with the feature name in `descriptor` and a "Cluster <id>" label.
wachats_cluster_descriptors <-
  map_dfr(
    pluck(wachats_cluster, "desc.var", "quanti"),
    function(cluster_stats) {
      clean_names(as_tibble(cluster_stats, rownames = "descriptor"))
    },
    .id = "cluster"
  ) %>%
  mutate(cluster = paste("Cluster", cluster))

# Heatmap of v-test values: one tile per (cluster, descriptor) pair.
wachats_cluster_descriptors %>%
  ggplot(aes(x = cluster, y = descriptor, fill = v_test)) +
  geom_tile() +
  scale_fill_distiller(
    palette = "Spectral",
    direction = 1,
    breaks = -3:3,
    guide = guide_colorbar(barwidth = 20, barheight = 0.8)
  ) +
  labs(
    title = "What features that describe a cluster?",
    subtitle = "Positive v-test value signifies that the average value is higher than in the overall data",
    fill = "V-test",
    x = NULL,
    y = NULL
  ) +
  # Theme applied explicitly so this plot does not depend on the session theme.
  theme_ft_rc(base_family = font_tw, grid = FALSE, ticks = TRUE) +
  theme(legend.position = "bottom", plot.title.position = "plot")

Terima Kasih

Muhammad Yusuf Aristyanto