In-Class Exercise

Load Libraries

library(tidyverse)
library(lubridate)
library(tidyquant)

Read Data

# Read the multiple-choice responses CSV (path relative to the working
# directory) into a tibble.
data <- read_csv(file = "./multipleChoiceResponses1.csv")

Question 1: Count the usefulness by learning platform

Select only the columns with LearningPlatformUsefulness and remove rows where usefulness is NA. Remove LearningPlatformUsefulness from each string in learning_platform. Use count() to change the dataset to have one row per learning_platform usefulness pair with a column that is the number of entries with that pairing.

# Reshape the LearningPlatformUsefulness* columns to long form and tally each
# (learning_platform, usefulness) pair.
#   - names_prefix strips the shared column-name prefix while pivoting.
#   - values_drop_na discards unanswered (NA) cells while pivoting.
q1_result <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  count(learning_platform, usefulness)

q1_result
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows

Question 2: Compute total responses and responses that are at least useful

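For each learning platform, compute count (the number of responses excluding “Not Useful”), tot (the number of all responses), and perc_usefulness (count / tot).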

# Per-platform totals:
#   count           - responses rated anything other than "Not Useful"
#   tot             - all responses
#   perc_usefulness - count / tot
# Platforms are listed from the largest to the smallest `count`.
q2_result <- q1_result %>%
  # Zero out the "Not Useful" tallies so a plain sum yields the useful count.
  mutate(useful_n = if_else(usefulness == "Not Useful", 0L, n)) %>%
  group_by(learning_platform) %>%
  summarise(
    count = sum(useful_n),
    tot = sum(n),
    perc_usefulness = count / tot
  ) %>%
  arrange(desc(count))

q2_result
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Kaggle             6527  6583           0.991
##  2 Courses            5945  5992           0.992
##  3 SO                 5576  5640           0.989
##  4 YouTube            5125  5229           0.980
##  5 Projects           4755  4794           0.992
##  6 Blogs              4720  4765           0.991
##  7 Textbook           4112  4181           0.983
##  8 College            3258  3359           0.970
##  9 Arxiv              2354  2391           0.985
## 10 Documentation      2279  2321           0.982
## 11 Conferences        2063  2182           0.945
## 12 Friends            1530  1581           0.968
## 13 Tutoring           1394  1426           0.978
## 14 Communities        1126  1142           0.986
## 15 Podcasts           1090  1214           0.898
## 16 Newsletters        1033  1089           0.949
## 17 Company             940   981           0.958
## 18 TradeBook           324   333           0.973

Question 3: Select learning_platform and count

Show the top 10 useful learning platforms. Show the cumulative percent of top 10 useful learning platforms (cum_pct). Other platforms will be grouped as Other.

# Build the top-10 table plus an "Other" bucket.
#   count1  - number of at-least-useful responses for the platform
#   cum_pct - running share of all at-least-useful responses

# Denominator for cum_pct: useful responses across every platform.
total_useful <- sum(q2_result$count)

# Exactly ten platforms with the highest counts. with_ties = FALSE keeps the
# result at ten rows even when several platforms tie at the cutoff.
top_platforms <- q2_result %>%
  select(learning_platform, count) %>%
  slice_max(order_by = count, n = 10, with_ties = FALSE) %>%
  rename(count1 = count) %>%
  mutate(cum_pct = cumsum(count1) / total_useful)

# Everything outside the top ten is collapsed into a single "Other" row,
# which by construction completes the cumulative share to 100%.
other_platforms <- tibble(
  learning_platform = "Other",
  count1 = total_useful - sum(top_platforms$count1),
  cum_pct = 1.0
)

q3_result <- bind_rows(top_platforms, other_platforms) %>%
  mutate(
    # Order factor levels by ascending count, then force "Other" to be the
    # first level regardless of its count.
    learning_platform = fct_reorder(learning_platform, count1, .desc = FALSE),
    learning_platform = fct_relevel(learning_platform, "Other", after = 0)
  )

q3_result
## # A tibble: 11 × 3
##    learning_platform count1 cum_pct
##    <fct>              <int>   <dbl>
##  1 Kaggle              6527   0.121
##  2 Courses             5945   0.230
##  3 SO                  5576   0.333
##  4 YouTube             5125   0.428
##  5 Projects            4755   0.516
##  6 Blogs               4720   0.603
##  7 Textbook            4112   0.679
##  8 College             3258   0.739
##  9 Arxiv               2354   0.782
## 10 Documentation       2279   0.825
## 11 Other               9500   1

Question 4: Visualization

Based on the previous results, create the plot as follows.

# Add a rank and a multi-line label to every row.
# Rows are sorted by descending count1 with "Other" appended last, so the row
# position is the rank (1 = highest count); "Other" is left unranked.
# row_number() is an integer, matching NA_integer_, and does not depend on the
# table having exactly ten ranked rows.
q3_result <- q3_result %>%
  mutate(
    rank = if_else(learning_platform == "Other", NA_integer_, row_number()),
    label_text = str_glue(
      "Rank: {rank}\n",
      "Useful: {scales::comma(count1)}\n",
      "CumPct: {scales::percent(cum_pct, accuracy = 0.1)}"
    )
  )

# Lollipop chart: one horizontal stem per platform from 0 to its count, a
# point sized by the count, and the label placed just right of the point.
q3_result %>%
  ggplot(aes(x = count1, y = learning_platform)) +
  geom_segment(
    aes(xend = 0, yend = learning_platform),
    color = "black",
    linewidth = 0.5
  ) +
  geom_point(
    aes(size = count1),
    color = "darkgreen"
  ) +
  geom_label(
    aes(label = label_text),
    hjust = 0,       # left-align so the label starts after the point
    nudge_x = 200,   # shift the label right, clear of the point
    size = 2.5,
    label.padding = unit(0.12, "lines"),
    label.size = 0.25,
    lineheight = 0.85
  ) +
  scale_size_continuous(range = c(2, 8)) +
  scale_x_continuous(
    breaks = seq(0, 10000, 2500),
    labels = scales::comma,
    # Extra room on the right so labels of the longest stems are not clipped.
    expand = expansion(mult = c(0.02, 0.25))
  ) +
  labs(
    x = "Number of responses with at least usefulness",
    y = "Learning platform",
    title = "Top 10 learning platform"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 9),
    panel.grid.major.x = element_line(color = "grey"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none"  # point size duplicates the x position
  )