R Markdown

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the survey responses. The path is relative, so it is resolved against
# the current working directory at the time this line runs.
multiple_choice <- readr::read_csv(
  file = "Downloads/multipleChoiceResponses1.csv"
)
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Q1: for every learning platform, tally how many respondents gave each
# usefulness rating.
result_q1 <- multiple_choice %>%
  # keep only the per-platform usefulness columns
  select(starts_with("LearningPlatformUsefulness")) %>%
  # one row per (platform, rating); strip the shared column-name prefix and
  # drop unanswered cells while reshaping
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  mutate(learning_platform = str_trim(learning_platform)) %>%
  # number of responses per platform / rating pair
  count(learning_platform, usefulness, name = "n") %>%
  arrange(learning_platform)

result_q1
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows
# Long-format table of answered usefulness ratings: one row per
# (respondent answer, platform), with the column-name prefix removed.
long_df <- multiple_choice %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  mutate(learning_platform = str_trim(learning_platform))

# Q2: per platform, the total number of ratings (`tot`), how many of them were
# anything other than "Not Useful" (`count`), and that share of the total
# (`perc_usefulness`).
result_q2 <- long_df %>%
  summarise(
    tot = n(),
    count = sum(usefulness != "Not Useful"),
    perc_usefulness = count / tot,
    .by = learning_platform
  ) %>%
  arrange(learning_platform)

result_q2
## # A tibble: 18 × 4
##    learning_platform   tot count perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2391  2354           0.985
##  2 Blogs              4765  4720           0.991
##  3 College            3359  3258           0.970
##  4 Communities        1142  1126           0.986
##  5 Company             981   940           0.958
##  6 Conferences        2182  2063           0.945
##  7 Courses            5992  5945           0.992
##  8 Documentation      2321  2279           0.982
##  9 Friends            1581  1530           0.968
## 10 Kaggle             6583  6527           0.991
## 11 Newsletters        1089  1033           0.949
## 12 Podcasts           1214  1090           0.898
## 13 Projects           4794  4755           0.992
## 14 SO                 5640  5576           0.989
## 15 Textbook           4181  4112           0.983
## 16 TradeBook           333   324           0.973
## 17 Tutoring           1426  1394           0.978
## 18 YouTube            5229  5125           0.980
# Order platforms by their number of useful responses (largest first) and
# record the running share of all useful responses covered so far.
ordered_df <- result_q2 %>%
  arrange(desc(count)) %>%
  mutate(cum_pct = cumsum(count) / sum(count))

# The ten platforms with the most useful responses.
top10 <- ordered_df %>% slice_head(n = 10)

# Combined useful-response count of every platform after the tenth.
# filter(row_number() > 10) is used instead of slice(11:n()): when there are
# ten or fewer rows, 11:n() counts *downward* (e.g. 11:10), so slice() would
# re-select rows already in the top ten and double-count them here. With the
# filter, the remainder is simply empty and the sum is 0.
other_sum <- ordered_df %>%
  filter(row_number() > 10) %>%
  summarise(count = sum(count)) %>%
  pull(count)

# Append the pooled "Other" row. It has no `tot` / `perc_usefulness` of its
# own (left as NA by bind_rows) and closes the cumulative share at 100%.
top10_plus_other <- top10 %>%
  bind_rows(
    tibble(
      learning_platform = "Other",
      count = other_sum,
      cum_pct = 1  # 100% cumulative
    )
  ) %>%
  # reorder factor levels for plotting later: ascending by count, with
  # "Other" forced to be the final level regardless of its count
  mutate(
    learning_platform = fct_reorder(learning_platform, count),
    learning_platform = fct_relevel(learning_platform, "Other", after = Inf)
  )

top10_plus_other
## # A tibble: 11 × 5
##    learning_platform   tot count perc_usefulness cum_pct
##    <fct>             <int> <int>           <dbl>   <dbl>
##  1 Kaggle             6583  6527           0.991   0.121
##  2 Courses            5992  5945           0.992   0.230
##  3 SO                 5640  5576           0.989   0.333
##  4 YouTube            5229  5125           0.980   0.428
##  5 Projects           4794  4755           0.992   0.516
##  6 Blogs              4765  4720           0.991   0.603
##  7 Textbook           4181  4112           0.983   0.679
##  8 College            3359  3258           0.970   0.739
##  9 Arxiv              2391  2354           0.985   0.782
## 10 Documentation      2321  2279           0.982   0.825
## 11 Other                NA  9500          NA       1
# Build the per-bar text labels.
# "Other" is a pooled bucket, not a platform, so it must not take part in the
# ranking. Sorting purely by desc(count) would put "Other" (the largest count)
# in row 1 and shift every real platform's rank down by one, so "Other" is
# sorted last and given an NA rank instead.
plot_df <- top10_plus_other %>%
  mutate(is_other = learning_platform == "Other") %>%
  arrange(is_other, desc(count)) %>%
  mutate(
    rank = if_else(is_other, NA_integer_, row_number()),
    label = paste0(
      "Rank: ", if_else(is_other, "NA", as.character(rank)), "\n",
      "Useful: ", count, "\n",
      "CumPct: ", scales::percent(cum_pct, accuracy = 0.1)
    )
  ) %>%
  select(-is_other)

# Plot
# Horizontal bars of useful-response counts; bar order comes from the factor
# levels of `learning_platform`, not from the row order of `plot_df`.
ggplot(plot_df, aes(x = count, y = learning_platform)) +
  geom_col(fill = "steelblue") +
  # labels sit just past the end of each bar
  geom_text(
    aes(label = label),
    hjust = -0.1,
    size = 3
  ) +
  # extra room on the right so the labels are not clipped
  scale_x_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Top 10 learning platform",
    x = "Number of responses with at least usefulness",
    y = "Learning platform"
  ) +
  theme_minimal(base_size = 12)