# Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.5.2
library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.1── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
# Read the multiple-choice survey responses from the working directory.
df <- readr::read_csv(file = "multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
###1. Count the usefulness by learning platform.
# Tally the ratings given to each learning platform.
# The `LearningPlatformUsefulness*` columns are stacked into long form; the
# shared column-name prefix and the missing ratings are dropped during the
# pivot itself, and each (platform, rating) pair is then counted into `n`.
df1 <- df %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  count(learning_platform, usefulness)
###2. Compute the total number of responses and the number of responses that are at least useful
# Per-platform summary of the rating counts in df1:
#   tot             - all ratings the platform received
#   count           - ratings other than "Not Useful"
#   perc_usefulness - count / tot
df2 <- df1 %>%
  group_by(learning_platform) %>%
  summarize(
    tot = sum(n),
    # Multiplying by the logical mask zeroes out the "Not Useful" rows.
    count = sum(n * (usefulness != "Not Useful")),
    perc_usefulness = count / tot
  )
###3. Based on previous results, select the first two columns learning_platform and count.
# Build the plotting table: keep the ten platforms with the most
# "at least useful" responses, lump every remaining platform into a single
# "Other" row, then rank the rows and attach a cumulative share and a label.
#
# Columns of df3: learning_platform, count, cum_pct, rank, label.
df3 <- df2 %>%
  arrange(desc(count)) %>%
  mutate(
    rank = row_number(),
    # Platforms ranked below 10 are collapsed into "Other".
    learning_platform = if_else(rank <= 10, learning_platform, "Other")
  ) %>%
  group_by(learning_platform) %>%
  summarize(count = sum(count), .groups = "drop") %>%
  # Re-sort because the collapsed "Other" row changes the ordering.
  arrange(desc(count)) %>%
  mutate(
    cum_pct = cumsum(count) / sum(count),
    rank = row_number(),
    label = paste0(
      "Rank: ", rank,
      "\nUseful: ", count,
      "\nCumPct: ", scales::percent(cum_pct, accuracy = 0.1)
    )
  )
###4. Based on the previous results, draw the plot as follows.
library(tidyverse)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(forcats)

# Horizontal bar chart of "at least useful" responses per learning platform
# (rows of df3), with a label box at the end of each bar showing the rank,
# the response count and the cumulative share.
ggplot(df3, aes(x = fct_reorder(learning_platform, count), y = count)) +
  geom_col(fill = "gray30", width = 0.7) +
  coord_flip() +

  # Annotation boxes starting at the end of each bar.
  geom_label(
    aes(
      label = paste0(
        "Rank: ", rank,
        # trim = TRUE: format() otherwise pads every count to a common
        # width, which puts stray leading spaces inside the label.
        "\nUseful: ", format(count, big.mark = ",", trim = TRUE),
        "\nCumPct: ", percent(cum_pct, accuracy = 0.1)
      )
    ),
    # `linewidth` replaces the deprecated `label.size` border argument.
    hjust = 0, vjust = 0.5, size = 1.5, linewidth = 0.2, fill = "white"
  ) +

  # Extend the count axis so the labels are not clipped.
  expand_limits(y = max(df3$count) * 1.25) +

  labs(
    title = "Top 10 learning platform",
    x = "Learning platform",
    y = "Number of responses with at least usefulness"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_text(size = 12),
    axis.title.x = element_text(size = 12)
  )