#1. Count the usefulness by learning platform
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the survey responses (one row per respondent).
data <- read.csv("multipleChoiceResponses1.csv")

# Identify the usefulness columns by name rather than by position, so the
# analysis keeps working if the column order of the CSV changes.
usefulness_prefix <- "LearningPlatformUsefulness"
usefulness_columns <- grep(
  paste0("^", usefulness_prefix),
  colnames(data),
  value = TRUE
)
if (length(usefulness_columns) == 0) {
  stop(
    "No columns starting with '", usefulness_prefix, "' found in the data.",
    call. = FALSE
  )
}

# Reshape to one row per (platform, answer), dropping unanswered cells and
# stripping the common column-name prefix, then count each answer per platform.
usefulness_counts <- data %>%
  select(all_of(usefulness_columns)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "LearningPlatform",
    names_prefix = usefulness_prefix,
    values_to = "Usefulness",
    values_drop_na = TRUE
  ) %>%
  # Relabel the lowest rating so it reads "Least Useful" in the output.
  mutate(
    Usefulness = gsub("Not Useful", "Least Useful", Usefulness, fixed = TRUE)
  ) %>%
  count(LearningPlatform, Usefulness)

print(usefulness_counts)
## # A tibble: 54 × 3
##    LearningPlatform Usefulness          n
##    <chr>            <chr>           <int>
##  1 Arxiv            Least Useful       37
##  2 Arxiv            Somewhat useful  1038
##  3 Arxiv            Very useful      1316
##  4 Blogs            Least Useful       45
##  5 Blogs            Somewhat useful  2406
##  6 Blogs            Very useful      2314
##  7 College          Least Useful      101
##  8 College          Somewhat useful  1405
##  9 College          Very useful      1853
## 10 Communities      Least Useful       16
## # ℹ 44 more rows
#2. Compute the number of total responses and the number of responses which are at least useful
# For each learning platform compute:
#   count           - number of non-missing responses
#   tot             - number of responses other than "Not Useful"
#   perc_usefulness - tot / count
# Both counts come from a single pass over the reshaped data, so a platform
# with no "at least useful" responses correctly gets tot = 0 (not tot = count).
atleastuseful <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  group_by(learning_platform) %>%
  summarise(
    tot = sum(usefulness != "Not Useful"),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(perc_usefulness = tot / count) %>%
  # Keep a plain data.frame so the result prints as it did before.
  as.data.frame()

atleastuseful
##    learning_platform  tot count perc_usefulness
## 1              Arxiv 2354  2391       0.9845253
## 2              Blogs 4720  4765       0.9905561
## 3            College 3258  3359       0.9699315
## 4        Communities 1126  1142       0.9859895
## 5            Company  940   981       0.9582059
## 6        Conferences 2063  2182       0.9454629
## 7            Courses 5945  5992       0.9921562
## 8      Documentation 2279  2321       0.9819044
## 9            Friends 1530  1581       0.9677419
## 10            Kaggle 6527  6583       0.9914932
## 11       Newsletters 1033  1089       0.9485767
## 12          Podcasts 1090  1214       0.8978583
## 13          Projects 4755  4794       0.9918648
## 14                SO 5576  5640       0.9886525
## 15          Textbook 4112  4181       0.9834968
## 16         TradeBook  324   333       0.9729730
## 17          Tutoring 1394  1426       0.9775596
## 18           YouTube 5125  5229       0.9801109
#3 Select the first two columns: learning_platform and count
selected_columns <- atleastuseful %>%
  select(learning_platform, count)

# Denominator for the cumulative share: responses across ALL platforms, so the
# top-10 rows accumulate towards (but do not reach) 1 and 'Other' completes it.
total_responses <- sum(selected_columns$count)

# Top 10 platforms by response count, ranked from most to least responses so
# that the cumulative percentage is meaningful. `with_ties = FALSE` guarantees
# at most 10 rows even when counts are tied.
top_10_platforms <- selected_columns %>%
  slice_max(count, n = 10, with_ties = FALSE) %>%
  arrange(desc(count)) %>%
  mutate(cum_pct = cumsum(count) / total_responses)

# Lump every platform outside the top 10 into a single 'Other' row; it is the
# last row, so its cumulative share is 1 by construction.
other_platforms <- selected_columns %>%
  anti_join(top_10_platforms, by = "learning_platform") %>%
  summarise(count = sum(count), learning_platform = "Other", cum_pct = 1)

# Combine, then build the factor AFTER binding: binding a factor column with
# the character 'Other' row would otherwise discard the level ordering.
# Levels are ordered by count with 'Other' first, so a plot with the platform
# on the y axis shows the largest platform on top and 'Other' at the bottom.
result_grouped <- bind_rows(top_10_platforms, other_platforms) %>%
  mutate(
    learning_platform = learning_platform %>%
      fct_reorder(count) %>%
      fct_relevel("Other")
  )

# Print the results
print(result_grouped)
##    learning_platform count   cum_pct
## 1             Kaggle  6583 0.1192508
## 2            Courses  5992 0.2277956
## 3                 SO  5640 0.3299640
## 4            YouTube  5229 0.4246871
## 5           Projects  4794 0.5115302
## 6              Blogs  4765 0.5978479
## 7           Textbook  4181 0.6735866
## 8            College  3359 0.7344347
## 9              Arxiv  2391 0.7777476
## 10     Documentation  2321 0.8197924
## 11             Other  9948 1.0000000
#4 Based on the previous results, produce the plot as follows.

library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Create a lollipop chart: one horizontal segment from 0 to the response count
# per platform, a point at the end, and a label with the cumulative percentage.
# NOTE(review): the x axis shows `count` (all non-missing responses), while the
# axis title speaks of "at least useful" responses - confirm which is intended.
ggplot(result_grouped, aes(x = count, y = learning_platform)) +
  # `linewidth` replaces the `size` aesthetic for lines (ggplot2 >= 3.4.0).
  geom_segment(aes(xend = 0, yend = learning_platform), linewidth = 1) +
  geom_point(size = 3) +
  geom_label(
    aes(label = str_glue("Cumulative Percent: {scales::percent(cum_pct)}")),
    hjust = "inward",
    size = 2,
    color = "black"
  ) +
  labs(
    title = "Top 10 Useful Learning Platforms",
    x = "Number of Responses with at Least Usefulness",
    y = "Learning Platform"
  ) +
  theme_minimal()