library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
# Read the survey responses CSV; the path is relative to the working directory.
df <- read_csv(file = "multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the columns whose names start with the usefulness prefix.
use_cols <- select(df, starts_with("LearningPlatformUsefulness"))

# Stack every selected column into (learning_platform, usefulness) pairs.
long_ratings <- pivot_longer(
  use_cols,
  cols = everything(),
  names_to = "learning_platform",
  values_to = "usefulness"
)

# Discard unanswered cells, strip the shared column prefix so only the
# platform name remains, then tally each platform/answer combination.
use_tbl <- long_ratings %>%
  drop_na(usefulness) %>%
  mutate(
    learning_platform = str_remove(learning_platform, "LearningPlatformUsefulness")
  ) %>%
  count(learning_platform, usefulness, name = "n") %>%
  arrange(learning_platform, usefulness)
print(use_tbl)
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows
# Total number of answers recorded for each platform (sum of the tallies).
totals <- count(use_tbl, learning_platform, wt = n, name = "tot")

# Answers rated anything other than "not useful"; the comparison is
# case-insensitive because the raw label is lower-cased first.
useful_counts <- use_tbl %>%
  filter(tolower(usefulness) != "not useful") %>%
  count(learning_platform, wt = n, name = "count")

# Attach the totals, derive the useful share, and list the platform with the
# most useful answers first.
result2 <- left_join(useful_counts, totals, by = "learning_platform") %>%
  mutate(perc_usefulness = count / tot) %>%
  arrange(desc(count))
print(result2)
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Kaggle             6527  6583           0.991
##  2 Courses            5945  5992           0.992
##  3 SO                 5576  5640           0.989
##  4 YouTube            5125  5229           0.980
##  5 Projects           4755  4794           0.992
##  6 Blogs              4720  4765           0.991
##  7 Textbook           4112  4181           0.983
##  8 College            3258  3359           0.970
##  9 Arxiv              2354  2391           0.985
## 10 Documentation      2279  2321           0.982
## 11 Conferences        2063  2182           0.945
## 12 Friends            1530  1581           0.968
## 13 Tutoring           1394  1426           0.978
## 14 Communities        1126  1142           0.986
## 15 Podcasts           1090  1214           0.898
## 16 Newsletters        1033  1089           0.949
## 17 Company             940   981           0.958
## 18 TradeBook           324   333           0.973
# Build the "top platforms + Other" summary table.
#
# Output columns:
#   learning_platform  platform name, or "Other" for the pooled remainder
#   count1             number of useful answers
#   cum_pct            running share of all useful answers (ends at 1)

# Number of platforms listed individually; the rest are pooled into "Other".
top_n_platforms <- 10L

total_useful <- sum(result2$count)

# Rank once and compute the cumulative share over the full ranking, so both
# the head and the remainder are cut from the same ordering.
ranked <- result2 %>%
  arrange(desc(count)) %>%
  mutate(
    count1 = count,
    cum_pct = cumsum(count1 / total_useful)
  )

top10 <- ranked %>%
  slice_head(n = top_n_platforms) %>%
  select(learning_platform, count1, cum_pct)

# Select the remainder by rank rather than with `slice(11:n())`: when there
# are 10 or fewer platforms, `11:n()` counts backwards and would pull rows that
# are already in the top group into "Other", double counting them. With no
# remainder the filter yields zero rows and the sum is 0.
other_val <- ranked %>%
  filter(row_number() > top_n_platforms) %>%
  summarise(count1 = sum(count1)) %>%
  pull(count1)

# "Other" closes the table, so its cumulative share is 1 by construction.
top10_out <- top10 %>%
  bind_rows(tibble(learning_platform = "Other", count1 = other_val, cum_pct = 1.0))
print(top10_out)
## # A tibble: 11 × 3
##    learning_platform count1 cum_pct
##    <chr>              <int>   <dbl>
##  1 Kaggle              6527   0.121
##  2 Courses             5945   0.230
##  3 SO                  5576   0.333
##  4 YouTube             5125   0.428
##  5 Projects            4755   0.516
##  6 Blogs               4720   0.603
##  7 Textbook            4112   0.679
##  8 College             3258   0.739
##  9 Arxiv               2354   0.782
## 10 Documentation       2279   0.825
## 11 Other               9500   1
# Horizontal bar chart of useful-answer counts for the top platforms.
# Platforms are ordered by count (largest at the top). The pooled "Other"
# bucket is pinned to the bottom instead of being ranked by size: it is a sum
# of several platforms, so ranking it among the individual platforms would
# put it above the real leader in a chart titled "Top 10".
top10_out %>%
  mutate(
    learning_platform = fct_reorder(learning_platform, count1),
    learning_platform = fct_relevel(learning_platform, "Other", after = 0)
  ) %>%
  ggplot(aes(x = count1, y = learning_platform)) +
  geom_col() +
  labs(
    title = "Top 10 Learning Platforms (At Least Useful)",
    x = "Number of responses rating the platform at least somewhat useful",
    y = "Learning Platform"
  ) +
  theme_minimal()

# Export the summary table; the path is relative to the working directory.
# The file name is defined once so the status message cannot drift from it.
out_file <- "top10_learning_platforms.csv"
write_csv(top10_out, out_file)
message("Saved ", out_file, " to working directory.")
## Saved top10_learning_platforms.csv to working directory.