R Markdown
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8 ✔ TTR 0.24.4
## ✔ quantmod 0.4.28 ✔ xts 0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the survey responses. The path is relative, so it is resolved against
# the current working directory.
multiple_choice <- read_csv(
  file = "Downloads/multipleChoiceResponses1.csv"
)
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl (1): Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Q1: tally how often each usefulness rating was given to each learning
# platform. The result has one row per (learning_platform, usefulness) pair
# and a count column `n`.
result_q1 <- multiple_choice %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  # Stack the platform columns into (platform, rating) rows; cells with no
  # rating (NA) are dropped as part of the reshape.
  pivot_longer(
    everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  # Strip the shared column prefix so only the platform name remains.
  mutate(
    learning_platform = str_trim(
      str_remove(learning_platform, "LearningPlatformUsefulness")
    )
  ) %>%
  count(learning_platform, usefulness, name = "n") %>%
  arrange(learning_platform)

result_q1
## # A tibble: 54 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
## # ℹ 44 more rows
# Long-format table of every answered rating: one row per non-missing
# LearningPlatformUsefulness* cell, with the column prefix removed from the
# platform name.
long_df <- multiple_choice %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  mutate(
    learning_platform = str_trim(
      str_remove(learning_platform, "LearningPlatformUsefulness")
    )
  )

# Q2: per platform, the number of answered ratings (`tot`), how many of them
# were anything other than "Not Useful" (`count`), and the resulting share
# (`perc_usefulness`).
result_q2 <- long_df %>%
  summarise(
    tot = n(),
    count = sum(usefulness != "Not Useful"),
    perc_usefulness = count / tot,
    .by = learning_platform
  ) %>%
  arrange(learning_platform)

result_q2
## # A tibble: 18 × 4
## learning_platform tot count perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2391 2354 0.985
## 2 Blogs 4765 4720 0.991
## 3 College 3359 3258 0.970
## 4 Communities 1142 1126 0.986
## 5 Company 981 940 0.958
## 6 Conferences 2182 2063 0.945
## 7 Courses 5992 5945 0.992
## 8 Documentation 2321 2279 0.982
## 9 Friends 1581 1530 0.968
## 10 Kaggle 6583 6527 0.991
## 11 Newsletters 1089 1033 0.949
## 12 Podcasts 1214 1090 0.898
## 13 Projects 4794 4755 0.992
## 14 SO 5640 5576 0.989
## 15 Textbook 4181 4112 0.983
## 16 TradeBook 333 324 0.973
## 17 Tutoring 1426 1394 0.978
## 18 YouTube 5229 5125 0.980
# Q3: keep the ten platforms with the most useful responses and pool the
# remainder into a single "Other" row.

# Platforms ordered from most to fewest useful responses, with the running
# share of all useful responses (`cum_pct`).
ordered_df <- result_q2 %>%
  arrange(desc(count)) %>%
  mutate(cum_pct = cumsum(count) / sum(count))

top10 <- ordered_df %>% slice_head(n = 10)

# Sum of `count` over every platform ranked below the top ten.
# `filter(row_number() > 10)` is used instead of `slice(11:n())`: when there
# are ten or fewer platforms, `11:n()` counts *downwards* (e.g. 11:8) and
# would re-select rows that are already part of `top10`. With the filter the
# remainder is simply empty and the sum is 0.
other_sum <- ordered_df %>%
  filter(row_number() > 10) %>%
  summarise(count = sum(count)) %>%
  pull(count)

top10_plus_other <- top10 %>%
  bind_rows(
    tibble(
      learning_platform = "Other",
      count = other_sum,
      cum_pct = 1 # the pooled row closes the distribution at 100%
    )
  ) %>%
  # Factor levels for plotting later: ascending by count, with "Other"
  # forced to be the last level. `tot` and `perc_usefulness` are NA for the
  # "Other" row because the appended tibble does not supply them.
  mutate(
    learning_platform = fct_reorder(learning_platform, count),
    learning_platform = fct_relevel(learning_platform, "Other", after = Inf)
  )

top10_plus_other
## # A tibble: 11 × 5
## learning_platform tot count perc_usefulness cum_pct
## <fct> <int> <int> <dbl> <dbl>
## 1 Kaggle 6583 6527 0.991 0.121
## 2 Courses 5992 5945 0.992 0.230
## 3 SO 5640 5576 0.989 0.333
## 4 YouTube 5229 5125 0.980 0.428
## 5 Projects 4794 4755 0.992 0.516
## 6 Blogs 4765 4720 0.991 0.603
## 7 Textbook 4181 4112 0.983 0.679
## 8 College 3359 3258 0.970 0.739
## 9 Arxiv 2391 2354 0.985 0.782
## 10 Documentation 2321 2279 0.982 0.825
## 11 Other NA 9500 NA 1
# Build the per-bar text labels (rank, useful-response count, cumulative
# share) for the plot.
#
# The pooled "Other" row is sorted last and given an NA rank. Sorting purely
# by descending count is not enough: "Other" can hold more responses than the
# leading platform, in which case it would take row 1 and every real platform
# would be labelled one rank too low (the top platform as "Rank: 2", and so
# on).
plot_df <- top10_plus_other %>%
  arrange(learning_platform == "Other", desc(count)) %>%
  mutate(
    rank = if_else(
      learning_platform == "Other",
      NA_integer_,
      row_number()
    ),
    label = if_else(
      learning_platform == "Other",
      paste0("Rank: NA\nUseful: ", count, "\nCumPct: 100.0%"),
      paste0(
        "Rank: ", rank, "\n",
        "Useful: ", count, "\n",
        "CumPct: ", scales::percent(cum_pct, accuracy = 0.1)
      )
    )
  )
# Horizontal bar chart: one bar per platform (plus "Other"), bar length is
# the number of useful responses, and each bar carries its text label.
ggplot(data = plot_df, mapping = aes(count, learning_platform)) +
  geom_col(fill = "steelblue") +
  # hjust < 0 pushes the label just past the end of its bar.
  geom_text(mapping = aes(label = label), size = 3, hjust = -0.1) +
  labs(
    title = "Top 10 learning platform",
    x = "Number of responses with at least usefulness",
    y = "Learning platform"
  ) +
  # No padding on the left; 15% extra room on the right for the labels.
  scale_x_continuous(expand = expansion(mult = c(0, 0.15))) +
  theme_minimal(base_size = 12)
