library(tidyverse)
library(lubridate)
library(nycflights13)
library(tidyquant)
# Load the survey responses.
data <- read_csv("multipleChoiceResponses1.csv")

# Stack every LearningPlatformUsefulness* column into long form, discard the
# cells with no answer, and tally each (platform, rating) pair. The shared
# column-name prefix is stripped so only the platform name remains.
df_platform <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  count(
    learning_platform = str_remove(
      learning_platform,
      "LearningPlatformUsefulness"
    ),
    usefulness
  )

# Preview the first 10 rows.
df_platform %>% head(n = 10)
## # A tibble: 10 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
# Per platform: `count` is the number of responses rated anything other than
# "Not Useful", `tot` is the number of all responses, and `perc_usefulness`
# is count / tot. Rows are sorted by `count`, largest first.
df_summary <- df_platform %>%
  group_by(learning_platform) %>%
  summarise(
    # Multiplying by the logical mask zeroes out the "Not Useful" tallies.
    count = sum(n * (usefulness != "Not Useful")),
    tot = sum(n),
    perc_usefulness = count / tot
  ) %>%
  arrange(desc(count))

df_summary
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Kaggle 6527 6583 0.991
## 2 Courses 5945 5992 0.992
## 3 SO 5576 5640 0.989
## 4 YouTube 5125 5229 0.980
## 5 Projects 4755 4794 0.992
## 6 Blogs 4720 4765 0.991
## 7 Textbook 4112 4181 0.983
## 8 College 3258 3359 0.970
## 9 Arxiv 2354 2391 0.985
## 10 Documentation 2279 2321 0.982
## 11 Conferences 2063 2182 0.945
## 12 Friends 1530 1581 0.968
## 13 Tutoring 1394 1426 0.978
## 14 Communities 1126 1142 0.986
## 15 Podcasts 1090 1214 0.898
## 16 Newsletters 1033 1089 0.949
## 17 Company 940 981 0.958
## 18 TradeBook 324 333 0.973
# Keep the 10 platforms with the highest useful-response counts. `cum_pct` is
# the running total of those counts divided by the useful-response total over
# all platforms in df_summary (not only the top 10).
df_top10 <- df_summary %>%
  select(learning_platform, count1 = count) %>%
  slice_max(order_by = count1, n = 10) %>%
  mutate(cum_pct = cumsum(count1) / sum(df_summary[["count"]]))
# Everything outside the top 10 is folded into a single "Other" bucket whose
# count is the overall useful-response total minus the top-10 total.
other_count <- sum(df_summary[["count"]]) - sum(df_top10[["count1"]])

other_row <- tibble(
  learning_platform = "Other",
  count1 = other_count,
  cum_pct = 1
)

# Append the bucket, then turn the platform names into a factor whose levels
# are ordered by count1, with "Other" moved to the first level.
df_final <- df_top10 %>%
  bind_rows(other_row) %>%
  mutate(
    learning_platform = fct_relevel(
      fct_reorder(learning_platform, count1),
      "Other",
      after = 0
    )
  )

df_final
## # A tibble: 11 × 3
## learning_platform count1 cum_pct
## <fct> <int> <dbl>
## 1 Kaggle 6527 0.121
## 2 Courses 5945 0.230
## 3 SO 5576 0.333
## 4 YouTube 5125 0.428
## 5 Projects 4755 0.516
## 6 Blogs 4720 0.603
## 7 Textbook 4112 0.679
## 8 College 3258 0.739
## 9 Arxiv 2354 0.782
## 10 Documentation 2279 0.825
## 11 Other 9500 1
# Build the per-bar annotation text for the plot.
#
# `rank` is the row position within df_final, whose platform rows run from the
# highest count downwards, so rank 1 is the platform with the most useful
# responses. The "Other" bucket is not a platform and gets NA.
#
# `label_text` has three lines: the rank, the useful-response count with a
# thousands separator, and the cumulative percentage to one decimal place.
df_plot <- df_final %>%
  mutate(
    rank = if_else(learning_platform == "Other", NA_integer_, row_number()),
    label_text = sprintf(
      "Rank: %s\nUseful: %s\nCumPct: %.1f%%",
      # Print the rank as computed. The previous `11 - rank` inverted the
      # order (top platform shown as 10) and hard-coded the number of rows.
      if_else(is.na(rank), "NA", as.character(rank)),
      # trim = TRUE stops format() from left-padding values to a common width,
      # which would otherwise insert stray spaces into the label.
      format(count1, big.mark = ",", trim = TRUE),
      cum_pct * 100
    )
  )
# Horizontal bar chart: one bar per platform (factor level order on the y
# axis), bar length = count1, with label_text drawn next to each bar.
ggplot(data = df_plot, mapping = aes(x = count1, y = learning_platform)) +
  geom_col(alpha = 0.8, fill = "#2c3e50") +
  geom_text(
    mapping = aes(label = label_text),
    family = "mono",
    color = "#2c3e50",
    size = 3,
    hjust = -0.1
  ) +
  labs(
    title = NULL,
    y = NULL,
    x = "Number of responses with at least usefulness."
  ) +
  # The x range extends to 1.5 times the longest bar, which leaves empty space
  # to the right of the bars where the labels are drawn.
  scale_x_continuous(
    breaks = seq(from = 0, to = 10000, by = 2500),
    limits = c(0, 1.5 * max(df_plot[["count1"]]))
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = margin(t = 10), size = 11),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )