This analysis examines the usefulness of various learning platforms based on survey responses from the multipleChoiceResponses1.csv dataset.
# Load required libraries
library(tidyverse)
library(lubridate)
library(knitr)
# Load the survey responses from the CSV file (path is relative to the
# working directory)
data <- read_csv(file = "multipleChoiceResponses1.csv")
Select only the columns whose names start with “LearningPlatformUsefulness” and reshape them to long format, so that each row holds one learning_platform and its usefulness rating. Remove rows where usefulness is NA, and remove “LearningPlatformUsefulness” from each string in learning_platform. Use count() to change the dataset to have one row per learning_platform–usefulness pair.
# Reshape the usefulness ratings to long form and tally them, giving one row
# per (learning_platform, usefulness) pair with its count in `n`.
platform_usefulness <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    everything(),
    names_to = "learning_platform",
    # Strip the shared column-name prefix so only the platform name remains
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    # Drop missing ratings while reshaping
    values_drop_na = TRUE
  ) %>%
  count(learning_platform, usefulness)

# Preview the first 10 rows
head(platform_usefulness, 10)
## # A tibble: 10 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
For each platform, compute the total number of responses, the number of responses that are at least somewhat useful (i.e., excluding “Not Useful”), and the share of responses that are useful.
# Per-platform summary, sorted with the largest useful count first:
#   count           - responses with any rating other than "Not Useful"
#   tot             - all responses
#   perc_usefulness - count / tot
platform_summary <- platform_usefulness %>%
  group_by(learning_platform) %>%
  summarise(
    count = sum(n[usefulness != "Not Useful"]),
    tot = sum(n)
  ) %>%
  mutate(perc_usefulness = count / tot) %>%
  arrange(desc(count))

platform_summary
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Kaggle 6527 6583 0.991
## 2 Courses 5945 5992 0.992
## 3 SO 5576 5640 0.989
## 4 YouTube 5125 5229 0.980
## 5 Projects 4755 4794 0.992
## 6 Blogs 4720 4765 0.991
## 7 Textbook 4112 4181 0.983
## 8 College 3258 3359 0.970
## 9 Arxiv 2354 2391 0.985
## 10 Documentation 2279 2321 0.982
## 11 Conferences 2063 2182 0.945
## 12 Friends 1530 1581 0.968
## 13 Tutoring 1394 1426 0.978
## 14 Communities 1126 1142 0.986
## 15 Podcasts 1090 1214 0.898
## 16 Newsletters 1033 1089 0.949
## 17 Company 940 981 0.958
## 18 TradeBook 324 333 0.973
Select the first two columns (learning_platform and count), keep the 10 platforms with the most useful responses, and calculate each one's cumulative percentage of all useful responses. The remaining platforms are grouped as “Other”.
# Keep the 10 platforms with the most useful responses (platform_summary is
# already sorted by descending count) and add the cumulative share of the
# useful responses across ALL platforms, not just the top 10.
top_platforms <- platform_summary %>%
  select(learning_platform, count1 = count) %>%
  slice_head(n = 10) %>%
  mutate(cum_pct = cumsum(count1) / sum(platform_summary$count))

# Add "Other" category: total useful responses of every platform after the
# top 10. Filtering on row position yields zero rows (and so a count of 0)
# when there are 10 or fewer platforms. slice_tail() with a computed n would
# receive a negative n in that case and select rows already in the top 10.
other_count <- platform_summary %>%
  filter(row_number() > 10) %>%
  summarize(count1 = sum(count)) %>%
  pull(count1)

# "Other" closes the list, so its cumulative share is 100% by construction
top_platforms_with_other <- top_platforms %>%
  bind_rows(tibble(
    learning_platform = "Other",
    count1 = other_count,
    cum_pct = 1.0
  ))

# Use fct_reorder() and fct_relevel() to maintain order by count:
# levels run from the largest count to the smallest, with "Other" forced last
top_platforms_with_other <- top_platforms_with_other %>%
  mutate(
    learning_platform = fct_reorder(learning_platform, count1, .desc = TRUE),
    learning_platform = fct_relevel(learning_platform, "Other", after = Inf)
  )

top_platforms_with_other
## # A tibble: 11 × 3
## learning_platform count1 cum_pct
## <fct> <int> <dbl>
## 1 Kaggle 6527 0.121
## 2 Courses 5945 0.230
## 3 SO 5576 0.333
## 4 YouTube 5125 0.428
## 5 Projects 4755 0.516
## 6 Blogs 4720 0.603
## 7 Textbook 4112 0.679
## 8 College 3258 0.739
## 9 Arxiv 2354 0.782
## 10 Documentation 2279 0.825
## 11 Other 9500 1
Create a horizontal bar chart showing the top 10 learning platforms with rank, useful count, and cumulative percentage labels.
# Prepare data for plotting: add a rank, a three-line text label per bar, and
# fix the top-to-bottom order of the bars.
plot_data <- top_platforms_with_other %>%
  mutate(
    # Rows arrive ordered by descending count with "Other" last, so the row
    # number is the rank; "Other" is an aggregate and gets no rank.
    rank = ifelse(learning_platform == "Other", NA, row_number()),
    label = sprintf(
      "Rank: %s\nUseful: %s\nCumPct: %.1f%%",
      ifelse(is.na(rank), "NA", as.character(rank)),
      # trim = TRUE: format() otherwise pads every element to a common width,
      # which puts leading spaces in the label when counts differ in length
      format(count1, big.mark = ",", trim = TRUE),
      cum_pct * 100
    ),
    # ggplot2 draws the first factor level at the bottom of a discrete y axis.
    # Reversing the existing levels (rank 1, ..., rank 10, "Other") shows
    # rank 1 at the top and keeps "Other" at the bottom; re-sorting by count
    # here would discard that order and put the unranked "Other" bar on top.
    learning_platform = fct_rev(learning_platform)
  )
# Create the plot: one horizontal bar per platform, with a label anchored at
# the end of each bar showing rank, useful count and cumulative percentage.
plot_data %>%
  ggplot(aes(x = count1, y = learning_platform)) +
  geom_col(width = 0.7, fill = "#3d5a80") +
  geom_label(
    aes(label = label),
    size = 2.8,
    hjust = 0,
    lineheight = 0.85,
    label.size = 0.2,
    label.padding = unit(0.15, "lines")
  ) +
  # Extend the x axis 15% past the longest bar to leave room for its label
  scale_x_continuous(
    breaks = seq(0, 10000, by = 2500),
    limits = c(0, 1.15 * max(plot_data$count1)),
    expand = c(0, 0)
  ) +
  labs(
    title = "Top 10 learning platform",
    x = "Number of responses with at least usefulness",
    y = "Learning platform"
  ) +
  theme_minimal() +
  theme(
    # Title and margins
    plot.title = element_text(size = 14, face = "plain", hjust = 0),
    plot.margin = margin(10, 10, 10, 10),
    # Axis titles and tick labels
    axis.title.x = element_text(size = 11, margin = margin(t = 10)),
    axis.title.y = element_text(size = 11, margin = margin(r = 10)),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 11, color = "black"),
    # Keep only the major vertical grid lines
    panel.grid.major.x = element_line(color = "gray90"),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank()
  )
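The analysis reveals that:
- Kaggle has the most responses rated at least somewhat useful (6,527), followed by Courses (5,945) and SO (5,576).
- The top 10 platforms account for 82.5% of all useful responses; the remaining 8 platforms, grouped as “Other”, contribute 9,500 responses.
- Every platform is rated at least somewhat useful in roughly 90% or more of its responses; Podcasts has the lowest share (89.8%).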
Data Source: multipleChoiceResponses1.csv