#1. Count the usefulness by learning platform
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the survey responses.
data <- read.csv("multipleChoiceResponses1.csv")

# Identify the usefulness columns by name rather than by position, so the
# tally keeps working if the CSV's column order or column count changes.
usefulness_prefix <- "LearningPlatformUsefulness"
usefulness_columns <- colnames(data)[
  startsWith(colnames(data), usefulness_prefix)
]
stopifnot(length(usefulness_columns) > 0)

# Reshape to one row per (platform, answer), drop unanswered items, relabel
# "Not Useful" as "Least Useful", strip the common column prefix, and count
# every platform/answer combination.
usefulness_counts <- data %>%
  select(all_of(usefulness_columns)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "LearningPlatform",
    values_to = "Usefulness"
  ) %>%
  filter(!is.na(Usefulness)) %>%
  mutate(
    # Both patterns are literal text, so match them as fixed strings.
    Usefulness = gsub("Not Useful", "Least Useful", Usefulness, fixed = TRUE),
    LearningPlatform = gsub(
      usefulness_prefix, "", LearningPlatform,
      fixed = TRUE
    )
  ) %>%
  count(LearningPlatform, Usefulness)

print(usefulness_counts)
## # A tibble: 54 × 3
## LearningPlatform Usefulness n
## <chr> <chr> <int>
## 1 Arxiv Least Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Least Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Least Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Least Useful 16
## # ℹ 44 more rows
#2. Compute the number of total responses and the number of responses which are at least useful
# Per learning platform, compute:
#   count           - number of non-missing usefulness answers
#   tot             - number of those answers other than "Not Useful"
#   perc_usefulness - tot / count
# Both counts come from a single pass over the reshaped data. A platform with
# no useful answers therefore gets tot = 0 (and 0%), instead of falling back
# to `count` and being reported as 100% useful.
atleastuseful <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness"
  ) %>%
  filter(!is.na(usefulness)) %>%
  group_by(learning_platform) %>%
  summarise(
    tot = sum(usefulness != "Not Useful"),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(perc_usefulness = tot / count) %>%
  select(learning_platform, tot, count, perc_usefulness) %>%
  # Keep the result a plain data.frame, as the gather()-based version produced.
  as.data.frame()

atleastuseful
## learning_platform tot count perc_usefulness
## 1 Arxiv 2354 2391 0.9845253
## 2 Blogs 4720 4765 0.9905561
## 3 College 3258 3359 0.9699315
## 4 Communities 1126 1142 0.9859895
## 5 Company 940 981 0.9582059
## 6 Conferences 2063 2182 0.9454629
## 7 Courses 5945 5992 0.9921562
## 8 Documentation 2279 2321 0.9819044
## 9 Friends 1530 1581 0.9677419
## 10 Kaggle 6527 6583 0.9914932
## 11 Newsletters 1033 1089 0.9485767
## 12 Podcasts 1090 1214 0.8978583
## 13 Projects 4755 4794 0.9918648
## 14 SO 5576 5640 0.9886525
## 15 Textbook 4112 4181 0.9834968
## 16 TradeBook 324 333 0.9729730
## 17 Tutoring 1394 1426 0.9775596
## 18 YouTube 5125 5229 0.9801109
#3 Select two columns: learning_platform and count
# Keep only the platform name and its total number of responses.
selected_columns <- atleastuseful %>%
  select(learning_platform, count)

# The 10 platforms with the most responses, largest first (ties are kept).
top_10_platforms <- selected_columns %>%
  slice_max(count, n = 10)

# Collapse every platform outside the top 10 into a single "Other" row.
other_platforms <- selected_columns %>%
  anti_join(top_10_platforms, by = "learning_platform") %>%
  summarise(count = sum(count), learning_platform = "Other")

# Combine the top 10 with "Other" (last), then compute the cumulative share
# over ALL responses. Because "Other" is part of the denominator, the running
# total reaches exactly 1 on the final row without any manual patching.
# The factor is built only after the rows are combined so the ordering is not
# lost; its levels are reversed because ggplot draws the first level at the
# bottom of a discrete y axis, which puts the largest platform on top.
result_grouped <- bind_rows(top_10_platforms, other_platforms) %>%
  mutate(
    cum_pct = cumsum(count) / sum(count),
    learning_platform = fct_rev(fct_inorder(learning_platform))
  )

# Print the results
print(result_grouped)
## learning_platform count cum_pct
## 1 Arxiv 2391 0.05283394
## 2 Blogs 4765 0.15812617
## 3 College 3359 0.23235002
## 4 Courses 5992 0.36475528
## 5 Documentation 2321 0.41604243
## 6 Kaggle 6583 0.56150702
## 7 Projects 4794 0.66744006
## 8 SO 5640 0.79206717
## 9 Textbook 4181 0.88445476
## 10 YouTube 5229 1.00000000
## 11 Other 9948 1.00000000
#4 Based on the previous results, draw the plot shown below.
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Create a lollipop chart: one horizontal segment from 0 to `count` plus a
# point per platform, each labelled with its cumulative percentage.
ggplot(result_grouped, aes(x = count, y = learning_platform)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_segment(aes(xend = 0, yend = learning_platform), linewidth = 1) +
  geom_point(size = 3) +
  geom_label(
    aes(label = str_glue("Cumulative Percent: {scales::percent(cum_pct)}")),
    hjust = "inward",
    size = 2,
    color = "black"
  ) +
  labs(
    title = "Top 10 Useful Learning Platforms",
    x = "Number of Responses with at Least Usefulness",
    y = "Learning Platform"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
