Put `multipleChoiceResponses1.csv` in the same folder as this `.Rmd` file.
# Load the survey responses from the CSV in the working directory;
# show_col_types = FALSE silences readr's column-specification message.
df <- readr::read_csv(
  file = "multipleChoiceResponses1.csv",
  show_col_types = FALSE
)

# Print a compact, one-line-per-column overview of the data.
df %>% glimpse()
## Rows: 16,716
## Columns: 47
## $ LearningPlatformUsefulnessArxiv <chr> NA, NA, "Very useful", NA,…
## $ LearningPlatformUsefulnessBlogs <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCollege <chr> NA, NA, "Somewhat useful",…
## $ LearningPlatformUsefulnessCompany <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessConferences <chr> "Very useful", NA, NA, "Ve…
## $ LearningPlatformUsefulnessFriends <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessKaggle <chr> NA, "Somewhat useful", "So…
## $ LearningPlatformUsefulnessNewsletters <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessCommunities <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessDocumentation <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCourses <chr> NA, NA, "Very useful", "Ve…
## $ LearningPlatformUsefulnessProjects <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessPodcasts <chr> "Very useful", NA, NA, NA,…
## $ LearningPlatformUsefulnessSO <chr> NA, NA, NA, NA, NA, "Very …
## $ LearningPlatformUsefulnessTextbook <chr> NA, NA, NA, NA, "Somewhat …
## $ LearningPlatformUsefulnessTradeBook <chr> "Somewhat useful", NA, NA,…
## $ LearningPlatformUsefulnessTutoring <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessYouTube <chr> NA, NA, "Very useful", NA,…
## $ CurrentJobTitleSelect <chr> "DBA/Database Engineer", N…
## $ MLMethodNextYearSelect <chr> "Random Forests", "Random …
## $ WorkChallengeFrequencyPolitics <chr> "Rarely", NA, NA, "Often",…
## $ WorkChallengeFrequencyUnusedResults <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyUnusefulInstrumenting <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDeployment <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDirtyData <chr> NA, NA, NA, "Often", NA, "…
## $ WorkChallengeFrequencyExplaining <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPass <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkChallengeFrequencyIntegration <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTalent <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDataFunds <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDomainExpertise <chr> NA, NA, NA, "Most of the t…
## $ WorkChallengeFrequencyML <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTools <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyExpectations <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyITCoordination <chr> NA, NA, NA, NA, "Sometimes…
## $ WorkChallengeFrequencyHiringFunds <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPrivacy <chr> "Often", NA, NA, "Often", …
## $ WorkChallengeFrequencyScaling <chr> "Most of the time", NA, NA…
## $ WorkChallengeFrequencyEnvironments <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyClarity <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDataAccess <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyOtherSelect <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkInternalVsExternalTools <chr> "Do not know", NA, NA, "En…
## $ FormalEducation <chr> "Bachelor's degree", "Mast…
## $ Age <dbl> NA, 30, 28, 56, 38, 46, 35…
## $ DataScienceIdentitySelect <chr> "Yes", "Yes", "Yes", "Yes"…
## $ JobSatisfaction <chr> "5", NA, NA, "10 - Highly …
Steps (per the PDF):
- Select only the columns with `LearningPlatformUsefulness` in their name, and remove rows where `usefulness` is NA.
- Remove `LearningPlatformUsefulness` from each string in `learning_platform`.
- Use `count()` to get one row per `learning_platform` × `usefulness` pair.
# Reshape the LearningPlatformUsefulness* columns to long form (one row per
# answered platform question) and tally each platform/usefulness pair.
q1_counts <- df %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    # Drop unanswered questions while pivoting instead of filtering afterwards.
    values_drop_na = TRUE
  ) %>%
  mutate(
    # Strip the common column prefix, then turn underscores into spaces.
    learning_platform = learning_platform %>%
      str_remove("^LearningPlatformUsefulness") %>%
      str_replace_all("_", " ")
  ) %>%
  count(learning_platform, usefulness, name = "n") %>%
  arrange(learning_platform, usefulness)

# Show the first 20 rows of the tally.
print(q1_counts, n = 20)
## # A tibble: 54 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
## 11 Communities Somewhat useful 567
## 12 Communities Very useful 559
## 13 Company Not Useful 41
## 14 Company Somewhat useful 502
## 15 Company Very useful 438
## 16 Conferences Not Useful 119
## 17 Conferences Somewhat useful 1305
## 18 Conferences Very useful 758
## 19 Courses Not Useful 47
## 20 Courses Somewhat useful 1750
## # ℹ 34 more rows
# Per-platform usefulness summary:
#   tot             = total responses (all usefulness levels)
#   count           = number of responses that are at least useful
#                     (filter out "Not Useful")
#   perc_usefulness = count / tot
q2_summary <- q1_counts %>%
  group_by(learning_platform) %>%
  summarise(
    tot = sum(n),
    # Sum only the tallies whose usefulness level is not "Not Useful".
    count = sum(n[usefulness != "Not Useful"]),
    perc_usefulness = count / tot,
    .groups = "drop"
  ) %>%
  # Platforms with the most useful responses come first.
  arrange(desc(count))

q2_summary
# Keep only learning_platform and count, take the platforms with the highest
# count, and add each platform's rank plus the cumulative share of useful
# responses (cum_pct).
top_n <- 10
q3_top10 <- q2_summary %>%
  select(learning_platform, count) %>%
  # with_ties = FALSE caps the result at top_n rows even when counts tie.
  slice_max(order_by = count, n = top_n, with_ties = FALSE) %>%
  mutate(
    rank = row_number(),
    # The denominator is the total over ALL platforms, not only the top ones,
    # so cum_pct stays below 1 whenever platforms fall outside the top group.
    cum_pct = cumsum(count) / sum(q2_summary$count)
  )
# Collapse every platform that is not in q3_top10 into a single "Other" row.
# The row has no rank, and its cumulative share is 1 because it closes out
# the total.
q3_other <- q2_summary %>%
  filter(!learning_platform %in% q3_top10$learning_platform) %>%
  summarise(
    learning_platform = "Other",
    count = sum(count),
    rank = NA_integer_,
    cum_pct = 1
  )
# Stack the top platforms and the "Other" bucket, then turn learning_platform
# into a factor ordered for a horizontal bar plot.
q3_final <- bind_rows(q3_top10, q3_other) %>%
  mutate(
    learning_platform = learning_platform %>%
      # Ascending by count: a discrete y axis draws levels bottom-to-top, so
      # the platform with the largest count ends up at the top of the chart.
      fct_reorder(count, .desc = FALSE) %>%
      # Make "Other" the FIRST level so it is drawn at the bottom, below the
      # ranked platforms. (after = Inf would place it above rank 1.)
      fct_relevel("Other", after = 0)
  )

# Tabular view, largest count first (independent of the factor ordering).
q3_final %>%
  select(learning_platform, count, cum_pct) %>%
  arrange(desc(count))
# Build the three-line annotation shown next to each bar:
# rank, number of useful responses, and cumulative percent.
plot_df <- q3_final %>%
  mutate(
    # The "Other" row has no rank and is labelled "Rank: NA".
    rank_label = if_else(!is.na(rank), paste0("Rank: ", rank), "Rank: NA"),
    count_label = paste0("Useful: ", comma(count)),
    cum_label = paste0("CumPct: ", percent(cum_pct, accuracy = 0.1)),
    label = paste(rank_label, count_label, cum_label, sep = "\n")
  )

# Horizontal bar chart with the annotation placed just right of each bar.
plot_df %>%
  ggplot(aes(y = learning_platform, x = count)) +
  geom_col() +
  geom_text(
    aes(label = label),
    size = 3,
    hjust = 0,
    # Shift the text right by 1% of the longest bar so it clears the bar end.
    nudge_x = 0.01 * max(plot_df$count)
  ) +
  scale_x_continuous(labels = comma) +
  # Widen the x range by 25% so the labels are not clipped.
  coord_cartesian(xlim = c(0, 1.25 * max(plot_df$count))) +
  labs(
    title = "Top 10 learning platform",
    subtitle = "Number of responses with at least usefulness",
    x = "Number of responses with at least usefulness",
    y = "Learning platform"
  ) +
  theme_minimal()