In-class Exercise (Learning Platform Usefulness)

Data import
Q1 — Count the usefulness by learning platform
Q2 — Total responses vs at-least-useful responses (and percent)
Q3 — Top 10 useful learning platforms + cumulative percent + Other
Q4 — Plot: Top 10 learning platforms (and Other)

Data import

Put multipleChoiceResponses1.csv in the same folder as this .Rmd.

# Load the survey responses from the working directory.
# show_col_types = FALSE suppresses readr's column-specification message.
df <- readr::read_csv(
  file = "multipleChoiceResponses1.csv",
  show_col_types = FALSE
)

# Compact overview: one line per column with its type and first few values.
glimpse(df)

## Rows: 16,716
## Columns: 47
## $ LearningPlatformUsefulnessArxiv             <chr> NA, NA, "Very useful", NA,…
## $ LearningPlatformUsefulnessBlogs             <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCollege           <chr> NA, NA, "Somewhat useful",…
## $ LearningPlatformUsefulnessCompany           <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessConferences       <chr> "Very useful", NA, NA, "Ve…
## $ LearningPlatformUsefulnessFriends           <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessKaggle            <chr> NA, "Somewhat useful", "So…
## $ LearningPlatformUsefulnessNewsletters       <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessCommunities       <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessDocumentation     <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCourses           <chr> NA, NA, "Very useful", "Ve…
## $ LearningPlatformUsefulnessProjects          <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessPodcasts          <chr> "Very useful", NA, NA, NA,…
## $ LearningPlatformUsefulnessSO                <chr> NA, NA, NA, NA, NA, "Very …
## $ LearningPlatformUsefulnessTextbook          <chr> NA, NA, NA, NA, "Somewhat …
## $ LearningPlatformUsefulnessTradeBook         <chr> "Somewhat useful", NA, NA,…
## $ LearningPlatformUsefulnessTutoring          <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessYouTube           <chr> NA, NA, "Very useful", NA,…
## $ CurrentJobTitleSelect                       <chr> "DBA/Database Engineer", N…
## $ MLMethodNextYearSelect                      <chr> "Random Forests", "Random …
## $ WorkChallengeFrequencyPolitics              <chr> "Rarely", NA, NA, "Often",…
## $ WorkChallengeFrequencyUnusedResults         <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyUnusefulInstrumenting <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDeployment            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDirtyData             <chr> NA, NA, NA, "Often", NA, "…
## $ WorkChallengeFrequencyExplaining            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPass                  <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkChallengeFrequencyIntegration           <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTalent                <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDataFunds             <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDomainExpertise       <chr> NA, NA, NA, "Most of the t…
## $ WorkChallengeFrequencyML                    <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTools                 <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyExpectations          <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyITCoordination        <chr> NA, NA, NA, NA, "Sometimes…
## $ WorkChallengeFrequencyHiringFunds           <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPrivacy               <chr> "Often", NA, NA, "Often", …
## $ WorkChallengeFrequencyScaling               <chr> "Most of the time", NA, NA…
## $ WorkChallengeFrequencyEnvironments          <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyClarity               <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDataAccess            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyOtherSelect           <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkInternalVsExternalTools                 <chr> "Do not know", NA, NA, "En…
## $ FormalEducation                             <chr> "Bachelor's degree", "Mast…
## $ Age                                         <dbl> NA, 30, 28, 56, 38, 46, 35…
## $ DataScienceIdentitySelect                   <chr> "Yes", "Yes", "Yes", "Yes"…
## $ JobSatisfaction                             <chr> "5", NA, NA, "10 - Highly …

Q1 — Count the usefulness by learning platform

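Steps (per the PDF):

- Select only the columns whose names start with LearningPlatformUsefulness, reshape them to long format (one row per response × platform, with the column name in learning_platform and the answer in usefulness), and remove rows where usefulness is NA.
- Remove the LearningPlatformUsefulness prefix from each string in learning_platform.
- Use count() to get one row per learning_platform × usefulness pair.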

# Q1: number of answers for every learning_platform x usefulness pair.
#
# The usefulness columns are reshaped to long format. pivot_longer() strips
# the shared column-name prefix (names_prefix is treated as a regex anchored
# at the start of the name) and drops unanswered cells (values_drop_na), so
# no separate filter()/str_remove() step is needed.
usefulness_long <- df %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  )

q1_counts <- usefulness_long %>%
  # Turn any underscores left in the platform name into spaces.
  mutate(learning_platform = str_replace_all(learning_platform, "_", " ")) %>%
  count(learning_platform, usefulness) %>%
  arrange(learning_platform, usefulness)

print(q1_counts, n = 20)

## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## 11 Communities       Somewhat useful   567
## 12 Communities       Very useful       559
## 13 Company           Not Useful         41
## 14 Company           Somewhat useful   502
## 15 Company           Very useful       438
## 16 Conferences       Not Useful        119
## 17 Conferences       Somewhat useful  1305
## 18 Conferences       Very useful       758
## 19 Courses           Not Useful         47
## 20 Courses           Somewhat useful  1750
## # ℹ 34 more rows

Q2 — Total responses vs at-least-useful responses (and percent)

Compute tot = total responses (all usefulness levels)
Compute count = number of responses that are at least useful (filter out Not Useful)
Compute perc_usefulness = count / tot

# Q2: per-platform totals.
#   tot             - all answers, whatever the usefulness level
#   count           - answers other than "Not Useful"
#   perc_usefulness - count as a fraction of tot
# Platforms are listed from the largest to the smallest `count`.
q2_summary <- q1_counts %>%
  group_by(learning_platform) %>%
  summarise(
    tot = sum(n),
    # The logical mask is coerced to 0/1, so "Not Useful" rows contribute 0.
    count = sum(n * (usefulness != "Not Useful")),
    perc_usefulness = count / tot,
    .groups = "drop"
  ) %>%
  arrange(-count)

q2_summary

Q3 — Top 10 useful learning platforms + cumulative percent + Other

Select the learning_platform and count columns
Show the top 10 useful learning platforms (by count)
Compute cumulative percent (cum_pct)
Group the remaining platforms as Other

# Number of individual platforms to keep; all remaining platforms are pooled
# into a single "Other" row.
top_n <- 10

# Top `top_n` platforms by at-least-useful count.
# with_ties = FALSE keeps exactly `top_n` rows. slice_max() returns them in
# descending order of `count`, so row_number() is the rank and cumsum()
# accumulates from rank 1 downwards.
q3_top10 <- q2_summary %>%
  select(learning_platform, count) %>%
  slice_max(order_by = count, n = top_n, with_ties = FALSE) %>%
  mutate(
    rank = row_number(),
    # Denominator is the count over ALL platforms, not only the top ones.
    cum_pct = cumsum(count) / sum(q2_summary$count)
  )

# Every platform that did not make the top list, collapsed into one row.
# q2_summary is ungrouped, so summarise() needs no `.groups` argument.
q3_other <- q2_summary %>%
  select(learning_platform, count) %>%
  anti_join(q3_top10 %>% select(learning_platform), by = "learning_platform") %>%
  summarise(
    learning_platform = "Other",
    count = sum(count)
  ) %>%
  mutate(
    rank = NA_integer_,  # the pooled row has no rank
    cum_pct = 1          # top platforms + Other cover every response
  )

# Factor levels drive the bar order in the horizontal plot: ggplot draws the
# first level at the bottom and the last level at the top. Ascending
# fct_reorder() therefore puts rank 1 at the top, and moving "Other" to the
# FIRST level (after = 0L) draws the catch-all bar at the bottom, below the
# ranked platforms, instead of above rank 1.
q3_final <- bind_rows(q3_top10, q3_other) %>%
  mutate(
    learning_platform = fct_relevel(
      fct_reorder(learning_platform, count, .desc = FALSE),
      "Other", after = 0L
    )
  )

# Display table: rank order, with "Other" last. arrange() always sorts NA to
# the end, so cum_pct reads monotonically from the top row down to 100%.
# (Sorting by count instead would place the pooled "Other" row wherever its
# summed count happens to fall.)
q3_final %>%
  arrange(rank) %>%
  select(learning_platform, count, cum_pct)

Q4 — Plot: Top 10 learning platforms (and Other)

# Build the three-line annotation shown next to each bar:
# rank, number of useful responses, and cumulative percent.
plot_df <- q3_final %>%
  mutate(
    # paste0() renders a missing rank as the text "NA", so the row without a
    # rank reads "Rank: NA" without needing a separate branch.
    rank_label = paste0("Rank: ", rank),
    count_label = paste0("Useful: ", comma(count)),
    cum_label = paste0("CumPct: ", percent(cum_pct, accuracy = 0.1)),
    label = paste(rank_label, count_label, cum_label, sep = "\n")
  )

# Horizontal bar chart: one bar per platform, annotated to the right.
ggplot(plot_df, aes(x = count, y = learning_platform)) +
  geom_col() +
  # hjust = 0 left-aligns the text; the nudge (1% of the longest bar) keeps
  # it from touching the bar end.
  geom_text(
    aes(label = label),
    size = 3,
    hjust = 0,
    nudge_x = 0.01 * max(plot_df$count)
  ) +
  scale_x_continuous(labels = comma) +
  # Extend the x range by 25% so the annotations fit inside the panel.
  coord_cartesian(xlim = c(0, 1.25 * max(plot_df$count))) +
  ggtitle(
    "Top 10 learning platform",
    subtitle = "Number of responses with at least usefulness"
  ) +
  xlab("Number of responses with at least usefulness") +
  ylab("Learning platform") +
  theme_minimal()