Setup: Load Libraries

As requested in the exercise PDF, we first load the necessary libraries, in particular the tidyverse for data manipulation and plotting.

library(tidyverse)    # dplyr, tidyr, ggplot2, forcats, stringr, readr
library(lubridate)    # date-time helpers (not used directly below)
library(nycflights13) # example flight data (not used directly below)
library(tidyquant)    # For theme_tq() and scale_fill_tq()

Load Data

We load the multipleChoiceResponses1.csv dataset and take a first look at its structure with glimpse().

# Load the dataset
responses_df <- read_csv("multipleChoiceResponses1.csv")

# Glimpse the data to understand its structure
glimpse(responses_df)
## Rows: 16,716
## Columns: 47
## $ LearningPlatformUsefulnessArxiv             <chr> NA, NA, "Very useful", NA,…
## $ LearningPlatformUsefulnessBlogs             <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCollege           <chr> NA, NA, "Somewhat useful",…
## $ LearningPlatformUsefulnessCompany           <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessConferences       <chr> "Very useful", NA, NA, "Ve…
## $ LearningPlatformUsefulnessFriends           <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessKaggle            <chr> NA, "Somewhat useful", "So…
## $ LearningPlatformUsefulnessNewsletters       <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessCommunities       <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessDocumentation     <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCourses           <chr> NA, NA, "Very useful", "Ve…
## $ LearningPlatformUsefulnessProjects          <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessPodcasts          <chr> "Very useful", NA, NA, NA,…
## $ LearningPlatformUsefulnessSO                <chr> NA, NA, NA, NA, NA, "Very …
## $ LearningPlatformUsefulnessTextbook          <chr> NA, NA, NA, NA, "Somewhat …
## $ LearningPlatformUsefulnessTradeBook         <chr> "Somewhat useful", NA, NA,…
## $ LearningPlatformUsefulnessTutoring          <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessYouTube           <chr> NA, NA, "Very useful", NA,…
## $ CurrentJobTitleSelect                       <chr> "DBA/Database Engineer", N…
## $ MLMethodNextYearSelect                      <chr> "Random Forests", "Random …
## $ WorkChallengeFrequencyPolitics              <chr> "Rarely", NA, NA, "Often",…
## $ WorkChallengeFrequencyUnusedResults         <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyUnusefulInstrumenting <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDeployment            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDirtyData             <chr> NA, NA, NA, "Often", NA, "…
## $ WorkChallengeFrequencyExplaining            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPass                  <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkChallengeFrequencyIntegration           <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTalent                <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDataFunds             <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDomainExpertise       <chr> NA, NA, NA, "Most of the t…
## $ WorkChallengeFrequencyML                    <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTools                 <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyExpectations          <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyITCoordination        <chr> NA, NA, NA, NA, "Sometimes…
## $ WorkChallengeFrequencyHiringFunds           <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPrivacy               <chr> "Often", NA, NA, "Often", …
## $ WorkChallengeFrequencyScaling               <chr> "Most of the time", NA, NA…
## $ WorkChallengeFrequencyEnvironments          <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyClarity               <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDataAccess            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyOtherSelect           <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkInternalVsExternalTools                 <chr> "Do not know", NA, NA, "En…
## $ FormalEducation                             <chr> "Bachelor's degree", "Mast…
## $ Age                                         <dbl> NA, 30, 28, 56, 38, 46, 35…
## $ DataScienceIdentitySelect                   <chr> "Yes", "Yes", "Yes", "Yes"…
## $ JobSatisfaction                             <chr> "5", NA, NA, "10 - Highly …
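
Before moving on, it can help to confirm how many columns carry the LearningPlatformUsefulness prefix; this quick check (not part of the exercise) should report the 18 platform columns listed above.

# Optional sanity check: how many platform columns will be reshaped?
responses_df %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  ncol()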

1. Count Usefulness by Learning Platform

First, we need to transform the data from its wide format (one column per platform) to a long format (one row per individual platform response), then count responses by platform and usefulness.

  • Select only the columns starting with LearningPlatformUsefulness.
  • pivot_longer to make the data tidy.
  • Remove the “LearningPlatformUsefulness” prefix from the platform names.
  • Filter out any rows where usefulness is NA.
  • count the occurrences of each learning_platform and usefulness pair.

# Task 1: Count usefulness by learning platform
usefulness_counts_df <- responses_df %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    values_to = "usefulness"
  ) %>%
  filter(!is.na(usefulness)) %>%
  mutate(
    learning_platform = str_remove(learning_platform, "LearningPlatformUsefulness")
  ) %>%
  count(learning_platform, usefulness, name = "count") %>%
  arrange(learning_platform, desc(count))

# Show the result
print(usefulness_counts_df, n = 20)
## # A tibble: 54 × 3
##    learning_platform usefulness      count
##    <chr>             <chr>           <int>
##  1 Arxiv             Very useful      1316
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Not Useful         37
##  4 Blogs             Somewhat useful  2406
##  5 Blogs             Very useful      2314
##  6 Blogs             Not Useful         45
##  7 College           Very useful      1853
##  8 College           Somewhat useful  1405
##  9 College           Not Useful        101
## 10 Communities       Somewhat useful   567
## 11 Communities       Very useful       559
## 12 Communities       Not Useful         16
## 13 Company           Somewhat useful   502
## 14 Company           Very useful       438
## 15 Company           Not Useful         41
## 16 Conferences       Somewhat useful  1305
## 17 Conferences       Very useful       758
## 18 Conferences       Not Useful        119
## 19 Courses           Very useful      4195
## 20 Courses           Somewhat useful  1750
## # ℹ 34 more rows
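
As a quick check (not required by the exercise), the usefulness column should now contain only the three response levels visible in the counts above.

# Optional check: distinct usefulness levels after reshaping
usefulness_counts_df %>%
  distinct(usefulness)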

2. Compute Total and “At Least Useful” Responses

Next, we’ll aggregate the data for each platform to find the total number of responses and the number of responses that were “at least useful” (i.e., “Very useful” or “Somewhat useful”). We also calculate the percentage of useful responses.

# Task 2: Compute total and "at least useful" responses
platform_summary_df <- usefulness_counts_df %>%
  group_by(learning_platform) %>%
  summarise(
    total_responses = sum(count),
    at_least_useful = sum(
      count[usefulness %in% c("Very useful", "Somewhat useful")],
      na.rm = TRUE # Defensive only; counts produced by count() are never NA
    )
  ) %>%
  mutate(
    pct_useful = at_least_useful / total_responses
  ) %>%
  arrange(desc(at_least_useful))

# Show the result
print(platform_summary_df)
## # A tibble: 18 × 4
##    learning_platform total_responses at_least_useful pct_useful
##    <chr>                       <int>           <int>      <dbl>
##  1 Kaggle                       6583            6527      0.991
##  2 Courses                      5992            5945      0.992
##  3 SO                           5640            5576      0.989
##  4 YouTube                      5229            5125      0.980
##  5 Projects                     4794            4755      0.992
##  6 Blogs                        4765            4720      0.991
##  7 Textbook                     4181            4112      0.983
##  8 College                      3359            3258      0.970
##  9 Arxiv                        2391            2354      0.985
## 10 Documentation                2321            2279      0.982
## 11 Conferences                  2182            2063      0.945
## 12 Friends                      1581            1530      0.968
## 13 Tutoring                     1426            1394      0.978
## 14 Communities                  1142            1126      0.986
## 15 Podcasts                     1214            1090      0.898
## 16 Newsletters                  1089            1033      0.949
## 17 Company                       981             940      0.958
## 18 TradeBook                     333             324      0.973
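
An equivalent formulation, shown here only as a sketch (not part of the graded solution), is to flag the “at least useful” rows first and then sum the flagged counts; it should reproduce the table above.

# Alternative sketch: flag "at least useful" rows, then sum per platform
usefulness_counts_df %>%
  mutate(is_useful = usefulness %in% c("Very useful", "Somewhat useful")) %>%
  group_by(learning_platform) %>%
  summarise(
    total_responses = sum(count),
    at_least_useful = sum(count[is_useful])
  ) %>%
  mutate(pct_useful = at_least_useful / total_responses) %>%
  arrange(desc(at_least_useful))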

3. Top 10 Platforms and “Other” Group

Based on the previous results, we will create a summary table of the top 10 most useful platforms, grouping all others into an “Other” category. We’ll show the count of “at least useful” responses (aliased as count1) and the cumulative percentage.

We use fct_lump_n to easily group the non-top-10 platforms, then use fct_reorder to sort them by count and fct_relevel to move “Other” to the end, as shown in the hint.
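
To see why the final fct_relevel step is needed, here is a toy illustration with made-up values: once the non-top platforms are lumped, the combined “Other” level can easily be the largest, so fct_reorder alone would place it first.

# Toy illustration of the forcats steps (values are made up)
toy_df <- tibble(
  platform = c("A", "B", "C", "D", "E"),
  n        = c(50, 40, 30, 25, 20)
)

toy_df %>%
  mutate(platform = fct_lump_n(platform, n = 2, w = n)) %>%  # keep A and B, lump C-E into "Other"
  group_by(platform) %>%
  summarise(n = sum(n)) %>%                                  # "Other" sums to 75
  mutate(
    platform = fct_reorder(platform, n, .desc = TRUE),       # would put "Other" first...
    platform = fct_relevel(platform, "Other", after = Inf)   # ...so force it to the last level
  ) %>%
  arrange(platform)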

# Task 3: Top 10 platforms with "Other"

# Lump platforms outside the top 10 (by at_least_useful) into "Other"
# Then calculate cumulative percentages
final_summary_df <- platform_summary_df %>%
  select(learning_platform, count1 = at_least_useful) %>%
  # 1. Create the factor, lumping all but top 10 by count1 into "Other"
  mutate(
    learning_platform = fct_lump_n(learning_platform, n = 10, w = count1)
  ) %>%
  # 2. Group by the new factor and sum the counts
  group_by(learning_platform) %>%
  summarise(count1 = sum(count1)) %>%
  ungroup() %>%
  # 3. Reorder the factor: by count descending, then move "Other" to the end
  mutate(
    learning_platform = fct_reorder(learning_platform, count1, .desc = TRUE),
    learning_platform = fct_relevel(learning_platform, "Other", after = Inf)
  ) %>%
  # 4. Arrange by the factor level to get the correct order for cumsum
  arrange(learning_platform) %>%
  # 5. Calculate percentages
  mutate(
    pct = count1 / sum(count1),
    cum_pct = cumsum(pct)
  )

# Show the final tibble, which matches the format in the PDF
print(final_summary_df)
## # A tibble: 11 × 4
##    learning_platform count1    pct cum_pct
##    <fct>              <int>  <dbl>   <dbl>
##  1 Kaggle              6527 0.121    0.121
##  2 Courses             5945 0.110    0.230
##  3 SO                  5576 0.103    0.333
##  4 YouTube             5125 0.0946   0.428
##  5 Projects            4755 0.0878   0.516
##  6 Blogs               4720 0.0872   0.603
##  7 Textbook            4112 0.0759   0.679
##  8 College             3258 0.0602   0.739
##  9 Arxiv               2354 0.0435   0.782
## 10 Documentation       2279 0.0421   0.825
## 11 Other               9500 0.175    1
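
As a small consistency check (a sketch, not required by the exercise), the lumped counts should still add up to the Task 2 totals, and the cumulative percentage should end at 1.

# Optional consistency checks for the lumped table
sum(final_summary_df$count1) == sum(platform_summary_df$at_least_useful)
near(last(final_summary_df$cum_pct), 1)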

4. Plot the Top Learning Platforms

Finally, we visualize the results from Task 3. We create a bar chart showing the “at least useful” counts for the top 10 platforms and the “Other” category. The x-axis is ordered by the factor levels created in the previous step.

# Task 4: Plotting

# Add rank and formatted labels to the data for plotting
plot_data_df <- final_summary_df %>%
  mutate(
    rank = row_number(),
    # Create labels for Rank (or "Other") and the count
    rank_label = if_else(
      learning_platform == "Other",
      "Other",
      str_glue("Rank: {rank}")
    ),
    count_label = scales::comma(count1, accuracy = 1)
  )

# Create the plot
ggplot(plot_data_df, aes(x = learning_platform, y = count1)) +
  # Use geom_col for bar chart. Fill by platform, hide legend.
  geom_col(aes(fill = learning_platform), show.legend = FALSE) +
  
  # Add count labels *above* the bars
  geom_text(
    aes(label = count_label),
    vjust = -0.5, # Position above bar
    size = 3.5,
    fontface = "bold"
  ) +
  
  # Add rank labels *inside* the top of the bars
  geom_text(
    aes(label = rank_label),
    vjust = 1.5, # Position inside bar, near top
    color = "white",
    fontface = "bold",
    size = 3.5
  ) +
  
  # Apply formatting
  scale_fill_tq() + # Use tidyquant colors
  theme_tq() +      # Use tidyquant theme
  scale_y_continuous(
    labels = scales::comma,
    expand = expansion(mult = c(0, .1)) # Add space at top for labels
  ) +
  labs(
    title = "Top 10 Useful Learning Platforms",
    subtitle = "Based on 'Very Useful' or 'Somewhat Useful' responses",
    x = "Learning Platform",
    y = "Number of 'At Least Useful' Responses"
  ) +
  theme(
    # Rotate x-axis labels for readability
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
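
If a saved copy of the figure is needed, ggsave() from ggplot2 writes the most recent plot to disk; the file name and dimensions below are just examples.

# Save the last rendered plot (example file name and size)
ggsave("top_learning_platforms.png", width = 9, height = 6, dpi = 300)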