Libraries

# Load libraries
library(tidyverse)

Load Data

# Import the survey responses. Column names are repaired with the
# "universal" strategy and the column-type message is switched off.
data <- read_csv(
  file = "multipleChoiceResponses1.csv",
  show_col_types = FALSE,
  name_repair = "universal"
)

# Print a compact, column-by-column overview of the imported tibble
glimpse(data)
## Rows: 16,716
## Columns: 47
## $ LearningPlatformUsefulnessArxiv             <chr> NA, NA, "Very useful", NA,…
## $ LearningPlatformUsefulnessBlogs             <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCollege           <chr> NA, NA, "Somewhat useful",…
## $ LearningPlatformUsefulnessCompany           <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessConferences       <chr> "Very useful", NA, NA, "Ve…
## $ LearningPlatformUsefulnessFriends           <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessKaggle            <chr> NA, "Somewhat useful", "So…
## $ LearningPlatformUsefulnessNewsletters       <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessCommunities       <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessDocumentation     <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCourses           <chr> NA, NA, "Very useful", "Ve…
## $ LearningPlatformUsefulnessProjects          <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessPodcasts          <chr> "Very useful", NA, NA, NA,…
## $ LearningPlatformUsefulnessSO                <chr> NA, NA, NA, NA, NA, "Very …
## $ LearningPlatformUsefulnessTextbook          <chr> NA, NA, NA, NA, "Somewhat …
## $ LearningPlatformUsefulnessTradeBook         <chr> "Somewhat useful", NA, NA,…
## $ LearningPlatformUsefulnessTutoring          <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessYouTube           <chr> NA, NA, "Very useful", NA,…
## $ CurrentJobTitleSelect                       <chr> "DBA/Database Engineer", N…
## $ MLMethodNextYearSelect                      <chr> "Random Forests", "Random …
## $ WorkChallengeFrequencyPolitics              <chr> "Rarely", NA, NA, "Often",…
## $ WorkChallengeFrequencyUnusedResults         <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyUnusefulInstrumenting <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDeployment            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDirtyData             <chr> NA, NA, NA, "Often", NA, "…
## $ WorkChallengeFrequencyExplaining            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPass                  <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkChallengeFrequencyIntegration           <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTalent                <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDataFunds             <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDomainExpertise       <chr> NA, NA, NA, "Most of the t…
## $ WorkChallengeFrequencyML                    <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTools                 <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyExpectations          <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyITCoordination        <chr> NA, NA, NA, NA, "Sometimes…
## $ WorkChallengeFrequencyHiringFunds           <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPrivacy               <chr> "Often", NA, NA, "Often", …
## $ WorkChallengeFrequencyScaling               <chr> "Most of the time", NA, NA…
## $ WorkChallengeFrequencyEnvironments          <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyClarity               <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDataAccess            <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyOtherSelect           <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkInternalVsExternalTools                 <chr> "Do not know", NA, NA, "En…
## $ FormalEducation                             <chr> "Bachelor's degree", "Mast…
## $ Age                                         <dbl> NA, 30, 28, 56, 38, 46, 35…
## $ DataScienceIdentitySelect                   <chr> "Yes", "Yes", "Yes", "Yes"…
## $ JobSatisfaction                             <chr> "5", NA, NA, "10 - Highly …

Question 1: Count the usefulness by learning platform

Task:

# Reshape the LearningPlatformUsefulness* columns into long form and tally
# every platform/usefulness pair.
q1_data <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    # Strip the shared prefix so only the platform name remains
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    # Unanswered (NA) ratings are dropped during the reshape
    values_drop_na = TRUE
  ) %>%
  # One row per platform/usefulness pair, with its frequency in `n`
  count(learning_platform, usefulness)

q1_data
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows

Question 2: Compute the total number of responses and the number of responses rated at least somewhat useful

Task:

# Per platform: responses rated anything other than "Not Useful" (`count`),
# all responses (`tot`), and the share of the former (`perc_usefulness`).
q2_data <- q1_data %>%
  # Zero out the "Not Useful" tallies so a plain sum yields the useful count
  mutate(useful_n = if_else(usefulness != "Not Useful", n, 0L)) %>%
  group_by(learning_platform) %>%
  summarise(
    count = sum(useful_n),
    tot = sum(n),
    perc_usefulness = count / tot  # proportion in [0, 1]
  ) %>%
  arrange(learning_platform)

q2_data
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2354  2391           0.985
##  2 Blogs              4720  4765           0.991
##  3 College            3258  3359           0.970
##  4 Communities        1126  1142           0.986
##  5 Company             940   981           0.958
##  6 Conferences        2063  2182           0.945
##  7 Courses            5945  5992           0.992
##  8 Documentation      2279  2321           0.982
##  9 Friends            1530  1581           0.968
## 10 Kaggle             6527  6583           0.991
## 11 Newsletters        1033  1089           0.949
## 12 Podcasts           1090  1214           0.898
## 13 Projects           4755  4794           0.992
## 14 SO                 5576  5640           0.989
## 15 Textbook           4112  4181           0.983
## 16 TradeBook           324   333           0.973
## 17 Tutoring           1394  1426           0.978
## 18 YouTube            5125  5229           0.980

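Question 3: Based on the previous results, select the columns learning_platform and count, keep the top 10 platforms with their cumulative percentage, and group the remaining platforms as "Other"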

Task:

# Ten platforms with the most useful responses, plus each one's running
# share of the useful responses across all platforms.
q3_data <- q2_data %>%
  arrange(desc(count)) %>%
  slice_head(n = 10) %>%
  select(learning_platform, count1 = count) %>%
  mutate(cum_pct = cumsum(count1) / sum(q2_data$count))

# Everything outside the top ten is collapsed into a single "Other" row,
# which by construction closes the cumulative share at 100%.
other_count <- sum(q2_data$count) - sum(q3_data$count1)
other_row <- tibble(
  learning_platform = "Other",
  count1 = other_count,
  cum_pct = 1.0
)

# Order the factor levels by ascending count, then move "Other" to the
# first level.
q3_data <- bind_rows(q3_data, other_row) %>%
  mutate(
    learning_platform = fct_relevel(
      fct_reorder(learning_platform, count1, .desc = FALSE),
      "Other",
      after = 0
    )
  )

q3_data
## # A tibble: 11 × 3
##    learning_platform count1 cum_pct
##    <fct>              <int>   <dbl>
##  1 Kaggle              6527   0.121
##  2 Courses             5945   0.230
##  3 SO                  5576   0.333
##  4 YouTube             5125   0.428
##  5 Projects            4755   0.516
##  6 Blogs               4720   0.603
##  7 Textbook            4112   0.679
##  8 College             3258   0.739
##  9 Arxiv               2354   0.782
## 10 Documentation       2279   0.825
## 11 Other               9500   1

Question 4: Based on the previous results, produce the following plot

Task:

# Prepare labels for the plot: one three-line annotation per bar holding the
# platform's rank, its useful-response count and its cumulative percentage.
q4_data <- q3_data %>%
  mutate(
    # Rank only the real platforms. "Other" is masked to NA before ranking,
    # so it gets an NA rank and the platforms are numbered 1..k no matter
    # how large the "Other" bucket is. (Ranking all rows and subtracting 1
    # is only correct while "Other" happens to hold the largest count.)
    rank = row_number(
      desc(replace(count1, learning_platform == "Other", NA))
    ),
    label_text = paste0(
      "Rank: ", if_else(is.na(rank), "NA", as.character(rank)), "\n",
      # trim = TRUE stops format() from left-padding shorter numbers to the
      # width of the longest one
      "Useful: ", format(count1, big.mark = ",", trim = TRUE), "\n",
      "CumPct: ", sprintf("%.1f%%", cum_pct * 100)
    )
  )

# Horizontal bar chart of useful-response counts, with the prepared
# annotation printed just past the end of each bar.
q4_data %>%
  ggplot(aes(x = learning_platform, y = count1)) +
  geom_col(alpha = 0.8, fill = "steelblue") +
  geom_text(
    aes(label = label_text),
    size = 3,
    hjust = -0.1,
    lineheight = 0.9
  ) +
  scale_y_continuous(
    breaks = seq(0, 10000, 2500),
    # Upper limit is 30% above the tallest bar, presumably to leave room
    # for the text drawn beyond the bar ends
    limits = c(0, max(q4_data$count1) * 1.3)
  ) +
  # Flip so platforms run along the vertical axis
  coord_flip() +
  labs(
    x = "Learning platform",
    y = "Number of responses with at least usefulness",
    title = "Top 10 learning platform"
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    plot.title = element_text(hjust = 0.5, size = 14)
  )