Overview

Use the multipleChoiceResponses1.csv to answer the following questions:

  1. Count the usefulness by learning platform.
  2. Compute the number of total responses and number of responses which are at least useful.
  3. Based on previous results, select the first two columns learning_platform and count.
  4. Based on the previous results, plot the data.

Libraries

# Load libraries
library(tidyverse)
library(lubridate)
library(nycflights13)
library(tidyquant)

Load Data

# Import the survey responses; the CSV is expected in the working directory
responses <- read_csv(file = "multipleChoiceResponses1.csv")

Question 1: Count the usefulness by learning platform

Task:

  • Select only the columns with LearningPlatformUsefulness and remove rows where usefulness is NA.
  • Remove LearningPlatformUsefulness from each string in learning_platform.
  • Use count() to change the dataset to have one row per learning_platform usefulness pair with a column that is the number of entries with that pairing.
# Reshape the LearningPlatformUsefulness* columns to long form and tally each
# (platform, usefulness) combination.
#   - names_prefix strips the shared column-name prefix, leaving the platform.
#   - values_drop_na discards unanswered (NA) cells during the reshape.
usefulness_cols <- select(responses, starts_with("LearningPlatformUsefulness"))

q1_data <- usefulness_cols %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  count(learning_platform, usefulness)

# Preview the first 10 platform/usefulness pairs
q1_data %>% head(10)
## # A tibble: 10 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16

Question 2: Compute total responses and at least useful responses

Task:

  • Exclude “Not Useful” responses when counting the at-least-useful responses for each learning platform.
  • Compute the number of total responses and number of responses which are at least useful.
# Per-platform summary:
#   count           - responses rated anything other than "Not Useful"
#   tot             - all responses for the platform
#   perc_usefulness - share of responses that are at least useful
# The logical mask multiplies "Not Useful" tallies by zero, so only the
# at-least-useful tallies contribute to `count`.
platform_groups <- group_by(q1_data, learning_platform)

q2_data <- platform_groups %>%
  summarise(
    count = sum(n * (usefulness != "Not Useful")),
    tot = sum(n),
    perc_usefulness = count / tot
  ) %>%
  arrange(desc(count))

# Platforms ordered from most to fewest at-least-useful responses
q2_data
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Kaggle             6527  6583           0.991
##  2 Courses            5945  5992           0.992
##  3 SO                 5576  5640           0.989
##  4 YouTube            5125  5229           0.980
##  5 Projects           4755  4794           0.992
##  6 Blogs              4720  4765           0.991
##  7 Textbook           4112  4181           0.983
##  8 College            3258  3359           0.970
##  9 Arxiv              2354  2391           0.985
## 10 Documentation      2279  2321           0.982
## 11 Conferences        2063  2182           0.945
## 12 Friends            1530  1581           0.968
## 13 Tutoring           1394  1426           0.978
## 14 Communities        1126  1142           0.986
## 15 Podcasts           1090  1214           0.898
## 16 Newsletters        1033  1089           0.949
## 17 Company             940   981           0.958
## 18 TradeBook           324   333           0.973

Question 3: Top 10 useful learning platforms

Task:

  • Show the top 10 useful learning platforms.
  • Show the cumulative percent of top 10 useful learning platforms (cum_pct). Other platforms will be grouped as Other.
  • Hint: use fct_reorder() and fct_relevel()
# Total of at-least-useful responses across all platforms; this is the
# denominator for cum_pct and the basis for the "Other" bucket.
total_useful <- sum(q2_data$count)

# Keep the 10 platforms with the most at-least-useful responses.
# slice_max() returns rows in descending order of `count`, so cumsum()
# accumulates from the most useful platform downwards. with_ties = FALSE
# guarantees at most 10 rows even when counts are tied at the cutoff
# (the default, with_ties = TRUE, would return every tied row).
q3_data <- q2_data %>%
  select(learning_platform, count) %>%
  slice_max(count, n = 10, with_ties = FALSE) %>%
  mutate(
    count1 = count,
    cum_pct = cumsum(count) / total_useful
  )

# Add "Other" category: everything not in the top 10, closing cum_pct at 100%
other_count <- total_useful - sum(q3_data$count)
other_row <- tibble(
  learning_platform = "Other",
  count = other_count,
  count1 = other_count,
  cum_pct = 1.0
)

# Order factor levels by ascending count (so the largest bar is drawn at the
# top of a horizontal bar chart), then force "Other" to be the first level so
# it sits at the bottom regardless of its size.
q3_data <- q3_data %>%
  bind_rows(other_row) %>%
  mutate(
    learning_platform = fct_reorder(learning_platform, count1),
    learning_platform = fct_relevel(learning_platform, "Other", after = 0)
  ) %>%
  select(learning_platform, count1, cum_pct)

q3_data
## # A tibble: 11 × 3
##    learning_platform count1 cum_pct
##    <fct>              <int>   <dbl>
##  1 Kaggle              6527   0.121
##  2 Courses             5945   0.230
##  3 SO                  5576   0.333
##  4 YouTube             5125   0.428
##  5 Projects            4755   0.516
##  6 Blogs               4720   0.603
##  7 Textbook            4112   0.679
##  8 College             3258   0.739
##  9 Arxiv               2354   0.782
## 10 Documentation       2279   0.825
## 11 Other               9500   1

Question 4: Visualization

Task:

  • Based on the previous results, plot the data.
  • Hint: You can use the example code from the week 12 lecture.
# Prepare data for plotting with labels.
#
# The levels of learning_platform are "Other" first, then the platforms in
# ascending order of count, so the most useful platform holds the LAST level.
# Rank 1 therefore corresponds to level index nlevels(); subtracting the index
# from nlevels() + 1 yields ranks 1..k for the k named platforms. (Subtracting
# from a hard-coded 11 gave ranks 0..9 because there are 11 levels including
# "Other".) "Other" is an aggregate and gets no rank.
plot_data <- q3_data %>%
  mutate(
    rank = if_else(
      learning_platform == "Other",
      NA_integer_,
      nlevels(learning_platform) + 1L - as.integer(learning_platform)
    ),
    label_text = sprintf(
      "Rank: %s\nUseful: %s\nCumPct: %.1f%%",
      if_else(is.na(rank), "NA", as.character(rank)),
      # trim = TRUE stops format() from left-padding shorter numbers to a
      # common width, which would put stray spaces in the labels.
      format(count1, big.mark = ",", trim = TRUE),
      cum_pct * 100
    )
  )

# Horizontal bar chart: one bar per platform (plus "Other"), annotated with
# rank, useful-response count and cumulative percentage.
plot_data %>%
  ggplot(aes(y = learning_platform, x = count1)) +
  # Bars first, then the text annotations drawn just past each bar's end
  geom_col(alpha = 0.8, fill = "steelblue") +
  geom_text(
    aes(label = label_text),
    lineheight = 0.9,
    size = 3,
    hjust = -0.05
  ) +
  labs(
    x = "Number of responses with at least usefulness",
    y = NULL,
    title = "Learning Platform Usefulness Analysis",
    subtitle = "Top 10 Platforms by Number of Useful Responses"
  ) +
  # Extend the x-axis 40% beyond the longest bar so the labels are not clipped
  scale_x_continuous(
    breaks = seq(0, 10000, by = 2500),
    limits = c(0, 1.4 * max(plot_data$count1))
  ) +
  # theme_minimal() must precede theme() so the overrides below take effect
  theme_minimal() +
  theme(
    axis.text = element_text(size = 10),
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )

Summary

The analysis reveals that Kaggle is the most useful learning platform with 6,527 at-least-useful responses (12.1% of all at-least-useful responses), followed by Courses (5,945 responses, 23.0% cumulative) and Stack Overflow (5,576 responses, 33.3% cumulative). The top 10 platforms account for 82.5% of all at-least-useful responses, demonstrating the concentration of preferred learning resources among data science practitioners.