Inclass exercise

Questions

1. Count the usefulness by learning platform

Select only the columns with LearningPlatformUsefulness and remove rows where usefulness is NA.
Remove LearningPlatformUsefulness from each string in learning_platform.
Use count() to collapse the dataset to one row per (learning_platform, usefulness) pair, with a column n giving the number of entries for that pair.

# Load the survey responses from the CSV file.
data <- read_csv(file = "multipleChoiceResponses1.csv")

# Count responses per (learning_platform, usefulness) pair.
# pivot_longer() itself strips the shared "LearningPlatformUsefulness" prefix
# from the column names (names_prefix) and drops unanswered (NA) ratings
# (values_drop_na), leaving one row per respondent-platform rating.
df_platform <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  count(learning_platform, usefulness)

# Preview the first 10 platform/usefulness pairs
df_platform %>%
  head(n = 10)

## # A tibble: 10 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16

2. Compute the total number of responses and the number of responses that are at least useful

Exclude the rows rated Not Useful when counting the responses that are at least useful for each learning platform.

# Summarise each platform:
#   count           - responses with any rating other than "Not Useful"
#   tot             - all (non-missing) responses
#   perc_usefulness - count / tot, i.e. a proportion between 0 and 1
# Platforms are listed from most to fewest useful responses.
df_summary <- df_platform %>%
  # Zero out the "Not Useful" counts so a plain sum yields the useful total.
  mutate(useful_n = if_else(usefulness == "Not Useful", 0L, n)) %>%
  group_by(learning_platform) %>%
  summarise(
    count = sum(useful_n),
    tot = sum(n)
  ) %>%
  mutate(perc_usefulness = count / tot) %>%
  arrange(desc(count))

df_summary

## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Kaggle             6527  6583           0.991
##  2 Courses            5945  5992           0.992
##  3 SO                 5576  5640           0.989
##  4 YouTube            5125  5229           0.980
##  5 Projects           4755  4794           0.992
##  6 Blogs              4720  4765           0.991
##  7 Textbook           4112  4181           0.983
##  8 College            3258  3359           0.970
##  9 Arxiv              2354  2391           0.985
## 10 Documentation      2279  2321           0.982
## 11 Conferences        2063  2182           0.945
## 12 Friends            1530  1581           0.968
## 13 Tutoring           1394  1426           0.978
## 14 Communities        1126  1142           0.986
## 15 Podcasts           1090  1214           0.898
## 16 Newsletters        1033  1089           0.949
## 17 Company             940   981           0.958
## 18 TradeBook           324   333           0.973

3. Based on previous results, select the first two columns learning_platform and count

Show the top 10 useful learning platforms.
Show the cumulative percentage for the top 10 useful learning platforms (cum_pct); group the remaining platforms as Other.
Hint: use fct_reorder() and fct_relevel()

# Total number of useful responses over all platforms; this is the
# denominator of the cumulative share.
total_useful <- sum(df_summary$count)

# Keep exactly the 10 platforms with the most useful responses and compute
# their running share of all useful responses.
# with_ties = FALSE is required for "exactly 10": slice_max() keeps ties by
# default, so platforms sharing the 10th-largest count would otherwise yield
# more than 10 rows.
df_top10 <- df_summary %>%
  select(learning_platform, count) %>%
  slice_max(order_by = count, n = 10, with_ties = FALSE) %>%
  mutate(cum_pct = cumsum(count) / total_useful) %>%
  select(learning_platform, count1 = count, cum_pct)

# Lump every remaining platform into a single "Other" row. Its cumulative
# share is 1 because it completes the total.
other_count <- total_useful - sum(df_top10$count1)
other_row <- tibble(
  learning_platform = "Other",
  count1 = other_count,
  cum_pct = 1.0
)

# Turn the platform into a factor whose levels are ordered by count1
# (ascending), then move "Other" to the first level regardless of its size.
df_final <- bind_rows(df_top10, other_row) %>%
  mutate(
    learning_platform = learning_platform %>%
      fct_reorder(count1) %>%
      fct_relevel("Other", after = 0)
  )

df_final

## # A tibble: 11 × 3
##    learning_platform count1 cum_pct
##    <fct>              <int>   <dbl>
##  1 Kaggle              6527   0.121
##  2 Courses             5945   0.230
##  3 SO                  5576   0.333
##  4 YouTube             5125   0.428
##  5 Projects            4755   0.516
##  6 Blogs               4720   0.603
##  7 Textbook            4112   0.679
##  8 College             3258   0.739
##  9 Arxiv               2354   0.782
## 10 Documentation       2279   0.825
## 11 Other               9500   1

4. Based on the previous results, produce the following plot

Hint: You can use the example code from the week 12 lecture.

# Build the text label shown next to each bar.
# df_final lists the top platforms in descending order of count1 with "Other"
# as the last row, so the row position is the rank (1 = most useful
# responses). "Other" is an aggregate of the remaining platforms and gets no
# rank (NA).
df_plot <- df_final %>%
  mutate(
    rank = if_else(learning_platform == "Other", NA_integer_, row_number()),
    label_text = sprintf(
      "Rank: %s\nUseful: %s\nCumPct: %.1f%%",
      if_else(is.na(rank), "NA", as.character(rank)),
      # trim = TRUE stops format() from left-padding shorter numbers to a
      # common width, which would put stray spaces inside the label.
      format(count1, big.mark = ",", trim = TRUE),
      cum_pct * 100
    )
  )

# Horizontal bar chart: one bar per platform (length = count1), with the
# rank / useful count / cumulative share label printed to the right of it.
ggplot(data = df_plot, mapping = aes(x = count1, y = learning_platform)) +
  geom_col(alpha = 0.8, fill = "#2c3e50") +
  geom_text(
    mapping = aes(label = label_text),
    family = "mono",
    color = "#2c3e50",
    size = 3,
    hjust = -0.1
  ) +
  # The axis runs to 1.5x the longest bar, presumably to leave room on the
  # right for the text labels; tick marks stop at 10,000.
  scale_x_continuous(
    breaks = seq(from = 0, to = 10000, by = 2500),
    limits = c(0, 1.5 * max(df_plot$count1))
  ) +
  labs(
    title = NULL,
    x = "Number of responses with at least usefulness.",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 11, margin = margin(t = 10)),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )

Inclass exercise

Nomin Ayurzana

2025/11/18

Libraries

Questions

1. Count the usefulness by learning platform

2. Compute the total number of responses and the number of responses that are at least useful

3. Based on previous results, select the first two columns learning_platform and count

4. Based on the previous results, produce the following plot