Libraries

library(tidyverse)
library(lubridate)
library(nycflights13)
library(tidyquant)

Questions

Use the multipleChoiceResponses1.csv to answer the following questions:

1. Count the usefulness by learning platform

  • Select only the columns with LearningPlatformUsefulness and remove rows where usefulness is NA.
  • Remove LearningPlatformUsefulness from each string in learning_platform.
  • Use count() to change the dataset to have one row per learning_platform usefulness pair with a column that is the number of entries with that pairing.
# Load the survey responses.
# If the file is stored somewhere else, supply its full path instead:
# data <- read_csv("C:/path/to/your/multipleChoiceResponses1.csv")
data <- read_csv("multipleChoiceResponses1.csv")

# Build one row per (learning_platform, usefulness) pair with its frequency n:
#   1. keep only the LearningPlatformUsefulness* columns,
#   2. stack them into long form, dropping unanswered (NA) cells on the way,
#   3. strip the shared prefix so only the platform name remains,
#   4. tally each platform/usefulness combination.
df_platform <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  mutate(
    learning_platform = str_remove(
      learning_platform,
      "LearningPlatformUsefulness"
    )
  ) %>%
  count(learning_platform, usefulness)

# Preview the first 10 rows
head(df_platform, n = 10)
## # A tibble: 10 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16

2. Compute the total number of responses and the number of responses that are at least useful

  • Filter out rows with Not Useful from the learning platform.
# Summarise each learning platform:
#   count           - responses rated anything other than "Not Useful"
#   tot             - all responses for the platform
#   perc_usefulness - count / tot (a proportion between 0 and 1)
# Rows are ordered from the most to the fewest useful responses.
df_summary <- df_platform %>%
  group_by(learning_platform) %>%
  summarise(
    count = sum(n[usefulness != "Not Useful"]),
    tot = sum(n),
    perc_usefulness = count / tot,
    .groups = "drop"
  ) %>%
  arrange(desc(count))

df_summary
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Kaggle             6527  6583           0.991
##  2 Courses            5945  5992           0.992
##  3 SO                 5576  5640           0.989
##  4 YouTube            5125  5229           0.980
##  5 Projects           4755  4794           0.992
##  6 Blogs              4720  4765           0.991
##  7 Textbook           4112  4181           0.983
##  8 College            3258  3359           0.970
##  9 Arxiv              2354  2391           0.985
## 10 Documentation      2279  2321           0.982
## 11 Conferences        2063  2182           0.945
## 12 Friends            1530  1581           0.968
## 13 Tutoring           1394  1426           0.978
## 14 Communities        1126  1142           0.986
## 15 Podcasts           1090  1214           0.898
## 16 Newsletters        1033  1089           0.949
## 17 Company             940   981           0.958
## 18 TradeBook           324   333           0.973

3. Based on previous results, select the first two columns learning_platform and count

  • Show the top 10 useful learning platforms.
  • Show the cumulative percent of top 10 useful learning platforms (cum_pct). Other platforms will be grouped as Other.
  • Hint: use fct_reorder() and fct_relevel()
# Grand total of "at least useful" responses across every platform; used as
# the denominator of the cumulative share and to derive the "Other" bucket.
total_useful <- sum(df_summary$count)

# Keep the 10 platforms with the most useful responses and compute the running
# share of the grand total. slice_max() returns rows in descending order of
# count1, which is what cumsum() relies on. with_ties = FALSE guarantees
# exactly 10 rows even when counts tie at the cut-off (the default keeps every
# tied row and could return more than 10).
df_top10 <- df_summary %>%
  select(learning_platform, count1 = count) %>%
  slice_max(order_by = count1, n = 10, with_ties = FALSE) %>%
  mutate(cum_pct = cumsum(count1) / total_useful)

# Collapse all remaining platforms into a single "Other" row; its cumulative
# share is 1 by construction.
other_count <- total_useful - sum(df_top10$count1)
other_row <- tibble(
  learning_platform = "Other",
  count1 = other_count,
  cum_pct = 1.0
)

# Turn the platform into a factor whose levels follow ascending count1, then
# move "Other" to the first level regardless of its size.
df_final <- bind_rows(df_top10, other_row) %>%
  mutate(learning_platform = fct_reorder(learning_platform, count1)) %>%
  mutate(learning_platform = fct_relevel(learning_platform, "Other", after = 0))

df_final
## # A tibble: 11 × 3
##    learning_platform count1 cum_pct
##    <fct>              <int>   <dbl>
##  1 Kaggle              6527   0.121
##  2 Courses             5945   0.230
##  3 SO                  5576   0.333
##  4 YouTube             5125   0.428
##  5 Projects            4755   0.516
##  6 Blogs               4720   0.603
##  7 Textbook            4112   0.679
##  8 College             3258   0.739
##  9 Arxiv               2354   0.782
## 10 Documentation       2279   0.825
## 11 Other               9500   1

4. Based on the previous results, produce the following plot

  • Hint: You can use the example code from the week 12 lecture.
# Build the text label shown next to each bar.
# df_final lists the top platforms in descending order of count1 followed by
# "Other", so the row number of a platform is its rank (1 = most useful).
# "Other" is an aggregate and gets no rank. The rank is printed as-is: the
# earlier `11 - rank` inverted it (labelling the top platform "Rank: 10") and
# hard-coded the number of rows.
df_plot <- df_final %>%
  mutate(
    rank = if_else(learning_platform == "Other", NA_integer_, row_number()),
    label_text = sprintf(
      "Rank: %s | Useful: %s | CumPct: %.1f%%",
      if_else(is.na(rank), "NA", as.character(rank)),
      # trim = TRUE stops format() from left-padding shorter numbers with
      # spaces to a common width.
      format(count1, big.mark = ",", trim = TRUE),
      cum_pct * 100
    )
  )

# Horizontal bar chart: one bar per platform, label printed to its right.
ggplot(df_plot, aes(x = count1, y = learning_platform)) +
  geom_col(fill = "#27ae60", alpha = 0.75, width = 0.65) +
  geom_text(
    aes(label = label_text),
    hjust = -0.08,
    size = 3.2,
    color = "#34495e"
  ) +
  scale_x_continuous(
    # Extend the axis well past the longest bar so the labels are not clipped.
    limits = c(0, max(df_plot$count1) * 1.48),
    breaks = seq(0, 10000, 2500),
    labels = scales::comma
  ) +
  labs(
    x = "Number of responses (at least useful)",
    y = "",
    title = "Learning Platform Usefulness Analysis"
  ) +
  theme_light(base_size = 11) +
  # NOTE(review): ggplot2 >= 3.4.0 deprecates `size` on element_line() and
  # element_rect() in favour of `linewidth`; switch once the minimum ggplot2
  # version in use is confirmed.
  theme(
    plot.title = element_text(
      size = 15,
      face = "bold",
      color = "#2c3e50",
      margin = margin(b = 10)
    ),
    panel.grid.major.y = element_line(color = "gray85", size = 0.2),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_line(color = "gray80", size = 0.4),
    panel.border = element_rect(color = "gray70", size = 0.5),
    axis.text.y = element_text(size = 10.5, color = "#34495e"),
    axis.text.x = element_text(size = 9.5),
    axis.title.x = element_text(size = 11, margin = margin(t = 10)),
    plot.margin = margin(12, 12, 12, 12)
  )