Homework 7

Load Libraries

# Load required libraries
library(tidyverse)
library(lubridate)
library(knitr)

Read Data

# Load the survey responses from the CSV file (path is relative to the
# working directory) into a tibble named `data`.
data <- readr::read_csv(file = "multipleChoiceResponses1.csv")

Question 1: Count the Usefulness by Learning Platform

Select only the columns whose names start with LearningPlatformUsefulness, reshape them to long format (one row per response and learning platform), and remove rows where usefulness is NA. Remove “LearningPlatformUsefulness” from each string in learning_platform. Use count() to reduce the dataset to one row per learning_platform–usefulness pair.

# Build one row per (learning platform, usefulness rating) pair together with
# the number of responses `n` for that pair.
# Only the LearningPlatformUsefulness* columns are kept. While pivoting to long
# format, the shared prefix is stripped from the former column names and rows
# with a missing rating are dropped.
platform_usefulness <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  count(learning_platform, usefulness)

# Render the first 10 platform/rating counts as a captioned table.
kable(
  head(platform_usefulness, 10),
  caption = "Count of Usefulness by Learning Platform (First 10 Rows)"
)

Count of Usefulness by Learning Platform (First 10 Rows)
learning_platform	usefulness	n
Arxiv	Not Useful	37
Arxiv	Somewhat useful	1038
Arxiv	Very useful	1316
Blogs	Not Useful	45
Blogs	Somewhat useful	2406
Blogs	Very useful	2314
College	Not Useful	101
College	Somewhat useful	1405
College	Very useful	1853
Communities	Not Useful	16

Question 2: Total Responses and At-Least-Useful Responses

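For each learning platform, compute the total number of responses, the number of responses rated at least somewhat useful (i.e., everything except “Not Useful”), and the share of responses that are at least somewhat useful.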

# Summarise each learning platform:
#   count           - responses with any rating other than "Not Useful"
#   tot             - all responses
#   perc_usefulness - count / tot (a proportion, not multiplied by 100)
# Rows are sorted by `count`, largest first.
platform_summary <- platform_usefulness %>%
  group_by(learning_platform) %>%
  summarise(
    # Multiplying by the logical mask zeroes out the "Not Useful" rows.
    count = sum(n * (usefulness != "Not Useful")),
    tot = sum(n),
    perc_usefulness = count / tot
  ) %>%
  arrange(desc(count))

# Render the per-platform summary with readable column headers, rounding
# numeric values to three decimals. Note: the "% Usefulness" column holds
# the proportion count / tot (0-1 scale).
kable(
  platform_summary,
  caption = "Summary of Learning Platform Usefulness",
  col.names = c("Learning Platform", "At Least Useful", "Total", "% Usefulness"),
  digits = 3
)

Summary of Learning Platform Usefulness
Learning Platform	At Least Useful	Total	% Usefulness
Kaggle	6527	6583	0.991
Courses	5945	5992	0.992
SO	5576	5640	0.989
YouTube	5125	5229	0.980
Projects	4755	4794	0.992
Blogs	4720	4765	0.991
Textbook	4112	4181	0.983
College	3258	3359	0.970
Arxiv	2354	2391	0.985
Documentation	2279	2321	0.982
Conferences	2063	2182	0.945
Friends	1530	1581	0.968
Tutoring	1394	1426	0.978
Communities	1126	1142	0.986
Podcasts	1090	1214	0.898
Newsletters	1033	1089	0.949
Company	940	981	0.958
TradeBook	324	333	0.973

Question 3: Top 10 Platforms with Cumulative Percentage

Select the first two columns (learning_platform and count), keep the top 10 learning platforms by useful-response count, and calculate each platform's cumulative percentage of the useful responses across all platforms. The remaining platforms are grouped together as “Other”.

# Keep the 10 platforms with the most at-least-useful responses
# (platform_summary is already sorted by `count`, descending), rename `count`
# to `count1`, and add the running share of the useful responses summed over
# ALL platforms, not just the top 10.
top_platforms <- platform_summary %>%
  slice_head(n = 10) %>%
  select(learning_platform, count1 = count) %>%
  mutate(cum_pct = cumsum(count1) / sum(platform_summary$count))

# Total at-least-useful count for every platform ranked below the top 10;
# this becomes the "Other" category.
# Rows are selected by position (row_number() > 10) rather than with
# slice_tail(n = nrow(...) - 10): with fewer than 10 platforms that `n` would
# be negative, which dplyr interprets as "drop rows from the front", so a
# non-empty slice would be summed. Here the result is 0 whenever there are
# 10 or fewer platforms.
other_count <- platform_summary %>%
  filter(row_number() > 10) %>%
  summarize(count1 = sum(count)) %>%
  pull(count1)

# Append the aggregated "Other" row beneath the top platforms. Its cumulative
# share is fixed at 1 because it completes the total. The platform column is
# then converted to a factor whose levels follow row order, with "Other"
# forced to be the final level.
top_platforms_with_other <- bind_rows(
  top_platforms,
  tibble(learning_platform = "Other", count1 = other_count, cum_pct = 1.0)
) %>%
  mutate(
    learning_platform = fct_relevel(
      fct_inorder(learning_platform),
      "Other",
      after = Inf
    )
  )

# Render the top-10-plus-"Other" table with readable headers, rounding numeric
# values to three decimals. Note: "Cumulative %" holds a cumulative
# proportion (0-1 scale).
kable(
  top_platforms_with_other,
  caption = "Top 10 Learning Platforms with Cumulative Percentage",
  col.names = c("Learning Platform", "At Least Useful Count", "Cumulative %"),
  digits = 3
)

Top 10 Learning Platforms with Cumulative Percentage
Learning Platform	At Least Useful Count	Cumulative %
Kaggle	6527	0.121
Courses	5945	0.230
SO	5576	0.333
YouTube	5125	0.428
Projects	4755	0.516
Blogs	4720	0.603
Textbook	4112	0.679
College	3258	0.739
Arxiv	2354	0.782
Documentation	2279	0.825
Other	9500	1.000

Question 4: Visualization

Create a horizontal bar chart showing the top 10 learning platforms plus the combined “Other” group, with each bar labelled by rank, useful count, and cumulative percentage.

# Prepare data for plotting.
#   rank  - position within the top platforms (row order); NA for "Other"
#   label - multi-line bar annotation: rank, useful count, cumulative percent
# The factor levels are reversed rather than re-sorted by count: ggplot draws
# the first level at the bottom of a discrete y axis, so reversing puts rank 1
# at the top and keeps "Other" at the bottom. Re-sorting by count would place
# "Other" above rank 1 whenever its combined count is the largest.
plot_data <- top_platforms_with_other %>%
  mutate(
    rank = if_else(learning_platform == "Other", NA_integer_, row_number()),
    label = sprintf(
      "Rank: %s\nUseful: %s\nCumPct: %.1f%%",
      coalesce(as.character(rank), "NA"),
      # trim = TRUE stops format() from left-padding shorter numbers to a
      # common width, which would put stray spaces inside the labels.
      format(count1, big.mark = ",", trim = TRUE),
      cum_pct * 100
    ),
    learning_platform = fct_rev(learning_platform)
  )

# Horizontal bar chart of useful-response counts per platform. Each bar is
# annotated with its pre-built label just past the bar's end (negative hjust
# pushes the text to the right).
ggplot(data = plot_data, mapping = aes(x = count1, y = learning_platform)) +
  geom_col(alpha = 0.8, fill = "steelblue") +
  geom_text(
    mapping = aes(label = label),
    size = 3.5,
    lineheight = 0.9,
    hjust = -0.1
  ) +
  labs(
    title = "Learning Platform Usefulness Analysis",
    x = "Number of responses with at least usefulness",
    y = NULL
  ) +
  # Extend the x axis 40% past the longest bar to leave room for the labels,
  # and remove the default padding so bars start exactly at zero.
  scale_x_continuous(
    expand = c(0, 0),
    limits = c(0, 1.4 * max(plot_data$count1))
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 11),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )