Use the multipleChoiceResponses1.csv to answer the
following questions:
# Load libraries
library(tidyverse)
library(lubridate)
library(nycflights13)
library(tidyquant)
# Import the survey responses from the CSV file (the path is relative to the
# current working directory)
responses <- read_csv(file = "multipleChoiceResponses1.csv")
Task:
Use `count()` to change the dataset to have one row per
`learning_platform`–`usefulness` pair, with a column that is the number of
entries with that pairing.
# Reshape the usefulness ratings into long form (one row per answered
# platform/rating), strip the shared column-name prefix, and tally each
# platform/rating combination.
q1_data <- responses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE # unanswered ratings are dropped during the pivot
  ) %>%
  mutate(
    learning_platform = str_remove(
      learning_platform,
      "LearningPlatformUsefulness"
    )
  ) %>%
  count(learning_platform, usefulness, name = "n")
# Show the first 10 platform/rating counts
q1_data %>% head(10)
## # A tibble: 10 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
Task:
# Per platform: number of responses rated anything other than "Not Useful",
# total number of responses, and the share of the former in the latter.
# Platforms are listed from most to fewest useful responses.
q2_data <- q1_data %>%
  group_by(learning_platform) %>%
  summarise(
    # The logical mask zeroes out the "Not Useful" tallies before summing
    count = sum(n * (usefulness != "Not Useful")),
    tot = sum(n),
    perc_usefulness = count / tot
  ) %>%
  arrange(desc(count))
q2_data
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Kaggle 6527 6583 0.991
## 2 Courses 5945 5992 0.992
## 3 SO 5576 5640 0.989
## 4 YouTube 5125 5229 0.980
## 5 Projects 4755 4794 0.992
## 6 Blogs 4720 4765 0.991
## 7 Textbook 4112 4181 0.983
## 8 College 3258 3359 0.970
## 9 Arxiv 2354 2391 0.985
## 10 Documentation 2279 2321 0.982
## 11 Conferences 2063 2182 0.945
## 12 Friends 1530 1581 0.968
## 13 Tutoring 1394 1426 0.978
## 14 Communities 1126 1142 0.986
## 15 Podcasts 1090 1214 0.898
## 16 Newsletters 1033 1089 0.949
## 17 Company 940 981 0.958
## 18 TradeBook 324 333 0.973
Task:
Order the platforms for plotting using `fct_reorder()` and
fct_relevel()
# Grand total of useful responses across every platform; computed once and
# reused for the cumulative share and for the "Other" bucket.
total_useful <- sum(q2_data$count)

# Keep the ten platforms with the most useful responses.
# with_ties = FALSE guarantees exactly ten rows even when several platforms
# share the 10th-highest count (the default would return all tied rows).
q3_data <- q2_data %>%
  select(learning_platform, count) %>%
  slice_max(count, n = 10, with_ties = FALSE) %>%
  mutate(
    count1 = count,
    cum_pct = cumsum(count) / total_useful
  )

# Add "Other" category: everything outside the top ten, which by construction
# brings the cumulative share to 100%.
other_count <- total_useful - sum(q3_data$count)
other_row <- tibble(
  learning_platform = "Other",
  count = other_count,
  count1 = other_count,
  cum_pct = 1.0
)

# Convert the platform to a factor ordered by count (ascending), then force
# "Other" to be the first level regardless of its size.
q3_data <- q3_data %>%
  bind_rows(other_row) %>%
  mutate(
    learning_platform = fct_reorder(learning_platform, count1),
    learning_platform = fct_relevel(learning_platform, "Other", after = 0)
  ) %>%
  select(learning_platform, count1, cum_pct)
q3_data
## # A tibble: 11 × 3
## learning_platform count1 cum_pct
## <fct> <int> <dbl>
## 1 Kaggle 6527 0.121
## 2 Courses 5945 0.230
## 3 SO 5576 0.333
## 4 YouTube 5125 0.428
## 5 Projects 4755 0.516
## 6 Blogs 4720 0.603
## 7 Textbook 4112 0.679
## 8 College 3258 0.739
## 9 Arxiv 2354 0.782
## 10 Documentation 2279 0.825
## 11 Other 9500 1
Task:
# Prepare data for plotting with labels.
# The factor levels of learning_platform are ordered "Other" first, then the
# platforms by ascending count, so the highest level code is the top platform.
# Rank 1 therefore corresponds to the last level:
#   rank = nlevels - level_code + 1
# (the previous `11 - level_code` produced ranks 0-9 instead of 1-10).
plot_data <- q3_data %>%
  mutate(
    rank = if_else(
      learning_platform == "Other",
      NA_integer_, # the catch-all bucket has no rank
      nlevels(learning_platform) - as.integer(learning_platform) + 1L
    ),
    label_text = sprintf(
      "Rank: %s\nUseful: %s\nCumPct: %.1f%%",
      if_else(is.na(rank), "NA", as.character(rank)),
      # trim = TRUE stops format() from left-padding shorter numbers to a
      # common width, which would put stray spaces in the labels
      format(count1, big.mark = ",", trim = TRUE),
      cum_pct * 100
    )
  )

# Create the plot: horizontal bars, one per platform, annotated with rank,
# useful-response count and cumulative share.
ggplot(plot_data, aes(x = count1, y = learning_platform)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  geom_text(
    aes(label = label_text),
    hjust = -0.05, # start the label just past the end of the bar
    size = 3,
    lineheight = 0.9
  ) +
  scale_x_continuous(
    # Extend the axis 40% past the longest bar so the labels are not clipped
    limits = c(0, max(plot_data$count1) * 1.4),
    breaks = seq(0, 10000, 2500)
  ) +
  labs(
    title = "Learning Platform Usefulness Analysis",
    subtitle = "Top 10 Platforms by Number of Useful Responses",
    x = "Number of responses with at least usefulness",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11),
    axis.text = element_text(size = 10)
  )
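The analysis shows that Kaggle received the most useful responses of any learning platform (6,527, or 12.1% of all useful responses), followed by Courses (5,945 responses, 23.0% cumulative) and Stack Overflow (5,576 responses, 33.3% cumulative). The top 10 platforms account for 82.5% of all useful responses, demonstrating the concentration of preferred learning resources among data science practitioners. Note that this ranking reflects response volume rather than the share of favourable ratings: every platform except Podcasts (89.8%) was rated at least somewhat useful by more than 94% of its respondents.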