# Load libraries
library(tidyverse)
# Import the survey responses.
# name_repair = "universal" rewrites the header names so they are unique and
# syntactic; show_col_types = FALSE silences the column-type message.
data <- read_csv(
  file = "multipleChoiceResponses1.csv",
  name_repair = "universal",
  show_col_types = FALSE
)
# Print a compact overview: one line per column with its type and first values
glimpse(data)
## Rows: 16,716
## Columns: 47
## $ LearningPlatformUsefulnessArxiv <chr> NA, NA, "Very useful", NA,…
## $ LearningPlatformUsefulnessBlogs <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCollege <chr> NA, NA, "Somewhat useful",…
## $ LearningPlatformUsefulnessCompany <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessConferences <chr> "Very useful", NA, NA, "Ve…
## $ LearningPlatformUsefulnessFriends <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessKaggle <chr> NA, "Somewhat useful", "So…
## $ LearningPlatformUsefulnessNewsletters <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessCommunities <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessDocumentation <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessCourses <chr> NA, NA, "Very useful", "Ve…
## $ LearningPlatformUsefulnessProjects <chr> NA, NA, NA, "Very useful",…
## $ LearningPlatformUsefulnessPodcasts <chr> "Very useful", NA, NA, NA,…
## $ LearningPlatformUsefulnessSO <chr> NA, NA, NA, NA, NA, "Very …
## $ LearningPlatformUsefulnessTextbook <chr> NA, NA, NA, NA, "Somewhat …
## $ LearningPlatformUsefulnessTradeBook <chr> "Somewhat useful", NA, NA,…
## $ LearningPlatformUsefulnessTutoring <chr> NA, NA, NA, NA, NA, NA, NA…
## $ LearningPlatformUsefulnessYouTube <chr> NA, NA, "Very useful", NA,…
## $ CurrentJobTitleSelect <chr> "DBA/Database Engineer", N…
## $ MLMethodNextYearSelect <chr> "Random Forests", "Random …
## $ WorkChallengeFrequencyPolitics <chr> "Rarely", NA, NA, "Often",…
## $ WorkChallengeFrequencyUnusedResults <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyUnusefulInstrumenting <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDeployment <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDirtyData <chr> NA, NA, NA, "Often", NA, "…
## $ WorkChallengeFrequencyExplaining <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPass <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkChallengeFrequencyIntegration <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTalent <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDataFunds <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyDomainExpertise <chr> NA, NA, NA, "Most of the t…
## $ WorkChallengeFrequencyML <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyTools <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyExpectations <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyITCoordination <chr> NA, NA, NA, NA, "Sometimes…
## $ WorkChallengeFrequencyHiringFunds <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyPrivacy <chr> "Often", NA, NA, "Often", …
## $ WorkChallengeFrequencyScaling <chr> "Most of the time", NA, NA…
## $ WorkChallengeFrequencyEnvironments <chr> NA, NA, NA, "Often", "Some…
## $ WorkChallengeFrequencyClarity <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyDataAccess <chr> NA, NA, NA, "Often", NA, N…
## $ WorkChallengeFrequencyOtherSelect <chr> NA, NA, NA, NA, NA, NA, NA…
## $ WorkInternalVsExternalTools <chr> "Do not know", NA, NA, "En…
## $ FormalEducation <chr> "Bachelor's degree", "Mast…
## $ Age <dbl> NA, 30, 28, 56, 38, 46, 35…
## $ DataScienceIdentitySelect <chr> "Yes", "Yes", "Yes", "Yes"…
## $ JobSatisfaction <chr> "5", NA, NA, "10 - Highly …
Task:
# Reshape the LearningPlatformUsefulness* columns into long form (one row per
# answered platform rating) and tally each platform/usefulness combination
q1_data <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    # Strip the shared column prefix so only the platform name remains
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    # Drop unanswered (NA) ratings while reshaping
    values_drop_na = TRUE
  ) %>%
  # Count occurrences of each platform-usefulness pair
  count(learning_platform, usefulness)
q1_data
## # A tibble: 54 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
## # ℹ 44 more rows
Task:
# Per platform: number of responses rated anything other than "Not Useful",
# total number of responses, and the ratio of the two
q2_data <- q1_data %>%
  group_by(learning_platform) %>%
  summarise(
    # At least useful: zero out the "Not Useful" tallies before adding up
    count = sum(if_else(usefulness != "Not Useful", n, 0L)),
    # Total responses
    tot = sum(n),
    # Share of at-least-useful responses (a 0-1 proportion, not scaled to 100)
    perc_usefulness = count / tot
  ) %>%
  arrange(learning_platform)
q2_data
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2354 2391 0.985
## 2 Blogs 4720 4765 0.991
## 3 College 3258 3359 0.970
## 4 Communities 1126 1142 0.986
## 5 Company 940 981 0.958
## 6 Conferences 2063 2182 0.945
## 7 Courses 5945 5992 0.992
## 8 Documentation 2279 2321 0.982
## 9 Friends 1530 1581 0.968
## 10 Kaggle 6527 6583 0.991
## 11 Newsletters 1033 1089 0.949
## 12 Podcasts 1090 1214 0.898
## 13 Projects 4755 4794 0.992
## 14 SO 5576 5640 0.989
## 15 Textbook 4112 4181 0.983
## 16 TradeBook 324 333 0.973
## 17 Tutoring 1394 1426 0.978
## 18 YouTube 5125 5229 0.980
Task:
# Keep the ten platforms with the most at-least-useful responses and express
# their running total as a share of the useful responses over ALL platforms
q3_data <- q2_data %>%
  select(learning_platform, count1 = count) %>%
  arrange(desc(count1)) %>%
  slice_head(n = 10) %>%
  mutate(cum_pct = cumsum(count1) / sum(q2_data$count))
# Add "Other" row: everything not covered by the ten rows kept above
other_count <- sum(q2_data$count) - sum(q3_data$count1)
other_row <- tibble(
  learning_platform = "Other",
  count1 = other_count,
  # The remainder closes the cumulative share at 100%
  cum_pct = 1
)
# Turn the platform into a factor whose levels ascend by count, then force
# "Other" to be the first level regardless of its count
q3_data <- bind_rows(q3_data, other_row) %>%
  mutate(
    learning_platform = fct_relevel(
      fct_reorder(learning_platform, count1),
      "Other",
      after = 0
    )
  )
q3_data
## # A tibble: 11 × 3
## learning_platform count1 cum_pct
## <fct> <int> <dbl>
## 1 Kaggle 6527 0.121
## 2 Courses 5945 0.230
## 3 SO 5576 0.333
## 4 YouTube 5125 0.428
## 5 Projects 4755 0.516
## 6 Blogs 4720 0.603
## 7 Textbook 4112 0.679
## 8 College 3258 0.739
## 9 Arxiv 2354 0.782
## 10 Documentation 2279 0.825
## 11 Other 9500 1
Task:
# Prepare labels for the plot: add each platform's rank and a three-line
# text label (rank, useful-response count, cumulative share) to q3_data
q4_data <- q3_data %>%
  mutate(
    # Rank only the real platforms. "Other" is blanked to NA before ranking,
    # and row_number() leaves NA inputs as NA, so the ranks run 1..k no matter
    # how large the "Other" count is (the previous "rank everything, then
    # subtract 1" was only right when "Other" had the largest count).
    # The result is a plain integer vector, so no integer/double mixing occurs.
    rank = row_number(
      desc(replace(count1, learning_platform == "Other", NA))
    ),
    label_text = paste0(
      "Rank: ", if_else(is.na(rank), "NA", as.character(rank)), "\n",
      # trim = TRUE stops format() from left-padding shorter counts to the
      # width of the longest one, which would put stray spaces in the label
      "Useful: ", format(count1, big.mark = ",", trim = TRUE), "\n",
      "CumPct: ", sprintf("%.1f%%", cum_pct * 100)
    )
  )
# Create the plot: one flipped (horizontal) bar per platform, annotated with
# the label text prepared in q4_data
ggplot(q4_data, aes(x = learning_platform, y = count1)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  geom_text(
    aes(label = label_text),
    hjust = -0.1,
    size = 3,
    lineheight = 0.9
  ) +
  coord_flip() +
  scale_y_continuous(
    # Extend the axis 30% past the longest bar to leave room for the labels
    limits = c(0, 1.3 * max(q4_data$count1)),
    breaks = seq(0, 10000, by = 2500)
  ) +
  labs(
    title = "Top 10 learning platform",
    x = "Learning platform",
    y = "Number of responses with at least usefulness"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )