library(tidyr)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
# Load the survey responses; read_csv() guesses a type for every column.
# NOTE(review): the recorded run below reports parsing issues for this file --
# inspect them with problems(data) before relying on the guessed types.
data <- read_csv(file = "multipleChoiceResponses.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num (1): CompensationAmount
## lgl (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the survey columns whose names begin with one of the listed
# prefixes, then print the structure of the result.
# NOTE(review): the recorded str() output below lists five columns, none of
# which start with "Leaning" or "Working", while the column specification
# above shows names beginning with "Learning" and "Work". These two prefixes
# look like typos -- confirm the intent before changing them, because doing so
# alters every downstream result.
interesting_columns <- select(
  data,
  starts_with(
    c(
      "Leaning", "Working", "Age", "EmployerIndustry",
      "CurrentJob", "MLMethod", "Formal"
    )
  )
)
str(interesting_columns)
## tibble [16,716 × 5] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:16716] NA 30 28 56 38 46 35 22 43 33 ...
## $ EmployerIndustry : chr [1:16716] "Internet-based" NA NA "Mix of fields" ...
## $ CurrentJobTitleSelect : chr [1:16716] "DBA/Database Engineer" NA NA "Operations Research Practitioner" ...
## $ MLMethodNextYearSelect: chr [1:16716] "Random Forests" "Random Forests" "Deep learning" "Neural Nets" ...
## $ FormalEducation : chr [1:16716] "Bachelor's degree" "Master's degree" "Master's degree" "Master's degree" ...
# Turn every character column into a factor so that its distinct categories
# can be counted with nlevels().
interesting_columns <- interesting_columns %>%
  mutate(across(where(is.character), as.factor))

# Number of levels for each factor column. Non-factor columns (such as the
# numeric Age) are dropped up front rather than being encoded as NA and
# removed afterwards; vapply() pins the result to a named integer vector.
category_counts <- vapply(
  Filter(is.factor, interesting_columns),
  nlevels,
  integer(1)
)

# head() returns at most five entries. Indexing with [1:5] instead pads the
# result with NA whenever fewer than five factor columns are available.
top_5_category_columns <- head(sort(category_counts, decreasing = TRUE), 5)
print(top_5_category_columns)
## MLMethodNextYearSelect EmployerIndustry CurrentJobTitleSelect
## 25 16 16
## FormalEducation
## 7
# Collect the distinct job titles and how many of them there are; the
# top-level list() call prints both.
current_job_levels <- levels(interesting_columns[["CurrentJobTitleSelect"]])
current_job_count <- length(current_job_levels)
list(
  Levels = current_job_levels,
  Count = current_job_count
)
## $Levels
## [1] "Business Analyst"
## [2] "Computer Scientist"
## [3] "Data Analyst"
## [4] "Data Miner"
## [5] "Data Scientist"
## [6] "DBA/Database Engineer"
## [7] "Engineer"
## [8] "Machine Learning Engineer"
## [9] "Operations Research Practitioner"
## [10] "Other"
## [11] "Predictive Modeler"
## [12] "Programmer"
## [13] "Researcher"
## [14] "Scientist/Researcher"
## [15] "Software Developer/Software Engineer"
## [16] "Statistician"
##
## $Count
## [1] 16
# Bar chart of respondents per employer industry. fct_infreq() orders the
# industries by how often they occur, and coord_flip() turns the bars
# horizontal.
data %>%
  ggplot(mapping = aes(x = fct_infreq(EmployerIndustry))) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Employer Industries",
    x = "Employer Industry",
    y = "Count"
  )

# Keep only the rows where both Age and EmployerIndustry are present.
filtered_data <- filter(data, !is.na(Age), !is.na(EmployerIndustry))

# Same frequency-ordered horizontal bar chart as before, drawn from the
# filtered rows.
filtered_data %>%
  ggplot(mapping = aes(x = fct_infreq(EmployerIndustry))) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Filtered Employer Industries",
    x = "Employer Industry",
    y = "Count"
  )

# Tally the filtered rows per employer industry, largest count first.
industry_counts <- count(filtered_data, EmployerIndustry, sort = TRUE)

# Lollipop chart: a blue segment runs from 0 to each industry's count and a
# red point marks the count itself; industries are ordered by count.
ggplot(industry_counts, aes(x = n, y = reorder(EmployerIndustry, n))) +
  geom_segment(
    aes(x = 0, xend = n, yend = EmployerIndustry),
    color = "blue"
  ) +
  geom_point(color = "red") +
  labs(
    title = "Industry Frequencies",
    x = "Count",
    y = "Employer Industry"
  ) +
  theme_minimal()

# Put the answer categories in a meaningful order (fully internal through
# fully external, with "Do not know" last) instead of the default level order.
data[["WorkInternalVsExternalTools"]] <- fct_relevel(
  data[["WorkInternalVsExternalTools"]],
  "Entirely internal",
  "More internal than external",
  "Approximately half internal and half external",
  "More external than internal",
  "Entirely external",
  "Do not know"
)

# Bar chart of the releveled answers; the x-axis labels are rotated 45 degrees
# so the long category names do not overlap.
data %>%
  ggplot(mapping = aes(x = WorkInternalVsExternalTools)) +
  geom_bar() +
  labs(
    title = "Internal vs External Tool Usage",
    x = "Work Internal vs External Tools",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
