library(tidyr)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
# Load the raw survey responses; column types are left for read_csv() to infer.
data <- read_csv(file = "multipleChoiceResponses.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num (1): CompensationAmount
## lgl (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the columns whose names begin with one of the prefixes below.
# starts_with() silently ignores a prefix that matches no column, so the
# spelling of each prefix matters ("Learning", not "Leaning").
# NOTE(review): "Working" matched no column in the recorded glimpse() output;
# confirm whether a different prefix (e.g. "Work") was intended.
selected_columns <- data %>%
  select(
    starts_with(c(
      "Learning", "Working", "Age", "EmployerIndustry",
      "CurrentJob", "MLMethod", "Formal"
    ))
  )

# Quick structural overview of the selected subset.
glimpse(selected_columns)
## Rows: 16,716
## Columns: 5
## $ Age <dbl> NA, 30, 28, 56, 38, 46, 35, 22, 43, 33, 20, 27,…
## $ EmployerIndustry <chr> "Internet-based", NA, NA, "Mix of fields", "Tec…
## $ CurrentJobTitleSelect <chr> "DBA/Database Engineer", NA, NA, "Operations Re…
## $ MLMethodNextYearSelect <chr> "Random Forests", "Random Forests", "Deep learn…
## $ FormalEducation <chr> "Bachelor's degree", "Master's degree", "Master…
# Turn every character column into a factor so its levels can be inspected;
# columns of other types are left untouched.
selected_columns <- mutate(
  selected_columns,
  across(.cols = where(is.character), .fns = as.factor)
)
# Number of levels of each factor column, as a plain named integer vector.
# Non-factor columns are dropped before counting, so no NA placeholders (and no
# leftover na.omit() attributes) end up on the result; vapply() guarantees an
# integer vector even when there are no factor columns at all.
factor_levels <- vapply(
  Filter(is.factor, selected_columns),
  nlevels,
  integer(1)
)
factor_levels
## EmployerIndustry CurrentJobTitleSelect MLMethodNextYearSelect
## 16 16 25
## FormalEducation
## 7
## attr(,"na.action")
## Age
## 1
## attr(,"class")
## [1] "omit"
# Up to five factor columns with the most levels, largest first.
top_levels <- head(sort(factor_levels, decreasing = TRUE), n = 5)
top_levels
## MLMethodNextYearSelect EmployerIndustry CurrentJobTitleSelect
## 25 16 16
## FormalEducation
## 7
# Levels of the current-job-title factor and how many of them there are.
current_job_levels <- levels(selected_columns$CurrentJobTitleSelect)
current_job_count <- length(current_job_levels)

# Show both pieces together.
list(
  Levels = current_job_levels,
  Count = current_job_count
)
## $Levels
## [1] "Business Analyst"
## [2] "Computer Scientist"
## [3] "Data Analyst"
## [4] "Data Miner"
## [5] "Data Scientist"
## [6] "DBA/Database Engineer"
## [7] "Engineer"
## [8] "Machine Learning Engineer"
## [9] "Operations Research Practitioner"
## [10] "Other"
## [11] "Predictive Modeler"
## [12] "Programmer"
## [13] "Researcher"
## [14] "Scientist/Researcher"
## [15] "Software Developer/Software Engineer"
## [16] "Statistician"
##
## $Count
## [1] 16
# Bar chart of employer industry on the unfiltered data, with bars ordered by
# frequency and drawn horizontally.
data %>%
  ggplot(mapping = aes(x = fct_infreq(EmployerIndustry))) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Bar Plot of Employer Industry",
    x = "Employer Industry",
    y = "Count"
  )

# Keep only respondents with both an age and an employer industry recorded.
filtered_data <- filter(data, !is.na(Age), !is.na(EmployerIndustry))
# Same frequency-ordered horizontal bar chart, on the rows that survived the
# missing-value filter.
filtered_data %>%
  ggplot(mapping = aes(x = fct_infreq(EmployerIndustry))) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Filtered Employer Industry Bar Plot",
    x = "Employer Industry",
    y = "Count"
  )

# One row per employer industry with its respondent count `n`, largest first.
industry_counts <- count(filtered_data, EmployerIndustry, sort = TRUE)
# Lollipop-style chart: one blue segment from 0 to the count per industry, with
# a red point at the count.
# The industry is reordered by count once, in the data, so that `y` and `yend`
# refer to the same factor instead of mixing a reordered factor with the raw
# character column.
industry_counts %>%
  mutate(EmployerIndustry = reorder(EmployerIndustry, n)) %>%
  ggplot(aes(x = n, y = EmployerIndustry)) +
  geom_segment(
    aes(x = 0, xend = n, yend = EmployerIndustry),
    color = "blue"
  ) +
  geom_point(color = "red") +
  labs(
    x = "Count",
    y = "Employer Industry",
    title = "Segment Plot of Employer Industry"
  ) +
  theme_minimal()

# Put the internal-vs-external tool answers in a meaningful order (from fully
# internal to fully external, with "Do not know" after them) rather than the
# default ordering.
data <- data %>%
  mutate(
    WorkInternalVsExternalTools = fct_relevel(
      WorkInternalVsExternalTools,
      "Entirely internal",
      "More internal than external",
      "Approximately half internal and half external",
      "More external than internal",
      "Entirely external",
      "Do not know"
    )
  )
# Bar chart of the releveled answers; the x-axis labels are rotated 45 degrees
# and right-aligned.
data %>%
  ggplot(mapping = aes(x = WorkInternalVsExternalTools)) +
  geom_bar() +
  labs(
    title = "Reordered WorkInternalVsExternalTools",
    x = "Work Internal vs External Tools",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
