library('tidyr')
library('readr')
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('ggplot2')
library('forcats')
library('tidyquant')
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ──────────────────────── tidyquant 1.0.9 ──
## ✔ PerformanceAnalytics 2.0.8 ✔ TTR 0.24.4
## ✔ quantmod 0.4.26 ✔ xts 0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 1. Load the dataset
# Location of the responses CSV; replace with the correct file path.
file_path <- "multipleChoiceResponses.csv"
# Read the file into a data frame; readr guesses the column types itself.
data <- read_csv(file = file_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num (1): CompensationAmount
## lgl (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 1.1 Select columns starting with specific prefixes and display them using glimpse
# The first prefix used to be spelled "Leaning"; starts_with() matched no
# column for it, so every Learning* column was silently left out.
# Any console output recorded after this step predates that spelling fix.
# NOTE(review): "Working" also matched nothing in the recorded run (the data
# has WorkTools* columns, so "Work" may have been intended) -- confirm the
# intended prefix before changing it.
selected_data <- data %>%
  select(
    starts_with("Learning"),
    starts_with("Working"),
    starts_with("Age"),
    starts_with("EmployerIndustry"),
    starts_with("CurrentJob"),
    starts_with("MLMethod"),
    starts_with("Formal")
  )
glimpse(selected_data)
## Rows: 16,716
## Columns: 5
## $ Age <dbl> NA, 30, 28, 56, 38, 46, 35, 22, 43, 33, 20, 27,…
## $ EmployerIndustry <chr> "Internet-based", NA, NA, "Mix of fields", "Tec…
## $ CurrentJobTitleSelect <chr> "DBA/Database Engineer", NA, NA, "Operations Re…
## $ MLMethodNextYearSelect <chr> "Random Forests", "Random Forests", "Deep learn…
## $ FormalEducation <chr> "Bachelor's degree", "Master's degree", "Master…
# 1.2 Change all character columns to factors and find the number of levels for each factor
# Only character columns are converted; every other column is left as it is.
selected_data <- mutate(
  selected_data,
  across(where(is.character), as.factor)
)

# Count the levels of each factor column, then reshape the one-row summary
# into one row per column (Variable, NumLevels).
num_levels <- selected_data %>%
  summarise(across(where(is.factor), nlevels)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "NumLevels"
  )
print(num_levels)
## # A tibble: 4 × 2
## Variable NumLevels
## <chr> <int>
## 1 EmployerIndustry 16
## 2 CurrentJobTitleSelect 16
## 3 MLMethodNextYearSelect 25
## 4 FormalEducation 7
# 1.3 Select the 5 columns with the highest number of levels
# Sort from most to fewest levels and keep the leading rows; when fewer than
# five factor columns exist, all of them are returned.
top_5_levels <- num_levels %>%
  arrange(desc(NumLevels)) %>%
  slice(seq_len(5))
print(top_5_levels)
## # A tibble: 4 × 2
## Variable NumLevels
## <chr> <int>
## 1 MLMethodNextYearSelect 25
## 2 EmployerIndustry 16
## 3 CurrentJobTitleSelect 16
## 4 FormalEducation 7
# 1.4 Filter where the column variable equals "CurrentJobTitleSelect" and show its levels
# The column is looked up by name first so that a missing column produces a
# message instead of an error.
if ("CurrentJobTitleSelect" %in% names(selected_data)) {
  current_job <- selected_data[["CurrentJobTitleSelect"]]
  cat("Levels of CurrentJobTitleSelect:\n")
  print(levels(current_job)) # Show levels
  # End the line explicitly so later console output starts on a fresh line,
  # matching the other cat() calls in this script.
  cat("Number of levels: ", nlevels(current_job), "\n", sep = "")
} else {
  cat("Column 'CurrentJobTitleSelect' not found in the dataset.\n")
}
## Levels of CurrentJobTitleSelect:
## [1] "Business Analyst"
## [2] "Computer Scientist"
## [3] "Data Analyst"
## [4] "Data Miner"
## [5] "Data Scientist"
## [6] "DBA/Database Engineer"
## [7] "Engineer"
## [8] "Machine Learning Engineer"
## [9] "Operations Research Practitioner"
## [10] "Other"
## [11] "Predictive Modeler"
## [12] "Programmer"
## [13] "Researcher"
## [14] "Scientist/Researcher"
## [15] "Software Developer/Software Engineer"
## [16] "Statistician"
## Number of levels: 16
# 2. Make a bar plot of EmployerIndustry
# Draws three charts of respondent counts per employer industry. Each chart is
# wrapped in print(): inside a braced block only the value of the last
# expression is auto-printed, so without print() the first two charts would
# never be drawn.
if ("EmployerIndustry" %in% names(selected_data)) {
  # Count occurrences of EmployerIndustry for plotting; the row for missing
  # industries is dropped so NA does not show up as a bar.
  industry_count <- selected_data %>%
    count(EmployerIndustry, sort = TRUE, name = "Count") %>%
    drop_na(EmployerIndustry)

  # 2.1 Basic bar plot with flipped coordinates
  print(
    ggplot(
      industry_count,
      aes(x = reorder(EmployerIndustry, Count), y = Count)
    ) +
      geom_bar(stat = "identity", fill = "lightblue") +
      coord_flip() +
      theme_minimal() +
      labs(
        title = "Employer Industry Count",
        x = "Employer Industry",
        y = "Count"
      )
  )

  # 2.2 Filter rows where Age and EmployerIndustry are not NA, then recount
  filtered_data <- selected_data %>%
    filter(!is.na(Age) & !is.na(EmployerIndustry)) %>%
    count(EmployerIndustry, sort = TRUE, name = "Count")

  # Replot filtered data with flipped coordinates
  print(
    ggplot(
      filtered_data,
      aes(x = reorder(EmployerIndustry, Count), y = Count)
    ) +
      geom_bar(stat = "identity", fill = "lightblue") +
      coord_flip() +
      theme_minimal() +
      labs(
        title = "Filtered Employer Industry Count (Age and EmployerIndustry not NA)",
        x = "Employer Industry",
        y = "Count"
      )
  )

  # 2.3 Use geom_segment to plot filtered data sorted in descending order
  filtered_data <- filtered_data %>%
    arrange(desc(Count))

  # Each industry gets a segment from 0 to its count with a point at the end.
  # `linewidth` replaces `size` for the segments: `size` for lines has been
  # deprecated since ggplot2 3.4.0 and raised a warning here.
  print(
    ggplot(
      filtered_data,
      aes(x = reorder(EmployerIndustry, Count), y = Count)
    ) +
      geom_segment(
        aes(xend = EmployerIndustry, y = 0, yend = Count),
        color = "lightblue",
        linewidth = 1.2
      ) +
      geom_point(aes(y = Count), color = "hotpink", size = 3) +
      coord_flip() +
      theme_minimal() +
      labs(
        title = "Employer Industry Count with geom_segment",
        x = "Employer Industry",
        y = "Count"
      )
  )
} else {
  cat("Column 'EmployerIndustry' not found in the dataset.\n")
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
