library(tidyr)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
# Load the multiple-choice survey responses; column types are guessed by readr.
# NOTE(review): the recorded run reports parsing issues for this file --
# inspect them with problems(data) before trusting the guessed column types.
data <- read_csv(file = "multipleChoiceResponses.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl  (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num   (1): CompensationAmount
## lgl   (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the survey columns whose names begin with one of these prefixes,
# then inspect the structure of the result.
# NOTE(review): the recorded str() output lists only 5 columns, so the
# "Leaning" and "Working" prefixes match nothing -- possibly typos for
# "Learning" / "Work"; confirm the intended selection before changing them.
interesting_columns <- select(
  data,
  starts_with(
    c(
      "Leaning", "Working", "Age", "EmployerIndustry",
      "CurrentJob", "MLMethod", "Formal"
    )
  )
)
str(interesting_columns)
## tibble [16,716 × 5] (S3: tbl_df/tbl/data.frame)
##  $ Age                   : num [1:16716] NA 30 28 56 38 46 35 22 43 33 ...
##  $ EmployerIndustry      : chr [1:16716] "Internet-based" NA NA "Mix of fields" ...
##  $ CurrentJobTitleSelect : chr [1:16716] "DBA/Database Engineer" NA NA "Operations Research Practitioner" ...
##  $ MLMethodNextYearSelect: chr [1:16716] "Random Forests" "Random Forests" "Deep learning" "Neural Nets" ...
##  $ FormalEducation       : chr [1:16716] "Bachelor's degree" "Master's degree" "Master's degree" "Master's degree" ...
# Turn every character column into a factor; other column types are untouched.
interesting_columns <- mutate(
  interesting_columns,
  across(where(is.character), as.factor)
)
# Number of levels of every factor column, as a named integer vector.
# Non-factor columns (e.g. Age) are filtered out up front, so there are no NA
# placeholders to strip afterwards, and vapply() guarantees an integer result
# even when no factor column is present.
category_counts <- vapply(
  Filter(is.factor, interesting_columns),
  nlevels,
  integer(1)
)

# The (up to) five factor columns with the most levels. head() stops at the
# available length, whereas indexing with [1:5] pads the result with <NA>
# entries when fewer than five factor columns exist.
top_5_category_columns <- head(sort(category_counts, decreasing = TRUE), 5)
print(top_5_category_columns)
## MLMethodNextYearSelect       EmployerIndustry  CurrentJobTitleSelect 
##                     25                     16                     16 
##        FormalEducation 
##                      7
# Levels of the job-title factor and how many of them there are, shown
# together as a two-element list.
current_job_levels <- levels(interesting_columns[["CurrentJobTitleSelect"]])
current_job_count <- length(current_job_levels)
list(
  Levels = current_job_levels,
  Count = current_job_count
)
## $Levels
##  [1] "Business Analyst"                    
##  [2] "Computer Scientist"                  
##  [3] "Data Analyst"                        
##  [4] "Data Miner"                          
##  [5] "Data Scientist"                      
##  [6] "DBA/Database Engineer"               
##  [7] "Engineer"                            
##  [8] "Machine Learning Engineer"           
##  [9] "Operations Research Practitioner"    
## [10] "Other"                               
## [11] "Predictive Modeler"                  
## [12] "Programmer"                          
## [13] "Researcher"                          
## [14] "Scientist/Researcher"                
## [15] "Software Developer/Software Engineer"
## [16] "Statistician"                        
## 
## $Count
## [1] 16
# Horizontal bar chart of response counts per employer industry, with the
# industries ordered by how often they occur.
data %>%
  ggplot(mapping = aes(x = fct_infreq(EmployerIndustry))) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Employer Industries",
    x = "Employer Industry",
    y = "Count"
  )

# Keep only respondents with both an age and an employer industry recorded
# (comma-separated filter() conditions are combined with AND).
filtered_data <- filter(data, !is.na(Age), !is.na(EmployerIndustry))

# Same frequency-ordered horizontal bar chart, restricted to the filtered rows.
filtered_data %>%
  ggplot(mapping = aes(x = fct_infreq(EmployerIndustry))) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Filtered Employer Industries",
    x = "Employer Industry",
    y = "Count"
  )

# Respondents per employer industry, largest count first.
industry_counts <- count(filtered_data, EmployerIndustry, sort = TRUE)

# Lollipop chart: a blue segment from 0 to the count for each industry, capped
# by a red point at the count; industries are ordered along y by their count.
industry_counts %>%
  ggplot(mapping = aes(x = n, y = reorder(EmployerIndustry, n))) +
  geom_segment(
    mapping = aes(x = 0, xend = n, yend = EmployerIndustry),
    color = "blue"
  ) +
  geom_point(color = "red") +
  labs(
    title = "Industry Frequencies",
    x = "Count",
    y = "Employer Industry"
  ) +
  theme_minimal()

# Put the answer options in a fixed order, running from "Entirely internal"
# to "Entirely external" with "Do not know" last, instead of the default
# level order.
data[["WorkInternalVsExternalTools"]] <- fct_relevel(
  data[["WorkInternalVsExternalTools"]],
  "Entirely internal",
  "More internal than external",
  "Approximately half internal and half external",
  "More external than internal",
  "Entirely external",
  "Do not know"
)

# Bar chart of the answers in that order; x labels are tilted 45 degrees so the
# long option names stay readable.
data %>%
  ggplot(mapping = aes(x = WorkInternalVsExternalTools)) +
  geom_bar() +
  labs(
    title = "Internal vs External Tool Usage",
    x = "Work Internal vs External Tools",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))