library(tidyr)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
# Load the multiple-choice survey responses. readr guesses the column types
# and reports parsing issues for this file; inspect them with problems(data).
data <- read_csv(file = "multipleChoiceResponses.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl  (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num   (1): CompensationAmount
## lgl   (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the columns whose names begin with one of these prefixes.
# NOTE(review): "Leaning" and "Working" contribute no columns (only five
# columns, all from the other prefixes, are selected) -- these look like typos
# for "Learning" / "Work"; confirm the intended prefixes before changing them.
column_prefixes <- c(
  "Leaning", "Working", "Age", "EmployerIndustry",
  "CurrentJob", "MLMethod", "Formal"
)
selected_columns <- select(data, starts_with(column_prefixes))
glimpse(selected_columns)
## Rows: 16,716
## Columns: 5
## $ Age                    <dbl> NA, 30, 28, 56, 38, 46, 35, 22, 43, 33, 20, 27,…
## $ EmployerIndustry       <chr> "Internet-based", NA, NA, "Mix of fields", "Tec…
## $ CurrentJobTitleSelect  <chr> "DBA/Database Engineer", NA, NA, "Operations Re…
## $ MLMethodNextYearSelect <chr> "Random Forests", "Random Forests", "Deep learn…
## $ FormalEducation        <chr> "Bachelor's degree", "Master's degree", "Master…
# Convert every character column to a factor so its levels can be inspected.
selected_columns <- selected_columns %>%
  mutate(across(where(is.character), as.factor))
# Count the levels of each factor column. Restricting to factor columns up
# front (instead of returning NA for the others and calling na.omit()) keeps
# the result a plain named integer vector with no "na.action" attribute, and
# vapply() guarantees an integer result even if no factor columns exist.
factor_levels <- selected_columns %>%
  select(where(is.factor)) %>%
  vapply(nlevels, integer(1))
factor_levels
##       EmployerIndustry  CurrentJobTitleSelect MLMethodNextYearSelect 
##                     16                     16                     25 
##        FormalEducation 
##                      7 
# Rank the factor columns by level count, largest first, and keep at most the
# first five entries of that ranking.
top_levels <- head(sort(factor_levels, decreasing = TRUE), n = 5)
top_levels
## MLMethodNextYearSelect       EmployerIndustry  CurrentJobTitleSelect 
##                     25                     16                     16 
##        FormalEducation 
##                      7
# Collect the distinct levels of CurrentJobTitleSelect and how many there are,
# then print both together as a named list.
current_job_levels <- levels(selected_columns[["CurrentJobTitleSelect"]])
current_job_count <- length(current_job_levels)
list(Levels = current_job_levels, Count = current_job_count)
## $Levels
##  [1] "Business Analyst"                    
##  [2] "Computer Scientist"                  
##  [3] "Data Analyst"                        
##  [4] "Data Miner"                          
##  [5] "Data Scientist"                      
##  [6] "DBA/Database Engineer"               
##  [7] "Engineer"                            
##  [8] "Machine Learning Engineer"           
##  [9] "Operations Research Practitioner"    
## [10] "Other"                               
## [11] "Predictive Modeler"                  
## [12] "Programmer"                          
## [13] "Researcher"                          
## [14] "Scientist/Researcher"                
## [15] "Software Developer/Software Engineer"
## [16] "Statistician"                        
## 
## $Count
## [1] 16
# Bar chart of row counts per EmployerIndustry value, ordered by frequency.
# coord_flip() swaps the axes so the industry names run down the vertical axis.
data %>%
  ggplot(aes(x = fct_infreq(EmployerIndustry))) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Bar Plot of Employer Industry",
    x = "Employer Industry",
    y = "Count"
  )

# Drop rows where Age or EmployerIndustry is missing, then redraw the
# frequency-ordered bar chart on the remaining rows.
filtered_data <- filter(data, !is.na(Age), !is.na(EmployerIndustry))
filtered_data %>%
  ggplot(aes(x = fct_infreq(EmployerIndustry))) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Filtered Employer Industry Bar Plot",
    x = "Employer Industry",
    y = "Count"
  )

# Tally the rows of filtered_data per employer industry, most common first.
industry_counts <- filtered_data %>%
  count(EmployerIndustry, sort = TRUE)
# Segment-and-point chart of those counts: one blue segment from 0 to the count
# per industry, with a red point at the count.
# The industries are reordered by count once, and that same factor drives both
# `y` and `yend`, so the axis order no longer depends on which aesthetic the
# discrete scale happens to see first. `xend` is mapped only on the segment
# layer, the only layer that uses it.
industry_counts %>%
  mutate(industry = fct_reorder(EmployerIndustry, n)) %>%
  ggplot(aes(y = industry)) +
  geom_segment(aes(x = 0, xend = n, yend = industry), color = "blue") +
  geom_point(aes(x = n), color = "red") +
  labs(
    x = "Count",
    y = "Employer Industry",
    title = "Segment Plot of Employer Industry"
  ) +
  theme_minimal()

# Move the listed answers to the front of the factor's levels, in exactly this
# order, so later plots show them in this sequence.
data$WorkInternalVsExternalTools <- data$WorkInternalVsExternalTools %>%
  fct_relevel(
    "Entirely internal",
    "More internal than external",
    "Approximately half internal and half external",
    "More external than internal",
    "Entirely external",
    "Do not know"
  )
# Bar chart of row counts per WorkInternalVsExternalTools value. The x-axis
# labels are rotated 45 degrees and right-aligned so the long answers fit.
data %>%
  ggplot(aes(x = WorkInternalVsExternalTools)) +
  geom_bar() +
  labs(
    title = "Reordered WorkInternalVsExternalTools",
    x = "Work Internal vs External Tools",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))