HW06

library('tidyr')
library('readr')
library('dplyr')

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library('ggplot2')
library('forcats')
library('tidyquant')

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## ── Attaching core tidyquant packages ──────────────────────── tidyquant 1.0.9 ──
## ✔ PerformanceAnalytics 2.0.8      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.26     ✔ xts                  0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first()    masks dplyr::first()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ xts::last()     masks dplyr::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# 1. Load the dataset
# Location of the survey CSV; adjust if the file lives somewhere else.
file_path <- "multipleChoiceResponses.csv"
# Parse the CSV into a tibble, letting readr guess the column types.
data <- readr::read_csv(file = file_path)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl  (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num   (1): CompensationAmount
## lgl   (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# 1.1 Select columns starting with specific prefixes and display them using glimpse
# Fix: the first prefix used to be misspelled "Leaning". starts_with() does not
# complain when a prefix matches nothing, so every Learning* column was
# silently left out of the selection.
# Any rendered output that follows this chunk predates the fix; re-knit the
# document to refresh it.
# NOTE(review): "Working" matches no column in the rendered output either (the
# column spec only shows names such as WorkToolsFrequency*); confirm whether
# the intended prefix is "Work".
selected_data <- data %>%
  select(starts_with("Learning"),
         starts_with("Working"),
         starts_with("Age"),
         starts_with("EmployerIndustry"),
         starts_with("CurrentJob"),
         starts_with("MLMethod"),
         starts_with("Formal"))

# Compact overview: one line per selected column with its type and first values.
glimpse(selected_data)

## Rows: 16,716
## Columns: 5
## $ Age                    <dbl> NA, 30, 28, 56, 38, 46, 35, 22, 43, 33, 20, 27,…
## $ EmployerIndustry       <chr> "Internet-based", NA, NA, "Mix of fields", "Tec…
## $ CurrentJobTitleSelect  <chr> "DBA/Database Engineer", NA, NA, "Operations Re…
## $ MLMethodNextYearSelect <chr> "Random Forests", "Random Forests", "Deep learn…
## $ FormalEducation        <chr> "Bachelor's degree", "Master's degree", "Master…

# 1.2 Change all character columns to factors and find the number of levels for each factor
# Recode every character column as a factor; other column types are untouched.
selected_data <- mutate(selected_data, across(where(is.character), as.factor))

# Build a long table with one row per factor column and its level count.
num_levels <- selected_data %>%
  summarise(across(where(is.factor), nlevels)) %>%
  pivot_longer(cols = everything(),
               names_to = "Variable",
               values_to = "NumLevels")

print(num_levels)

## # A tibble: 4 × 2
##   Variable               NumLevels
##   <chr>                      <int>
## 1 EmployerIndustry              16
## 2 CurrentJobTitleSelect         16
## 3 MLMethodNextYearSelect        25
## 4 FormalEducation                7

# 1.3 Select the 5 columns with the highest number of levels
# Order variables from most to fewest levels, then keep the first five rows
# (fewer rows are returned when the table has fewer than five).
top_5_levels <- slice_head(arrange(num_levels, desc(NumLevels)), n = 5)

print(top_5_levels)

## # A tibble: 4 × 2
##   Variable               NumLevels
##   <chr>                      <int>
## 1 MLMethodNextYearSelect        25
## 2 EmployerIndustry              16
## 3 CurrentJobTitleSelect         16
## 4 FormalEducation                7

# 1.4 Extract the CurrentJobTitleSelect column and show its levels
# The column is looked up by name first so that a missing column produces a
# readable message instead of an error.
if ("CurrentJobTitleSelect" %in% names(selected_data)) {
  current_job <- selected_data[["CurrentJobTitleSelect"]]

  cat("Levels of CurrentJobTitleSelect:\n")
  print(levels(current_job))   # Show levels

  # Fix: terminate the line with "\n" like the other cat() calls, so whatever
  # is printed next does not run on from this line.
  cat("Number of levels:", nlevels(current_job), "\n")  # Show number of levels
} else {
  cat("Column 'CurrentJobTitleSelect' not found in the dataset.\n")
}

## Levels of CurrentJobTitleSelect:
##  [1] "Business Analyst"                    
##  [2] "Computer Scientist"                  
##  [3] "Data Analyst"                        
##  [4] "Data Miner"                          
##  [5] "Data Scientist"                      
##  [6] "DBA/Database Engineer"               
##  [7] "Engineer"                            
##  [8] "Machine Learning Engineer"           
##  [9] "Operations Research Practitioner"    
## [10] "Other"                               
## [11] "Predictive Modeler"                  
## [12] "Programmer"                          
## [13] "Researcher"                          
## [14] "Scientist/Researcher"                
## [15] "Software Developer/Software Engineer"
## [16] "Statistician"                        
## Number of levels: 16

# 2. Make a bar plot of EmployerIndustry
# Draws three charts of respondent counts per employer industry:
#   2.1 a flipped bar chart of all non-missing industries,
#   2.2 the same chart restricted to rows where Age is also present,
#   2.3 a lollipop-style chart (segment + point) of the filtered counts.
# Fix: every plot is wrapped in print(). Inside an `if { }` block only the
# value of the last expression is auto-printed, so plots 2.1 and 2.2 were
# built but never drawn.
if ("EmployerIndustry" %in% names(selected_data)) {

  # Count occurrences of EmployerIndustry for plotting
  industry_count <- selected_data %>%
    count(EmployerIndustry, sort = TRUE, name = "Count") %>%
    drop_na(EmployerIndustry)  # Drop the row that counts missing industries

  # 2.1 Basic bar plot with flipped coordinates
  # geom_col() draws bars from precomputed heights, i.e. it is the idiomatic
  # form of geom_bar(stat = "identity").
  print(
    ggplot(industry_count, aes(x = reorder(EmployerIndustry, Count), y = Count)) +
      geom_col(fill = "lightblue") +
      coord_flip() +
      theme_minimal() +
      labs(title = "Employer Industry Count",
           x = "Employer Industry",
           y = "Count")
  )

  # 2.2 Filter rows where Age and EmployerIndustry are not NA
  filtered_data <- selected_data %>%
    filter(!is.na(Age) & !is.na(EmployerIndustry)) %>%
    count(EmployerIndustry, sort = TRUE, name = "Count")

  # Replot filtered data with flipped coordinates
  print(
    ggplot(filtered_data, aes(x = reorder(EmployerIndustry, Count), y = Count)) +
      geom_col(fill = "lightblue") +
      coord_flip() +
      theme_minimal() +
      labs(title = "Filtered Employer Industry Count (Age and EmployerIndustry not NA)",
           x = "Employer Industry",
           y = "Count")
  )

  # 2.3 Use geom_segment to plot filtered data sorted in descending order
  # The row order set here does not drive the plot; the axis order comes from
  # reorder() in the aesthetic mapping below.
  filtered_data <- filtered_data %>%
    arrange(desc(Count))

  # Fix: line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and raised a warning.
  print(
    ggplot(filtered_data, aes(x = reorder(EmployerIndustry, Count), y = Count)) +
      geom_segment(aes(xend = EmployerIndustry, y = 0, yend = Count),
                   color = "lightblue", linewidth = 1.2) +
      geom_point(aes(y = Count), color = "hotpink", size = 3) +
      coord_flip() +
      theme_minimal() +
      labs(title = "Employer Industry Count with geom_segment",
           x = "Employer Industry",
           y = "Count")
  )

} else {
  cat("Column 'EmployerIndustry' not found in the dataset.\n")
}

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

HW06

2024-12-17