library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the raw survey responses. Strings are kept as character here so that
# only the columns of interest are turned into factors below.
responses <- read.csv(
  "/cloud/project/multipleChoiceResponses.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Keep only the column families analysed in question 1.
# NOTE(review): the printed level table contains no column whose name begins
# with "Working" -- confirm whether `starts_with("Work")` was intended.
selected_columns <- responses %>%
  select(
    starts_with("Learning"),
    starts_with("Working"),
    starts_with("Age"),
    starts_with("EmploymentStatus"),
    starts_with("CurrentJob"),
    starts_with("MLMethod"),
    starts_with("Formal")
  )

# Convert every character column to a factor.
# vapply() guarantees one logical per column regardless of input shape.
# List-style indexing (`df[cols]`) is used instead of `df[, cols]`: the matrix
# form drops to a bare vector when exactly one column matches, which would
# make lapply() iterate over rows rather than columns.
char_columns <- vapply(selected_columns, is.character, logical(1))
selected_columns[char_columns] <- lapply(
  selected_columns[char_columns],
  as.factor
)

# 1.2 Number of factor levels for each selected column.
# nlevels() is 0 for columns that are not factors. vapply() pins the result to
# exactly one integer per column, unlike sapply(), whose return type depends
# on its input.
factor_levels <- data.frame(
  variable = names(selected_columns),
  num_levels = vapply(selected_columns, nlevels, integer(1))
)

# 1.2 Show the level count of every selected column.
print(factor_levels)
##                                                                        variable
## LearningDataScience                                         LearningDataScience
## LearningPlatformSelect                                   LearningPlatformSelect
## LearningPlatformUsefulnessArxiv                 LearningPlatformUsefulnessArxiv
## LearningPlatformUsefulnessBlogs                 LearningPlatformUsefulnessBlogs
## LearningPlatformUsefulnessCollege             LearningPlatformUsefulnessCollege
## LearningPlatformUsefulnessCompany             LearningPlatformUsefulnessCompany
## LearningPlatformUsefulnessConferences     LearningPlatformUsefulnessConferences
## LearningPlatformUsefulnessFriends             LearningPlatformUsefulnessFriends
## LearningPlatformUsefulnessKaggle               LearningPlatformUsefulnessKaggle
## LearningPlatformUsefulnessNewsletters     LearningPlatformUsefulnessNewsletters
## LearningPlatformUsefulnessCommunities     LearningPlatformUsefulnessCommunities
## LearningPlatformUsefulnessDocumentation LearningPlatformUsefulnessDocumentation
## LearningPlatformUsefulnessCourses             LearningPlatformUsefulnessCourses
## LearningPlatformUsefulnessProjects           LearningPlatformUsefulnessProjects
## LearningPlatformUsefulnessPodcasts           LearningPlatformUsefulnessPodcasts
## LearningPlatformUsefulnessSO                       LearningPlatformUsefulnessSO
## LearningPlatformUsefulnessTextbook           LearningPlatformUsefulnessTextbook
## LearningPlatformUsefulnessTradeBook         LearningPlatformUsefulnessTradeBook
## LearningPlatformUsefulnessTutoring           LearningPlatformUsefulnessTutoring
## LearningPlatformUsefulnessYouTube             LearningPlatformUsefulnessYouTube
## LearningDataScienceTime                                 LearningDataScienceTime
## LearningCategorySelftTaught                         LearningCategorySelftTaught
## LearningCategoryOnlineCourses                     LearningCategoryOnlineCourses
## LearningCategoryWork                                       LearningCategoryWork
## LearningCategoryUniversity                           LearningCategoryUniversity
## LearningCategoryKaggle                                   LearningCategoryKaggle
## LearningCategoryOther                                     LearningCategoryOther
## Age                                                                         Age
## EmploymentStatus                                               EmploymentStatus
## CurrentJobTitleSelect                                     CurrentJobTitleSelect
## MLMethodNextYearSelect                                   MLMethodNextYearSelect
## FormalEducation                                                 FormalEducation
##                                         num_levels
## LearningDataScience                              4
## LearningPlatformSelect                        5363
## LearningPlatformUsefulnessArxiv                  4
## LearningPlatformUsefulnessBlogs                  4
## LearningPlatformUsefulnessCollege                4
## LearningPlatformUsefulnessCompany                4
## LearningPlatformUsefulnessConferences            4
## LearningPlatformUsefulnessFriends                4
## LearningPlatformUsefulnessKaggle                 4
## LearningPlatformUsefulnessNewsletters            4
## LearningPlatformUsefulnessCommunities            4
## LearningPlatformUsefulnessDocumentation          4
## LearningPlatformUsefulnessCourses                4
## LearningPlatformUsefulnessProjects               4
## LearningPlatformUsefulnessPodcasts               4
## LearningPlatformUsefulnessSO                     4
## LearningPlatformUsefulnessTextbook               4
## LearningPlatformUsefulnessTradeBook              4
## LearningPlatformUsefulnessTutoring               4
## LearningPlatformUsefulnessYouTube                4
## LearningDataScienceTime                          7
## LearningCategorySelftTaught                      0
## LearningCategoryOnlineCourses                    0
## LearningCategoryWork                             0
## LearningCategoryUniversity                       0
## LearningCategoryKaggle                           0
## LearningCategoryOther                            0
## Age                                              0
## EmploymentStatus                                 7
## CurrentJobTitleSelect                           17
## MLMethodNextYearSelect                          26
## FormalEducation                                  8
# 1.2 The five variables with the most factor levels, largest first.
top5_levels <- head(
  arrange(factor_levels, desc(num_levels)),
  n = 5
)

# 1.2 Display the five highest level counts.
print(top5_levels)
##                                        variable num_levels
## LearningPlatformSelect   LearningPlatformSelect       5363
## MLMethodNextYearSelect   MLMethodNextYearSelect         26
## CurrentJobTitleSelect     CurrentJobTitleSelect         17
## FormalEducation                 FormalEducation          8
## LearningDataScienceTime LearningDataScienceTime          7
# 1.3 Level count of the CurrentJobTitleSelect variable.
current_job_levels <- filter(
  factor_levels,
  variable == "CurrentJobTitleSelect"
)

# 1.3 Display the single matching row.
print(current_job_levels)
##                                    variable num_levels
## CurrentJobTitleSelect CurrentJobTitleSelect         17
# 1.4 Level count of the CurrentJobTitleSelect variable.
# NOTE(review): this repeats the 1.3 computation unchanged -- confirm whether
# question 1.4 was meant to inspect something else.
current_job_levels <- filter(
  factor_levels,
  variable == "CurrentJobTitleSelect"
)

print(current_job_levels)
##                                    variable num_levels
## CurrentJobTitleSelect CurrentJobTitleSelect         17
# Count of responses per EmployerIndustry value, with the axes flipped so the
# industry labels run along the vertical axis.
library(ggplot2)

employer_industry_plot <- ggplot(
  data = responses,
  mapping = aes(x = EmployerIndustry)
) +
  geom_bar() +
  coord_flip()

print(employer_industry_plot)

# Question 2
# 2.1 Flipped bar chart of response counts per EmployerIndustry value.
library(ggplot2)

# Start again from a fresh copy of the raw survey data.
responses <- read.csv(
  "/cloud/project/multipleChoiceResponses.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

employer_industry_plot <- ggplot(
  data = responses,
  mapping = aes(x = EmployerIndustry)
) +
  geom_bar(stat = "count") +
  coord_flip()

print(employer_industry_plot)

# 2.2 Same bar chart as 2.1, restricted to rows where both Age and
# EmployerIndustry are present.
library(ggplot2)

responses <- read.csv(
  "/cloud/project/multipleChoiceResponses.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

filtered_responses <- responses[c("Age", "EmployerIndustry")]

# read.csv() keeps blank cells of character columns as "" rather than NA, so
# na.omit() alone would not remove them. Mark blanks as missing first.
# NOTE(review): presumably unanswered questions are stored as empty strings in
# this CSV -- confirm; if there are none this step is a no-op.
filtered_responses$EmployerIndustry[
  !nzchar(filtered_responses$EmployerIndustry)
] <- NA
filtered_responses <- na.omit(filtered_responses)

filtered_plot <- ggplot(filtered_responses, aes(x = EmployerIndustry)) +
  geom_bar(stat = "count") +
  coord_flip()

print(filtered_plot)

# 2.3 Age drawn as a segment per response, with industries ordered by how
# many responses each one has (most frequent first before the flip).
library(ggplot2)

responses <- read.csv(
  "/cloud/project/multipleChoiceResponses.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

filtered_responses <- responses[c("Age", "EmployerIndustry")]

# read.csv() keeps blank cells of character columns as "" rather than NA, so
# na.omit() alone would not remove them. Mark blanks as missing first; this
# also matters below, because indexing a table by the name "" never matches.
# NOTE(review): presumably unanswered questions are stored as empty strings in
# this CSV -- confirm; if there are none this step is a no-op.
filtered_responses$EmployerIndustry[
  !nzchar(filtered_responses$EmployerIndustry)
] <- NA
filtered_responses <- na.omit(filtered_responses)

# reorder() sorts the industry levels by the negated per-industry count, so
# the most frequent industry comes first.
# `linewidth` replaces `size`, which is deprecated for line geoms since
# ggplot2 3.4.0.
ordered_plot <- ggplot(
  filtered_responses,
  aes(
    x = reorder(EmployerIndustry, -table(EmployerIndustry)[EmployerIndustry]),
    y = Age
  )
) +
  geom_segment(
    aes(
      xend = reorder(
        EmployerIndustry,
        -table(EmployerIndustry)[EmployerIndustry]
      ),
      yend = 0
    ),
    linewidth = 1
  ) +
  coord_flip()

# Render the plot, as every other section of this script does.
print(ordered_plot)
# 3 Bar chart of WorkInternalVsExternalTools with the answers in a fixed,
# meaningful order instead of alphabetical order.
responses <- read.csv(
  "/cloud/project/multipleChoiceResponses.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# The column is read as character (stringsAsFactors = FALSE) and levels() of
# a character vector is NULL, so wrap it in factor() to list the distinct
# answers present in the data.
work_levels <- levels(factor(responses$WorkInternalVsExternalTools))

custom_order <- c(
  "Entirely internal",
  "More internal than external",
  "Approximately half internal and half external",
  "More external than internal",
  "Entirely external",
  "Do not know"
)

# Any value not listed in custom_order becomes NA in the resulting factor.
responses$WorkInternalVsExternalTools <- factor(
  responses$WorkInternalVsExternalTools,
  levels = custom_order
)

# Tilt the x-axis labels so the long answer texts do not overlap.
work_plot <- ggplot(responses, aes(x = WorkInternalVsExternalTools)) +
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(work_plot)