# Install and load packages

install.packages('tidyr')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages('readr')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages('ggplot2')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages('forcats')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages('tidyquant')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library('tidyr')
library('readr')
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('ggplot2')
library('forcats')
library('tidyquant')
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Import data
# Read the survey responses CSV (path is relative to the working directory).
multiple_choice_responses <- read_csv(file = "multipleChoiceResponses.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl  (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num   (1): CompensationAmount
## lgl   (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Part 1 ----
# 1.1
# Keep the survey columns whose names start with one of the prefixes below,
# then convert every character column among them to a factor. Non-character
# columns (e.g. numeric ones) are left unchanged.

library(dplyr)

selected_columns <- multiple_choice_responses %>%
  select(
    starts_with("Learning"),
    starts_with("Working"),
    starts_with("Age"),
    starts_with("EmploymentStatus"),
    starts_with("CurrentJob"),
    starts_with("MLMethod"),
    starts_with("Formal")
  ) %>%
  # across(where(...)) is type-stable and is a no-op when no column matches,
  # whereas sapply() returns an empty list for a zero-column input, which then
  # breaks the logical-index assignment.
  mutate(across(where(is.character), as.factor))

# 1.2
# Count the factor levels of every selected column, one row per column.
# nlevels() yields 0 for columns that are not factors.
factor_levels <- data.frame(
  variable = names(selected_columns),
  # vapply() guarantees an integer vector (and keeps the column names, which
  # become the row names); sapply()'s return type depends on its input.
  num_levels = vapply(selected_columns, nlevels, integer(1))
)

# 1.2
print(factor_levels)
##                                                                        variable
## LearningDataScience                                         LearningDataScience
## LearningPlatformSelect                                   LearningPlatformSelect
## LearningPlatformUsefulnessArxiv                 LearningPlatformUsefulnessArxiv
## LearningPlatformUsefulnessBlogs                 LearningPlatformUsefulnessBlogs
## LearningPlatformUsefulnessCollege             LearningPlatformUsefulnessCollege
## LearningPlatformUsefulnessCompany             LearningPlatformUsefulnessCompany
## LearningPlatformUsefulnessConferences     LearningPlatformUsefulnessConferences
## LearningPlatformUsefulnessFriends             LearningPlatformUsefulnessFriends
## LearningPlatformUsefulnessKaggle               LearningPlatformUsefulnessKaggle
## LearningPlatformUsefulnessNewsletters     LearningPlatformUsefulnessNewsletters
## LearningPlatformUsefulnessCommunities     LearningPlatformUsefulnessCommunities
## LearningPlatformUsefulnessDocumentation LearningPlatformUsefulnessDocumentation
## LearningPlatformUsefulnessCourses             LearningPlatformUsefulnessCourses
## LearningPlatformUsefulnessProjects           LearningPlatformUsefulnessProjects
## LearningPlatformUsefulnessPodcasts           LearningPlatformUsefulnessPodcasts
## LearningPlatformUsefulnessSO                       LearningPlatformUsefulnessSO
## LearningPlatformUsefulnessTextbook           LearningPlatformUsefulnessTextbook
## LearningPlatformUsefulnessTradeBook         LearningPlatformUsefulnessTradeBook
## LearningPlatformUsefulnessTutoring           LearningPlatformUsefulnessTutoring
## LearningPlatformUsefulnessYouTube             LearningPlatformUsefulnessYouTube
## LearningDataScienceTime                                 LearningDataScienceTime
## LearningCategorySelftTaught                         LearningCategorySelftTaught
## LearningCategoryOnlineCourses                     LearningCategoryOnlineCourses
## LearningCategoryWork                                       LearningCategoryWork
## LearningCategoryUniversity                           LearningCategoryUniversity
## LearningCategoryKaggle                                   LearningCategoryKaggle
## LearningCategoryOther                                     LearningCategoryOther
## Age                                                                         Age
## EmploymentStatus                                               EmploymentStatus
## CurrentJobTitleSelect                                     CurrentJobTitleSelect
## MLMethodNextYearSelect                                   MLMethodNextYearSelect
## FormalEducation                                                 FormalEducation
##                                         num_levels
## LearningDataScience                              3
## LearningPlatformSelect                        5362
## LearningPlatformUsefulnessArxiv                  3
## LearningPlatformUsefulnessBlogs                  3
## LearningPlatformUsefulnessCollege                3
## LearningPlatformUsefulnessCompany                3
## LearningPlatformUsefulnessConferences            3
## LearningPlatformUsefulnessFriends                3
## LearningPlatformUsefulnessKaggle                 3
## LearningPlatformUsefulnessNewsletters            3
## LearningPlatformUsefulnessCommunities            3
## LearningPlatformUsefulnessDocumentation          3
## LearningPlatformUsefulnessCourses                3
## LearningPlatformUsefulnessProjects               3
## LearningPlatformUsefulnessPodcasts               3
## LearningPlatformUsefulnessSO                     3
## LearningPlatformUsefulnessTextbook               3
## LearningPlatformUsefulnessTradeBook              3
## LearningPlatformUsefulnessTutoring               3
## LearningPlatformUsefulnessYouTube                3
## LearningDataScienceTime                          6
## LearningCategorySelftTaught                      0
## LearningCategoryOnlineCourses                    0
## LearningCategoryWork                             0
## LearningCategoryUniversity                       0
## LearningCategoryKaggle                           0
## LearningCategoryOther                            0
## Age                                              0
## EmploymentStatus                                 7
## CurrentJobTitleSelect                           16
## MLMethodNextYearSelect                          25
## FormalEducation                                  7
# 1.2
# Sort the variables by level count (largest first) and keep the first five.
top5_levels <- head(
  arrange(factor_levels, desc(num_levels)),
  n = 5
)

# 1.2
print(top5_levels)
##                                      variable num_levels
## LearningPlatformSelect LearningPlatformSelect       5362
## MLMethodNextYearSelect MLMethodNextYearSelect         25
## CurrentJobTitleSelect   CurrentJobTitleSelect         16
## EmploymentStatus             EmploymentStatus          7
## FormalEducation               FormalEducation          7
# 1.3
# Look up the level count recorded for the CurrentJobTitleSelect column.
current_job_levels <- dplyr::filter(
  factor_levels,
  variable == "CurrentJobTitleSelect"
)

# 1.3
print(current_job_levels)
##                                    variable num_levels
## CurrentJobTitleSelect CurrentJobTitleSelect         16
# 1.4
# Same lookup as in 1.3: the row of factor_levels for CurrentJobTitleSelect.
job_title_row <- factor_levels$variable == "CurrentJobTitleSelect"
current_job_levels <- dplyr::filter(factor_levels, job_title_row)
rm(job_title_row)

print(current_job_levels)
##                                    variable num_levels
## CurrentJobTitleSelect CurrentJobTitleSelect         16
# Bar chart of response counts per EmployerIndustry value, with the axes
# flipped so the industry labels run along the vertical axis.
employer_industry_plot <- multiple_choice_responses %>%
  ggplot(mapping = aes(x = EmployerIndustry)) +
  geom_bar() +
  coord_flip()

print(employer_industry_plot)

# NOTE(review): `cars` is not created anywhere in the visible script, so this
# presumably summarises R's built-in `cars` dataset rather than the survey
# data. Looks like leftover template code; confirm whether it can be removed.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00