library(tidyr)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
library(tidyquant)  
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(quantmod)
# Load the survey file with readr, letting it guess the column types.
survey_data <- readr::read_csv(file = "multipleChoiceResponses.csv")
## Rows: 212 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variable
## dbl (1): num_levels
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Second copy of the same file, used for the parsing diagnostics that follow.
dat <- readr::read_csv(file = "multipleChoiceResponses.csv")
## Rows: 212 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variable
## dbl (1): num_levels
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show any parsing problems readr recorded while reading `dat`.
readr::problems(dat)
## # A tibble: 0 × 5
## # ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>
# Read the file again, this time declaring the parser for selected columns
# instead of relying on type guessing. Columns not listed are still guessed.
dat <- read_csv(
  file = "multipleChoiceResponses.csv",
  col_types = cols(
    GenderSelect = col_character(),
    Country = col_character(),
    EmploymentStatus = col_character(),
    Age = col_double(),
    LearningCategorySelftTaught = col_double()
  )
)
## Warning: The following named parsers don't match the column names:
## GenderSelect, Country, EmploymentStatus, Age, LearningCategorySelftTaught
# Back to a plain read with guessed column types; this replaces the `dat`
# produced by the explicit col_types call.
dat <- readr::read_csv(file = "multipleChoiceResponses.csv")
## Rows: 212 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variable
## dbl (1): num_levels
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the full column specification readr used for `dat`.
readr::spec(dat)
## cols(
##   variable = col_character(),
##   num_levels = col_double()
## )
# Read the same file with base R's CSV reader (comma separator stated
# explicitly) and inspect the structure of the resulting data.frame.
dat_comma <- utils::read.csv(
  file = "multipleChoiceResponses.csv",
  sep = ","
)

utils::str(dat_comma)
## 'data.frame':    212 obs. of  2 variables:
##  $ variable  : chr  "GenderSelect" "Country" "EmploymentStatus" "StudentStatus" ...
##  $ num_levels: int  4 52 7 2 3 2 2 16 3 91 ...
# Keep only the survey columns whose names begin with one of the prefixes of
# interest, then preview the result.
#
# The first prefix was previously spelled "Leaning", which cannot match the
# "Learning*" columns (e.g. LearningCategorySelftTaught), so that group was
# silently dropped from the selection.
# NOTE(review): confirm the data really has columns prefixed "Working";
# starts_with() returns nothing (without error) for a prefix that never occurs.
selected_data <- survey_data %>%
  select(
    starts_with("Learning"),
    starts_with("Working"),
    starts_with("Age"),
    starts_with("EmployerIndustry"),
    starts_with("CurrentJob"),
    starts_with("MLMethod"),
    starts_with("Formal")
  )
glimpse(selected_data)
## Rows: 212
## Columns: 0
# Convert every character column of `dat` to a factor. across(where(...)) is
# the current dplyr replacement for the superseded mutate_if().
dat <- dat %>%
  mutate(across(where(is.character), as.factor))

# Count the levels of each factor column; non-factor columns get NA so they
# can be filtered out below. vapply() guarantees exactly one integer per
# column, whereas sapply() changes its return type on empty input.
factor_levels <- vapply(
  dat,
  function(x) if (is.factor(x)) nlevels(x) else NA_integer_,
  integer(1)
)

# One row per column of `dat`: its name and its number of factor levels.
# Names are taken from `dat` itself so the frame keeps both columns even when
# `dat` has no columns at all.
factor_levels_df <- data.frame(
  variable = names(dat),
  num_levels = factor_levels,
  stringsAsFactors = FALSE
)

# Drop the rows that correspond to non-factor columns.
factor_levels_df <- factor_levels_df[!is.na(factor_levels_df$num_levels), ]

print(factor_levels_df)
##          variable num_levels
## variable variable        212
# Persist the per-variable level counts to their own file.
# This must NOT be "multipleChoiceResponses.csv": that is the raw survey file
# this script reads, and writing the two-column summary there replaces the
# survey responses, so every later run parses only `variable` / `num_levels`.
write.csv(factor_levels_df, "factor_levels.csv", row.names = FALSE)
# Order the variables from most to fewest factor levels and keep the first
# five rows (fewer if the table is shorter).
top_factors <- head(
  arrange(factor_levels_df, desc(num_levels)),
  5
)

print(top_factors)
##          variable num_levels
## variable variable        212