hm

library(tidyr)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(forcats)
library(tidyquant)

## Loading required package: lubridate

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

## Loading required package: PerformanceAnalytics

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

## Loading required package: quantmod

## Loading required package: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(quantmod)

# Load the survey export; column types are left for readr to guess.
survey_data <- read_csv(file = "multipleChoiceResponses.csv")

## Rows: 212 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variable
## dbl (1): num_levels
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Parse the same file a second time into `dat`, again with guessed types.
dat <- read_csv(file = "multipleChoiceResponses.csv")

## Rows: 212 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variable
## dbl (1): num_levels
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show any parsing problems recorded while reading `dat`.
readr::problems(x = dat)

## # A tibble: 0 × 5
## # ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>

# Re-read the file with explicit parsers for five named columns instead of
# relying on type guessing.
# NOTE(review): the "Selft" spelling is kept exactly as written; presumably it
# mirrors the header in the raw survey export -- confirm against the file.
dat <- read_csv(
  file = "multipleChoiceResponses.csv",
  col_types = cols(
    GenderSelect = col_character(),
    Country = col_character(),
    EmploymentStatus = col_character(),
    Age = col_double(),
    LearningCategorySelftTaught = col_double()
  )
)

## Warning: The following named parsers don't match the column names:
## GenderSelect, Country, EmploymentStatus, Age, LearningCategorySelftTaught

# Replace `dat` with a fresh read that uses guessed column types again.
dat <- read_csv(file = "multipleChoiceResponses.csv")

## Rows: 212 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variable
## dbl (1): num_levels
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print the column specification readr attached to `dat`.
readr::spec(x = dat)

## cols(
##   variable = col_character(),
##   num_levels = col_double()
## )

# Read the same file with base R's CSV reader and an explicit comma separator.
dat_comma <- read.csv(file = "multipleChoiceResponses.csv", sep = ",")

# Compact overview of the base-R data frame: dimensions and column types.
str(object = dat_comma)

## 'data.frame':    212 obs. of  2 variables:
##  $ variable  : chr  "GenderSelect" "Country" "EmploymentStatus" "StudentStatus" ...
##  $ num_levels: int  4 52 7 2 3 2 2 16 3 91 ...

# Keep only the column families of interest, matched by name prefix.
# The learning prefix is spelled "Learning" (cf. LearningCategorySelftTaught in
# the explicit column specification earlier in this script); the previous
# "Leaning" was a typo and matched none of the columns named in this script.
selected_data <- survey_data %>%
  select(
    starts_with("Learning"),
    starts_with("Working"),
    starts_with("Age"),
    starts_with("EmployerIndustry"),
    starts_with("CurrentJob"),
    starts_with("MLMethod"),
    starts_with("Formal")
  )
glimpse(selected_data)

## Rows: 212
## Columns: 0

# Convert every character column of `dat` to a factor.
# across(where(...)) replaces the superseded mutate_if().
dat <- dat %>%
  mutate(across(where(is.character), as.factor))

# Number of levels per column; non-factor columns get NA so they can be
# dropped below. vapply() pins the result to an integer vector (sapply()'s
# return type depends on its input), hence the typed NA_integer_.
factor_levels <- vapply(
  dat,
  function(x) if (is.factor(x)) nlevels(x) else NA_integer_,
  integer(1)
)

# One row per column of `dat`: its name and its level count.
factor_levels_df <- data.frame(
  variable = names(factor_levels),
  num_levels = factor_levels,
  stringsAsFactors = FALSE
)

# Keep factor columns only (non-factors were marked NA above).
factor_levels_df <- factor_levels_df[!is.na(factor_levels_df$num_levels), ]

print(factor_levels_df)

##          variable num_levels
## variable variable        212

# Save the summary to its own file. The previous target,
# "multipleChoiceResponses.csv", is the file this script reads as its input,
# so writing there replaced the survey data with this two-column summary and
# every later run then read the summary instead of the survey.
# NOTE(review): if the input file has already been overwritten, restore the
# raw survey export before re-running.
write.csv(factor_levels_df, "factor_levels.csv", row.names = FALSE)

# The (up to) five variables with the most factor levels, largest first.
top_factors <- head(arrange(factor_levels_df, desc(num_levels)), n = 5)

print(top_factors)

##          variable num_levels
## variable variable        212

hm

2023-12-19