library(tidyr)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(quantmod)
# Load the survey responses ----
# The file is parsed once with readr and the result is shared by `survey_data`
# and `dat`. Previously the same file was re-read for every assignment, and a
# read with an explicit `col_types` spec was discarded by an immediate re-read
# with guessed types, so only the guessed-type result was ever used.
survey_path <- "multipleChoiceResponses.csv"
survey_data <- read_csv(survey_path, show_col_types = FALSE)
dat <- survey_data

# Columns the analysis expects in the raw responses. If any are absent, the
# file on disk is not the survey export, so say so instead of silently
# carrying on with the wrong data.
# NOTE(review): the console output recorded with this script showed a
# 212 x 2 file with columns `variable` and `num_levels`, i.e. none of the
# columns below were present -- confirm the input file is the original data.
expected_cols <- c(
  "GenderSelect",
  "Country",
  "EmploymentStatus",
  "Age",
  "LearningCategorySelftTaught"
)
absent_cols <- expected_cols[!(expected_cols %in% names(dat))]
if (length(absent_cols) > 0) {
  warning(
    "Columns missing from ", survey_path, ": ",
    paste(absent_cols, collapse = ", "),
    call. = FALSE
  )
}

# Parsing diagnostics and the column specification readr settled on.
problems(dat)
spec(dat)

# Same file through the base R reader, for comparison with the readr result.
dat_comma <- read.csv(survey_path, sep = ",")
str(dat_comma)
## 'data.frame': 212 obs. of 2 variables:
## $ variable : chr "GenderSelect" "Country" "EmploymentStatus" "StudentStatus" ...
## $ num_levels: int 4 52 7 2 3 2 2 16 3 91 ...
# Keep only the survey columns used in the analysis, matched by name prefix.
# The first prefix was previously misspelled "Leaning", which can never match
# the "Learning..." columns (e.g. LearningCategorySelftTaught).
# NOTE(review): confirm that columns beginning with "Working" exist in the raw
# export; if they are named "Work...", that prefix selects nothing.
selected_data <- survey_data %>%
  select(
    starts_with("Learning"),
    starts_with("Working"),
    starts_with("Age"),
    starts_with("EmployerIndustry"),
    starts_with("CurrentJob"),
    starts_with("MLMethod"),
    starts_with("Formal")
  )
glimpse(selected_data)
## Rows: 212
## Columns: 0
# Convert every character column of `dat` to a factor. `across()` with
# `where()` replaces the superseded `mutate_if()`.
dat <- dat %>%
  mutate(across(where(is.character), as.factor))

# Number of levels per column; NA marks columns that are not factors.
# `vapply()` guarantees an integer vector whatever `dat` contains, whereas
# `sapply()` changes its return type with the input.
factor_levels <- vapply(
  dat,
  function(x) if (is.factor(x)) nlevels(x) else NA_integer_,
  integer(1)
)

# One row per column; row names come from the named `factor_levels` vector.
factor_levels_df <- data.frame(
  variable = names(dat),
  num_levels = factor_levels,
  stringsAsFactors = FALSE
)

# Drop the non-factor columns and show the result.
factor_levels_df <- factor_levels_df[!is.na(factor_levels_df$num_levels), ]
print(factor_levels_df)
## variable num_levels
## variable variable 212
# Persist the per-column level counts to their own file. Writing them to
# "multipleChoiceResponses.csv" replaced the raw survey data that this script
# reads as its input, so every later run loaded the summary instead of the
# survey responses.
write.csv(factor_levels_df, "factor_levels.csv", row.names = FALSE)
# Order the level counts from largest to smallest and keep the first five
# rows, i.e. the columns with the most distinct levels.
top_factors <- head(arrange(factor_levels_df, desc(num_levels)), n = 5)
print(top_factors)
## variable num_levels
## variable variable 212