library(tidyr)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(quantmod)
# Load the survey responses used for the analysis further down.
# With default type guessing, WorkToolsFrequencyAngoss and
# WorkToolsFrequencyKNIMECommercial are guessed as logical, so their text
# answers ("Rarely", "Often", ...) fail to parse and are replaced by NA.
# Declaring them as character keeps those answers; every column that is not
# listed is still guessed.
# NOTE: the console output recorded below was captured with default guessing
# and will differ after re-running (supplying col_types also suppresses the
# column-specification message).
survey_data <- read_csv(
  "multipleChoiceResponses.csv",
  col_types = cols(
    WorkToolsFrequencyAngoss = col_character(),
    WorkToolsFrequencyKNIMECommercial = col_character()
  )
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num (1): CompensationAmount
## lgl (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the file a second time into `dat`, with default type guessing, so the
# parsing failures can be inspected with problems().
dat <- read_csv(file = "multipleChoiceResponses.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num (1): CompensationAmount
## lgl (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the values readr could not parse (row, column, expected vs. actual).
dat %>% problems()
## # A tibble: 65 × 5
## row col expected actual file
## <int> <int> <chr> <chr> <chr>
## 1 297 84 1/0/T/F/TRUE/FALSE Rarely ""
## 2 673 84 1/0/T/F/TRUE/FALSE Most of the time ""
## 3 1210 207 a number - ""
## 4 1317 99 1/0/T/F/TRUE/FALSE Rarely ""
## 5 1595 99 1/0/T/F/TRUE/FALSE Often ""
## 6 2444 99 1/0/T/F/TRUE/FALSE Most of the time ""
## 7 2467 99 1/0/T/F/TRUE/FALSE Most of the time ""
## 8 2623 99 1/0/T/F/TRUE/FALSE Often ""
## 9 2631 99 1/0/T/F/TRUE/FALSE Sometimes ""
## 10 2725 99 1/0/T/F/TRUE/FALSE Often ""
## # ℹ 55 more rows
# Re-read with explicit column types to address the issues reported by
# problems(): columns 84 and 99 (WorkToolsFrequencyAngoss and
# WorkToolsFrequencyKNIMECommercial) hold text answers but are guessed as
# logical, so they are declared as character here. Columns that are not
# listed are still guessed.
# The "-" entry in CompensationAmount (row 1210) is not a number, so it still
# triggers a parsing warning and is read as NA.
dat <- read_csv(
  "multipleChoiceResponses.csv",
  col_types = cols(
    GenderSelect = col_character(),
    Country = col_character(),
    EmploymentStatus = col_character(),
    Age = col_double(),
    LearningCategorySelftTaught = col_double(),
    WorkToolsFrequencyAngoss = col_character(),
    WorkToolsFrequencyKNIMECommercial = col_character()
  )
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Re-read with default type guessing (replacing the previous `dat`) so that
# spec() reports the guessed type of every column.
dat <- read_csv(file = "multipleChoiceResponses.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num (1): CompensationAmount
## lgl (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the full column specification readr used for `dat`.
dat %>% spec()
## cols(
## GenderSelect = col_character(),
## Country = col_character(),
## Age = col_double(),
## EmploymentStatus = col_character(),
## StudentStatus = col_character(),
## LearningDataScience = col_character(),
## CodeWriter = col_character(),
## CareerSwitcher = col_character(),
## CurrentJobTitleSelect = col_character(),
## TitleFit = col_character(),
## CurrentEmployerType = col_character(),
## MLToolNextYearSelect = col_character(),
## MLMethodNextYearSelect = col_character(),
## LanguageRecommendationSelect = col_character(),
## PublicDatasetsSelect = col_character(),
## LearningPlatformSelect = col_character(),
## LearningPlatformUsefulnessArxiv = col_character(),
## LearningPlatformUsefulnessBlogs = col_character(),
## LearningPlatformUsefulnessCollege = col_character(),
## LearningPlatformUsefulnessCompany = col_character(),
## LearningPlatformUsefulnessConferences = col_character(),
## LearningPlatformUsefulnessFriends = col_character(),
## LearningPlatformUsefulnessKaggle = col_character(),
## LearningPlatformUsefulnessNewsletters = col_character(),
## LearningPlatformUsefulnessCommunities = col_character(),
## LearningPlatformUsefulnessDocumentation = col_character(),
## LearningPlatformUsefulnessCourses = col_character(),
## LearningPlatformUsefulnessProjects = col_character(),
## LearningPlatformUsefulnessPodcasts = col_character(),
## LearningPlatformUsefulnessSO = col_character(),
## LearningPlatformUsefulnessTextbook = col_character(),
## LearningPlatformUsefulnessTradeBook = col_character(),
## LearningPlatformUsefulnessTutoring = col_character(),
## LearningPlatformUsefulnessYouTube = col_character(),
## BlogsPodcastsNewslettersSelect = col_character(),
## LearningDataScienceTime = col_character(),
## JobSkillImportanceBigData = col_character(),
## JobSkillImportanceDegree = col_character(),
## JobSkillImportanceStats = col_character(),
## JobSkillImportanceEnterpriseTools = col_character(),
## JobSkillImportancePython = col_character(),
## JobSkillImportanceR = col_character(),
## JobSkillImportanceSQL = col_character(),
## JobSkillImportanceKaggleRanking = col_character(),
## JobSkillImportanceMOOC = col_character(),
## JobSkillImportanceVisualizations = col_character(),
## JobSkillImportanceOtherSelect1 = col_character(),
## JobSkillImportanceOtherSelect2 = col_character(),
## JobSkillImportanceOtherSelect3 = col_character(),
## CoursePlatformSelect = col_character(),
## HardwarePersonalProjectsSelect = col_character(),
## TimeSpentStudying = col_character(),
## ProveKnowledgeSelect = col_character(),
## DataScienceIdentitySelect = col_character(),
## FormalEducation = col_character(),
## MajorSelect = col_character(),
## Tenure = col_character(),
## PastJobTitlesSelect = col_character(),
## FirstTrainingSelect = col_character(),
## LearningCategorySelftTaught = col_double(),
## LearningCategoryOnlineCourses = col_double(),
## LearningCategoryWork = col_double(),
## LearningCategoryUniversity = col_double(),
## LearningCategoryKaggle = col_double(),
## LearningCategoryOther = col_double(),
## MLSkillsSelect = col_character(),
## MLTechniquesSelect = col_character(),
## ParentsEducation = col_character(),
## EmployerIndustry = col_character(),
## EmployerSize = col_character(),
## EmployerSizeChange = col_character(),
## EmployerMLTime = col_character(),
## EmployerSearchMethod = col_character(),
## UniversityImportance = col_character(),
## JobFunctionSelect = col_character(),
## WorkHardwareSelect = col_character(),
## WorkDataTypeSelect = col_character(),
## WorkProductionFrequency = col_character(),
## WorkDatasetSize = col_character(),
## WorkAlgorithmsSelect = col_character(),
## WorkToolsSelect = col_character(),
## WorkToolsFrequencyAmazonML = col_character(),
## WorkToolsFrequencyAWS = col_character(),
## WorkToolsFrequencyAngoss = col_logical(),
## WorkToolsFrequencyC = col_character(),
## WorkToolsFrequencyCloudera = col_character(),
## WorkToolsFrequencyDataRobot = col_character(),
## WorkToolsFrequencyFlume = col_character(),
## WorkToolsFrequencyGCP = col_character(),
## WorkToolsFrequencyHadoop = col_character(),
## WorkToolsFrequencyIBMCognos = col_character(),
## WorkToolsFrequencyIBMSPSSModeler = col_character(),
## WorkToolsFrequencyIBMSPSSStatistics = col_character(),
## WorkToolsFrequencyIBMWatson = col_character(),
## WorkToolsFrequencyImpala = col_character(),
## WorkToolsFrequencyJava = col_character(),
## WorkToolsFrequencyJulia = col_character(),
## WorkToolsFrequencyJupyter = col_character(),
## WorkToolsFrequencyKNIMECommercial = col_logical(),
## WorkToolsFrequencyKNIMEFree = col_character(),
## WorkToolsFrequencyMathematica = col_character(),
## WorkToolsFrequencyMATLAB = col_character(),
## WorkToolsFrequencyAzure = col_character(),
## WorkToolsFrequencyExcel = col_character(),
## WorkToolsFrequencyMicrosoftRServer = col_character(),
## WorkToolsFrequencyMicrosoftSQL = col_character(),
## WorkToolsFrequencyMinitab = col_character(),
## WorkToolsFrequencyNoSQL = col_character(),
## WorkToolsFrequencyOracle = col_character(),
## WorkToolsFrequencyOrange = col_character(),
## WorkToolsFrequencyPerl = col_character(),
## WorkToolsFrequencyPython = col_character(),
## WorkToolsFrequencyQlik = col_character(),
## WorkToolsFrequencyR = col_character(),
## WorkToolsFrequencyRapidMinerCommercial = col_character(),
## WorkToolsFrequencyRapidMinerFree = col_character(),
## WorkToolsFrequencySalfrod = col_character(),
## WorkToolsFrequencySAPBusinessObjects = col_character(),
## WorkToolsFrequencySASBase = col_character(),
## WorkToolsFrequencySASEnterprise = col_character(),
## WorkToolsFrequencySASJMP = col_character(),
## WorkToolsFrequencySpark = col_character(),
## WorkToolsFrequencySQL = col_character(),
## WorkToolsFrequencyStan = col_character(),
## WorkToolsFrequencyStatistica = col_character(),
## WorkToolsFrequencyTableau = col_character(),
## WorkToolsFrequencyTensorFlow = col_character(),
## WorkToolsFrequencyTIBCO = col_character(),
## WorkToolsFrequencyUnix = col_character(),
## WorkToolsFrequencySelect1 = col_character(),
## WorkToolsFrequencySelect2 = col_character(),
## WorkFrequencySelect3 = col_character(),
## WorkMethodsSelect = col_character(),
## `WorkMethodsFrequencyA/B` = col_character(),
## WorkMethodsFrequencyAssociationRules = col_character(),
## WorkMethodsFrequencyBayesian = col_character(),
## WorkMethodsFrequencyCNNs = col_character(),
## WorkMethodsFrequencyCollaborativeFiltering = col_character(),
## `WorkMethodsFrequencyCross-Validation` = col_character(),
## WorkMethodsFrequencyDataVisualization = col_character(),
## WorkMethodsFrequencyDecisionTrees = col_character(),
## WorkMethodsFrequencyEnsembleMethods = col_character(),
## WorkMethodsFrequencyEvolutionaryApproaches = col_character(),
## WorkMethodsFrequencyGANs = col_character(),
## WorkMethodsFrequencyGBM = col_character(),
## WorkMethodsFrequencyHMMs = col_character(),
## WorkMethodsFrequencyKNN = col_character(),
## WorkMethodsFrequencyLiftAnalysis = col_character(),
## WorkMethodsFrequencyLogisticRegression = col_character(),
## WorkMethodsFrequencyMLN = col_character(),
## WorkMethodsFrequencyNaiveBayes = col_character(),
## WorkMethodsFrequencyNLP = col_character(),
## WorkMethodsFrequencyNeuralNetworks = col_character(),
## WorkMethodsFrequencyPCA = col_character(),
## WorkMethodsFrequencyPrescriptiveModeling = col_character(),
## WorkMethodsFrequencyRandomForests = col_character(),
## WorkMethodsFrequencyRecommenderSystems = col_character(),
## WorkMethodsFrequencyRNNs = col_character(),
## WorkMethodsFrequencySegmentation = col_character(),
## WorkMethodsFrequencySimulation = col_character(),
## WorkMethodsFrequencySVMs = col_character(),
## WorkMethodsFrequencyTextAnalysis = col_character(),
## WorkMethodsFrequencyTimeSeriesAnalysis = col_character(),
## WorkMethodsFrequencySelect1 = col_character(),
## WorkMethodsFrequencySelect2 = col_character(),
## WorkMethodsFrequencySelect3 = col_character(),
## TimeGatheringData = col_double(),
## TimeModelBuilding = col_double(),
## TimeProduction = col_double(),
## TimeVisualizing = col_double(),
## TimeFindingInsights = col_double(),
## TimeOtherSelect = col_double(),
## AlgorithmUnderstandingLevel = col_character(),
## WorkChallengesSelect = col_character(),
## WorkChallengeFrequencyPolitics = col_character(),
## WorkChallengeFrequencyUnusedResults = col_character(),
## WorkChallengeFrequencyUnusefulInstrumenting = col_character(),
## WorkChallengeFrequencyDeployment = col_character(),
## WorkChallengeFrequencyDirtyData = col_character(),
## WorkChallengeFrequencyExplaining = col_character(),
## WorkChallengeFrequencyPass = col_character(),
## WorkChallengeFrequencyIntegration = col_character(),
## WorkChallengeFrequencyTalent = col_character(),
## WorkChallengeFrequencyDataFunds = col_character(),
## WorkChallengeFrequencyDomainExpertise = col_character(),
## WorkChallengeFrequencyML = col_character(),
## WorkChallengeFrequencyTools = col_character(),
## WorkChallengeFrequencyExpectations = col_character(),
## WorkChallengeFrequencyITCoordination = col_character(),
## WorkChallengeFrequencyHiringFunds = col_character(),
## WorkChallengeFrequencyPrivacy = col_character(),
## WorkChallengeFrequencyScaling = col_character(),
## WorkChallengeFrequencyEnvironments = col_character(),
## WorkChallengeFrequencyClarity = col_character(),
## WorkChallengeFrequencyDataAccess = col_character(),
## WorkChallengeFrequencyOtherSelect = col_character(),
## WorkDataVisualizations = col_character(),
## WorkInternalVsExternalTools = col_character(),
## WorkMLTeamSeatSelect = col_character(),
## WorkDatasets = col_character(),
## WorkDatasetsChallenge = col_character(),
## WorkDataStorage = col_character(),
## WorkDataSharing = col_character(),
## WorkDataSourcing = col_character(),
## WorkCodeSharing = col_character(),
## RemoteWork = col_character(),
## CompensationAmount = col_number(),
## CompensationCurrency = col_character(),
## SalaryChange = col_character(),
## JobSatisfaction = col_character(),
## JobSearchResource = col_character(),
## JobHuntTime = col_character(),
## JobFactorLearning = col_character(),
## JobFactorSalary = col_character(),
## JobFactorOffice = col_character(),
## JobFactorLanguages = col_character(),
## JobFactorCommute = col_character(),
## JobFactorManagement = col_character(),
## JobFactorExperienceLevel = col_character(),
## JobFactorDepartment = col_character(),
## JobFactorTitle = col_character(),
## JobFactorCompanyFunding = col_character(),
## JobFactorImpact = col_character(),
## JobFactorRemote = col_character(),
## JobFactorIndustry = col_character(),
## JobFactorLeaderReputation = col_character(),
## JobFactorDiversity = col_character(),
## JobFactorPublishingOpportunity = col_character()
## )
# Base-R read of the same file for comparison: read.csv() returns a plain
# data.frame, and empty fields in character columns stay as "".
dat_comma <- read.csv(file = "multipleChoiceResponses.csv", sep = ",")
# Compact overview of each column's type and first values.
str(object = dat_comma)
## 'data.frame': 16716 obs. of 228 variables:
## $ GenderSelect : chr "Non-binary, genderqueer, or gender non-conforming" "Female" "Male" "Male" ...
## $ Country : chr "" "United States" "Canada" "United States" ...
## $ Age : int NA 30 28 56 38 46 35 22 43 33 ...
## $ EmploymentStatus : chr "Employed full-time" "Not employed, but looking for work" "Not employed, but looking for work" "Independent contractor, freelancer, or self-employed" ...
## $ StudentStatus : chr "" "" "" "" ...
## $ LearningDataScience : chr "" "" "" "" ...
## $ CodeWriter : chr "Yes" "" "" "Yes" ...
## $ CareerSwitcher : chr "" "" "" "" ...
## $ CurrentJobTitleSelect : chr "DBA/Database Engineer" "" "" "Operations Research Practitioner" ...
## $ TitleFit : chr "Fine" "" "" "Poorly" ...
## $ CurrentEmployerType : chr "Employed by a company that doesn't perform advanced analytics,Employed by non-profit or NGO" "" "" "Self-employed" ...
## $ MLToolNextYearSelect : chr "SAS Base" "Python" "Amazon Web services" "TensorFlow" ...
## $ MLMethodNextYearSelect : chr "Random Forests" "Random Forests" "Deep learning" "Neural Nets" ...
## $ LanguageRecommendationSelect : chr "F#" "Python" "R" "Python" ...
## $ PublicDatasetsSelect : chr "Dataset aggregator/platform (i.e. Socrata/Kaggle Datasets/data.world/etc.),University/Non-profit research group websites,Other" "Dataset aggregator/platform (i.e. Socrata/Kaggle Datasets/data.world/etc.)" "Dataset aggregator/platform (i.e. Socrata/Kaggle Datasets/data.world/etc.)" "I collect my own data (e.g. web-scraping)" ...
## $ LearningPlatformSelect : chr "College/University,Conferences,Podcasts,Trade book" "Kaggle" "Arxiv,College/University,Kaggle,Online courses,YouTube Videos" "Blogs,College/University,Conferences,Friends network,Official documentation,Online courses,Personal Projects" ...
## $ LearningPlatformUsefulnessArxiv : chr "" "" "Very useful" "" ...
## $ LearningPlatformUsefulnessBlogs : chr "" "" "" "Very useful" ...
## $ LearningPlatformUsefulnessCollege : chr "" "" "Somewhat useful" "Very useful" ...
## $ LearningPlatformUsefulnessCompany : chr "" "" "" "" ...
## $ LearningPlatformUsefulnessConferences : chr "Very useful" "" "" "Very useful" ...
## $ LearningPlatformUsefulnessFriends : chr "" "" "" "Very useful" ...
## $ LearningPlatformUsefulnessKaggle : chr "" "Somewhat useful" "Somewhat useful" "" ...
## $ LearningPlatformUsefulnessNewsletters : chr "" "" "" "" ...
## $ LearningPlatformUsefulnessCommunities : chr "" "" "" "" ...
## $ LearningPlatformUsefulnessDocumentation : chr "" "" "" "Very useful" ...
## $ LearningPlatformUsefulnessCourses : chr "" "" "Very useful" "Very useful" ...
## $ LearningPlatformUsefulnessProjects : chr "" "" "" "Very useful" ...
## $ LearningPlatformUsefulnessPodcasts : chr "Very useful" "" "" "" ...
## $ LearningPlatformUsefulnessSO : chr "" "" "" "" ...
## $ LearningPlatformUsefulnessTextbook : chr "" "" "" "" ...
## $ LearningPlatformUsefulnessTradeBook : chr "Somewhat useful" "" "" "" ...
## $ LearningPlatformUsefulnessTutoring : chr "" "" "" "" ...
## $ LearningPlatformUsefulnessYouTube : chr "" "" "Very useful" "" ...
## $ BlogsPodcastsNewslettersSelect : chr "Becoming a Data Scientist Podcast,Data Machina Newsletter,O'Reilly Data Newsletter,Partially Derivative Podcast"| __truncated__ "Becoming a Data Scientist Podcast,Siraj Raval YouTube Channel" "FastML Blog,No Free Hunch Blog,Talking Machines Podcast" "KDnuggets Blog" ...
## $ LearningDataScienceTime : chr "" "1-2 years" "1-2 years" "" ...
## $ JobSkillImportanceBigData : chr "" "" "Necessary" "" ...
## $ JobSkillImportanceDegree : chr "" "Nice to have" "" "" ...
## $ JobSkillImportanceStats : chr "" "Unnecessary" "" "" ...
## $ JobSkillImportanceEnterpriseTools : chr "" "" "" "" ...
## $ JobSkillImportancePython : chr "" "Unnecessary" "" "" ...
## $ JobSkillImportanceR : chr "" "" "Necessary" "" ...
## $ JobSkillImportanceSQL : chr "" "Necessary" "" "" ...
## $ JobSkillImportanceKaggleRanking : chr "" "" "" "" ...
## $ JobSkillImportanceMOOC : chr "" "" "" "" ...
## $ JobSkillImportanceVisualizations : chr "" "" "" "" ...
## $ JobSkillImportanceOtherSelect1 : chr "" "" "" "" ...
## $ JobSkillImportanceOtherSelect2 : chr "" "" "" "" ...
## $ JobSkillImportanceOtherSelect3 : chr "" "" "" "" ...
## $ CoursePlatformSelect : chr "" "" "Coursera,edX" "" ...
## $ HardwarePersonalProjectsSelect : chr "" "" "Basic laptop (Macbook)" "" ...
## $ TimeSpentStudying : chr "" "2 - 10 hours" "2 - 10 hours" "" ...
## $ ProveKnowledgeSelect : chr "" "Master's degree" "Github Portfolio" "" ...
## $ DataScienceIdentitySelect : chr "Yes" "Yes" "Yes" "Yes" ...
## $ FormalEducation : chr "Bachelor's degree" "Master's degree" "Master's degree" "Master's degree" ...
## $ MajorSelect : chr "Management information systems" "Computer Science" "Engineering (non-computer focused)" "Mathematics or statistics" ...
## $ Tenure : chr "More than 10 years" "Less than a year" "3 to 5 years" "More than 10 years" ...
## $ PastJobTitlesSelect : chr "Predictive Modeler,Programmer,Researcher" "Software Developer/Software Engineer" "Data Scientist,Machine Learning Engineer" "Business Analyst,Operations Research Practitioner,Predictive Modeler,Programmer,Other" ...
## $ FirstTrainingSelect : chr "University courses" "University courses" "University courses" "University courses" ...
## $ LearningCategorySelftTaught : num 0 10 20 30 60 45 40 0 70 10 ...
## $ LearningCategoryOnlineCourses : num 0 30 50 0 5 25 0 40 0 70 ...
## $ LearningCategoryWork : num 100 0 0 40 5 20 0 0 30 15 ...
## $ LearningCategoryUniversity : num 0 30 30 30 30 0 50 50 0 0 ...
## $ LearningCategoryKaggle : num 0 30 0 0 0 10 10 10 0 5 ...
## $ LearningCategoryOther : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MLSkillsSelect : chr "Computer Vision,Natural Language Processing,Supervised Machine Learning (Tabular Data),Time Series" "Computer Vision,Supervised Machine Learning (Tabular Data),Unsupervised Learning" "Adversarial Learning,Computer Vision,Natural Language Processing" "Recommendation Engines,Reinforcement learning,Supervised Machine Learning (Tabular Data),Survival Analysis,Time"| __truncated__ ...
## $ MLTechniquesSelect : chr "Evolutionary Approaches,Neural Networks - GANs,Neural Networks - RNNs" "Bayesian Techniques,Decision Trees - Gradient Boosted Machines,Decision Trees - Random Forests,Logistic Regress"| __truncated__ "Decision Trees - Random Forests,Ensemble Methods,Neural Networks - CNNs,Support Vector Machines (SVMs)" "Bayesian Techniques,Decision Trees - Gradient Boosted Machines,Decision Trees - Random Forests,Ensemble Methods"| __truncated__ ...
## $ ParentsEducation : chr "A doctoral degree" "A bachelor's degree" "A bachelor's degree" "High school" ...
## $ EmployerIndustry : chr "Internet-based" "" "" "Mix of fields" ...
## $ EmployerSize : chr "100 to 499 employees" "" "" "" ...
## $ EmployerSizeChange : chr "Increased slightly" "" "" "" ...
## $ EmployerMLTime : chr "3-5 years" "" "" "" ...
## $ EmployerSearchMethod : chr "I visited the company's Web site and found a job listing there" "" "" "" ...
## $ UniversityImportance : chr "Not very important" "" "" "Very important" ...
## $ JobFunctionSelect : chr "Build prototypes to explore applying machine learning to new areas" "" "" "Analyze and understand data to influence product or business decisions" ...
## $ WorkHardwareSelect : chr "Gaming Laptop (Laptop + CUDA capable GPU),Workstation + Cloud service" "" "" "Laptop + Cloud service (AWS, Azure, GCE ...)" ...
## $ WorkDataTypeSelect : chr "Text data,Relational data" "" "" "Relational data" ...
## $ WorkProductionFrequency : chr "Rarely" "" "" "Always" ...
## $ WorkDatasetSize : chr "10GB" "" "" "1GB" ...
## $ WorkAlgorithmsSelect : chr "Neural Networks,Random Forests,RNNs" "" "" "Bayesian Techniques,Decision Trees,Random Forests,Regression/Logistic Regression" ...
## $ WorkToolsSelect : chr "Amazon Web services,Oracle Data Mining/ Oracle R Enterprise,Perl" "" "" "Amazon Machine Learning,Amazon Web services,Cloudera,Hadoop/Hive/Pig,Impala,Java,Mathematica,MATLAB/Octave,Micr"| __truncated__ ...
## $ WorkToolsFrequencyAmazonML : chr "" "" "" "Rarely" ...
## $ WorkToolsFrequencyAWS : chr "Rarely" "" "" "Often" ...
## $ WorkToolsFrequencyAngoss : chr "" "" "" "" ...
## $ WorkToolsFrequencyC : chr "" "" "" "" ...
## $ WorkToolsFrequencyCloudera : chr "" "" "" "Rarely" ...
## $ WorkToolsFrequencyDataRobot : chr "" "" "" "" ...
## $ WorkToolsFrequencyFlume : chr "" "" "" "" ...
## $ WorkToolsFrequencyGCP : chr "" "" "" "" ...
## $ WorkToolsFrequencyHadoop : chr "" "" "" "Rarely" ...
## $ WorkToolsFrequencyIBMCognos : chr "" "" "" "" ...
## $ WorkToolsFrequencyIBMSPSSModeler : chr "" "" "" "" ...
## $ WorkToolsFrequencyIBMSPSSStatistics : chr "" "" "" "" ...
## $ WorkToolsFrequencyIBMWatson : chr "" "" "" "" ...
## $ WorkToolsFrequencyImpala : chr "" "" "" "Rarely" ...
## $ WorkToolsFrequencyJava : chr "" "" "" "Rarely" ...
## $ WorkToolsFrequencyJulia : chr "" "" "" "" ...
## $ WorkToolsFrequencyJupyter : chr "" "" "" "" ...
## $ WorkToolsFrequencyKNIMECommercial : chr "" "" "" "" ...
## [list output truncated]
# Keep the columns of interest from the survey: learning-related and
# work-related questions, age, employer industry, current job title,
# the ML method planned for next year, and formal education.
# The prefixes "Leaning" and "Working" matched no column (the names start
# with "Learning..." and "Work..."), so those groups were silently dropped;
# starts_with() does not error when nothing matches.
# NOTE: the glimpse() output recorded below predates this fix and shows only
# the 5 columns that matched before.
selected_data <- survey_data %>%
  select(
    starts_with("Learning"),
    starts_with("Work"),
    starts_with("Age"),
    starts_with("EmployerIndustry"),
    starts_with("CurrentJob"),
    starts_with("MLMethod"),
    starts_with("Formal")
  )
glimpse(selected_data)
## Rows: 16,716
## Columns: 5
## $ Age <dbl> NA, 30, 28, 56, 38, 46, 35, 22, 43, 33, 20, 27,…
## $ EmployerIndustry <chr> "Internet-based", NA, NA, "Mix of fields", "Tec…
## $ CurrentJobTitleSelect <chr> "DBA/Database Engineer", NA, NA, "Operations Re…
## $ MLMethodNextYearSelect <chr> "Random Forests", "Random Forests", "Deep learn…
## $ FormalEducation <chr> "Bachelor's degree", "Master's degree", "Master…