# URL of the raw multipleChoiceResponses.csv file on GitHub.
linkMC <-
  "https://raw.githubusercontent.com/betsyrosalen/DATA_607_Project_3/master/project3_master/rawdata/multipleChoiceResponses.csv"

# Import the multiple-choice items; read_csv() infers the column types.
MC <- read_csv(linkMC)
## Parsed with column specification:
## cols(
## .default = col_character(),
## Age = col_integer(),
## LearningCategorySelftTaught = col_integer(),
## LearningCategoryOnlineCourses = col_integer(),
## LearningCategoryWork = col_integer(),
## LearningCategoryUniversity = col_double(),
## LearningCategoryKaggle = col_double(),
## LearningCategoryOther = col_integer(),
## TimeGatheringData = col_integer(),
## TimeModelBuilding = col_integer(),
## TimeProduction = col_integer(),
## TimeVisualizing = col_integer(),
## TimeFindingInsights = col_integer(),
## TimeOtherSelect = col_integer()
## )
## See spec(...) for full column specifications.
# Keep only the two columns of interest and drop every row in which either
# of them is NA, since missing answers are not meaningful here.
subset <- MC %>%
  select(FormalEducation, MLMethodNextYearSelect) %>%
  filter(!is.na(FormalEducation) & !is.na(MLMethodNextYearSelect))
First, we plot the distribution of formal education in the dataset.
# Horizontal bar chart of row counts per FormalEducation value. Each bar is
# filled by its own category, so the legend is switched off.
ggplot(data = subset) +
  geom_bar(
    aes(x = FormalEducation, fill = FormalEducation),
    show.legend = FALSE
  ) +
  coord_flip()

The dataset predominantly contains candidates with a Master's degree.
Now let's look at the different ML/DS methods in the dataset.
# List the distinct values of MLMethodNextYearSelect.
unique(subset[["MLMethodNextYearSelect"]])
## [1] "Random Forests"
## [2] "Deep learning"
## [3] "Neural Nets"
## [4] "Text Mining"
## [5] "Genetic & Evolutionary Algorithms"
## [6] "Link Analysis"
## [7] "Rule Induction"
## [8] "Regression"
## [9] "Proprietary Algorithms"
## [10] "I don't plan on learning a new ML/DS method"
## [11] "Ensemble Methods (e.g. boosting, bagging)"
## [12] "Factor Analysis"
## [13] "Social Network Analysis"
## [14] "Monte Carlo Methods"
## [15] "Time Series Analysis"
## [16] "Other"
## [17] "Bayesian Methods"
## [18] "Survival Analysis"
## [19] "MARS"
## [20] "Anomaly Detection"
## [21] "Cluster Analysis"
## [22] "Decision Trees"
## [23] "Association Rules"
## [24] "Uplift Modeling"
## [25] "Support Vector Machines (SVM)"
Now we can plot the distribution of ML/DS methods within each formal education level, using the same bar chart layout.
# Horizontal bars, one per FormalEducation value, split by
# MLMethodNextYearSelect. position = "fill" stretches every bar to the same
# length so the segments show proportions rather than counts.
ggplot(data = subset) +
  geom_bar(
    aes(x = FormalEducation, fill = MLMethodNextYearSelect),
    position = "fill"
  ) +
  coord_flip() +
  theme(legend.position = "bottom")

# Same variables as a dodged chart: within each FormalEducation value the
# MLMethodNextYearSelect counts are drawn as side-by-side bars.
ggplot(data = subset) +
  geom_bar(
    aes(x = FormalEducation, fill = MLMethodNextYearSelect),
    position = "dodge"
  ) +
  coord_flip() +
  theme(legend.position = "bottom")

# Method(s) whose share we want to report per education level.
algo <- "Deep learning"

# Count rows per (FormalEducation, MLMethodNextYearSelect) pair while staying
# grouped by FormalEducation, so sum(n) is the total for that education level
# and freq is the method's share within it. Only the selected method(s) are
# kept in the printed result.
subset %>%
  group_by(FormalEducation) %>%
  count(MLMethodNextYearSelect) %>%
  mutate(freq = n / sum(n)) %>%
  filter(MLMethodNextYearSelect %in% algo)
## # A tibble: 7 x 4
## # Groups: FormalEducation [7]
## FormalEducation MLMethodNextYe… n freq
## <chr> <chr> <int> <dbl>
## 1 Bachelor's degree Deep learning 1339 0.401
## 2 Doctoral degree Deep learning 764 0.436
## 3 I did not complete any formal education pas… Deep learning 68 0.386
## 4 I prefer not to answer Deep learning 28 0.500
## 5 Master's degree Deep learning 1825 0.397
## 6 Professional degree Deep learning 118 0.377
## 7 Some college/university study without earni… Deep learning 212 0.375