Kaggle conducted an industry-wide survey to establish a comprehensive view of the state of data science and machine learning. The survey received over 16,000 responses and we learned a ton about who is working with data, what’s happening at the cutting edge of machine learning across industries, and how new data scientists can best break into the field.
You can download the data here.
R code for this graph:
# Script setup.
# Deliberately no rm(list = ls()) here: clearing the global environment would
# destroy whatever else the user has in their session, and nothing below
# depends on starting from an empty workspace.
library(tidyverse)

# Read the multiple-choice survey responses, keeping text columns as character
# vectors rather than factors.
# NOTE(review): the path is an absolute, machine-specific Windows path; point
# it at your own copy of multipleChoiceResponses.csv before running.
df_survey <- read.csv(
  "E:\\R_project\\Kaggle\\ds_survey\\multipleChoiceResponses.csv",
  stringsAsFactors = FALSE
)
# df2: the most frequently chosen answers to MLToolNextYearSelect, one row per
# answer with its respondent count `n`, ordered from most to least common.
df2 <- df_survey %>%
  as_tibble() %>%
  # Tally each distinct answer and sort by descending frequency.
  count(MLToolNextYearSelect, sort = TRUE) %>%
  # Drop the single most frequent answer.
  # NOTE(review): presumably this is the blank (unanswered) response - confirm.
  slice(-1) %>%
  # Remove the answer(s) whose text contains "I don".
  filter(!str_detect(MLToolNextYearSelect, "I don")) %>%
  # Keep at most the 14 most common remaining answers.
  slice_head(n = 14) %>%
  rename(tool = MLToolNextYearSelect) %>%
  # Replace a few answers with fixed display labels; everything else is kept
  # as-is. The first matching pattern wins.
  mutate(
    tool = case_when(
      str_detect(tool, "Spa") ~ "Spark",
      str_detect(tool, "Jup") ~ "Jupyter Notebooks",
      str_detect(tool, "Web") ~ "Amazon Web Services",
      str_detect(tool, "IBM") ~ "IBM Watson Analytics",
      str_detect(tool, "Micro") ~ "Microsoft Azure ML",
      TRUE ~ tool
    )
  )
# df3: how often each ML technique was selected, one row per technique with
# its count `n`, ordered from most to least common.
#
# MLTechniquesSelect stores a comma-separated list of techniques per
# respondent. The raw column is split directly so that every respondent's
# selection is counted; collapsing the column to its distinct answer
# combinations first would count unique combinations instead of respondents.
df3 <- df_survey %>%
  pull(MLTechniquesSelect) %>%
  # One element per selected technique.
  str_split(",") %>%
  unlist() %>%
  # Entries of the form "<family> - <technique>" are broken into both parts,
  # each of which is tallied separately.
  str_split("-") %>%
  unlist() %>%
  str_squish() %>%
  tibble(Model = .) %>%
  # Drop missing and empty tokens explicitly (blank responses produce "")
  # instead of relying on them being the most frequent row.
  filter(!is.na(Model), Model != "") %>%
  # Drop the free-text "Other ..." option; splitting on "-" leaves a stray
  # fragment containing "colon" from that option's wording.
  # NOTE(review): inferred from the filter patterns - confirm against the data.
  filter(!str_detect(Model, "colon"), !str_detect(Model, "Other")) %>%
  # Map abbreviations and variants onto one display label each, so that
  # variants are merged when counted below. The first matching pattern wins.
  mutate(
    Model = case_when(
      str_detect(Model, "Gradient") ~ "Gradient Boosting Machine",
      str_detect(Model, "Support") ~ "Support Vector Machines",
      str_detect(Model, "CNN") ~ "Convolutional Neural Networks",
      str_detect(Model, "RNN") ~ "Recurrent Neural Networks",
      str_detect(Model, "GAN") ~ "Generative Adversarial Networks",
      TRUE ~ Model
    )
  ) %>%
  count(Model, name = "n", sort = TRUE)
library(hrbrthemes)
library(extrafont)
library(gridExtra)
# Plot styling shared by the bar charts drawn by my_bar().
# Colour applied to both the fill and the outline of the bars.
my_colors <- "#3E606F"
# Font family used for the value labels and the category axis text.
my_font <- "Roboto Condensed"
# Draw a horizontal bar chart with the value printed inside each bar.
#
# Args:
#   df_selected: data frame with exactly two columns; the first holds the
#     category labels and the second the bar lengths. The columns are renamed
#     internally, so their original names do not matter.
#   bar_color: fill and outline colour of the bars (defaults to `my_colors`).
#   font_family: font family for the value labels and category axis text
#     (defaults to `my_font`).
#
# Returns:
#   A ggplot object; bars are ordered by value, largest at the top.
my_bar <- function(df_selected, bar_color = my_colors, font_family = my_font) {
  stopifnot(is.data.frame(df_selected), ncol(df_selected) == 2L)
  names(df_selected) <- c("Model", "value")

  # Fix the factor levels in ascending value order so that, after the flip,
  # the largest bar sits at the top. unique() keeps factor() from failing
  # when the same label appears more than once.
  plot_data <- df_selected %>%
    arrange(value) %>%
    mutate(Model = factor(Model, levels = unique(Model)))

  ggplot(plot_data, aes(Model, value)) +
    geom_col(fill = bar_color, color = bar_color, width = 0.8) +
    coord_flip() +
    # hjust > 1 places the label just inside the end of the bar.
    geom_text(
      aes(label = value),
      hjust = 1.1, color = "white", size = 6, family = font_family
    ) +
    theme_ft_rc() +
    theme(
      panel.grid = element_blank(),
      axis.text.x = element_blank(),
      axis.text.y = element_text(
        color = "white", size = 14, family = font_family
      ),
      plot.title = element_text(size = 20)
    ) +
    # `value` is a continuous aesthetic, so the padding is set on the
    # continuous y scale rather than on a discrete one.
    scale_y_continuous(expand = c(0.01, 0)) +
    labs(x = NULL, y = NULL)
}
# Build one bar chart per summary table and give each its figure title.
p1 <- my_bar(df3) + labs(title = "Figure 1: Most Used ML Models")
p2 <- my_bar(df2) + labs(title = "Figure 2: Best Tools Next Year")

# Show the two figures side by side in a single row.
grid.arrange(p1, p2, nrow = 1)