Kaggle conducted an industry-wide survey to establish a comprehensive view of the state of data science and machine learning. The survey received over 16,000 responses and we learned a ton about who is working with data, what’s happening at the cutting edge of machine learning across industries, and how new data scientists can best break into the field.
You can download the data here.
R code for this graph:
# Prepare tool-usage counts from the Kaggle survey responses ----
# The workspace is intentionally NOT cleared with rm(list = ls()): wiping the
# caller's global environment is a destructive side effect and does not reset
# loaded packages or options anyway. Run the script in a fresh session instead.
library(tidyverse)

# Raw multiple-choice survey responses. Strings are kept as character so the
# blank ("") answers can be filtered out below.
df_survey <- read.csv(
  "E:\\R_project\\Kaggle\\ds_survey\\multipleChoiceResponses.csv",
  stringsAsFactors = FALSE
)

# Every column of interest is named <tool_prefix><tool>.
tool_prefix <- "WorkToolsFrequency"

# Tool suffixes of the frequency columns to summarise (each listed once).
tool_used <- c(
  "IBMSPSSModeler", "IBMSPSSStatistics", "C", "MATLAB", "Statistica",
  "Java", "Julia", "Excel", "MicrosoftSQL", "TensorFlow", "Spark",
  "NoSQL", "Oracle", "Perl", "Python", "R", "DataRobot",
  "RapidMinerCommercial", "RapidMinerFree", "SASBase", "Mathematica",
  "SASEnterprise", "SASJMP", "SQL", "Tableau"
)

# Answer categories, ordered from least to most frequent use; this order
# becomes the factor level order used when plotting.
response_levels <- c("Rarely", "Sometimes", "Often", "Most of the time")

# df1: one row per (tool, response) pair with `n`, the number of respondents
# who gave that answer. The result is an ungrouped tibble.
df1 <- df_survey %>%
  # all_of() makes the use of an external character vector explicit and
  # errors clearly if an expected column is missing.
  select(all_of(paste0(tool_prefix, tool_used))) %>%
  pivot_longer(everything(), names_to = "tool", values_to = "response") %>%
  # Drop blank answers (NA responses are dropped by filter() as well).
  filter(response != "") %>%
  # Strip the shared column prefix so only the tool name remains.
  mutate(tool = str_remove(tool, paste0("^", tool_prefix))) %>%
  count(tool, response) %>%
  mutate(response = factor(response, levels = response_levels))
library(extrafont)
# Figure 1: usage frequency of each tool, one facet per tool ----
# my_colors[1] highlights selected tools; my_colors[2] is the default bar fill.
my_colors <- c("#8C3F4D", "#3E606F")
my_font <- "Roboto Condensed"

# Tools whose bars are drawn in the highlight colour.
highlighted_tools <- c("Python", "R", "SQL")

df1 %>%
  ggplot(aes(response, n)) +
  # A single bar layer: fill is chosen per row by whether the tool is
  # highlighted, instead of over-drawing one extra layer per highlighted tool.
  geom_col(aes(fill = tool %in% highlighted_tools), show.legend = FALSE) +
  scale_fill_manual(
    values = c("TRUE" = my_colors[1], "FALSE" = my_colors[2])
  ) +
  facet_wrap(~ tool, strip.position = "top") +
  coord_flip() +
  # Count labels placed just past the end of each bar.
  geom_text(
    data = df1 %>%
      filter(!(tool == "Python" & response == "Most of the time")),
    aes(response, n, label = n),
    hjust = -0.1,
    family = my_font
  ) +
  # The Python / "Most of the time" label is drawn inside its bar in white.
  # NOTE(review): presumably because this is the longest bar and, with no axis
  # expansion, an outside label would be clipped; confirm against the data.
  # It uses the same font family as every other label.
  geom_text(
    data = df1 %>%
      filter(tool == "Python", response == "Most of the time"),
    aes(response, n, label = n),
    hjust = 1.2,
    color = "white",
    family = my_font
  ) +
  # No padding between the bars and the axis they start from.
  scale_y_continuous(expand = c(0, 0)) +
  theme(
    plot.background = element_rect(fill = "#f5f5f2", color = NA),
    panel.background = element_rect(fill = "#f5f5f2", color = NA),
    panel.grid = element_blank(),
    strip.text.x = element_text(
      colour = "gray30", face = "bold", size = 12, family = my_font
    ),
    # Counts are printed on the bars, so the count axis text is hidden.
    axis.text.x = element_blank(),
    axis.text.y = element_text(family = my_font, size = 12),
    axis.ticks = element_blank(),
    plot.title = element_text(family = my_font, size = 22),
    plot.caption = element_text(family = my_font, size = 10, face = "italic"),
    plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "Figure 1: Most Used Tools for Data Science",
    caption = "Data Source: Kaggle Data Science Survey"
  )