Data Source and Description

Kaggle conducted an industry-wide survey to establish a comprehensive view of the state of data science and machine learning. The survey received over 16,000 responses and we learned a ton about who is working with data, what’s happening at the cutting edge of machine learning across industries, and how new data scientists can best break into the field.

You can download the data here.

Who Are Data Scientists?

R code for this graph:

# Load the tidyverse, which supplies the pipe, dplyr verbs, stringr helpers
# and ggplot2 used below.
library(tidyverse)

# Location of the survey's multiple-choice responses; adjust to your own copy.
survey_path <- "E:\\R_project\\Kaggle\\ds_survey\\multipleChoiceResponses.csv"

# Read text columns as character vectors (not factors) so they can be
# recoded with string functions later on.
df_survey <- read.csv(survey_path, stringsAsFactors = FALSE)

# Keep only respondents whose employment status is exactly
# "Employed full-time".
df_fullTime <- df_survey %>%
  filter(EmploymentStatus == "Employed full-time")



# Number of full-time respondents per major, used for Figure 1.
#
# Result `m` has one row per (recoded) major with columns:
#   MajorSelect - factor whose levels follow ascending `value`, so the bars
#                 are drawn in sorted order
#   value       - number of respondents in that major
m <- df_fullTime %>%
  # Respondents who left the major blank are excluded.
  filter(MajorSelect != "") %>%
  # Collapse the long survey wording into short display labels. Conditions
  # are checked top to bottom; anything unmatched keeps its original text.
  # Both "never ..." and "Other" answers are reported as "Unknown".
  mutate(MajorSelect = case_when(
    str_detect(MajorSelect, "health") ~ "Health Science",
    str_detect(MajorSelect, "social") ~ "Social Science",
    str_detect(MajorSelect, "human") ~ "Humanities",
    str_detect(MajorSelect, "non-com") ~ "Engineering*",
    str_detect(MajorSelect, "Fine") ~ "Fine/Performing Arts",
    str_detect(MajorSelect, "never") ~ "Unknown",
    str_detect(MajorSelect, "Infor") ~ "IT/SA/NET",
    str_detect(MajorSelect, "Mana") ~ "MIS",
    str_detect(MajorSelect, "Math") ~ "Mathematics/Statistics",
    str_detect(MajorSelect, "Other") ~ "Unknown",
    TRUE ~ MajorSelect
  )) %>%
  # Counting after the recode merges categories that share a label.
  group_by(MajorSelect) %>%
  count(name = "value") %>%
  ungroup() %>%
  arrange(value) %>%
  # Freeze the sorted order into the factor levels for plotting.
  mutate(MajorSelect = factor(MajorSelect, levels = MajorSelect))


# hrbrthemes provides theme_ft_rc(), the dark theme used for both figures.
library(hrbrthemes)

# Shared styling constants for the figures.
my_colors <- c("#8C3F4D")
my_font <- "Roboto Condensed"

# Figure 1: horizontal bar chart of respondent counts per major.
m %>%
  ggplot(aes(MajorSelect, value)) +
  geom_col(fill = my_colors, color = my_colors, width = 0.8) +
  coord_flip() +
  # Bars with more than 203 respondents get the count printed inside the
  # bar end (hjust > 1); the remaining bars get it just past the bar end.
  geom_text(
    data = filter(m, value > 203),
    aes(label = value),
    hjust = 1.1, color = "white", size = 5.5, family = my_font
  ) +
  geom_text(
    data = filter(m, value <= 203),
    aes(label = value),
    hjust = -0.1, color = "white", size = 5.5, family = my_font
  ) +
  theme_ft_rc() +
  theme(
    panel.grid = element_blank(),
    # After coord_flip() the count axis is horizontal; its tick labels are
    # hidden because every bar already carries its value.
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "white", size = 16, family = my_font),
    plot.title = element_text(size = 28),
    plot.subtitle = element_text(family = my_font, size = 16, color = "grey80"),
    plot.caption = element_text(family = my_font, size = 13, face = "italic"),
    plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")
  ) +
  # `value` is numeric, so the y scale must be continuous; the small
  # multiplicative expansion keeps the bars close to the axis.
  scale_y_continuous(expand = c(0.01, 0)) +
  labs(
    x = NULL, y = NULL,
    title = "Figure 1: Who Are Data Scientists?",
    subtitle = "Electrical Engineering and Computer Science are excluded from Engineering*\nand interviewees were employed full-time.",
    caption = "Data Source: Kaggle Data Science Survey"
  )

Median Salary by Major

R code for this graph:

# Median compensation per major, used for Figure 2.
#
# Result `n` has one row per (recoded) major with columns:
#   MajorSelect - factor whose levels follow ascending `med_income`
#   med_income  - median of CompensationAmount / 1000, ignoring NA
#   label       - `med_income` rounded to a whole number, for bar labels
#
# NOTE(review): amounts are taken as entered; nothing here converts them to
# a common currency - confirm that this is intended.
n <- df_fullTime %>%
  # Respondents who left the major blank are excluded.
  filter(MajorSelect != "") %>%
  mutate(
    # Strip thousands separators before converting; entries that are still
    # not numeric become NA and are ignored by the median below.
    Com = as.numeric(str_replace_all(CompensationAmount, "\\,", "")) / 1000,
    # Collapse the long survey wording into short display labels. Conditions
    # are checked top to bottom; anything unmatched keeps its original text.
    MajorSelect = case_when(
      str_detect(MajorSelect, "health") ~ "Health Science",
      str_detect(MajorSelect, "social") ~ "Social Science",
      str_detect(MajorSelect, "human") ~ "Humanities",
      str_detect(MajorSelect, "non-com") ~ "Engineering*",
      str_detect(MajorSelect, "Fine") ~ "Fine/Performing Arts",
      str_detect(MajorSelect, "never") ~ "Unknown",
      str_detect(MajorSelect, "Infor") ~ "IT/SA/NET",
      str_detect(MajorSelect, "Mana") ~ "MIS",
      str_detect(MajorSelect, "Math") ~ "Mathematics/Statistics",
      str_detect(MajorSelect, "Other") ~ "Unknown",
      TRUE ~ MajorSelect
    )
  ) %>%
  group_by(MajorSelect) %>%
  summarise(med_income = median(Com, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(med_income) %>%
  # Freeze the sorted order into the factor levels for plotting.
  mutate(
    MajorSelect = factor(MajorSelect, levels = MajorSelect),
    label = round(med_income, 0)
  )



# Figure 2: horizontal bar chart of median compensation per major.
# Relies on `my_colors` and `my_font` (styling constants) and on hrbrthemes
# being attached for theme_ft_rc().
n %>%
  ggplot(aes(MajorSelect, med_income)) +
  geom_col(fill = my_colors, color = my_colors, width = 0.8) +
  coord_flip() +
  # Print the rounded median inside the end of each bar (hjust > 1).
  geom_text(
    aes(label = label),
    hjust = 1.1, color = "white", size = 5.5, family = my_font
  ) +
  theme_ft_rc() +
  theme(
    panel.grid = element_blank(),
    # After coord_flip() the value axis is horizontal; its tick labels are
    # hidden because every bar already carries its value.
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "white", size = 16, family = my_font),
    plot.title = element_text(size = 28),
    plot.subtitle = element_text(family = my_font, size = 16, color = "grey80"),
    plot.caption = element_text(family = my_font, size = 13, face = "italic"),
    plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")
  ) +
  # `med_income` is numeric, so the y scale must be continuous; the small
  # multiplicative expansion keeps the bars close to the axis.
  scale_y_continuous(expand = c(0.01, 0)) +
  labs(
    x = NULL, y = NULL,
    title = "Figure 2: Median Salary for DS by Major",
    subtitle = "Electrical Engineering and Computer Science are excluded from Engineering*\nand interviewees were employed full-time.",
    caption = "Data Source: Kaggle Data Science Survey"
  )
