Data Source and Description

Kaggle conducted an industry-wide survey to establish a comprehensive view of the state of data science and machine learning. The survey received over 16,000 responses and we learned a ton about who is working with data, what’s happening at the cutting edge of machine learning across industries, and how new data scientists can best break into the field.

You can download the data here.

Who Are Data Scientists?

R code for this graph:

# Setup: load packages and the survey data.
# Deliberately no `rm(list = ls())` here: wiping the caller's workspace is a
# hidden side effect and does not reset loaded packages or options. Restart R
# if a clean session is needed.
library(tidyverse)

# Multiple-choice survey responses. The path is specific to the author's
# machine; point it at your local copy of multipleChoiceResponses.csv.
df_survey <- read.csv(
  "E:\\R_project\\Kaggle\\ds_survey\\multipleChoiceResponses.csv",
  stringsAsFactors = FALSE
)

# Keep only respondents whose EmploymentStatus is exactly
# "Employed full-time"; both figures below are built from this subset.
df_fullTime <- df_survey %>%
  filter(EmploymentStatus == "Employed full-time")



# Number of full-time respondents per (recoded) major, used for Figure 1.
# Result `m` has one row per major label with its respondent count in `value`.
# MajorSelect is turned into a factor whose levels follow ascending `value`,
# so the bars are drawn in that order.
m <- df_fullTime %>%
  count(MajorSelect) %>%
  # Drop respondents with a blank major (rows with NA are dropped as well).
  filter(MajorSelect != "") %>%
  # Shorten the raw answer text. Patterns are case-sensitive regular
  # expressions, the first matching branch wins, and answers matching none of
  # them keep their original text.
  mutate(
    MajorSelect = case_when(
      str_detect(MajorSelect, "health") ~ "Health Science",
      str_detect(MajorSelect, "social") ~ "Social Science",
      str_detect(MajorSelect, "human") ~ "Humanities",
      str_detect(MajorSelect, "non-com") ~ "Engineering*",
      str_detect(MajorSelect, "Fine") ~ "Fine/Performing Arts",
      str_detect(MajorSelect, "never") ~ "Unknown",
      str_detect(MajorSelect, "Infor") ~ "IT/SA/NET",
      str_detect(MajorSelect, "Mana") ~ "MIS",
      str_detect(MajorSelect, "Math") ~ "Mathematics/Statistics",
      str_detect(MajorSelect, "Other") ~ "Unknown",
      TRUE ~ MajorSelect
    )
  ) %>%
  # Several raw answers map to the same label ("Unknown"), so the per-answer
  # counts have to be summed again per label.
  group_by(MajorSelect) %>%
  summarise(value = sum(n)) %>%
  ungroup() %>%
  arrange(value) %>%
  mutate(MajorSelect = factor(MajorSelect, levels = MajorSelect))


library(hrbrthemes)

# Shared styling for both figures: bar colour and text font family.
my_colors <- c("#8C3F4D")
my_font <- "Roboto Condensed"

# Figure 1: horizontal bar chart of respondent counts per major, read from `m`.
m %>%
  ggplot(aes(MajorSelect, value)) +
  geom_col(fill = my_colors, color = my_colors, width = 0.8) +
  coord_flip() +
  # Counts above 203 are right-aligned just inside the end of their bar
  # (hjust = 1.1); smaller counts are placed just past the end of the bar
  # (hjust = -0.1).
  geom_text(
    aes(label = value, hjust = ifelse(value > 203, 1.1, -0.1)),
    color = "white", size = 5.5, family = my_font
  ) +
  theme_ft_rc() +
  # Overrides must come after theme_ft_rc(), which would otherwise reset them.
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "white", size = 16, family = my_font),
    plot.title = element_text(size = 28),
    plot.subtitle = element_text(family = my_font, size = 16, color = "grey80"),
    plot.caption = element_text(family = my_font, size = 13, face = "italic"),
    plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")
  ) +
  # `value` is numeric, so the y scale is a continuous one; the small
  # multiplicative expansion keeps the bars close to the axis.
  scale_y_continuous(expand = c(0.01, 0)) +
  labs(
    x = NULL, y = NULL,
    title = "Figure 1: Who Are Data Scientists?",
    subtitle = "Electrical Engineering and Computer Science are excluded from Engineering*\nand interviewees were employed full-time.",
    caption = "Data Source: Kaggle Data Science Survey"
  )

Median Salary by Major

R code for this graph:

# Median compensation per (recoded) major, used for Figure 2.
# Result `n` has one row per major label with `med_income` (median of
# CompensationAmount divided by 1000) and `label` (that median rounded to a
# whole number). MajorSelect is a factor ordered by ascending `med_income`.
# NOTE(review): CompensationAmount is used as reported; if respondents answered
# in different currencies, these medians mix them -- confirm before comparing.
n <- df_fullTime %>%
  # Drop respondents with a blank major (rows with NA are dropped as well).
  filter(MajorSelect != "") %>%
  mutate(
    # Strip thousands separators before converting. Entries that still are not
    # numeric become NA (with a coercion warning) and are skipped by the
    # median below.
    Com = as.numeric(str_replace_all(CompensationAmount, ",", "")) / 1000,
    # Shorten the raw answer text. Patterns are case-sensitive regular
    # expressions, the first matching branch wins, and answers matching none
    # of them keep their original text.
    MajorSelect = case_when(
      str_detect(MajorSelect, "health") ~ "Health Science",
      str_detect(MajorSelect, "social") ~ "Social Science",
      str_detect(MajorSelect, "human") ~ "Humanities",
      str_detect(MajorSelect, "non-com") ~ "Engineering*",
      str_detect(MajorSelect, "Fine") ~ "Fine/Performing Arts",
      str_detect(MajorSelect, "never") ~ "Unknown",
      str_detect(MajorSelect, "Infor") ~ "IT/SA/NET",
      str_detect(MajorSelect, "Mana") ~ "MIS",
      str_detect(MajorSelect, "Math") ~ "Mathematics/Statistics",
      str_detect(MajorSelect, "Other") ~ "Unknown",
      TRUE ~ MajorSelect
    )
  ) %>%
  group_by(MajorSelect) %>%
  summarise(med_income = median(Com, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(med_income) %>%
  mutate(
    MajorSelect = factor(MajorSelect, levels = MajorSelect),
    label = round(med_income, 0)
  )



# Figure 2: horizontal bar chart of the median compensation per major, read
# from `n`. Uses `my_colors` and `my_font` for bar colour and font family.
n %>%
  ggplot(aes(MajorSelect, med_income)) +
  geom_col(fill = my_colors, color = my_colors, width = 0.8) +
  coord_flip() +
  # Rounded medians are right-aligned just inside the end of each bar.
  geom_text(
    aes(label = label),
    hjust = 1.1, color = "white", size = 5.5, family = my_font
  ) +
  theme_ft_rc() +
  # Overrides must come after theme_ft_rc(), which would otherwise reset them.
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "white", size = 16, family = my_font),
    plot.title = element_text(size = 28),
    plot.subtitle = element_text(family = my_font, size = 16, color = "grey80"),
    plot.caption = element_text(family = my_font, size = 13, face = "italic"),
    plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")
  ) +
  # `med_income` is numeric, so the y scale is a continuous one; the small
  # multiplicative expansion keeps the bars close to the axis.
  scale_y_continuous(expand = c(0.01, 0)) +
  labs(
    x = NULL, y = NULL,
    title = "Figure 2: Median Salary for DS by Major",
    subtitle = "Electrical Engineering and Computer Science are excluded from Engineering*\nand interviewees were employed full-time.",
    caption = "Data Source: Kaggle Data Science Survey"
  )
LS0tDQp0aXRsZTogIkthZ2dsZSBEYXRhIFNjaWVuY2UgU3VydmV5IChNZWRpYW4gU2FsYXJ5IGJ5IE1ham9yKSINCmF1dGhvcjogIk5ndXllbiBDaGkgRHVuZyINCnN1YnRpdGxlOiAiRGFpbHkgR3JhcGggU2VyaWVzIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIGNvZGVfZG93bmxvYWQ6IHllcw0KICAgICMgY29kZV9mb2xkaW5nOiBoaWRlDQogICAgaGlnaGxpZ2h0OiB6ZW5idXJuDQogICAgdGhlbWU6IGZsYXRseQ0KICAgIHRvYzogeWVzDQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgd29yZF9kb2N1bWVudDoNCiAgICB0b2M6IHllcw0KLS0tDQoNCmBgYHtyIHNldHVwLGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUsIHdhcm5pbmcgPSBGQUxTRSwgbWVzc2FnZSA9IEZBTFNFLCBmaWcucmV0aW5hPTIpDQpgYGANCg0KIyBEYXRhIFNvdXJjZSBhbmQgRGVzY3JpcHRpb24NCg0KS2FnZ2xlIGNvbmR1Y3RlZCBhbiBpbmR1c3RyeS13aWRlIHN1cnZleSB0byBlc3RhYmxpc2ggYSBjb21wcmVoZW5zaXZlIHZpZXcgb2YgdGhlIHN0YXRlIG9mIGRhdGEgc2NpZW5jZSBhbmQgbWFjaGluZSBsZWFybmluZy4gVGhlIHN1cnZleSByZWNlaXZlZCBvdmVyIDE2LDAwMCByZXNwb25zZXMgYW5kIHdlIGxlYXJuZWQgYSB0b24gYWJvdXQgd2hvIGlzIHdvcmtpbmcgd2l0aCBkYXRhLCB3aGF04oCZcyBoYXBwZW5pbmcgYXQgdGhlIGN1dHRpbmcgZWRnZSBvZiBtYWNoaW5lIGxlYXJuaW5nIGFjcm9zcyBpbmR1c3RyaWVzLCBhbmQgaG93IG5ldyBkYXRhIHNjaWVudGlzdHMgY2FuIGJlc3QgYnJlYWsgaW50byB0aGUgZmllbGQuDQoNCllvdSBjYW4gZG93bG9hZCBkYXRhIFtoZXJlXShodHRwczovL3d3dy5rYWdnbGUuY29tL2thZ2dsZS9rYWdnbGUtc3VydmV5LTIwMTcvZGF0YSkuIA0KDQojIFdobyBBcmUgRGF0YSBTY2llbnRpc3RzPw0KDQohW10oQzpcVXNlcnNcWmJvb2tcRGVza3RvcFxwaWNccGljMS5qcGcpDQoNClIgY29kZXMgZm9yIHRoaXMgZ3JhcGg6IA0KDQpgYGB7ciwgZXZhbD1GQUxTRX0NCg0Kcm0obGlzdCA9IGxzKCkpDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCg0KZGZfc3VydmV5IDwtIHJlYWQuY3N2KCJFOlxcUl9wcm9qZWN0XFxLYWdnbGVcXGRzX3N1cnZleVxcbXVsdGlwbGVDaG9pY2VSZXNwb25zZXMuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFKQ0KDQpkZl9zdXJ2ZXkgJT4lIA0KICBmaWx0ZXIoRW1wbG95bWVudFN0YXR1cyA9PSAiRW1wbG95ZWQgZnVsbC10aW1lIikgLT4gZGZfZnVsbFRpbWUNCg0KDQoNCmRmX2Z1bGxUaW1lICU+JSANCiAgZ3JvdXBfYnkoTWFqb3JTZWxlY3QpICU+JSANCiAgY291bnQoKSAlPiUgDQogIHVuZ3JvdXAoKSAlPiUgDQogIGZpbHRlcihzdHJfY291bnQoTWFqb3JTZWxlY3QpICE9IDApICU+JSANCiAgbXV0YXRlKE1ham9yU2VsZWN0ID0gY2FzZV93aGVuKHN0cl9kZXRlY3QoTWFqb3JTZWxlY3QsICJoZWFsdGgiKSB+ICJIZWFsdGggU2NpZW5jZSIs
IA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgInNvY2lhbCIpIH4gIlNvY2lhbCBTY2llbmNlIiwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzdHJfZGV0ZWN0KE1ham9yU2VsZWN0LCAiaHVtYW4iKSB+ICJIdW1hbml0aWVzIiwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzdHJfZGV0ZWN0KE1ham9yU2VsZWN0LCAibm9uLWNvbSIpIH4gIkVuZ2luZWVyaW5nKiIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgIkZpbmUiKSB+ICJGaW5lL1BlcmZvcm1pbmcgQXJ0cyIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgIm5ldmVyIikgfiAiVW5rbm93biIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgIkluZm9yIikgfiAiSVQvU0EvTkVUIiwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzdHJfZGV0ZWN0KE1ham9yU2VsZWN0LCAiTWFuYSIpIH4gIk1JUyIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgIk1hdGgiKSB+ICJNYXRoZW1hdGljcy9TdGF0aXN0aWNzIiwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzdHJfZGV0ZWN0KE1ham9yU2VsZWN0LCAiT3RoZXIiKSB+ICJVbmtub3duIiwgVFJVRSB+IE1ham9yU2VsZWN0KSkgJT4lIA0KICBncm91cF9ieShNYWpvclNlbGVjdCkgJT4lIA0KICBzdW1tYXJpc2UodmFsdWUgPSBzdW0obikpICU+JSANCiAgdW5ncm91cCgpICU+JSANCiAgYXJyYW5nZSh2YWx1ZSkgJT4lIA0KICBtdXRhdGUoTWFqb3JTZWxlY3QgPSBmYWN0b3IoTWFqb3JTZWxlY3QsIGxldmVscyA9IE1ham9yU2VsZWN0KSkgLT4gbQ0KDQoNCmxpYnJhcnkoaHJicnRoZW1lcykNCm15X2NvbG9ycyA8LSBjKCIjOEMzRjREIikNCm15X2ZvbnQgPC0gIlJvYm90byBDb25kZW5zZWQiDQoNCg0KbSAlPiUgDQogIGdncGxvdChhZXMoTWFqb3JTZWxlY3QsIHZhbHVlKSkgKw0KICBnZW9tX2NvbChmaWxsID0gbXlfY29sb3JzLCBjb2xvciA9IG15X2NvbG9ycywgd2lkdGggPSAwLjgpICsNCiAgY29vcmRfZmxpcCgpICsNCiAgZ2VvbV90ZXh0KGRhdGEgPSBtICU+JSBmaWx0ZXIodmFsdWUgPiAyMDMpLCBhZXMobGFiZWwgPSB2YWx1ZSksIGhqdXN0ID0gMS4xLCBjb2xvciA9ICJ3aGl0ZSIsIHNpemUgPSA1LjUsIGZhbWlseSA9IG15X2ZvbnQpICsgDQogIGdlb21fdGV4dChkYXRhID0gbSAlPiUgZmlsdGVyKHZhbHVlIDw9IDIwMyksIGFlcyhsYWJlbCA9IHZhbHVlKSwgaGp1c3QgPSAtMC4xLCBjb2xvciA9ICJ3aGl0ZSIsIHNpemUgPSA1LjUsIGZhbWlseSA9IG15X2ZvbnQpICsgDQogIHRoZW1lX2Z0X3JjKCkgKyANCiAgdGhlbWUocGFuZWwuZ3JpZCA9IGVsZW1lbnRf
YmxhbmsoKSkgKyANCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X2JsYW5rKCkpICsgDQogIHRoZW1lKGF4aXMudGV4dC55ID0gZWxlbWVudF90ZXh0KGNvbG9yID0gIndoaXRlIiwgc2l6ZSA9IDE2LCBmYW1pbHkgPSBteV9mb250KSkgKyANCiAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChzaXplID0gMjgpKSArIA0KICB0aGVtZShwbG90LnN1YnRpdGxlID0gZWxlbWVudF90ZXh0KGZhbWlseSA9IG15X2ZvbnQsIHNpemUgPSAxNiwgY29sb3IgPSAiZ3JleTgwIikpICsgDQogIHRoZW1lKHBsb3QuY2FwdGlvbiA9IGVsZW1lbnRfdGV4dChmYW1pbHkgPSBteV9mb250LCBzaXplID0gMTMsIGZhY2UgPSAiaXRhbGljIikpICsgDQogIHNjYWxlX3lfZGlzY3JldGUoZXhwYW5kID0gYygwLjAxLCAwKSkgKyANCiAgdGhlbWUocGxvdC5tYXJnaW4gPSB1bml0KGMoMS4yLCAxLjIsIDEuMiwgMS4yKSwgImNtIikpICsgDQogIGxhYnMoeCA9IE5VTEwsIHkgPSBOVUxMLCANCiAgICAgICB0aXRsZSA9ICJGaWd1cmUgMTogV2hvIEFyZSBEYXRhIFNjaWVudGlzdHM/IiwgDQogICAgICAgc3VidGl0bGUgPSAiRWxlY3RyaWNhbCBFbmdpbmVlcmluZyBhbmQgQ29tcHV0ZXIgU2NpZW5jZSBhcmUgZXhjbHVkZWQgZnJvbSBFbmdpbmVlcmluZypcbmFuZCBpbnRlcnZpZXdlZXMgd2VyZSBlbXBsb3llZCBmdWxsLXRpbWUuIiwgDQogICAgICAgY2FwdGlvbiA9ICJEYXRhIFNvdXJjZTogS2FnZ2xlIERhdGEgU2NpZW5jZSBTdXJ2ZXkiKQ0KDQoNCmBgYA0KDQojIE1lZGlhbiBTYWxhcnkgYnkgTWFqb3INCg0KDQoNCiFbXShDOlxVc2Vyc1xaYm9va1xEZXNrdG9wXHBpY1xwaWMyLmpwZykNCg0KUiBjb2RlcyBmb3IgdGhpcyBncmFwaDogDQoNCmBgYHtyLCBldmFsPUZBTFNFfQ0KDQpkZl9mdWxsVGltZSAlPiUgDQogIG11dGF0ZShDb20gPSBzdHJfcmVwbGFjZV9hbGwoQ29tcGVuc2F0aW9uQW1vdW50LCAiXFwsIiwgIiIpICU+JSBhcy5udW1lcmljKSAlPiUgDQogIG11dGF0ZShDb20gPSBDb20gLyAxMDAwKSAlPiUgDQogIG11dGF0ZShNYWpvclNlbGVjdCA9IGNhc2Vfd2hlbihzdHJfZGV0ZWN0KE1ham9yU2VsZWN0LCAiaGVhbHRoIikgfiAiSGVhbHRoIFNjaWVuY2UiLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHN0cl9kZXRlY3QoTWFqb3JTZWxlY3QsICJzb2NpYWwiKSB+ICJTb2NpYWwgU2NpZW5jZSIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgImh1bWFuIikgfiAiSHVtYW5pdGllcyIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgIm5vbi1jb20iKSB+ICJFbmdpbmVlcmluZyoiLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHN0cl9kZXRlY3QoTWFqb3JTZWxlY3QsICJGaW5lIikgfiAiRmluZS9QZXJmb3JtaW5nIEFydHMiLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAg
ICAgICAgIHN0cl9kZXRlY3QoTWFqb3JTZWxlY3QsICJuZXZlciIpIH4gIlVua25vd24iLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHN0cl9kZXRlY3QoTWFqb3JTZWxlY3QsICJJbmZvciIpIH4gIklUL1NBL05FVCIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgIk1hbmEiKSB+ICJNSVMiLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHN0cl9kZXRlY3QoTWFqb3JTZWxlY3QsICJNYXRoIikgfiAiTWF0aGVtYXRpY3MvU3RhdGlzdGljcyIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RyX2RldGVjdChNYWpvclNlbGVjdCwgIk90aGVyIikgfiAiVW5rbm93biIsIFRSVUUgfiBNYWpvclNlbGVjdCkpICU+JQ0KICBncm91cF9ieShNYWpvclNlbGVjdCkgJT4lIA0KICBzdW1tYXJpc2UobWVkX2luY29tZSA9IG1lZGlhbihDb20sIG5hLnJtID0gVFJVRSkpICU+JSANCiAgdW5ncm91cCgpICU+JSANCiAgZmlsdGVyKHN0cl9jb3VudChNYWpvclNlbGVjdCkgIT0gMCkgJT4lIA0KICBhcnJhbmdlKG1lZF9pbmNvbWUpICU+JSANCiAgbXV0YXRlKE1ham9yU2VsZWN0ID0gZmFjdG9yKE1ham9yU2VsZWN0LCBsZXZlbHMgPSBNYWpvclNlbGVjdCkpICU+JSANCiAgbXV0YXRlKGxhYmVsID0gcm91bmQobWVkX2luY29tZSwgMCkpIC0+IG4NCg0KDQoNCm4gJT4lIA0KICBnZ3Bsb3QoYWVzKE1ham9yU2VsZWN0LCBtZWRfaW5jb21lKSkgKw0KICBnZW9tX2NvbChmaWxsID0gbXlfY29sb3JzLCBjb2xvciA9IG15X2NvbG9ycywgd2lkdGggPSAwLjgpICsNCiAgY29vcmRfZmxpcCgpICsNCiAgZ2VvbV90ZXh0KGFlcyhsYWJlbCA9IGxhYmVsKSwgaGp1c3QgPSAxLjEsIGNvbG9yID0gIndoaXRlIiwgc2l6ZSA9IDUuNSwgZmFtaWx5ID0gbXlfZm9udCkgKyANCiAgdGhlbWVfZnRfcmMoKSArIA0KICB0aGVtZShwYW5lbC5ncmlkID0gZWxlbWVudF9ibGFuaygpKSArIA0KICB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfYmxhbmsoKSkgKyANCiAgdGhlbWUoYXhpcy50ZXh0LnkgPSBlbGVtZW50X3RleHQoY29sb3IgPSAid2hpdGUiLCBzaXplID0gMTYsIGZhbWlseSA9IG15X2ZvbnQpKSArIA0KICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KHNpemUgPSAyOCkpICsgDQogIHRoZW1lKHBsb3Quc3VidGl0bGUgPSBlbGVtZW50X3RleHQoZmFtaWx5ID0gbXlfZm9udCwgc2l6ZSA9IDE2LCBjb2xvciA9ICJncmV5ODAiKSkgKyANCiAgdGhlbWUocGxvdC5jYXB0aW9uID0gZWxlbWVudF90ZXh0KGZhbWlseSA9IG15X2ZvbnQsIHNpemUgPSAxMywgZmFjZSA9ICJpdGFsaWMiKSkgKyANCiAgc2NhbGVfeV9kaXNjcmV0ZShleHBhbmQgPSBjKDAuMDEsIDApKSArIA0KICB0aGVtZShwbG90Lm1hcmdpbiA9IHVuaXQoYygxLjIsIDEuMiwgMS4yLCAxLjIpLCAiY20iKSkgKyANCiAgbGFicyh4ID0gTlVMTCwgeSA9IE5VTEwsIA0KICAgICAg
IHRpdGxlID0gIkZpZ3VyZSAyOiBNZWRpYW4gU2FsYXJ5IGZvciBEUyBieSBNYWpvciIsIA0KICAgICAgIHN1YnRpdGxlID0gIkVsZWN0cmljYWwgRW5naW5lZXJpbmcgYW5kIENvbXB1dGVyIFNjaWVuY2UgYXJlIGV4Y2x1ZGVkIGZyb20gRW5naW5lZXJpbmcqXG5hbmQgaW50ZXJ2aWV3ZWVzIHdlcmUgZW1wbG95ZWQgZnVsbC10aW1lLiIsIA0KICAgICAgIGNhcHRpb24gPSAiRGF0YSBTb3VyY2U6IEthZ2dsZSBEYXRhIFNjaWVuY2UgU3VydmV5IikNCiAgDQoNCg0KICANCg0KDQpgYGANCg0KDQo=