1 Giới thiệu

Tôi sử dụng bộ dữ liệu Kaggle Survey 2017 (https://www.kaggle.com/kaggle/kaggle-survey-2017) để phân tích và tìm ra những kỹ năng (skills) cần có khi làm việc trong lĩnh vực data science.

# NOTE: the former `rm(list = ls())` call was removed on purpose. It only
# deletes objects from the global environment (attached packages and options
# are left as they were), so it never produced a truly clean session, and it
# silently wipes the workspace of anyone who runs this chunk interactively.
# Restart R, or knit the document, to start from a clean state instead.
library(pander)
library(tidyverse)
## -- Attaching packages ------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)

# Read the multiple-choice survey responses; text columns stay character
# vectors rather than being converted to factors.
df_survey <- read.csv(
  file = "E:\\R\\Data\\kaggle-survey-2017\\multipleChoiceResponses.csv",
  stringsAsFactors = FALSE
)

Tôi chọn các biến đánh giá mức độ quan trọng của từng skill — cần thiết (Necessary), có thì tốt (Nice to have) và không cần thiết (Unnecessary) — theo quan điểm của những người được khảo sát (16.734 người).

# Build df_skill: one row per skill, one count column per answer given.
df_skill <- df_survey %>%
  # Keep the JobSkill* columns, minus those named JobSkillImportanceOther*.
  select(contains("JobSkill"), -contains("JobSkillImportanceOther")) %>%
  # Long format: `a` holds the source column name, `b` the answer in that cell.
  gather(a, b) %>%
  # Strip the shared prefix so that only the skill name remains in `a`.
  mutate(a = str_replace_all(a, "JobSkillImportance", "")) %>%
  # Cross-tabulate a x b, then flatten the table into (a, b, Freq) rows.
  table() %>%
  as.data.frame() %>%
  # One column per distinct answer.
  spread(b, Freq) %>%
  # NOTE(review): V1 is presumably the column produced for blank answers;
  # confirm against the raw data.
  select(-V1)

# Positional rename: column 1 is the skill name, column 3 becomes Nice.To.Have.
names(df_skill)[c(1, 3)] <- c("Skills", "Nice.To.Have")
head(df_skill, 11)
##             Skills Necessary Nice.To.Have Unnecessary
## 1          BigData      1503         2271         182
## 2           Degree      1094         2338         477
## 3  EnterpriseTools       535         2087        1072
## 4    KaggleRanking       460         2621         789
## 5             MOOC       411         2326        1095
## 6           Python      2604         1319         108
## 7                R      1636         2027         281
## 8              SQL      1690         1914         288
## 9            Stats      2035         1812         113
## 10  Visualizations      1761         1898         208
str(df_skill)
## 'data.frame':    10 obs. of  4 variables:
##  $ Skills      : Factor w/ 10 levels "BigData","Degree",..: 1 2 3 4 5 6 7 8 9 10
##  $ Necessary   : int  1503 1094 535 460 411 2604 1636 1690 2035 1761
##  $ Nice.To.Have: int  2271 2338 2087 2621 2326 1319 2027 1914 1812 1898
##  $ Unnecessary : int  182 477 1072 789 1095 108 281 288 113 208

2 Necessary skills: Cần thiết

library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see http://bit.ly/arialnarrow
# Bar fill/outline colour and label font; the later skill charts in this
# document read these two variables as well.
my_colors <- c("#835C3B")
my_font <- "Arial"

# Horizontal bar chart of the "Necessary" counts, largest bar at the top.
# No arrange() step is needed: the bar order is set by reorder() inside aes(),
# not by the row order of the data frame.
ggplot(df_skill, aes(reorder(Skills, Necessary), Necessary)) +
  geom_col(fill = my_colors, color = my_colors, width = 0.8) +
  coord_flip() +
  # Inherits the plot data and x/y mapping; hjust > 1 places each count just
  # inside the end of its bar.
  geom_text(aes(label = Necessary),
            hjust = 1.1, color = "yellow", size = 5.5, family = my_font) +
  theme_ft_rc() +
  # Overrides applied on top of theme_ft_rc().
  theme(
    plot.background = element_rect(fill = "grey20"),
    panel.background = element_rect(fill = "grey20", colour = "grey20"),
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "yellow", size = 16, family = my_font),
    plot.title = element_text(color = "yellow", size = 28),
    plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")
  ) +
  # `Necessary` is a numeric count, so its scale must be continuous (the
  # original used scale_y_discrete here). Same 1% expansion as before.
  scale_y_continuous(expand = c(0.01, 0)) +
  labs(x = NULL, y = NULL,
       title = "Necessary Skills",
       caption = "Data Source: Kaggle Data Science Survey")

3 Nice-To-Have: Có thì tốt

# Horizontal bar chart of the "Nice.To.Have" counts, largest bar at the top.
# No arrange() step is needed: the bar order is set by reorder() inside aes(),
# not by the row order of the data frame.
ggplot(df_skill, aes(reorder(Skills, Nice.To.Have), Nice.To.Have)) +
  geom_col(fill = my_colors, color = my_colors, width = 0.8) +
  coord_flip() +
  # Inherits the plot data and x/y mapping; hjust > 1 places each count just
  # inside the end of its bar.
  geom_text(aes(label = Nice.To.Have),
            hjust = 1.1, color = "green", size = 5.5, family = my_font) +
  theme_ft_rc() +
  # Overrides applied on top of theme_ft_rc().
  theme(
    plot.background = element_rect(fill = "grey20"),
    panel.background = element_rect(fill = "grey20", colour = "grey20"),
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "lightgreen", size = 16,
                               family = my_font),
    plot.title = element_text(color = "lightgreen", size = 28),
    plot.margin = unit(c(1, 1, 1, 1), "cm")
  ) +
  # `Nice.To.Have` is a numeric count, so its scale must be continuous (the
  # original used scale_y_discrete here). Same 1% expansion as before.
  scale_y_continuous(expand = c(0.01, 0)) +
  labs(x = NULL, y = NULL,
       title = "Nice To Have Skills",
       caption = "Data Source: Kaggle Data Science Survey")

4 Unnecessary: Không cần thiết

# Horizontal bar chart of the "Unnecessary" counts, largest bar at the top.
# No arrange() step is needed: the bar order is set by reorder() inside aes(),
# not by the row order of the data frame.
ggplot(df_skill, aes(reorder(Skills, Unnecessary), Unnecessary)) +
  geom_col(fill = my_colors, color = my_colors, width = 0.8) +
  coord_flip() +
  # Inherits the plot data and x/y mapping; hjust > 1 places each count just
  # inside the end of its bar.
  geom_text(aes(label = Unnecessary),
            hjust = 1.1, color = "pink", size = 5.5, family = my_font) +
  theme_ft_rc() +
  # Overrides applied on top of theme_ft_rc().
  theme(
    plot.background = element_rect(fill = "grey20"),
    panel.background = element_rect(fill = "grey20", colour = "grey20"),
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "pink", size = 16, family = my_font),
    plot.title = element_text(color = "pink", size = 28),
    plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")
  ) +
  # `Unnecessary` is a numeric count, so its scale must be continuous (the
  # original used scale_y_discrete here). Same 1% expansion as before.
  scale_y_continuous(expand = c(0.01, 0)) +
  labs(x = NULL, y = NULL,
       title = "Unnecessary Skills",
       caption = "Data Source: Kaggle Data Science Survey")

5 Kết luận

Python, kiến thức thống kê, trực quan hóa dữ liệu (visualization), SQL và R là những kĩ năng được đánh giá là cần thiết nhất đối với những người làm việc trong lĩnh vực data science hiện nay.

LS0tDQp0aXRsZTogIlNraWxscyBOZWVkZWQgQnkgRGF0YSBTY2llbnRpc3RzIg0KYXV0aG9yOiAiTmd1eWVuIE5nb2MgVGhpZXUiDQpkYXRlOiAiNy8zLzIwMTkiDQpvdXRwdXQ6DQogIGh0bWxfZG9jdW1lbnQ6IA0KICAgIGNvZGVfZG93bmxvYWQ6IHRydWUNCiAgICBjb2RlX2ZvbGRpbmc6IGhpZGUNCiAgICBudW1iZXJfc2VjdGlvbnM6IHllcw0KICAgIHRoZW1lOiAiZGVmYXVsdCINCiAgICB0b2M6IFRSVUUNCiAgICB0b2NfZmxvYXQ6IFRSVUUNCi0tLQ0KDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkNCmBgYA0KDQojIEdp4bubaSB0aGnhu4d1DQoNClTDtGkgc+G6vSBz4butIGThu6VuZyBEYXRhc2V0OiBodHRwczovL3d3dy5rYWdnbGUuY29tL2thZ2dsZS9rYWdnbGUtc3VydmV5LTIwMTcgbsOgeSDEkeG7gyBwaMOibiB0w61jaCB2w6AgdMOsbSByYSBuaOG7r25nIHNraWxscyBuw6BvIGPhuqduIGPDsyBraGkgbMOgbSB2aeG7h2MgdHJvbmcgbMSpbmggduG7sWMgZGF0YSBzY2llbmNlLg0KDQoNCmBgYHtyICB3YXJuaW5nPUZBTFNFfQ0Kcm0obGlzdCA9IGxzKCkpDQpsaWJyYXJ5KHBhbmRlcikNCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShnZ3RoZW1lcykNCg0KZGZfc3VydmV5IDwtIHJlYWQuY3N2KCJFOlxcUlxcRGF0YVxca2FnZ2xlLXN1cnZleS0yMDE3XFxtdWx0aXBsZUNob2ljZVJlc3BvbnNlcy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpDQoNCg0KYGBgDQoNClTDtGkgY2jhu41uIGzhu7FhIG5o4buvbmcgYmnhur9uIHPhu5EgduG7gSBza2lsbHMgY+G6p24gdGhp4bq/dCwgY8OzIHRow6wgdOG7kXQgdsOgIGtow7RuZyBj4bqnbiB0aGnhur90IHRoZW8gcXVhbiDEkWnhu4NtIGPhu6dhIG5o4buvbmcgbmfGsOG7nWkgxJHGsOG7o2MgcGjhu49uZyB24bqlbiAoMTYuNzM0IG5nxrDhu51pKS4NCg0KYGBge3Igd2FybmluZz1GQUxTRSB9DQpkZl9zdXJ2ZXkgJT4lIA0KICBzZWxlY3QoY29udGFpbnMoIkpvYlNraWxsIiksIC1jb250YWlucygiSm9iU2tpbGxJbXBvcnRhbmNlT3RoZXIiKSkgJT4lDQogZ2F0aGVyKGEsYikgJT4lDQptdXRhdGUoYSA9IHN0cl9yZXBsYWNlX2FsbChhLCAiSm9iU2tpbGxJbXBvcnRhbmNlIiwgIiIpKSAlPiUNCiAgdGFibGUoKSAlPiUNCiAgYXMuZGF0YS5mcmFtZSgpICU+JQ0Kc3ByZWFkKGIsIEZyZXEpICU+JQ0KICBzZWxlY3QoLVYxKSAtPiBkZl9za2lsbCANCg0KbmFtZXMoZGZfc2tpbGwpWzFdPC0iU2tpbGxzIg0KbmFtZXMoZGZfc2tpbGwpWzNdPC0iTmljZS5Uby5IYXZlIg0KaGVhZChkZl9za2lsbCwgMTEpDQpzdHIoZGZfc2tpbGwpDQpgYGANCg0KIyBOZWNlc3Nhcnkgc2tpbGxzOiBD4bqnbiB0aGnhur90DQoNCmBgYHtyIHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KGhyYnJ0aGVtZXMpDQpteV9jb2xvcnMgPC0gYygiIzgzNUMzQiIpDQpteV9mb250IDwtICJBcmlhbCINCg0KZGZfc2tpbGwgJT4lDQog
IGFycmFuZ2UoZGVzYyhOZWNlc3NhcnkpKSAlPiUNCiAgZ2dwbG90KGFlcyhyZW9yZGVyKFNraWxscywgTmVjZXNzYXJ5KSxOZWNlc3NhcnkpKSArIA0KICAgIGdlb21fY29sKGZpbGwgPSBteV9jb2xvcnMsIGNvbG9yID0gbXlfY29sb3JzLCB3aWR0aCA9IDAuOCkgKw0KICAgIGNvb3JkX2ZsaXAoKSArDQogICAgZ2VvbV90ZXh0KGRhdGEgPSBkZl9za2lsbCAsIGFlcyhsYWJlbCA9IE5lY2Vzc2FyeSksIA0KICAgICAgICAgICAgICBoanVzdCA9IDEuMSwgY29sb3IgPSAieWVsbG93Iiwgc2l6ZSA9IDUuNSwgZmFtaWx5ID0gbXlfZm9udCkgKyANCiAgICB0aGVtZV9mdF9yYygpICsNCiAgdGhlbWUocGxvdC5iYWNrZ3JvdW5kID0gZWxlbWVudF9yZWN0KGZpbGwgPSAiZ3JleTIwIiksDQogICAgICAgIHBhbmVsLmJhY2tncm91bmQgPSBlbGVtZW50X3JlY3QoZmlsbCA9ICJncmV5MjAiLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGNvbG91ciA9ICJncmV5MjAiKSkgKw0KICAgIHRoZW1lKHBhbmVsLmdyaWQgPSBlbGVtZW50X2JsYW5rKCkpICsgDQogICAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X2JsYW5rKCkpICsgDQogICAgdGhlbWUoYXhpcy50ZXh0LnkgPSBlbGVtZW50X3RleHQoY29sb3IgPSAieWVsbG93Iiwgc2l6ZSA9IDE2LCBmYW1pbHkgPSBteV9mb250KSkgKyANCiAgICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KGNvbG9yID0gInllbGxvdyIsIHNpemUgPSAyOCkpICsgDQogICAgc2NhbGVfeV9kaXNjcmV0ZShleHBhbmQgPSBjKDAuMDEsIDApKSArIA0KICAgIHRoZW1lKHBsb3QubWFyZ2luID0gdW5pdChjKDEuMiwgMS4yLCAxLjIsIDEuMiksICJjbSIpKSArIA0KICAgIGxhYnMoeCA9IE5VTEwsIHkgPSBOVUxMLCANCiAgICAgICAgIHRpdGxlID0gIk5lY2Vzc2FyeSBTa2lsbHMiLCANCiAgICAgICAgIGNhcHRpb24gPSAiRGF0YSBTb3VyY2U6IEthZ2dsZSBEYXRhIFNjaWVuY2UgU3VydmV5IikNCmBgYA0KDQoNCiMgTmljZS1Uby1IYXZlOiBDw7MgdGjDrCB04buRdA0KDQpgYGB7ciB3YXJuaW5nPUZBTFNFfQ0KZGZfc2tpbGwgJT4lDQogIGFycmFuZ2UoZGVzYyhOaWNlLlRvLkhhdmUpKSAlPiUNCiAgZ2dwbG90KGFlcyhyZW9yZGVyKFNraWxscywgTmljZS5Uby5IYXZlKSxOaWNlLlRvLkhhdmUpKSArIA0KICBnZW9tX2NvbChmaWxsID0gbXlfY29sb3JzLCBjb2xvciA9IG15X2NvbG9ycywgd2lkdGggPSAwLjgpICsNCiAgY29vcmRfZmxpcCgpICsNCiAgZ2VvbV90ZXh0KGRhdGEgPSBkZl9za2lsbCAsIGFlcyhsYWJlbCA9IE5pY2UuVG8uSGF2ZSksIA0KICAgICAgICAgICAgaGp1c3QgPSAxLjEsIGNvbG9yID0gImdyZWVuIiwgc2l6ZSA9IDUuNSwgZmFtaWx5ID0gbXlfZm9udCkgKyANCiAgdGhlbWVfZnRfcmMoKSArIA0KICB0aGVtZShwbG90LmJhY2tncm91bmQgPSBlbGVtZW50X3JlY3QoZmlsbCA9ICJncmV5MjAiKSwNCiAgICAgICAgcGFuZWwuYmFja2dyb3VuZCA9IGVs
ZW1lbnRfcmVjdChmaWxsID0gImdyZXkyMCIsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgY29sb3VyID0gImdyZXkyMCIpKSArDQogIHRoZW1lKHBhbmVsLmdyaWQgPSBlbGVtZW50X2JsYW5rKCkpICsgDQogIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF9ibGFuaygpKSArIA0KICB0aGVtZShheGlzLnRleHQueSA9IGVsZW1lbnRfdGV4dChjb2xvciA9ICJsaWdodGdyZWVuIiwgc2l6ZSA9IDE2LCBmYW1pbHkgPSBteV9mb250KSkgKyANCiAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChjb2xvciA9ICJsaWdodGdyZWVuIixzaXplID0gMjgpKSArIA0KICBzY2FsZV95X2Rpc2NyZXRlKGV4cGFuZCA9IGMoMC4wMSwgMCkpICsgDQogIHRoZW1lKHBsb3QubWFyZ2luID0gdW5pdChjKDEsIDEsIDEsIDEpLCAiY20iKSkgKyANCiAgbGFicyh4ID0gTlVMTCwgeSA9IE5VTEwsIA0KICAgICAgIHRpdGxlID0gIk5pY2UgVG8gSGF2ZSBTa2lsbHMiLCANCiAgICAgICBjYXB0aW9uID0gIkRhdGEgU291cmNlOiBLYWdnbGUgRGF0YSBTY2llbmNlIFN1cnZleSIpDQpgYGANCg0KDQojIFVubmVjZXNzYXJ5OiBLaMO0bmcgY+G6p24gdGhp4bq/dA0KDQpgYGB7ciB3YXJuaW5nPUZBTFNFfQ0KZGZfc2tpbGwgJT4lDQogIGFycmFuZ2UoZGVzYyhVbm5lY2Vzc2FyeSkpICU+JQ0KICBnZ3Bsb3QoYWVzKHJlb3JkZXIoU2tpbGxzLCBVbm5lY2Vzc2FyeSksVW5uZWNlc3NhcnkpKSArIA0KICBnZW9tX2NvbChmaWxsID0gbXlfY29sb3JzLCBjb2xvciA9IG15X2NvbG9ycywgd2lkdGggPSAwLjgpICsNCiAgY29vcmRfZmxpcCgpICsNCiAgZ2VvbV90ZXh0KGRhdGEgPSBkZl9za2lsbCAsIGFlcyhsYWJlbCA9IFVubmVjZXNzYXJ5KSwgDQogICAgICAgICAgICBoanVzdCA9IDEuMSwgY29sb3IgPSAicGluayIsIHNpemUgPSA1LjUsIGZhbWlseSA9IG15X2ZvbnQpICsgDQogIHRoZW1lX2Z0X3JjKCkgKyANCiAgdGhlbWUocGxvdC5iYWNrZ3JvdW5kID0gZWxlbWVudF9yZWN0KGZpbGwgPSAiZ3JleTIwIiksDQogICAgICAgIHBhbmVsLmJhY2tncm91bmQgPSBlbGVtZW50X3JlY3QoZmlsbCA9ICJncmV5MjAiLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGNvbG91ciA9ICJncmV5MjAiKSkgKw0KICB0aGVtZShwYW5lbC5ncmlkID0gZWxlbWVudF9ibGFuaygpKSArIA0KICB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfYmxhbmsoKSkgKyANCiAgdGhlbWUoYXhpcy50ZXh0LnkgPSBlbGVtZW50X3RleHQoY29sb3IgPSAicGluayIsIHNpemUgPSAxNiwgZmFtaWx5ID0gbXlfZm9udCkpICsgDQogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoY29sb3IgPSAicGluayIsIHNpemUgPSAyOCkpICsgDQogIHNjYWxlX3lfZGlzY3JldGUoZXhwYW5kID0gYygwLjAxLCAwKSkgKyANCiAgdGhlbWUocGxvdC5tYXJnaW4gPSB1bml0KGMoMS4yLCAxLjIsIDEuMiwgMS4yKSwgImNtIikpICsgDQog
IGxhYnMoeCA9IE5VTEwsIHkgPSBOVUxMLCANCiAgICAgICB0aXRsZSA9ICJVbm5lY2Vzc2FyeSBTa2lsbHMiLCANCiAgICAgICBjYXB0aW9uID0gIkRhdGEgU291cmNlOiBLYWdnbGUgRGF0YSBTY2llbmNlIFN1cnZleSIpDQoNCg0KYGBgDQoNCiMjIEvhur90IGx14bqtbg0KDQpQeXRob24sIGtp4bq/biB0aOG7qWMgdGjhu5FuZyBrw6osIMSR4buTIGjhu41hLCBTUUwsIFIgbMOgIG5o4buvbmcga8SpIG7Eg25nIGPhuqduIHRoaeG6v3QgbmjhuqV0IG3DoCBuaOG7r25nIG5nxrDhu51pIGzDoG0gdmnhu4djIHRyb25nIGzEqW5oIHbhu7FjIGRhdGEgc2NpZW5jZSBj4bqnbiBjw7MgaGnhu4duIG5heS4=