Goal
Please use data to answer the question, “Which are the most valued data science skills?”
Prerequisites: Required Libraries
- googlesheets
- readxl
- DT
- data.table
- kableExtra
- dplyr
- tidyr
- tidyverse
- rlang
- stringr
- RMySQL
- DBI
- ggplot2
- tm
- wordcloud2
- RColorBrewer
Gather data
Read the Data Source into R from Google Sheets / Excel Spreadsheets
Load the survey data into R data frames using spreadsheet-reading functions
# which google sheets do you have access to? may ask you to authenticate in a browser!
#gs_ls("https://docs.google.com/spreadsheets/d/1rTr2r5NlSy8QBEhqwpP0HL2aZfWZflrN8kw7KC3Vi5M/edit#gid=1248983012")
#gs_ls("https://docs.google.com/spreadsheets/d/1rTr2r5NlSy8QBEhqwpP0HL2aZfWZflrN8kw7KC3Vi5M/edit?usp=sharing")
#SpreadSheet <- gs_title("Data Science")
#Get Sheet names
#gs_ws_ls(SpreadSheet)
# convert to data.frame
#df_Job_Listings <- as.data.frame(gs_read(ss=SpreadSheet, ws = "Job Listings"))
#df_Language_Skills <- as.data.frame(gs_read(ss=SpreadSheet, ws = "Language Skills"))
#df_Software_Skills <- as.data.frame(gs_read(ss=SpreadSheet, ws = "Software Skills"))
# Locate the workbook in the current working directory. file.path() builds
# the path in a platform-independent way instead of hand-concatenating "/".
myWorkingDir <- getwd()
mySourceFile <- file.path(myWorkingDir, "Data Science.xlsx")
# Fail early with a clear message if the workbook is missing, rather than
# relying on whatever error the first spreadsheet call would raise.
if (!file.exists(mySourceFile)) {
  stop("Workbook not found: ", mySourceFile, call. = FALSE)
}
# List the worksheet names available in the workbook.
excel_sheets(path = mySourceFile)
## [1] "Job Listings" "Language Skills"
## [3] "Software Skills" "General Skills"
## [5] "Background Profile" "multipleChoiceResponses"
#df_Jobs <- read_excel(path = mySourceFile, sheet = 1, range = "A1:B6")
#df_Languages <- read_excel(path = mySourceFile, sheet = "Language Skills", range = "A1:E38")
#df_Softwares <- read_excel(path = mySourceFile, sheet = "Software Skills", range = "A1:E16")
# Read each worksheet of interest into its own data frame.
# Every sheet is addressed by name so the reads do not depend on the order of
# worksheets in the workbook ("Job Listings" is currently the first sheet).
df_Job_Listings <- read_excel(path = mySourceFile, sheet = "Job Listings")
df_Language_Skills <- read_excel(path = mySourceFile, sheet = "Language Skills")
df_Software_Skills <- read_excel(path = mySourceFile, sheet = "Software Skills")
df_MCR <- read_excel(path = mySourceFile, sheet = "multipleChoiceResponses")
Show Untidy Data
Untidy Data Table - Job Listings
# Interactive view of the raw job-listing counts, five rows per page.
# DataTables option names are case-sensitive: the page-size option is
# `pageLength`; the previous spelling `pagelength` was silently ignored.
DT::datatable(df_Job_Listings, options = list(pageLength = 5))
# Static, styled rendering of the same table. row_spec(0, ...) styles row 0,
# which kableExtra treats as the header row.
kable(df_Job_Listings) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "left",
    font_size = 12
  ) %>%
  row_spec(0, background = "gray")
|
Source
|
Count
|
|
LinkedIn
|
8400
|
|
Indeed
|
5200
|
|
SimplyHired
|
3800
|
|
Monster
|
3750
|
|
AngelList
|
600
|
Untidy Data Table - Language Skills
# Interactive view of the raw language-skills sheet, five rows per page.
# DataTables option names are case-sensitive: the page-size option is
# `pageLength`; the previous spelling `pagelength` was silently ignored.
DT::datatable(df_Language_Skills, options = list(pageLength = 5))
# Static rendering of only the trailing rows (tail()), which exposes the
# non-data rows at the bottom of the sheet (totals, search-criteria notes).
# row_spec(0, ...) styles row 0, which kableExtra treats as the header row.
kable(tail(df_Language_Skills)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "left",
    font_size = 12
  ) %>%
  row_spec(0, background = "gray")
|
Language
|
LinkedIn
|
Indeed
|
SimplyHired
|
Monster
|
LinkedIn %
|
Indeed %
|
SimplyHired %
|
Monster %
|
Avg %
|
GlassDoor Self Reported % 2017
|
Difference
|
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Total
|
38882
|
27477
|
21204
|
19350
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Search Criteria
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
“data scientist” alone
|
8610
|
5138
|
3829
|
3746
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
“data scientist” “[keyword]”
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
Untidy Data Table - Software Skills
# Interactive view of the raw software-skills sheet, five rows per page.
# DataTables option names are case-sensitive: the page-size option is
# `pageLength`; the previous spelling `pagelength` was silently ignored.
DT::datatable(df_Software_Skills, options = list(pageLength = 5))
|
Software
|
LinkedIn
|
Indeed
|
SimplyHired
|
Monster
|
|
NLP
|
643
|
466
|
362
|
576
|
|
natural language processing
|
791
|
621
|
429
|
575
|
|
NLP + natural language processing
|
222
|
177
|
131
|
569
|
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
“data scientist” “[keyword]”
|
NA
|
NA
|
NA
|
NA
|
|
“data engineering’ searched
|
NA
|
NA
|
NA
|
NA
|
Untidy Data Table - Demographics
# Interactive view of the raw multiple-choice survey responses, five rows per
# page. DataTables option names are case-sensitive: the page-size option is
# `pageLength`; the previous spelling `pagelength` was silently ignored.
DT::datatable(df_MCR, options = list(pageLength = 5))