Data Scientist Skills in IT

In this project, I focus on the skill sets of data scientists in the information technology industry. The job descriptions for data scientist positions come from LinkedIn, Google Jobs, Indeed, and Glassdoor.

Load data from HTML pages into R

library(XML)
library(RCurl)
## Loading required package: bitops
library(xlsx)
## Loading required package: rJava
## 
## Attaching package: 'rJava'
## The following object is masked from 'package:RCurl':
## 
##     clone
## Loading required package: xlsxjars
library(rvest)
## Warning: package 'rvest' was built under R version 3.4.2
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
## 
##     xml
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Start with empty (NULL) placeholders for the per-job fields; the scraping
# sections below assign the real values before each job's data frame is built.
company <- c()
link <- c()
jobName <- c()
reqSkills <- c()

Web Scraping in indeed.com

Job1: Data Science Intern-Samsung Pay

# Job 1: read the raw HTML of the posting, one element per line.
# The address is kept as a plain string so that `link` records the URL itself
# rather than a connection object.
link <- "http://samsungresearchamerica.applytojob.com/apply/job_20171013044600_CBVZWKVBUBHPMYQD/Data-Science-InternSamsung-Pay?source=INDE"
con <- url(link)
htmlCode <- readLines(con)
close(con)  # release the connection explicitly once the page has been read

# Line 5 of the page is split on "-": the first piece is used as the job name
# and the third as the company. trimws() drops the padding around each piece.
df <- unlist(strsplit(htmlCode[5], "-"))
jobName <- trimws(df[1])
company <- trimws(df[3])

# The requirement bullets are taken from fixed lines 215-226 of this page.
# NOTE: these positions are hard-coded and will silently pick up other text
# if the page layout changes.
# Strip the opening and closing <li> tags to leave plain text.
req_lines <- str_replace_all(htmlCode[215:226], "<li>", "")
reqSkills <- str_replace_all(req_lines, "</li>", "")
reqSkills
##  [1] "Solid background in data mining/machine learning"                                               
##  [2] "Hands-on experience of using data mining/machine learning algorithms in real-world applications"
##  [3] "Proficiency in data structures and algorithms"                                                  
##  [4] "Strong programming skills with Java/Python"                                                     
##  [5] "Basic knowledge about RDBMS and NoSQL databases"                                                
##  [6] "Expertise in Linux (Ubuntu)"                                                                    
##  [7] "Excellent communication and interpersonal skills"                                               
##  [8] "Publications in relevant top venues (e.g., KDD, NIPS, ICML, AAAI, IJCAI, ICDM, etc.)"           
##  [9] "PhD students in Computer Science or relevant fields"                                            
## [10] "Preferred Experience Requirements:"                                                             
## [11] "Basic knowledge about Amazon Web Services"                                                      
## [12] "Working knowledge of big data tools and data pipelines"
# Assemble the Job 1 table: one row per requirement line, with the single
# company and job name repeated on every row.
job1 <- data.frame(
  company = company,
  jobName = jobName,
  reqSkills = reqSkills
)
head(job1)
##                             company                     jobName
## 1         Samsung Research America          Data Science Intern
## 2         Samsung Research America          Data Science Intern
## 3         Samsung Research America          Data Science Intern
## 4         Samsung Research America          Data Science Intern
## 5         Samsung Research America          Data Science Intern
## 6         Samsung Research America          Data Science Intern
##                                                                                         reqSkills
## 1                                                Solid background in data mining/machine learning
## 2 Hands-on experience of using data mining/machine learning algorithms in real-world applications
## 3                                                   Proficiency in data structures and algorithms
## 4                                                      Strong programming skills with Java/Python
## 5                                                 Basic knowledge about RDBMS and NoSQL databases
## 6                                                                     Expertise in Linux (Ubuntu)
#dim(job1)

Web Scraping in a company website

Job2: Data Science in Microsoft

# Job 2: download the Microsoft posting and collect the text of every <span>.
url2 <- read_html("https://careers.microsoft.com/jobdetails.aspx?jid=324590&job_id=1075466&utm_source=Indeed&show_desc=0")
extractHTML <- htmlTreeParse(url2, useInternalNodes = TRUE)
htmldf <- unlist(xpathApply(extractHTML, "//span", xmlValue))

# Company and job title are set by hand for this posting.
company <- "Microsoft"
jobName <- "Data Science"

# The 13th <span> is split into sentences on "."; sentences 21-28 are then
# split on commas to get one fragment per entry (hard-coded positions).
df <- unlist(strsplit(htmldf[13], "[.]"))
# NOTE: the final step removes every period AND every space, so multi-word
# phrases are run together (e.g. "bigdata").
reqSkills <- df[21:28] %>%
  strsplit(split = ",", fixed = TRUE) %>%
  unlist() %>%
  str_replace_all("[. ]", "")

# One row per fragment, with company and job title repeated on each row.
job2 <- data.frame(company = company, jobName = jobName, reqSkills = reqSkills)
head(job2)
##     company      jobName
## 1 Microsoft Data Science
## 2 Microsoft Data Science
## 3 Microsoft Data Science
## 4 Microsoft Data Science
## 5 Microsoft Data Science
## 6 Microsoft Data Science
##                                                                              reqSkills
## 1 \nRequirements:<U+0095>5yearsofsoftwaredevelopmentexperience<U+0095>1-3+yearsworkinginwebanalytics
## 2                                                                              bigdata
## 3                datascienceoralgotradingpreferablywithorganicorpaidsearchtechnologies
## 4                                                       webanalyticstoolsordevelopment
## 5                                                                            ecommerce
## 6                                                                   publishingnetworks
#dim(job2)

# Stack the Job 1 and Job 2 tables into a single table
library(dplyr)
db <- rbind(job1, job2)
#db

Web Scraping in glassdoor

job3: Data Scientist, Global Premium Services, Google Technical Services Professional Services

# Job 3: download the Glassdoor posting and collect the text of every <div>.
url3 <- read_html("https://www.glassdoor.com/job-listing/data-scientist-global-premium-services-google-technical-services-professional-services-google-JV_IC1132348_KO0,86_KE87,93.htm?jl=2516839891")

extractHTML <- htmlTreeParse(url3, useInternalNodes = TRUE)
htmldf <- unlist(xpathApply(extractHTML, "//div", xmlValue))

# The job name used to be read from htmldf[59], but that element holds
# viewer-control text ("View All ... Close (Esc)") rather than the posting
# title. Set it by hand instead, the same way the company is set.
jobName <- "Data Scientist"
company <- "Google"

# The 75th <div> holds the description: split it into sentences on "." and
# then into comma-separated fragments (hard-coded position).
reqSkills <- unlist(strsplit(htmldf[75], "[.]"))
reqSkills <- unlist(strsplit(reqSkills, split = ",", fixed = TRUE))
# Drop the newlines/spaces around each fragment and discard empty fragments.
reqSkills <- trimws(reqSkills)
reqSkills <- reqSkills[nzchar(reqSkills)]

# One row per fragment, with company and job name repeated on each row.
job3 <- data.frame(company, jobName, reqSkills)
head(job3)
##   company                           jobName
## 1  Google \nView Allnum of numClose (Esc)\n
## 2  Google \nView Allnum of numClose (Esc)\n
## 3  Google \nView Allnum of numClose (Esc)\n
## 4  Google \nView Allnum of numClose (Esc)\n
## 5  Google \nView Allnum of numClose (Esc)\n
## 6  Google \nView Allnum of numClose (Esc)\n
##                                                                                                                                                                                                                                                             reqSkills
## 1                                                                                                                                          \nNote: By applying to this position your application is automatically submitted to the following locations: Mountain View
## 2                                                                                                                                                                                                                                                                  CA
## 3                                                                                                                                                                                                                                                       USA; New York
## 4                                                                                                                                                                                                                                                                  NY
## 5                                                                                                                  USA\nGoogle Technical Services: Professional Services (gPS) is a team of solution-oriented trusted advisors supporting millions of users worldwide
## 6  Our consultative services take our deep technical and product expertise and combine it with our powerful understanding of our customer<U+0092>s needs and goals to solve their biggest business challenges allowing them to grow and get the most out of Google solutions
#dim(job3)

# Append the Job 3 rows to the combined table
db <- rbind(db, job3)
#db

Web Scraping in Dice

Job4: Data Scientist - Engineering Client Support in Bloomberg L.P.

# Job 4: download the Dice posting; `df` first holds the parsed page.
df <- read_html("https://www.dice.com/jobs/detail/Data-Scientist-%26%2345-Engineering-Client-Support-Bloomberg-L.P.-New-York-NY-10001/10432313/59225?icid=sr8-1p&q=Data+Scientist&l=")

# The job title is the text of the element with id "jt".
jobName <- html_text(html_nodes(df, "#jt"))

# The company name is the 14th <span> on the page (hard-coded position).
company <- html_text(html_nodes(df, "span"))[14]

# From here on `df` holds the text of every <li>; items 51-60 are used as the
# requirement bullets (hard-coded positions).
df <- html_text(html_nodes(df, "li"))
reqSkills <- df[51:60]

# One row per bullet, with company and job title repeated on each row.
job4 <- data.frame(company = company, jobName = jobName, reqSkills = reqSkills)
head(job4)
##          company                                     jobName
## 1 Bloomberg L.P. Data Scientist - Engineering Client Support
## 2 Bloomberg L.P. Data Scientist - Engineering Client Support
## 3 Bloomberg L.P. Data Scientist - Engineering Client Support
## 4 Bloomberg L.P. Data Scientist - Engineering Client Support
## 5 Bloomberg L.P. Data Scientist - Engineering Client Support
## 6 Bloomberg L.P. Data Scientist - Engineering Client Support
##                                                                                                                                                  reqSkills
## 1  Demonstrated success in applying statistical concepts to business problems- Expert at correctly processing and manipulating large and complex datasets 
## 2                                                         3+ years of professional experience programming in Python or R to solve data analytics problems 
## 3                                                                                                     Experience with distributed data stores (Hadoop/S3) 
## 4                                                                                                         Detail oriented and analytical approach to data 
## 5                                                                           Self-driven attitude to find new opportunities to increase system performance 
## 6                                                                                                                                  Familiarity with Spark
#dim(job4)

#grepl("R",job4$reqSkills)

# Append the Job 4 rows to the combined table and preview the result
db <- rbind(db, job4)
head(db, n = 6L)
##                             company                     jobName
## 1         Samsung Research America          Data Science Intern
## 2         Samsung Research America          Data Science Intern
## 3         Samsung Research America          Data Science Intern
## 4         Samsung Research America          Data Science Intern
## 5         Samsung Research America          Data Science Intern
## 6         Samsung Research America          Data Science Intern
##                                                                                         reqSkills
## 1                                                Solid background in data mining/machine learning
## 2 Hands-on experience of using data mining/machine learning algorithms in real-world applications
## 3                                                   Proficiency in data structures and algorithms
## 4                                                      Strong programming skills with Java/Python
## 5                                                 Basic knowledge about RDBMS and NoSQL databases
## 6                                                                     Expertise in Linux (Ubuntu)

Find the top required skills for data scientists in IT

“Which are the most valued data science skills?”

Answer: The most valued data science skills are SQL, Python, web analysis, machine learning, and Java. IT companies prefer programming skills, like Python and SQL, and analysis skills, like machine learning, to analyze web data. Communication is also one of the top soft skills: it helps with teamwork and presentations, and many jobs are client-facing.

#db$reqSkills

# Keywords to search for in the requirement text. A few are written without
# spaces ("datamining", "bigdata", "businessintelligence") alongside the
# spaced spellings.
skills <- c(
  "machine learning", "data mining", "datamining", "bigdata",
  "SQL", "Java", "Python", "Hadoop", "Spark",
  "statistics", "databases", "programming", "development", "debugging",
  "web", "communication", "businessintelligence"
)
length(skills)
## [1] 17
# Count, for each keyword, how many requirement entries in `db` contain it.
# grepl() treats the keyword as a case-sensitive regular expression and
# matches substrings, so "SQL" also counts entries that mention "NoSQL".
# vapply() replaces the index loop: `num` is no longer grown one element at a
# time, and an empty `skills` vector yields an empty integer vector instead of
# the error that 1:length(skills) would cause.
num <- vapply(
  skills,
  function(skill) sum(grepl(skill, db$reqSkills), na.rm = TRUE),
  integer(1),
  USE.NAMES = FALSE
)
as.numeric(num)
##  [1] 4 2 1 2 4 3 4 1 1 1 3 3 4 1 4 2 1
# Pair each keyword with its count.
Skill <- data.frame(skills, num)
# Sort by count, highest first, and keep the first ten rows. head() returns at
# most ten rows, whereas indexing with [1:10] would pad the result with NA rows
# if fewer than ten keywords were defined.
top10 <- head(Skill[order(Skill$num, decreasing = TRUE), ], 10)
str(top10)
## 'data.frame':    10 obs. of  2 variables:
##  $ skills: Factor w/ 17 levels "bigdata","businessintelligence",..: 11 15 13 8 17 10 5 12 4 1
##  $ num   : int  4 4 4 4 4 3 3 3 2 2
# Bar chart of the top 10 skills
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.2
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Horizontal bar chart of the ten most frequent keywords.
# `title` is not a bar-trace attribute (plot_ly() warns and ignores it), so the
# chart title is set through layout() instead. Columns are referenced by name
# from the `top10` data frame passed as the first argument.
p <- plot_ly(top10, x = ~num, y = ~skills, type = "bar", orientation = "h") %>%
  plotly::layout(title = "Top 10 Skills of Data Scientist in IT")
p
## Warning: 'bar' objects don't have these attributes: 'title'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'hoverinfo', 'hoverlabel', 'stream', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'textposition', 'textfont', 'insidetextfont', 'outsidetextfont', 'orientation', 'base', 'offset', 'width', 'marker', 'r', 't', 'error_y', 'error_x', '_deprecated', 'xaxis', 'yaxis', 'xcalendar', 'ycalendar', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule'
library(ggplot2)
# Bar chart of the ten most frequent keywords, coloured by count.
# Columns are named bare inside aes() so that ggplot2 looks them up in the
# `top10` data frame passed as `data`, rather than bypassing it via `top10$`.
ggplot(top10, aes(x = skills, y = num, fill = num)) +
  geom_bar(stat = "identity") +
  xlab("Skills") +
  ylab("Frequency") +
  theme(
    legend.position = "none",
    # tilt the keyword labels so they do not overlap
    axis.text.x = element_text(angle = 65, hjust = 1)
  ) +
  ggtitle("Top 10 Skills of Data Scientist in IT")

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.