**Scrape the Kaggle job board to extract data science skills in demand**
library(stringr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.4
## Loading required package: RColorBrewer
library(tm)
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
library(XML)
## Warning: package 'XML' was built under R version 3.2.3
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.2.3
## Loading required package: bitops
kaggle_base_url <- 'https://www.kaggle.com'
# read the job-board listing page and keep only the lines that contain job-post links
kaggle_lines <- readLines("https://www.kaggle.com/jobs")
kaggle_lines <- grep('<a class="job-post-row"', kaggle_lines, value = TRUE)
# strip everything around the href value to leave the relative job URL
kaggle_lines <- gsub('.*(href=")', '', kaggle_lines)
kaggle_lines <- gsub('(" target="">)', '', kaggle_lines)
full_url <- paste(kaggle_base_url, kaggle_lines, sep = '')
length(full_url)
## [1] 40
# scrape each job posting and keep any text mentioning skills, experience or language
t1 <- character(length(full_url))
for (i in seq_along(full_url)) {
  webpage <- getURL(full_url[i])
  webpage <- readLines(tc <- textConnection(webpage)); close(tc)
  pagetree <- htmlTreeParse(webpage, error = function(...){}, useInternalNodes = TRUE)
  # search for key words in the parsed page
  text1 <- unlist(xpathSApply(pagetree, "/html/body/div/descendant::*[contains(text(),'skills')]", xmlValue))
  text2 <- unlist(xpathSApply(pagetree, "/html/body/div/descendant::*[contains(text(),'experience')]", xmlValue))
  text3 <- unlist(xpathSApply(pagetree, "/html/body/div/descendant::*[contains(text(),'language')]", xmlValue))
  # collapse all matches into a single string so the assignment is length one
  matched_text <- paste(c(text1, text2, text3), collapse = " ")
  if (nchar(matched_text) > 0) {
    t1[i] <- matched_text
  } else {
    t1[i] <- "123"  # placeholder for postings with no matching text
  }
}
# save the scraped text to disk, then read it back in for cleaning
write.table(t1, file = "data.csv")
file <- read.csv("data.csv", header = FALSE, stringsAsFactors = FALSE)
# collapse all columns into a single text field per posting
file$V1 <- paste(file$V1, file$V2, file$V3, file$V4, file$V5, file$V6, file$V7, file$V8, file$V9, file$V10, sep = "")
# build a corpus from the combined job text and clean it up
review_text <- paste(file$V1, collapse = " ")
review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# term frequencies, sorted from most to least common
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)
head(frequency, 170)
## data will experience business
## 55 35 31 25
## skills learning science machine
## 20 18 16 14
## team 123 analytics problems
## 13 12 12 11
## relevant work years language
## 11 10 10 9
## looking working optum position
## 9 9 8 8
## python big knowledge products
## 8 7 7 7
## programming strong training communication
## 7 7 7 6
## customers gaps identify lead
## 6 6 6 6
## make new one provide
## 6 6 6 6
## role sellers ability across
## 6 6 5 5
## actionable amazon analysis concepts
## 5 5 5 5
## core create develop experienced
## 5 5 5 5
## information passionate projects results
## 5 5 5 5
## scientist solve solving systems
## 5 5 5 5
## teams technical tools advanced
## 5 5 5 4
## also analytic building company
## 4 4 4 4
## critical deliver function healthcare
## 4 4 4 4
## highimpact join languages needed
## 4 4 4 4
## programs providing required senior
## 4 4 4 4
## skill software sql technology
## 4 4 4 4
## understand using able advocating
## 4 4 3 3
## algorithms app applied apply
## 3 3 3 3
## best build candidate clearly
## 3 3 3 3
## close combination complex crossfunctional
## 3 3 3 3
## databases developing development drive
## 3 3 3 3
## engineer environment example excellent
## 3 3 3 3
## focus following hands handson
## 3 3 3 3
## hires impact important insight
## 3 3 3 3
## insights intelligence interns key
## 3 3 3 3
## large least leveraging like
## 3 3 3 3
## lines major mentors mining
## 3 3 3 3
## modeling must opportunity optumlabs
## 3 3 3 3
## platform power problem processing
## 3 3 3 3
## quantitative questions recruitment scientistmachine
## 3 3 3 3
## scientists simple solutions statistical
## 3 3 3 3
## statistics successful two units
## 3 3 3 3
## variety acquire add analytical
## 3 2 2 2
## andor answering appthis artificial
## 2 2 2 2
## asking associated can capital
## 2 2 2 2
## cloud collaborative comfortable content
## 2 2 2 2
## datadriven deep degree demand
## 2 2 2 2
## design easy ecosystem email
## 2 2 2 2
## ensure exceptional
## 2 2
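The wordcloud package is loaded above but never called. As a minimal sketch (the min.freq, scale and palette settings below are illustrative assumptions, not part of the original analysis), the sorted term frequencies could be visualised like this:
# Sketch only: visualise the term frequencies computed above
set.seed(123)                              # make word placement reproducible
wordcloud(words = names(frequency),
          freq = frequency,
          min.freq = 3,                    # assumed cutoff, tune as needed
          scale = c(4, 0.5),
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2")) # RColorBrewer is loaded with wordcloud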
skill_data<-as.data.frame(frequency)
skill_data$skill<-rownames(skill_data)
# frequency count for technical skills
tech_skill <- skill_data[skill_data$skill %in% c(
  "presentation", "math", "design", "modeling", "predictive", "research",
  "business", "algorithm", "machine", "statistics", "software engineering",
  "matlab", "analytics", "mining", "analysis", "big", "programming", "python",
  "scriptingpythonrjavascala", "sql", "java", "hadoop", "linux", "sas", "C++"), ]
tech_skill$skill_type <- c("technical")
# frequency count for non technical skills
non_tech_skill <- skill_data[skill_data$skill %in% c(
  "Self", "interpersonal", "innovation", "creative", "curiosity", "leadership",
  "team", "management", "strategy", "communication"), ]
non_tech_skill$skill_type <- c("Non technical")
# combine technical and non technical skills
data_science_skills<-rbind(tech_skill,non_tech_skill)
rownames(data_science_skills)<-1:nrow(data_science_skills)
head(data_science_skills, 15)
## frequency skill skill_type
## 1 25 business technical
## 2 14 machine technical
## 3 12 analytics technical
## 4 8 python technical
## 5 7 big technical
## 6 7 programming technical
## 7 5 analysis technical
## 8 4 sql technical
## 9 3 mining technical
## 10 3 modeling technical
## 11 3 statistics technical
## 12 2 design technical
## 13 2 java technical
## 14 2 research technical
## 15 1 hadoop technical
write.table(data_science_skills,file="data_science_skills.csv")
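dplyr is loaded at the top but never used. As a hedged sketch rather than part of the original script, the exported table could be summarised by skill type to compare technical and non-technical demand:
# Sketch only: total mention count per skill type in the final table
library(dplyr)  # already loaded above
data_science_skills %>%
  group_by(skill_type) %>%
  summarise(total_mentions = sum(frequency),
            distinct_skills = n()) %>%
  arrange(desc(total_mentions))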