**Scrape the Kaggle job board to extract the data science skills in demand**

  1. Load all required packages
library(stringr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.4
## Loading required package: RColorBrewer
library(tm)
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
library(XML)
## Warning: package 'XML' was built under R version 3.2.3
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.2.3
## Loading required package: bitops
  1. Get the list of job links from Kaggle, then extract the contents of those links
# Build the list of job-posting URLs from the Kaggle job board page.
kaggle_base_url <- 'https://www.kaggle.com'
kaggle_lines <- readLines("https://www.kaggle.com/jobs")

# Keep only the HTML lines that hold a job-post anchor, then strip everything
# up to and including href=" as well as the trailing target attribute, so that
# only the relative link of each posting remains.
kaggle_lines <- grep('<a class="job-post-row"', kaggle_lines, value=TRUE)
kaggle_lines <- gsub('.*(href=")','',kaggle_lines)
kaggle_lines <- gsub('(" target=\"\">)','',kaggle_lines)

# paste0() treats a zero-length vector as "", so without this guard an empty
# match would silently yield the bare base URL as the only "job link".
if (length(kaggle_lines) == 0) {
  stop("No job-post links found on the Kaggle job board page", call. = FALSE)
}

full_url <- paste0(kaggle_base_url, kaggle_lines)

length(full_url)
## [1] 40
  1. For each URL, extract key expressions such as experience, skills, and language
# Collect, for every job posting, the text of all elements that mention
# "skills", "experience" or "language". t1 holds exactly one string per URL.
t1 <- character(length(full_url))

# seq_along() (rather than 1:length()) skips the loop when there are no URLs.
for (i in seq_along(full_url)) {
  webpage <- getURL(full_url[i])

  # The download is HTML text, not a file name, so say so explicitly; parser
  # errors on malformed HTML are deliberately ignored.
  pagetree <- htmlTreeParse(webpage, asText = TRUE, error = function(...) {},
                            useInternalNodes = TRUE)

  # search for key words
  text1 <- unlist(xpathSApply(pagetree, "/html/body/div/descendant::*[contains (text(),'skills')]", xmlValue))
  text2 <- unlist(xpathSApply(pagetree, "/html/body/div/descendant::*[contains (text(),'experience')]", xmlValue))
  text3 <- unlist(xpathSApply(pagetree, "/html/body/div/descendant::*[contains (text(),'language')]", xmlValue))

  # Each query may return zero, one or many strings. Collapse all of them into
  # a single string: assigning the element-wise paste(text1, text2, text3)
  # directly to t1[i] keeps only its first element and raises a
  # "number of items to replace" warning.
  matched_text <- c(text1, text2, text3)

  if (length(matched_text) > 0) {
    t1[i] <- paste(matched_text, collapse = " ")
  } else {
    # Placeholder for postings without any match.
    # NOTE(review): this value is stored like real text, so it will be counted
    # as a term by any later text mining of t1.
    t1[i] <- "123"
  }
}
## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length

## Warning in t1[i] <- paste(text1, text2, text3): number of items to replace
## is not a multiple of replacement length
# Save the per-posting text to disk using write.table()'s default layout
# (space-separated, strings quoted, header line and row names included).
write.table(x = t1, file = "data.csv")
  1. Work with the raw data file and text-mine the key words
# Read the scraped text back in. data.csv is produced by write.table() with
# its defaults (space-separated, quoted, header line plus row names), so it is
# read with read.table(); read.csv() would split every posting at the commas
# inside its text and treat the header and row names as data.
# comment.char = "" keeps any '#' in the postings from being read as a comment.
file <- read.table("data.csv", header = TRUE, stringsAsFactors = FALSE,
                   comment.char = "")

# One row per posting; expose the text column under the name V1.
names(file)[1] <- "V1"

# Merge all postings into one document, skipping missing entries so that the
# literal string "NA" does not end up in the corpus.
review_text <- paste(file$V1[!is.na(file$V1)], collapse = " ")
review_source <- VectorSource(review_text)

# Normalise the text: lower-case, drop punctuation, squeeze whitespace and
# remove English stop words.
corpus <- Corpus(review_source)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)

corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Count how often each remaining term occurs, most frequent first.
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)
head(frequency, 170)
##             data             will       experience         business 
##               55               35               31               25 
##           skills         learning          science          machine 
##               20               18               16               14 
##             team              123        analytics         problems 
##               13               12               12               11 
##         relevant             work            years         language 
##               11               10               10                9 
##          looking          working            optum         position 
##                9                9                8                8 
##           python              big        knowledge         products 
##                8                7                7                7 
##      programming           strong         training    communication 
##                7                7                7                6 
##        customers             gaps         identify             lead 
##                6                6                6                6 
##             make              new              one          provide 
##                6                6                6                6 
##             role          sellers          ability           across 
##                6                6                5                5 
##       actionable           amazon         analysis         concepts 
##                5                5                5                5 
##             core           create          develop      experienced 
##                5                5                5                5 
##      information       passionate         projects          results 
##                5                5                5                5 
##        scientist            solve          solving          systems 
##                5                5                5                5 
##            teams        technical            tools         advanced 
##                5                5                5                4 
##             also         analytic         building          company 
##                4                4                4                4 
##         critical          deliver         function       healthcare 
##                4                4                4                4 
##       highimpact             join        languages           needed 
##                4                4                4                4 
##         programs        providing         required           senior 
##                4                4                4                4 
##            skill         software              sql       technology 
##                4                4                4                4 
##       understand            using             able       advocating 
##                4                4                3                3 
##       algorithms              app          applied            apply 
##                3                3                3                3 
##             best            build        candidate          clearly 
##                3                3                3                3 
##            close      combination          complex  crossfunctional 
##                3                3                3                3 
##        databases       developing      development            drive 
##                3                3                3                3 
##         engineer      environment          example        excellent 
##                3                3                3                3 
##            focus        following            hands          handson 
##                3                3                3                3 
##            hires           impact        important          insight 
##                3                3                3                3 
##         insights     intelligence          interns              key 
##                3                3                3                3 
##            large            least       leveraging             like 
##                3                3                3                3 
##            lines            major          mentors           mining 
##                3                3                3                3 
##         modeling             must      opportunity        optumlabs 
##                3                3                3                3 
##         platform            power          problem       processing 
##                3                3                3                3 
##     quantitative        questions      recruitment scientistmachine 
##                3                3                3                3 
##       scientists           simple        solutions      statistical 
##                3                3                3                3 
##       statistics       successful              two            units 
##                3                3                3                3 
##          variety          acquire              add       analytical 
##                3                2                2                2 
##            andor        answering          appthis       artificial 
##                2                2                2                2 
##           asking       associated              can          capital 
##                2                2                2                2 
##            cloud    collaborative      comfortable          content 
##                2                2                2                2 
##       datadriven             deep           degree           demand 
##                2                2                2                2 
##           design             easy        ecosystem            email 
##                2                2                2                2 
##           ensure      exceptional 
##                2                2
  1. Do a frequency count and prepare a data file by skill type, skill name, and frequency
# Turn the named frequency vector into a data frame with one row per term,
# and keep the term itself as a regular column.
skill_data <- as.data.frame(frequency)
skill_data$skill <- row.names(skill_data)

# Technical skills: rows whose term is one of the keywords below.
is_technical <- is.element(
  skill_data$skill,
  c("presentation", "math", "design", "modeling", "predictive", "research",
    "business", "algorithm", "machine", "statistics", "software engineering",
    "matlab", "analytics", "mining", "analysis", "big", "programming",
    "python", "scriptingpythonrjavascala", "sql", "java", "hadoop", "linux",
    "sas", "C++")
)
tech_skill <- skill_data[is_technical, ]
tech_skill$skill_type <- "technical"

# Non-technical skills: same lookup with a second keyword list.
is_non_technical <- is.element(
  skill_data$skill,
  c("Self", "interpersonal", "innovation", "creative", "curiosity",
    "leadership", "team", "management", "strategy", "communication")
)
non_tech_skill <- skill_data[is_non_technical, ]
non_tech_skill$skill_type <- "Non technical"

# Stack both groups (technical first) and replace the term row names with a
# plain running index.
data_science_skills <- rbind(tech_skill, non_tech_skill)
row.names(data_science_skills) <- seq_len(nrow(data_science_skills))
head(data_science_skills, 15)
##    frequency       skill skill_type
## 1         25    business  technical
## 2         14     machine  technical
## 3         12   analytics  technical
## 4          8      python  technical
## 5          7         big  technical
## 6          7 programming  technical
## 7          5    analysis  technical
## 8          4         sql  technical
## 9          3      mining  technical
## 10         3    modeling  technical
## 11         3  statistics  technical
## 12         2      design  technical
## 13         2        java  technical
## 14         2    research  technical
## 15         1      hadoop  technical
# Save the combined skill table to disk using write.table()'s default layout.
write.table(x = data_science_skills, file = "data_science_skills.csv")