# What are the most valuable skills as a Data Scientist?
library(bitops)
library(RCurl)
library(jsonlite)
library(stringr)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(rvest)
## Loading required package: xml2
# Build the itjobswatch search-results URL for a given result offset.
#
# i: value(s) to place in the `start` query parameter.
# Returns a character vector of URLs, one per element of `i`.
urlfn <- function(i) {
  # format() keeps large numeric offsets in plain digits; pasting a double
  # such as 1e5 directly would render it as "1e+05" and corrupt the query.
  start <- format(i, scientific = FALSE, trim = TRUE)
  paste0(
    "https://www.itjobswatch.co.uk/search?q=Data+Scientist&start=",
    start
  )
}
# One search-results URL per offset 0, 15, ..., 60. vapply() guarantees a
# character vector with exactly one URL per offset.
mainURLs <- vapply(seq(0, 60, by = 15), urlfn, character(1))
# Single-page variant, handy for a quick trial run:
# mainURLs <- vapply(0:0, urlfn, character(1))
# Constant labels stamped onto every scraped row.
SearchTerm <- "Data Scientist"
Currency <- "Pound"
# Main Table
# For every search-results page: scrape the listing fields, download each
# linked job advert, count how often each skill pattern occurs in it, join
# the two tables on the advert URL and append the result to itjobsUK.csv.

# The skill list does not change between pages, so download it once.
jobskills <- readLines(
  "https://raw.githubusercontent.com/DATA607/Project3/master/jobskills.txt"
)
outFile <- "itjobsUK.csv"
allPagesCtr <- 1
for (mainURL in mainURLs) {
  print(paste0("Processing Page ", allPagesCtr))
  allPagesCtr <- allPagesCtr + 1
  htmlF <- read_html(mainURL)
  Salary <- htmlF %>%
    html_nodes(".jobDetails div:nth-child(1) span") %>%
    html_text()
  Location <- htmlF %>%
    html_nodes(".location") %>%
    html_text()
  Title <- htmlF %>%
    html_nodes(".jobTitle a") %>%
    html_text()
  Company <- htmlF %>%
    html_nodes(".company span") %>%
    html_text()
  surl <- htmlF %>%
    html_nodes(".showMore") %>%
    html_attrs()
  # Keep the first attribute of every .showMore node; it is used below as
  # the site-relative path of the advert.
  surl <- vapply(surl, function(attrs) attrs[[1]], character(1),
                 USE.NAMES = FALSE)
  if (length(surl) == 0) {
    message("No job listings found on ", mainURL, "; skipping page")
    next
  }
  # concatenate sub-url to main url
  surl <- paste0("https://www.itjobswatch.co.uk", surl)

  # data.frame() silently recycles vectors whose lengths divide evenly,
  # which would misalign fields across listings, so fail loudly instead.
  fieldLengths <- c(
    Salary = length(Salary), Location = length(Location),
    Title = length(Title), Company = length(Company),
    JobLink = length(surl)
  )
  if (length(unique(fieldLengths)) != 1) {
    stop(
      "Scraped fields have inconsistent lengths on ", mainURL, ": ",
      paste(names(fieldLengths), fieldLengths, sep = "=", collapse = ", "),
      call. = FALSE
    )
  }
  mainDF <- data.frame(
    Salary = Salary, Location = Location, Title = Title, JobLink = surl,
    SearchTerm = SearchTerm, Currency = Currency, Company = Company,
    MainSite = mainURL, stringsAsFactors = FALSE
  )

  # Sub Table
  # One row per (advert, skill): how many times the skill pattern matches
  # the advert's raw page text.
  subdf <- ldply(surl, function(suburl) {
    # warn = FALSE silences only the "incomplete final line" warning.
    text <- str_c(readLines(suburl, n = -1, warn = FALSE), collapse = "")
    data.frame(
      SubUrl = rep(suburl, length(jobskills)),
      Skill = jobskills,
      Count = str_count(text, jobskills),
      stringsAsFactors = FALSE
    )
  })
  # Running row id within this page, as the first column.
  subdf <- cbind(Id = seq_len(nrow(subdf)), subdf)

  combinedDF <- inner_join(x = mainDF, y = subdf,
                           by = c("JobLink" = "SubUrl"))
  # combinedDF <- str_replace_all(combinedDF, "[\t\r\n]" , "")
  print(paste0(
    "Created CombinedDF with dimensions=",
    paste(dim(combinedDF), collapse = " x ")
  ))
  # reorder columns
  # combinedDF <- combinedDF[c(9, 8, 5, 4, 6, 1, 2, 5, 10, 11)]

  # Write the header only when the file is first created; appending with
  # column names would repeat the header row inside the data on every page.
  appendToExisting <- file.exists(outFile)
  write.table(combinedDF, file = outFile, append = appendToExisting,
              col.names = !appendToExisting, row.names = FALSE,
              quote = TRUE, sep = ",")
}
## [1] "Processing Page 1"
## Warning in readLines(suburl, n = -1): incomplete final line found on
## 'https://www.itjobswatch.co.uk/ja/clk?jr=btoyd-8n4-7lz-290'
## Warning in readLines(suburl, n = -1): incomplete final line found on
## 'https://www.itjobswatch.co.uk/ja/clk?jr=btlq4-8n4-7lz-290'
## Warning in readLines(suburl, n = -1): incomplete final line found on
## 'https://www.itjobswatch.co.uk/ja/clk?jr=bs3z8-8n4-7lz-290'
## Warning in readLines(suburl, n = -1): incomplete final line found on
## 'https://www.itjobswatch.co.uk/ja/clk?jr=bteqy-8n4-7lz-290'
## [1] "Created CombinedDF with dimensions=375"
## [2] "Created CombinedDF with dimensions=11"
## [1] "Processing Page 2"
## [1] "Created CombinedDF with dimensions=375"
## [2] "Created CombinedDF with dimensions=11"
## [1] "Processing Page 3"
## [1] "Created CombinedDF with dimensions=375"
## [2] "Created CombinedDF with dimensions=11"
## [1] "Processing Page 4"
## [1] "Created CombinedDF with dimensions=375"
## [2] "Created CombinedDF with dimensions=11"
## [1] "Processing Page 5"
## [1] "Created CombinedDF with dimensions=350"
## [2] "Created CombinedDF with dimensions=11"