Data607Project3

What are the most valuable skills as a Data Scientist?

library(bitops)
library(RCurl)
library(jsonlite)
library(stringr)
library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:RCurl':
## 
##     complete

library(rvest)

## Loading required package: xml2

# Build the search-results URL for the listing that starts at offset `i`.
# `i` may be a single offset or a vector of offsets; one URL is returned
# per element.
urlfn <- function(i) {
  paste0("https://www.itjobswatch.co.uk/search?q=Data+Scientist&start=", i)
}

# Constants stamped onto every scraped row.
SearchTerm <- "Data Scientist"
Currency <- "Pound"

# One results-page URL per start offset (0, 15, 30, 45, 60).
# For a quick single-page run, use only offset 0 instead.
mainURLs <- vapply(seq(0, 60, by = 15), urlfn, FUN.VALUE = character(1))

# Main Table
# For every search-results page: scrape the job listings, fetch each linked
# posting, count how often each skill pattern occurs in it, join the two
# tables and append the result to itjobsUK.csv.

# The skill list is identical for every page, so download it once up front
# rather than once per results page.
jobskills <- readLines('https://raw.githubusercontent.com/DATA607/Project3/master/jobskills.txt')

outFile <- "itjobsUK.csv"
allPagesCtr <- 1
for (mainURL in mainURLs) {
  print(paste0("Processing Page ", allPagesCtr))
  allPagesCtr <- allPagesCtr + 1
  htmlF <- read_html(mainURL)

  Salary <- htmlF %>%
    html_nodes('.jobDetails div:nth-child(1) span') %>%
    html_text()

  Location <- htmlF %>%
    html_nodes('.location') %>%
    html_text()

  Title <- htmlF %>%
    html_nodes('.jobTitle a') %>%
    html_text()

  Company <- htmlF %>%
    html_nodes('.company span') %>%
    html_text()

  linkAttrs <- htmlF %>%
    html_nodes('.showMore') %>%
    html_attrs()

  # paste0() with a zero-length argument would still yield one bogus URL, so
  # a page without any job links has to be skipped explicitly.
  if (length(linkAttrs) == 0) {
    message("No job links found on ", mainURL, "; skipping page")
    next
  }

  # The first attribute of each '.showMore' node is used as the site-relative
  # path of the posting; concatenate it to the main url.
  surl <- unlist(lapply(linkAttrs, `[[`, 1))
  surl <- paste0('https://www.itjobswatch.co.uk', surl)

  mainDF <- data.frame(Salary = Salary, Location = Location, Title = Title,
                       stringsAsFactors = FALSE)
  mainDF$JobLink <- surl
  mainDF$SearchTerm <- SearchTerm
  mainDF$Currency <- Currency
  mainDF$Company <- Company
  mainDF$MainSite <- mainURL

  # Sub Table: one row per (posting, skill) with the number of matches of the
  # skill pattern in the raw page text.
  # NOTE(review): each skill is used as a regular expression and matched
  # case-sensitively against the unparsed HTML -- confirm that entries such as
  # names containing '+' or '.' are meant to be treated that way.
  subdf <- ldply(surl, function(suburl) {
    # warn = FALSE suppresses only the "incomplete final line" warning these
    # pages trigger; every other warning stays visible.
    pageText <- str_c(readLines(suburl, n = -1, warn = FALSE), collapse = "")
    data.frame(
      SubUrl = rep(suburl, length(jobskills)),
      Skill = jobskills,
      Count = str_count(pageText, jobskills),
      stringsAsFactors = FALSE
    )
  })
  # Running row id; numbering restarts at 1 for every results page.
  subdf$Id <- seq_len(nrow(subdf))
  subdf <- subdf[c("Id", "SubUrl", "Skill", "Count")]

  combinedDF <- inner_join(x = mainDF, y = subdf, by = c("JobLink" = "SubUrl"))
  # dim() has two elements; collapse them so a single message is printed.
  print(paste0("Created CombinedDF with dimensions=",
               paste(dim(combinedDF), collapse = " x ")))

  # Write the header only when the file is being created; later pages (and
  # later runs) append data rows only, so the CSV never contains repeated
  # header lines.
  appendToExisting <- file.exists(outFile)
  write.table(combinedDF, file = outFile, append = appendToExisting,
              col.names = !appendToExisting, row.names = FALSE,
              quote = TRUE, sep = ",")
}

## [1] "Processing Page 1"

## Warning in readLines(suburl, n = -1): incomplete final line found on
## 'https://www.itjobswatch.co.uk/ja/clk?jr=btoyd-8n4-7lz-290'

## Warning in readLines(suburl, n = -1): incomplete final line found on
## 'https://www.itjobswatch.co.uk/ja/clk?jr=btlq4-8n4-7lz-290'

## Warning in readLines(suburl, n = -1): incomplete final line found on
## 'https://www.itjobswatch.co.uk/ja/clk?jr=bs3z8-8n4-7lz-290'

## Warning in readLines(suburl, n = -1): incomplete final line found on
## 'https://www.itjobswatch.co.uk/ja/clk?jr=bteqy-8n4-7lz-290'

## [1] "Created CombinedDF with dimensions=375"
## [2] "Created CombinedDF with dimensions=11" 
## [1] "Processing Page 2"
## [1] "Created CombinedDF with dimensions=375"
## [2] "Created CombinedDF with dimensions=11" 
## [1] "Processing Page 3"
## [1] "Created CombinedDF with dimensions=375"
## [2] "Created CombinedDF with dimensions=11" 
## [1] "Processing Page 4"
## [1] "Created CombinedDF with dimensions=375"
## [2] "Created CombinedDF with dimensions=11" 
## [1] "Processing Page 5"
## [1] "Created CombinedDF with dimensions=350"
## [2] "Created CombinedDF with dimensions=11"

Data607Project3

Ann Liu-Ferrara

March 17, 2017

What are the most valuable skills as a Data Scientist?