1. Research Question

The goal of this project is to answer the research question: "Which are the most valued data science skills?"

To obtain the latest available information about data science job postings, we decided to scrape Indeed.com, targeting various HTML nodes with the rvest package.

The resulting dataset contains the following fields: job title, company, location, job summary, link, and job description.

2. Library Preparation

suppressWarnings({
  
library(RColorBrewer)
library(rvest)
library(tidyverse)
library(data.table)
library(DT)
library(digest)
library(RPostgreSQL)
library(tidytext)
library(wordcloud)
library(RCurl)
library(XML)
library(stringr)
library(ggplot2)
library(knitr)
library(magrittr)
})
## Loading required package: xml2
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## Loading required package: DBI
## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
## 
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
## 
##     xml
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract

3. Scraping Indeed.com Data

data <- data.frame(title = character(),
                   company = character(),
                   location = character(),
                   summary = character(),
                   link = character(),
                   description = character(),
                   stringsAsFactors = FALSE)

# Loop over the result pages (Indeed paginates in steps of 10) and
# extract the fields from each page before moving on to the next one.
for (i in seq(0, 990, 10)){
  url_ds <- paste0('https://www.indeed.com/jobs?q=data+scientist&l=all&start=', i)
  var <- read_html(url_ds)

  title <- var %>%
    html_nodes('#resultsCol .jobtitle') %>%
    html_text() %>%
    str_extract("(\\w+.+)+")

  company <- var %>%
    html_nodes('#resultsCol .company') %>%
    html_text() %>%
    str_extract("(\\w+).+")

  location <- var %>%
    html_nodes('#resultsCol .location') %>%
    html_text() %>%
    str_extract("(\\w+.)+,.[A-Z]{2}")

  summary <- var %>%
    html_nodes('#resultsCol .summary') %>%
    html_text() %>%
    str_extract(".+")

  link <- var %>%
    html_nodes('#resultsCol .jobtitle .turnstileLink, #resultsCol a.jobtitle') %>%
    html_attr('href')
  link <- paste0("https://www.indeed.com", link)

  # Append this page's postings; the full description is filled in later
  data <- rbind(data, data.frame(title, company, location, summary, link,
                                 description = NA_character_,
                                 stringsAsFactors = FALSE))
}
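Indeed serves these results across a hundred pages, so the loop fires many requests in quick succession. A guard against pages that fail to load, plus a short courtesy pause, keeps the scraper polite; a minimal sketch of how the top of the loop body could be adapted (the 2-second delay is an arbitrary choice):

  var <- tryCatch(read_html(url_ds), error = function(e) NULL)
  if (is.null(var)) next   # skip pages that fail to load or parse
  Sys.sleep(2)             # courtesy pause between requests (arbitrary choice)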

1. Create a unique id for each job posting

# Create a unique id for each job posting by hashing its title, location, and company
data$uniqueid <- mapply(function(x, y, z) digest(paste0(x, y, z)), data$title, data$location, data$company)
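Since digest() defaults to a 32-character MD5 hash, identical title/location/company triples map to the same id. A quick sanity check that the ids really are unique:

# Number of postings sharing a title/location/company triple with an earlier one
sum(duplicated(data$uniqueid))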

2. Obtain full description for all job postings

for (i in seq_along(data$link)){
  # Visit each posting's page; return NA if the page fails to load or parse
  description <- tryCatch(
    html_text(html_node(read_html(as.character(data$link[i])), '.jobsearch-JobComponent-description')),
    error = function(e) NA
  )
  data$description[i] <- description
}
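The tryCatch above stores NA whenever a posting's page cannot be retrieved, so it is worth checking how much was lost before moving on:

# Count postings whose full description could not be retrieved
sum(is.na(data$description))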

4. Save Data as a CSV File

Writing with row.names = FALSE avoids the unnamed index column that otherwise comes back as X1 when the file is re-read in section 7.

# write.csv(data, 'C:/Users/a/Desktop/607/myfile.csv', row.names = FALSE)

5. Store Data in SQL

# con <- dbConnect(
#   dbDriver('PostgreSQL'),
#   dbname = 'postgres',
#   host = 'localhost',
#   port = 5432,
#   user = 'postgres',
#   password = 'Ylua4786')

# sqltable <- "CREATE TABLE listings (
#   title TEXT,
#   company TEXT,
#   location TEXT,
#   summary TEXT,
#   link TEXT,
#   description TEXT,
#   uniqueid TEXT PRIMARY KEY
# )"

# Create the listings table using the query above if it does not exist yet
# if (!dbExistsTable(con, "listings")){
#   dbExecute(con, sqltable)
# }

To avoid inserting duplicate postings, new rows are first written to a staging table, and only ids not already present in listings are copied over:

# dbWriteTable(con, "dup_listings", data, row.names = FALSE)
# dupcheck <- "
#   SELECT
#     uniqueid,
#     MAX(title) AS title,
#     MAX(company) AS company,
#     MAX(location) AS location,
#     MAX(summary) AS summary,
#     MAX(link) AS link,
#     MAX(description) AS description
#   FROM dup_listings
#   WHERE uniqueid NOT IN (SELECT uniqueid FROM listings)
#   GROUP BY uniqueid;
# "
# dupcheck_results <- dbGetQuery(con, dupcheck)

# if (nrow(dupcheck_results) > 0) {
#   dbWriteTable(con, "listings", dupcheck_results, append = TRUE, row.names = FALSE)
# }
# dbRemoveTable(con, "dup_listings")
# dbDisconnect(con)

6. Import Data from SQL

con <- dbConnect(
  dbDriver('PostgreSQL'),
  dbname = 'postgres',
  host = 'localhost',
  port = 5432,
  user = 'postgres',
  password = 'Ylua4786'
)
# Query to pull the listings table from the SQL database
sql <- "
    SELECT uniqueid, title, company, location, summary, link, description
    FROM listings
"
# Run the query and store the results
Indeed_results <- dbGetQuery(con, sql)
dbDisconnect(con)
## [1] TRUE
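The DT package loaded in section 2 provides an interactive table that is handy for spot-checking the import; a quick sketch:

# Interactive view of the first 20 imported rows
DT::datatable(head(Indeed_results, 20))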

7. Data Cleaning

1. Read the job listings and extract the unique words from the first description

JOBLIST <- as.data.frame(readr::read_csv("https://raw.githubusercontent.com/joshuargst/607project3/master/joblistings.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   title = col_character(),
##   company = col_character(),
##   location = col_character(),
##   summary = col_character(),
##   link = col_character(),
##   description = col_character(),
##   State = col_character(),
##   uniqueid = col_character()
## )
#uW <- unique(data.frame(strsplit(gsub("[^[:alnum:] ]", " ", toupper(JOBLIST[1,]$description)), " +")))
#names(uW) <- c("Skills")

2. Split the words for each remaining row and combine them into one data frame

#for(i in 2:nrow(JOBLIST)){
#  newDf <- unique(data.frame(strsplit(toupper(gsub("[^[:alnum:] ]", " ", JOBLIST[i,]$description)), " +")))
#  names(newDf) <- c("Skills")
#  uW <- rbind(uW, newDf)
#  print(i)  # progress indicator
#}

3. Count the duplicate rows and save the result as a CSV file

#skill_counts <- count(uW, Skills)
#write.csv(skill_counts, 'C:/Users/a/Desktop/607/Skills.csv')
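The tidytext package loaded in section 2 offers a tidier route to the same per-posting unique-word counts; a sketch, assuming JOBLIST is in memory as above (note that unnest_tokens() lowercases tokens where the code above uppercased them):

word_counts <- JOBLIST %>%
  select(uniqueid, description) %>%
  unnest_tokens(word, description) %>%   # tokenizes, lowercases, strips punctuation
  distinct(uniqueid, word) %>%           # count each word at most once per posting
  count(word, sort = TRUE)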

8. Data Visualization

url<-"https://raw.githubusercontent.com/uplotnik/Data607/master/Skills.csv"
x<- read.csv(url)
wordcloud(JOBLIST$description, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
## Loading required namespace: tm
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
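Word placement in wordcloud() is randomized, so the figure can change from knit to knit; fixing the seed just before the call makes it reproducible (the seed value itself is arbitrary):

set.seed(1234)  # any fixed value works; makes the cloud layout reproducible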

hard_skills <- c("R", "PYTHON", "SQL", "JAVA", "PERL", "C", "HADOOP", "APACHE", "ORACLE", "SCALA", "ACCESS", "SAS", "LINUX", "AZURE", "EXCEL", "MATLAB", "AWS", "TABLEAU", "SPARK", "HIVE")
# Each skill's share of all hard-skill mentions
hard_skills_freq <- filter(x, Skills %in% hard_skills) %>%
  mutate(perc = freq / sum(freq))
ggplot(hard_skills_freq, aes(x = Skills, y = perc, fill = freq, label = scales::percent(perc))) +
  geom_col(position = 'dodge') +
  geom_text(position = position_dodge(width = .9),  # center labels on the bars
            vjust = -0.5,                           # nudge above the top of each bar
            size = 3) +
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text = element_text(angle = 90)) +
  ggtitle("Hard Skills")
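One optional refinement: ordering the bars by frequency reads more easily than the default alphabetical order. A sketch swapping reorder() into the x aesthetic:

# Same plot, with skills sorted from most to least frequent
ggplot(hard_skills_freq, aes(x = reorder(Skills, -freq), y = perc,
                             fill = freq, label = scales::percent(perc))) +
  geom_col() +
  geom_text(vjust = -0.5, size = 3) +
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text = element_text(angle = 90)) +
  labs(x = "Skills", title = "Hard Skills")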