The goal of this project is to answer the research question Which are the most valued data science skills?
To obtain the latest available information about data science job postings we decided to scrap information from Indeed.com using various html nodes in the rvest package.
The resulting dataset contains fields such as: job title, company, location, job summary, link and job description.
suppressWarnings({
library(RColorBrewer)
library(rvest)
library(tidyverse)
library(data.table)
library(DT)
library(digest)
library(RPostgreSQL)
library(tidytext)
library(wordcloud)
library(RColorBrewer)
library(RCurl)
library(XML)
library(stringr)
library(ggplot2)
library(knitr)
library(magrittr)
})
## Loading required package: xml2
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## Loading required package: DBI
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
##
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
##
## xml
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
data <- data.frame(title=character(),
company=character(),
location=character(),
summary=character(),
link=character(),
description = character(),
stringsAsFactors=FALSE)
for (i in seq(0, 990, 10)){
url_ds <- paste0('https://www.indeed.com/jobs?q=data+scientist&l=all&start=',i)
var <- read_html(url_ds)
}
title <- var %>%
html_nodes('#resultsCol .jobtitle') %>%
html_text() %>%
str_extract("(\\w+.+)+")
company <- var %>%
html_nodes('#resultsCol .company') %>%
html_text() %>%
str_extract("(\\w+).+")
location <- var %>%
html_nodes('#resultsCol .location') %>%
html_text() %>%
str_extract("(\\w+.)+,.[A-Z]{2}")
summary <- var %>%
html_nodes('#resultsCol .summary') %>%
html_text() %>%
str_extract(".+")
link <- var %>%
html_nodes('#resultsCol .jobtitle .turnstileLink, #resultsCol a.jobtitle') %>%
html_attr('href')
link <- paste0("https://www.indeed.com",link)
data <- rbind(data, as.data.frame(cbind(title,
company,
location,
summary,
link)))
#create a unique id for each job posting
data$uniqueid <- mapply(function(x, y, z) digest(paste0(x,y,z)), data$title, data$location, data$company)
data %<>%
distinct(uniqueid, .keep_all = TRUE)
data %<>%
distinct(link, .keep_all = TRUE)
for (i in (1:length(data$link))){
desciption <- tryCatch(
html_text(html_node(read_html(as.character(data$link[i])),'.jobsearch-JobComponent-description')),
error=function(e){NA}
)
if (is.null(desciption)){
desc <- NA
}
data$description[i] <- desciption
}
## write.csv(listings, 'C:/Users/a/Desktop/607/myfile.csv')
#con <- dbConnect(
# dbDriver('PostgreSQL'),
# dbname = 'postgres',
#host = 'localhost',
# port = 5432,
# user = 'postgres',
# password = 'Ylua4786')
# sqltable <- "CREATE TABLE Indeed (
# title TEXT,
# company TEXT,
#location TEXT,
# summary TEXT,
# link TEXT,
# description TEXT,
# uniqueid TEXT PRIMARY KEY
#)"
# Create new SQL table "Indeed" using the query above if it does not exist yet
# if (!dbExistsTable(con, "Indeed")){
# dbExecute(con, sqltable)
# }
#dbWriteTable(con, "dup_Indeed", listings, row.names = FALSE)
#dupcheck <- "
#SELECT
# uniqueid,
#MAX(title) as title,
# MAX(company) as company,
# MAX(location) as location,
# MAX(summary) as summary,
# MAX(link) as link,
# MAX(description) as description
#FROM dup_listings
#WHERE uniqueid NOT IN (SELECT uniqueid FROM listings)
#GROUP BY uniqueid;
#"
#dupcheck_results <- dbGetQuery(con, dupcheck)
#if (nrow(dupcheck_results) > 0) {
# dbWriteTable(con, "Indeed", dupcheck_results, append = TRUE, row.names = FALSE)
#}
#dbRemoveTable(con, "dup_Indeed")
#dbDisconnect(con)
con <- dbConnect(
dbDriver('PostgreSQL'),
dbname = 'postgres',
host = 'localhost',
port = 5432,
user = 'postgres',
password = 'Ylua4786'
)
#Query to get Indeed table from SQL database.
sql <- "
SELECT uniqueid, title, company, location, summary, link, description
FROM listings
"
#Show query results
Indeed_results <- dbGetQuery(con, sql)
dbDisconnect(con)
## [1] TRUE
JOBLIST <- as.data.frame(readr::read_csv("https://raw.githubusercontent.com/joshuargst/607project3/master/joblistings.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## title = col_character(),
## company = col_character(),
## location = col_character(),
## summary = col_character(),
## link = col_character(),
## description = col_character(),
## State = col_character(),
## uniqueid = col_character()
## )
#uW <- unique(data.frame(strsplit(gsub("[^[:alnum:] ]", " ", toupper(JOBLIST[1,]$description)), " +")))
#names(uW) <- c("Skills")
#for(i in 2:nrow(JOBLIST)){ #nrow(JOBLIST)
# newDf <- unique(data.frame(strsplit(toupper(gsub("[^[:alnum:] ]", " ", JOBLIST[i,]$description)), " +")))
# names(newDf) <- c("Skills")
# uW <- rbind(uW, newDf)
# print(i)
#}
#count(uW, vars = "Skills")
# write.csv(a, 'C:/Users/a/Desktop/607/Skills.csv')
url<-"https://raw.githubusercontent.com/uplotnik/Data607/master/Skills.csv"
x<- read.csv(url)
wordcloud(JOBLIST$description, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
## Loading required namespace: tm
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
hard_skills <- c("R", "PYTHON", "SQL", "JAVA","PERL","C","HADOOP","APACHE","ORACLE","SCALA","ACCESS", "SAS","LINUX", "AZURE", "EXCEL","Metlab","AWS","TABLEAU","SPARK","HIVE")
hard_skills<-filter(x, Skills %in% hard_skills)%>%
mutate(perc=freq/sum(freq))
ggplot(hard_skills, aes(x = Skills, y = freq, fill = freq, label = scales::percent(perc))) +
geom_col(position = 'dodge') +
geom_text(position = position_dodge(width = .9), # move to center of bars
vjust = -0.5, # nudge above top of bar
size = 3) +
scale_y_continuous(labels = scales::percent)+ theme(axis.text=element_text(angle=90))+ ggtitle("Hard Skills")