| Last modified: 10/21/2017 21:14 |
\(\quad\) W. Edwards Deming said, "In God we trust, all others must bring data." Please use data to answer the question, "Which are the most valued data science skills?" Consider your work as an exploration; there is not necessarily a "right answer."
\(\quad\) Lawyers need a strong familiarity with the laws of the country in which they practice. Doctors need a solid understanding of human anatomy. But as an emerging field within business analytics, data science has no such clear-cut requirements. What skills define a data scientist?
\(\quad\) In this project, we set out to answer that question. By collecting data from job postings, as well as resumes, we hope to pinpoint the qualifications needed to become a successful data scientist.
\(\quad\) We began by searching several job-listing sites for "data scientist" postings. We then crawled each results page, extracted every link, discarded the irrelevant ones, and stored the remainder in a .csv file. With the help of the rvest package, we looped through that file, converting each line to a URL, parsing the HTML, extracting the relevant portions, and storing the results in a second column. Because we decided only afterwards to include location in the analysis, we repeated the process, this time adding a new column for each job's location.
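\(\quad\) The link-collection step itself is not shown in the code that follows, so a minimal sketch of how it could look with rvest is given here; the search URL, the ".job-title a" selector, and the "/job/" filter are placeholders rather than the values actually used for each site.
# Sketch of the link-collection step (placeholder URL and selectors, not the ones actually used)
search_url <- "https://www.example-jobboard.com/jobs?q=data+scientist"
links <- read_html(search_url) %>%
  html_nodes(".job-title a") %>%   # anchors wrapping each posting title (assumed selector)
  html_attr("href") %>%
  unique()
# Keep only links that point at individual postings, then save them for the scraping loops below
links <- links[grepl("/job/", links)]
write.csv(data.frame(Links = links), "careerbuilder.csv", row.names = FALSE)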
| TODO: Reverse the word filter: whitelist specific skill terms rather than blacklist 90% of the words. |
# Load libraries -----------------
library(tidyverse)
library(ggplot2)
library(rvest)
library(tm)
library(SnowballC)
library(wordcloud2)
library(stringr)
library(plotly)
#
# Read the two csv files: the first holds the scraped URLs, the second holds the links plus a column for each job's description
#
cb_raw <- read.csv("careerbuilder.csv", stringsAsFactors = F)
cb <- read.csv("cb.csv", stringsAsFactors = F)
# Fill in the description for any row that is still empty
for (i in 1:nrow(cb_raw)){
  if (is.na(cb$desc[i]) || cb$desc[i] == ""){
    url <- read_html(cb_raw$Links[i])
    cb$desc[i] <- url %>%
      html_node(".description") %>%
      html_text()
  }
  # Pause between requests to be polite to the server
  Sys.sleep(5)
}
#
# Repeat the process, this time extracting only the location for each job (missed on the first run).
# Since this pass was run on a different day, some of the links were dead. To keep the loop from
# breaking whenever a link returned a 404 error, each request is wrapped in tryCatch.
#
cb_raw <- read.csv("careerbuilder.csv", stringsAsFactors = F)
cb <- read.csv("cb.csv", stringsAsFactors = F)
# Initialise the column if it doesn't exist yet, then fill in only the missing rows
if (is.null(cb$loc)) cb$loc <- NA_character_
for (i in 1:nrow(cb_raw)){
  if (is.na(cb$loc[i])){
    tryCatch({
      url <- read_html(cb_raw$Links[i])
      # The #job-company-name block holds both the company and the location;
      # the regexes below strip it down to just the location
      cb$loc[i] <- url %>%
        html_node("#job-company-name") %>%
        html_text()
    },
    error = function(e) NA)
  }
  Sys.sleep(5)
}
# Regex patterns to keep only the location info
# Bulletpoint unicode is U+2022 ----------------
# 1/2 unicode is U+00BD
patt1 <- "\\n.*\\n.*\\n"
patt2 <- ".*\\,\\s\\w+"
patt3 <- "[[:punct:]]\\w+.*"
# Rows skipped in these index vectors were formatted differently and are handled manually (or dropped)
loop_seq <- c(1:49, 50:68, 71:92, 94:129, 131:nrow(cb))
loop_seq2 <- c(1:48, 50:68, 71:92, 94:114, 116:129, 131:nrow(cb))
# Split the string by newline character, making it easier to extract what we need
for (i in loop_seq){
  cb$loc[i] <- str_split(cb$loc[i], patt1) %>%
    str_extract_all(patt2)
}
# Extract the actual location - the only relevant info
for (i in loop_seq2){
  cb$loc[i] <- str_extract_all(cb$loc[i], patt3)
}
# Row 69 was formatted differently from the others, so it is handled manually
cb$loc[69] <- str_split(cb$loc[69], patt1) %>%
  str_extract_all("\\w+\\s\\w+\\,\\w+")
# Remove stray quotation marks and trim the whitespace
cb$loc <- gsub('"', " ", cb$loc) %>%
  str_trim()
dice <- read.csv("https://raw.githubusercontent.com/JoshuaSturm/data607-project3/master/dice.csv?token=AbaIfRRPehMvvHafdfhuqfZMoC1xHzuAks5Z8oXrwA%3D%3D", stringsAsFactors = F)
# Scrape the description, location, and job title for each Dice posting
for (i in 1:nrow(dice)){
  url <- read_html(dice$Links[i])
  dice$Desc[i] <- url %>%
    html_node("#jobdescSec") %>%
    html_text()
  dice$loc[i] <- url %>%
    html_node(".location span") %>%
    html_text()
  dice$title[i] <- url %>%
    html_node("#jt") %>%
    html_text()
  Sys.sleep(5)
}
datajobs <- read.csv("https://raw.githubusercontent.com/JoshuaSturm/data607-project3/master/datajobs.csv?token=AbaIfRP6t79f4joLWamQBQKqshxumELIks5Z8oYawA%3D%3D", stringsAsFactors = F)
# Scrape the description, location, and job title for each DataJobs posting
for (i in 1:nrow(datajobs)){
  url <- read_html(datajobs$Links[i])
  datajobs$Desc[i] <- url %>%
    html_node("#job_description") %>%
    html_text()
  datajobs$loc[i] <- url %>%
    html_node("div:nth-child(4) :nth-child(1) .jobpost-table-cell-2") %>%
    html_text()
  datajobs$title[i] <- url %>%
    html_node("h1 strong") %>%
    html_text()
  Sys.sleep(5)
}
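# The three scraping loops above follow the same pattern and differ only in their CSS selectors.
# A sketch of a reusable helper (the name scrape_field is mine, not part of the original script),
# combining that pattern with the tryCatch guard used for the CareerBuilder pass:
scrape_field <- function(link, selector){
  tryCatch({
    read_html(link) %>%
      html_node(selector) %>%
      html_text()
  },
  error = function(e) NA)
}
# Usage with the same selectors as above, e.g. scrape_field(dice$Links[1], "#jobdescSec")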
# Hand-picked blacklist of filler words to strip from the corpora (see the whitelist TODO at the top)
removeColumn <- c("and", "the", "with", "for", "our", "will", "are", "that", "you", "this", "from", "new", "their", "have", "other", "years", "your", "all", "using", "more", "such", "about", "use", "within", "andor", "must", "one", "not", "make", "johnson")
clean_cb <- read.csv("cb.csv", stringsAsFactors = F)
# Protect standalone "r"/"R" with a placeholder so the single letter survives the cleaning steps;
# it is mapped back to "R" once the term-frequency table is built
clean_cb$desc <- str_replace_all(clean_cb$desc, "[[:blank:]](r|R)[[:blank:]]", " RRRRRRRRRRR ")
clean_cb <- Corpus(VectorSource(clean_cb$desc)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  #tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) #%>%
  #tm_map(stemDocument)
clean_cb <- tm_map(clean_cb, removeWords, removeColumn)
# Re-wrap the cleaned text as a corpus so TermDocumentMatrix() accepts it
clean_cb <- Corpus(VectorSource(clean_cb))
# Build the term-document matrix and collapse it into a word-frequency table
abc <- TermDocumentMatrix(clean_cb)
abc <- as.matrix(abc)
abc <- sort(rowSums(abc), decreasing = TRUE)
abc <- data.frame(word = names(abc), freq = abc)
abc$word <- str_replace(abc$word, "rrrrrrrrrrr", "R")
# Same cleaning pipeline, applied to the Dice postings
clean_dice <- read.csv("dice.csv", stringsAsFactors = F)
clean_dice$desc <- str_replace_all(clean_dice$desc, "[[:blank:]](r|R)[[:blank:]]", " RRRRRRRRRRR ")
clean_dice <- Corpus(VectorSource(clean_dice$desc)) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
#tm_map(removeWords, stopwords("english")) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) #%>%
#tm_map(stemDocument)
clean_dice <- tm_map(clean_dice, removeWords, removeColumn)
clean_dice <- Corpus(VectorSource(clean_dice))
def <- TermDocumentMatrix(clean_dice)
def <- as.matrix(def)
def <- sort(rowSums(def),decreasing=TRUE)
def <- data.frame(word = names(def),freq=def)
def$word <- str_replace(def$word, "rrrrrrrrrrr", "R")
# Same cleaning pipeline, applied to the DataJobs postings
clean_dj <- read.csv("datajobs.csv", stringsAsFactors = F)
clean_dj$desc <- str_replace_all(clean_dj$desc, "[[:blank:]](r|R)[[:blank:]]", " RRRRRRRRRRR ")
clean_dj <- Corpus(VectorSource(clean_dj$desc)) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
#tm_map(removeWords, stopwords("english")) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) #%>%
#tm_map(stemDocument)
clean_dj <- tm_map(clean_dj, removeWords, removeColumn)
clean_dj <- Corpus(VectorSource(clean_dj))
ghi <- TermDocumentMatrix(clean_dj)
ghi <- as.matrix(ghi)
ghi <- sort(rowSums(ghi),decreasing=TRUE)
ghi <- data.frame(word = names(ghi),freq=ghi)
ghi$word <- str_replace(ghi$word, "rrrrrrrrrrr", "R")
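# The three cleaning blocks above differ only in their input file. A sketch of a helper that wraps
# the shared pipeline; the function name build_term_freqs, and the assumption that every saved CSV
# stores its text in a "desc" column, are mine rather than part of the original script.
build_term_freqs <- function(path, word_blacklist = removeColumn){
  raw <- read.csv(path, stringsAsFactors = F)
  # Protect standalone "R" before lower-casing and punctuation removal
  raw$desc <- str_replace_all(raw$desc, "[[:blank:]](r|R)[[:blank:]]", " RRRRRRRRRRR ")
  corp <- Corpus(VectorSource(raw$desc)) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(removeWords, word_blacklist)
  corp <- Corpus(VectorSource(corp))
  # Collapse the term-document matrix into a word-frequency table
  freqs <- sort(rowSums(as.matrix(TermDocumentMatrix(corp))), decreasing = TRUE)
  out <- data.frame(word = names(freqs), freq = freqs)
  out$word <- str_replace(out$word, "rrrrrrrrrrr", "R")
  out
}
# Usage sketch: abc <- build_term_freqs("cb.csv"); def <- build_term_freqs("dice.csv"); ghi <- build_term_freqs("datajobs.csv")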
merged <- merge(merge(abc, def, by=c("word","freq"), all=T), ghi, by=c("word","freq"), all=T)
# Consolidate near-duplicate terms into a single canonical form
merged$word <- merged$word %>%
  str_replace_all(".*database.*", "databases") %>%
  str_replace_all(".*python.*", "python") %>%
  str_replace_all(".*visual.*", "visualization") %>%
  str_replace_all(".*hadoop.*", "hadoop") %>%
  str_replace_all(".*mining.*", "data mining") %>%
  str_replace_all(".*program.*", "programming") %>%
  str_replace_all(".*phd.*", "ph.d") %>%
  str_replace_all(".*linux.*", "linux") %>%
  str_replace_all(".*spark.*", "spark") %>%
  str_replace_all(".*collab.*", "collaborate") %>%
  str_replace_all(".*machine.*|.*learning.*", "machine learning")
merged <- merged %>%
  group_by(word) %>%
  summarise_all(funs(sum))
# Whitelist of skill terms (see the TODO at the top); defined here but not yet applied to the merged table
whitelisted <- c("R", "sql", "python", "sas", "mathematics", "programming", "communication", "strategic", "hadoop", "machine learning", "spark", "data mining", "databases", "visualization", "collaborate")
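# A minimal sketch of how the whitelist could be applied before plotting; filtering merged this way
# is an assumption about the intended use (per the TODO), not something the original script does.
skills_only <- merged %>%
  filter(tolower(word) %in% tolower(whitelisted)) %>%
  arrange(desc(freq))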
# Shape the cloud with the R logo (Rlogo.png must be present in the working directory)
wordcloud2(merged, figPath = "Rlogo.png", size = 1, color = "skyblue")
# Get the top 10 words from the merged table
top10 <- head(arrange(merged, desc(freq)), n = 10)
ggplot(data = top10, aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Words", y = "Frequency")
#Dice
dice_file <- read.csv("dice.csv", stringsAsFactors = F)
dice_states <- data.frame(unlist(str_extract_all(dice_file$loc, "[A-Z]{2}")))
colnames(dice_states) <- c('States')
head(dice_states, 10)
## States
## 1 NJ
## 2 NJ
## 3 NY
## 4 NY
## 5 NY
## 6 NJ
## 7 NY
## 8 NY
## 9 CT
## 10 MA
#CareerBuilder
cb_file <- read.csv("cb.csv", stringsAsFactors = F)
cb_states <- data.frame(unlist(str_extract_all(cb_file$loc, "[A-Z]{2}")))
colnames(cb_states) <- c('States')
head(cb_states, 10)
## States
## 1 TX
## 2 CO
## 3 DC
## 4 MA
## 5 DC
## 6 DC
## 7 PA
## 8 VA
## 9 CA
## 10 CA
#Data Jobs
dj_file <- read.csv("datajobs.csv", stringsAsFactors = F)
dj_states <- data.frame(unlist(str_extract_all(dj_file$loc, "[A-Z]{2}")))
colnames(dj_states) <- c('States')
head(dj_states, 10)
## States
## 1 FL
## 2 TN
## 3 NJ
## 4 FL
## 5 FL
## 6 WA
## 7 CA
## 8 CA
## 9 FL
## 10 NY
#Merge all State Files
states_merged <- merge(merge(dice_states, cb_states, by=c("States"), all=T), dj_states, by=c("States"), all=T)
states_merged <- states_merged %>%
  group_by(States) %>%
  summarise(Freq = n())
head(states_merged,10)
## # A tibble: 10 x 2
## States Freq
## <fctr> <int>
## 1 AZ 8
## 2 CA 2431
## 3 CO 16
## 4 CT 1
## 5 DC 4
## 6 GA 99
## 7 IA 1
## 8 IL 30
## 9 MA 28
## 10 MD 2
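# Caveat (my note, not part of the original analysis): because each state appears many times in
# every source, merge() with all = T behaves like a Cartesian join, multiplying counts for states
# listed on several sites, which likely explains the very large totals such as CA. A sketch of a
# straight row-wise count instead, which tallies each posting exactly once:
states_counts <- bind_rows(dice_states, cb_states, dj_states) %>%
  group_by(States) %>%
  summarise(Freq = n()) %>%
  arrange(desc(Freq))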
#Write to CSV
write.csv(states_merged, file = "states_merged.csv",row.names=FALSE)
# Use the states CSV to make a Plotly map
df <- read.csv("states_merged.csv")
df$hover <- with(df, paste(States, '<br>', "Posts:", Freq))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)
p <- plot_geo(df, locationmode = 'USA-states') %>%
  add_trace(
    z = ~Freq, text = ~hover, locations = ~States,
    color = ~Freq, colors = 'Greens',
    # Apply the white state-border styling defined above
    marker = list(line = l)
  ) %>%
  colorbar(title = "Data Job Postings") %>%
  layout(
    title = 'Data Jobs by State',
    geo = g
  )
p