| Last modified: 10/21/2017 21:14 |
\(\quad\) W. Edwards Deming said, "In God we trust, all others must bring data." Please use data to answer the question, "Which are the most valued data science skills?" Consider your work as an exploration; there is not necessarily a "right answer."
\(\quad\) Lawyers need a strong familiarity with the laws of the country in which they practice. Doctors need a solid understanding of human anatomy. But as an emerging field within business analytics, data science has no such clear-cut requirements. What skills define a data scientist?
\(\quad\) In this project, we set out to answer that question. By collecting data from job postings, as well as resumes, we hope to pinpoint the qualifications needed to become a successful data scientist.
\(\quad\) We began by searching several job-listing sites for "data scientist" postings. We then crawled each results page, extracted every link, discarded the irrelevant ones, and stored the remainder in a .csv file. With the help of the rvest package, we looped through that file, converting each line to a URL, parsing the HTML, extracting the relevant portions, and storing the results in a second column. Because we decided only afterwards to include location in the analysis, we repeated the process, this time adding a new column for each job's location.
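\(\quad\) The link-collection step itself is not shown in the code that follows, so a minimal sketch of how it could look with rvest is given here; the search URL, the ".job-title a" selector, and the "/job/" filter are placeholders rather than the values actually used for each site.
# Sketch of the link-collection step (placeholder URL and selectors, not the ones actually used)
search_url <- "https://www.example-jobboard.com/jobs?q=data+scientist"
links <- read_html(search_url) %>%
  html_nodes(".job-title a") %>%   # anchors wrapping each posting title (assumed selector)
  html_attr("href") %>%
  unique()
# Keep only links that point at individual postings, then save them for the scraping loops below
links <- links[grepl("/job/", links)]
write.csv(data.frame(Links = links), "careerbuilder.csv", row.names = FALSE)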
| TODO: Reverse the word filter: whitelist specific skill terms rather than blacklist 90% of the words. |
# Load libraries -----------------
library(tidyverse)
library(ggplot2)
library(rvest)
library(tm)
library(SnowballC)
library(wordcloud2)
library(stringr)
library(plotly)
#
# Read the two csv files: the first holds the scraped URLs, the second holds the links plus a column for each job's description
#
cb_raw <- read.csv("careerbuilder.csv", stringsAsFactors = F)
cb <- read.csv("cb.csv", stringsAsFactors = F)
# Fill in the description for any row that is still empty
for (i in 1:nrow(cb_raw)){
  if (is.na(cb$desc[i]) || cb$desc[i] == ""){
    url <- read_html(cb_raw$Links[i])
    cb$desc[i] <- url %>%
      html_node(".description") %>%
      html_text()
  }
  # Pause between requests to be polite to the server
  Sys.sleep(5)
}
#
# Repeat the process, this time extracting only the location for each job (missed on the first run).
# Since this pass was run on a different day, some of the links were dead. To keep the loop from
# breaking whenever a link returned a 404 error, each request is wrapped in tryCatch.
#
cb_raw <- read.csv("careerbuilder.csv", stringsAsFactors = F)
cb <- read.csv("cb.csv", stringsAsFactors = F)
# Initialise the column if it doesn't exist yet, then fill in only the missing rows
if (is.null(cb$loc)) cb$loc <- NA_character_
for (i in 1:nrow(cb_raw)){
  if (is.na(cb$loc[i])){
    tryCatch({
      url <- read_html(cb_raw$Links[i])
      # The #job-company-name block holds both the company and the location;
      # the regexes below strip it down to just the location
      cb$loc[i] <- url %>%
        html_node("#job-company-name") %>%
        html_text()
    },
    error = function(e) NA)
  }
  Sys.sleep(5)
}
# Regex patterns to keep only the location info
# Bulletpoint unicode is U+2022 ----------------
# 1/2 unicode is U+00BD
patt1 <- "\\n.*\\n.*\\n"
patt2 <- ".*\\,\\s\\w+"
patt3 <- "[[:punct:]]\\w+.*"
# Rows skipped in these index vectors were formatted differently and are handled manually (or dropped)
loop_seq <- c(1:49, 50:68, 71:92, 94:129, 131:nrow(cb))
loop_seq2 <- c(1:48, 50:68, 71:92, 94:114, 116:129, 131:nrow(cb))
# Split the string by newline character, making it easier to extract what we need
for (i in loop_seq){
  cb$loc[i] <- str_split(cb$loc[i], patt1) %>%
    str_extract_all(patt2)
}
# Extract the actual location - the only relevant info
for (i in loop_seq2){
  cb$loc[i] <- str_extract_all(cb$loc[i], patt3)
}
# Row 69 was formatted differently from the others, so it is handled manually
cb$loc[69] <- str_split(cb$loc[69], patt1) %>%
  str_extract_all("\\w+\\s\\w+\\,\\w+")
# Remove stray quotation marks and trim the whitespace
cb$loc <- gsub('"', " ", cb$loc) %>%
  str_trim()
dice <- read.csv("https://raw.githubusercontent.com/JoshuaSturm/data607-project3/master/dice.csv?token=AbaIfRRPehMvvHafdfhuqfZMoC1xHzuAks5Z8oXrwA%3D%3D", stringsAsFactors = F)
# Scrape the description, location, and job title for each Dice posting
for (i in 1:nrow(dice)){
  url <- read_html(dice$Links[i])
  dice$Desc[i] <- url %>%
    html_node("#jobdescSec") %>%
    html_text()
  dice$loc[i] <- url %>%
    html_node(".location span") %>%
    html_text()
  dice$title[i] <- url %>%
    html_node("#jt") %>%
    html_text()
  Sys.sleep(5)
}
datajobs <- read.csv("https://raw.githubusercontent.com/JoshuaSturm/data607-project3/master/datajobs.csv?token=AbaIfRP6t79f4joLWamQBQKqshxumELIks5Z8oYawA%3D%3D", stringsAsFactors = F)
# Scrape the description, location, and job title for each DataJobs posting
for (i in 1:nrow(datajobs)){
  url <- read_html(datajobs$Links[i])
  datajobs$Desc[i] <- url %>%
    html_node("#job_description") %>%
    html_text()
  datajobs$loc[i] <- url %>%
    html_node("div:nth-child(4) :nth-child(1) .jobpost-table-cell-2") %>%
    html_text()
  datajobs$title[i] <- url %>%
    html_node("h1 strong") %>%
    html_text()
  Sys.sleep(5)
}
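# The three scraping loops above follow the same pattern and differ only in their CSS selectors.
# A sketch of a reusable helper (the name scrape_field is mine, not part of the original script),
# combining that pattern with the tryCatch guard used for the CareerBuilder pass:
scrape_field <- function(link, selector){
  tryCatch({
    read_html(link) %>%
      html_node(selector) %>%
      html_text()
  },
  error = function(e) NA)
}
# Usage with the same selectors as above, e.g. scrape_field(dice$Links[1], "#jobdescSec")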
# Hand-picked blacklist of filler words to strip from the corpora (see the whitelist TODO at the top)
removeColumn <- c("and", "the", "with", "for", "our", "will", "are", "that", "you", "this", "from", "new", "their", "have", "other", "years", "your", "all", "using", "more", "such", "about", "use", "within", "andor", "must", "one", "not", "make", "johnson")
clean_cb <- read.csv("cb.csv", stringsAsFactors = F)
# Protect standalone "r"/"R" with a placeholder so the single letter survives the cleaning steps;
# it is mapped back to "R" once the term-frequency table is built
clean_cb$desc <- str_replace_all(clean_cb$desc, "[[:blank:]](r|R)[[:blank:]]", " RRRRRRRRRRR ")
clean_cb <- Corpus(VectorSource(clean_cb$desc)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  #tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) #%>%
  #tm_map(stemDocument)
clean_cb <- tm_map(clean_cb, removeWords, removeColumn)
# Re-wrap the cleaned text as a corpus so TermDocumentMatrix() accepts it
clean_cb <- Corpus(VectorSource(clean_cb))
# Build the term-document matrix and collapse it into a word-frequency table
abc <- TermDocumentMatrix(clean_cb)
abc <- as.matrix(abc)
abc <- sort(rowSums(abc), decreasing = TRUE)
abc <- data.frame(word = names(abc), freq = abc)
abc$word <- str_replace(abc$word, "rrrrrrrrrrr", "R")
# Same cleaning pipeline, applied to the Dice postings
clean_dice <- read.csv("dice.csv", stringsAsFactors = F)
clean_dice$desc <- str_replace_all(clean_dice$desc, "[[:blank:]](r|R)[[:blank:]]", " RRRRRRRRRRR ")
clean_dice <- Corpus(VectorSource(clean_dice$desc)) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
#tm_map(removeWords, stopwords("english")) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) #%>%
#tm_map(stemDocument)
clean_dice <- tm_map(clean_dice, removeWords, removeColumn)
clean_dice <- Corpus(VectorSource(clean_dice))
def <- TermDocumentMatrix(clean_dice)
def <- as.matrix(def)
def <- sort(rowSums(def),decreasing=TRUE)
def <- data.frame(word = names(def),freq=def)
def$word <- str_replace(def$word, "rrrrrrrrrrr", "R")
# Same cleaning pipeline, applied to the DataJobs postings
clean_dj <- read.csv("datajobs.csv", stringsAsFactors = F)
clean_dj$desc <- str_replace_all(clean_dj$desc, "[[:blank:]](r|R)[[:blank:]]", " RRRRRRRRRRR ")
clean_dj <- Corpus(VectorSource(clean_dj$desc)) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
#tm_map(removeWords, stopwords("english")) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) #%>%
#tm_map(stemDocument)
clean_dj <- tm_map(clean_dj, removeWords, removeColumn)
clean_dj <- Corpus(VectorSource(clean_dj))
ghi <- TermDocumentMatrix(clean_dj)
ghi <- as.matrix(ghi)
ghi <- sort(rowSums(ghi),decreasing=TRUE)
ghi <- data.frame(word = names(ghi),freq=ghi)
ghi$word <- str_replace(ghi$word, "rrrrrrrrrrr", "R")
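# The three cleaning blocks above differ only in their input file. A sketch of a helper that wraps
# the shared pipeline; the function name build_term_freqs, and the assumption that every saved CSV
# stores its text in a "desc" column, are mine rather than part of the original script.
build_term_freqs <- function(path, word_blacklist = removeColumn){
  raw <- read.csv(path, stringsAsFactors = F)
  # Protect standalone "R" before lower-casing and punctuation removal
  raw$desc <- str_replace_all(raw$desc, "[[:blank:]](r|R)[[:blank:]]", " RRRRRRRRRRR ")
  corp <- Corpus(VectorSource(raw$desc)) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(removeWords, word_blacklist)
  corp <- Corpus(VectorSource(corp))
  # Collapse the term-document matrix into a word-frequency table
  freqs <- sort(rowSums(as.matrix(TermDocumentMatrix(corp))), decreasing = TRUE)
  out <- data.frame(word = names(freqs), freq = freqs)
  out$word <- str_replace(out$word, "rrrrrrrrrrr", "R")
  out
}
# Usage sketch: abc <- build_term_freqs("cb.csv"); def <- build_term_freqs("dice.csv"); ghi <- build_term_freqs("datajobs.csv")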
merged <- merge(merge(abc, def, by=c("word","freq"), all=T), ghi, by=c("word","freq"), all=T)
# Consolidate near-duplicate terms into a single canonical form
merged$word <- merged$word %>%
  str_replace_all(".*database.*", "databases") %>%
  str_replace_all(".*python.*", "python") %>%
  str_replace_all(".*visual.*", "visualization") %>%
  str_replace_all(".*hadoop.*", "hadoop") %>%
  str_replace_all(".*mining.*", "data mining") %>%
  str_replace_all(".*program.*", "programming") %>%
  str_replace_all(".*phd.*", "ph.d") %>%
  str_replace_all(".*linux.*", "linux") %>%
  str_replace_all(".*spark.*", "spark") %>%
  str_replace_all(".*collab.*", "collaborate") %>%
  str_replace_all(".*machine.*|.*learning.*", "machine learning")
merged <- merged %>%
  group_by(word) %>%
  summarise_all(funs(sum))
# Whitelist of skill terms (see the TODO at the top); defined here but not yet applied to the merged table
whitelisted <- c("R", "sql", "python", "sas", "mathematics", "programming", "communication", "strategic", "hadoop", "machine learning", "spark", "data mining", "databases", "visualization", "collaborate")
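# A minimal sketch of how the whitelist could be applied before plotting; filtering merged this way
# is an assumption about the intended use (per the TODO), not something the original script does.
skills_only <- merged %>%
  filter(tolower(word) %in% tolower(whitelisted)) %>%
  arrange(desc(freq))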
# Shape the cloud with the R logo (Rlogo.png must be present in the working directory)
wordcloud2(merged, figPath = "Rlogo.png", size = 1, color = "skyblue")
# Get the top 10 words from the merged table
top10 <- head(arrange(merged, desc(freq)), n = 10)
ggplot(data = top10, aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Words", y = "Frequency")
#Dice
dice_file <- read.csv("dice.csv", stringsAsFactors = F)
dice_states <- data.frame(unlist(str_extract_all(dice_file$loc, "[A-Z]{2}")))
colnames(dice_states) <- c('States')
head(dice_states, 10)
## States
## 1 NJ
## 2 NJ
## 3 NY
## 4 NY
## 5 NY
## 6 NJ
## 7 NY
## 8 NY
## 9 CT
## 10 MA
#CareerBuilder
cb_file <- read.csv("cb.csv", stringsAsFactors = F)
cb_states <- data.frame(unlist(str_extract_all(cb_file$loc, "[A-Z]{2}")))
colnames(cb_states) <- c('States')
head(cb_states, 10)
## States
## 1 TX
## 2 CO
## 3 DC
## 4 MA
## 5 DC
## 6 DC
## 7 PA
## 8 VA
## 9 CA
## 10 CA
#Data Jobs
dj_file <- read.csv("datajobs.csv", stringsAsFactors = F)
dj_states <- data.frame(unlist(str_extract_all(dj_file$loc, "[A-Z]{2}")))
colnames(dj_states) <- c('States')
head(dj_states, 10)
## States
## 1 FL
## 2 TN
## 3 NJ
## 4 FL
## 5 FL
## 6 WA
## 7 CA
## 8 CA
## 9 FL
## 10 NY
#Merge all State Files
states_merged <- merge(merge(dice_states, cb_states, by=c("States"), all=T), dj_states, by=c("States"), all=T)
states_merged <- states_merged %>%
  group_by(States) %>%
  summarise(Freq = n())
head(states_merged,10)
## # A tibble: 10 x 2
## States Freq
## <fctr> <int>
## 1 AZ 8
## 2 CA 2431
## 3 CO 16
## 4 CT 1
## 5 DC 4
## 6 GA 99
## 7 IA 1
## 8 IL 30
## 9 MA 28
## 10 MD 2
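# Caveat (my note, not part of the original analysis): because each state appears many times in
# every source, merge() with all = T behaves like a Cartesian join, multiplying counts for states
# listed on several sites, which likely explains the very large totals such as CA. A sketch of a
# straight row-wise count instead, which tallies each posting exactly once:
states_counts <- bind_rows(dice_states, cb_states, dj_states) %>%
  group_by(States) %>%
  summarise(Freq = n()) %>%
  arrange(desc(Freq))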
#Write to CSV
write.csv(states_merged, file = "states_merged.csv",row.names=FALSE)
# Use the states CSV to make a Plotly map
df <- read.csv("states_merged.csv")
df$hover <- with(df, paste(States, '<br>', "Posts:", Freq))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)
p <- plot_geo(df, locationmode = 'USA-states') %>%
  add_trace(
    z = ~Freq, text = ~hover, locations = ~States,
    color = ~Freq, colors = 'Greens',
    # Apply the white state-border styling defined above
    marker = list(line = l)
  ) %>%
  colorbar(title = "Data Job Postings") %>%
  layout(
    title = 'Data Jobs by State',
    geo = g
  )
p