Objective

Use ggplot2 from the tidyverse package to create a map that shows the top locations of 2018 U.S. Data Science jobs

Tidyverse functions used:

  • str_extract from stringr package for string manipulation
  • group_by, mutate, and left_join from dplyr package for data manipulation
  • ggplot, geom_polygon, and scale_fill_gradient from ggplot2 to create maps

Dataset: Data Scientist Job Market in the U.S.

I found this dataset on Kaggle and decided to expand on the Project 3 topic by visualizing the locations of the top data science jobs.

Data Context (provided by kaggle dataset author):

“For those who are actively looking for data scientist jobs in the U.S., the best news this month is the LinkedIn Workforce Report August 2018. According to the report, there is a shortage of 151,717 people with data science skills, with particularly acute shortages in New York City, San Francisco Bay Area and Los Angeles.

To help job hunters (including me) to better understand the job market, I scraped Indeed website and collected information of 7,000 data scientist jobs around the U.S. on August 3rd. The information that I collected are: Company Name, Position Name, Location, Job Description, and Number of Reviews of the Company.”

Load Libraries

library(tidyverse)
library(ggmap)

Import Data

#Read the job postings from the csv file downloaded from kaggle
#URL: https://www.kaggle.com/sl6149/data-scientist-job-market-in-the-us/version/4
data <- read.csv(file = "alldata.csv")

#Convert the state outlines of the 'maps' package into a dataframe that 'ggplot2' can draw
states <- map_data(map = "state")

Clean Data

#Reduce every 'location' value to its state abbreviation in a new vector:
#  1. delete every run of digits (removes the zipcode)
#  2. keep the trailing comma-free part of the string (removes the city), using 'stringr'
#  3. delete all remaining whitespace
state <- data$location %>%
  gsub(pattern = "[[:digit:]]+", replacement = "") %>%
  str_extract(pattern = "\\b[^,]+$") %>%
  gsub(pattern = "[[:space:]]", replacement = "")

#Bind vector to dataframe
df <- cbind(data, state)

#Use 'dplyr' package to attach to every posting the number of postings in its state;
#ungroup afterwards so the grouping does not leak into the later steps
df2 <- df %>%
  group_by(state) %>%
  mutate(count = n()) %>%
  ungroup()

#Keep one row per state with its posting count, and drop the postings whose
#state could not be extracted (NA).
#distinct() replaces unique(finalDf, by = "state"): 'by' is not an argument of
#unique() for data frames, so it was silently ignored
finalDf <- df2 %>%
  distinct(state, count) %>%
  filter(!is.na(state))

#Replace state abbreviation with the lowercase full name, so the values can be
#matched against the 'region' column of the map data.
#A lookup table with exact matching is used instead of a chain of gsub() calls,
#which replaced the abbreviation wherever it occurred inside a value.
#DC is not a state so associate it with Maryland for simplicity
state_names <- c(
  GA = "georgia",
  TX = "texas",
  CO = "colorado",
  MA = "massachusetts",
  IL = "illinois",
  DC = "maryland",
  CA = "california",
  NY = "new york",
  NJ = "new jersey",
  WA = "washington"
)

#as.character() guards against 'state' being a factor, which would otherwise
#index the lookup table by integer code instead of by abbreviation
state_codes <- as.character(finalDf$state)
is_known <- state_codes %in% names(state_names)
#Values without an entry in the lookup table are left unchanged
state_codes[is_known] <- unname(state_names[state_codes[is_known]])
finalDf$state <- state_codes

#Rename 'state' to 'region', the column used as join key below
names(finalDf)[names(finalDf) == "state"] <- "region"

#Use left_join to attach job counts to every point on polygons (borders) of the states
coord <- states %>%
  left_join(finalDf, by = "region")

Visualize Data

#Base U.S. map: every state filled grey with a black border, drawn with a fixed aspect ratio
states_base <- ggplot(states, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "grey", color = "black") +
  coord_fixed(ratio = 1.3)

print(states_base)

#Shade states by job posting counts:
#first the filled polygons from the joined data, then the black state borders on top
plot <- states_base +
  geom_polygon(mapping = aes(fill = count), data = coord, color = "grey") +
  geom_polygon(fill = NA, color = "black") +
  theme_bw()

print(plot)

#Change color scale: reversed 7-colour rainbow on a log10-transformed fill
fill_scale <- scale_fill_gradientn(
  colours = rev(rainbow(7)),
  breaks = c(2, 4, 10, 100, 1000, 10000),
  trans = "log10"
)

plot + fill_scale

#From the plot, we see that overall there are more data scientist job postings for locations on the East and West Coasts (with the exception of New Jersey) than in the South