Template

Introduction

Looking at my journey to being hired by LinkedIn, using some less traditional visualisation methods, though hardly breaking the mold here!

library(tidyverse)

# Normalise labels to snake_case-like form: lower-case the input, then
# replace every space with an underscore.
clean_names <- function(x) {
  lowered <- tolower(x)
  str_replace_all(lowered, pattern = " ", replacement = "_")
}

# Count the rows of `data` per value of the named column(s) and render the
# result as a styled HTML table, most frequent value first.
#
# data:     a data frame.
# variable: character vector with the name(s) of the grouping column(s).
# cap:      caption passed on to knitr::kable().
#
# Returns the kable object produced by kableExtra::kable_styling().
kable_tally <- function(data, variable, cap = "") {
  data %>%
    # group_by_() is deprecated; turn the column name(s) into symbols and
    # splice them into the regular group_by() instead.
    group_by(!!!rlang::syms(variable)) %>%
    tally() %>%
    arrange(desc(n)) %>%
    knitr::kable(caption = cap, format = "html") %>%
    kableExtra::kable_styling(full_width = FALSE)
}

# Total number of times `word` occurs as a substring anywhere in `corpus`.
#
# word:   the keyword to look for.
# corpus: character vector to search.
#
# The keyword is matched literally via stringr::fixed(); without it the
# keyword would be interpreted as a regular expression, so entries that
# contain characters such as ".", "+" or "(" would be miscounted.
# NA counts are ignored in the sum.
count_words <- function(word, corpus) {
  hits <- str_count(corpus, stringr::fixed(word))
  sum(hits, na.rm = TRUE)
}

# Number of elements of `corpus` that are exactly equal to (one of) `word`.
count_exact_words <- function(word, corpus) {
  is_exact_match <- match(corpus, word, nomatch = 0L) > 0L
  sum(is_exact_match, na.rm = TRUE)
}

The data was generated manually in a Google Sheet as I was tracking all the applications I made over the course of several months.

# In a non-interactive session, move the working directory up one level.
# NOTE(review): presumably so the relative "data/..." path used below resolves
# from the project root when the document is knitted -- confirm.
if (!interactive()) setwd("..")

# Load the application tracker and drop the contact-detail columns.
interviews_raw <-
  "data/job_search_list.xlsx" %>%
  readxl::read_excel() %>%
  select(-c(Email, Contact, Phone))

# Lower-case the column names and replace spaces with underscores.
names(interviews_raw) <- clean_names(names(interviews_raw))

Cleaning

# Summarise the raw data, treating the categorical columns as factors so
# that summary() reports a count per level.
interviews_raw %>%
  mutate_at(vars(status, stage_reached, contact_gender), as.factor) %>%
  summary()

##    company           job_title                  status       stage_reached
##  Length:22          Length:22          ACCEPTED    : 1   1st Round  : 2   
##  Class :character   Class :character   INCOMPATIBLE: 3   2nd Round  : 2   
##  Mode  :character   Mode  :character   NOREPLY     : 8   Final Round: 1   
##                                        REJECTED    :10   Late       : 1   
##                                                          None       :16   
##                                                                           
##                                                                           
##   applied_date                 contact_gender     salary     
##  Min.   :2018-03-02 00:00:00   F      : 2     Min.   :28.00  
##  1st Qu.:2018-03-27 06:00:00   M      :13     1st Qu.:29.50  
##  Median :2018-04-27 00:00:00   UNKNOWN: 7     Median :35.00  
##  Mean   :2018-04-17 04:21:49                  Mean   :33.25  
##  3rd Qu.:2018-05-07 00:00:00                  3rd Qu.:35.62  
##  Max.   :2018-05-29 00:00:00                  Max.   :37.50  
##                                               NA's   :14     
##  key_phrases            link           is_recruiter?  
##  Length:22          Length:22          Mode :logical  
##  Class :character   Class :character   FALSE:10       
##  Mode  :character   Mode  :character   TRUE :12       
##                                                       
##                                                       
##                                                       
##

# Column-wise overview of the raw data (types and first few values).
interviews_raw %>% glimpse()

## Observations: 22
## Variables: 10
## $ company         <chr> "AON", "(Archer)", "First Data", "(Wallace Mye...
## $ job_title       <chr> "Data analyst", "DATA ANALYST - ANALYTICS CENT...
## $ status          <chr> "REJECTED", "NOREPLY", "NOREPLY", "REJECTED", ...
## $ stage_reached   <chr> "1st Round", "None", "None", "2nd Round", "1st...
## $ applied_date    <dttm> 2018-03-02, 2018-03-02, 2018-03-02, 2018-03-0...
## $ contact_gender  <chr> "UNKNOWN", "F", "UNKNOWN", "M", "UNKNOWN", "M"...
## $ salary          <dbl> NA, 37.5, NA, NA, NA, NA, NA, 37.5, NA, NA, NA...
## $ key_phrases     <chr> "global, BI Technologies, Python, data mining,...
## $ link            <chr> "https://www.irishjobs.ie/Jobs/Data-Analyst-81...
## $ `is_recruiter?` <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, ...

A few things come up from the initial summary:

Inconsistent formatting of factor levels.
key_phrases is really a character vector and should be nested.
salary should use NA for unknown values (any "?" placeholders need converting).
contact_gender should use NA for missing values.

# Apply the cleaning steps listed above to the raw data.
interviews_clean <-
  interviews_raw %>%
  mutate(
    # Stage labels are entered in mixed case ("1st Round", "None");
    # upper-case them so they are consistent.
    stage_reached = toupper(stage_reached),
    # Turn "?" placeholders into NA. If the column was already read as
    # numeric, no value equals "?" and the column is left unchanged.
    salary = ifelse(salary == "?", NA, salary),
    # Missing genders are recorded as "UNKNOWN"; use NA instead.
    contact_gender = na_if(contact_gender, "UNKNOWN"),
    # Split the comma-separated phrases into a list column holding one
    # trimmed, lower-cased, underscore-separated character vector per row.
    key_phrases = key_phrases %>%
      str_split(",") %>%
      map(str_trim) %>%
      map(clean_names)
  )

Exploration

Titles were mostly data analytics roles.

# Number of applications per job title, most common first.
kable_tally(interviews_clean, "job_title", cap = "Job Title Tally")

Job Title Tally
job_title	n
Data Analyst	8
Junior Data Analyst	2
Data analyst	1
DATA ANALYST - ANALYTICS CENTRE (JNR-MID)	1
Data Analyst - Dublin North.	1
Data BI Developer	1
Data Engineer	1
Data Scientist	1
Graduate Software Developer	1
JUNIOR DATA ANALYST - PHARMA SUPPLY CHAIN	1
QA Graduate Programme	1
Robotic Process Automation (RPA) Analyst (036660)	1
SQL Data Analyst?	1
SQL Developer (Capita)	1

Mostly even split between explicit rejection and applications where I received no reply. “INCOMPATIBLE” means I didn’t meet the requirements for the position.

# Tally applications by status and highlight the accepted one.
# NOTE: the row index is positional -- row 4 is the "ACCEPTED" row in the
# current tally (it has the lowest count) and must be updated if the
# ordering of the counts changes.
interviews_clean %>%
  kable_tally("status", "Application Status Tally") %>%
  kableExtra::row_spec(4, bold = TRUE, color = "white", background = "green")

Application Status Tally
status	n
REJECTED	10
NOREPLY	8
INCOMPATIBLE	3
ACCEPTED	1

# Applications per calendar month. The month is a plain character label, so
# the rows come out in alphabetical rather than chronological order.
interviews_clean %>%
  mutate(month = format(applied_date, "%B")) %>%
  count(month) %>%
  knitr::kable() %>%
  kableExtra::kable_styling(full_width = FALSE)

month	n
April	7
March	7
May	8

Analysis

Sankey (or alluvial) plot of how far I got in each round and the ultimate result of the application.

# Ordering of the interview stages on the first axis of the plot.
stage_levels <- c("FINAL ROUND", "2ND ROUND", "1ST ROUND", "LATE", "NONE")
# NOTE(review): status_levels is not referenced by the code visible here.
status_levels <- c(
  "ACCEPTED", "REJECTED", "INCOMPATIBLE", "NOREPLY", "CONTACTED", "APPLIED"
)

# One row per (status, stage) combination with its count `n`, drawn as an
# alluvial diagram flowing from the stage reached to the final status.
interviews_clean %>%
  count(status, stage_reached) %>%
  mutate(stage_reached = factor(stage_reached, levels = stage_levels)) %>%
  ggplot(
    aes(weight = n, axis1 = stage_reached, axis2 = status, fill = status)
  ) +
  ggalluvial::geom_alluvium(width = 1 / 12) +
  ggalluvial::geom_stratum(width = 1 / 12, fill = "white", color = "grey") +
  # Stage labels are placed by hand; the y positions are hard-coded.
  annotate(
    "text",
    x = 1,
    y = c(21.5, 20, 18, 16.5, 8),
    label = stage_levels,
    size = 3
  ) +
  scale_x_continuous(
    breaks = c(1, 2),
    labels = c("Round Reached", "Status")
  ) +
  theme_void() +
  ggtitle("Round Reached vs Status of Application") +
  scale_fill_brewer(palette = "Dark2")

## Missing alluvia for some stratum combinations.
## Missing alluvia for some stratum combinations.

A simple timeline of when I applied for each position.

# Date of the application to "LinkedIn (CPL)".
# NOTE(review): this value is not used by the plot below.
successful_application_date <- with(
  interviews_clean,
  applied_date[company == "LinkedIn (CPL)"]
)

# One point per application along a horizontal date axis, coloured by status.
ggplot(interviews_clean, aes(x = applied_date, y = 0, colour = status)) +
  geom_abline(intercept = 0, slope = 0) +
  geom_point(size = 2) +
  scale_colour_brewer(palette = "Dark2") +
  theme_minimal() +
  # Hide everything on the y axis: every point sits at y = 0.
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    panel.grid = element_blank(),
    axis.text.x = element_text()
  ) 

When looking at repetitions of keywords the answers aren’t that surprising. Entry level positions are focussed on data sourcing, cleaning and visualisation so there are tools repeatedly mentioned that are good for these tasks. Ability to communicate effectively is also a very highly prized skill, we need to be able to tell people what insights we’ve found!

# Flatten the per-application phrase vectors into one character vector.
key_words <- interviews_clean$key_phrases %>% unlist()

# Substring counts: how often each distinct phrase occurs inside any phrase.
word_count <-
  key_words %>%
  unique() %>%
  {
    data.frame(word = ., n = map_int(., count_words, corpus = key_words))
  } %>%
  arrange(desc(n))

# Exact counts: how often each distinct phrase occurs as a whole phrase.
word_count_exact <-
  key_words %>%
  unique() %>%
  {
    data.frame(word = ., n = map_int(., count_exact_words, corpus = key_words))
  } %>%
  arrange(desc(n))

# top_n() keeps ties, so more than 7 rows can be shown.
word_count_exact %>%
  top_n(7) %>%
  knitr::kable(caption = "Top 7 Key Words") %>%
  kableExtra::kable_styling(full_width = FALSE)

## Selecting by n

Top 7 Key Words
word	n
sql	15
excel	10
tableau	7
python	6
communication	5
reports	3
bi	3
r	3
qlikview	3

If we go further down the list we find things like R, BI (business intelligence), SAS, SAP, Scala, etc. So apparently if you want a job in data analytics in Ireland in 2018, you should probably be learning SQL, Excel, Tableau and Python. The other tools and languages are probably more of a “nice to have” than anything.

Conclusions

A lot of recruiters will tell you if you’ve been rejected; at the same time, a lot won’t.
Finding your first job in a field can take a long time.
You should probably learn SQL, Excel, Tableau and Python if you want a data analytics job.