Overview

As part of our regular labour market surveillance, the ABC collates job advertisements from various sources, including the Jobs Bank, online job portals, and print media such as The Straits Times and The New Paper.

Your task is to:

  1. Clean the data;
  2. Use analytics techniques, such as text analytics and machine learning, to classify these job advertisements into sectors and occupational groups;
  3. Make meaningful inferences on the state of the labour market.

The Singapore Standard Industrial Classification (SSIC) and Singapore Standard Occupational Classification (SSOC) can be found on the website of the Department of Statistics.

Topic modelling is a method for the unsupervised classification of documents, such as blog posts or news articles: it divides them into natural groups so that each group can be understood separately.

Here, we use Latent Dirichlet Allocation (LDA), a common method for fitting a topic model. LDA treats each document as a mixture of topics and each topic as a mixture of words, which allows documents to overlap in content in a way that mirrors the typical use of natural language.
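As a purely hypothetical illustration of this mixture view (the numbers below are made up, not estimated from our data), consider two topics defined as probability distributions over a handful of stemmed words, and a document that draws 70% of its words from one topic and 30% from the other:

# Hypothetical topics: each is a probability distribution over (stemmed) words
topic_engineering <- c(engin = 0.40, project = 0.35, design = 0.25)
topic_finance     <- c(bank = 0.50, financ = 0.30, audit = 0.20)
# A document that is 70% "engineering" and 30% "finance" implies these
# expected word probabilities under the mixture
doc_word_probs <- c(0.7 * topic_engineering, 0.3 * topic_finance)
round(doc_word_probs, 3)

LDA works in the opposite direction: given only the documents, it estimates both the per-topic word distributions and the per-document topic mixtures.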

Loading Libraries

# Loading Libraries
library(readxl)
library(tm)
library(dplyr)
library(topicmodels)
library(tidytext)
library(ggplot2)
library(tidyr)

Loading Data

# Loading Data
SJT_1214 <- read_excel("./SGP_JobText_Dec-2014_Report.xlsx")   # December 2014 extract
SJT_0615 <- read_excel("./SGP_JobText_Jun-2015_Report.xlsx")   # June 2015 extract
SJT_1215 <- read_excel("./SGP_JobText_Dec-2015_Report.xlsx")   # December 2015 extract
# Combine the three extracts and build a corpus; each element of the combined
# object becomes one document (one per reporting period, as reflected in the
# three-document DTM below)
Data_Combined <- c(SJT_1214, SJT_0615, SJT_1215)
Data_Combined_Corpus <- VCorpus(VectorSource(Data_Combined))

Cleaning Data

# Cleaning Data
clean_data <- function(clean_corpus){
  # Drop non-ASCII characters, then rebuild plain-text documents
  clean_corpus <- tm_map(clean_corpus, function(x) iconv(x, "latin1", "ASCII", sub = " "))
  clean_corpus <- tm_map(clean_corpus, PlainTextDocument)
  # Replace any matched pattern (here, backslashes) with a space
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  clean_corpus <- tm_map(clean_corpus, toSpace, "\\\\")
  # Standard normalisation: lower-case, drop stopwords, numbers, punctuation and extra whitespace
  clean_corpus <- tm_map(clean_corpus, content_transformer(tolower))
  clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("english"))
  clean_corpus <- tm_map(clean_corpus, removeNumbers)
  clean_corpus <- tm_map(clean_corpus, removePunctuation)
  clean_corpus <- tm_map(clean_corpus, stripWhitespace)
  # Stem words, then remove stemmed boilerplate terms that recur across
  # advertisements but carry little meaning for classification
  clean_corpus <- tm_map(clean_corpus, stemDocument, language = "english")
  clean_corpus <- tm_map(clean_corpus, removeWords,
                         c("account", "achiev", "across", "address", "advertis",
                           "also", "appli", "applic", "asia", "averag", "back",
                           "base", "benefit", "best", "busi", "call", "candid",
                           "career", "central", "close", "compani", "contact",
                           "contract", "copi", "copyright", "corpor", "current",
                           "custom", "date", "decemb", "degre", "descript",
                           "detail", "email", "employ", "employe", "english",
                           "ensur", "excel", "execut", "expect", "experi", "find",
                           "firm", "follow", "full", "function", "good", "group",
                           "high", "hour", "includ", "initi", "interest",
                           "jobstreetcom", "join", "knowledg", "languag", "larger",
                           "lead", "least", "level", "licens", "locanto", "locat",
                           "look", "make", "market", "meet", "messag", "month",
                           "need", "number", "offer", "opportun", "overview",
                           "page", "part", "partner", "peopl", "perman", "person",
                           "plan", "pleas", "posit", "post", "privaci", "process",
                           "profession", "provid", "qualiti", "recruit", "region",
                           "regist", "registr", "relat", "relev", "report",
                           "requir", "respons", "resum", "role", "salari", "save",
                           "search", "send", "senior", "servic", "shortlist",
                           "singapor", "site", "size", "skill", "snapshot",
                           "solut", "staf", "staff", "strong", "success",
                           "support", "system", "take", "talent", "team", "term",
                           "time", "train", "updat", "view", "week", "well",
                           "will", "within", "work", "year"))
  clean_corpus}

Data_Combined_Corpus_Clean <- clean_data(Data_Combined_Corpus)
Data_Combined_Corpus_Clean <- VCorpus(VectorSource(Data_Combined_Corpus_Clean))

DTM <- DocumentTermMatrix(Data_Combined_Corpus_Clean,
                          control = list(wordLengths = c(4, 10)))
inspect(DTM)
## <<DocumentTermMatrix (documents: 3, terms: 125980)>>
## Non-/sparse entries: 180953/196987
## Sparsity           : 52%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs client consult develop engin industri  manag offic product resourc  sale
##    1  26812   17736   37131 28172    23941  94939 29905   33518   13749 42176
##    2  47865   44048   62190 52567    75988 136638 47013   46309   42237 69281
##    3  93581  123538   91983 80985   160406 235476 82292   70521   92483 85560

Finding Frequently Occurring Terms

# Finding Frequently Occurring Terms
FT <- findFreqTerms(DTM, lowfreq = 50000, highfreq = Inf)
FT.matrix <- as.matrix(DTM[, FT])
print(FT.matrix)
##     Terms
## Docs admin administr assist  bank build client  code communic consult  data
##    1 11688     10628  23580 25225 10552  26812  6735    24003   17736 13020
##    2 16175     18134  46655 28822 19806  47865 28499    39769   44048 25967
##    3 27647     29316  63768 46226 32903  93581 22720    53676  123538 58116
##     Terms
## Docs design develop  educ engin environ financ financi global human industri
##    1  14060   37131 14710 28172   11558  20509   17789  14301  7622    23941
##    2  22943   62190 13956 52567   23250  26108   25948  22286 34765    75988
##    3  32862   91983 30105 80985   37841  33913   39999  36054 88350   160406
##     Terms
## Docs inform intern local  manag medic network offic  oper perform photo polici
##    1  17777  17286  9011  94939  5744    8271 29905 25457   18618  2791  16187
##    2  46191  28256 20877 136638 18553   16256 47013 39467   25857 14249  41952
##    3  59786  54928 25778 235476 32083   32545 82292 62997   37719 36902  69198
##     Terms
## Docs product project resourc retail  sale secur technic technolog
##    1   33518   26727   13749   9184 42176 16978   12382     14135
##    2   46309   39073   42237  19185 69281 25438   18889     29305
##    3   70521   57352   92483  28030 85560 29866   26437     45914

Initially, we derived 73 Frequently Occurring Terms, each with a word count above 100,000. However, some of these words did not add much meaning to our analysis, so we removed them; they are listed in the removeWords call under the "Cleaning Data" section. After this removal, we obtained the 39 Frequently Occurring Terms shown above, each with a word count above 50,000, which makes the data easier to interpret.
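Counts like these can be re-checked directly against the document-term matrix; a quick sketch follows (note that the 73-term figure predates the extra word removal, so it will not reproduce on the cleaned DTM):

# How many terms clear each frequency cut-off in the cleaned DTM?
length(findFreqTerms(DTM, lowfreq = 100000))
length(findFreqTerms(DTM, lowfreq = 50000))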

Creating a Four-Topic LDA Model

# Creating a Four-Topic LDA Model
SJT_lda <- LDA(DTM, k = 4, control = list(seed = 1234))
print(SJT_lda)
## A LDA_VEM topic model with 4 topics.

We limited ourselves to a Four-Topic LDA Model because of data and computational constraints; the fixed seed keeps the fit reproducible. For future studies, more sophisticated topic models with a larger number of topics could be designed and built.
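As a rough sketch of how the number of topics might be chosen in future work, one option is to refit the model for several candidate values of k and compare their perplexity (lower suggests a better fit). With only three large documents this is indicative at best; ideally the comparison would use held-out advertisements.

# Sketch: compare candidate numbers of topics by model perplexity
candidate_k <- c(2, 4, 6, 8)
perplexities <- sapply(candidate_k, function(k) {
  model <- LDA(DTM, k = k, control = list(seed = 1234))
  perplexity(model)
})
data.frame(k = candidate_k, perplexity = perplexities)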

Topic-Word Probabilities

# Examining Per-Topic-Per-Word Probabilities (Beta) from the Model
SJT_topics <- tidy(SJT_lda, matrix = "beta")
print(SJT_topics)
## # A tibble: 503,920 x 3
##    topic term                beta
##    <int> <chr>              <dbl>
##  1     1 "\u007fjob" 0.0000000933
##  2     2 "\u007fjob" 0.000000142 
##  3     3 "\u007fjob" 0.0000000504
##  4     4 "\u007fjob" 0.0000000202
##  5     1 "aaaad"     0.0000000294
##  6     2 "aaaad"     0.0000000506
##  7     3 "aaaad"     0.0000000518
##  8     4 "aaaad"     0.0000000219
##  9     1 "aaalac"    0.0000000977
## 10     2 "aaalac"    0.0000000744
## # ... with 503,910 more rows
# Plotting Common Words in the Four Topics Extracted
SJT_top_terms <- SJT_topics %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
SJT_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

The summary and visualisation above show the four main topics extracted from the ABC's regular labour market surveillance for December 2014, June 2015, and December 2015. These topics allow us to understand, and make meaningful inferences about, the state of the labour market during these periods.
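To sharpen the interpretation of individual topics, one further step (sketched below, with topics 1 and 2 chosen arbitrarily) is to look at the terms with the greatest difference in beta between two topics, using the log ratio of their probabilities and keeping only terms that are reasonably common in at least one of the two:

# Sketch: terms most strongly associated with topic 2 relative to topic 1
beta_log_ratio <- SJT_topics %>%
  filter(topic %in% c(1, 2)) %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > 0.001 | topic2 > 0.001) %>%
  mutate(log_ratio = log2(topic2 / topic1)) %>%
  arrange(desc(abs(log_ratio)))
head(beta_log_ratio)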

Document-Topic Probabilities

# Examining Per-Document-Per-Topic Probabilities (Gamma) from the Model
SJT_gamma <- tidy(SJT_lda, matrix = "gamma")
print(SJT_gamma)
## # A tibble: 12 x 3
##    document topic gamma
##    <chr>    <int> <dbl>
##  1 1            1 0.154
##  2 2            1 0.221
##  3 3            1 0.286
##  4 1            2 0.300
##  5 2            2 0.225
##  6 3            2 0.288
##  7 1            3 0.168
##  8 2            3 0.272
##  9 3            3 0.270
## 10 1            4 0.379
## 11 2            4 0.283
## 12 3            4 0.156
# Plotting our Documents to the Four Topics Extracted
document_names <- c("1" = "December 2014",
                    "2" = "June 2015",
                    "3" = "December 2015")
SJT_gamma %>%
  ggplot(aes(x = factor(topic), y = gamma,
             colour = factor(topic), fill = factor(topic))) +
  geom_boxplot() +
  facet_grid(~ document, labeller = labeller(document = document_names)) +
  ggtitle("ABC Documents Sorted By 4 Job Topics") +
  xlab("Job Topic") + ylab("Probability (Gamma)") +
  labs(colour = "Job Topic", fill = "Job Topic")

In the summary and plot above, each gamma value is the estimated proportion of words in a document that are generated from the corresponding topic. Document 1 contains the job advertisements collected through regular labour market surveillance in December 2014; Document 2, those for June 2015; and Document 3, those for December 2015.
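If a single label per document is wanted, each document can be assigned to its most probable topic, either with topics(SJT_lda) from topicmodels or, equivalently, from the tidy gamma values; a small sketch:

# Sketch: assign each document to its single most probable topic
SJT_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup() %>%
  arrange(document)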

Conclusion

In summary, we were provided with data from the ABC's regular labour market surveillance for December 2014, June 2015, and December 2015. We used this data to understand and make meaningful inferences about the state of the labour market during these periods. For future studies, we hope to design and build more sophisticated models to improve on this analysis.