library(readr)
library(tidyverse)
library(stringi)
library(tm)
library(corpus)
library(wordcloud)
library(data.table)
Load data
# Read the raw job postings from the project's GitHub repository. The file is
# comma-separated with a header row; text columns are kept as character.
jobs_url <- "https://raw.githubusercontent.com/cassandra-coste/CUNY607/main/project_3/raw_jobdata.csv"
jobs_df <- read.csv(jobs_url, header = TRUE, stringsAsFactors = FALSE)
# Keep only the free-text description column (still a one-column data frame).
text <- select(jobs_df, description)
Build the corpus, then remove punctuation, convert to lower case, and remove numbers, English stopwords, and extra whitespace
# Build one corpus document per job description. `text` is a one-column data
# frame, and VectorSource() treats each element of its input as a document;
# handing it the data frame itself yields one document per *column*, i.e. a
# single document holding every posting. Extract the character column first.
descriptions <- if (is.data.frame(text)) text[["description"]] else text
corpus <- VCorpus(VectorSource(descriptions))

# Normalise the text. Lower-casing happens before stopword removal so that
# capitalised stopwords are matched as well.
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Extra terms to drop; presumably equal-opportunity boilerplate that appears in
# most postings -- confirm with the analysis owner.
corpus <- tm_map(
  corpus,
  removeWords,
  c("gender", "race", "religion", "sexual", "orientation")
)
# Collapse whitespace last so the gaps left by both removeWords() passes are
# cleaned up too.
corpus <- tm_map(corpus, stripWhitespace)
# Word cloud of the 50 most frequent terms in the cleaned corpus, coloured
# with a 32-step ramp interpolated from seven "Dark2" brewer colours.
cloud_colours <- colorRampPalette(brewer.pal(7, "Dark2"))(32)
wordcloud(corpus, max.words = 50, colors = cloud_colours)

Tokenize the text body into unigrams (one word), bigrams (two words), and trigrams (three words). Quadgrams (four words) are not generated in this section.
# Unigrams
# Tokenizer that splits a document into its individual words (1-grams).
# Returns a character vector with one element per word.
unigramTokenizer <- function(x) {
  single_words <- ngrams(words(x), 1)
  tokens <- lapply(single_words, paste, collapse = " ")
  unlist(tokens, use.names = FALSE)
}
# Term-document matrix of single words. No `tokenize` entry is supplied, so
# tm's default tokenizer is used; only terms 1-20 characters long are kept.
unigram_control <- list(wordLengths = c(1, 20))
unigram <- TermDocumentMatrix(corpus, control = unigram_control)
# Bigrams
# Tokenizer that turns a document into overlapping two-word phrases.
# Each pair of adjacent words is joined with a single space.
bigramTokenizer <- function(x) {
  word_pairs <- ngrams(words(x), 2)
  phrases <- lapply(word_pairs, function(pair) paste(pair, collapse = " "))
  unlist(phrases, use.names = FALSE)
}
# Term-document matrix of two-word phrases produced by bigramTokenizer,
# keeping phrases 3-40 characters long.
bigram_control <- list(wordLengths = c(3, 40), tokenize = bigramTokenizer)
bigram <- TermDocumentMatrix(corpus, control = bigram_control)
# Trigrams
# Tokenizer that turns a document into overlapping three-word phrases.
# Each run of three adjacent words is joined with single spaces.
trigramTokenizer <- function(x) {
  doc_words <- words(x)
  triples <- ngrams(doc_words, 3)
  joined <- lapply(triples, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}
# Term-document matrix of three-word phrases produced by trigramTokenizer,
# keeping phrases 3-60 characters long.
trigram_control <- list(wordLengths = c(3, 60), tokenize = trigramTokenizer)
trigram <- TermDocumentMatrix(corpus, control = trigram_control)
Plot the 25 most frequent unigrams
# Unigrams
# Total count of every term across all documents, most frequent first.
unigramrow <- sort(slam::row_sums(unigram), decreasing = TRUE)
unigramfreq <- data.table(tok = names(unigramrow), freq = unigramrow)
# head() rather than [1:25, ]: indexing past the last row of a data.table
# produces NA rows when fewer than 25 terms exist.
ggplot(head(unigramfreq, 25), aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") +
  coord_flip() +
  theme_bw() +
  ggtitle("Top 25 Unigrams") +
  labs(x = "", y = "")

Plot the 25 most frequent bigrams
# Bigrams
# Total count of every two-word phrase across all documents, most frequent
# first.
bigramrow <- sort(slam::row_sums(bigram), decreasing = TRUE)
bigramfreq <- data.table(tok = names(bigramrow), freq = bigramrow)
# head() rather than [1:25, ]: indexing past the last row of a data.table
# produces NA rows when fewer than 25 phrases exist.
ggplot(head(bigramfreq, 25), aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") +
  coord_flip() +
  theme_bw() +
  ggtitle("Top 25 Bigrams") +
  labs(x = "", y = "")

Plot the 25 most frequent trigrams
# Trigrams
# Total count of every three-word phrase across all documents, most frequent
# first.
trigramrow <- sort(slam::row_sums(trigram), decreasing = TRUE)
trigramfreq <- data.table(tok = names(trigramrow), freq = trigramrow)
# head() rather than [1:25, ]: indexing past the last row of a data.table
# produces NA rows when fewer than 25 phrases exist.
ggplot(head(trigramfreq, 25), aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") +
  coord_flip() +
  theme_bw() +
  ggtitle("Top 25 Trigrams") +
  labs(x = "", y = "")

Create unigrams, bigrams, and trigrams of interest
# Skill terms to search for, grouped by the number of words in each phrase.
# Terms are lower-case with no punctuation.

# Unigrams (single-word skills)
my_unigrams <- c(
  "python",
  "sql"
)

# Bigrams (two-word skills)
my_bigrams <- c(
  "machine learning",
  "big data"
)

# Trigrams (three-word skills)
my_trigrams <- c(
  "natural language processing",
  "written verbal communication"
)
Count occurrences of the targeted unigrams, bigrams, and trigrams in each document
# Document-term matrices restricted (via `dictionary`) to the targeted skill
# terms, one matrix per n-gram size. The control entry is spelled `tokenize`,
# the name tm documents and the one the TermDocumentMatrix() calls in this
# script use; the previous `tokenizer` spelling could at best work through
# partial matching of list names.
unigrams_specified <- DocumentTermMatrix(
  corpus,
  control = list(tokenize = unigramTokenizer, dictionary = my_unigrams)
)
bigrams_specified <- DocumentTermMatrix(
  corpus,
  control = list(tokenize = bigramTokenizer, dictionary = my_bigrams)
)
trigrams_specified <- DocumentTermMatrix(
  corpus,
  control = list(tokenize = trigramTokenizer, dictionary = my_trigrams)
)
# Reshape a document-term matrix into long format: one row per
# (document, term) pair with columns `word` (the term) and `count` (its
# frequency in that document). Uses pivot_longer(), the replacement for the
# superseded gather(); the result is a tibble.
dtm_to_long <- function(dtm) {
  dtm %>%
    as.matrix() %>%
    as.data.frame() %>%
    pivot_longer(cols = everything(), names_to = "word", values_to = "count")
}

unigrams_df <- dtm_to_long(unigrams_specified)
bigrams_df <- dtm_to_long(bigrams_specified)
trigrams_df <- dtm_to_long(trigrams_specified)

# Stack the three tables into one word/count table for plotting.
ngrams_df <- bind_rows(unigrams_df, bigrams_df, trigrams_df)
Plot the counts of the targeted n-grams
# Horizontal bar chart of how often each targeted skill term occurs.
# geom_col() stacks all rows that share a word, so a bar's length is the sum
# of that word's counts; order the bars by the same sum (reorder() defaults
# to the mean, which can disagree once a word has more than one row).
ggplot(ngrams_df, aes(x = reorder(word, count, FUN = sum), y = count)) +
  geom_col(fill = "coral") +
  coord_flip() +
  theme_bw() +
  ggtitle("Top Skills Using Targeted Words") +
  labs(x = "", y = "")
