Project 3 ngrams

library(readr)
library(tidyverse)
library(stringi)
library(tm)
library(corpus)
library(wordcloud)
library(data.table)

Load data

# Read the raw job postings (comma-separated, with a header row) straight from
# the project repository, keeping strings as character vectors.
jobs_df <- read.csv(
  "https://raw.githubusercontent.com/cassandra-coste/CUNY607/main/project_3/raw_jobdata.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Keep only the free-text description column (the result is still a
# one-column data frame, not a character vector).
text <- select(jobs_df, description)

Build the corpus, then remove punctuation, numbers, and stopwords, convert text to lower case, etc.

# Build the corpus with one document per job posting. `text` is a one-column
# data frame; passing the data frame itself to VectorSource() makes the whole
# column a single document, which lets n-grams run across posting boundaries.
# Extracting the column as a character vector avoids that.
corpus <- VCorpus(VectorSource(text[["description"]]))

# Lower-case first so the stopword lists below (all lower case) can match.
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove stopwords BEFORE stripping punctuation: the English stopword list
# contains contractions such as "don't", which no longer match once the
# apostrophe has been removed.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# Drop terms that are not skills (presumably from equal-opportunity
# boilerplate in the postings -- confirm against the raw data).
corpus <- tm_map(
  corpus,
  removeWords,
  c("gender", "race", "religion", "sexual", "orientation")
)

# Collapse whitespace last so the gaps left by every removal step are cleaned.
corpus <- tm_map(corpus, stripWhitespace)

# Word cloud of the 50 most frequent terms, coloured with a 32-step ramp
# interpolated from the "Dark2" palette.
wordcloud(
  corpus,
  max.words = 50,
  colors = colorRampPalette(brewer.pal(7, "Dark2"))(32)
)

Tokenization of the text body into unigrams (one word), bigrams (two words), and trigrams (three words)

# Unigrams
# Tokenizer that returns each one-word n-gram of document `x` as a string.
unigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 1)
  unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
}

# Term-document matrix of single words between 1 and 20 characters long.
# No custom tokenizer is passed here, so unigramTokenizer is not used.
unigram <- TermDocumentMatrix(
  corpus,
  control = list(wordLengths = c(1, 20))
)


# Bigrams
# Tokenizer that returns each two-word n-gram of document `x` as a single
# space-separated string.
bigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 2)
  unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
}

# Term-document matrix of bigrams whose joined text is 3 to 40 characters long.
bigram <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 40),
    tokenize = bigramTokenizer
  )
)


# Trigrams
# Tokenizer that returns each three-word n-gram of document `x` as a single
# space-separated string.
trigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 3)
  unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
}

# Term-document matrix of trigrams whose joined text is 3 to 60 characters long.
trigram <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 60),
    tokenize = trigramTokenizer
  )
)

Plot unigram

#Unigrams

# Total count of every unigram across the corpus, most frequent first.
unigramrow <- sort(slam::row_sums(unigram), decreasing = TRUE)
unigramfreq <- data.table(tok = names(unigramrow), freq = unigramrow)

# Horizontal bar chart of the 25 most frequent unigrams. head() is used
# instead of [1:25, ] because indexing a data.table past its last row pads
# the result with NA rows when fewer than 25 terms exist.
ggplot(head(unigramfreq, 25), aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") +
  coord_flip() +
  theme_bw() +
  ggtitle("Top 25 Unigrams") +
  labs(x = "", y = "")

Plot bigram

#Bigrams

# Total count of every bigram across the corpus, most frequent first.
bigramrow <- sort(slam::row_sums(bigram), decreasing = TRUE)
bigramfreq <- data.table(tok = names(bigramrow), freq = bigramrow)

# Horizontal bar chart of the 25 most frequent bigrams. head() is used
# instead of [1:25, ] because indexing a data.table past its last row pads
# the result with NA rows when fewer than 25 terms exist.
ggplot(head(bigramfreq, 25), aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") +
  coord_flip() +
  theme_bw() +
  ggtitle("Top 25 Bigrams") +
  labs(x = "", y = "")

Plot trigram

#Trigrams
# Total count of every trigram across the corpus, most frequent first.
trigramrow <- sort(slam::row_sums(trigram), decreasing = TRUE)
trigramfreq <- data.table(tok = names(trigramrow), freq = trigramrow)

# Horizontal bar chart of the 25 most frequent trigrams. head() is used
# instead of [1:25, ] because indexing a data.table past its last row pads
# the result with NA rows when fewer than 25 terms exist.
ggplot(head(trigramfreq, 25), aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") +
  coord_flip() +
  theme_bw() +
  ggtitle("Top 25 Trigrams") +
  labs(x = "", y = "")

Create unigrams, bigrams, and trigrams of interest

# Target terms are written in lower case and without punctuation so they can
# match the cleaned corpus text.

# Unigrams: single-word skills of interest.
my_unigrams <- c(
  "python",
  "sql"
)

# Bigrams: two-word skills of interest.
my_bigrams <- c(
  "machine learning",
  "big data"
)

# Trigrams: three-word skills of interest. "written verbal communication" has
# no "and" -- presumably because stopwords are removed from the corpus before
# tokenization.
my_trigrams <- c(
  "natural language processing",
  "written verbal communication"
)

Search for unigrams, bigrams, and trigrams

# Document-term matrices restricted to the hand-picked terms of interest.
# Each matrix uses the tokenizer of the matching n-gram size, and `dictionary`
# keeps only the listed terms as columns.
#
# The control option is spelled `tokenize`, as documented by tm and as used
# for the term-document matrices earlier in this file. The previous spelling
# `tokenizer` is not a documented option and could only take effect through
# partial matching of list names.
unigrams_specified <- DocumentTermMatrix(
  corpus,
  control = list(tokenize = unigramTokenizer, dictionary = my_unigrams)
)

bigrams_specified <- DocumentTermMatrix(
  corpus,
  control = list(tokenize = bigramTokenizer, dictionary = my_bigrams)
)

trigrams_specified <- DocumentTermMatrix(
  corpus,
  control = list(tokenize = trigramTokenizer, dictionary = my_trigrams)
)


# Collapse a document-term matrix into a data frame with one row per term:
#   word  - the term (character)
#   count - the term's total count summed over all documents (numeric)
# Summing over documents keeps exactly one row per term however many
# documents the corpus holds, so a later reorder(word, count) ranks terms by
# their total rather than by a per-document average. This replaces three
# copies of an as.matrix() / as.data.frame() / gather() chain (gather() is
# superseded in tidyr).
count_terms <- function(dtm) {
  totals <- colSums(as.matrix(dtm))
  data.frame(
    word = as.character(names(totals)),
    count = unname(totals),
    stringsAsFactors = FALSE
  )
}

unigrams_df <- count_terms(unigrams_specified)
bigrams_df <- count_terms(bigrams_specified)
trigrams_df <- count_terms(trigrams_specified)

# Stack the three term tables into one table for plotting.
ngrams_df <- rbind(unigrams_df, bigrams_df, trigrams_df)

Plot ngram

# Horizontal bar chart of the targeted skill terms, ordered by count.
ngrams_df %>%
  ggplot(aes(x = reorder(word, count), y = count)) +
  geom_bar(stat = "identity", fill = "coral") +
  coord_flip() +
  theme_bw() +
  labs(x = "", y = "") +
  ggtitle("Top Skills Using Targeted Words")

Project 3 ngrams

Cassandra Coste

3/19/2021

Load data

Build the corpus, then remove punctuation, numbers, and stopwords, convert text to lower case, etc.

Tokenization of the text body into unigrams (one word), bigrams (two words), and trigrams (three words)

Plot unigram

Plot bigram

Plot trigram

Create unigrams, bigrams, and trigrams of interest

Search for unigrams, bigrams, and trigrams

Plot ngram