The main objective is to build a sample corpus, compute the 1-gram, 2-gram and 3-gram term-document matrices, and perform exploratory analysis of the term frequencies. The data can be downloaded from
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
Archive’s content:
Due to computational performance constraints, I could take only the first 3000 records from each file.
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(NLP)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Tokenizer producing two-word terms: every run of two consecutive words
# of `x`, joined by a single space. Returns an unnamed character vector
# (NULL when `x` yields no two-word sequences).
bigram_tokenizer <- function(x) {
  word_pairs <- ngrams(words(x), 2)
  joined_pairs <- lapply(word_pairs, function(pair) paste(pair, collapse = " "))
  unlist(joined_pairs, use.names = FALSE)
}
# Tokenizer producing three-word terms: every run of three consecutive
# words of `x`, joined by a single space. Returns an unnamed character
# vector (NULL when `x` yields no three-word sequences).
trigram_tokenizer <- function(x) {
  word_triples <- ngrams(words(x), 3)
  joined_triples <- lapply(
    word_triples,
    function(triple) paste(triple, collapse = " ")
  )
  unlist(joined_triples, use.names = FALSE)
}
# Data loading.
# Directory holding the English text files and the number of leading lines
# read from each one; change them here instead of in every readLines() call.
data_dir <- file.path("week2", "final", "en_US")
n_records <- 3000

# Read the first `n_records` lines of one file in `data_dir`.
# encoding = "UTF-8" only marks the strings as UTF-8 (no re-encoding), so
# non-ASCII text is handled the same way regardless of the session locale.
# NOTE(review): assumes the source files are UTF-8 encoded -- confirm.
# skipNul = TRUE drops embedded NUL bytes instead of truncating the line.
read_sample <- function(file_name) {
  readLines(
    file.path(data_dir, file_name),
    n = n_records,
    encoding = "UTF-8",
    skipNul = TRUE
  )
}

data_blog <- read_sample("en_US.blogs.txt")
data_news <- read_sample("en_US.news.txt")
data_twitter <- read_sample("en_US.twitter.txt")
# Corpus aggregation: one document per input line, blogs first, then news,
# then twitter.
data_vector <- c(data_blog, data_news, data_twitter)
data_corpus <- VCorpus(VectorSource(data_vector))
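The content has been transformed to all lowercase and cleaned by removing: letters repeated three or more times (collapsed to two), URLs, punctuation, numbers, extra whitespace, and English stop words.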
# Data cleaning.
# Collapse any run of three or more identical letters down to two of that
# letter (e.g. "soooo" becomes "soo"). Vectorised over `x`.
removeRepeat <- function(x) {
  gsub(pattern = "([[:alpha:]])\\1{2,}", replacement = "\\1\\1", x = x)
}
# Remove URLs: "http" and everything after it up to the next whitespace.
# Matching only alphanumerics after "http" would stop at the first ":" or
# "/", leaving the host and path behind to be fused into a junk token once
# punctuation is stripped. Vectorised over `x`.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
# Cleaning pipeline, applied to every document in order.
# Lowercase first so the later pattern- and word-based steps see one case.
data_corpus <- tm_map(data_corpus, content_transformer(tolower))
data_corpus <- tm_map(data_corpus, content_transformer(removeRepeat))
# URLs must go before punctuation is stripped, while they are still
# recognisable as URLs.
data_corpus <- tm_map(data_corpus, content_transformer(removeURL))
# Stop words are removed BEFORE punctuation: the English stop-word list
# contains contractions written with apostrophes (e.g. "don't"), which can
# no longer match once the apostrophes have been stripped from the text.
data_corpus <- tm_map(data_corpus, removeWords, stopwords("english"))
data_corpus <- tm_map(data_corpus, removePunctuation)
data_corpus <- tm_map(data_corpus, removeNumbers)
# Whitespace is normalised last so the gaps left by every removal above
# are collapsed as well.
data_corpus <- tm_map(data_corpus, stripWhitespace)
A Term Document Matrix has been created to rank words.
# Unigram term-document matrix (terms in rows, one column per document).
one_gram_dtm <- TermDocumentMatrix(data_corpus)
# Dense numeric matrix of counts. rowSums() works on it directly, so it is
# not copied into a data.frame first; that copy only added memory pressure.
one_gram_matrix <- as.matrix(one_gram_dtm)
# Total count of each term across all documents, most frequent first.
one_gram_v <- sort(rowSums(one_gram_matrix), decreasing = TRUE)
one_gram_d <- data.frame(word = names(one_gram_v), freq = one_gram_v)
# Bar chart of the 20 most frequent terms, tallest bar first.
ggplot(data = head(one_gram_d, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Term", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Bigram term-document matrix: terms are the two-word sequences produced by
# bigram_tokenizer (terms in rows, one column per document).
two_gram_dtm <- TermDocumentMatrix(
  data_corpus,
  control = list(tokenize = bigram_tokenizer)
)
# Dense numeric matrix of counts. rowSums() works on it directly, so it is
# not copied into a data.frame first; that copy only added memory pressure.
two_gram_matrix <- as.matrix(two_gram_dtm)
# Total count of each bigram across all documents, most frequent first.
two_gram_v <- sort(rowSums(two_gram_matrix), decreasing = TRUE)
two_gram_d <- data.frame(word = names(two_gram_v), freq = two_gram_v)
# Bar chart of the 20 most frequent bigrams, tallest bar first.
ggplot(data = head(two_gram_d, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Bigram", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Trigram term-document matrix: terms are the three-word sequences produced
# by trigram_tokenizer (terms in rows, one column per document).
three_gram_dtm <- TermDocumentMatrix(
  data_corpus,
  control = list(tokenize = trigram_tokenizer)
)
# Dense numeric matrix of counts. rowSums() works on it directly, so it is
# not copied into a data.frame first; that copy only added memory pressure.
three_gram_matrix <- as.matrix(three_gram_dtm)
# Total count of each trigram across all documents, most frequent first.
three_gram_v <- sort(rowSums(three_gram_matrix), decreasing = TRUE)
three_gram_d <- data.frame(word = names(three_gram_v), freq = three_gram_v)
# Bar chart of the 20 most frequent trigrams, tallest bar first.
ggplot(data = head(three_gram_d, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Trigram", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))