# Load required libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.2.3
## java.home option: 
## JAVA_HOME environment variable: "C:\Program Files\Java\jre-1.8"
## Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
## Please do NOT set it unless you want to override system settings.
library(ggplot2)
library(stringi)
library(RColorBrewer)
library(pryr)
## Warning: package 'pryr' was built under R version 4.2.3
## 
## Attaching package: 'pryr'
## 
## The following object is masked from 'package:dplyr':
## 
##     where
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, partial
library(tm)
## Warning: package 'tm' was built under R version 4.2.3
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## 
## Attaching package: 'tm'
## 
## The following object is masked from 'package:pryr':
## 
##     inspect
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.3

Loading the Data

# Defines file paths
blogs_path <- "en_US.blogs.txt"
news_path <- "en_US.news.txt"
twitter_path <- "en_US.twitter.txt"

# Loads each file in full
blogs <- readLines(blogs_path)
news <- readLines(news_path)
## Warning in readLines(news_path): incomplete final line found on
## 'en_US.news.txt'
twitter <- readLines(twitter_path)
## Warning in readLines(twitter_path): line 167155 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 268547 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 1759032 appears to contain an embedded
## nul
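
The warnings above indicate that a few Twitter lines are truncated at an embedded nul and that the news file lacks a trailing newline; neither stops the read. If desired, the files can be re-read with an explicit encoding and with embedded nuls skipped (a sketch, not part of the run above; warn = FALSE would additionally silence the incomplete-final-line warning):

# Optional: re-read with an explicit encoding and embedded nuls skipped
blogs <- readLines(blogs_path, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_path, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter_path, encoding = "UTF-8", skipNul = TRUE)
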
# Collects the data sets into a named list
data_list <- list(blogs = blogs, news = news, twitter = twitter)

Summary Statistics

# Generates summary statistics
stats <- data.frame(
  FileName = c("blogs", "news", "twitter"),
  FileSize = sapply(data_list, function(x) format(object.size(x), "Mb")),
  Lines = sapply(data_list, function(x) length(x)),
  LinesNEmpty = sapply(data_list, function(x) sum(nzchar(x))),
  Chars = sapply(data_list, function(x) sum(nchar(x))),
  CharsNWhite = sapply(data_list, function(x) sum(nchar(stringi::stri_replace_all_regex(x, "\\s+", ""))))
)

# Prints the Stats using kable
knitr::kable(stats, caption = "Summary Statistics for All Data")
Summary Statistics for All Data

|         | FileName | FileSize |   Lines | LinesNEmpty |     Chars | CharsNWhite |
|:--------|:---------|:---------|--------:|------------:|----------:|------------:|
| blogs   | blogs    | 255.4 Mb |  899288 |      899288 | 206824509 |   170389662 |
| news    | news     | 19.8 Mb  |   77259 |       77259 |  15639408 |    13072698 |
| twitter | twitter  | 319 Mb   | 2360148 |     2360148 | 162122651 |   134082634 |

Sampling the Data

# Sets the seed for reproducibility
set.seed(123)

# Defines the sample size as a percentage of each dataset
sample_pct <- 0.01

# Takes a random sample from each dataset
blogs_sample <- sample(blogs, round(length(blogs) * sample_pct))
news_sample <- sample(news, round(length(news) * sample_pct))
twitter_sample <- sample(twitter, round(length(twitter) * sample_pct))

# Combines the samples into a single dataset
sample_data <- c(blogs_sample, news_sample, twitter_sample)

# Splits the sample data back into blogs, news, and twitter
blogs_sample <- sample_data[1:length(blogs_sample)]
news_sample <- sample_data[(length(blogs_sample) + 1):(length(blogs_sample) + length(news_sample))]
twitter_sample <- sample_data[(length(blogs_sample) + length(news_sample) + 1):length(sample_data)]

# Defines the sample data list
sample_data_list <- list(blogs = blogs_sample, news = news_sample, twitter = twitter_sample)

Summary Statistics for the Sample Data

# Generates summary statistics for the sample data
sample_stats <- data.frame(
  FileName = c("blogs_sample", "news_sample", "twitter_sample"),
  FileSize = sapply(sample_data_list, function(x) format(object.size(x), "Mb")),
  Lines = sapply(sample_data_list, function(x) length(x)),
  LinesNEmpty = sapply(sample_data_list, function(x) sum(nzchar(x))),
  Chars = sapply(sample_data_list, function(x) sum(nchar(x))),
  CharsNWhite = sapply(sample_data_list, function(x) sum(nchar(stringi::stri_replace_all_regex(x, "\\s+", ""))))
)

# Prints the sample stats using kable
knitr::kable(sample_stats, caption = "Summary Statistics for Sampled Data")
Summary Statistics for Sampled Data

|         | FileName       | FileSize |  Lines | LinesNEmpty |   Chars | CharsNWhite |
|:--------|:---------------|:---------|-------:|------------:|--------:|------------:|
| blogs   | blogs_sample   | 2.5 Mb   |   8993 |        8993 | 2066092 |     1702426 |
| news    | news_sample    | 0.2 Mb   |    773 |         773 |  153678 |      128396 |
| twitter | twitter_sample | 3.2 Mb   |  23601 |       23601 | 1625149 |     1343631 |

Cleaning the Data and Building a Corpus

# Builds the corpus
corpus <- VCorpus(VectorSource(sample_data))

# Checks the size of the corpus in memory
print(object_size(corpus))
## 77.81 MB
# Cleans the corpus

# Converts all to lower case
clean_corpus <- tm_map(corpus, content_transformer(tolower))
# Removes punctuation marks
clean_corpus <- tm_map(clean_corpus, removePunctuation)
# Removes numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
# Removes whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
# Converts all to plain text document
clean_corpus <- tm_map(clean_corpus, PlainTextDocument)

# Converts clean_corpus to a character vector
clean_corpus_char <- unlist(lapply(clean_corpus, content))
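
As a quick sanity check (not part of the original pipeline), the first cleaned documents can be printed to confirm the transformations behaved as expected:

# Quick sanity check: the first two cleaned documents
head(clean_corpus_char, 2)
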

Tokenizing and Constructing the N-Grams

# Defines tokenizers for unigrams, bigrams, and trigrams
tokenizers <- list(
  uniTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1)),
  biTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)),
  triTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
)

# Constructs term-document matrices for unigrams, bigrams, and trigrams
tdm_list <- lapply(tokenizers, function(tokenizer) {
  TermDocumentMatrix(clean_corpus, control = list(tokenize = tokenizer))
})

# Names the list elements
names(tdm_list) <- c("uniMatrix", "biMatrix", "triMatrix")
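
Before filtering for frequent terms, it can help to confirm how large each matrix is; a minimal check:

# Rows are terms, columns are documents
sapply(tdm_list, dim)
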

# Finds frequent terms
uniCorpus <- findFreqTerms(tdm_list$uniMatrix, lowfreq = 20)
biCorpus <- findFreqTerms(tdm_list$biMatrix, lowfreq = 20)
triCorpus <- findFreqTerms(tdm_list$triMatrix, lowfreq = 20)

# Calculates frequencies for the frequent terms
uniCorpusFreq <- rowSums(as.matrix(tdm_list$uniMatrix[uniCorpus,]))
uniCorpusFreq <- data.frame(word = names(uniCorpusFreq), frequency = uniCorpusFreq)

biCorpusFreq <- rowSums(as.matrix(tdm_list$biMatrix[biCorpus,]))
biCorpusFreq <- data.frame(word = names(biCorpusFreq), frequency = biCorpusFreq)

triCorpusFreq <- rowSums(as.matrix(tdm_list$triMatrix[triCorpus,]))
triCorpusFreq <- data.frame(word = names(triCorpusFreq), frequency = triCorpusFreq)
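
Note that as.matrix() densifies the sparse term-document matrix, which can exhaust memory on larger samples. A memory-friendlier alternative (a sketch) is the sparse row_sums() from the slam package, which tm already depends on:

# Sparse alternative to rowSums(as.matrix(...)), avoiding densification
uniCorpusFreqSparse <- slam::row_sums(tdm_list$uniMatrix[uniCorpus, ])
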

# Sets the order of each corpus frequency to descending
uniCorpusFreqDescend <- arrange(uniCorpusFreq, desc(frequency))
biCorpusFreqDescend <- arrange(biCorpusFreq, desc(frequency))
triCorpusFreqDescend <- arrange(triCorpusFreq, desc(frequency))

Data Visualisation

# Visualises the top 20 Unigrams
uniBar <- ggplot(data = head(uniCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "red") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle("Top 20 Unigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
uniBar

# Visualises the top 20 Bigrams
biBar <- ggplot(data = head(biCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle("Top 20 Bigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
biBar

# Visualises the top 20 Trigrams
triBar <- ggplot(data = head(triCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "blue") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle("Top 20 Trigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
triBar

Unique Data Visualisation

# Adds a column to each data frame to indicate the type of n-gram
uniCorpusFreqDescend$type <- "Unigram"
biCorpusFreqDescend$type <- "Bigram"
triCorpusFreqDescend$type <- "Trigram"

# Selects the top 10 words for each type of n-gram
top_uniCorpusFreqDescend <- head(uniCorpusFreqDescend, 10)
top_biCorpusFreqDescend <- head(biCorpusFreqDescend, 10)
top_triCorpusFreqDescend <- head(triCorpusFreqDescend, 10)

# Creates a bubble cloud for the top 10 words of Unigram
ggplot(top_uniCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1, 20)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Bubble Cloud for Top 10 Words of Unigram", x = "Words", y = "Frequency", size = "Frequency")

# Creates a bubble cloud for the top 10 words of Bigram
ggplot(top_biCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1, 20)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Bubble Cloud for Top 10 Words of Bigram", x = "Words", y = "Frequency", size = "Frequency")

# Creates a bubble cloud for the top 10 words of Trigram
ggplot(top_triCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1, 20)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Bubble Cloud for Top 10 Words of Trigram", x = "Words", y = "Frequency", size = "Frequency")

Summary

From the results above, it can be seen that the combined data sets share a core of very common words and word strings. Because the raw files are large, we took a 1% sample, so one open question is whether a 1% sample is enough to train an accurate prediction model. We also need to consider removing stopwords, and filtering profane words so that the model never suggests them; a sketch of both steps follows.
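
A minimal sketch of both ideas, using tm's built-in English stopword list; profanity.txt is a hypothetical file, as a banned-words list would have to be sourced separately:

# Removes common English stopwords from the cleaned corpus
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("en"))

# Hypothetical profanity filter: profanity.txt is a placeholder file
profanity_words <- readLines("profanity.txt")
clean_corpus <- tm_map(clean_corpus, removeWords, profanity_words)
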

What Next?