# Load required libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.2.3
## java.home option:
## JAVA_HOME environment variable: "C:\Program Files\Java\jre-1.8"
## Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
## Please do NOT set it unless you want to override system settings.
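The warning above means the JAVA_HOME value conflicts with the Java installation that rJava detected, so it is ignored. A commonly cited workaround (an assumption about this setup, not a required step) is to clear the variable before loading RWeka:
# Clear JAVA_HOME so rJava falls back to the system Java configuration
# (assumed workaround; run this before library(RWeka))
Sys.setenv(JAVA_HOME = "")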
library(ggplot2)
library(stringi)
library(RColorBrewer)
library(pryr)
## Warning: package 'pryr' was built under R version 4.2.3
##
## Attaching package: 'pryr'
##
## The following object is masked from 'package:dplyr':
##
## where
##
## The following objects are masked from 'package:purrr':
##
## compose, partial
library(tm)
## Warning: package 'tm' was built under R version 4.2.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
##
##
## Attaching package: 'tm'
##
## The following object is masked from 'package:pryr':
##
## inspect
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.3
# Defines file paths
blogs_path <- "en_US.blogs.txt"
news_path <- "en_US.news.txt"
twitter_path <- "en_US.twitter.txt"
# Loads the entire data
blogs <- readLines(blogs_path)
news <- readLines(news_path)
## Warning in readLines(news_path): incomplete final line found on
## 'en_US.news.txt'
twitter <- readLines(twitter_path)
## Warning in readLines(twitter_path): line 167155 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 268547 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 1759032 appears to contain an embedded
## nul
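The warnings flag an unterminated final line in the news file and embedded NUL bytes in the Twitter file. readLines() can skip NULs and silence the final-line warning; a minimal defensive re-read, assuming the same paths as above, would look like this:
# Defensive re-read: declare UTF-8, skip embedded NULs, and silence
# the incomplete-final-line warning
news <- readLines(news_path, warn = FALSE, encoding = "UTF-8")
twitter <- readLines(twitter_path, skipNul = TRUE, encoding = "UTF-8")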
# Defines the overall data
data_list <- list(blogs = blogs, news = news, twitter = twitter)
# Generates summary statistics
stats <- data.frame(
FileName = c("blogs", "news", "twitter"),
FileSize = sapply(data_list, function(x) format(object.size(x), "Mb")),
Lines = sapply(data_list, function(x) length(x)),
LinesNEmpty = sapply(data_list, function(x) sum(nzchar(x))),
Chars = sapply(data_list, function(x) sum(nchar(x))),
CharsNWhite = sapply(data_list, function(x) sum(nchar(stringi::stri_replace_all_regex(x, "\\s+", ""))))
)
# Prints the Stats using kable
knitr::kable(stats, caption = "Summary Statistics for All Data")
|         | FileName | FileSize | Lines   | LinesNEmpty | Chars     | CharsNWhite |
|---------|----------|----------|---------|-------------|-----------|-------------|
| blogs   | blogs    | 255.4 Mb | 899288  | 899288      | 206824509 | 170389662   |
| news    | news     | 19.8 Mb  | 77259   | 77259       | 15639408  | 13072698    |
| twitter | twitter  | 319 Mb   | 2360148 | 2360148     | 162122651 | 134082634   |
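Word counts would round out these statistics; a quick sketch using stringi (already loaded), where stri_count_words() counts words per line via word boundaries:
# Approximate word counts per file
sapply(data_list, function(x) sum(stringi::stri_count_words(x)))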
# Sets the seed for reproducibility
set.seed(123)
# Defines the sample size as a percentage of each dataset
sample_pct <- 0.01
# Takes a random sample from each dataset
blogs_sample <- sample(blogs, round(length(blogs) * sample_pct))
news_sample <- sample(news, round(length(news) * sample_pct))
twitter_sample <- sample(twitter, round(length(twitter) * sample_pct))
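An alternative sampler flips a biased coin per line instead of drawing indices, which scales naturally if the files are later streamed in chunks; a sketch of the idea (the _alt name is illustrative):
# Include each line independently with probability sample_pct
keep <- as.logical(rbinom(length(blogs), size = 1, prob = sample_pct))
blogs_sample_alt <- blogs[keep]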
# Combines the samples into a single dataset
sample_data <- c(blogs_sample, news_sample, twitter_sample)
# Splits the sample data back into blogs, news, and twitter
blogs_sample <- sample_data[1:length(blogs_sample)]
news_sample <- sample_data[(length(blogs_sample) + 1):(length(blogs_sample) + length(news_sample))]
twitter_sample <- sample_data[(length(blogs_sample) + length(news_sample) + 1):length(sample_data)]
# Defines the sample data list
sample_data_list <- list(blogs = blogs_sample, news = news_sample, twitter = twitter_sample)
# Generates summary statistics for the sample data
sample_stats <- data.frame(
FileName = c("blogs_sample", "news_sample", "twitter_sample"),
FileSize = sapply(sample_data_list, function(x) format(object.size(x), "Mb")),
Lines = sapply(sample_data_list, function(x) length(x)),
LinesNEmpty = sapply(sample_data_list, function(x) sum(nzchar(x))),
Chars = sapply(sample_data_list, function(x) sum(nchar(x))),
CharsNWhite = sapply(sample_data_list, function(x) sum(nchar(stringi::stri_replace_all_regex(x, "\\s+", ""))))
)
# Prints the sample stats using kable
knitr::kable(sample_stats, caption = "Summary Statistics for Sampled Data")
|         | FileName       | FileSize | Lines | LinesNEmpty | Chars   | CharsNWhite |
|---------|----------------|----------|-------|-------------|---------|-------------|
| blogs   | blogs_sample   | 2.5 Mb   | 8993  | 8993        | 2066092 | 1702426     |
| news    | news_sample    | 0.2 Mb   | 773   | 773         | 153678  | 128396      |
| twitter | twitter_sample | 3.2 Mb   | 23601 | 23601       | 1625149 | 1343631     |
# Builds the corpus
corpus <- VCorpus(VectorSource(sample_data))
# Checks the size of the corpus in memory
print(object_size(corpus))
## 77.81 MB
# Cleans the corpus
# Converts all to lower case
clean_corpus <- tm_map(corpus, content_transformer(tolower))
# Removes punctuation marks
clean_corpus <- tm_map(clean_corpus, removePunctuation)
# Removes numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
# Removes whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
# Converts all to plain text document
clean_corpus <- tm_map(clean_corpus, PlainTextDocument)
# Converts clean_corpus to a character vector
clean_corpus_char <- unlist(lapply(clean_corpus, content))
# Defines tokenizers for unigrams, bigrams, and trigrams
tokenizers <- list(
uniTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1)),
biTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)),
triTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
)
# Constructs term-document matrices for unigrams, bigrams, and trigrams
tdm_list <- lapply(tokenizers, function(tokenizer) {
TermDocumentMatrix(clean_corpus, control = list(tokenize = tokenizer))
})
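If the term-document matrices hang or come back empty, a commonly reported cause is tm parallelising across cores while the rJava-backed tokenizer is not thread-safe; forcing a single core is a widely used workaround (platform-dependent, so treat it as an assumption rather than a required step):
# Workaround for rJava-backed tokenizers on some platforms
options(mc.cores = 1)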
# Names the list elements
names(tdm_list) <- c("uniMatrix", "biMatrix", "triMatrix")
# Finds frequent terms
uniCorpus <- findFreqTerms(tdm_list$uniMatrix, lowfreq = 20)
biCorpus <- findFreqTerms(tdm_list$biMatrix, lowfreq = 20)
triCorpus <- findFreqTerms(tdm_list$triMatrix, lowfreq = 20)
# Calculates frequencies for the frequent terms
uniCorpusFreq <- rowSums(as.matrix(tdm_list$uniMatrix[uniCorpus,]))
uniCorpusFreq <- data.frame(word = names(uniCorpusFreq), frequency = uniCorpusFreq)
biCorpusFreq <- rowSums(as.matrix(tdm_list$biMatrix[biCorpus,]))
biCorpusFreq <- data.frame(word = names(biCorpusFreq), frequency = biCorpusFreq)
triCorpusFreq <- rowSums(as.matrix(tdm_list$triMatrix[triCorpus,]))
triCorpusFreq <- data.frame(word = names(triCorpusFreq), frequency = triCorpusFreq)
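Subsetting each TDM to its frequent terms before as.matrix() keeps the dense conversion small at this sample size. For larger samples, the sparse row sums from slam (a dependency of tm, so already installed) avoid densifying entirely; an equivalent sketch for the unigram case:
# Sparse row sums: no dense matrix is materialised
uniCorpusFreq_sparse <- slam::row_sums(tdm_list$uniMatrix[uniCorpus, ])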
# Sets the order of each corpus frequency to descending
uniCorpusFreqDescend <- arrange(uniCorpusFreq, desc(frequency))
biCorpusFreqDescend <- arrange(biCorpusFreq, desc(frequency))
triCorpusFreqDescend <- arrange(triCorpusFreq, desc(frequency))
# Visualises the top 20 Unigrams
uniBar <- ggplot(data = head(uniCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "red") +
xlab("Words") +
ylab("Frequency") +
ggtitle("Top 20 Unigrams") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
uniBar
# Visualises the top 20 Bigrams
biBar <- ggplot(data = head(biCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "green") +
xlab("Words") +
ylab("Frequency") +
ggtitle("Top 20 Bigrams") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
biBar
# Visualises the top 20 Trigrams
triBar <- ggplot(data = head(triCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Words") +
ylab("Frequency") +
ggtitle("Top 20 Trigrams") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
triBar
# Adds a column to each data frame to indicate the type of n-gram
uniCorpusFreqDescend$type <- "Unigram"
biCorpusFreqDescend$type <- "Bigram"
triCorpusFreqDescend$type <- "Trigram"
# Selects the top 10 words for each type of n-gram
top_uniCorpusFreqDescend <- head(uniCorpusFreqDescend, 10)
top_biCorpusFreqDescend <- head(biCorpusFreqDescend, 10)
top_triCorpusFreqDescend <- head(triCorpusFreqDescend, 10)
# Creates a bubble cloud for the top 10 words of Unigram
ggplot(top_uniCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
geom_point(alpha = 0.7) +
scale_size(range = c(1, 20)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(title = "Bubble Cloud for Top 10 Words of Unigram", x = "Words", y = "Frequency", size = "Frequency")
# Creates a bubble cloud for the top 10 words of Bigram
ggplot(top_biCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
geom_point(alpha = 0.7) +
scale_size(range = c(1, 20)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(title = "Bubble Cloud for Top 10 Words of Bigram", x = "Words", y = "Frequency", size = "Frequency")
# Creates a bubble cloud for the top 10 words of Trigram
ggplot(top_triCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
geom_point(alpha = 0.7) +
scale_size(range = c(1, 20)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(title = "Bubble Cloud for Top 10 Words of Trigram", x = "Words", y = "Frequency", size = "Frequency")
As the summaries and plots above show, the combined datasets share a clear set of common words and word sequences. Because the source files are large, we worked with a 1% sample, so the open question is whether 1% is enough data for an accurate prediction model. We also need to consider removing stopwords, and filtering profane words so the model never suggests them.
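One way to ground the sampling question is a coverage check: how many distinct unigrams account for 50% and 90% of the word instances we observed. The figure is approximate here because terms below the lowfreq = 20 cutoff are excluded, but it gives a first read on the dictionary size a model would need:
# Cumulative share of word instances covered by the top unigrams
cum_cov <- cumsum(uniCorpusFreqDescend$frequency) / sum(uniCorpusFreqDescend$frequency)
which(cum_cov >= 0.5)[1] # unigrams needed for 50% coverage
which(cum_cov >= 0.9)[1] # unigrams needed for 90% coverage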
The next steps are to:

- Build and test different prediction models, evaluating each on its performance.
- Create and test any modifications needed to resolve issues encountered during modeling.
- Build, test, and deploy a Shiny app with a simple user interface that has an acceptable run time and reliably and accurately predicts the next word from a word or phrase entered by the user.
- Decide whether to remove stopwords and filter out profanity (see the sketch below).
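If we do filter, the removal steps would slot into the tm cleaning chain built earlier; a hedged sketch, where the profanity word-list file name is hypothetical:
# Remove English stopwords using tm's built-in list
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("en"))
# Filter profanity against an external word list (hypothetical file)
profanity <- readLines("profanity_words.txt", warn = FALSE)
clean_corpus <- tm_map(clean_corpus, removeWords, profanity)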