# Load required libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.2.3
## java.home option: 
## JAVA_HOME environment variable: "C:\Program Files\Java\jre-1.8"
## Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
## Please do NOT set it unless you want to override system settings.
library(ggplot2)
library(stringi)
library(RColorBrewer)
library(pryr)
## Warning: package 'pryr' was built under R version 4.2.3
## 
## Attaching package: 'pryr'
## 
## The following object is masked from 'package:dplyr':
## 
##     where
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, partial
library(tm)
## Warning: package 'tm' was built under R version 4.2.3
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## 
## Attaching package: 'tm'
## 
## The following object is masked from 'package:pryr':
## 
##     inspect
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.3

Loading the Data

# Defines file paths
blogs_path <- "en_US.blogs.txt"
news_path <- "en_US.news.txt"
twitter_path <- "en_US.twitter.txt"

# Loads each file in full
blogs <- readLines(blogs_path)
news <- readLines(news_path)
## Warning in readLines(news_path): incomplete final line found on
## 'en_US.news.txt'
twitter <- readLines(twitter_path)
## Warning in readLines(twitter_path): line 167155 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 268547 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 1759032 appears to contain an embedded
## nul
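
The warnings above indicate that a few Twitter lines are truncated at an embedded nul and that the news file lacks a trailing newline; neither stops the read. If desired, the files can be re-read with an explicit encoding and with embedded nuls skipped (a sketch, not part of the run above; warn = FALSE would additionally silence the incomplete-final-line warning):

# Optional: re-read with an explicit encoding and embedded nuls skipped
blogs <- readLines(blogs_path, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_path, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter_path, encoding = "UTF-8", skipNul = TRUE)
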
# Collects the data sets into a named list
data_list <- list(blogs = blogs, news = news, twitter = twitter)

Summary Statistics

# Generates summary statistics
stats <- data.frame(
  FileName = c("blogs", "news", "twitter"),
  FileSize = sapply(data_list, function(x) format(object.size(x), "Mb")),
  Lines = sapply(data_list, function(x) length(x)),
  LinesNEmpty = sapply(data_list, function(x) sum(nzchar(x))),
  Chars = sapply(data_list, function(x) sum(nchar(x))),
  CharsNWhite = sapply(data_list, function(x) sum(nchar(stringi::stri_replace_all_regex(x, "\\s+", ""))))
)

# Prints the Stats using kable
knitr::kable(stats, caption = "Summary Statistics for All Data")
Summary Statistics for All Data

|         | FileName | FileSize |   Lines | LinesNEmpty |     Chars | CharsNWhite |
|:--------|:---------|:---------|--------:|------------:|----------:|------------:|
| blogs   | blogs    | 255.4 Mb |  899288 |      899288 | 206824509 |   170389662 |
| news    | news     | 19.8 Mb  |   77259 |       77259 |  15639408 |    13072698 |
| twitter | twitter  | 319 Mb   | 2360148 |     2360148 | 162122651 |   134082634 |

Sampling the Data

# Sets the seed for reproducibility
set.seed(123)

# Defines the sample size as a percentage of each dataset
sample_pct <- 0.01

# Takes a random sample from each dataset
blogs_sample <- sample(blogs, round(length(blogs) * sample_pct))
news_sample <- sample(news, round(length(news) * sample_pct))
twitter_sample <- sample(twitter, round(length(twitter) * sample_pct))

# Combines the samples into a single dataset
sample_data <- c(blogs_sample, news_sample, twitter_sample)

# Splits the sample data back into blogs, news, and twitter
blogs_sample <- sample_data[1:length(blogs_sample)]
news_sample <- sample_data[(length(blogs_sample) + 1):(length(blogs_sample) + length(news_sample))]
twitter_sample <- sample_data[(length(blogs_sample) + length(news_sample) + 1):length(sample_data)]

# Defines the sample data list
sample_data_list <- list(blogs = blogs_sample, news = news_sample, twitter = twitter_sample)

Summary Statistics for the Sample Data

# Generates summary statistics for the sample data
sample_stats <- data.frame(
  FileName = c("blogs_sample", "news_sample", "twitter_sample"),
  FileSize = sapply(sample_data_list, function(x) format(object.size(x), "Mb")),
  Lines = sapply(sample_data_list, function(x) length(x)),
  LinesNEmpty = sapply(sample_data_list, function(x) sum(nzchar(x))),
  Chars = sapply(sample_data_list, function(x) sum(nchar(x))),
  CharsNWhite = sapply(sample_data_list, function(x) sum(nchar(stringi::stri_replace_all_regex(x, "\\s+", ""))))
)

# Prints the sample stats using kable
knitr::kable(sample_stats, caption = "Summary Statistics for Sampled Data")
Summary Statistics for Sampled Data

|         | FileName       | FileSize |  Lines | LinesNEmpty |   Chars | CharsNWhite |
|:--------|:---------------|:---------|-------:|------------:|--------:|------------:|
| blogs   | blogs_sample   | 2.5 Mb   |   8993 |        8993 | 2066092 |     1702426 |
| news    | news_sample    | 0.2 Mb   |    773 |         773 |  153678 |      128396 |
| twitter | twitter_sample | 3.2 Mb   |  23601 |       23601 | 1625149 |     1343631 |

Cleaning the Data and Building a Corpus

# Builds the corpus
corpus <- VCorpus(VectorSource(sample_data))

# Checks the size of the corpus in memory
print(object_size(corpus))
## 77.81 MB
# Cleans the corpus

# Converts all to lower case
clean_corpus <- tm_map(corpus, content_transformer(tolower))
# Removes punctuation marks
clean_corpus <- tm_map(clean_corpus, removePunctuation)
# Removes numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
# Removes whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
# Converts all to plain text document
clean_corpus <- tm_map(clean_corpus, PlainTextDocument)

# Converts clean_corpus to a character vector
clean_corpus_char <- unlist(lapply(clean_corpus, content))
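
As a quick sanity check (not part of the original pipeline), the first cleaned documents can be printed to confirm the transformations behaved as expected:

# Quick sanity check: the first two cleaned documents
head(clean_corpus_char, 2)
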

Tokenizing and Constructing the N-Grams

# Defines tokenizers for unigrams, bigrams, and trigrams
tokenizers <- list(
  uniTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1)),
  biTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)),
  triTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
)

# Constructs term-document matrices for unigrams, bigrams, and trigrams
tdm_list <- lapply(tokenizers, function(tokenizer) {
  TermDocumentMatrix(clean_corpus, control = list(tokenize = tokenizer))
})

# Names the list elements
names(tdm_list) <- c("uniMatrix", "biMatrix", "triMatrix")
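
Before filtering for frequent terms, it can help to confirm how large each matrix is; a minimal check:

# Rows are terms, columns are documents
sapply(tdm_list, dim)
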

# Finds frequent terms
uniCorpus <- findFreqTerms(tdm_list$uniMatrix, lowfreq = 20)
biCorpus <- findFreqTerms(tdm_list$biMatrix, lowfreq = 20)
triCorpus <- findFreqTerms(tdm_list$triMatrix, lowfreq = 20)

# Calculates frequencies for the frequent terms
uniCorpusFreq <- rowSums(as.matrix(tdm_list$uniMatrix[uniCorpus,]))
uniCorpusFreq <- data.frame(word = names(uniCorpusFreq), frequency = uniCorpusFreq)

biCorpusFreq <- rowSums(as.matrix(tdm_list$biMatrix[biCorpus,]))
biCorpusFreq <- data.frame(word = names(biCorpusFreq), frequency = biCorpusFreq)

triCorpusFreq <- rowSums(as.matrix(tdm_list$triMatrix[triCorpus,]))
triCorpusFreq <- data.frame(word = names(triCorpusFreq), frequency = triCorpusFreq)
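
Note that as.matrix() densifies the sparse term-document matrix, which can exhaust memory on larger samples. A memory-friendlier alternative (a sketch) is the sparse row_sums() from the slam package, which tm already depends on:

# Sparse alternative to rowSums(as.matrix(...)), avoiding densification
uniCorpusFreqSparse <- slam::row_sums(tdm_list$uniMatrix[uniCorpus, ])
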

# Sets the order of each corpus frequency to descending
uniCorpusFreqDescend <- arrange(uniCorpusFreq, desc(frequency))
biCorpusFreqDescend <- arrange(biCorpusFreq, desc(frequency))
triCorpusFreqDescend <- arrange(triCorpusFreq, desc(frequency))

Data Visualisation

# Visualises the top 20 Unigrams
uniBar <- ggplot(data = head(uniCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "red") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle("Top 20 Unigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
uniBar

# Visualises the top 20 Bigrams
biBar <- ggplot(data = head(biCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle("Top 20 Bigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
biBar

# Visualises the top 20 Trigrams
triBar <- ggplot(data = head(triCorpusFreqDescend, 20), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "blue") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle("Top 20 Trigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
triBar

Unique Data Visualisation

# Adds a column to each data frame to indicate the type of n-gram
uniCorpusFreqDescend$type <- "Unigram"
biCorpusFreqDescend$type <- "Bigram"
triCorpusFreqDescend$type <- "Trigram"

# Selects the top 10 words for each type of n-gram
top_uniCorpusFreqDescend <- head(uniCorpusFreqDescend, 10)
top_biCorpusFreqDescend <- head(biCorpusFreqDescend, 10)
top_triCorpusFreqDescend <- head(triCorpusFreqDescend, 10)

# Creates a bubble cloud for the top 10 words of Unigram
ggplot(top_uniCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1, 20)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Bubble Cloud for Top 10 Words of Unigram", x = "Words", y = "Frequency", size = "Frequency")

# Creates a bubble cloud for the top 10 words of Bigram
ggplot(top_biCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1, 20)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Bubble Cloud for Top 10 Words of Bigram", x = "Words", y = "Frequency", size = "Frequency")

# Creates a bubble cloud for the top 10 words of Trigram
ggplot(top_triCorpusFreqDescend, aes(x = reorder(word, -frequency), y = frequency, size = frequency)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1, 20)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Bubble Cloud for Top 10 Words of Trigram", x = "Words", y = "Frequency", size = "Frequency")

Summary

From the results above, it can be seen that the combined data sets share a core of very common words and word strings. Because the raw files are large, we took a 1% sample, so one open question is whether a 1% sample is enough to train an accurate prediction model. We also need to consider removing stopwords, and filtering profane words so that the model never suggests them; a sketch of both steps follows.
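
A minimal sketch of both ideas, using tm's built-in English stopword list; profanity.txt is a hypothetical file, as a banned-words list would have to be sourced separately:

# Removes common English stopwords from the cleaned corpus
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("en"))

# Hypothetical profanity filter: profanity.txt is a placeholder file
profanity_words <- readLines("profanity.txt")
clean_corpus <- tm_map(clean_corpus, removeWords, profanity_words)
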

What Next?