Introduction

This report explores the SwiftKey text dataset used for building a predictive text model. The goal is to understand the structure of the data and identify patterns that will support the development of a next-word prediction algorithm.

The dataset consists of three sources: blogs, news articles, and Twitter posts.

Load Packages

library(stringi)
library(dplyr)
library(ggplot2)
library(tm)
library(wordcloud)
library(knitr)

Load Data

# Paths to the three English-language source files, resolved relative to the
# current working directory.
blogsFile <- "en_US.blogs.txt"
newsFile <- "en_US.news.txt"
twitterFile <- "en_US.twitter.txt"

# Read all lines of one text file as a character vector marked as UTF-8,
# dropping embedded NUL bytes.
#
# The file is read through an explicit binary connection rather than by
# passing the path to readLines(). NOTE(review): text-mode reads on Windows
# are known to stop at an embedded Ctrl-Z (0x1A) byte, which would silently
# truncate the data; binary mode avoids that. readLines() still accepts LF,
# CRLF and CR line endings on a binary connection.
read_text_lines <- function(path) {
  if (!file.exists(path)) {
    stop("Input file not found: ", path, call. = FALSE)
  }
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines(blogsFile)
news <- read_text_lines(newsFile)
twitter <- read_text_lines(twitterFile)

Summary Statistics

# Per-source overview table: size on disk (MB), number of lines, total word
# count (stringi word-boundary rules) and total character count.
source_paths <- c(blogsFile, newsFile, twitterFile)
source_texts <- list(blogs, news, twitter)

# Totals are summed per source; integer(1) keeps the columns integer-typed.
total_words <- vapply(
  source_texts,
  function(txt) sum(stri_count_words(txt)),
  integer(1)
)
total_chars <- vapply(
  source_texts,
  function(txt) sum(nchar(txt)),
  integer(1)
)

summaryData <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  File_Size_MB = round(file.size(source_paths) / 1024^2, 2),
  Lines = lengths(source_texts),
  Words = total_words,
  Characters = total_chars
)

kable(summaryData, caption = "Summary Statistics of Text Data")
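Table: Summary Statistics of Text Data

| File    | File_Size_MB |   Lines |    Words | Characters |
|:--------|-------------:|--------:|---------:|-----------:|
| Blogs   |       200.42 |  899288 | 37546250 |  206824505 |
| News    |       196.28 | 1010242 | 34762395 |  203223159 |
| Twitter |       159.36 | 2360148 | 30093413 |  162096241 |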

Sampling the Data

Because the complete dataset contains millions of lines of text, a random sample of 5,000 lines from each source is used for exploratory analysis. Sampling greatly reduces computation time while preserving representative language patterns.

# Fix the RNG state so the same lines are drawn on every run.
set.seed(12345)

# Number of lines drawn (without replacement) from each source.
sample_size <- 5000

# Draw row indices first, then subset; the draws happen in the order
# blogs, news, twitter.
sampleBlogs <- blogs[sample.int(length(blogs), sample_size)]
sampleNews <- news[sample.int(length(news), sample_size)]
sampleTwitter <- twitter[sample.int(length(twitter), sample_size)]

Cleaning Function

#' Build a cleaned tm corpus from raw text.
#'
#' Each element of `text` becomes one document. The documents are lower-cased,
#' English stop words are removed, punctuation and digits are stripped, and
#' runs of whitespace are collapsed.
#'
#' @param text Character vector with one document per element.
#' @return The transformed corpus created by `Corpus(VectorSource(text))`.
clean_corpus <- function(text) {
  corpus <- Corpus(VectorSource(text))

  corpus <- tm_map(corpus, content_transformer(tolower))
  # Stop words are removed BEFORE punctuation: stopwords("english") contains
  # contractions such as "don't" and "can't", which can no longer match once
  # the apostrophe has been stripped and would otherwise survive as
  # "dont" / "cant" and pollute the frequency tables.
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  # Last, so the gaps left by the removals above are collapsed too.
  corpus <- tm_map(corpus, stripWhitespace)

  corpus
}

Create Corpora

# Run every sample through the same cleaning pipeline, then expose one
# corpus object per source.
cleaned_corpora <- lapply(
  list(blogs = sampleBlogs, news = sampleNews, twitter = sampleTwitter),
  clean_corpus
)

blog_corpus <- cleaned_corpora[["blogs"]]
news_corpus <- cleaned_corpora[["news"]]
twitter_corpus <- cleaned_corpora[["twitter"]]

Frequency Function

#' Tabulate term frequencies for a corpus.
#'
#' Builds a document-term matrix with tm's default settings and totals the
#' counts of every term over all documents.
#'
#' @param corpus A tm corpus, e.g. the result of `clean_corpus()`.
#' @return A data frame with a character column `Word` and a numeric column
#'   `Frequency`, sorted by decreasing frequency.
get_freq <- function(corpus) {
  dtm <- DocumentTermMatrix(corpus)

  # Total the stored non-zero entries per term straight from the sparse
  # triplet fields (`j` = column index, `v` = count). Converting with
  # as.matrix() would allocate a dense documents x terms matrix, which is
  # mostly zeros and can exhaust memory for a few thousand documents.
  totals <- numeric(ncol(dtm))
  if (length(dtm$v) > 0) {
    by_term <- rowsum(dtm$v, dtm$j)
    totals[as.integer(rownames(by_term))] <- by_term[, 1]
  }
  names(totals) <- colnames(dtm)

  freq <- sort(totals, decreasing = TRUE)

  # stringsAsFactors = FALSE keeps `Word` character on R < 4.0 as well, so
  # callers can safely concatenate Word columns with c().
  data.frame(
    Word = as.character(names(freq)),
    Frequency = unname(freq),
    stringsAsFactors = FALSE
  )
}

Word Frequencies

# Term-frequency table (Word, Frequency) for each cleaned corpus, computed
# in the order blogs, news, twitter.
freq_tables <- lapply(
  list(blogs = blog_corpus, news = news_corpus, twitter = twitter_corpus),
  get_freq
)

blog_freq <- freq_tables[["blogs"]]
news_freq <- freq_tables[["news"]]
twitter_freq <- freq_tables[["twitter"]]

Individual Word Frequency Plots

# Horizontal bar chart of a frequency table (columns Word, Frequency).
# Bars are ordered by frequency, drawn in `bar_fill`, and titled `plot_title`.
plot_top_words <- function(top_words, bar_fill, plot_title) {
  bar_mapping <- aes(x = reorder(Word, Frequency), y = Frequency)

  ggplot(top_words, bar_mapping) +
    geom_col(fill = bar_fill) +
    coord_flip() +
    labs(title = plot_title, x = "Word", y = "Frequency")
}

# The 15 most frequent terms per source, one chart each.
blog_top <- head(blog_freq, 15)
plot_top_words(blog_top, "blue", "Top Words in Blogs")

news_top <- head(news_freq, 15)
plot_top_words(news_top, "green", "Top Words in News")

twitter_top <- head(twitter_freq, 15)
plot_top_words(twitter_top, "red", "Top Words in Twitter")

Side-by-Side Word Clouds

# Draw one word cloud (at most 60 words, most frequent placed first) from a
# frequency table with columns Word and Frequency, coloured with the named
# 8-colour ColorBrewer palette.
draw_word_cloud <- function(freq_table, palette_name) {
  wordcloud(freq_table$Word, freq_table$Frequency,
            max.words = 60,
            colors = brewer.pal(8, palette_name),
            scale = c(3, 0.4),
            random.order = FALSE)
}

# Switch to a 1 x 3 panel layout, remembering the previous setting so it can
# be restored afterwards instead of assuming it was c(1, 1).
old_par <- par(mfrow = c(1, 3))

draw_word_cloud(blog_freq, "Dark2")
draw_word_cloud(news_freq, "Set2")
draw_word_cloud(twitter_freq, "Paired")

par(old_par)

Single Combined Comparison Chart

# Number of top terms taken from each source for the comparison chart.
top_n_words <- 10

top_blog <- head(blog_freq, top_n_words)
top_news <- head(news_freq, top_n_words)
top_twitter <- head(twitter_freq, top_n_words)

# Stack the three tables into one long data frame. The Source labels are
# repeated according to the actual row counts, so a source with fewer than
# `top_n_words` distinct terms no longer causes a length-mismatch error.
# as.character() guards against factor Word columns being concatenated as
# integer codes.
combined <- data.frame(
  Word = c(as.character(top_blog$Word),
           as.character(top_news$Word),
           as.character(top_twitter$Word)),

  Frequency = c(top_blog$Frequency,
                top_news$Frequency,
                top_twitter$Frequency),

  Source = rep(c("Blogs", "News", "Twitter"),
               times = c(nrow(top_blog),
                         nrow(top_news),
                         nrow(top_twitter))),

  stringsAsFactors = FALSE
)

# Dodged bars: a word that is a top term in several sources gets one bar per
# source. reorder() uses its default summary (the mean) of Frequency per word
# to order the axis.
ggplot(combined,
       aes(x = reorder(Word, Frequency),
           y = Frequency,
           fill = Source)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Top Words Comparison Across Blogs, News, and Twitter",
       x = "Word",
       y = "Frequency",
       fill = "Source") +
  scale_fill_manual(values = c(
    "Blogs" = "blue",
    "News" = "green",
    "Twitter" = "red"
  ))

Findings

The analysis shows clear differences in language patterns across the three datasets, as seen in the per-source word-frequency plots and word clouds above.

Despite these differences, all datasets share a core set of high-frequency words common in English language usage.

Next Steps

The next steps toward the final model are to build n-gram models (unigram, bigram, trigram, and quadgram), estimate next-word probabilities, implement a back-off prediction algorithm, and deploy an interactive Shiny application for real-time prediction.

Conclusion

This exploratory analysis confirms that the dataset is suitable for predictive modeling. The identified patterns and word frequencies provide a strong foundation for building a next-word prediction algorithm.