Introduction

This report explores the SwiftKey text dataset used for building a predictive text model. The goal is to understand the structure of the data and identify patterns that will support the development of a next-word prediction algorithm.

The dataset consists of three sources:

- Blogs
- News articles
- Twitter posts

Load Packages

library(stringi)
library(dplyr)
library(ggplot2)
library(tm)
library(wordcloud)
library(knitr)

Load Data

# Paths to the three English-language corpus files, relative to the working
# directory.
blogsFile <- "en_US.blogs.txt"
newsFile <- "en_US.news.txt"
twitterFile <- "en_US.twitter.txt"

# Read every line of a text file, marking the strings as UTF-8.
#
# The file is opened through a binary-mode connection so that control
# characters embedded in the text (such as SUB / 0x1A, which can end a
# text-mode read early on Windows) cannot silently truncate the result.
# Embedded NUL bytes are skipped, and the connection is closed even if the
# read fails.
#
# path: length-one character vector naming the file to read.
# Returns a character vector with one element per line.
read_text_lines <- function(path) {
  if (!file.exists(path)) {
    stop("Input file not found: ", path, call. = FALSE)
  }
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines(blogsFile)
news <- read_text_lines(newsFile)
twitter <- read_text_lines(twitterFile)

Summary Statistics

# Keep the file paths and the loaded text side by side (same order: blogs,
# news, twitter) so each statistic is computed in one pass over the list.
corpus_paths <- c(blogsFile, newsFile, twitterFile)
corpus_texts <- list(blogs, news, twitter)

# Per-source totals: number of lines, words (stringi word-boundary count) and
# characters.
line_totals <- vapply(corpus_texts, length, integer(1))
word_totals <- vapply(
  corpus_texts,
  function(txt) sum(stri_count_words(txt)),
  integer(1)
)
char_totals <- vapply(
  corpus_texts,
  function(txt) sum(nchar(txt)),
  integer(1)
)

summaryData <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  # Bytes on disk converted to mebibytes, rounded to two decimals.
  File_Size_MB = round(file.size(corpus_paths) / 1024^2, 2),
  Lines = line_totals,
  Words = word_totals,
  Characters = char_totals
)

kable(summaryData, caption = "Summary Statistics of Text Data")

Summary Statistics of Text Data
File	File_Size_MB	Lines	Words	Characters
Blogs	200.42	899288	37546250	206824505
News	196.28	1010242	34762395	203223159
Twitter	159.36	2360148	30093413	162096241

Sampling the Data

Because the complete dataset contains millions of lines of text, a random sample of 5,000 lines from each source is used for exploratory analysis. Sampling greatly reduces computation time while still capturing representative language patterns.

# Fix the RNG state so the same lines are drawn on every run.
set.seed(12345)

# Number of lines drawn, without replacement, from each source.
sample_size <- 5000

# Draw row indices first and subset afterwards; the draws happen in the order
# blogs, news, twitter.
sampleBlogs <- blogs[sample.int(length(blogs), sample_size)]
sampleNews <- news[sample.int(length(news), sample_size)]
sampleTwitter <- twitter[sample.int(length(twitter), sample_size)]

Cleaning Function

#' Build a cleaned tm corpus from raw text.
#'
#' Transformations are applied in this order: lower-case, remove English stop
#' words, remove punctuation, remove digits, collapse repeated whitespace.
#'
#' Stop words are removed BEFORE punctuation on purpose: tm's English stop
#' list contains contractions written with an apostrophe ("don't", "i'm",
#' ...). Stripping punctuation first would turn them into "dont" / "im",
#' which no longer match the list and would survive as frequent "words".
#'
#' @param text Character vector; each element becomes one document.
#' @return A tm corpus holding the cleaned documents.
clean_corpus <- function(text) {
  corpus <- Corpus(VectorSource(text))

  # Lower-case first so the (lower-case) stop list matches every variant.
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)

  # The removals above leave gaps; collapse them to single spaces last.
  tm_map(corpus, stripWhitespace)
}

Create Corpora

# Run each sample through the shared cleaning pipeline, then expose the
# results under the names the rest of the report uses.
cleaned_corpora <- lapply(
  list(blogs = sampleBlogs, news = sampleNews, twitter = sampleTwitter),
  clean_corpus
)

blog_corpus <- cleaned_corpora[["blogs"]]
news_corpus <- cleaned_corpora[["news"]]
twitter_corpus <- cleaned_corpora[["twitter"]]

Frequency Function

#' Tabulate term frequencies for a corpus, most frequent first.
#'
#' Counts are summed straight from the sparse triplet representation of the
#' document-term matrix (`j` = term column index, `v` = count). This avoids
#' materialising a dense documents x terms matrix, which for thousands of
#' documents and tens of thousands of terms can need gigabytes of memory.
#'
#' @param corpus A tm corpus.
#' @return A data frame with columns `Word` (term) and `Frequency` (total
#'   count across all documents), sorted by decreasing frequency.
get_freq <- function(corpus) {
  dtm <- DocumentTermMatrix(corpus)

  # One slot per term, in matrix column order.
  totals <- numeric(nTerms(dtm))
  if (length(dtm$v) > 0) {
    per_term <- tapply(dtm$v, dtm$j, sum)
    totals[as.integer(names(per_term))] <- as.vector(per_term)
  }
  names(totals) <- Terms(dtm)

  totals <- sort(totals, decreasing = TRUE)

  data.frame(
    Word = names(totals),
    Frequency = unname(totals)
  )
}

Word Frequencies

# Build one frequency table per cleaned corpus, then expose each under the
# name used by the plotting code.
freq_tables <- lapply(
  list(blogs = blog_corpus, news = news_corpus, twitter = twitter_corpus),
  get_freq
)

blog_freq <- freq_tables[["blogs"]]
news_freq <- freq_tables[["news"]]
twitter_freq <- freq_tables[["twitter"]]

Individual Word Frequency Plots

# Horizontal bar chart of a word-frequency table.
#
# top_words:  data frame with `Word` and `Frequency` columns.
# bar_fill:   fill colour for the bars.
# plot_title: chart title.
# Bars are ordered by frequency; coord_flip() turns them horizontal.
plot_top_words <- function(top_words, bar_fill, plot_title) {
  ggplot(top_words, aes(x = reorder(Word, Frequency), y = Frequency)) +
    geom_col(fill = bar_fill) +
    coord_flip() +
    labs(title = plot_title, x = "Word", y = "Frequency")
}

# Fifteen most frequent words per source, each drawn in its own colour.
blog_top <- head(blog_freq, 15)
plot_top_words(blog_top, "blue", "Top Words in Blogs")

news_top <- head(news_freq, 15)
plot_top_words(news_top, "green", "Top Words in News")

twitter_top <- head(twitter_freq, 15)
plot_top_words(twitter_top, "red", "Top Words in Twitter")

Side-by-Side Word Clouds

# One word cloud per source, drawn left to right in a single row.
# Each entry pairs a frequency table with the RColorBrewer palette used for it.
cloud_specs <- list(
  list(freq = blog_freq, palette = "Dark2"),
  list(freq = news_freq, palette = "Set2"),
  list(freq = twitter_freq, palette = "Paired")
)

# par() returns the previous settings; keep them so the layout in effect
# before this section is restored afterwards instead of assuming c(1, 1).
old_par <- par(mfrow = c(1, 3))

for (spec in cloud_specs) {
  wordcloud(spec$freq$Word, spec$freq$Frequency,
            max.words = 60,
            colors = brewer.pal(8, spec$palette),
            scale = c(3, 0.4),
            random.order = FALSE)
}

par(old_par)

Single Combined Comparison Chart

# Ten most frequent words from each source.
top_blog <- head(blog_freq, 10)
top_news <- head(news_freq, 10)
top_twitter <- head(twitter_freq, 10)

# Stack the three tables into one long data frame tagged by source.
# The label counts come from the actual row counts (not a literal 10) so the
# columns stay the same length even if a table has fewer than ten terms.
combined <- data.frame(
  Word = c(top_blog$Word,
           top_news$Word,
           top_twitter$Word),

  Frequency = c(top_blog$Frequency,
                top_news$Frequency,
                top_twitter$Frequency),

  Source = rep(c("Blogs", "News", "Twitter"),
               times = c(nrow(top_blog),
                         nrow(top_news),
                         nrow(top_twitter)))
)

# Dodged bars: a word that appears in several sources gets one bar per source,
# coloured to match the individual charts.
ggplot(combined,
       aes(x = reorder(Word, Frequency),
           y = Frequency,
           fill = Source)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Top Words Comparison Across Blogs, News, and Twitter",
       x = "Word",
       y = "Frequency",
       fill = "Source") +
  scale_fill_manual(values = c(
    "Blogs" = "blue",
    "News" = "green",
    "Twitter" = "red"
  ))

Findings

The analysis shows clear differences in language patterns across datasets:

Twitter contains shorter, more informal language.
News articles use structured and formal vocabulary.
Blogs show a mix of conversational and descriptive writing.

Despite these differences, all three datasets share a core set of high-frequency words common in everyday English usage.

Next Steps

The next phase of the project will:

- Build n-gram models (unigram, bigram, trigram, quadgram)
- Estimate next-word probabilities
- Implement a back-off prediction algorithm
- Deploy an interactive Shiny application for real-time prediction

Conclusion

This exploratory analysis confirms that the dataset is suitable for predictive modeling. The identified patterns and word frequencies provide a strong foundation for building a next-word prediction algorithm.

Data_Science_Capstone_Milestone_Report

LaKeya King

2026-06-16