This report explores the SwiftKey text dataset used for building a predictive text model. The goal is to understand the structure of the data and identify patterns that will support the development of a next-word prediction algorithm.
The dataset consists of three sources: blogs, news articles, and Twitter posts.
library(stringi)
library(dplyr)
library(ggplot2)
library(tm)
library(wordcloud)
library(knitr)
# Locations of the three en_US source files, relative to the working directory.
blogsFile <- "en_US.blogs.txt"
newsFile <- "en_US.news.txt"
twitterFile <- "en_US.twitter.txt"

# Read every line of a file, marking the strings as UTF-8 and skipping any
# embedded NUL bytes.
read_utf8_lines <- function(path) {
  readLines(path, encoding = "UTF-8", skipNul = TRUE)
}

# One character vector per source; each element is one line of text.
blogs <- read_utf8_lines(blogsFile)
news <- read_utf8_lines(newsFile)
twitter <- read_utf8_lines(twitterFile)
# Per-source summary: on-disk size, line count, word count, character count.
source_paths <- c(blogsFile, newsFile, twitterFile)

# Unnamed list on purpose: named count vectors would become the row names of
# the data frame below.
source_texts <- list(blogs, news, twitter)

size_mb <- round(file.size(source_paths) / 1024^2, 2)
line_counts <- vapply(source_texts, length, integer(1))
word_counts <- vapply(
  source_texts,
  function(txt) sum(stri_count_words(txt)),
  integer(1)
)
char_counts <- vapply(
  source_texts,
  function(txt) sum(nchar(txt)),
  integer(1)
)

summaryData <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  File_Size_MB = size_mb,
  Lines = line_counts,
  Words = word_counts,
  Characters = char_counts
)

kable(summaryData, caption = "Summary Statistics of Text Data")
| File | File_Size_MB | Lines | Words | Characters |
|---|---|---|---|---|
| Blogs | 200.42 | 899288 | 37546250 | 206824505 |
| News | 196.28 | 1010242 | 34762395 | 203223159 |
| Twitter | 159.36 | 2360148 | 30093413 | 162096241 |
Because the complete data set contains millions of lines of text, a random sample is used for exploratory analysis. Sampling greatly reduces computation time while preserving representative language patterns.
# Draw a fixed-size random sample of lines from each source. The seed is set
# once and the three draws happen in a fixed order, so the samples are
# reproducible from run to run.
lines_per_sample <- 5000
set.seed(12345)
sampleBlogs <- sample(blogs, size = lines_per_sample)
sampleNews <- sample(news, size = lines_per_sample)
sampleTwitter <- sample(twitter, size = lines_per_sample)
# Build a cleaned tm corpus from raw text.
#
# Args:
#   text: character vector; each element becomes one document.
#
# Returns:
#   A tm corpus whose documents have been lower-cased and stripped of English
#   stop words, punctuation, digits, and redundant whitespace.
clean_corpus <- function(text) {
  corpus <- Corpus(VectorSource(text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Stop words are removed while apostrophes are still present: the English
  # stop-word list contains contractions such as "don't" and "i'm", which no
  # longer match once removePunctuation() has turned them into "dont" / "im".
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  # Runs last so the gaps left by the removals above are collapsed.
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}
# Run each sampled source through the same cleaning pipeline.
blog_corpus <- clean_corpus(text = sampleBlogs)
news_corpus <- clean_corpus(text = sampleNews)
twitter_corpus <- clean_corpus(text = sampleTwitter)
# Tabulate how often each term occurs across a whole corpus.
#
# Args:
#   corpus: a tm corpus.
#
# Returns:
#   A data frame with columns Word and Frequency, one row per term, ordered
#   from most to least frequent.
get_freq <- function(corpus) {
  # as.matrix() materialises the full documents-by-terms matrix, so memory
  # use grows with (number of documents) x (number of distinct terms).
  term_matrix <- as.matrix(DocumentTermMatrix(corpus))
  term_totals <- sort(colSums(term_matrix), decreasing = TRUE)
  data.frame(
    Word = names(term_totals),
    Frequency = unname(term_totals)
  )
}
# Word-frequency table (most frequent first) for each cleaned corpus.
blog_freq <- get_freq(corpus = blog_corpus)
news_freq <- get_freq(corpus = news_corpus)
twitter_freq <- get_freq(corpus = twitter_corpus)
# Horizontal bar chart of a word-frequency table (columns Word, Frequency).
# Words are ordered by frequency, so after coord_flip() the most frequent word
# is drawn at the top.
plot_top_words <- function(top_words, plot_title, bar_fill) {
  ggplot(top_words, aes(x = reorder(Word, Frequency), y = Frequency)) +
    geom_col(fill = bar_fill) +
    coord_flip() +
    labs(title = plot_title, x = "Word", y = "Frequency")
}

# 15 most frequent words per source, each followed by its chart.
blog_top <- head(blog_freq, 15)
plot_top_words(blog_top, "Top Words in Blogs", "blue")

news_top <- head(news_freq, 15)
plot_top_words(news_top, "Top Words in News", "green")

twitter_top <- head(twitter_freq, 15)
plot_top_words(twitter_top, "Top Words in Twitter", "red")
# One word cloud per source, drawn side by side. Each entry pairs a frequency
# table with the ColorBrewer palette used to colour its cloud.
cloud_specs <- list(
  list(freq = blog_freq, palette = "Dark2"),
  list(freq = news_freq, palette = "Set2"),
  list(freq = twitter_freq, palette = "Paired")
)

par(mfrow = c(1, 3))
for (spec in cloud_specs) {
  wordcloud(
    spec[["freq"]]$Word,
    spec[["freq"]]$Frequency,
    max.words = 60,
    colors = brewer.pal(8, spec[["palette"]]),
    scale = c(3, 0.4),
    random.order = FALSE
  )
}
# Back to a single-panel layout for any plots that follow.
par(mfrow = c(1, 1))
# Ten most frequent words from each source, stacked into one long table with
# a Source label so the three sources can share a single chart.
words_per_source <- 10
top_blog <- head(blog_freq, words_per_source)
top_news <- head(news_freq, words_per_source)
top_twitter <- head(twitter_freq, words_per_source)

combined <- rbind(top_blog, top_news, top_twitter)
combined$Source <- rep(c("Blogs", "News", "Twitter"), each = words_per_source)

# Same colour per source as in the individual bar charts.
source_colours <- c("Blogs" = "blue", "News" = "green", "Twitter" = "red")

ggplot(
  combined,
  aes(x = reorder(Word, Frequency), y = Frequency, fill = Source)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Top Words Comparison Across Blogs, News, and Twitter",
    x = "Word",
    y = "Frequency",
    fill = "Source"
  ) +
  scale_fill_manual(values = source_colours)
The analysis shows clear differences in language patterns across datasets:
Despite differences, all datasets share a core set of high-frequency words common in English language usage.
The final model will build n-gram models (unigram, bigram, trigram, quadgram), estimate next-word probabilities, implement a back-off prediction algorithm, and be deployed as an interactive Shiny application for real-time prediction.
This exploratory analysis confirms that the dataset is suitable for predictive modeling. The identified patterns and word frequencies provide a strong foundation for building a next-word prediction algorithm.