This report explores the SwiftKey corpus provided by Johns Hopkins University on Coursera. The dataset contains text from three sources: Blogs, News, and Twitter. The goal is to build a next-word prediction algorithm and deploy it as a Shiny app.
library(tm)
library(ggplot2)
# Read the three English-language source files line by line
blogs <- readLines("D:/SEMESTER 6 HORE/Coursera/capstone/final/en_US/en_US.blogs.txt",
                   encoding = "UTF-8", skipNul = TRUE)
news <- readLines("D:/SEMESTER 6 HORE/Coursera/capstone/final/en_US/en_US.news.txt",
                  encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("D:/SEMESTER 6 HORE/Coursera/capstone/final/en_US/en_US.twitter.txt",
                     encoding = "UTF-8", skipNul = TRUE)
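Note: on some platforms, readLines() in text mode stops early at an embedded control character in en_US.news.txt. If the News line count looks low, reading through a binary connection is a common workaround; a minimal sketch:

# Workaround sketch: a binary connection prevents an embedded SUB (0x1A)
# character from truncating the read of the news file on Windows
con <- file("D:/SEMESTER 6 HORE/Coursera/capstone/final/en_US/en_US.news.txt",
            open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)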
# Corpus statistics: line counts and rough word counts (splitting on spaces)
num_lines <- c(length(blogs), length(news), length(twitter))
num_words <- c(sum(lengths(strsplit(blogs, " "))),
               sum(lengths(strsplit(news, " "))),
               sum(lengths(strsplit(twitter, " "))))
# File sizes in megabytes
file_size_mb <- round(
  file.info(c(
    "D:/SEMESTER 6 HORE/Coursera/capstone/final/en_US/en_US.blogs.txt",
    "D:/SEMESTER 6 HORE/Coursera/capstone/final/en_US/en_US.news.txt",
    "D:/SEMESTER 6 HORE/Coursera/capstone/final/en_US/en_US.twitter.txt"
  ))$size / 1e6, 1)
summary_table <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  Lines = num_lines,
  Words = num_words,
  Size_MB = file_size_mb
)
knitr::kable(summary_table, caption = "Summary of the Three Text Sources")
| Source  | Lines   | Words    | Size_MB |
|---------|---------|----------|---------|
| Blogs   | 899288  | 37334131 | 210.2   |
| News    | 1010206 | 34371031 | 205.8   |
| Twitter | 2360148 | 30373583 | 167.1   |

Twitter contributes the most lines but the fewest words, reflecting the short-message format of tweets; the blogs file is the largest by both word count and size.
# Draw a reproducible sample of 1,000 lines from each source (3,000 lines total)
set.seed(1234)
small_sample <- c(sample(blogs, 1000),
                  sample(news, 1000),
                  sample(twitter, 1000))
# Build a corpus from the sample and normalize it:
# lowercase, then remove punctuation, digits, and extra whitespace
corp2 <- VCorpus(VectorSource(small_sample))
corp2 <- tm_map(corp2, content_transformer(tolower))
corp2 <- tm_map(corp2, removePunctuation)
corp2 <- tm_map(corp2, removeNumbers)
corp2 <- tm_map(corp2, stripWhitespace)
# Term-document matrix and overall term frequencies across the sample
dtm <- TermDocumentMatrix(corp2)
mat <- as.matrix(dtm)
freq <- sort(rowSums(mat), decreasing = TRUE)
df_freq <- data.frame(word = names(freq), frequency = freq)
# Plot the 20 most frequent terms in the sample
ggplot(df_freq[1:20, ], aes(x = reorder(word, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "#2e86ab") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words",
       x = "Word", y = "Frequency") +
  theme_minimal()
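Because the cleaning pipeline above keeps stop words, the top-20 plot is dominated by common function words such as "the" and "and". A minimal sketch of filtering them with tm's built-in English stop word list (an optional step, not applied in the analysis above):

# Optional: drop common English stop words before counting frequencies
corp_nostop <- tm_map(corp2, removeWords, stopwords("en"))
corp_nostop <- tm_map(corp_nostop, stripWhitespace)
dtm_nostop <- TermDocumentMatrix(corp_nostop)
freq_nostop <- sort(rowSums(as.matrix(dtm_nostop)), decreasing = TRUE)
head(freq_nostop, 20)

For next-word prediction itself, stop words are kept, since they are themselves valid predictions.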
# Compare the three sources by number of lines
ggplot(summary_table, aes(x = Source, y = Lines, fill = Source)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#2e86ab", "#a23b72", "#f18f01")) +
  labs(title = "Number of Lines per Source",
       x = "Source", y = "Number of Lines") +
  theme_minimal()

# Compare the three sources by number of words
ggplot(summary_table, aes(x = Source, y = Words, fill = Source)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#2e86ab", "#a23b72", "#f18f01")) +
  labs(title = "Number of Words per Source",
       x = "Source", y = "Number of Words") +
  theme_minimal()
The next-word prediction model will use an N-gram approach: frequency tables of bigrams, trigrams, and quadgrams will be built from a cleaned sample of the corpus, and the predictor will back off to a lower-order n-gram whenever a context has not been seen at the higher order. The resulting model will then be wrapped in a Shiny app.
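As a proof of concept, the sketch below builds bigram and trigram frequency tables from the cleaned corp2 sample and looks up the next word with a simple stupid-backoff-style rule. The predict_next helper and its fallback word are illustrative assumptions, not the final model:

# Sketch of an n-gram predictor over the cleaned sample (illustrative only)
clean_lines <- sapply(corp2, as.character)

# Split a line into overlapping n-grams of n words
ngrams <- function(line, n) {
  w <- unlist(strsplit(line, "\\s+"))
  w <- w[w != ""]
  if (length(w) < n) return(character(0))
  vapply(seq_len(length(w) - n + 1),
         function(i) paste(w[i:(i + n - 1)], collapse = " "),
         character(1))
}

bigram_freq  <- table(unlist(lapply(clean_lines, ngrams, n = 2)))
trigram_freq <- table(unlist(lapply(clean_lines, ngrams, n = 3)))

# Try the trigram table first, then back off to the bigram table,
# then to a fixed fallback word
predict_next <- function(phrase) {
  w <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  if (length(w) == 2) {
    hits <- trigram_freq[startsWith(names(trigram_freq), paste(w[1], w[2], ""))]
    if (length(hits) > 0)
      return(tail(strsplit(names(which.max(hits)), " ")[[1]], 1))
  }
  hits <- bigram_freq[startsWith(names(bigram_freq), paste0(tail(w, 1), " "))]
  if (length(hits) > 0)
    return(tail(strsplit(names(which.max(hits)), " ")[[1]], 1))
  "the"  # corpus-level fallback
}

predict_next("thanks for the")

In the final model the n-gram tables will be precomputed and saved, so the Shiny app only performs fast lookups at prediction time.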