This report presents an exploratory data analysis of the Capstone dataset provided by SwiftKey. The ultimate goal of this project is to build a predictive text engine and deploy it as an interactive Shiny web application that suggests the most likely next word as a user types.
library(stringr)
library(dplyr)
library(tidytext)
library(ggplot2)
# Absolute locations of the three en_US corpus files (blogs, news, Twitter).
blogs_path <- "C:/Users/PC/Downloads/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news_path <- "C:/Users/PC/Downloads/Coursera-SwiftKey/final/en_US/en_US.news.txt"
twitter_path <- "C:/Users/PC/Downloads/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

# Read one corpus file into a character vector, one element per line.
# Strings are marked as UTF-8, embedded NUL bytes are skipped, and
# readLines() warnings are suppressed.
read_corpus_lines <- function(path) {
  readLines(con = path, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}

# Load each corpus fully into memory; later steps sample from these vectors.
blogs <- read_corpus_lines(blogs_path)
news <- read_corpus_lines(news_path)
twitter <- read_corpus_lines(twitter_path)
# Per-source overview table: on-disk file size in megabytes (rounded to two
# decimals) and the number of lines read from each corpus file.
corpus_size_bytes <- file.size(c(blogs_path, news_path, twitter_path))
corpus_line_counts <- lengths(list(blogs, news, twitter))

summary_df <- data.frame(
  File_Source = c("Blogs", "News Articles", "Twitter Posts"),
  Size_in_MB = round(corpus_size_bytes / (1024^2), 2),
  Total_Lines = corpus_line_counts
)
print(summary_df)
## File_Source Size_in_MB Total_Lines
## 1 Blogs 200.42 899288
## 2 News Articles 196.28 1010206
## 3 Twitter Posts 159.36 2360148
# Draw a reproducible random sample of lines from each corpus and collect it
# in a tibble with a running line id, ready for tokenisation.
set.seed(42)

# Fraction of each corpus to keep (1%).
sample_fraction <- 0.01

# Sample `sample_fraction` of `lines` without replacement. The size is floored
# explicitly so sample() always receives a whole number; this selects the same
# lines as the previous implicit truncation of the fractional size.
sample_corpus_lines <- function(lines) {
  sample(lines, floor(length(lines) * sample_fraction))
}

# Order matters for reproducibility: blogs, then news, then Twitter consume
# the random number stream in that sequence.
sample_data <- c(
  sample_corpus_lines(blogs),
  sample_corpus_lines(news),
  sample_corpus_lines(twitter)
)

# seq_along() instead of 1:length(): an empty sample yields zero ids rather
# than c(1, 0), so both columns always have matching lengths.
text_df <- tibble(line = seq_along(sample_data), text = sample_data)
# Build the bigram frequency table from the sampled text.
# Step 1: split the `text` column into two-word n-grams, one per row, stored
# in a new `bigram` column.
bigram_tokens <- unnest_tokens(text_df, bigram, text, token = "ngrams", n = 2)

# Step 2: discard rows whose bigram is NA.
bigram_tokens <- filter(bigram_tokens, !is.na(bigram))

# Step 3: count occurrences of each distinct bigram, most frequent first
# (the count column is named `n`).
bigrams_df <- count(bigram_tokens, bigram, sort = TRUE)
# Horizontal bar chart of the 15 most frequent bigrams. bigrams_df is already
# sorted by descending count, so its first 15 rows are the top 15.
top_bigrams <- head(bigrams_df, 15)

# reorder() sorts the bars by count; coord_flip() turns the columns into
# horizontal bars so the bigram labels sit on the vertical axis.
bigram_plot <- ggplot(top_bigrams, aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Top 15 Most Common Word Pairs (Bigrams)",
    x = "Word Combinations",
    y = "Occurrences"
  ) +
  coord_flip() +
  theme_minimal()

# Evaluating the plot object at top level renders it, as the original
# unassigned ggplot expression did.
bigram_plot