Executive Summary

This report presents an exploratory data analysis of the Capstone dataset provided by SwiftKey. The ultimate goal of the project is to build a predictive text engine and deploy it as an interactive Shiny web application that suggests the most likely next word as the user types.

1. Dataset Characteristics

library(stringr)
library(dplyr)
library(tidytext)
library(ggplot2)

# Directory holding the en_US corpus files. It defaults to the original local
# download location; set the SWIFTKEY_DATA_DIR environment variable to run the
# report on another machine without editing the code.
data_dir <- Sys.getenv(
  "SWIFTKEY_DATA_DIR",
  unset = "C:/Users/PC/Downloads/Coursera-SwiftKey/final/en_US"
)

blogs_path   <- file.path(data_dir, "en_US.blogs.txt")
news_path    <- file.path(data_dir, "en_US.news.txt")
twitter_path <- file.path(data_dir, "en_US.twitter.txt")

# Fail early, naming the missing file(s), rather than partway through loading.
corpus_paths <- c(blogs_path, news_path, twitter_path)
missing_paths <- corpus_paths[!file.exists(corpus_paths)]
if (length(missing_paths) > 0) {
  stop(
    "Corpus file(s) not found: ", paste(missing_paths, collapse = ", "),
    call. = FALSE
  )
}

# Read each corpus into a character vector, one element per line of the file.
# Input is treated as UTF-8, embedded NUL bytes are skipped, and readLines()
# warnings are silenced.
blogs   <- readLines(blogs_path, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news    <- readLines(news_path, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
twitter <- readLines(twitter_path, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)

# Per-corpus overview: on-disk file size in megabytes (bytes / 1024^2, rounded
# to two decimals) and the number of lines read into memory.
size_bytes <- file.info(c(blogs_path, news_path, twitter_path))$size
line_counts <- c(length(blogs), length(news), length(twitter))

summary_df <- data.frame(
  File_Source = c("Blogs", "News Articles", "Twitter Posts"),
  Size_in_MB = round(size_bytes / (1024^2), 2),
  Total_Lines = line_counts
)
print(summary_df)
##     File_Source Size_in_MB Total_Lines
## 1         Blogs     200.42      899288
## 2 News Articles     196.28     1010206
## 3 Twitter Posts     159.36     2360148
# Fix the RNG seed so the same sample is drawn on every run.
set.seed(42)

# Draw 1% of a corpus's lines at random, without replacement. The requested
# size (length * 0.01) is usually fractional; sample() uses its integer part.
draw_one_percent <- function(corpus_lines) {
  sample(corpus_lines, length(corpus_lines) * 0.01)
}

# Sample in the order blogs, news, Twitter (the order determines which random
# draws each corpus receives), then pool the three samples into one vector.
blogs_sample   <- draw_one_percent(blogs)
news_sample    <- draw_one_percent(news)
twitter_sample <- draw_one_percent(twitter)
sample_data <- c(blogs_sample, news_sample, twitter_sample)

# One row per sampled line, with a running line id. seq_along() is used
# instead of 1:length(), which yields c(1, 0) for an empty sample and makes
# tibble() fail on mismatched column sizes.
text_df <- tibble(line = seq_along(sample_data), text = sample_data)

# Split every line into two-word n-grams (bigrams), drop rows where no bigram
# was produced (NA), and count how often each bigram occurs, most frequent
# first.
bigrams_df <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

# Horizontal bar chart of the 15 most frequent bigrams. bigrams_df is already
# sorted by descending count, so head() selects the top rows; reorder() sorts
# the bars by count.
top_bigrams <- head(bigrams_df, 15)

ggplot(top_bigrams, aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Top 15 Most Common Word Pairs (Bigrams)",
    x = "Word Combinations",
    y = "Occurrences"
  ) +
  theme_minimal() +
  coord_flip()