Introduction

This report provides an exploratory analysis of the SwiftKey dataset.
The goal is to demonstrate successful data loading, summarize basic features, and outline plans for building a predictive text model and Shiny app.
The report is written in a concise style so non-technical managers can understand the progress.

Data Loading

The dataset consists of three US English text files, one each from blogs, news articles, and Twitter: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.

# Read one corpus file into a character vector with one element per line.
#
# The file is read through a binary-mode connection rather than by path:
# a text-mode read on Windows may treat an embedded Ctrl-Z (0x1A) byte as
# end-of-file and silently return only part of the file. readLines() accepts
# LF, CRLF and CR line endings regardless of the connection mode, so the
# result is otherwise unchanged.
#
# path: path to a UTF-8 encoded text file.
# Returns a character vector marked as UTF-8; embedded NULs are skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")

Summary Statistics

# Summary statistics for the three corpora: number of lines, size on disk,
# and word/character statistics. Lines beginning with `##` are the output
# recorded when the report was rendered.
#
# Line counts: length() of each character vector read with readLines().
cat("Blogs lines:", length(blogs), "\n")
## Blogs lines: 899288
cat("News lines:", length(news), "\n")
## News lines: 1010206
cat("Twitter lines:", length(twitter), "\n")
## Twitter lines: 2360148
# File sizes (MB): file.info()$size is in bytes, so dividing by 1024^2
# converts it to megabytes (strictly, mebibytes).
cat("Blogs size (MB):", file.info("en_US.blogs.txt")$size / (1024^2), "\n")
## Blogs size (MB): 200.4242
cat("News size (MB):", file.info("en_US.news.txt")$size / (1024^2), "\n")
## News size (MB): 196.2775
cat("Twitter size (MB):", file.info("en_US.twitter.txt")$size / (1024^2), "\n")
## Twitter size (MB): 159.3641
# Word counts: the `Words` entry of each printed vector is the word total.
# NOTE(review): stri_stats_latex() belongs to the stringi package, which is
# not attached anywhere in the visible code -- confirm it is loaded in a setup
# chunk. The function is LaTeX-oriented, so backslash sequences are tallied
# under Cmds/CharsCmdEnvir instead of as words (see the Twitter output).
cat("Blogs word stats:\n"); print(stri_stats_latex(blogs))
## Blogs word stats:
##     CharsWord CharsCmdEnvir    CharsWhite         Words          Cmds 
##     162464653             9      42636700      37570839             3 
##        Envirs 
##             0
cat("News word stats:\n"); print(stri_stats_latex(news))
## News word stats:
##     CharsWord CharsCmdEnvir    CharsWhite         Words          Cmds 
##     162220648             2      40262316      34493122             1 
##        Envirs 
##             0
cat("Twitter word stats:\n"); print(stri_stats_latex(twitter))
## Twitter word stats:
##     CharsWord CharsCmdEnvir    CharsWhite         Words          Cmds 
##     125570778          3032      35958529      30451170           963 
##        Envirs 
##             0

Basic Plots

# Number of characters in every line of each corpus.
blog_lengths <- nchar(blogs)
news_lengths <- nchar(news)
twitter_lengths <- nchar(twitter)

# Long-format table with one row per line: the corpus label is repeated once
# for each line of that corpus, in the same order the lengths are stacked.
df_lengths <- data.frame(
  source = rep(
    c("Blogs", "News", "Twitter"),
    times = c(
      length(blog_lengths),
      length(news_lengths),
      length(twitter_lengths)
    )
  ),
  length = c(blog_lengths, news_lengths, twitter_lengths)
)

# Overlaid, semi-transparent histograms of line length, one per corpus.
# position = "identity" draws the three histograms on top of each other
# instead of stacking them; the x axis is log10-scaled.
ggplot(df_lengths) +
  geom_histogram(
    aes(x = length, fill = source),
    bins = 100,
    alpha = 0.6,
    position = "identity"
  ) +
  scale_x_log10() +
  theme_minimal() +
  labs(
    title = "Distribution of Line Lengths (Log Scale)",
    x = "Line Length (characters, log scale)",
    y = "Count"
  )

library(tm)
library(tidytext)
library(dplyr)

# Work on a random subset so tokenisation stays light: the same number of
# lines is drawn (without replacement) from blogs, news and twitter, in that
# order, and the draws are concatenated. The fixed seed makes it repeatable.
set.seed(1234)
sample_size <- 50000
sample_text <- unlist(
  lapply(list(blogs, news, twitter), sample, size = sample_size)
)

# One row per sampled line; unnest_tokens() reads the text from a data-frame
# column.
text_df <- data.frame(text = sample_text, stringsAsFactors = FALSE)

# Split every line into single-word tokens (one row per word), then drop
# stop words and tokens that consist only of digits.
# The digit test uses base grepl() rather than stringr::str_detect(): stringr
# is not attached by the library() calls in this script, so str_detect() would
# fail with "could not find function" unless it is loaded elsewhere.
tokens <- text_df %>%
  unnest_tokens(word, text) %>%
  filter(
    !word %in% stop_words$word,   # remove stopwords
    !grepl("^[0-9]+$", word)      # remove pure numbers
  )

# --- Unigram frequencies ---
# Occurrences of each remaining word, most frequent first (count column: n).
unigrams <- count(tokens, word, sort = TRUE)

# --- Bigrams ---
# Frequency of every pair of adjacent words, most frequent first.
# Lines with fewer than two words produce an NA bigram; those rows are removed
# so that NA is not counted (and later plotted) as if it were a real bigram.
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

# --- Trigrams ---
# Frequency of every run of three adjacent words, most frequent first.
# Lines with fewer than three words produce an NA trigram; those rows are
# removed so that NA is not counted (and later plotted) as a real trigram.
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)

# --- Plot Top 20 Unigrams ---
# Horizontal bar chart of the 20 most frequent words, longest bar at the top.
# slice_max() replaces the superseded top_n(20): the ranking column (n) is
# named explicitly, which avoids the "Selecting by n" message, and
# with_ties = FALSE guarantees exactly 20 bars even when counts are tied.
unigrams %>%
  slice_max(n, n = 20, with_ties = FALSE) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words", x = "Word", y = "Frequency") +
  theme_minimal()

# --- Plot Top 20 Bigrams ---
# Horizontal bar chart of the 20 most frequent bigrams, longest bar at the top.
# slice_max() replaces the superseded top_n(20): the ranking column (n) is
# named explicitly, which avoids the "Selecting by n" message, and
# with_ties = FALSE guarantees exactly 20 bars even when counts are tied.
bigrams %>%
  slice_max(n, n = 20, with_ties = FALSE) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Bigrams", x = "Bigram", y = "Frequency") +
  theme_minimal()

# --- Plot Top 20 Trigrams ---
# Horizontal bar chart of the 20 most frequent trigrams, longest bar at the
# top. slice_max() replaces the superseded top_n(20): the ranking column (n)
# is named explicitly, which avoids the "Selecting by n" message, and
# with_ties = FALSE guarantees exactly 20 bars even when counts are tied.
trigrams %>%
  slice_max(n, n = 20, with_ties = FALSE) %>%
  ggplot(aes(x = reorder(trigram, n), y = n)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Trigrams", x = "Trigram", y = "Frequency") +
  theme_minimal()