This report presents an exploratory analysis of the SwiftKey dataset.
The goals are to demonstrate that the data loads successfully, to
summarize its basic features, and to outline the plan for building a
predictive text model and Shiny app. The report is written concisely so
that non-technical managers can follow the progress.
The dataset consists of three US English text files: blogs, news, and
Twitter. Each file is read with readLines(), preserving UTF-8 encoding
and skipping embedded nulls:
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Line Counts
cat("Blogs lines:", length(blogs), "\n")
## Blogs lines: 899288
cat("News lines:", length(news), "\n")
## News lines: 1010206
cat("Twitter lines:", length(twitter), "\n")
## Twitter lines: 2360148
# File Sizes (MB)
cat("Blogs size (MB):", file.info("en_US.blogs.txt")$size / (1024^2), "\n")
## Blogs size (MB): 200.4242
cat("News size (MB):", file.info("en_US.news.txt")$size / (1024^2), "\n")
## News size (MB): 196.2775
cat("Twitter size (MB):", file.info("en_US.twitter.txt")$size / (1024^2), "\n")
## Twitter size (MB): 159.3641
# Word Counts (stri_stats_latex() from the stringi package)
library(stringi)
cat("Blogs word stats:\n"); print(stri_stats_latex(blogs))
## Blogs word stats:
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 162464653 9 42636700 37570839 3
## Envirs
## 0
cat("News word stats:\n"); print(stri_stats_latex(news))
## News word stats:
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 162220648 2 40262316 34493122 1
## Envirs
## 0
cat("Twitter word stats:\n"); print(stri_stats_latex(twitter))
## Twitter word stats:
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 125570778 3032 35958529 30451170 963
## Envirs
## 0
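For quick reference, the same statistics can be gathered into one
table. This is a sketch that reuses the objects loaded above; the file
names match the loading step:
summary_df <- data.frame(
  source  = c("Blogs", "News", "Twitter"),
  lines   = c(length(blogs), length(news), length(twitter)),
  words   = c(stri_stats_latex(blogs)["Words"],
              stri_stats_latex(news)["Words"],
              stri_stats_latex(twitter)["Words"]),
  size_mb = round(file.info(c("en_US.blogs.txt", "en_US.news.txt",
                              "en_US.twitter.txt"))$size / 1024^2, 1)
)
summary_df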
blog_lengths <- nchar(blogs)
news_lengths <- nchar(news)
twitter_lengths <- nchar(twitter)
df_lengths <- data.frame(
  source = c(rep("Blogs", length(blog_lengths)),
             rep("News", length(news_lengths)),
             rep("Twitter", length(twitter_lengths))),
  length = c(blog_lengths, news_lengths, twitter_lengths)
)
library(ggplot2)
ggplot(df_lengths, aes(x = length, fill = source)) +
  geom_histogram(bins = 100, alpha = 0.6, position = "identity") +
  scale_x_log10() +
  labs(title = "Distribution of Line Lengths (Log Scale)",
       x = "Line Length (characters, log scale)", y = "Count") +
  theme_minimal()
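As a numeric companion to the histogram, a base-R summary per source
can be printed (a sketch using the df_lengths frame built above). It
makes the extremes explicit that the log scale compresses:
tapply(df_lengths$length, df_lengths$source, summary)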
library(tidytext)
library(dplyr)
library(stringr)   # for str_detect() in the cleaning step
# Sample 50,000 lines per source to keep the analysis tractable
set.seed(1234)
sample_size <- 50000
sample_text <- c(
  sample(blogs, sample_size),
  sample(news, sample_size),
  sample(twitter, sample_size)
)
# Create a tidy text dataframe
text_df <- data.frame(text = sample_text, stringsAsFactors = FALSE)
# Unnest tokens (words)
tokens <- text_df %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word,       # remove stopwords
         !str_detect(word, "^[0-9]+$"))    # remove pure numbers
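As a quick sanity check on how much text survives cleaning, the token
and vocabulary counts of the sample can be inspected (a sketch;
n_distinct() comes from dplyr):
nrow(tokens)              # tokens remaining after stopword/number removal
n_distinct(tokens$word)   # vocabulary size of the sample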
# --- Unigram frequencies ---
unigrams <- tokens %>%
  count(word, sort = TRUE)
# --- Bigrams ---
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%   # lines with fewer than 2 words yield NA
  count(bigram, sort = TRUE)
# --- Trigrams ---
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%  # lines with fewer than 3 words yield NA
  count(trigram, sort = TRUE)
# --- Plot Top 20 Unigrams ---
unigrams %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words", x = "Word", y = "Frequency") +
  theme_minimal()
# --- Plot Top 20 Bigrams ---
bigrams %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Bigrams", x = "Bigram", y = "Frequency") +
  theme_minimal()
# --- Plot Top 20 Trigrams ---
trigrams %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(x = reorder(trigram, n), y = n)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Trigrams", x = "Trigram", y = "Frequency") +
  theme_minimal()
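# Next Steps
The n-gram tables above feed directly into the planned prediction
model: given the last one or two words typed, look up the most frequent
continuation, backing off from trigrams to bigrams when no trigram
matches. A minimal sketch of that lookup follows; the helper
predict_next and its column names are illustrative assumptions, not
part of the analysis above.
library(tidyr)
# Split the counted n-grams into (prefix, next-word) lookup tables
trigram_lookup <- trigrams %>%
  separate(trigram, into = c("w1", "w2", "w3"), sep = " ")
bigram_lookup <- bigrams %>%
  separate(bigram, into = c("w1", "w2"), sep = " ")
# Hypothetical helper: most frequent continuations of the last two
# words, backing off to bigrams when no trigram matches
predict_next <- function(phrase, n_candidates = 3) {
  words <- str_split(str_to_lower(phrase), "\\s+")[[1]]
  last2 <- tail(words, 2)
  hits <- trigram_lookup %>%
    filter(w1 == last2[1], w2 == last2[2]) %>%
    arrange(desc(n)) %>%
    head(n_candidates)
  if (nrow(hits) > 0) return(hits$w3)
  bigram_lookup %>%
    filter(w1 == tail(words, 1)) %>%
    arrange(desc(n)) %>%
    head(n_candidates) %>%
    pull(w2)
}
predict_next("thanks for the")
The Shiny app will wrap this lookup in a simple interface: the user
types a phrase and the top candidate next words are displayed.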