Introduction

This Milestone Report summarizes the exploratory analysis of the text data provided for the SwiftKey Text Prediction project. The goals are to demonstrate that the data has been loaded, provide summary statistics, report initial findings, and outline the plan for the prediction model and Shiny app.


Environment & required packages

The chunk below installs only the missing packages (from the RStudio CRAN mirror) and then loads them quietly.

required <- c("stringi","tm","slam","RWeka","ggplot2","dplyr","knitr","NLP","SnowballC")
to_install <- required[!required %in% installed.packages()[, "Package"]]
if(length(to_install)) {
  install.packages(to_install, repos = "https://cran.rstudio.com/")
}
# load all packages; invisible() suppresses the lengthy attach-list output
invisible(lapply(required, library, character.only = TRUE))

Locate the data files

This block tries a few common relative locations. If none exists, it stops with a clear error explaining where to place the files.

possible_paths <- c(
  "data/final/en_US/",
  "final/en_US/",
  "./final/en_US/",
  "./data/final/en_US/"
)

data_path <- NULL
for(p in possible_paths) {
  if (file.exists(file.path(p, "en_US.blogs.txt"))) {
    data_path <- p
    break
  }
}

if (is.null(data_path)) {
  stop("Data files not found. Please place the folder 'final/en_US' (with en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt) either in your project root or in a 'data' folder. Example structure:\n\nproject_folder/\n  milestone_report.Rmd\n  final/en_US/en_US.blogs.txt\n  final/en_US/en_US.news.txt\n  final/en_US/en_US.twitter.txt\n\nOr use 'data/final/en_US/'.")
}

data_path
## [1] "data/final/en_US/"

Loading the data (safe read)

We read each file with UTF-8 encoding and use skipNul = TRUE to avoid errors from embedded null characters.

blogs   <- readLines(file.path(data_path, "en_US.blogs.txt"),   warn = FALSE, skipNul = TRUE, encoding = "UTF-8")
news    <- readLines(file.path(data_path, "en_US.news.txt"),    warn = FALSE, skipNul = TRUE, encoding = "UTF-8")
twitter <- readLines(file.path(data_path, "en_US.twitter.txt"), warn = FALSE, skipNul = TRUE, encoding = "UTF-8")

# quick confirmation
lengths <- c(length(blogs), length(news), length(twitter))
names(lengths) <- c("blogs","news","twitter")
lengths
##   blogs    news twitter 
##  899288 1010206 2360148

Basic Summary Statistics

Line counts, approximate word counts and file sizes (MB).

library(stringi)

summary_table <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(sum(stri_count_words(blogs)),
            sum(stri_count_words(news)),
            sum(stri_count_words(twitter))),
  FileSizeMB = c(
    file.info(file.path(data_path, "en_US.blogs.txt"))$size / (1024*1024),
    file.info(file.path(data_path, "en_US.news.txt"))$size / (1024*1024),
    file.info(file.path(data_path, "en_US.twitter.txt"))$size / (1024*1024)
  )
)

knitr::kable(summary_table, digits = 0)
File       Lines     Words  FileSizeMB
Blogs     899288  37546806         200
News     1010206  34761151         196
Twitter  2360148  30096690         159

Sampling the data (memory-safe)

We sample a small percentage of each file for the EDA and n-gram checks, which avoids excessive memory consumption, and we enforce a minimum sample size so the plots remain meaningful.

set.seed(123)
sample_fraction <- 0.01  # 1% sample; change to 0.005 or 0.02 as needed
sampled_blogs  <- sample(blogs, max(1000, floor(length(blogs) * sample_fraction)))
sampled_news   <- sample(news,  max(1000, floor(length(news) * sample_fraction)))
sampled_twitter<- sample(twitter, max(1000, floor(length(twitter) * sample_fraction)))

sample_data <- c(sampled_blogs, sampled_news, sampled_twitter)
length(sample_data)
## [1] 42695

Create a Corpus from the sampled data

We do basic cleaning: lowercase, remove numbers, punctuation, and extra whitespace. We do not remove stopwords here for the unigram frequencies (the effect of removing stopwords is shown separately below).

library(tm)

corp <- VCorpus(VectorSource(sample_data))
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, stripWhitespace)

Word Frequency Analysis (sparse-safe)

We build a DocumentTermMatrix on the sampled corpus, then use slam::col_sums() to avoid conversion to a dense matrix.

dtm <- DocumentTermMatrix(corp, control = list(wordLengths = c(1, Inf)))
term_sums <- slam::col_sums(dtm)
freq_df <- data.frame(
  word = names(term_sums),
  freq = as.integer(term_sums),
  stringsAsFactors = FALSE
)
freq_df <- freq_df[order(-freq_df$freq), ]
head(freq_df, 10)
##       word  freq
## 53252  the 47584
## 54092   to 27606
## 3072   and 24097
## 1254     a 23664
## 37389   of 19823
## 26487   in 16546
## 25969    i 16501
## 20458  for 11118
## 27603   is 10525
## 53205 that 10409

Plot: Top 20 Most Frequent Words

library(ggplot2)
topn <- 20
ggplot(freq_df[1:topn, ], aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = paste0("Top ", topn, " Most Frequent Words (sampled)"), x = "Word", y = "Frequency")


Stopwords effect (optional)

Show how removing stopwords changes the top words.

corp_nostop <- tm_map(corp, removeWords, stopwords("en"))
dtm_nostop <- DocumentTermMatrix(corp_nostop, control = list(wordLengths = c(1, Inf)))
freq_nostop <- slam::col_sums(dtm_nostop)
df_nostop <- data.frame(word = names(freq_nostop), freq = as.integer(freq_nostop))
df_nostop <- df_nostop[order(-df_nostop$freq), ]

ggplot(df_nostop[1:20, ], aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat="identity") + coord_flip() +
  labs(title = "Top 20 Words after Removing Stopwords", x = "Word", y = "Frequency")


N-gram Analysis (on sampled data)

We use RWeka’s tokenizer on the sampled raw text vector (sample_data), which is much lighter than tokenizing the entire dataset.

library(RWeka)
options(mc.cores = 1) # RWeka can sometimes attempt parallel threads; keep it simple

W_control_uni  <- Weka_control(min = 1, max = 1)
W_control_bi   <- Weka_control(min = 2, max = 2)
W_control_tri  <- Weka_control(min = 3, max = 3)

unigram_tokens <- NGramTokenizer(sample_data, W_control_uni)
bigram_tokens  <- NGramTokenizer(sample_data, W_control_bi)
trigram_tokens <- NGramTokenizer(sample_data, W_control_tri)

uni_tab <- as.data.frame(table(unigram_tokens), stringsAsFactors = FALSE)
bi_tab  <- as.data.frame(table(bigram_tokens), stringsAsFactors = FALSE)
tri_tab <- as.data.frame(table(trigram_tokens), stringsAsFactors = FALSE)

uni_tab <- uni_tab[order(-uni_tab$Freq), ]
bi_tab  <- bi_tab[order(-bi_tab$Freq), ]
tri_tab <- tri_tab[order(-tri_tab$Freq), ]

Top 10 Unigrams

top_uni <- head(uni_tab, 10)
colnames(top_uni) <- c("unigram","Freq")
ggplot(top_uni, aes(x = reorder(unigram, Freq), y = Freq)) +
  geom_bar(stat="identity") + coord_flip() +
  labs(title = "Top 10 Unigrams (sampled)", x = "Word", y = "Frequency")


Top 10 Bigrams

top_bi <- head(bi_tab, 10)
colnames(top_bi) <- c("bigram","Freq")
ggplot(top_bi, aes(x = reorder(bigram, Freq), y = Freq)) +
  geom_bar(stat="identity") + coord_flip() +
  labs(title = "Top 10 Bigrams (sampled)", x = "Word Pair", y = "Frequency")
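
Top 10 Trigrams

The trigram table (tri_tab) built above can be inspected the same way; the sketch below simply mirrors the unigram and bigram plots.

top_tri <- head(tri_tab, 10)
colnames(top_tri) <- c("trigram","Freq")
ggplot(top_tri, aes(x = reorder(trigram, Freq), y = Freq)) +
  geom_bar(stat="identity") + coord_flip() +
  labs(title = "Top 10 Trigrams (sampled)", x = "Word Triple", y = "Frequency")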


Interesting Findings

  • Twitter contributes the most lines (about 2.36 million) but the fewest total words, so its lines are far shorter on average than blog or news lines (see the quick check below).
  • The raw unigram frequencies are dominated by stopwords such as "the", "to", "and", and "a"; removing stopwords surfaces far more content-bearing words.
  • Even a 1% sample (about 43,000 lines here) is enough for the frequency and n-gram analysis above while keeping memory use modest.

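The difference in line length is easy to check with simple arithmetic on the summary table already computed above (no new data involved):

avg_words_per_line <- round(summary_table$Words / summary_table$Lines, 1)
names(avg_words_per_line) <- summary_table$File
avg_words_per_line
# roughly 42 words per line for blogs, 34 for news, and 13 for Twitter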

Plan for Prediction Algorithm & Shiny App

Prediction Algorithm

  • Use N-gram language models (unigram, bigram, trigram) built from cleaned, tokenized text.
  • Apply Katz backoff or Stupid Backoff for smoothing / handling unseen n-grams.
  • Store only frequent n-grams (prune low-frequency entries) to reduce memory usage; a minimal backoff-lookup sketch follows this list.
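
The following is an illustrative sketch only, not the final implementation: it reuses the uni_tab, bi_tab, and tri_tab tables from the n-gram section above, applies the common Stupid Backoff discount of 0.4, and the helper name predict_next() is made up for this example.

# Illustrative Stupid Backoff lookup over the sampled n-gram tables.
# In the real model the tables would first be pruned, e.g. keep only Freq > 1.
predict_next <- function(prefix, n = 3, lambda = 0.4) {
  words <- unlist(strsplit(tolower(trimws(prefix)), "\\s+"))
  last2 <- paste(tail(words, 2), collapse = " ")
  last1 <- tail(words, 1)

  # trigrams whose first two words match the end of the prefix
  hits <- tri_tab[startsWith(tri_tab[[1]], paste0(last2, " ")), ]
  if (nrow(hits) == 0) {
    # back off to bigrams starting with the last word, discounted by lambda
    hits <- bi_tab[startsWith(bi_tab[[1]], paste0(last1, " ")), ]
    if (nrow(hits) > 0) hits$Freq <- lambda * hits$Freq
  }
  if (nrow(hits) == 0) {
    # final fallback: the most frequent unigrams, discounted twice
    hits <- uni_tab
    hits$Freq <- lambda^2 * hits$Freq
  }
  hits <- hits[order(-hits$Freq), ]
  # return the last word of each candidate n-gram
  vapply(strsplit(head(hits[[1]], n), " "), tail, character(1), n = 1)
}

predict_next("thanks for the")  # returns up to 3 candidate next words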

Shiny App

  • Text input box to accept partial sentence.
  • Return top 3 predicted next words with estimated probabilities.
  • Lightweight UI so it runs quickly on a consumer laptop or phone; a minimal UI/server skeleton is sketched after this list.
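
A minimal skeleton of the app, assuming the illustrative predict_next() helper sketched above is available in the app's environment (the final app would also display the estimated probabilities):

library(shiny)

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Type a partial sentence:", value = ""),
  tableOutput("predictions")
)

server <- function(input, output) {
  output$predictions <- renderTable({
    req(nzchar(trimws(input$phrase)))  # wait until something is typed
    data.frame(Prediction = predict_next(input$phrase, n = 3))
  })
}

shinyApp(ui, server)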

Conclusion

This report uses a memory-safe workflow (sampling plus sparse computations) and includes clear checks for data placement, so it should knit successfully on most laptops. Once the data path is correct and the required packages are installed, knit the report to HTML and publish it to RPubs.