This milestone report summarizes the exploratory analysis of the text data provided for the SwiftKey Text Prediction project. The goals are unchanged: demonstrate that the data have been loaded, report summary statistics, present initial findings, and outline the plan for the prediction model and Shiny app.
The chunk below installs only the packages that are missing and then loads them all. The first installation may ask you to confirm a CRAN mirror.
required <- c("stringi","tm","slam","RWeka","ggplot2","dplyr","knitr","NLP","SnowballC")
to_install <- required[!required %in% installed.packages()[, "Package"]]
if(length(to_install)) {
install.packages(to_install, repos = "https://cran.rstudio.com/")
}
invisible(lapply(required, library, character.only = TRUE))  # attach quietly
This block checks a few common relative locations for the data. If none of them exists, it stops with an error explaining where to place the files.
possible_paths <- c(
"data/final/en_US/",
"final/en_US/",
"./final/en_US/",
"./data/final/en_US/"
)
data_path <- NULL
for(p in possible_paths) {
if (file.exists(file.path(p, "en_US.blogs.txt"))) {
data_path <- p
break
}
}
if (is.null(data_path)) {
stop("Data files not found. Please place the folder 'final/en_US' (with en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt) either in your project root or in a 'data' folder. Example structure:\n\nproject_folder/\n milestone_report.Rmd\n final/en_US/en_US.blogs.txt\n final/en_US/en_US.news.txt\n final/en_US/en_US.twitter.txt\n\nOr use 'data/final/en_US/'.")
}
data_path
## [1] "data/final/en_US/"
We use skipNul = TRUE to avoid errors caused by embedded null characters.
blogs <- readLines(file.path(data_path, "en_US.blogs.txt"), warn = FALSE, skipNul = TRUE, encoding = "UTF-8")
news <- readLines(file.path(data_path, "en_US.news.txt"), warn = FALSE, skipNul = TRUE, encoding = "UTF-8")
twitter<- readLines(file.path(data_path, "en_US.twitter.txt"), warn = FALSE, skipNul = TRUE, encoding = "UTF-8")
# quick confirmation
lengths <- c(length(blogs), length(news), length(twitter))
names(lengths) <- c("blogs","news","twitter")
lengths
## blogs news twitter
## 899288 1010206 2360148
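On some systems readLines() can stop early on en_US.news.txt because of embedded control characters. If the news line count above looks too small, a common workaround is to read the file through a binary connection; a sketch:

# Fallback: read the news file via a binary connection so embedded
# control characters cannot truncate the input
con <- file(file.path(data_path, "en_US.news.txt"), open = "rb")
news <- readLines(con, warn = FALSE, skipNul = TRUE, encoding = "UTF-8")
close(con)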
The table below reports line counts, approximate word counts, and file sizes in megabytes (MB).
library(stringi)
summary_table <- data.frame(
File = c("Blogs", "News", "Twitter"),
Lines = c(length(blogs), length(news), length(twitter)),
Words = c(sum(stri_count_words(blogs)),
sum(stri_count_words(news)),
sum(stri_count_words(twitter))),
FileSizeMB = c(
file.info(file.path(data_path, "en_US.blogs.txt"))$size / (1024*1024),
file.info(file.path(data_path, "en_US.news.txt"))$size / (1024*1024),
file.info(file.path(data_path, "en_US.twitter.txt"))$size / (1024*1024)
)
)
knitr::kable(summary_table, digits = 0)
| File | Lines | Words | FileSizeMB |
|---|---|---|---|
| Blogs | 899288 | 37546806 | 200 |
| News | 1010206 | 34761151 | 196 |
| Twitter | 2360148 | 30096690 | 159 |
We sample a small fraction of each file for the exploratory analysis and n-gram checks, which keeps memory consumption manageable. A minimum of 1,000 lines per source is kept so the plots remain meaningful.
set.seed(123)
sample_fraction <- 0.01 # 1% sample; change to 0.005 or 0.02 as needed
sampled_blogs <- sample(blogs, max(1000, floor(length(blogs) * sample_fraction)))
sampled_news <- sample(news, max(1000, floor(length(news) * sample_fraction)))
sampled_twitter<- sample(twitter, max(1000, floor(length(twitter) * sample_fraction)))
sample_data <- c(sampled_blogs, sampled_news, sampled_twitter)
length(sample_data)
## [1] 42695
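To speed up repeated knits, the sample can optionally be cached to disk and the full files released from memory. A sketch; the file name sample_data.rds is arbitrary:

# Optional: cache the sample and free the full corpora from memory
saveRDS(sample_data, "sample_data.rds")   # later: sample_data <- readRDS("sample_data.rds")
rm(blogs, news, twitter)
invisible(gc())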
We do basic cleaning: convert to lowercase and remove numbers, punctuation, and extra whitespace. We do not remove stopwords for the unigram frequencies here; the effect of stopword removal is shown separately below.
library(tm)
corp <- VCorpus(VectorSource(sample_data))
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, stripWhitespace)
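Depending on the sources, it can also help to strip URLs and Twitter handles before building the matrix. A sketch using a custom transformer; the regular expressions are illustrative, not exhaustive:

# Optional extra cleaning: drop URLs and @handles
strip_patterns <- content_transformer(function(x) {
  x <- gsub("https?://\\S+", " ", x)  # URLs
  x <- gsub("@\\w+", " ", x)          # Twitter handles
  x
})
corp <- tm_map(corp, strip_patterns)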
We build a DocumentTermMatrix on the sampled corpus, then use slam::col_sums() to avoid converting it to a dense matrix.
dtm <- DocumentTermMatrix(corp, control = list(wordLengths = c(1, Inf)))
term_sums <- slam::col_sums(dtm)
freq_df <- data.frame(
word = names(term_sums),
freq = as.integer(term_sums),
stringsAsFactors = FALSE
)
freq_df <- freq_df[order(-freq_df$freq), ]
head(freq_df, 10)
## word freq
## 53252 the 47584
## 54092 to 27606
## 3072 and 24097
## 1254 a 23664
## 37389 of 19823
## 26487 in 16546
## 25969 i 16501
## 20458 for 11118
## 27603 is 10525
## 53205 that 10409
library(ggplot2)
topn <- 20
ggplot(freq_df[1:topn, ], aes(x = reorder(word, freq), y = freq)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = paste0("Top ", topn, " Most Frequent Words (sampled)"), x = "Word", y = "Frequency")
Next we show how removing stopwords changes the top words.
corp_nostop <- tm_map(corp, removeWords, stopwords("en"))
dtm_nostop <- DocumentTermMatrix(corp_nostop, control = list(wordLengths = c(1, Inf)))
freq_nostop <- slam::col_sums(dtm_nostop)
df_nostop <- data.frame(word = names(freq_nostop), freq = as.integer(freq_nostop))
df_nostop <- df_nostop[order(-df_nostop$freq), ]
ggplot(df_nostop[1:20, ], aes(x = reorder(word, freq), y = freq)) +
geom_bar(stat="identity") + coord_flip() +
labs(title = "Top 20 Words after Removing Stopwords", x = "Word", y = "Frequency")
We use RWeka's tokenizer on the sampled raw text vector (sample_data), which is much lighter than tokenizing the entire dataset.
library(RWeka)
options(mc.cores = 1) # RWeka can sometimes attempt parallel threads; keep it simple
W_control_uni <- Weka_control(min = 1, max = 1)
W_control_bi <- Weka_control(min = 2, max = 2)
W_control_tri <- Weka_control(min = 3, max = 3)
unigram_tokens <- NGramTokenizer(sample_data, W_control_uni)
bigram_tokens <- NGramTokenizer(sample_data, W_control_bi)
trigram_tokens <- NGramTokenizer(sample_data, W_control_tri)
uni_tab <- as.data.frame(table(unigram_tokens), stringsAsFactors = FALSE)
bi_tab <- as.data.frame(table(bigram_tokens), stringsAsFactors = FALSE)
tri_tab <- as.data.frame(table(trigram_tokens), stringsAsFactors = FALSE)
uni_tab <- uni_tab[order(-uni_tab$Freq), ]
bi_tab <- bi_tab[order(-bi_tab$Freq), ]
tri_tab <- tri_tab[order(-tri_tab$Freq), ]
top_uni <- head(uni_tab, 10)
colnames(top_uni) <- c("unigram","Freq")
ggplot(top_uni, aes(x = reorder(unigram, Freq), y = Freq)) +
geom_bar(stat="identity") + coord_flip() +
labs(title = "Top 10 Unigrams (sampled)", x = "Word", y = "Frequency")
top_bi <- head(bi_tab, 10)
colnames(top_bi) <- c("bigram","Freq")
ggplot(top_bi, aes(x = reorder(bigram, Freq), y = Freq)) +
geom_bar(stat="identity") + coord_flip() +
labs(title = "Top 10 Bigrams (sampled)", x = "Word Pair", y = "Frequency")
This report uses a memory-safe workflow (sampling plus sparse computations), includes clear checks for data placement, and should knit successfully on most laptops. Once the data path is correct and the packages are installed, click Knit → HTML, then Publish → RPubs.
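As a preview of the modelling step, the bigram and trigram tables above are the raw material for a simple back-off predictor. The sketch below is illustrative only and assumes bi_tab and tri_tab as built in this report; the final model will add proper cleaning, smoothing, and pruning.

# Illustrative back-off lookup: try trigrams keyed on the last two words,
# then fall back to bigrams keyed on the last word
predict_next <- function(phrase, n = 3) {
  words <- strsplit(trimws(tolower(phrase)), "\\s+")[[1]]
  if (length(words) >= 2) {
    key <- paste(tail(words, 2), collapse = " ")
    hits <- tri_tab[startsWith(tolower(tri_tab[[1]]), paste0(key, " ")), ]
    if (nrow(hits) > 0) return(head(sub(".* ", "", hits[[1]]), n))
  }
  key <- tail(words, 1)
  hits <- bi_tab[startsWith(tolower(bi_tab[[1]]), paste0(key, " ")), ]
  head(sub(".* ", "", hits[[1]]), n)
}
predict_next("thanks for the")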