This report is part of the Johns Hopkins Data Science Capstone Project. The goal is to explore the SwiftKey dataset and show that we are ready to build a predictive text model.
The data comes from three sources: blogs, news, and Twitter. In this report, we summarize the dataset (line counts, word counts, and file sizes) and present an initial analysis of the most frequent words using text mining techniques.
# Read every line of a corpus file, marking the strings as UTF-8 and
# skipping embedded NUL bytes.
#
# The file is opened through an explicit binary-mode ("rb") connection
# instead of passing the path straight to readLines(). A text-mode read on
# Windows stops at an embedded Ctrl-Z (0x1A) byte, which silently truncates
# the result; binary mode reads the whole file on every platform.
#
# path: location of the text file to read.
# Returns a character vector with one element per line.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  # Release the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus_lines("final/en_US/en_US.blogs.txt")
news <- read_corpus_lines("final/en_US/en_US.news.txt")
twitter <- read_corpus_lines("final/en_US/en_US.twitter.txt")
library(stringi)
# Per-source overview table: number of lines read, total word count
# (stringi word-boundary count summed over all lines), and on-disk file
# size in MiB (bytes / 1024^2, rounded to two decimals).
# local() keeps the helper vectors out of the global environment.
data_summary <- local({
  texts <- list(blogs, news, twitter)
  paths <- c(
    "final/en_US/en_US.blogs.txt",
    "final/en_US/en_US.news.txt",
    "final/en_US/en_US.twitter.txt"
  )
  count_words <- function(lines) sum(stri_count_words(lines))

  data.frame(
    Source = c("Blogs", "News", "Twitter"),
    Lines = lengths(texts),
    Words = vapply(texts, count_words, integer(1)),
    FileSizeMB = round(file.info(paths)$size / 1024^2, 2)
  )
})
print(data_summary)
## Source Lines Words FileSizeMB
## 1 Blogs 899288 37546806 200.42
## 2 News 1010206 34761151 196.28
## 3 Twitter 2360148 30096690 159.36
# Draw a reproducible random subset from each source and pool the draws
# into one character vector. The sources are sampled in the fixed order
# blogs, news, twitter so the seeded random stream is consumed the same
# way on every run.
set.seed(1234)
sample_size <- 10000
sample_data <- unlist(
  lapply(
    list(blogs, news, twitter),
    function(lines) sample(lines, sample_size)
  ),
  use.names = FALSE
)
library(tm)
# Build a volatile corpus from the pooled sample and normalise it by
# applying the cleaning steps one after another, in this order:
# lower-case, strip punctuation, strip digits, collapse whitespace.
corpus <- Reduce(
  function(docs, cleaning_step) tm_map(docs, cleaning_step),
  list(
    content_transformer(tolower),
    removePunctuation,
    removeNumbers,
    stripWhitespace
  ),
  VCorpus(VectorSource(sample_data))
)
library(RWeka)
library(ggplot2)
# Tokenizer passed to TermDocumentMatrix(): delegates to RWeka's
# NGramTokenizer with both the minimum and maximum n-gram size set to 1,
# i.e. it emits single-word tokens only.
unigram_tokenizer <- function(x) {
  unigram_control <- Weka_control(min = 1, max = 1)
  NGramTokenizer(x, unigram_control)
}
# Term-document matrix of unigrams for the cleaned sample corpus.
tdm <- TermDocumentMatrix(corpus, control = list(tokenize = unigram_tokenizer))

# Total occurrences of each term across all documents.
# The matrix is stored as a sparse triplet (tdm$i = term row index,
# tdm$v = count for that term/document cell). Summing the counts by row
# index avoids as.matrix(tdm), which would allocate a dense
# terms x documents matrix (several GB for tens of thousands of documents)
# just to take row sums.
term_totals <- rowsum(as.numeric(tdm$v), tdm$i)
word_freqs <- setNames(numeric(length(Terms(tdm))), Terms(tdm))
# rowsum() labels each row with the term index it aggregated.
word_freqs[as.integer(rownames(term_totals))] <- term_totals[, 1]
word_freqs <- sort(word_freqs, decreasing = TRUE)

# Ten most frequent unigrams, ready for plotting.
top_unigrams <- data.frame(
  word = names(head(word_freqs, 10)),
  freq = head(word_freqs, 10)
)
# Bar chart of the ten most frequent unigrams. Bars are ordered from the
# most to the least frequent word (reorder on negated frequency), and bar
# heights are the raw counts.
ggplot(
  data = top_unigrams,
  mapping = aes(x = reorder(word, -freq), y = freq)
) +
  geom_col(fill = "skyblue") +
  labs(
    x = "Word",
    y = "Frequency",
    title = "Top 10 Most Frequent Unigrams"
  ) +
  theme_minimal()