# Libraries used throughout this analysis
library(tm)
library(RWeka)
library(wordcloud)
library(RColorBrewer)
library(ggplot2)

blogs <- readLines("~/DataScienceSpecialization/Capstone/data/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("~/DataScienceSpecialization/Capstone/data/en_us.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("~/DataScienceSpecialization/Capstone/
## data/en_us.news.txt", : incomplete final line found on '~/
## DataScienceSpecialization/Capstone/data/en_us.news.txt'
twitter <- readLines("~/DataScienceSpecialization/Capstone/data/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Convert encoding to ASCII, dropping characters that cannot be represented
blogs <- iconv(blogs, "latin1", "ASCII", sub = "")
news <- iconv(news, "latin1", "ASCII", sub = "")
twitter <- iconv(twitter, "latin1", "ASCII", sub = "")
# Drop the incomplete final line flagged by the readLines() warning
news <- news[-length(news)]
fsize_blogs <- file.size("~/DataScienceSpecialization/Capstone/data/en_US.blogs.txt") / 1024^2
fsize_news <- file.size("~/DataScienceSpecialization/Capstone/data/en_us.news.txt") / 1024^2
fsize_twitter <- file.size("~/DataScienceSpecialization/Capstone/data/en_US.twitter.txt") / 1024^2
lines_blogs <- length(blogs)
lines_news <- length(news)
lines_twitter <- length(twitter)
knitr::kable(data.frame(
DataSources = c("Blogs", "News", "Twitter"),
FileSize_MB = format(c(fsize_blogs, fsize_news, fsize_twitter), digits = 5),
TotalLines = format(c(lines_blogs, lines_news, lines_twitter), big.mark = ",")))
| DataSources | FileSize_MB | TotalLines |
|---|---|---|
| Blogs | 200.42 | 899,288 |
| News | 196.28 | 77,258 |
| Twitter | 159.36 | 2,360,148 |
# sample 1% of the data
set.seed(54321)
sampleData <- c(sample(blogs, floor(length(blogs) * 0.01)),
                sample(news, floor(length(news) * 0.01)),
                sample(twitter, floor(length(twitter) * 0.01)))
corpus <- VCorpus(VectorSource(sampleData))
# Wrap tolower in content_transformer() so the documents stay PlainTextDocuments
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
Below is a word cloud of the 100 most common words in the corpus.
wordcloud(corpus, max.words=100, random.order=FALSE, colors=brewer.pal(8,"Dark2"))
uni_token <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_token <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_token <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Collapse a term-document matrix into a word/frequency data frame, sorted by frequency
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
# Bar chart of the 50 most frequent n-grams in a frequency table
makePlot <- function(data, label) {
  ggplot(data[1:50, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("grey50"))
}
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = uni_token)), 0.99))
makePlot(freq1, "50 Most Common Unigrams")
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bi_token)), 0.999))
makePlot(freq2, "50 Most Common Bigrams")
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = tri_token)), 0.9999))
makePlot(freq3, "50 Most Common Trigrams")
Further work will focus on fine-tuning the Katz back-off prediction algorithm and deploying the Shiny app.
The predictive algorithm will use n-grams and the Katz back-off model to predict the next word in a sentence. The back-off algorithm searches for the input terms in the largest n-gram model first, then the next largest, and so on (e.g., 4-gram > 3-gram > 2-gram > 1-gram), as sketched below.
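To make the back-off idea concrete, here is a minimal sketch built on the `freq1`, `freq2`, and `freq3` tables computed above. It is not the full Katz model (there is no discounting or probability redistribution), and the `predict_next()` name and interface are placeholders.

```r
# Minimal back-off sketch (illustrative only): look up the last two words of
# the input in the trigram table, then the last word in the bigram table, and
# finally fall back to the single most frequent unigram.
predict_next <- function(input, freq1, freq2, freq3) {
  words <- unlist(strsplit(tolower(input), "\\s+"))
  n <- length(words)
  last_word_of <- function(ngram) tail(strsplit(as.character(ngram), " ")[[1]], 1)
  if (n >= 2) {
    hits <- freq3[startsWith(as.character(freq3$word), paste(words[n - 1], words[n], "")), ]
    if (nrow(hits) > 0) return(last_word_of(hits$word[1]))   # best matching trigram
  }
  if (n >= 1) {
    hits <- freq2[startsWith(as.character(freq2$word), paste(words[n], "")), ]
    if (nrow(hits) > 0) return(last_word_of(hits$word[1]))   # best matching bigram
  }
  as.character(freq1$word[1])                                # unigram fallback
}

predict_next("thanks for the", freq1, freq2, freq3)
```

Because the frequency tables are already sorted in decreasing order, the first matching row in each table is the most frequent completion; the final model will replace these raw counts with discounted Katz probabilities.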
A simple Shiny app will be created with a text box for entering the search string and an output pane showing the predicted next word; a bare-bones skeleton follows.
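The skeleton below shows the intended layout only; the widget names are assumptions, and `predict_next()` is the hypothetical helper from the sketch above, which the deployed app will replace with the tuned prediction model.

```r
library(shiny)

# Minimal skeleton (assumed layout): a text box for the search string and an
# output pane containing the predicted next word.
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  h4("Predicted next word:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(trimws(input$phrase)) == 0) return("")
    # predict_next() is the hypothetical back-off helper sketched earlier
    predict_next(input$phrase, freq1, freq2, freq3)
  })
}

shinyApp(ui = ui, server = server)
```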