library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RWeka)
The Capstone Milestone Week 2 Report serves as an initial analysis of the dataset, aimed at exploring how it can be used to develop a predictive app. The primary objective of this report is to identify key data features that will support the creation of an effective predictive algorithm. To achieve this, the following steps will be completed: confirming successful data download and loading, generating a basic summary report of statistics across the datasets, highlighting any notable findings so far, and gathering feedback on the proposed approach for building the prediction algorithm and Shiny app.
## Twitter and blogs are read straight from their paths as UTF-8 text,
## skipping any embedded NUL characters.
twitter <- readLines(
  con = "en_US.twitter.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
blogs <- readLines(
  con = "en_US.blogs.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
## The news file contains special characters, so it is read through an
## explicit binary-mode connection instead of by path.
newsConnection <- file(description = "en_US.news.txt", open = "rb")
news <- readLines(con = newsConnection, encoding = "UTF-8")
close(con = newsConnection)
## The connection handle is of no further use once closed.
rm(newsConnection)
The datasets are quite large, so processing and analyzing them in full would take a significant amount of time. For this preliminary analysis, a random sample of 1,000 lines is taken from each of the Blogs, News, and Twitter datasets (3,000 lines in total).
## Fixed seed so the random sample is reproducible between runs
set.seed(5454568)
## Number of lines drawn from each source. The draw is capped at the number
## of lines available so that a short input file does not make sampling
## fail with "cannot take a sample larger than the population".
sampleSize <- 1000
## sample.int(n, k) avoids the 1:length(x) idiom, which misbehaves when a
## vector is empty; for non-empty input it draws the same indices as
## sample(1:length(x), k) under the same seed.
sampleTwitter <- twitter[sample.int(length(twitter),
                                    min(sampleSize, length(twitter)))]
sampleBlogs <- blogs[sample.int(length(blogs),
                                min(sampleSize, length(blogs)))]
sampleNews <- news[sample.int(length(news),
                              min(sampleSize, length(news)))]
## Combine data samples
sampleData <- c(sampleTwitter, sampleBlogs, sampleNews)
## Save sample data and remove data not needed to free memory.
## useBytes = TRUE writes the strings byte-for-byte instead of re-encoding
## them to the session's native encoding; the file is read back below as
## UTF-8, so without it non-ASCII text is corrupted in non-UTF-8 locales.
writeLines(sampleData, "sampleData.txt", useBytes = TRUE)
rm(twitter, news, blogs, sampleTwitter, sampleNews, sampleBlogs, sampleSize)
## Read Sample
sampleData <- readLines("sampleData.txt", encoding = "UTF-8")
## Build a corpus from the sampled lines
corpus <- VCorpus(VectorSource(sampleData))
## Clean the corpus. The order of the steps matters:
##   1. replace separator symbols and non-printable characters with spaces
##   2. convert to lowercase
##   3. remove stopwords while the words are still intact -- the English
##      stopword list contains contractions such as "don't", which no longer
##      match once punctuation is stripped, and stemming rewrites others
##      (e.g. "because" becomes "becaus")
##   4. remove punctuation and numbers, then stem
##   5. collapse whitespace last, because the removal steps leave gaps
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "\"|/|@|\\|")
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
## NOTE(review): this conversion looks redundant now that tolower is wrapped
## in content_transformer() -- confirm against the installed tm version
## before removing it.
corpus <- tm_map(corpus, PlainTextDocument)
#summary(corpus)
## Word cloud of the cleaned corpus: at most 101 words, each occurring at
## least 5 times, placed in random order with half of them rotated, and
## coloured from the 8-colour "Set2" palette.
wordcloud(
  corpus,
  min.freq = 5,
  max.words = 101,
  random.order = TRUE,
  rot.per = 0.5,
  colors = brewer.pal(n = 8, name = "Set2"),
  use.r.layout = FALSE
)
## Warning in wordcloud(corpus, min.freq = 5, max.words = 101, random.order =
## TRUE, : day could not be fit on page. It will not be plotted.
## Warning in wordcloud(corpus, min.freq = 5, max.words = 101, random.order =
## TRUE, : one could not be fit on page. It will not be plotted.
## Pull the cleaned text out of the corpus as a plain character vector with
## one element per document. lapply + unlist has a predictable result type,
## unlike sapply.
corpus.dataframe <- data.frame(
  text = unlist(lapply(corpus, function(doc) doc[["content"]]),
                use.names = FALSE),
  stringsAsFactors = FALSE
)
## Tokenize the text column, not the data frame itself. The tokenizer
## coerces its input with as.character(); applied to a whole data frame that
## collapses the column into a single deparsed string ('c("...", "...")'),
## which adds a spurious "c" token and creates n-grams that run across
## document boundaries. Passing the character vector tokenizes each
## document on its own.
uniGramToken <- data.frame(table(
  NGramTokenizer(corpus.dataframe$text, Weka_control(min = 1, max = 1))
))
biGramToken <- data.frame(table(
  NGramTokenizer(corpus.dataframe$text, Weka_control(min = 2, max = 2))
))
triGramToken <- data.frame(table(
  NGramTokenizer(corpus.dataframe$text, Weka_control(min = 3, max = 3))
))
#order by decreasing frequency
unigram <- uniGramToken[order(uniGramToken$Freq, decreasing = TRUE), ]
bigram <- biGramToken[order(biGramToken$Freq, decreasing = TRUE), ]
trigram <- triGramToken[order(triGramToken$Freq, decreasing = TRUE), ]
## Draw a bar chart of the most frequent n-grams.
##   ngrams  : data frame already sorted by decreasing frequency, with the
##             n-gram in column 1 and its count in column 2
##   fill    : bar colour
##   title   : plot title
##   margins : plot margins passed to par(mar = ...); long n-gram labels
##             need a larger bottom margin
##   n       : number of n-grams to show (default 30)
## head() is used instead of [1:n, ] so that a table with fewer than n rows
## is plotted as-is rather than padded with NA bars.
plotTopNgrams <- function(ngrams, fill, title, margins, n = 30) {
  top <- head(ngrams, n)
  par(mar = margins)
  invisible(barplot(top[, 2],
                    names.arg = top[, 1],
                    col = fill,
                    main = title,
                    las = 2,
                    ylab = "Frequency"))
}

## One chart per plotting page
par(mfrow = c(1, 1))
plotTopNgrams(unigram, "gold",
              "Most commonly used Words (Top 30)",
              c(5, 4, 2, 0))
plotTopNgrams(bigram, "maroon",
              "Most commonly used two word combinations (Top 30)",
              c(8.5, 4, 2, 1))
plotTopNgrams(trigram, "orange",
              "Most commonly used three word combinations (Top 30)",
              c(8.5, 4, 2, 1))
Based on this analysis, I plan to use n-gram data frames to estimate the probability of the next word given the preceding words. For the Shiny app, the goal is a straightforward interface in which the user enters a text string and the prediction model returns a list of suggested next words.