For the milestone project , the main objective is to build the corpus, calculated the tri-gram and bi-gram and unigram term document matrix and perform exploratory analysis on the words. The data is available to be downloaded from
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
The files are extracted from the zip file with three working files:
???en_US.blogs.txt??? ???en_US.news.txt??? ???en_US.twitter.txt???
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 448618 24.0 966465 51.7 NA 630600 33.7
## Vcells 871460 6.7 8388608 64.0 16384 1767212 13.5
getwd()
## [1] "/Users/soni/Desktop/DATA_SCIENCE specialization/DataScienceCapstone/Week1"
library(NLP)
library(tm)
library(readtext)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(SnowballC)
library(ngram)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
blogs <- readLines('final/en_US/en_US.blogs.txt')
news <- readLines('final/en_US/en_US.news.txt')
twitter <- readLines('final/en_US/en_US.twitter.txt')
## Warning in readLines("final/en_US/en_US.twitter.txt"): line 167155 appears
## to contain an embedded nul
## Warning in readLines("final/en_US/en_US.twitter.txt"): line 268547 appears
## to contain an embedded nul
## Warning in readLines("final/en_US/en_US.twitter.txt"): line 1274086 appears
## to contain an embedded nul
## Warning in readLines("final/en_US/en_US.twitter.txt"): line 1759032 appears
## to contain an embedded nul
blogs<-iconv(blogs, "UTF-8", "ASCII", sub="")
news<-iconv(news, "UTF-8", "ASCII", sub="")
twitter <- iconv(twitter, "UTF-8", "ASCII", sub="")
As the original data files (Blogs, News and Twitter) are extremely large, a small sample will be generated to study the data. A 10% of the contents of each of the data (Blogs, News and Twitter) will be sampled to create the corpus.
sample_size<-.1
blogs_index<-sample(seq_len(length(blogs)),length(blogs)*sample_size)
news_index<-sample(seq_len(length(news)),length(news)*sample_size)
twitter_index<-sample(seq_len(length(twitter)),length(twitter)*sample_size)
blogs<-blogs[blogs_index[]]
news<-news[news_index[]]
twitter<-twitter[twitter_index[]]
blogs<- paste(blogs , collapse = ' ')
news<- paste(news , collapse = ' ')
twitter<- paste(twitter , collapse = ' ')
As observed, in the given dataset there are numerous characters, words, numbers and punctuations that are not relevant to the prediction exercise. Therefore, few functions will be created to clean the corpus before the actual analysis can be performed. The transformation is performed using tm_map and it includes cleaning the urls,repeating words, punctuation,stripwhitespaces , non ASCII words,numbers.
vdocs <- VCorpus(VectorSource(c(blogs, news, twitter)),readerControl=list(reader=readPlain,language="en"))
vdocs <- tm_map(vdocs, stripWhitespace)
vdocs <- tm_map(vdocs, removePunctuation)
vdocs <- tm_map(vdocs, content_transformer(tolower))
vdocs <- tm_map(vdocs, removeWords, stopwords("english"))
vdocs <- tm_map(vdocs, stemDocument)
vdocs <- tm_map(vdocs, removeNumbers)
vdocs<-tm_map(vdocs,PlainTextDocument)
It is a sequence of single words from the corpus
unigram_NLPtrigramTokenizer <- function(x) {
unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
}
dtm_NLP_unigram<- DocumentTermMatrix(vdocs ,control=list(tokenize = unigram_NLPtrigramTokenizer) )
m1 <- as.matrix(dtm_NLP_unigram)
v1 <- sort(colSums(m1),decreasing=TRUE)
d1<- data.frame(word = names(v1),freq=v1)
head(d1,10)
## word freq
## will will 32020
## one one 30790
## said said 30478
## just just 30451
## get get 30042
## like like 29623
## time time 25754
## can can 24607
## day day 22546
## year year 21617
plotd1<-d1[1:20,]
barplot(plotd1$freq, las = 2, names.arg = plotd1$word,
col ="pink", main ="Most frequent words",
ylab = "Word frequencies")
It is a sequence of two words from the corpus.
bigram_NLPtrigramTokenizer <- function(x) {
unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}
dtm_NLP_bigram<- DocumentTermMatrix(vdocs ,control=list(tokenize = bigram_NLPtrigramTokenizer) )
m2 <- as.matrix(dtm_NLP_bigram)
v2 <- sort(colSums(m2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
head(d2,10)
## word freq
## right now right now 2490
## last year last year 2307
## look like look like 2087
## new york new york 2024
## dont know dont know 1947
## cant wait cant wait 1941
## feel like feel like 1708
## look forward look forward 1621
## year ago year ago 1620
## last night last night 1593
plotd2<-d2[1:20,]
barplot(plotd2$freq, las = 2, names.arg = plotd2$word,
col ="yellow", main ="Most frequent words",
ylab = "Word frequencies")
It is a sequence of three words from the corpus.
trigram_NLPtrigramTokenizer <- function(x) {
unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
}
dtm_NLP_unigram<- DocumentTermMatrix(vdocs ,control=list(tokenize = trigram_NLPtrigramTokenizer) )
m3 <- as.matrix(dtm_NLP_unigram)
v3 <- sort(colSums(m3),decreasing=TRUE)
d3 <- data.frame(word = names(v3),freq=v3)
head(d3,10)
## word freq
## cant wait see cant wait see 378
## happi mother day happi mother day 334
## new york citi new york citi 273
## let us know let us know 256
## happi new year happi new year 213
## presid barack obama presid barack obama 197
## im pretti sure im pretti sure 184
## two year ago two year ago 156
## new york time new york time 152
## look forward see look forward see 147
plotd3<-d3[1:20,]
barplot(plotd3$freq, las = 2, names.arg = plotd3$word,
col ="blue", main ="Most frequent words",
ylab = "Word frequencies")
wordcloud(d1$word, d1$freq, max.words=100, random.order=FALSE, rot.per=0, fixed.asp = TRUE, use.r.layout = FALSE, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(d1$word, d1$freq, max.words = 100, random.order =
## FALSE, : your could not be fit on page. It will not be plotted.
## Warning in wordcloud(d1$word, d1$freq, max.words = 100, random.order =
## FALSE, : talk could not be fit on page. It will not be plotted.
## Warning in wordcloud(d1$word, d1$freq, max.words = 100, random.order =
## FALSE, : long could not be fit on page. It will not be plotted.
wordcloud(d2$word, d2$freq, max.words=100, random.order=FALSE, rot.per=0, fixed.asp = TRUE, use.r.layout = FALSE, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : high school could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : make sure could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : happi birthday could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : even though could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : come back could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : year old could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : good morn could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : sound like could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : next year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : everi day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : unit state could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : follow back could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : can see could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : get back could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : good luck could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : thank much could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : new year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : two year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : one thing could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : just want could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : seem like could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : last month could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : mother day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : san francisco could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : everi time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : dont like could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : can make could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : long time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : will take could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : will make could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : mani peopl could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : dont get could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : good thing could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : didnt know could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : im gonna could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : thank rt could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : san diego could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : want see could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : pretti much could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : three year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : two week could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : wait see could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : next time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : know im could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : health care could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : best friend could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : just one could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : can help could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : great day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : will never could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : make sens could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : need get could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : take care could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : realli good could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : good time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : realli want could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : pretti good could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : pleas follow could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : get better could not be fit on page. It will not be plotted.
wordcloud(d3$word, d3$freq, max.words=100, random.order=FALSE, rot.per=0, fixed.asp = TRUE, use.r.layout = FALSE, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : love love love could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : two week ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : right now im could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dream come true could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : four year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : ive ever seen could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : follow follow back could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : want make sure could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : high school student could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : three year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : make feel like could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : martin luther king could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : didnt even know could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : pleas pleas pleas could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : georg w bush could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : happi valentin day could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : cant wait hear could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : make feel better could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : osama bin laden could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : realli look forward could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : cent per share could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : coupl week ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont think can could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont get wrong could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : five year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : past two year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : spend much time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : come join us could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : spend lot time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : think im go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : superior court judg could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : long time ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : sever year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : good morn everyon could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : us district court could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : keep good work could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : ill let know could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : im just go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : look forward meet could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : cant wait go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont know im could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : im big fan could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : everi singl day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : wall street journal could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : year ago today could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : attorney general offic could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : occupi wall street could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : today good day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : follow look forward could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : senior vice presid could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : thank follow look could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : chief execut offic could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : counti sheriff offic could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont want go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : just make sure could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : thank follow us could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : coupl year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont realli know could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : hope everyon great could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : im go go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : just want say could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : last two year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : luther king jr could not be fit on page. It will not be plotted.
After the exploratory analysis, I think we can start building the predictive model(s) and eventually the data product. Here is my further steps: Building the predictive model(s) by using the tokens. Develop data product (i.e. shiny app) to make word prediction based on user inputs.