Data Science Capstone Milestone Project

For the milestone project, the main objective is to build the corpus, calculate the unigram, bigram and trigram document-term matrices, and perform an exploratory analysis of the words. The data can be downloaded from

https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The files are extracted from the zip file with three working files:

"en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"

Loading the libraries used in the analysis

# Session housekeeping before the analysis.
# There is deliberately no `rm(list = ls())` here: a knitted report already
# starts in a fresh session, and clearing the global environment would
# silently destroy the reader's workspace if this chunk were run interactively.

# Trigger a garbage collection and print the current memory usage.
gc()
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 448618 24.0     966465 51.7         NA   630600 33.7
## Vcells 871460  6.7    8388608 64.0      16384  1767212 13.5

# Show the working directory; the relative data paths used later
# ("final/en_US/...") are resolved against it.
getwd()
## [1] "/Users/soni/Desktop/DATA_SCIENCE specialization/DataScienceCapstone/Week1"
library(NLP)
library(tm)
library(readtext)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(SnowballC)
library(ngram)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

Data loading for the given dataset

# Read the three English source files, one vector element per line.
# skipNul = TRUE: the Twitter file contains embedded NUL bytes. With the
# default (skipNul = FALSE) readLines() warns and ends the current line at the
# NUL, so the remainder of those tweets is lost.
blogs <- readLines("final/en_US/en_US.blogs.txt", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", skipNul = TRUE)
twitter <- readLines("final/en_US/en_US.twitter.txt", skipNul = TRUE)

# Convert from UTF-8 to ASCII. sub = "" deletes every character that has no
# ASCII representation instead of turning the whole element into NA.
blogs <- iconv(blogs, "UTF-8", "ASCII", sub = "")
news <- iconv(news, "UTF-8", "ASCII", sub = "")
twitter <- iconv(twitter, "UTF-8", "ASCII", sub = "")

As the original data files (Blogs, News and Twitter) are extremely large, a small sample will be generated to study the data. A random 10% of the lines of each source (Blogs, News and Twitter) will be sampled to create the corpus.

# Fraction of the lines of each source that is kept for the analysis.
sample_size <- 0.1

# Fix the random seed so that the sample, and therefore every frequency table
# and plot derived from it, is the same on every run.
set.seed(1234)

# Draw line indices without replacement. floor() makes the number of sampled
# lines an explicit integer rather than passing a fractional size to sample().
blogs_index <- sample(
  seq_along(blogs),
  floor(length(blogs) * sample_size)
)
news_index <- sample(
  seq_along(news),
  floor(length(news) * sample_size)
)
twitter_index <- sample(
  seq_along(twitter),
  floor(length(twitter) * sample_size)
)

# Keep only the sampled lines.
blogs <- blogs[blogs_index]
news <- news[news_index]
twitter <- twitter[twitter_index]

# Collapse each sample into a single string, so that each source becomes
# exactly one document when the corpus is built.
blogs <- paste(blogs, collapse = " ")
news <- paste(news, collapse = " ")
twitter <- paste(twitter, collapse = " ")

Creation and Cleaning of the corpus

As observed, the dataset contains numerous characters, words, numbers and punctuation marks that are not relevant to the prediction exercise. Therefore, the corpus is cleaned before the actual analysis is performed. The transformations are applied with tm_map: stripping extra whitespace, removing punctuation, converting to lower case, removing English stop words, stemming and removing numbers. Non-ASCII characters were already dropped by the iconv step when the data was loaded.

# Build a three-document corpus: one document each for blogs, news and twitter.
vdocs <- VCorpus(
  VectorSource(c(blogs, news, twitter)),
  readerControl = list(reader = readPlain, language = "en")
)

# Cleaning pipeline. The order matters:
# 1. Lower-case first, because removeWords() is case sensitive and the stop
#    word list is lower case.
vdocs <- tm_map(vdocs, content_transformer(tolower))
# 2. Remove stop words BEFORE punctuation. The English stop word list contains
#    apostrophe forms such as "don't", "can't" and "i'm"; once punctuation is
#    stripped these become "dont", "cant", "im" and no longer match the list.
vdocs <- tm_map(vdocs, removeWords, stopwords("english"))
# 3. Strip punctuation and digits before stemming so the stemmer sees plain
#    word tokens.
vdocs <- tm_map(vdocs, removePunctuation)
vdocs <- tm_map(vdocs, removeNumbers)
vdocs <- tm_map(vdocs, stemDocument)
# 4. Collapse whitespace last: every removal above leaves gaps behind.
vdocs <- tm_map(vdocs, stripWhitespace)

# Re-wrap every document as a PlainTextDocument, as in the original pipeline.
# NOTE(review): presumably a workaround for older tm versions; confirm whether
# it is still required.
vdocs <- tm_map(vdocs, PlainTextDocument)

Creation of the document term matrix

Unigram

It is a sequence of single words from the corpus

# Tokenizer passed to DocumentTermMatrix(): splits a document into words and
# returns every 1-gram as a character vector.
unigram_NLPtrigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 1)
  joined <- lapply(grams, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}

# Count unigrams per document.
dtm_NLP_unigram <- DocumentTermMatrix(
  vdocs,
  control = list(tokenize = unigram_NLPtrigramTokenizer)
)

# Sum the counts over all documents and rank the terms by total frequency.
m1 <- as.matrix(dtm_NLP_unigram)
v1 <- sort(colSums(m1), decreasing = TRUE)
d1 <- data.frame(word = names(v1), freq = v1)
head(d1, 10)
##      word  freq
## will will 32020
## one   one 30790
## said said 30478
## just just 30451
## get   get 30042
## like like 29623
## time time 25754
## can   can 24607
## day   day 22546
## year year 21617

# Bar chart of the 20 most frequent unigrams (las = 2: labels perpendicular
# to the axis).
plotd1 <- d1[1:20, ]
barplot(
  height = plotd1$freq,
  names.arg = plotd1$word,
  las = 2,
  col = "pink",
  main = "Most frequent words",
  ylab = "Word frequencies"
)

bigram

It is a sequence of two words from the corpus.

# Tokenizer passed to DocumentTermMatrix(): splits a document into words and
# returns every 2-gram as a single space-separated string.
bigram_NLPtrigramTokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}

# Count bigrams per document.
dtm_NLP_bigram <- DocumentTermMatrix(
  vdocs,
  control = list(tokenize = bigram_NLPtrigramTokenizer)
)

# Sum the counts over all documents and rank the bigrams by total frequency.
m2 <- as.matrix(dtm_NLP_bigram)
v2 <- sort(colSums(m2), decreasing = TRUE)
d2 <- data.frame(word = names(v2), freq = v2)
head(d2, 10)
##                      word freq
## right now       right now 2490
## last year       last year 2307
## look like       look like 2087
## new york         new york 2024
## dont know       dont know 1947
## cant wait       cant wait 1941
## feel like       feel like 1708
## look forward look forward 1621
## year ago         year ago 1620
## last night     last night 1593

# Bar chart of the 20 most frequent bigrams. The title and axis label name
# bigrams explicitly so this chart cannot be mistaken for the unigram chart.
plotd2 <- d2[1:20, ]
barplot(plotd2$freq, las = 2, names.arg = plotd2$word,
        col = "yellow", main = "Most frequent bigrams",
        ylab = "Bigram frequencies")

trigram

It is a sequence of three words from the corpus.

# Tokenizer passed to DocumentTermMatrix(): splits a document into words and
# returns every 3-gram as a single space-separated string.
trigram_NLPtrigramTokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
}

# Count trigrams per document. The result is stored under its own name;
# assigning it to `dtm_NLP_unigram` would silently replace the unigram matrix
# with trigram counts.
dtm_NLP_trigram <- DocumentTermMatrix(
  vdocs,
  control = list(tokenize = trigram_NLPtrigramTokenizer)
)

# Sum the counts over all documents and rank the trigrams by total frequency.
m3 <- as.matrix(dtm_NLP_trigram)
v3 <- sort(colSums(m3), decreasing = TRUE)
d3 <- data.frame(word = names(v3), freq = v3)
head(d3, 10)
##                                    word freq
## cant wait see             cant wait see  378
## happi mother day       happi mother day  334
## new york citi             new york citi  273
## let us know                 let us know  256
## happi new year           happi new year  213
## presid barack obama presid barack obama  197
## im pretti sure           im pretti sure  184
## two year ago               two year ago  156
## new york time             new york time  152
## look forward see       look forward see  147

# Bar chart of the 20 most frequent trigrams, labelled as trigrams.
plotd3 <- d3[1:20, ]

barplot(plotd3$freq, las = 2, names.arg = plotd3$word,
        col = "blue", main = "Most frequent trigrams",
        ylab = "Trigram frequencies")

Creation of the wordcloud

unigram

# Word cloud of the 100 most frequent unigrams.
# scale (largest, smallest word size) is lowered from the default c(4, 0.5)
# because the recorded run dropped terms with "could not be fit on page".
# Reduce it further if the warning still appears on your graphics device.
wordcloud(
  words = d1$word,
  freq = d1$freq,
  scale = c(3, 0.4),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0,
  fixed.asp = TRUE,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(d1$word, d1$freq, max.words = 100, random.order =
## FALSE, : your could not be fit on page. It will not be plotted.
## Warning in wordcloud(d1$word, d1$freq, max.words = 100, random.order =
## FALSE, : talk could not be fit on page. It will not be plotted.
## Warning in wordcloud(d1$word, d1$freq, max.words = 100, random.order =
## FALSE, : long could not be fit on page. It will not be plotted.

bigram

# Word cloud of the 100 most frequent bigrams.
# Two-word terms are much wider than single words: with the default
# scale = c(4, 0.5) the recorded run could not place most of them ("could not
# be fit on page"). A smaller scale leaves room for more terms; reduce it
# further if the warning still appears on your graphics device.
wordcloud(
  words = d2$word,
  freq = d2$freq,
  scale = c(2, 0.3),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0,
  fixed.asp = TRUE,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : high school could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : make sure could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : happi birthday could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : even though could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : come back could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : year old could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : good morn could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : sound like could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : next year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : everi day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : unit state could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : follow back could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : can see could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : get back could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : good luck could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : thank much could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : new year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : two year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : one thing could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : just want could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : seem like could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : last month could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : mother day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : san francisco could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : everi time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : dont like could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : can make could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : long time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : will take could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : will make could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : mani peopl could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : dont get could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : good thing could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : didnt know could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : im gonna could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : thank rt could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : san diego could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : want see could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : pretti much could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : three year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : two week could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : wait see could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : next time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : know im could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : health care could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : best friend could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : just one could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : can help could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : great day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : will never could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : make sens could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : need get could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : take care could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : realli good could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : good time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : realli want could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : pretti good could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : pleas follow could not be fit on page. It will not be plotted.
## Warning in wordcloud(d2$word, d2$freq, max.words = 100, random.order =
## FALSE, : get better could not be fit on page. It will not be plotted.

trigram

# Word cloud of the 100 most frequent trigrams.
# Three-word terms are the widest labels in the report: with the default
# scale = c(4, 0.5) the recorded run could not place many of them ("could not
# be fit on page"). A smaller scale leaves room for more terms; reduce it
# further if the warning still appears on your graphics device.
wordcloud(
  words = d3$word,
  freq = d3$freq,
  scale = c(1.5, 0.3),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0,
  fixed.asp = TRUE,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : love love love could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : two week ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : right now im could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dream come true could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : four year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : ive ever seen could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : follow follow back could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : want make sure could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : high school student could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : three year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : make feel like could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : martin luther king could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : didnt even know could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : pleas pleas pleas could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : georg w bush could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : happi valentin day could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : cant wait hear could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : make feel better could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : osama bin laden could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : realli look forward could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : cent per share could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : coupl week ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont think can could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont get wrong could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : five year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : past two year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : spend much time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : come join us could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : spend lot time could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : think im go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : superior court judg could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : long time ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : sever year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : good morn everyon could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : us district court could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : keep good work could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : ill let know could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : im just go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : look forward meet could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : cant wait go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont know im could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : im big fan could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : everi singl day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : wall street journal could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : year ago today could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : attorney general offic could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : occupi wall street could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : today good day could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : follow look forward could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : senior vice presid could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : thank follow look could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : chief execut offic could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : counti sheriff offic could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont want go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : just make sure could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : thank follow us could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : coupl year ago could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : dont realli know could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order
## = FALSE, : hope everyon great could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : im go go could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : just want say could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : last two year could not be fit on page. It will not be plotted.
## Warning in wordcloud(d3$word, d3$freq, max.words = 100, random.order =
## FALSE, : luther king jr could not be fit on page. It will not be plotted.

Further Development Plan

After the exploratory analysis, I think we can start building the predictive model(s) and eventually the data product. Here are my next steps: build the predictive model(s) using the tokens, then develop a data product (i.e. a Shiny app) that predicts words based on user input.