The goal of this project is to demonstrate familiarity with the data and to show that work is on track toward the prediction algorithm. This report describes the exploratory analysis and the goals for the eventual app and algorithm. The document is concise: it identifies only the major features of the data and briefly summarizes the plans for creating the prediction algorithm and Shiny app in a way that a non-data-scientist manager can understand. Tables and plots illustrate important summaries of the data set. The motivation for this project is to show that the data have been downloaded and loaded, to report basic summary statistics and interesting findings, and to outline the plan for the prediction algorithm and Shiny app.
The zip file for the Capstone project is available for download here.
Three working files are extracted from the zip file: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
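As a minimal sketch (the archive name and extraction path are assumptions based on the file paths used below), the files can be extracted and checked as follows:
# Sketch only: extract the downloaded archive and confirm the English files
# are available under en_US/ (the archive name is an assumption)
unzip("Coursera-SwiftKey.zip")
list.files("en_US")  # en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt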
The libraries needed for this project are loaded first.
library(NLP) # Natural language processing
library(tm) # Text mining
library(stringi) # string processing (character and word counts)
library(RWeka) # tokenizer - create unigrams, bigrams, trigrams
library(ggplot2) # for visualization
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
blogsEN <- readLines("en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
newsEN <- readLines("en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitterEN <- readLines("en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
Basic summaries of the full files are computed first: file size, number of lines, number of characters, and number of words. Non-English characters are then removed and each data set is sampled down to 1% of its original size.
size_blogs <- file.info("en_US/en_US.blogs.txt")$size / 1024^2 # MB
size_news <- file.info("en_US/en_US.news.txt")$size / 1024^2 # MB
size_twitter <- file.info("en_US/en_US.twitter.txt")$size / 1024^2 # MB
# Count the number of lines
length_blogs <- length(blogsEN) # number of lines for blogs
length_news <- length(newsEN) # number of lines for news
length_twitter <- length(twitterEN) # number of lines for twitter
# Count the number of characters
nchar_blogs <- sum(nchar(blogsEN)) # number of characters for blogs
nchar_news <- sum(nchar(newsEN)) # number of characters for news
nchar_twitter <- sum(nchar(twitterEN)) # number of characters for twitter
# Count the number of words
nword_blogs <- sum(stri_count_words(blogsEN)) # number of words for blogs
nword_news <- sum(stri_count_words(newsEN)) # number of words for news
nword_twitter <- sum(stri_count_words(twitterEN)) # number of words for twitter
# A table is created
data.frame(file.name = c("blogs", "news", "twitter"),
files.size.MB = c(size_blogs,size_news,size_twitter),
num.lines = c(length_blogs,length_news,length_twitter),
num.character = c(nchar_blogs,nchar_news,nchar_twitter),
num.words = c(nword_blogs,nword_news,nword_twitter))
## file.name files.size.MB num.lines num.character num.words
## 1 blogs 200.4242 899288 206824505 37546239
## 2 news 196.2775 77259 15639408 2674536
## 3 twitter 159.3641 2360148 162096031 30093372
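The news file appears truncated in the table above (77,259 lines): on some platforms readLines stops early at an embedded control character in en_US.news.txt. If that happens, one possible workaround (a sketch using the same file path) is to read the file through a binary connection:
con <- file("en_US/en_US.news.txt", open = "rb")
newsEN <- readLines(con, warn = FALSE, encoding = "UTF-8")  # reads past the control character
close(con)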
set.seed(12345)
blogs1 <- iconv(blogsEN, "latin1", "ASCII", sub = "")
news1 <- iconv(newsEN, "latin1", "ASCII", sub = "")
twitter1 <- iconv(twitterEN, "latin1", "ASCII", sub = "")
# Sample only 1% of each file
sample_data <- c(sample(blogs1, round(length(blogs1) * 0.01)),
                 sample(news1, round(length(news1) * 0.01)),
                 sample(twitter1, round(length(twitter1) * 0.01)))
corpus <- VCorpus(VectorSource(sample_data))
corpusOne <- tm_map(corpus, removePunctuation) # remove punctuation
corpusTwo <- tm_map(corpusOne, stripWhitespace) # remove extra whitespace
corpusThree <- tm_map(corpusTwo, content_transformer(tolower)) # convert to lowercase
corpusFour <- tm_map(corpusThree, removeNumbers) # remove numbers
corpusFive <- tm_map(corpusFour, PlainTextDocument)
corpusSix <- tm_map(corpusFive, removeWords, stopwords("english")) # remove English stop words
N-grams are built using RWeka to extract 1-grams, 2-grams, and 3-grams from the text corpus.
A 1-gram is a contiguous sequence of a single word from the corpus.
# RWeka functions tokenize the sample and construct term-document matrices of 1-grams, 2-grams, and 3-grams
one <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
two <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
three <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
one_table <- TermDocumentMatrix(corpusSix, control = list(tokenize = one))
two_table <- TermDocumentMatrix(corpusSix, control = list(tokenize = two))
three_table <- TermDocumentMatrix(corpusSix, control = list(tokenize = three))
# Find frequently occurring terms in each of the three matrices
one_corpus <- findFreqTerms(one_table, lowfreq = 1000)
two_corpus <- findFreqTerms(two_table, lowfreq = 80)
three_corpus <- findFreqTerms(three_table, lowfreq = 10)
one_corpus_num <- rowSums(as.matrix(one_table[one_corpus, ]))
one_corpus_table <- data.frame(Word = names(one_corpus_num), frequency = one_corpus_num)
one_corpus_sort <- one_corpus_table[order(-one_corpus_table$frequency), ]
head(one_corpus_sort)
## Word frequency
## just just 2576
## like like 2218
## will will 2211
## one one 2049
## get get 1869
## can can 1866
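Since the wordcloud and RColorBrewer packages are loaded above, the same unigram frequencies can also be displayed as a word cloud. A minimal sketch, assuming the one_corpus_sort table built above:
# Word cloud of the most frequent unigrams (illustrative sketch)
wordcloud(words = one_corpus_sort$Word, freq = one_corpus_sort$frequency,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))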
A 2-gram is a contiguous sequence of two words from the corpus.
two_corpus_num <- rowSums(as.matrix(two_table[two_corpus, ]))
two_corpus_table <- data.frame(Word = names(two_corpus_num), frequency = two_corpus_num)
two_corpus_sort <- two_corpus_table[order(-two_corpus_table$frequency), ]
head(two_corpus_sort)
## Word frequency
## cant wait cant wait 208
## right now right now 206
## dont know dont know 164
## last night last night 148
## im going im going 130
## feel like feel like 125
A 3-gram is a contiguous sequence of three words from the corpus.
three_corpus_num <- rowSums(as.matrix(three_table[three_corpus, ]))
three_corpus_table <- data.frame(Word = names(three_corpus_num), frequency = three_corpus_num)
three_corpus_sort <- three_corpus_table[order(-three_corpus_table$frequency), ]
head(three_corpus_sort)
## Word frequency
## cant wait see cant wait see 45
## happy mothers day happy mothers day 36
## happy new year happy new year 24
## im pretty sure im pretty sure 18
## italy lakes holidays italy lakes holidays 18
## little italy boston little italy boston 17
Three bar plots visualize the most frequent terms for each n-gram type.
one_g <- ggplot(one_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency))
one_g <- one_g + geom_bar(stat = "identity", fill = "#FFFF00", colour = "white")
one_g <- one_g + labs(title = "Unigrams", x = "Words", y = "Frequency")
one_g <- one_g + theme(axis.text.x = element_text(angle = 90))
one_g
two_g <- ggplot(two_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency))
two_g <- two_g + geom_bar(stat = "identity", fill = "#00FFFF", colour = "white")
two_g <- two_g + labs(title = "Bigrams", x = "Words", y = "Frequency")
two_g <- two_g + theme(axis.text.x = element_text(angle = 90))
two_g
three_g <- ggplot(three_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency))
three_g <- three_g + geom_bar(stat = "identity", fill = "#FF00FF", colour = "white")
three_g <- three_g + labs(title = "Trigrams", x = "Words", y = "Frequency")
three_g <- three_g + theme(axis.text.x = element_text(angle = 90))
three_g
A predictive algorithm will be built and deployed as a Shiny app that predicts the most likely next word after the user types a phrase.
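As an illustration of the planned approach only (not the final algorithm), the n-gram tables built above can already support a simple frequency-based lookup: find the most frequent trigram that starts with the last two words typed, and fall back to the bigram table if nothing matches. The helper below is a hypothetical sketch that assumes the two_corpus_sort and three_corpus_sort data frames from this report.
predict_next_word <- function(phrase,
                              bigrams = two_corpus_sort,
                              trigrams = three_corpus_sort) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  # Try the trigram table first: most frequent trigram beginning with the last two words
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- trigrams[startsWith(as.character(trigrams$Word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(tail(strsplit(as.character(hits$Word[1]), " ")[[1]], 1))
  }
  # Back off to the bigram table: most frequent bigram beginning with the last word
  hits <- bigrams[startsWith(as.character(bigrams$Word), paste0(words[n], " ")), ]
  if (nrow(hits) > 0) return(tail(strsplit(as.character(hits$Word[1]), " ")[[1]], 1))
  NA_character_  # no suggestion found
}
predict_next_word("cant wait")  # suggests "see", consistent with the trigram counts above
In the Shiny app, the phrase typed by the user would be passed to a function like this and the suggested word displayed.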