Introduction

The goal of this project is to demonstrate familiarity with the data and to show that work is on track toward the prediction algorithm. This report explains the exploratory analysis and the goals for the eventual app and algorithm. It is kept concise: it identifies the major features of the data and briefly summarizes the plans for the prediction algorithm and Shiny app in a way that a non-data-scientist manager can understand. Tables and plots illustrate the important summaries of the data set. The motivation for this project is to:

  1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings amassed so far.
  4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Data Preparation

The zip file for the Capstone project is available for download here.

Three working files are extracted from the zip archive: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
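
For completeness, a minimal sketch of the download-and-extract step is shown below; the URL placeholder is hypothetical and should be replaced with the actual link from the course page.

zip_url <- "<capstone-dataset-url>" # placeholder, substitute the real link
if (!dir.exists("en_US")) {
  download.file(zip_url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}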

Load Libraries

The libraries needed for this project are loaded first:

library(NLP) # Natural language processing
library(tm) # Text mining
library(stringi) # string processing and text statistics
library(RWeka) # tokenizer - create unigrams, bigrams, trigrams
library(ggplot2) # for visualization
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
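
RWeka depends on rJava, and tokenizing even a small sample can exhaust the default Java heap. If that happens, enlarging the heap before the packages load helps; an optional sketch (the 2g value is an assumption, tune it to the machine):

# Must run before library(RWeka) so the JVM starts with a larger heap
options(java.parameters = "-Xmx2g")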

Data Loading

blogsEN <- readLines("en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
newsEN <- readLines("en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitterEN <- readLines("en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
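
Note: the summary table in the next section reports far fewer lines for the news file than the raw file contains, which typically happens when embedded nul characters stop readLines early. If that occurs, re-reading with skipNul = TRUE recovers the full file:

newsEN <- readLines("en_US/en_US.news.txt", warn = FALSE,
                    encoding = "UTF-8", skipNul = TRUE)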

Data Preprocessing

Basic statistics (file size, line, character, and word counts) are computed for each file; non-English characters are then removed and each dataset is sampled down to 1% of its original size.

size_blogs <- file.info("en_US/en_US.blogs.txt")$size / 1024^2 # size in MB
size_news <- file.info("en_US/en_US.news.txt")$size / 1024^2 # size in MB
size_twitter <- file.info("en_US/en_US.twitter.txt")$size / 1024^2 # size in MB
# Count the number of lines
length_blogs <- length(blogsEN)
length_news <- length(newsEN)
length_twitter <- length(twitterEN)
# Count the number of characters
nchar_blogs <- sum(nchar(blogsEN))
nchar_news <- sum(nchar(newsEN))
nchar_twitter <- sum(nchar(twitterEN))
# Count the number of words
nword_blogs <- sum(stri_count_words(blogsEN))
nword_news <- sum(stri_count_words(newsEN))
nword_twitter <- sum(stri_count_words(twitterEN))
# Summarise the three files in a table
data.frame(file.name = c("blogs", "news", "twitter"),
           files.size.MB = c(size_blogs, size_news, size_twitter),
           num.lines = c(length_blogs, length_news, length_twitter),
           num.character = c(nchar_blogs, nchar_news, nchar_twitter),
           num.words = c(nword_blogs, nword_news, nword_twitter))
##   file.name files.size.MB num.lines num.character num.words
## 1     blogs      200.4242    899288     206824505  37546239
## 2      news      196.2775     77259      15639408   2674536
## 3   twitter      159.3641   2360148     162096031  30093372
set.seed(12345)
# Strip non-ASCII (non-English) characters
blogs1 <- iconv(blogsEN, "latin1", "ASCII", sub = "")
news1 <- iconv(newsEN, "latin1", "ASCII", sub = "")
twitter1 <- iconv(twitterEN, "latin1", "ASCII", sub = "")
# Sample only 1% of each file
sample_data <- c(sample(blogs1, length(blogs1) * 0.01),
                 sample(news1, length(news1) * 0.01),
                 sample(twitter1, length(twitter1) * 0.01))
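
Because the raw files are large, caching the 1% sample to disk avoids reloading everything in later sessions; an optional sketch:

# Optional: cache the sample for reuse
writeLines(sample_data, "sample_data.txt")
# sample_data <- readLines("sample_data.txt")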

Transform the corpus

corpus <- VCorpus(VectorSource(sample_data))
corpusOne <- tm_map(corpus, removePunctuation) # remove punctuation
corpusTwo <- tm_map(corpusOne, stripWhitespace) # collapse repeated whitespace
corpusThree <- tm_map(corpusTwo, tolower) # convert to lowercase (yields plain character vectors)
corpusFour <- tm_map(corpusThree, removeNumbers) # remove numbers
corpusFive <- tm_map(corpusFour, PlainTextDocument) # wrap back into PlainTextDocuments for tm
corpusSix <- tm_map(corpusFive, removeWords, stopwords("english")) # remove English stop words
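
To sanity-check the cleaning steps, the first few documents of the processed corpus can be printed; a quick inspection sketch:

# Peek at the first two cleaned documents to verify the transformations
lapply(corpusSix[1:2], as.character)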

N-Grams

N-grams are built using RWeka, which tokenizes the text corpus into 1-grams, 2-grams, and 3-grams.

1-Gram

A 1-gram (unigram) is a single word from the corpus.

# RWeka tokenizer functions split the sample into 1-grams, 2-grams, and
# 3-grams, from which term-document matrices are constructed.
one <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
two <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
three <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
one_table <- TermDocumentMatrix(corpusSix, control = list(tokenize = one))
two_table <- TermDocumentMatrix(corpusSix, control = list(tokenize = two))
three_table <- TermDocumentMatrix(corpusSix, control = list(tokenize = three))

# Find frequent terms in each matrix (thresholds chosen to keep the lists short)
one_corpus <- findFreqTerms(one_table, lowfreq = 1000)
two_corpus <- findFreqTerms(two_table, lowfreq = 80)
three_corpus <- findFreqTerms(three_table, lowfreq = 10)
# Sum counts across documents and sort unigrams by frequency
one_corpus_num <- rowSums(as.matrix(one_table[one_corpus, ]))
one_corpus_table <- data.frame(Word = names(one_corpus_num), frequency = one_corpus_num)
one_corpus_sort <- one_corpus_table[order(-one_corpus_table$frequency), ]
head(one_corpus_sort)
##      Word frequency
## just just      2576
## like like      2218
## will will      2211
## one   one      2049
## get   get      1869
## can   can      1866
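
The wordcloud and RColorBrewer packages loaded earlier can display the same unigram frequencies graphically; a sketch using the sorted table above:

# Word cloud of the 100 most frequent unigrams
wordcloud(words = one_corpus_sort$Word, freq = one_corpus_sort$frequency,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))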

2-Grams

A 2-gram (bigram) is a contiguous sequence of two words from the corpus.

# Sum counts across documents and sort bigrams by frequency
two_corpus_num <- rowSums(as.matrix(two_table[two_corpus, ]))
two_corpus_table <- data.frame(Word = names(two_corpus_num), frequency = two_corpus_num)
two_corpus_sort <- two_corpus_table[order(-two_corpus_table$frequency), ]
head(two_corpus_sort)
##                  Word frequency
## cant wait   cant wait       208
## right now   right now       206
## dont know   dont know       164
## last night last night       148
## im going     im going       130
## feel like   feel like       125

3-Grams

A 3-gram (trigram) is a contiguous sequence of three words from the corpus.

# Sum counts across documents and sort trigrams by frequency
three_corpus_num <- rowSums(as.matrix(three_table[three_corpus, ]))
three_corpus_table <- data.frame(Word = names(three_corpus_num), frequency = three_corpus_num)
three_corpus_sort <- three_corpus_table[order(-three_corpus_table$frequency), ]
head(three_corpus_sort)
##                                      Word frequency
## cant wait see               cant wait see        45
## happy mothers day       happy mothers day        36
## happy new year             happy new year        24
## im pretty sure             im pretty sure        18
## italy lakes holidays italy lakes holidays        18
## little italy boston   little italy boston        17

Exploratory Analysis

Three bar plots visualize the frequency distribution of the top ten unigrams, bigrams, and trigrams.

one_g <- ggplot(one_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency))
one_g <- one_g + geom_bar(stat = "identity", fill = "#FFFF00", colour = "white")
one_g <- one_g + labs(title = "Unigrams", x = "Words", y = "Frequency")
one_g <- one_g + theme(axis.text.x = element_text(angle = 90))
one_g

two_g <- ggplot(two_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency))
two_g <- two_g + geom_bar(stat = "identity", fill = "#00FFFF", colour = "white")
two_g <- two_g + labs(title = "Bigrams", x = "Words", y = "Frequency")
two_g <- two_g + theme(axis.text.x = element_text(angle = 90))
two_g

three_g <- ggplot(three_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), y = frequency))
three_g <- three_g + geom_bar(stat = "identity", fill = "#FF00FF", colour = "white")
three_g <- three_g + labs(title = "Trigrams", x = "Words", y = "Frequency")
three_g <- three_g + theme(axis.text.x = element_text(angle = 90))
three_g

Future steps

A prediction algorithm will be built on n-gram frequencies like those above and deployed as a Shiny app that predicts the most likely next word as the user types a phrase.
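
As a first sketch of that idea (not the final implementation), the sorted bigram table built above can already back a naive next-word lookup; the predict_next helper below is hypothetical, and a real model would add trigram context, backoff, and smoothing.

# Hypothetical sketch: suggest next words from the bigram counts
predict_next <- function(word, bigrams = two_corpus_sort, n = 3) {
  first <- sub(" .*$", "", as.character(bigrams$Word)) # first token of each bigram
  hits <- bigrams[first == tolower(word), ] # bigrams starting with `word`
  if (nrow(hits) == 0) return(character(0))
  sub("^\\S+ ", "", as.character(head(hits$Word, n))) # top-n continuations (table is pre-sorted)
}
predict_next("right") # the table above suggests "now"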
