Description

The purpose of this report is to present the initial work I have done towards building the Next Word Prediction Model. The data used to build the model were obtained from here. The zip file contains data for four languages, but for this project we are only interested in English. The English folder contains three txt files with data from blogs, news and Twitter.
The scope of this analysis is therefore to load the data, clean them, and present some descriptive statistics as well as some exploratory graphs.

Packages

The following R packages are used for this analysis:

library(tm)
library(R.utils)
library(stringi)
library(wordcloud)
library(RWeka)

Load the Data

blogs<-readLines("en_US.blogs.txt", encoding="UTF-8" )
news<-readLines("en_US.news.txt", encoding="UTF-8" )
twitter<-readLines("en_US.twitter.txt", encoding="UTF-8" )
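
Reading these files may trigger warnings about embedded nul characters or an incomplete final line. If that happens with your local copy of the files, one optional variant (an assumption, not something required by the rest of the analysis) is to re-read them with skipNul = TRUE:

#Re-read the files, skipping any embedded nul characters
blogs<-readLines("en_US.blogs.txt", encoding="UTF-8", skipNul=TRUE)
news<-readLines("en_US.news.txt", encoding="UTF-8", skipNul=TRUE)
twitter<-readLines("en_US.twitter.txt", encoding="UTF-8", skipNul=TRUE)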

Size of the data files

format(object.size(blogs), units="MB") 
## [1] "248.5 Mb"
format(object.size(news), units="MB") 
## [1] "19.2 Mb"
format(object.size(twitter), units="MB") 
## [1] "301.4 Mb"

So the total size of the three files is:

format(object.size(blogs)+object.size(news)+object.size(twitter), units="Gb")
## [1] "0.6 Gb"

Lines per File

#Number of Lines in Blogs file:
countLines("en_US.blogs.txt")[1]
## [1] 899288
#Number of Lines in News file:
countLines("en_US.news.txt")[1]
## [1] 1010242
#Number of Lines in Twitter file:
countLines("en_US.twitter.txt")[1]
## [1] 2360148

Number of Words per File

#Number of Words in Blogs File
sum(stri_count_words(blogs))
## [1] 37546246
#Number of Words in News File
sum(stri_count_words(news))
## [1] 2674536
#Number of Words in Twitter File
sum(stri_count_words(twitter))
## [1] 30093369
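
For convenience, the figures above can also be collected in a single summary table. The sketch below assumes the blogs, news and twitter objects created earlier are still in memory; the object and column names are illustrative choices:

#Build a small summary data frame for the three files
file.summary<-data.frame(
  file=c("blogs", "news", "twitter"),
  size=c(format(object.size(blogs), units="MB"),
         format(object.size(news), units="MB"),
         format(object.size(twitter), units="MB")),
  lines=c(length(blogs), length(news), length(twitter)),
  words=c(sum(stri_count_words(blogs)),
          sum(stri_count_words(news)),
          sum(stri_count_words(twitter))))
file.summary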

Cleaning the Data: Define a Function

#This function cleans the data and also takes a random sample of the data

clean.data<-function(input=blogs, prob=0.1) {
        input_file<-input
        #Sample prob*100 percent of the lines without replacement
        n<-trunc(length(input_file)*prob)
        ids<-sample(length(input_file), n, replace=FALSE)
        input_file<-input_file[ids]
        #Remove the non-ASCII characters
        file.clean<-iconv(input_file, "latin1", "ASCII", sub="")
        #Convert to lower case
        file.clean<-tolower(file.clean)
        file.clean<-Corpus(VectorSource(file.clean))
        #Remove numbers and punctuation, and strip extra whitespace
        file.clean<-tm_map(file.clean, removeNumbers)
        file.clean<-tm_map(file.clean, removePunctuation)
        file.clean<-tm_map(file.clean, stripWhitespace)
        #In case we want to remove the common English stop words:
        #file.clean<-tm_map(file.clean, removeWords, stopwords("english"))
        output<-tm_map(file.clean, PlainTextDocument)
        return(output)
}
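
Since clean.data() samples the input at random, the sampled corpus (and therefore the counts reported below) will differ between runs. An optional step for making the sample reproducible is to fix the random seed before calling the function; the seed value below is arbitrary:

#Fix the random seed so the same sample is drawn on every run
set.seed(1234)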

Clean all files taking a 0.1% sample

#Combine the blogs and news files, excluding Twitter due to the way it forces users to type
all<-c(blogs, news)
#Clean a 0.1% sample of the combined files
clean.all<-clean.data(all, 0.001)

Plot a Word Cloud

wordcld <- wordcloud (clean.all, 
                      scale=c(5,0.5), 
                      max.words=200, 
                      random.order=FALSE, 
                      rot.per=0.35, 
                      use.r.layout=FALSE, 
                      colors=brewer.pal(8, 'Dark2'))

Create the Ngrams

#Unigrams
corpus <- clean.all
onegramTokenizer   <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
dtm_onegram   <- DocumentTermMatrix(corpus, control=list(tokenize=onegramTokenizer))
onegram_frequency   <- sort(colSums(as.matrix(dtm_onegram)), decreasing=TRUE)

#Bigrams
twogramTokenizer   <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm_twogram   <- DocumentTermMatrix(corpus, control=list(tokenize=twogramTokenizer))
twogram_frequency   <- sort(colSums(as.matrix(dtm_twogram)), decreasing=TRUE)

#Trigrams
threegramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtm_threegram <- DocumentTermMatrix(corpus, control=list(tokenize=threegramTokenizer))
threegram_frequency <- sort(colSums(as.matrix(dtm_threegram)), decreasing=TRUE)

The number of Ngrams

###The number of Unique Words of the Sample
length(onegram_frequency)
## [1] 11361
###The number of Bigrams of the Sample
length(twogram_frequency)
## [1] 45246
###The number of Trigrams of the Sample
length(threegram_frequency)
## [1] 59790
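
Beyond the raw counts, a quick way to explore these tables is to look at the most frequent entries, for example with a bar plot of the top bigrams. The sketch below only illustrates how the frequency vectors created above could be inspected; showing 10 and 15 terms are arbitrary choices:

#Show the ten most frequent unigrams, bigrams and trigrams
head(onegram_frequency, 10)
head(twogram_frequency, 10)
head(threegram_frequency, 10)

#Bar plot of the 15 most frequent bigrams
barplot(head(twogram_frequency, 15),
        las=2,
        cex.names=0.7,
        col="steelblue",
        main="Top 15 Bigrams")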

Save the Document Term Matrix

save(dtm_onegram, file="dtm_onegram.RData")
save(dtm_twogram, file="dtm_twogram.RData")
save(dtm_threegram, file="dtm_threegram.RData")
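
Saving the matrices means they can be reloaded directly in a later session, for example when building the prediction algorithm, without repeating the cleaning and tokenisation steps:

#In a later session, reload the saved document term matrices
load("dtm_onegram.RData")
load("dtm_twogram.RData")
load("dtm_threegram.RData")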

Word Clouds

#Bigram Word Cloud

wordcloud (words = names(twogram_frequency), 
           scale=c(5,0.5), 
           max.words=400, 
           freq = twogram_frequency,
           random.order=FALSE, 
           rot.per=0.35, 
           use.r.layout=FALSE, 
           colors=brewer.pal(8, 'Dark2'))

#Trigram Word Cloud

wordcloud (words = names(threegram_frequency), 
           scale=c(5,0.5), 
           max.words=400, 
           freq = threegram_frequency,
           random.order=FALSE, 
           rot.per=0.35, 
           use.r.layout=FALSE, 
           colors=brewer.pal(8, 'Dark2'))

Next Steps

Now that the unigrams, bigrams and trigrams have been created, the next step is to build a Next Word Prediction Algorithm. The aim is to examine different algorithms and to compare their performance. Of course there is a limitation on the size of the training data set, since the application will run on a Shiny server and we also want it to be fast. Most probably the training set will be around 5% of the whole dataset; for this milestone I used only 0.1% so that the analysis runs quickly. Initially the plan is to test Katz's back-off model.
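
To make the idea concrete, below is a minimal sketch of a simple back-off lookup built on the frequency tables created above. It is not the full Katz's back-off model (which also requires discounting the counts); it simply returns the most frequent trigram continuation of the last two words and falls back to the bigram table, then to the unigram table, when no match exists. The function name and its argument are illustrative choices, not part of the analysis above:

#A naive back-off predictor using the n-gram frequency tables
predict.next.word<-function(phrase) {
        words<-unlist(strsplit(tolower(phrase), "\\s+"))
        n<-length(words)
        #Look for trigrams starting with the last two words
        if (n>=2) {
                prefix<-paste(words[n-1], words[n])
                hits<-threegram_frequency[grep(paste0("^", prefix, " "), names(threegram_frequency))]
                if (length(hits)>0) {
                        #The tables are sorted, so the first hit is the most frequent
                        return(tail(unlist(strsplit(names(hits)[1], " ")), 1))
                }
        }
        #Back off to bigrams starting with the last word
        hits<-twogram_frequency[grep(paste0("^", words[n], " "), names(twogram_frequency))]
        if (length(hits)>0) {
                return(tail(unlist(strsplit(names(hits)[1], " ")), 1))
        }
        #Fall back to the most frequent unigram
        return(names(onegram_frequency)[1])
}

#Example usage
predict.next.word("one of")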