Synopsis

This report describes the exploratory analysis of the SwiftKey text data and lays the groundwork for the eventual app and algorithm: a predictive model that suggests the most likely next word in a sequence of words.

Data Preparation

Prepare and load the data

Download the data from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip and unzip it.

# Set working directory
setwd("C:/Coursera")

# Download and unzip the data if not already available
if(!file.exists("Coursera-SwiftKey.zip")){
        download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "Coursera-SwiftKey.zip")
        unzip("Coursera-SwiftKey.zip")
}

blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")

Data Summary

Overview of each data set: in-memory size (in MB), number of lines (entries), total characters, and the length of the longest line.

summary <- data.frame("FileName" = c("Blogs","News","Twitter"),
                      "File Size" = sapply(list(blogs, news, twitter), function(x){format(object.size(x),"MB")}),
                      "Line" = sapply(list(blogs, news, twitter), function(x){length(x)}),
                      "TotalCharacters" = sapply(list(blogs, news, twitter), function(x){sum(nchar(x))}),
                      "MaxCharacters" = sapply(list(blogs, news, twitter), function(x){max(unlist(lapply(x, function(y) nchar(y))))})
                      )
summary
##   FileName File.Size    Line TotalCharacters MaxCharacters
## 1    Blogs  248.5 Mb  899288       206824505         40833
## 2     News   19.2 Mb   77259        15639408          5760
## 3  Twitter  301.4 Mb 2360148       162096031           140
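
The sizes reported above are the in-memory sizes of the loaded character vectors (via object.size()). If the on-disk file size is wanted instead, it can be checked directly; a minimal sketch, assuming the same final/en_US/ paths used above:

# On-disk file sizes in MB (file.size() returns bytes)
files <- c(Blogs   = "final/en_US/en_US.blogs.txt",
           News    = "final/en_US/en_US.news.txt",
           Twitter = "final/en_US/en_US.twitter.txt")
setNames(round(file.size(files) / 1024^2, 1), names(files))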

Data Cleaning and Selection

Because the full data set is very large, we sample 1% of each file and convert the sample to a corpus.

set.seed(7890) # for reproducible sampling
sample_size <- 0.01 # sample 1% of each file
sample_data <- c(sample(blogs, length(blogs) * sample_size),
                 sample(news, length(news) * sample_size),
                 sample(twitter, length(twitter) * sample_size))

# remove non ASCII characters
sample_data <- iconv(sample_data, "latin1", "ASCII", sub="")

library(tm) # Load Text Mining library

# Make a corpus out of the sample data
corpus <- VCorpus(VectorSource(sample_data)) 

# Clean the corpus data
corpus <- tm_map(corpus, removePunctuation) # Remove punctuation
corpus <- tm_map(corpus, stripWhitespace) # Remove unnecessary white space
corpus <- tm_map(corpus, content_transformer(tolower)) # Convert to lowercase
corpus <- tm_map(corpus, removeNumbers) # Remove numbers
corpus <- tm_map(corpus, PlainTextDocument) # Plain text
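
As a quick sanity check, the first few cleaned documents can be printed to confirm the transformations behaved as expected (the exact output depends on the random sample drawn above):

# Spot-check the first three cleaned documents
for (i in 1:3) print(content(corpus[[i]]))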

Exploratory Analysis

A thorough exploratory analysis of the data is performed to understand the distribution of words and the relationships between words in the corpora.

Tokenize and Calculate Frequencies of N-Grams

The cleaned sample data is converted into N-gram form using the RWeka package. The N-gram representation of a text lists all N-tuples of words that appear: the simplest case is the unigram (individual words), followed by the bigram (pairs of consecutive words), the trigram (triples of consecutive words), and so on. A short illustration follows the tokenizer definitions below.

library(RWeka) # Weka is a collection of machine learning algorithms for data mining

unigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
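
To make the representation concrete, the bigram tokenizer defined above can be applied to a short sentence. This is an illustrative check only and is not part of the analysis pipeline:

# Illustrative check: bigrams of a short sentence
bigram_tokenizer("the quick brown fox")
# should yield the word pairs "the quick", "quick brown" and "brown fox"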

Unigrams <- TermDocumentMatrix(corpus, control = list(tokenize = unigram_tokenizer))
Bigrams <- TermDocumentMatrix(corpus, control = list(tokenize = bigram_tokenizer))
Trigrams <- TermDocumentMatrix(corpus, control = list(tokenize = trigram_tokenizer))

# Exclude words/phrases with a frequency lower than 100
unigrams_corpus <- findFreqTerms(Unigrams, lowfreq = 100)
bigrams_corpus <- findFreqTerms(Bigrams, lowfreq = 100)
trigrams_corpus <- findFreqTerms(Trigrams, lowfreq = 100)

unigrams_freq <- rowSums(as.matrix(Unigrams[unigrams_corpus,]))
unigrams_freq <- data.frame(word = names(unigrams_freq), frequency = unigrams_freq)
bigrams_freq <- rowSums(as.matrix(Bigrams[bigrams_corpus,]))
bigrams_freq <- data.frame(word = names(bigrams_freq), frequency = bigrams_freq)
trigrams_freq <- rowSums(as.matrix(Trigrams[trigrams_corpus,]))
trigrams_freq <- data.frame(word = names(trigrams_freq), frequency = trigrams_freq)
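
A quick preview of the most frequent unigrams confirms the frequency tables have the expected shape (the exact counts depend on the random sample):

# Preview the five most frequent unigrams
head(unigrams_freq[order(-unigrams_freq$frequency), ], 5)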

Plotting

Plot the top 20 most frequent terms as a bar chart for each of the unigrams, bigrams, and trigrams.

library(ggplot2)
plot_ngrams <- function(data, chart_title, top) {
    df <- data[order(-data$frequency),][1:top,] 
    ggplot(df, aes(x = reorder(word, -frequency), y = frequency)) +
        geom_bar(stat = "identity") +
        ggtitle(paste("Top", top, chart_title)) +
        xlab("Words") + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

plot_ngrams(unigrams_freq, "One-Word Usage", 20)

plot_ngrams(bigrams_freq, "Two-Word Usage", 20)

plot_ngrams(trigrams_freq, "Three-Word Usage", 20)
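
The next step is to build the prediction algorithm and the Shiny app. As a rough sketch of the direction (not the final algorithm), the trigram frequency table built above can already suggest a next word for a two-word prefix by a simple frequency lookup; the planned model will add lower-order backoff and smoothing, since the table here only contains trigrams seen at least 100 times. The function name predict_next_word below is illustrative, not an existing API.

# Minimal sketch: suggest the most frequent third word following a two-word prefix,
# using the trigrams_freq table built above (no smoothing or backoff yet)
predict_next_word <- function(prefix, trigram_table = trigrams_freq) {
    prefix <- tolower(trimws(prefix))
    # keep trigrams whose first two words match the prefix
    matches <- trigram_table[grepl(paste0("^", prefix, " "), trigram_table$word), ]
    if (nrow(matches) == 0) return(NA_character_)
    best <- matches[which.max(matches$frequency), "word"]
    # return only the third word of the best-matching trigram
    tail(strsplit(as.character(best), " ")[[1]], 1)
}

predict_next_word("one of") # result depends on the sampled data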