Executive Summary

The goal here is to build a simple model of the relationships between words. This is the first step in building a predictive text mining application. Using the exploratory analysis, I am going to build a basic n-gram model for predicting the next word based on the previous 1, 2, or 3 words. The model will also handle unseen n-grams (see the sketch after the list below). The process followed in building the model comprises the following steps:

1) Demonstrate that we have downloaded the data and have successfully loaded it in R.

2) Create a basic report of summary statistics about the data sets

3) Report any interesting findings that we have amassed so far

4) Get feedback on the plans for creating a prediction algorithm and Shiny app
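As a rough illustration of the prediction step described above, here is a minimal sketch of a frequency-based backoff predictor. It assumes hypothetical lookup tables trigram_freq, bigram_freq, and unigram_freq (data frames with columns prefix, word, and freq) built from n-gram counts like those explored later in this report; unseen n-grams are handled by backing off to shorter contexts. This is a sketch of the idea, not the final algorithm.

# A minimal sketch of a backoff next-word predictor (not the final algorithm).
# Assumes hypothetical frequency tables trigram_freq, bigram_freq, unigram_freq,
# each a data frame with columns: prefix (preceding words), word (candidate), freq.
predictnextword <- function(phrase, trigram_freq, bigram_freq, unigram_freq) {
    tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
    n <- length(tokens)
    # Try the last two words as a trigram prefix first.
    if (n >= 2) {
        hits <- trigram_freq[trigram_freq$prefix == paste(tokens[(n-1):n], collapse = " "), ]
        if (nrow(hits) > 0) return(hits$word[which.max(hits$freq)])
    }
    # Back off to the last word as a bigram prefix.
    if (n >= 1) {
        hits <- bigram_freq[bigram_freq$prefix == tokens[n], ]
        if (nrow(hits) > 0) return(hits$word[which.max(hits$freq)])
    }
    # Back off to the most frequent unigram for completely unseen contexts.
    unigram_freq$word[which.max(unigram_freq$freq)]
}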

Loading Necessary Libraries

library(NLP)
library(tm)
library(fpc)
library(RColorBrewer)
library(wordcloud)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(stringi)

Loading The Dataset

blogs <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
news <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt")
twitter <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
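Note that readLines() on these files may warn about embedded nul characters and an incomplete final line. If that happens, reading with an explicit encoding and skipNul = TRUE is one option; the commented variant below is only a sketch and was not used to produce the results that follow.

# Optional variant (a sketch, not used for the results below): read with an
# explicit encoding and skip embedded nul characters.
# twitter <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt",
#                      encoding = "UTF-8", skipNul = TRUE)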

Basic Summary of The Dataset

b <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
n <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt")
t <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
m <- matrix(c(NROW(blogs), NROW(news), NROW(twitter),
              sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter)),
              (b/1024^2), (n/1024^2), (t/1024^2)),
            byrow = FALSE, nrow = 3, ncol = 3,
            dimnames = list(c("blogs","news","twitter"),
                            c("No.Of Lines","No. Of Characters","File Size in Mb")))
Wordcount <- sapply(list(blogs,news,twitter),stri_stats_latex)['Words',]
BasicSummary <- cbind(m,Wordcount)
BasicSummary
##         No.Of Lines No. Of Characters File Size in Mb Wordcount
## blogs        899288         208361438        200.4242  37865888
## news          77259          15683765        196.2775   2665742
## twitter     2360148         162384825        159.3641  30578891

To keep the initial exploration manageable, we trim each dataset down to a training set of roughly 0.1% of its lines. Note that the code below simply reads the first 0.1% of lines from each file; a random sample would be more representative (see the sketch after the summary below).

factor <- 0.001
blogs1 <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt",round(factor*length(blogs)))
news1 <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt",round(factor*length(news)))
twitter1 <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt",round(factor*length(twitter)))
BasicSummary1 <- matrix(c(NROW(blogs1),NROW(news1),NROW(twitter1)),byrow = TRUE,nrow=3,ncol=1,dimnames = list(c("blogs1","news1","twitter1"),"No.Of Rows"))
BasicSummary1
##          No.Of Rows
## blogs1          899
## news1            77
## twitter1       2360
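Since reading only the first n lines can bias the training set toward the beginning of each file, a random sample is an alternative worth considering. A minimal sketch reusing the full vectors already loaded above (not the approach used for the results in this report):

# A sketch of random sampling instead of taking the first lines (not used below).
set.seed(1234)                              # for reproducibility
blogs_sample   <- sample(blogs,   round(factor * length(blogs)))
news_sample    <- sample(news,    round(factor * length(news)))
twitter_sample <- sample(twitter, round(factor * length(twitter)))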

Creating The Corpora Prior To Preprocessing

blogs2 <- VCorpus(VectorSource(blogs1))
news2 <- VCorpus(VectorSource(news1))
twitter2 <- VCorpus(VectorSource(twitter1))

Preprocessing The Corpora

preprocess <- function(document){
    # Clean each corpus: strip punctuation and numbers, collapse whitespace,
    # lower-case the text, remove English stop words, and keep plain text documents.
    document <- tm_map(document, removePunctuation)
    document <- tm_map(document, removeNumbers)
    document <- tm_map(document, stripWhitespace)
    document <- tm_map(document, content_transformer(tolower))
    document <- tm_map(document, removeWords, stopwords("english"))
    document <- tm_map(document, PlainTextDocument)
    return(document)
}
blogs3 <- preprocess(blogs2)
news3 <- preprocess(news2)
twitter3 <- preprocess(twitter2)
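To spot-check the cleaning, one can print a single cleaned document; for example (a quick sanity check, output not shown here):

# Spot-check the first cleaned blog document.
writeLines(as.character(blogs3[[1]]))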

Creating Tokenizers

# n-gram tokenizers built on NLP::ngrams(): each splits a document into word
# n-grams and pastes the tokens back together as space-separated strings.
Unigramtokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
Bigramtokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
Trigramtokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
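To illustrate what these tokenizers produce, here is a toy example of the bigram logic on a hypothetical token vector (in the report itself the tokenizers operate on corpus documents via words()):

# Toy illustration of the bigram logic (hypothetical input, not part of the analysis).
toy_tokens <- c("the", "quick", "brown", "fox")
unlist(lapply(ngrams(toy_tokens, 2), paste, collapse = " "), use.names = FALSE)
# expected: "the quick" "quick brown" "brown fox"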

Effect Of Tokenizers On The Preprocessed “blogs” Corpus

tokenizedblogs1 <- TermDocumentMatrix(blogs3, control = list(tokenize = Unigramtokenizer))
tokenizedblogs2 <- TermDocumentMatrix(blogs3, control = list(tokenize = Bigramtokenizer))
tokenizedblogs3 <- TermDocumentMatrix(blogs3, control = list(tokenize = Trigramtokenizer))

Effect Of Tokenizers On The Preprocessed “news” Corpus

tokenizednews1 <- TermDocumentMatrix(news3, control = list(tokenize = Unigramtokenizer))
tokenizednews2 <- TermDocumentMatrix(news3, control = list(tokenize = Bigramtokenizer))
tokenizednews3 <- TermDocumentMatrix(news3, control = list(tokenize = Trigramtokenizer))

Effect Of Tokenizers On The Preprocessed “twitter” Corpus

tokenizedtwitter1 <- TermDocumentMatrix(twitter3, control = list(tokenize = Unigramtokenizer))
tokenizedtwitter2 <- TermDocumentMatrix(twitter3, control = list(tokenize = Bigramtokenizer))
tokenizedtwitter3 <- TermDocumentMatrix(twitter3, control = list(tokenize = Trigramtokenizer))
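A quick way to sanity-check these term-document matrices before plotting is tm's findFreqTerms(); for example (a sketch, with an arbitrary threshold):

# Sanity check (sketch): bigrams appearing at least 5 times in the blogs sample.
findFreqTerms(tokenizedblogs2, lowfreq = 5)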

Plotting The Graph

plotthegraph <- function(y)
{
    # Sum term frequencies across documents, sort them in decreasing order, and
    # plot the ten most frequent n-grams; return the full frequency data frame.
    mat <- sort(rowSums(as.matrix(y)), decreasing = TRUE)
    df <- data.frame(word = names(mat), freq = mat)
    barplot(df[1:10,]$freq, las = 2, names.arg = df[1:10,]$word,
        col = "red", main = "Top 10 Frequent Words",
        ylab = "Word frequencies")
    return(df)
}

Plotting The Word Cloud

plotthewordcloud <- function(df)
{
    # Draw a word cloud of terms occurring at least 40 times, capped at 200 words.
    minimumfrequency <- 40
    wordcloud(words = df$word, freq = df$freq, min.freq = minimumfrequency,
                  max.words = 200, random.order = FALSE, rot.per = 0.35,
                  colors = brewer.pal(8, "Dark2"))
}

Interesting Findings In The Data

The Top 10 Unigrams For The Blogs Corpus

par(mfrow=c(1,2))
df1 <- plotthegraph(tokenizedblogs1)
plotthewordcloud(df1)

The Top 10 Bigrams For The Blogs Corpus

par(mfrow=c(1,2), mar=c(6,4,4,2))
df2 <- plotthegraph(tokenizedblogs2)
plotthewordcloud(df2)

The Top 10 Trigrams For The Blogs Corpus

par(mfrow=c(1,2),mar=c(10,4,4,2))
df3 <- plotthegraph(tokenizedblogs3)
plotthewordcloud(df3)

The Top 10 Unigrams For The News Corpus

par(mfrow=c(1,2))
df4 <- plotthegraph(tokenizednews1)
plotthewordcloud(df4)

The Top 10 Bigrams For The News Corpus

par(mfrow=c(1,2),mar=c(10,4,4,2))
df5 <- plotthegraph(tokenizednews2)
plotthewordcloud(df5)

The Top 10 Trigrams For The News Corpus

par(mfrow=c(1,2), mar=c(15,4,4,2))
df6 <- plotthegraph(tokenizednews3)
plotthewordcloud(df6)

The Top 10 Unigrams For The Twitter Corpus

par(mfrow=c(1,2))
df7 <- plotthegraph(tokenizedtwitter1)
plotthewordcloud(df7)

The Top 10 Bigrams For The Twitter Corpus

par(mfrow=c(1,2),mar=c(8,4,4,2))
df8 <- plotthegraph(tokenizedtwitter2)
plotthewordcloud(df8)

The Top 10 Trigrams For The Twitter Corpus

par(mfrow=c(1,2),mar=c(12,4,4,2))
df9 <- plotthegraph(tokenizedtwitter3)
plotthewordcloud(df9)

Conclusion And Further Explorations

In the analysis above, we conducted an elementary exploratory analysis using natural language processing. In the next stage, I will build a predictive algorithm based on an n-gram model informed by the frequencies observed above. The resulting algorithm will then be deployed in a Shiny app that predicts the next word as a word or phrase is typed; a minimal skeleton of such an app is sketched below.
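As a rough outline of the planned deployment, here is a minimal Shiny skeleton. It assumes a hypothetical predictnextword() function (such as the backoff sketch in the executive summary) together with precomputed n-gram frequency tables; it is a sketch of the intended interface, not the final app.

library(shiny)

# A minimal sketch of the planned Shiny app (assumes a hypothetical
# predictnextword() and precomputed n-gram frequency tables).
ui <- fluidPage(
    titlePanel("Next Word Prediction"),
    textInput("phrase", "Type a word or phrase:"),
    textOutput("prediction")
)

server <- function(input, output) {
    output$prediction <- renderText({
        if (nchar(trimws(input$phrase)) == 0) return("")
        predictnextword(input$phrase, trigram_freq, bigram_freq, unigram_freq)
    })
}

shinyApp(ui = ui, server = server)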