Executive Summary

The goal here is to build a simple model of the relationships between words. This is the first step in building a predictive text mining application. Using the exploratory analysis, I am going to build a basic n-gram model for predicting the next word based on the previous 1, 2, or 3 words. The model will also handle unseen n-grams, for example by backing off to shorter n-grams (a minimal sketch of this idea follows the list below). The process of building the model comprises the following steps:

1) Demonstrate that we have downloaded the data and have successfully loaded it in R.

2) Create a basic report of summary statistics about the data sets.

3) Report any interesting findings that we have amassed so far.

4) Get feedback on the plans for creating a prediction algorithm and Shiny app.
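
As a preview of step 4, here is a minimal sketch of the backoff idea, using tiny made-up frequency tables. The tables and the helpers nextword() and predictnext() are hypothetical illustrations only; the real frequency tables (Unigramfreq, Bigramfreq, Trigramfreq) are built later in this report.

# Tiny made-up tables standing in for the real n-gram frequency tables.
trigrams <- data.frame(word = c("a lot of", "a lot more"),
                       frequency = c(211, 32), stringsAsFactors = FALSE)
bigrams  <- data.frame(word = c("lot of", "lot less"),
                       frequency = c(250, 12), stringsAsFactors = FALSE)
unigrams <- data.frame(word = c("the", "of"),
                       frequency = c(5000, 3000), stringsAsFactors = FALSE)

# Most frequent continuation of 'prefix' in an n-gram table, or NA if unseen.
nextword <- function(prefix, table) {
    hits <- table[startsWith(table$word, paste0(prefix, " ")), ]
    if (nrow(hits) == 0) return(NA_character_)
    best <- hits$word[which.max(hits$frequency)]
    tail(strsplit(best, " ")[[1]], 1)   # last token of the winning n-gram
}

# Backoff: try the last two words, then the last word, then the top unigram.
predictnext <- function(w1, w2) {
    cand <- nextword(paste(w1, w2), trigrams)
    if (is.na(cand)) cand <- nextword(w2, bigrams)
    if (is.na(cand)) cand <- unigrams$word[which.max(unigrams$frequency)]
    cand
}

predictnext("a", "lot")   # returns "of"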

Loading Necessary Libraries

library(NLP)
library(tm)
library(fpc)
library(RColorBrewer)
library(wordcloud)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(stringi)
library(data.table)

Loading The Dataset

blogs <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt",encoding = "UTF-8", skipNul = TRUE)
news <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt",encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("C:/Data Science/R/Coursera Capstone Project/Project
## 1/Coursera-SwiftKey/final/en_US/en_US.news.txt", : incomplete final line
## found on 'C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-
## SwiftKey/final/en_US/en_US.news.txt'
twitter <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt",encoding = "UTF-8", skipNul = TRUE)
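
The "incomplete final line" warning just means the news file does not end with a newline. On Windows this file is also known to contain an embedded control character that can stop text-mode reading early, which may explain the comparatively low line and word counts for news reported below. A hedged workaround (same path as above) is to open the file in binary mode before reading:

# Opening the connection in binary mode ("rb") avoids the "incomplete final
# line" warning and reads past embedded control characters.
con <- file("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)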

Basic Summary of The Dataset

b <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
n <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt")
t <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
m <- matrix(c(NROW(blogs), NROW(news), NROW(twitter),
              sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter)),
              (b/1024^2), (n/1024^2), (t/1024^2)),
            byrow = FALSE, nrow = 3, ncol = 3,
            dimnames = list(c("blogs","news","twitter"),
                            c("No.Of Lines","No. Of Characters","File Size in Mb")))
Wordcount <- sapply(list(blogs,news,twitter),stri_stats_latex)['Words',]
BasicSummary <- cbind(m,Wordcount)
BasicSummary
##         No.Of Lines No. Of Characters File Size in Mb Wordcount
## blogs        899288         206824505        200.4242  37570839
## news          77259          15639408        196.2775   2651432
## twitter     2360148         162096241        159.3641  30451170

Removing Non-English Characters

Converting to ASCII with iconv strips any non-ASCII characters. This is a character-level filter rather than true language detection, but it removes most foreign-script text.

blogs <- iconv(blogs,"latin1","ASCII",sub = "")
news <- iconv(news,"latin1","ASCII",sub = "")
twitter <- iconv(twitter,"latin1","ASCII",sub = "")

We trim each dataset to build a training set, sampling only 1% of the lines from each file. This keeps the data small enough to work with while developing the model.

set.seed(666)   # seed before sampling so the 1% sample is reproducible
factor <- 0.01
blogs1 <- sample(blogs, round(factor*length(blogs)))
news1 <- sample(news, round(factor*length(news)))
twitter1 <- sample(twitter, round(factor*length(twitter)))
BasicSummary1 <- matrix(c(NROW(blogs1), NROW(news1), NROW(twitter1)),
                        byrow = TRUE, nrow = 3, ncol = 1,
                        dimnames = list(c("blogs1","news1","twitter1"), "No.Of Rows"))
BasicSummary1
##          No.Of Rows
## blogs1         8993
## news1           773
## twitter1      23601

Merging The Training / Sample Files & Creating The Corpus

trainingset <- c(blogs1,news1,twitter1)
trainingcorpus <- VCorpus(VectorSource(trainingset))

Preprocessing The Training Corpus

preprocess <- function(document){
    document <- tm_map(document, removePunctuation)             # drop punctuation
    document <- tm_map(document, removeNumbers)                 # drop digits
    document <- tm_map(document, stripWhitespace)               # collapse repeated spaces
    document <- tm_map(document, content_transformer(tolower))  # lowercase everything
    document <- tm_map(document, PlainTextDocument)             # keep documents as plain text
    return(document)
}
trainingcorpus <- preprocess(trainingcorpus)
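
As a quick sanity check (document 1 is an arbitrary choice), one can confirm the cleaning worked by looking at a single document:

# Peek at one cleaned document; it should be lowercase with no punctuation.
as.character(trainingcorpus[[1]])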

Creating Tokenizers

Unigramtokenizer <- function(x)
        unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
Bigramtokenizer <- function(x)
        unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
Trigramtokenizer <-function(x)
        unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
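
As a minimal illustration (the sentence is made up, and this assumes tm's words() method for PlainTextDocument), the trigram tokenizer returns every run of three consecutive words:

# Toy example: a six-word document yields four trigrams.
sampledoc <- PlainTextDocument("this is a simple test sentence")
Trigramtokenizer(sampledoc)
## [1] "this is a"            "is a simple"          "a simple test"
## [4] "simple test sentence"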

Creating Term-Document Matrices

unigramdocumentmatrix <- TermDocumentMatrix(trainingcorpus,control = list(tokenize = Unigramtokenizer))
bigramdocumentmatrix <- TermDocumentMatrix(trainingcorpus,control = list(tokenize = Bigramtokenizer))
trigramdocumentmatrix <- TermDocumentMatrix(trainingcorpus,control = list(tokenize = Trigramtokenizer))
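
These matrices are typically large and very sparse; a quick structural check (output omitted):

# Rows are terms, columns are documents.
dim(unigramdocumentmatrix)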

Finding Frequent Terms

findFreqTerms() keeps only the n-grams that occur at least 50 times in the sample (lowfreq = 50); rarer n-grams are dropped.

unigramf <- findFreqTerms(unigramdocumentmatrix, lowfreq = 50)
bigramf <- findFreqTerms(bigramdocumentmatrix, lowfreq = 50)
trigramf <- findFreqTerms(trigramdocumentmatrix, lowfreq = 50)

Computing Frequencies For n-grams

Unigramfreq <- rowSums(as.matrix(unigramdocumentmatrix[unigramf,]))
Unigramfreq <- data.frame(word=names(Unigramfreq),frequency=Unigramfreq)
Bigramfreq <- rowSums(as.matrix(bigramdocumentmatrix[bigramf,]))
Bigramfreq <- data.frame(word=names(Bigramfreq),frequency=Bigramfreq)
Trigramfreq <- rowSums(as.matrix(trigramdocumentmatrix[trigramf,]))
Trigramfreq <- data.frame(word=names(Trigramfreq),frequency=Trigramfreq)
head(Trigramfreq)
##                    word frequency
## a bit of       a bit of        50
## a couple of a couple of        74
## a lot of       a lot of       211
## all of the   all of the        55
## as well as   as well as        97
## at the end   at the end        60
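
Note that head() lists the trigrams alphabetically. To see the most frequent ones, sort by frequency (the plotting function below does the same):

# The rows are in alphabetical term order; sort to see the top trigrams.
head(Trigramfreq[order(-Trigramfreq$frequency), ], 5)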

Plotting The Top n-grams

plotthegraph <- function(data, title, num){
        df <- data[order(-data$frequency), ][1:num, ]   # top 'num' n-grams by frequency
        barplot(df$frequency, las = 2, names.arg = df$word,
                col = "red", main = title,
                ylab = "Word frequencies", cex.axis = 0.8)
}
par(mar=c(10,4,4,2))
plotthegraph(Unigramfreq,"Top Unigrams",20)

plotthegraph(Bigramfreq,"Top Bigrams",20)

plotthegraph(Trigramfreq,"Top Trigrams",20)