Executive Summary
The goal here is to build a simple model for the relationship between words. This is the first step in building a predictive text mining application. Using this exploratory analysis, I am going to build a basic n-gram model that predicts the next word based on the previous 1, 2, or 3 words; the model will also handle unseen n-grams (a minimal sketch of that idea follows the list below). The process followed in building the model comprises the following steps:
1) Demonstrate that we have downloaded the data and successfully loaded it into R.
2) Create a basic report of summary statistics about the data sets.
3) Report any interesting findings that we have amassed so far.
4) Get feedback on the plans for creating a prediction algorithm and Shiny app.
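The prediction step itself is not built in this report; the following is only a minimal sketch of the back-off idea mentioned above. It assumes hypothetical frequency tables trigram_freq, bigram_freq, and unigram_freq (data frames with columns prefix, nextword, and freq) that would be derived from the n-gram counts explored below.
# Illustrative back-off sketch only: look up the longest matching prefix first,
# then fall back to shorter n-grams, and finally to the overall most frequent word.
# Assumes hypothetical tables trigram_freq, bigram_freq, unigram_freq with
# columns: prefix, nextword, freq.
predict_next_word <- function(phrase, trigram_freq, bigram_freq, unigram_freq) {
  tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(tokens)
  lookup <- function(tbl, key) {
    hits <- tbl[tbl$prefix == key, ]
    if (nrow(hits) == 0) return(NA_character_)
    hits$nextword[which.max(hits$freq)]
  }
  if (n >= 2) {
    w <- lookup(trigram_freq, paste(tokens[(n - 1):n], collapse = " "))
    if (!is.na(w)) return(w)
  }
  if (n >= 1) {
    w <- lookup(bigram_freq, tokens[n])
    if (!is.na(w)) return(w)
  }
  unigram_freq$nextword[which.max(unigram_freq$freq)]
}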
Loading Necessary Libraries
library(NLP)
library(tm)
library(fpc)
library(RColorBrewer)
library(wordcloud)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(stringi)
Loading The Dataset
blogs <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
news <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt")
twitter <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
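Note that on some platforms readLines() can stop early or warn on these files (embedded nul characters and a missing final end-of-line are common causes). If that happens, opening a binary connection and setting skipNul = TRUE is a usual workaround; a sketch for the news file, using the same path as above:
con <- file("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)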
Basic Summary of The Dataset
b <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
n <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt")
t <- file.size("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
m <- matrix(c(NROW(blogs), NROW(news), NROW(twitter),
              sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter)),
              b/1024^2, n/1024^2, t/1024^2),
            byrow = FALSE, nrow = 3, ncol = 3,
            dimnames = list(c("blogs", "news", "twitter"),
                            c("No.Of Lines", "No. Of Characters", "File Size in Mb")))
Wordcount <- sapply(list(blogs,news,twitter),stri_stats_latex)['Words',]
BasicSummary <- cbind(m,Wordcount)
BasicSummary
## No.Of Lines No. Of Characters File Size in Mb Wordcount
## blogs 899288 208361438 200.4242 37865888
## news 77259 15683765 196.2775 2665742
## twitter 2360148 162384825 159.3641 30578891
We will trim the dataset to create a training set, using only 0.1% of the rows in each corpus; the code below simply reads the first 0.1% of lines from each file (an alternative based on random sampling is sketched after the output below). This is done so that we can start with a smaller amount of data.
factor <- 0.001
blogs1 <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt",round(factor*length(blogs)))
news1 <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.news.txt",round(factor*length(news)))
twitter1 <- readLines("C:/Data Science/R/Coursera Capstone Project/Project 1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt",round(factor*length(twitter)))
BasicSummary1 <- matrix(c(NROW(blogs1),NROW(news1),NROW(twitter1)),byrow = TRUE,nrow=3,ncol=1,dimnames = list(c("blogs1","news1","twitter1"),"No.Of Rows"))
BasicSummary1
## No.Of Rows
## blogs1 899
## news1 77
## twitter1 2360
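Taking the first lines of each file is simple, but a random sample is usually more representative of the corpus. A minimal sketch of that alternative, assuming the full blogs, news, and twitter vectors are already in memory (the object names and seed are chosen arbitrarily):
set.seed(1234)
blogs_sample   <- sample(blogs, round(factor * length(blogs)))
news_sample    <- sample(news, round(factor * length(news)))
twitter_sample <- sample(twitter, round(factor * length(twitter)))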
Creating The Corpora Prior To Preprocessing
blogs2 <- VCorpus(VectorSource(blogs1))
news2 <- VCorpus(VectorSource(news1))
twitter2 <- VCorpus(VectorSource(twitter1))
Preprocessing The Corpora
preprocess <- function(document){
  document <- tm_map(document, removePunctuation)                  # drop punctuation
  document <- tm_map(document, removeNumbers)                      # drop digits
  document <- tm_map(document, stripWhitespace)                    # collapse extra whitespace
  document <- tm_map(document, content_transformer(tolower))       # lower-case everything
  document <- tm_map(document, removeWords, stopwords("english"))  # remove common English stop words
  document <- tm_map(document, PlainTextDocument)                  # keep documents as plain text
  return(document)
}
blogs3 <- preprocess(blogs2)
news3 <- preprocess(news2)
twitter3 <- preprocess(twitter2)
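As a quick sanity check (not part of the original pipeline), we can inspect a single transformed document to confirm that the cleaning steps took effect; the document index here is arbitrary:
as.character(blogs3[[1]])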
Creating Tokenizers
Unigramtokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)

Bigramtokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)

Trigramtokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
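To make the effect of these tokenizers concrete, here is what the bigram logic produces on a hand-split sentence (inside the tokenizers, words() performs the splitting); the example sentence is arbitrary:
tokens <- c("the", "quick", "brown", "fox")
unlist(lapply(ngrams(tokens, 2), paste, collapse = " "), use.names = FALSE)
# yields "the quick", "quick brown", "brown fox"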
Effect Of Tokenizers On The Preprocessed “blogs” Corpus
tokenizedblogs1 <- TermDocumentMatrix(blogs3, control = list(tokenize = Unigramtokenizer))
tokenizedblogs2 <- TermDocumentMatrix(blogs3, control = list(tokenize = Bigramtokenizer))
tokenizedblogs3 <- TermDocumentMatrix(blogs3, control = list(tokenize = Trigramtokenizer))
Effect Of Tokenizers On The Preprocessed “news” Corpus
tokenizednews1 <- TermDocumentMatrix(news3, control = list(tokenize = Unigramtokenizer))
tokenizednews2 <- TermDocumentMatrix(news3, control = list(tokenize = Bigramtokenizer))
tokenizednews3 <- TermDocumentMatrix(news3, control = list(tokenize = Trigramtokenizer))
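Before plotting, a quick way to sanity-check these term-document matrices is tm's findFreqTerms(), which lists every term occurring at least lowfreq times; the thresholds here are arbitrary:
findFreqTerms(tokenizedblogs1, lowfreq = 20)
findFreqTerms(tokenizednews1, lowfreq = 10)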
Plotting The Graph
plotthegraph <- function(y)
{
  # Sum term frequencies across documents and sort from most to least frequent
  mat <- sort(rowSums(as.matrix(y)), decreasing = TRUE)
  df <- data.frame(word = names(mat), freq = mat)
  # Bar chart of the ten most frequent terms
  barplot(df[1:10, ]$freq, las = 2, names.arg = df[1:10, ]$word,
          col = "red", main = "Top 10 Frequent Words",
          ylab = "Word frequencies")
  return(df)
}
Plotting The Word Cloud
plotthewordcloud <- function(df)
{
  minimumfrequency <- 40
  # Word cloud of up to 200 terms that appear at least `minimumfrequency` times
  wordcloud(words = df$word, freq = df$freq, min.freq = minimumfrequency,
            max.words = 200, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
}
Interesting Findings In The Data
The Top 10 Unigrams For The Blogs Corpus
par(mfrow=c(1,2))
df1 <- plotthegraph(tokenizedblogs1)
plotthewordcloud(df1)

The Top 10 Bigrams For The Blogs Corpus
par(mfrow=c(1,2), mar=c(6,4,4,2))
df2 <- plotthegraph(tokenizedblogs2)
plotthewordcloud(df2)

The Top 10 Trigrams For The Blogs Corpus
par(mfrow=c(1,2),mar=c(10,4,4,2))
df3 <- plotthegraph(tokenizedblogs3)
plotthewordcloud(df3)

The Top 10 Unigrams For The News Corpus
par(mfrow=c(1,2))
df4 <- plotthegraph(tokenizednews1)
plotthewordcloud(df4)

The Top 10 Bigrams For The News Corpus
par(mfrow=c(1,2),mar=c(10,4,4,2))
df5 <- plotthegraph(tokenizednews2)
plotthewordcloud(df5)

The Top 10 Trigrams For The News Corpus
par(mfrow=c(1,2), mar=c(15,4,4,2))
df6 <- plotthegraph(tokenizednews3)
plotthewordcloud(df6)

Conclusion And Further Explorations
In this report, I conducted an elementary exploratory analysis of the corpora using natural language processing. In the next stage, I will build a predictive algorithm based on an n-gram model whose frequencies reflect the analysis above. The resulting algorithm will then be deployed in a Shiny app that predicts the next word once a word or phrase is typed (a minimal sketch of such an app follows).
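The Shiny piece is not built yet; the following is only a rough sketch of how the eventual app might be wired, assuming the hypothetical predict_next_word() function sketched in the executive summary and pre-computed frequency tables saved to a hypothetical ngram_freqs.rds file:
library(shiny)

# Rough sketch only: assumes a hypothetical predict_next_word() function
# (see the back-off sketch in the executive summary) and pre-computed
# n-gram frequency tables saved as ngram_freqs.rds.
freqs <- readRDS("ngram_freqs.rds")

ui <- fluidPage(
  titlePanel("Next Word Prediction (sketch)"),
  textInput("phrase", "Type a word or phrase:"),
  verbatimTextOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(trimws(input$phrase)) == 0) return("")
    predict_next_word(input$phrase,
                      freqs$trigram, freqs$bigram, freqs$unigram)
  })
}

shinyApp(ui = ui, server = server)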