Introduction

This report summarizes the exploratory data analysis that will support an app built on a prediction algorithm: given a sequence of one or more words as input, it predicts the next word for the user.

We break this into 5 steps:

1. Load the data
2. Perform data cleaning
3. Tokenize the data into bigrams, trigrams and 4-grams
4. Create a model to predict the next word
5. Create a Shiny app which takes input from the end user and provides the prediction as output

First we load the libraries necessary to perform our analysis.

library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
library(stringi)
## Warning: package 'stringi' was built under R version 3.3.2
library(stringr)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.3.2

Load data

Load the files

con1 <- file("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb") 
twitter <- readLines(con1,encoding="UTF-8",skipNul=TRUE)
close(con1)

con2 <- file("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb") 
blogs <- readLines(con2,encoding="UTF-8",skipNul=TRUE)
close(con2)

con3 <- file("./Coursera-SwiftKey/final/en_US/en_US.news.txt",open="rb")
news <- readLines(con3, encoding="UTF-8",skipNul=TRUE)
close(con3)

# Number of lines in each dataset
length(twitter)
length(blogs)
length(news)
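
Beyond line counts, word counts and in-memory object sizes give a fuller sense of the scale of the data. A minimal sketch, assuming the twitter, blogs and news character vectors loaded above (stri_count_words comes from the stringi package loaded earlier):

# Approximate word count per dataset
sum(stri_count_words(twitter))
sum(stri_count_words(blogs))
sum(stri_count_words(news))

# In-memory size of each dataset
format(object.size(twitter), units = "Mb")
format(object.size(blogs), units = "Mb")
format(object.size(news), units = "Mb")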

Create Samples

As we can see, there is a large amount of data, and processing all of it would be very time- and memory-intensive. Therefore, we first draw samples from all 3 datasets.

Then we save these samples in text files so that we can work with the smaller datasets going forward.

set.seed(12345)
s_twitter <- sample(twitter,5000)
s_blogs <- sample(blogs,5000)
s_news <- sample(news,5000)

writeLines(s_twitter, "./Coursera-SwiftKey/final/en_US/Sampled Data/s_twitter.txt")
writeLines(s_blogs, "./Coursera-SwiftKey/final/en_US/Sampled Data/s_blogs.txt")
writeLines(s_news, "./Coursera-SwiftKey/final/en_US/Sampled Data/s_news.txt")

Read Samples from text files

s_twitter <- readLines("./Coursera-SwiftKey/final/en_US/Sampled Data/s_twitter.txt",encoding = "UTF-8", skipNul=TRUE)
s_blogs <- readLines("./Coursera-SwiftKey/final/en_US/Sampled Data/s_blogs.txt",encoding = "UTF-8", skipNul=TRUE)
s_news <- readLines("./Coursera-SwiftKey/final/en_US/Sampled Data/s_news.txt",encoding = "UTF-8", skipNul=TRUE)

Create Corpus

First we create a corpus from the sample files. Then we define some functions to clean up the samples.

# Create a corpus from the data samples
SampleCorpus <- VCorpus(DirSource("./Coursera-SwiftKey/final/en_US/Sampled Data"),
                     readerControl=list(
                       reader=readPlain,
                       language="en",
                       load=TRUE
                     ))

# Function to remove special characters from the text data

removeSpecial <- function(x){
  # Normalize curly apostrophes to straight quotes (the original pattern appears
  # to have been mangled in transcription; this is the likely intent)
  x <- str_replace_all(x, "[\u2019]", "'")
  # Replace hyphens with spaces so hyphenated words become separate words
  x <- str_replace_all(x, "[-]", " ")
  # Drop the first parenthetical aside, e.g. " (some remark)"
  x <- str_replace(x, " \\(.*\\)", "")
  # Keep only letters, spaces, apostrophes and "é"; drop everything else
  x <- str_replace_all(x, "[^[a-zA-Z][ ]['][é]]", "")
  x
}


# The cleaning process leaves some residual letters after removing special characters which need to be fixed.

fixBrokenWords <- function(x){
  # Negative contraction fragments (n't / n t)
  x <- gsub("n't ","t ",x)
  x <- gsub("n t ","t ",x)
  # Expand 've / ve to "have"
  x <- gsub("'ve "," have ",x)
  x <- gsub(" ve "," have ",x)
  # Drop stranded possessive fragments ('s / s)
  x <- gsub("'s "," ",x)
  x <- gsub(" s "," ",x)
  # Expand 'm to "am", 'll / ll to "will", 'd / d to "had"
  x <- gsub("'m "," am ",x)
  x <- gsub("'ll "," will ",x)
  x <- gsub(" ll "," will ",x)
  x <- gsub("'d "," had ",x)
  x <- gsub(" d "," had ",x)
  x
}


# Function to break data samples into individual sentences

breakSentences <- function(x){
  x <- stri_split_boundaries(x, type="sentence")
  x <- as.vector(unlist(x))
  x
}
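
As a quick, illustrative check of these helpers (the input strings below are made up, and the exact whitespace in the results may vary):

removeSpecial("Well-known fact (an aside): it's #1!")
# hyphens become spaces; the parenthetical aside, digits and punctuation are dropped

breakSentences("This is one sentence. Here is another!")
# returns a character vector with one element per sentence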

Clean Data

The cleaning pipeline performs the following steps:

- Break the data into sentences
- Convert all characters to lower case
- Remove numbers and special characters
- Clean up any residual characters
- Remove profanity words
- Strip extra white space

# Load a list of profanity words and strip stray characters from each entry
profanity <- readLines("Profanity.txt", skipNul=TRUE)
profanity <- str_replace_all(profanity, "[^[a-z]^[A-Z]^[0-9]^[']^[ ]]", "")
profanity <- str_trim(profanity)

# Cleaning Function

cleanData <- function (x) {
  cleanCorp <- tm_map(x, content_transformer(breakSentences))
  cleanCorp <- tm_map(cleanCorp,content_transformer(tolower))
  cleanCorp <- tm_map(cleanCorp,content_transformer(removeNumbers))
  cleanCorp <- tm_map(cleanCorp, content_transformer(removeSpecial))
  cleanCorp <- tm_map(cleanCorp, content_transformer(fixBrokenWords))
  cleanCorp <- tm_map(cleanCorp,removeWords,profanity)
  cleanCorp <- tm_map(cleanCorp,stripWhitespace)
  cleanCorp
}


cleanCorp<-cleanData(SampleCorpus)

Tokenize Data

Tokenize into Bigrams

Use the N-gram tokenizer to break the data into sets of 2 words, then create a term-document matrix with a frequency for each set. We keep only bigrams that occur more than 5 times to remove sparse sets.

# Processing for bigram
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_bi <- TermDocumentMatrix(cleanCorp, control = list(tokenize = BigramTokenizer))

instdm2 <- as.matrix(tdm_bi)

FreqMat2 <- data.frame(ST = rownames(instdm2), Freq = rowSums(instdm2))
row.names(FreqMat2) <- NULL

FreqMat2<-FreqMat2[with(FreqMat2, order(-Freq, ST)), ]

FreqMat2<-FreqMat2[FreqMat2$Freq>5,]

FreqMat2$w1=""
FreqMat2$w2=""

i=0
for(i in (1:nrow(FreqMat2))){
  FreqMat2[i,3:4]<-as.vector(unlist(stri_split_boundaries(FreqMat2[i,1], type="word",n=2,skip_word_none=TRUE)))
}

nrow(FreqMat2)
## [1] 7561

Bigram Plot

Plot the top 15 bigrams

#Exploratory Plots - Bigrams
TopFreqMat2<-FreqMat2[1:15,1:2]
TopFreqMat2<-TopFreqMat2[order(TopFreqMat2$Freq),]
par(las=2)
par(mar=c(5,8,4,2))
barplot(TopFreqMat2$Freq,names.arg=TopFreqMat2$ST,horiz=TRUE,main="Most commonly occurring bigrams")

Tokenize into Trigrams

Use the N-gram tokenizer to break the data into sets of 3 words, then create a term-document matrix with a frequency for each set. We keep only trigrams that occur more than 5 times to remove sparse sets.

# Processing for trigram
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm_tri <- TermDocumentMatrix(cleanCorp, control = list(tokenize = TrigramTokenizer))

instdm3 <- as.matrix(tdm_tri)

FreqMat3 <- data.frame(ST = rownames(instdm3), Freq = rowSums(instdm3))
row.names(FreqMat3) <- NULL

FreqMat3<-FreqMat3[with(FreqMat3, order(-Freq, ST)), ]

FreqMat3<-FreqMat3[FreqMat3$Freq>5,]

FreqMat3$w1=""
FreqMat3$w2=""
FreqMat3$w3=""
FreqMat3$phrase=""

i=0
for(i in (1:nrow(FreqMat3))){
  FreqMat3[i,3:5]<-as.vector(unlist(stri_split_boundaries(FreqMat3[i,1], type="word",n=3,skip_word_none=TRUE)))
  FreqMat3[i,6]<-paste(FreqMat3[i,3],FreqMat3[i,4])
  }

nrow(FreqMat3)
## [1] 1888

Trigram Plot

Plot the top 15 trigrams

#Exploratory Plots - Trigrams
TopFreqMat3<-FreqMat3[1:15,1:2]
TopFreqMat3<-TopFreqMat3[order(TopFreqMat3$Freq),]
par(las=2)
par(mar=c(5,8,4,2))
barplot(TopFreqMat3$Freq,names.arg=TopFreqMat3$ST,horiz=TRUE,main="Most commonly occurring trigrams")

Tokenize into 4-grams

Use the N-gram tokenizer to break the data into sets of 4 words, then create a term-document matrix with a frequency for each set. We keep only 4-grams that occur more than 5 times to remove sparse sets.

# Processing for 4gram

QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdm_quad <- TermDocumentMatrix(cleanCorp, control = list(tokenize = QuadgramTokenizer))

instdm4 <- as.matrix(tdm_quad)

FreqMat4 <- data.frame(ST = rownames(instdm4), Freq = rowSums(instdm4))
row.names(FreqMat4) <- NULL

FreqMat4<-FreqMat4[with(FreqMat4, order(-Freq, ST)), ]

FreqMat4<-FreqMat4[FreqMat4$Freq>5,]


FreqMat4$w1=""
FreqMat4$w2=""
FreqMat4$w3=""
FreqMat4$w4=""
FreqMat4$phrase=""


i=0
for(i in (1:nrow(FreqMat4))){
  FreqMat4[i,3:6]<-as.vector(unlist(stri_split_boundaries(FreqMat4[i,1], type="word",n=4,skip_word_none=TRUE)))
  FreqMat4[i,7]<-paste(FreqMat4[i,3],FreqMat4[i,4],FreqMat4[i,5])
}

nrow(FreqMat4)
## [1] 170

4-gram Plot

Plot the top 15 4-grams

#Exploratory Plots - 4grams
TopFreqMat4<-FreqMat4[1:15,1:2]
TopFreqMat4<-TopFreqMat4[order(TopFreqMat4$Freq),]
par(las=2)
par(mar=c(5,10,4,2))
barplot(TopFreqMat4$Freq,names.arg=TopFreqMat4$ST,horiz=TRUE,main="Most commonly occurring 4-grams")

Next Steps - Model creation

Now that we have created the n-grams and calculated their frequencies, we will build a simple back-off model to predict the next word. The model will work as follows: ask the user to enter a phrase, take the last 3 words of the phrase, and look them up in the 4-gram table; the prediction is the word that completes the most frequent matching 4-gram. If no match is found, drop the first of those words and look up the remaining 2-word phrase in the trigram table. If there is still no match, look up the last word in the bigram table. A minimal sketch of this back-off lookup is shown below.
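
The following is a minimal, illustrative sketch of the back-off lookup, not the final implementation. It assumes the frequency tables FreqMat2, FreqMat3 and FreqMat4 built above (with their w1..w4, phrase and Freq columns); the helper predictNextWord is hypothetical and only meant to make the back-off logic concrete.

# Hypothetical helper illustrating the back-off strategy described above
predictNextWord <- function(phrase) {
  # Split the input into lower-case words, mirroring the cleaning above
  words <- as.vector(unlist(stri_split_boundaries(tolower(phrase),
                                                  type = "word",
                                                  skip_word_none = TRUE)))
  n <- length(words)

  # 1. Look up the last 3 words in the 4-gram table
  if (n >= 3) {
    key <- paste(words[n-2], words[n-1], words[n])
    hits <- FreqMat4[FreqMat4$phrase == key, ]
    if (nrow(hits) > 0) return(hits$w4[which.max(hits$Freq)])
  }

  # 2. Back off: look up the last 2 words in the trigram table
  if (n >= 2) {
    key <- paste(words[n-1], words[n])
    hits <- FreqMat3[FreqMat3$phrase == key, ]
    if (nrow(hits) > 0) return(hits$w3[which.max(hits$Freq)])
  }

  # 3. Back off: look up the last word in the bigram table
  if (n >= 1) {
    hits <- FreqMat2[FreqMat2$w1 == words[n], ]
    if (nrow(hits) > 0) return(hits$w2[which.max(hits$Freq)])
  }

  NA_character_  # no prediction found
}

# Example call (illustrative):
# predictNextWord("thanks for the")

Because each frequency table is already sorted by descending frequency, the first matching row could be used directly instead of which.max.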