This report summarizes the exploratory data analysis behind a planned app: a prediction algorithm that takes a sequence of one or more words as input and suggests the most likely next word to the user.
We break this into five steps:
1. Load the data.
2. Clean the data.
3. Tokenize the data into bigrams, trigrams and 4-grams.
4. Build a model to predict the next word.
5. Build a Shiny app that takes a phrase from the user and returns the predicted next word (a rough sketch of the interface follows below).
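To make step 5 concrete, here is a minimal sketch of what the Shiny interface could look like. The function predictNextWord is a hypothetical placeholder for the model built in step 4; nothing below is part of the final implementation.
library(shiny)
# Hypothetical placeholder for the prediction model; a sketch of the real lookup is given at the end of this report.
predictNextWord <- function(phrase) "the"
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  textOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    predictNextWord(input$phrase)
  })
}
# shinyApp(ui = ui, server = server)   # run locally to launch the app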
First we load the libraries necessary to perform our analysis.
library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
library(stringi)
## Warning: package 'stringi' was built under R version 3.3.2
library(stringr)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.3.2
Load the files
con1 <- file("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(con1,encoding="UTF-8",skipNul=TRUE)
close(con1)
con2 <- file("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(con2,encoding="UTF-8",skipNul=TRUE)
close(con2)
con3 <- file("./Coursera-SwiftKey/final/en_US/en_US.news.txt",open="rb")
news <- readLines(con3, encoding="UTF-8",skipNul=TRUE)
close(con3)
length(twitter)
length(blogs)
length(news)
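To get a fuller sense of the size of each dataset before sampling, word counts and in-memory object sizes can also be checked; a quick sketch using stringi and base R (the exact numbers are omitted here):
# Approximate word counts per dataset
sum(stri_count_words(twitter))
sum(stri_count_words(blogs))
sum(stri_count_words(news))
# In-memory size of each dataset
format(object.size(twitter), units = "Mb")
format(object.size(blogs), units = "Mb")
format(object.size(news), units = "Mb")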
Each of these datasets contains a large number of lines, so processing them in full would be very time- and memory-intensive. We therefore first draw a random sample from each of the three datasets and save the samples as text files, so that we can work with these smaller datasets going forward.
set.seed(12345)
s_twitter <- sample(twitter,5000)
s_blogs <- sample(blogs,5000)
s_news <- sample(news,5000)
writeLines(s_twitter, "./Coursera-SwiftKey/final/en_US/Sampled Data/s_twitter.txt")
writeLines(s_blogs, "./Coursera-SwiftKey/final/en_US/Sampled Data/s_blogs.txt")
writeLines(s_news, "./Coursera-SwiftKey/final/en_US/Sampled Data/s_news.txt")
s_twitter <- readLines("./Coursera-SwiftKey/final/en_US/Sampled Data/s_twitter.txt",encoding = "UTF-8", skipNul=TRUE)
s_blogs <- readLines("./Coursera-SwiftKey/final/en_US/Sampled Data/s_blogs.txt",encoding = "UTF-8", skipNul=TRUE)
s_news <- readLines("./Coursera-SwiftKey/final/en_US/Sampled Data/s_news.txt",encoding = "UTF-8", skipNul=TRUE)
First we build a corpus from the sample files created above. Then we define a few helper functions to clean up the samples.
# Create a corpus from the data samples
SampleCorpus <- VCorpus(DirSource("./Coursera-SwiftKey/final/en_US/Sampled Data"),
                        readerControl = list(reader = readPlain,
                                             language = "en",
                                             load = TRUE))
# Function to remove special characters from the text data
removeSpecial <- function(x){
  x <- str_replace_all(x, "[’]", "'")   # replace curly apostrophes with straight ones
  x <- str_replace_all(x, "[-]", " ")   # treat hyphens as word separators
  x <- str_replace(x, " \\(.*\\)", "")  # drop parenthesised text
  x <- str_replace_all(x, "[^[a-zA-Z][ ]['][’][é]]", "")  # keep only letters, spaces, apostrophes and é
  x
}
# The cleaning process leaves some residual letters after removing special characters which need to be fixed.
fixBrokenWords <- function(x){
  # Re-attach negative contractions, e.g. "don't" / "don t" -> "dont"
  x <- gsub("n't ", "nt ", x)
  x <- gsub("n t ", "nt ", x)
  # Expand the remaining common contractions to full words
  x <- gsub("'ve ", " have ", x)
  x <- gsub(" ve ", " have ", x)
  x <- gsub("'s ", " ", x)   # possessives and "is" contractions are simply dropped
  x <- gsub(" s ", " ", x)
  x <- gsub("'m ", " am ", x)
  x <- gsub("'ll ", " will ", x)
  x <- gsub(" ll ", " will ", x)
  x <- gsub("'d ", " had ", x)
  x <- gsub(" d ", " had ", x)
  x
}
# Function to break data samples into individual sentences
breakSentences <- function(x){
  x <- stri_split_boundaries(x, type = "sentence")
  x <- as.vector(unlist(x))
  x
}
Next we clean the samples: break the data into sentences, convert all characters to lower case, remove numbers and special characters, clean up any residual characters, remove profanity, and strip extra white space.
profanity <- readLines("Profanity.txt",skipNul=TRUE)
profanity<-str_replace_all(profanity, "[^[a-z]^[A-Z]^[0-9]^[']^[ ]]", "")
profanity<-str_trim(profanity)
# Cleaning Function
cleanData <- function (x) {
  cleanCorp <- tm_map(x, content_transformer(breakSentences))
  cleanCorp <- tm_map(cleanCorp, content_transformer(tolower))
  cleanCorp <- tm_map(cleanCorp, content_transformer(removeNumbers))
  cleanCorp <- tm_map(cleanCorp, content_transformer(removeSpecial))
  cleanCorp <- tm_map(cleanCorp, content_transformer(fixBrokenWords))
  cleanCorp <- tm_map(cleanCorp, removeWords, profanity)
  cleanCorp <- tm_map(cleanCorp, stripWhitespace)
  cleanCorp
}
cleanCorp<-cleanData(SampleCorpus)
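As a quick sanity check, the helper functions can be applied to a single made-up string outside the corpus pipeline (the phrase below is purely illustrative):
# The contraction should be expanded, the hyphens replaced by spaces,
# and the digit and exclamation mark dropped.
fixBrokenWords(removeSpecial(tolower("I'll see you-know-who at 5 o'clock!")))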
We use an N-gram tokenizer to break the cleaned data into sets of two consecutive words (bigrams) and then create a term-document matrix with the frequency of each set. We keep only bigrams that occur more than 5 times, to drop sparse entries.
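To illustrate what the tokenizer produces, here is a toy call on a made-up sentence (not part of the corpus):
NGramTokenizer("this is a short example", Weka_control(min = 2, max = 2))
# should return the overlapping word pairs "this is", "is a", "a short" and "short example"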
# Processing for bigram
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_bi <- TermDocumentMatrix(cleanCorp, control = list(tokenize = BigramTokenizer))
instdm2 <- as.matrix(tdm_bi)
FreqMat2 <- data.frame(ST = rownames(instdm2), Freq = rowSums(instdm2))
row.names(FreqMat2) <- NULL
FreqMat2<-FreqMat2[with(FreqMat2, order(-Freq, ST)), ]
FreqMat2<-FreqMat2[FreqMat2$Freq>5,]
FreqMat2$w1=""
FreqMat2$w2=""
i=0
for(i in (1:nrow(FreqMat2))){
  FreqMat2[i,3:4] <- as.vector(unlist(stri_split_boundaries(FreqMat2[i,1], type="word", n=2, skip_word_none=TRUE)))
}
nrow(FreqMat2)
## [1] 7561
Plot the top 15 bigrams
#Exploratory Plots - Bigrams
TopFreqMat2<-FreqMat2[1:15,1:2]
TopFreqMat2<-TopFreqMat2[order(TopFreqMat2$Freq),]
par(las=2)
par(mar=c(5,8,4,2))
barplot(TopFreqMat2$Freq,names.arg=TopFreqMat2$ST,horiz=TRUE,main="Most commonly occurring bigrams")
We use the N-gram tokenizer again to break the data into sets of three consecutive words (trigrams) and create a term-document matrix with the frequency of each set. As before, we keep only trigrams that occur more than 5 times.
# Processing for trigram
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm_tri <- TermDocumentMatrix(cleanCorp, control = list(tokenize = TrigramTokenizer))
instdm3 <- as.matrix(tdm_tri)
FreqMat3 <- data.frame(ST = rownames(instdm3), Freq = rowSums(instdm3))
row.names(FreqMat3) <- NULL
FreqMat3<-FreqMat3[with(FreqMat3, order(-Freq, ST)), ]
FreqMat3<-FreqMat3[FreqMat3$Freq>5,]
FreqMat3$w1=""
FreqMat3$w2=""
FreqMat3$w3=""
FreqMat3$phrase=""
i=0
for(i in (1:nrow(FreqMat3))){
  FreqMat3[i,3:5] <- as.vector(unlist(stri_split_boundaries(FreqMat3[i,1], type="word", n=3, skip_word_none=TRUE)))
  FreqMat3[i,6] <- paste(FreqMat3[i,3], FreqMat3[i,4])
}
nrow(FreqMat3)
## [1] 1888
Plot the top 15 trigrams
#Exploratory Plots - Trigrams
TopFreqMat3<-FreqMat3[1:15,1:2]
TopFreqMat3<-TopFreqMat3[order(TopFreqMat3$Freq),]
par(las=2)
par(mar=c(5,8,4,2))
barplot(TopFreqMat3$Freq,names.arg=TopFreqMat3$ST,horiz=TRUE,main="Most commonly occurring trigrams")
Finally, we break the data into sets of four consecutive words (4-grams) and create a term-document matrix with the frequency of each set, again keeping only 4-grams that occur more than 5 times.
# Processing for 4gram
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdm_quad <- TermDocumentMatrix(cleanCorp, control = list(tokenize = QuadgramTokenizer))
instdm4 <- as.matrix(tdm_quad)
FreqMat4 <- data.frame(ST = rownames(instdm4), Freq = rowSums(instdm4))
row.names(FreqMat4) <- NULL
FreqMat4<-FreqMat4[with(FreqMat4, order(-Freq, ST)), ]
FreqMat4<-FreqMat4[FreqMat4$Freq>5,]
FreqMat4$w1=""
FreqMat4$w2=""
FreqMat4$w3=""
FreqMat4$w4=""
FreqMat4$phrase=""
i=0
for(i in (1:nrow(FreqMat4))){
  FreqMat4[i,3:6] <- as.vector(unlist(stri_split_boundaries(FreqMat4[i,1], type="word", n=4, skip_word_none=TRUE)))
  FreqMat4[i,7] <- paste(FreqMat4[i,3], FreqMat4[i,4], FreqMat4[i,5])
}
nrow(FreqMat4)
## [1] 170
#Exploratory Plots - 4grams
TopFreqMat4<-FreqMat4[1:15,1:2]
TopFreqMat4<-TopFreqMat4[order(TopFreqMat4$Freq),]
par(las=2)
par(mar=c(5,10,4,2))
barplot(TopFreqMat4$Freq,names.arg=TopFreqMat4$ST,horiz=TRUE,main="Most commonly occurring 4-grams")
Now that we have created the n-grams and calculated their frequencies, we will build a simple backoff model to predict the next word. The model will work as follows:
1. Ask the user to enter a phrase.
2. If the phrase is three or more words long, take the last three words and look them up in the 4-gram table; the prediction is the fourth word of the most frequent matching 4-gram.
3. If no match is found, drop the first of those three words and look up the remaining two-word phrase in the trigram table.
4. If there is still no match, look up the last word in the bigram table.
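Below is a minimal sketch of this backoff lookup against the FreqMat4, FreqMat3 and FreqMat2 tables built above. The function name predictNextWord and the simple cleaning of the input phrase are illustrative choices rather than the final implementation, and the function returns NA when no n-gram matches.
predictNextWord <- function(phrase) {
  # Split the input phrase into lower-case words, mirroring the corpus cleaning
  words <- as.vector(unlist(stri_split_boundaries(tolower(phrase), type = "word",
                                                  skip_word_none = TRUE)))
  n <- length(words)
  # 4-gram lookup: match the last three words against the first three words of each 4-gram
  if (n >= 3) {
    key <- paste(words[(n-2):n], collapse = " ")
    hit <- FreqMat4[FreqMat4$phrase == key, ]
    if (nrow(hit) > 0) return(hit$w4[1])   # tables are already sorted by decreasing frequency
  }
  # Trigram lookup: back off to the last two words
  if (n >= 2) {
    key <- paste(words[(n-1):n], collapse = " ")
    hit <- FreqMat3[FreqMat3$phrase == key, ]
    if (nrow(hit) > 0) return(hit$w3[1])
  }
  # Bigram lookup: back off to the last word only
  if (n >= 1) {
    hit <- FreqMat2[FreqMat2$w1 == words[n], ]
    if (nrow(hit) > 0) return(hit$w2[1])
  }
  NA_character_   # no match in any table
}
predictNextWord("thanks for the")   # example call; the result depends on the sampled data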