Predicting The Next Word

Loading Necessary Libraries

library(NLP)
library(tm)
library(fpc)
library(RColorBrewer)
library(ggplot2)
library(stringi)
library(RWeka)
library(data.table)

Reading The Table

# Load the n-gram frequency table from disk into the global `ngramtable`,
# then key it (in place) on the word columns followed by the frequency so the
# prediction functions below can look rows up by key with
# ngramtable[list(...)].
ngramtable <- fread(
        "C:/Data Science/R/Coursera Capstone Project/Final Project Submission/ngrams.txt"
)
setkey(ngramtable, word0, word1, word2, word3, freq)

Cleaning The Text

# Normalise raw input text for n-gram lookup.
#
# Steps, in order:
#   1. lower-case the text;
#   2. delete every character that is neither a Unicode letter nor whitespace
#      (digits, punctuation, symbols);
#   3. collapse each run of whitespace into a single space;
#   4. trim leading and trailing whitespace.
#
# Args:
#   text: character vector to clean.
#
# Returns:
#   Character vector of the same length as `text`, containing only lower-case
#   letters separated by single spaces.
cleanthetext <- function(text){
        cleanedtext <- tolower(text)
        # Strip non-letters BEFORE collapsing whitespace: removing a token
        # such as "," or "2" that sits between two spaces would otherwise
        # leave a double space behind.
        cleanedtext <- gsub("[^\\p{L}\\s]+", "", cleanedtext, perl = TRUE)
        cleanedtext <- gsub("[[:space:]]+", " ", cleanedtext)
        # A leading space would later be split into an empty first "word".
        trimws(cleanedtext)
}

Splitting The Words In The Text

# Clean the text with cleanthetext() and split it into individual words.
#
# Args:
#   text: character vector of raw input text.
#
# Returns:
#   Character vector of words, in order of appearance, with no empty-string
#   entries.
splittheword <- function(text){
        cleanedtext <- cleanthetext(text)
        # Split on runs of whitespace rather than on a single literal space,
        # so repeated spaces do not produce empty tokens.
        splittedtext <- unlist(strsplit(cleanedtext, "[[:space:]]+"))
        # strsplit() still emits a leading "" when the string starts with a
        # separator; drop it so word positions stay meaningful to callers.
        splittedtext[nzchar(splittedtext)]
}

Single Word Prediction

# Predict the next word from one preceding word.
#
# Looks up the rows of the global keyed `ngramtable` whose first two key
# columns equal "-" and cleanedtextlist[1], discards rows whose `word2` is the
# "-" placeholder, orders the remainder by descending `freq` and keeps the
# first row for each distinct (word1, word2) pair.
#
# Args:
#   cleanedtextlist: character vector; only its first element is used.
#
# Returns:
#   Character vector of length 2, c(guess, alternate):
#     guess     - `word2` of the most frequent match, or the message
#                 "Unable to predict the next word" when there is no match;
#     alternate - `word2` of the second most frequent match, or "" when
#                 fewer than two matches exist.
singlewordprediction <- function(cleanedtextlist){
        matches <- ngramtable[list("-", cleanedtextlist[1])]
        # A lookup with no match comes back with NA in `word2`; the NA
        # comparison result is dropped by the data.table subset.
        matches <- matches[matches$word2 != "-", ]
        matches <- matches[order(matches$freq, decreasing = TRUE), ]
        matches <- unique(matches, by = c("word1", "word2"))

        candidates <- matches$word2

        # nrow(), not length(): length() of a data.table is its number of
        # columns, which made this guard always true and leaked NA.
        alternateprediction <- ""
        if (nrow(matches) > 1) {
                alternateprediction <- candidates[2]
        }

        guessprediction <- candidates[1]
        # is.null() first with short-circuit ||, so a NULL never reaches
        # is.na() and the condition is always a single TRUE/FALSE.
        if (is.null(guessprediction) || is.na(guessprediction)) {
                guessprediction <- "Unable to predict the next word"
        }
        c(guessprediction, alternateprediction)
}

Two Words Prediction

# Predict the next word from two preceding words.
#
# Looks up the rows of the global keyed `ngramtable` whose first three key
# columns equal "-", cleanedtextlist[1] and cleanedtextlist[2], discards rows
# whose `word3` is the "-" placeholder, orders the remainder by descending
# `freq` and keeps the first row for each distinct (word1, word2, word3).
# When nothing matches, backs off to singlewordprediction() on the most
# recent word, cleanedtextlist[2].
#
# Args:
#   cleanedtextlist: character vector; elements 1 and 2 are used.
#
# Returns:
#   Character vector of length 2, c(guess, alternate). `alternate` is "" when
#   fewer than two matches exist; on back-off the result is whatever
#   singlewordprediction() returns.
twowordprediction <- function(cleanedtextlist){
        matches <- ngramtable[list("-", cleanedtextlist[1], cleanedtextlist[2])]
        matches <- matches[matches$word3 != "-", ]
        matches <- matches[order(matches$freq, decreasing = TRUE), ]
        matches <- unique(matches, by = c("word1", "word2", "word3"))

        candidates <- matches$word3
        guessprediction <- candidates[1]

        # is.null() first with short-circuit ||, so the condition is always a
        # single TRUE/FALSE.
        if (is.null(guessprediction) || is.na(guessprediction)) {
                # Return the back-off result as-is. It is already a
                # (guess, alternate) pair; appending another element to it
                # would produce a length-3 vector.
                return(singlewordprediction(cleanedtextlist[2]))
        }

        # nrow(), not length(): length() of a data.table is its number of
        # columns, which made this guard always true and leaked NA.
        alternateprediction <- ""
        if (nrow(matches) > 1) {
                alternateprediction <- candidates[2]
        }
        c(guessprediction, alternateprediction)
}

Three Words Prediction

# Predict the next word from three preceding words.
#
# Looks up the rows of the global keyed `ngramtable` whose first four key
# columns equal "-" and cleanedtextlist[1..3], discards rows whose `word4` is
# the "-" placeholder, orders the remainder by descending `freq` and keeps the
# first row for each distinct (word1, word2, word3, word4).
# When nothing matches, backs off to twowordprediction() on the two most
# recent words, cleanedtextlist[2] and cleanedtextlist[3].
#
# Args:
#   cleanedtextlist: character vector; elements 1 to 3 are used.
#
# Returns:
#   Character vector of length 2, c(guess, alternate). `alternate` is "" when
#   fewer than two matches exist; on back-off the first two elements of the
#   twowordprediction() result are returned.
threewordprediction <- function(cleanedtextlist){
        matches <- ngramtable[list("-", cleanedtextlist[1], cleanedtextlist[2],
                                   cleanedtextlist[3])]
        # NOTE(review): `word4` is not among the key columns set on
        # `ngramtable`; this assumes the loaded file provides it - confirm.
        matches <- matches[matches$word4 != "-", ]
        matches <- matches[order(matches$freq, decreasing = TRUE), ]
        matches <- unique(matches,
                          by = c("word1", "word2", "word3", "word4"))

        candidates <- matches$word4
        guessprediction <- candidates[1]

        # is.null() first with short-circuit ||, so the condition is always a
        # single TRUE/FALSE.
        if (is.null(guessprediction) || is.na(guessprediction)) {
                # Back off to the two most recent words. The result is a
                # multi-element vector, so it must not be stored in
                # `guessprediction` and re-tested with if(): a condition of
                # length > 1 is an error in R >= 4.2. twowordprediction()
                # performs its own single-word back-off, so no further
                # fallback is needed here.
                backoff <- twowordprediction(c(cleanedtextlist[2],
                                               cleanedtextlist[3]))
                # Keep the (guess, alternate) pair shape for callers.
                return(backoff[1:2])
        }

        # nrow(), not length(): length() of a data.table is its number of
        # columns, which made this guard always true and leaked NA.
        alternateprediction <- ""
        if (nrow(matches) > 1) {
                alternateprediction <- candidates[2]
        }
        c(guessprediction, alternateprediction)
}

Predicting The Next Word

Abhijit Jantre

20 February 2017

Loading Necessary Libraries

Reading The Table

Cleaning The Text

Splitting The Words In The Text

Single Word Prediction

Two Words Prediction

Three Words Prediction