library(tm)
library(ngram)
## Pre-process the input text for the model function.
#
# Each element of `x` is cleaned with ngram::preprocess(): lower-cased,
# punctuation and numbers removed, spacing fixed. The cleaned text is then
# tokenised with ngram_asweka().
#
# x: the raw user text (in practice a single string, since the cleaned text
#    is handed to ngram_asweka() as-is).
#
# Returns the bigrams of the cleaned text, or its unigrams when no bigram
# could be produced.
prepro <- function(x) {
  clean_one <- function(txt) {
    preprocess(txt,
               case = "lower",
               remove.punct = TRUE,
               remove.numbers = TRUE,
               fix.spacing = TRUE)
  }
  # USE.NAMES = FALSE yields the unnamed character vector directly.
  cleaned <- vapply(x, clean_one, character(1), USE.NAMES = FALSE)

  bigrams <- ngram_asweka(cleaned, min = 2, max = 2)
  if (length(bigrams) != 0) {
    return(bigrams)
  }
  # Fall back to single words when the text is too short for a bigram.
  ngram_asweka(cleaned, min = 1, max = 1)
}
# Helper used by the model: returns the last word of an n-gram.
#
# x1: a single n-gram string.
#
# The string is split into unigrams with ngram_asweka() and the final one is
# returned; the result is empty when no unigram is produced.
lastword <- function(x1) {
  tokens <- ngram_asweka(x1, min = 1, max = 1)
  tail(tokens, 1)
}
# Model function: predicts the next word from trigram and bigram tables.
#
# x:           character vector of n-grams from the user input (as returned
#              by prepro()); only the last element is used as context.
# TrigramData: data frame with a `words` column (n-gram text) and a
#              `frequency` column.
# BigramData:  data frame with the same two columns.
# gamma2:      weight applied to the bigram probabilities.
# gamma3:      weight applied to the trigram probabilities.
#
# Returns a data frame with columns `Words` (candidate next word) and
# `Probability` (summed weighted probability), ordered by decreasing
# probability. When the input is empty or nothing matches, a zero-row data
# frame with the same columns is returned instead of raising an error.
model <- function(x, TrigramData, BigramData, gamma2 = 0.5, gamma3 = 0.5) {
  no_prediction <- data.frame(Words = character(0),
                              Probability = numeric(0),
                              stringsAsFactors = FALSE)

  # Rows of `ngrams` whose text starts with `prefix` followed by a space,
  # limited to the first `limit` matches, with their weighted probabilities.
  # NOTE(review): "first 10" equals "top 10" only if the tables are already
  # sorted by decreasing frequency -- confirm against how they are built.
  top_matches <- function(ngrams, prefix, weight, limit = 10L) {
    words <- as.character(ngrams$words)
    # Literal prefix test, so user text is never interpreted as a regular
    # expression; which() also drops NA entries.
    hits <- head(which(startsWith(words, paste0(prefix, " "))), limit)
    freq <- ngrams$frequency[hits]
    data.frame(words = words[hits],
               probability = (freq / sum(freq)) * weight,
               stringsAsFactors = FALSE)
  }

  # Last n-gram of the user input data
  context <- x[length(x)]
  if (length(context) == 0L || is.na(context) || !nzchar(context)) {
    return(no_prediction)
  }

  # Trigram lookup is only meaningful when the context holds two or more words
  tri <- data.frame(words = character(0), probability = numeric(0),
                    stringsAsFactors = FALSE)
  if (length(ngram_asweka(context, min = 1, max = 1)) > 1) {
    tri <- top_matches(TrigramData, context, gamma3)
  }

  # Bigram lookup on the last word of the context
  lw <- lastword(context)
  if (length(lw) == 0L) {
    return(no_prediction)
  }
  bi <- top_matches(BigramData, lw, gamma2)

  # Join trigram and bigram matches (trigrams first)
  s <- rbind(tri, bi)
  if (nrow(s) == 0L) {
    # aggregate() refuses zero-row input, so report "no prediction" directly
    return(no_prediction)
  }

  # The last word of each matched n-gram is the predicted word
  s$words <- vapply(s$words, lastword, character(1), USE.NAMES = FALSE)

  # Sum the probabilities of words predicted by both tables
  s <- aggregate(s$probability, by = list(s$words), FUN = sum)
  s <- s[order(s$x, decreasing = TRUE), ]
  names(s) <- c("Words", "Probability")
  s
}