Model Validation

  1. Let us validate our model on 1,000 rows of randomly sampled data.
  2. We will validate it using both the Interpolation and the Kneser-Ney Smoothing methods.
  3. Validating a text-prediction model is tricky. Here is the approach:
     1. Our Shiny app makes 5 ranked predictions for the next word under each algorithm.
     2. If the target word is found in any of the 5 predictions, we count that row as a SUCCESS.
     3. Accuracy Metric = Total SUCCESS / Total Rows (see the sketch at the end of this section).
  4. Another metric to track is Response Time, measured in seconds (as a decimal) on a per-row basis.

Load Libraries

library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(qdapDictionaries)
library(RColorBrewer)
library(reshape2)
library(dplyr)
library(data.table)

Get Offensive Words Data
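
The token cleanup below removes profanity via fBufBadWords (fixed matches) and fBufBadWords.2 (regex matches), and drops purely non-alphabetic tokens via special.chars. The loading code was not included in this section, so here is a minimal sketch under stated assumptions: "badwords.txt" is a hypothetical file name, and the regex list and special-character pattern are plausible stand-ins.

# Minimal sketch -- "badwords.txt" is a hypothetical stand-in for whatever
# one-word-per-line offensive-words list was actually used.
conBad <- file("badwords.txt", "rb")
fBufBadWords <- readLines(conBad, skipNul = TRUE)
close(conBad)
# Regex variants to also catch suffixed forms of the listed words (assumption).
fBufBadWords.2 <- paste0("^", fBufBadWords, "[a-z]*$")
# Tokens consisting entirely of non-letter characters (assumption).
special.chars <- "^[^a-z]+$"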

Clean Up Token Function

cleanUpToken <- function(myToken)
{
  # fBufBadWords, fBufBadWords.2 and special.chars come from the
  # "Get Offensive Words Data" step above; remove.list is built further below
  # (before this function is first called).
  myToken <- myToken %>%
    tokens_select(pattern = fBufBadWords, selection = "remove", valuetype = "fixed", padding = TRUE) %>%
    tokens_select(pattern = fBufBadWords.2, selection = "remove", valuetype = "regex", padding = TRUE) %>%
    tokens_select(remove.list, selection = "remove", valuetype = "fixed", padding = TRUE) %>%
    tokens_select(special.chars, selection = "remove", valuetype = "regex", padding = TRUE)

  # Expand common chat abbreviations and slang.
  myToken <- myToken %>% tokens_replace("rt","right", valuetype = "fixed") %>%
  tokens_replace("lol","laugh", valuetype = "fixed") %>%
  tokens_replace("im","i'm", valuetype = "fixed") %>%
  tokens_replace("ur","your", valuetype = "fixed") %>%
  tokens_replace("wanna","want", valuetype = "fixed") %>%
  tokens_replace("omg","oh", valuetype = "fixed") %>%
  tokens_replace("bro","friend", valuetype = "fixed") %>%
  tokens_replace("yo","nice", valuetype = "fixed") %>%
  tokens_replace("thx","thanks", valuetype = "fixed") %>%
  tokens_replace("ppl","people", valuetype = "fixed") %>%
  tokens_replace("haha","funny", valuetype = "fixed") %>%
  tokens_replace("^(ha)+$","funny", valuetype = "regex") %>%
  tokens_replace("lmao","funny", valuetype = "fixed") %>%
  tokens_replace("lmfao","funny", valuetype = "fixed") %>%
  tokens_replace("smh","shocked", valuetype = "fixed") %>%
  tokens_replace("dm","contact", valuetype = "fixed") %>%
  tokens_replace("cuz","because", valuetype = "fixed") %>%
  tokens_replace("aint","isn't", valuetype = "fixed") %>%
  tokens_replace("idk","unsure", valuetype = "fixed") %>%
  tokens_replace("nite","night", valuetype = "fixed") %>%
  tokens_replace("+","and",valuetype = "fixed") %>%
  tokens_replace("y'all","everyone",valuetype = "fixed") %>%
  tokens_replace("yr","year",valuetype = "fixed") %>%
  tokens_replace("gettin","getting",valuetype = "fixed") %>%
  tokens_replace("gotta","should",valuetype = "fixed") 

  return(myToken)
}

Generate Corpus Function

set.seed(12345)
generateCorpus <- function(fileName,doctag)
{
  con <- file(fileName, "rb")
  fBuf <- readLines(con,skipNul = TRUE)
  close(con)
  fBuf.length <- length(fBuf)
  fText <- corpus(fBuf,docnames = paste0(doctag,"_",seq_len(fBuf.length)))
  fText <- fText %>% corpus_reshape(to="sentences")
  rm(fBuf)
  return(fText)
}

Generate and Merge All Corpora

myBlogCorpus <- generateCorpus("./final/en_US/en_US.blogs.txt","blog")
myNewCorpus <- generateCorpus("./final/en_US/en_US.news.txt","news")
myTwitterCorpus <- generateCorpus("./final/en_US/en_US.twitter.txt","twitter")
myMergedCorpus <- c(myBlogCorpus,myNewCorpus,myTwitterCorpus)

Sample 1% of Corpus

#Consciously setting a different seed in the hope of getting unseen data
set.seed(555)
merged.corpus.length <- length(myMergedCorpus)
mySampleCorpus <- myMergedCorpus[sample(merged.corpus.length,merged.corpus.length*0.01)]

Generate Tokens

myToken <- tokens(mySampleCorpus,remove_punct = TRUE)
my.addon.words <- NULL
is.word  <- function(x) x %in% c(GradyAugmented,my.addon.words)

remove.list <- NULL
remove.list <- c(remove.list,0:9)                                  # bare digits
remove.list <- c(remove.list,LETTERS[!(LETTERS %in% c('A','I'))])  # single uppercase letters except A and I
remove.list <- c(remove.list,letters[!(letters %in% c('a','i'))])  # single lowercase letters except a and i
remove.list <- c(remove.list, TRUE, FALSE)                         # coerced to "TRUE"/"FALSE" tokens

Clean Up Token

myToken <- myToken %>% cleanUpToken()
token.length <- length(myToken)
token.length
## [1] 80707

Generate N-Grams and DFMs

Although we have generated 6-grams, 5-grams, 4-grams and 3-grams here, we conduct the validation exercise on 5-grams: the first 4 words are the predictors and the 5th word is the target word.
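
To make this concrete, here is how one sampled 5-gram (taken from the head of pgNames below) splits into predictor words and target word, using the same gsub patterns as the validation loops further down:

mytext <- "do_we_just_embrace_the"
targetword <- gsub("^([a-z']+_)+","",mytext)
targetword
## [1] "the"
inputtext <- gsub("_"," ",gsub("_[a-z']+$","",mytext))
inputtext
## [1] "do we just embrace"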

hGramToken <- tokens_ngrams(myToken, n = 6)
hGramToken <- hGramToken %>% tokens_remove("_{2,}")

pGramToken <- tokens_ngrams(myToken, n = 5)
pGramToken <- pGramToken %>% tokens_remove("_{2,}")

qGramToken <- tokens_ngrams(myToken, n = 4)
qGramToken <- qGramToken %>% tokens_remove("_{2,}")

tGramToken <- tokens_ngrams(myToken, n = 3)
tGramToken <- tGramToken %>% tokens_remove("_{2,}")
dfmHG <- dfm(hGramToken)
dfmPG <- dfm(pGramToken)
dfmQG <- dfm(qGramToken)
dfmTG <- dfm(tGramToken)
hgNames <- colnames(dfmHG)
pgNames <- colnames(dfmPG)
qgNames <- colnames(dfmQG)
tgNames <- colnames(dfmTG)

hgNames.length <- length(hgNames)
pgNames.length <- length(pgNames)
qgNames.length <- length(qgNames)
tgNames.length <- length(tgNames)
head(hgNames)
## [1] "or_do_we_just_embrace_the"            
## [2] "do_we_just_embrace_the_role"          
## [3] "we_just_embrace_the_role_of"          
## [4] "just_embrace_the_role_of_aesthetic"   
## [5] "embrace_the_role_of_aesthetic_editing"
## [6] "the_role_of_aesthetic_editing_in"
head(pgNames)
## [1] "or_do_we_just_embrace"         "do_we_just_embrace_the"       
## [3] "we_just_embrace_the_role"      "just_embrace_the_role_of"     
## [5] "embrace_the_role_of_aesthetic" "the_role_of_aesthetic_editing"
head(qgNames)
## [1] "or_do_we_just"          "do_we_just_embrace"     "we_just_embrace_the"   
## [4] "just_embrace_the_role"  "embrace_the_role_of"    "the_object_gets_dipped"
head(tgNames)
## [1] "or_do_we"         "do_we_just"       "we_just_embrace"  "just_embrace_the"
## [5] "embrace_the_role" "the_object_gets"
pgNames.length
## [1] 598015
qgNames.length
## [1] 657067
tgNames.length
## [1] 635524
set.seed(666)
hgSample <- hgNames[sample(hgNames.length,hgNames.length*0.01)]
pgSample <- pgNames[sample(pgNames.length,pgNames.length*0.01)]
qgSample <- qgNames[sample(qgNames.length,qgNames.length*0.01)]
tgSample <- tgNames[sample(tgNames.length,tgNames.length*0.01)]
pgSample.length <- length(pgSample)
qgSample.length <- length(qgSample)
tgSample.length <- length(tgSample)
head(hgSample)
## [1] "of_stress_as_we_started_out"           
## [2] "might_draw_an_offer_because_both"      
## [3] "vision_for_the_promotion_and_marketing"
## [4] "find_a_way_to_capitalize_hitting"      
## [5] "sketchbooks_which_i_always_alter_along"
## [6] "the_white_norfork_and_other_tailwaters"
head(pgSample)
## [1] "in_funding_the_trust_so"            "legislation_recently_passed_in_the"
## [3] "stabilizes_otherwise_i_do_not"      "have_to_endure_days_twice"         
## [5] "day_and_he_cares_about"             "eighth_grade_i_turned_out"
head(qgSample)
## [1] "of_things_by_placing" "rolls_around_it_can"  "follow_me_please_i"  
## [4] "practice_and_play_in" "waits_on_them_with"   "was_not_me_who"
head(tgSample)
## [1] "can't_even_imagine" "you're_out_there"   "her_open_mouth"    
## [4] "since_the_age"      "store_played_him"   "wish_i_can"
pgSample.length
## [1] 5980
qgSample.length
## [1] 6570
tgSample.length
## [1] 6355

Let us get down to prediction now. First we load the precomputed n-gram frequency tables (unigram through 6-gram, the *_Short.rds files) and the continuation-count tables (*_CC.rds) used by Kneser-Ney smoothing, along with an extended stop-word list.

rdylgn <- brewer.pal(name="RdYlGn",n=11)
greys <- brewer.pal(name="Greys",n=9)
dfHG <- readRDS("./DSCapstone_TextPredictor/HG_Short.rds")
dfPG <- readRDS("./DSCapstone_TextPredictor/PG_Short.rds")
dfQG <- readRDS("./DSCapstone_TextPredictor/QG_Short.rds")
dfTG <- readRDS("./DSCapstone_TextPredictor/TG_Short.rds")
dfBG <- readRDS("./DSCapstone_TextPredictor/BG_Short.rds")
dfUG <- readRDS("./DSCapstone_TextPredictor/UG_Short.rds")

dfHG.CC <- readRDS("./DSCapstone_TextPredictor/HG_CC.rds")
dfPG.CC <- readRDS("./DSCapstone_TextPredictor/PG_CC.rds")
dfQG.CC <- readRDS("./DSCapstone_TextPredictor/QG_CC.rds")
dfTG.CC <- readRDS("./DSCapstone_TextPredictor/TG_CC.rds")
dfBG.CC <- readRDS("./DSCapstone_TextPredictor/BG_CC.rds")

conStop <- file("morestopwords.txt", "rb" )
fBufStop <- readLines(conStop,skipNul = TRUE)
close(conStop)
fBufStop <- union(fBufStop,stopwords("en"))

short.stop.words <- unlist(strsplit("my our your his her its their the a an and",split=" "))

substrRight <- function(x, n){
  substr(x, nchar(x)-n+1, nchar(x))
}

isMatchFound <- function(df)
{
  if (!is.null(df))
  {
    if (nrow(df) > 0)
    {
        return(TRUE)
    }
  }
  return(FALSE)
}
searchPatternInNG <- function(search.prefix, dfNames, dfIndex, short.stop.words, maskStopWords = TRUE)
{
  # Expand any "\w+" stop-word placeholders into an alternation over
  # short.stop.words, then strip the stray "+" left over from "\w+".
  search.prefix <- gsub("\\\\w+",paste0("(",paste(short.stop.words,collapse="|"),")"),search.prefix)
  search.prefix <- gsub("[+]","",search.prefix)

  # Match n-grams that start with the prefix and have exactly one more word.
  search.string <- paste0("^",search.prefix,"_[a-z']+$")
  mytext <- paste0("dfMatch <- as.data.frame(", dfNames[dfIndex], "[feature %like% search.string])")
  eval(parse(text=mytext))
  
  # Earlier revisions looked up the denominator in the (n-1)-gram table here;
  # it is now computed over the matched (and stop-word-filtered) rows below.
  dfMatch$word <- gsub(paste0("^",search.prefix,"_"),"",dfMatch$feature)
  if (maskStopWords)
  {
    dfMatch <- dfMatch[!(dfMatch$word %in% stopwords("en")),]
  }
  denom <- sum(dfMatch$value)
  
  dfMatch$probs <- dfMatch$value / denom
  dfMatch <- dfMatch[c("word","probs")]
  dfMatch <- dfMatch[order(-dfMatch$probs),]
  colnames(dfMatch)[2] <- paste0("p",dfIndex)
  dfMatch <- head(dfMatch,10)
  return(dfMatch)
}
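
To see what pattern this builds, here is a hypothetical worked example: a prefix with "the" masked as a stop-word placeholder (as returnSimpleMatch below does) expands into an alternation and is anchored to match exactly one following word.

search.prefix <- "end_of_\\w+_season"
search.prefix <- gsub("\\\\w+",paste0("(",paste(short.stop.words,collapse="|"),")"),search.prefix)
search.prefix <- gsub("[+]","",search.prefix)
paste0("^",search.prefix,"_[a-z']+$")
## [1] "^end_of_(my|our|your|his|her|its|their|the|a|an|and)_season_[a-z']+$"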

returnSimpleMatch <- function(mytext, dfNames, short.stop.words = "", maskStopWords = TRUE)
{
  nGramText <- unlist(strsplit(mytext," "))
  if (maskStopWords)
  {
    nGramText[nGramText %in% short.stop.words] <- "\\w+"
  }
  dfIndex <- length(nGramText) + 1

  if (dfIndex > 6)
  {
    #print(nGramText)
    nGramText <- nGramText[(length(nGramText)-4):length(nGramText)]
    #print(nGramText)
  }
  dfIndex <- length(nGramText) + 1
  
  max.dfIndex <- dfIndex
  search.prefix <- paste(nGramText,collapse="_")

  
  dfMatchMain <- NULL
  max.p <- 0
  while ((search.prefix != "")  & (dfIndex >= 1))
  {
    dfMatchTemp <- searchPatternInNG(search.prefix,dfNames, dfIndex, short.stop.words, maskStopWords)
    
    if (isMatchFound(dfMatchTemp))
    {
      if (is.null(dfMatchMain))
      {
        max.p <- dfIndex
        dfMatchMain <- dfMatchTemp
      }
      else
      {
        dfMatchMain <- full_join(dfMatchMain,dfMatchTemp,by="word")
      }
    }
    
    search.prefix <- gsub("^[a-z'\\+]+_?","",search.prefix)
    dfIndex <- dfIndex - 1
  }
  
  if (is.null(dfMatchMain)) return(NULL)
  match.cols <- colnames(dfMatchMain)
  for (i in max.dfIndex:2)
  {
    if (!(paste0("p", i) %in% match.cols))
    {
      mytext <- paste0("dfMatchMain$p", i, " <- rep(NA,nrow(dfMatchMain))")
      eval(parse(text=mytext))
    }
  }

  for (i in max.dfIndex:2)
  {
    mytext <- paste0("dfMatchMain$w", i, "<- rep(", i, ",nrow(dfMatchMain))")
    eval(parse(text=mytext))
  }
  # Interpolated score: weighted sum of the per-order probabilities, with
  # higher-order matches weighted more heavily (weight i for the i-gram),
  # normalized by the sum of the weights.
  dfMatchMain$p <- rep(0,nrow(dfMatchMain))
  for (i in max.dfIndex:2)
  {
    mytext <- paste0("dfMatchMain$p <- with(dfMatchMain,p + coalesce(p", i, ",0) * w", i , ")")
    eval(parse(text=mytext))
  }
  dfMatchMain$p <- dfMatchMain$p / (sum(max.dfIndex:2))
  dfMatchMain <- dfMatchMain[order(-dfMatchMain$p),]
  
  dfMatchMain <- head(dfMatchMain,5)
  rownames(dfMatchMain) <- seq_len(nrow(dfMatchMain))
  dfMatchMain$Rank <- seq_len(nrow(dfMatchMain))
  #print("Printing Interpolation Results")
  #print(dfMatchMain)
  colnames(dfMatchMain)[colnames(dfMatchMain) == "p"] <- "score"
  dfMatchMain <- dfMatchMain[c("word",paste0("p",max.dfIndex:2),paste0("w",max.dfIndex:2),"score","Rank")]
  return(dfMatchMain)
  
}
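
As a quick illustration of the output shape (a hypothetical input phrase; this assumes the *_Short.rds tables loaded above are data.tables with a feature column and a value count column, as the %like% lookups require):

# Hypothetical usage; the name vector matches dfNames as defined further below.
df.demo <- returnSimpleMatch("at the end of",
                             c("dfUG","dfBG","dfTG","dfQG","dfPG","dfHG"),
                             short.stop.words)
# df.demo holds up to 5 rows with columns word, p5..p2 (per-order
# probabilities), w5..w2 (interpolation weights), score and Rank.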

getdValue <- function(value, dfIndex, max.df.Index)
{
  # Absolute-discount constant: no discount at the highest order; below that,
  # 0.554 for singleton counts and 0.75 otherwise.
  if (dfIndex >= max.df.Index)
  {
    return(0)
  }
  dvalue <- ifelse(value==1,0.554,0.75)
  return(dvalue)
}

searchKN <- function(search.prefix,dfNames,dfCCNames, dfIndex, max.df.Index, rLevel, dfMatch, maskStopWords=TRUE)
{
  # Recursive Kneser-Ney estimate: the top-level call (rLevel == 0) discounts
  # raw n-gram counts; lower levels back off to continuation counts, and each
  # level is interpolated with a lambda weight.
  if (rLevel == 0)
  {
    search.string <- paste0("^",search.prefix,"_[a-z']+$")
    mytext <- paste0("dfMatch <- ", dfNames[dfIndex], "[feature %like% search.string]")
    eval(parse(text=mytext))
    if (nrow(dfMatch) == 0) return(NULL)
    dfMatch$word <- gsub(paste0("^",search.prefix,"_"),"",dfMatch$feature)
    if (maskStopWords)
    {
      dfMatch <- dfMatch[!(dfMatch$word %in% stopwords("en")),]
    }
    
    denom <- sum(dfMatch$value)
    
    dValue <- getdValue(dfMatch$value, dfIndex, max.df.Index)
    p <- ifelse(dfMatch$value - dValue>0,dfMatch$value - dValue,0) / denom
    lambda <- dValue * nrow(dfMatch) / denom
    dfMatch$lastword <- dfMatch$feature
    dfMatch$ngram <- rep(dfIndex,nrow(dfMatch))
    dfMatch$word <- gsub("^([a-z']+_)+","",dfMatch$feature)
    dfMatch <- dfMatch[,.(feature,lastword,word,ngram,value)]
    dfMatch$p <- p + lambda * searchKN(gsub("^[a-z']+_?","",search.prefix)
                                       , dfNames, dfCCNames, dfIndex-1,
                                       max.df.Index, rLevel+1, dfMatch,maskStopWords)$p
  }
  else
  {
    # Back-off level: shorten each candidate's context and re-score it by
    # its continuation count from the corresponding CC table.
    dfMatch$lastword <- gsub("^[a-z']+_","",dfMatch$lastword)
    mytext <- paste0("dfMatch <- merge(dfMatch,", dfCCNames[dfIndex+1], ",by='lastword',all.x=TRUE)")
    eval(parse(text=mytext))
    dfMatch$contcount <- coalesce(dfMatch$contcount,0)
    mytext <- paste0("denom <- nrow(", dfNames[dfIndex+1], ")")
    eval(parse(text=mytext))
    
    #Adjust the denominator for some smoothing; otherwise we would hardly get
    #any usable probabilities (this also guards against division by zero)
    denom <- sum(dfMatch$contcount)
    if (denom == 0) denom <- 1
    
    dValue <- getdValue(dfMatch$contcount, dfIndex, max.df.Index)
    p <- ifelse(dfMatch$contcount - dValue>0,dfMatch$contcount - dValue,0) / denom
    
    lambda <- dValue * nrow(dfMatch) / denom
    dfMatch <- dfMatch[,.(feature,lastword,word,ngram,value)]
    if (dfIndex == 1)
    {
      dfMatch$p <- p
    }
    else
    {
      dfMatch$p <- p + lambda * searchKN(gsub("^[a-z']+_?","",search.prefix)
                                         , dfNames, dfCCNames, dfIndex-1,
                                         max.df.Index, rLevel+1, dfMatch,maskStopWords)$p
    }
    
  }
  return(dfMatch)
}

getKNDF <- function(mytext,dfNames, dfCCNames, max.df.Index, maskStopWords=TRUE)
{
  nGramText <- unlist(strsplit(mytext," "))
  dfIndex <- length(nGramText) + 1
  if (dfIndex > max.df.Index)
  {
    #print(nGramText)
    nGramText <- nGramText[(length(nGramText)-(max.df.Index-2)):length(nGramText)]
    #print(nGramText)
  }
  dfIndex <- length(nGramText) + 1
  search.prefix <- paste(nGramText,collapse="_")
  original.search.prefix <- search.prefix
  
  dfMatchMain <- NULL
  while ((search.prefix != "") & (dfIndex >= 1))
  {
    dfMatchTemp <- searchKN(search.prefix,dfNames,dfCCNames, dfIndex, max.df.Index, 0, NULL, maskStopWords)
    if (isMatchFound(dfMatchTemp))
    {
      if (is.null(dfMatchMain))
      {
          dfMatchMain <- dfMatchTemp
      }
      else
      {
        dfMatchTemp <- dfMatchTemp[!(word %in% intersect(dfMatchTemp$word,dfMatchMain$word))]
        if (!is.null(dfMatchTemp))
        {
            dfMatchMain <- rbind(dfMatchMain,dfMatchTemp)
        }  
      }
    }  

    search.prefix <- gsub("^[a-z'\\+]+_?","",search.prefix)
    dfIndex <- dfIndex - 1
  }
  if (is.null(dfMatchMain)) return(NULL)
  if (nrow(dfMatchMain) == 0) return(NULL)
  dfMatchMain$word <- gsub("^([a-z']+_)+","",dfMatchMain$feature)
  dfMatchMain <- dfMatchMain[,.(word,ngram,p)]
  dfMatchMain <- dfMatchMain[order(-p)]
  dfMatchMain <- head(dfMatchMain,5)
  dfMatchMain$Rank <- seq_len(nrow(dfMatchMain))
  return(dfMatchMain)
}
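
And the analogous illustration for the Kneser-Ney lookup (again a hypothetical phrase; the two name vectors mirror dfNames and dfCCNames defined just below):

# Hypothetical usage of the Kneser-Ney predictor.
df.demo.kn <- getKNDF("at the end of",
                      c("dfUG","dfBG","dfTG","dfQG","dfPG","dfHG"),
                      c("","dfBG.CC","dfTG.CC","dfQG.CC","dfPG.CC","dfHG.CC"),
                      6)
# df.demo.kn holds up to 5 rows with columns word, ngram (the order that
# produced the match), p (smoothed probability) and Rank.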

dfNames <- c("dfUG","dfBG","dfTG","dfQG","dfPG","dfHG")
dfCCNames <- c("","dfBG.CC","dfTG.CC","dfQG.CC","dfPG.CC","dfHG.CC")
pgSample.short <- pgSample[sample(length(pgSample),1000)]
length.short <- length(pgSample.short)
df.CV <- data.frame(phrase = pgSample.short, 
                    ip_rank1 = rep(0,length.short),
                    ip_rank2 = rep(0,length.short),
                    ip_rank3 = rep(0,length.short),
                    ip_rank4 = rep(0,length.short),
                    ip_rank5 = rep(0,length.short),
                    kn_rank1 = rep(0,length.short),
                    kn_rank2 = rep(0,length.short),
                    kn_rank3 = rep(0,length.short),
                    kn_rank4 = rep(0,length.short),
                    kn_rank5 = rep(0,length.short),
                    stopword = rep(0,length.short)
                    )

Let us make predictions using the Interpolation Approach

oldTime <- as.numeric(Sys.time())*1000
for (mytext in pgSample.short)
{
  targetword <- gsub("^([a-z']+_)+","",mytext)
  inputtext <- gsub("_"," ",gsub("_[a-z']+$","",mytext))
  allowStopWords <- FALSE
  # If the target itself is a stop word, allow stop-word predictions and
  # flag the row so these cases can be analyzed separately.
  if (targetword %in% stopwords("en"))
  {
    allowStopWords <- TRUE
    df.CV[df.CV$phrase == mytext,"stopword"] <- 1
  }
  df.IP <- returnSimpleMatch(inputtext,dfNames,maskStopWords = !allowStopWords)
  if (isMatchFound(df.IP))
  {
    df.Match <- df.IP[df.IP$word == targetword,]
    ip.rank <- 0
    if (nrow(df.Match) > 0)
    {
      ip.rank <- df.Match$Rank
      df.CV[df.CV$phrase == mytext,paste0("ip_rank",ip.rank)] <- 1
    }
  }
}
newTime <- as.numeric(Sys.time())*1000

Let us make predictions using Kneser-Ney Smoothing

oldTime.KN <- as.numeric(Sys.time())*1000
for (mytext in pgSample.short)
{
  targetword <- gsub("^([a-z']+_)+","",mytext)
  inputtext <- gsub("_"," ",gsub("_[a-z']+$","",mytext))
  allowStopWords <- FALSE
  if (targetword %in% stopwords("en"))
  {
    allowStopWords <- TRUE
    df.CV[df.CV$phrase == mytext,"stopword"] <- 1
  }
  df.KN <- getKNDF(inputtext,dfNames,dfCCNames,6,maskStopWords = !allowStopWords)
  if (isMatchFound(df.KN))
  {
    df.Match <- df.KN[df.KN$word == targetword,]
    kn.rank <- 0
    if (nrow(df.Match) > 0)
    {
      kn.rank <- df.Match$Rank
      df.CV[df.CV$phrase == mytext,paste0("kn_rank",kn.rank)] <- 1
    }
  }
}
newTime.KN <- as.numeric(Sys.time())*1000
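
With both loops finished, the two metrics described at the top of this section can be derived from df.CV and the timers. A minimal sketch (the reporting code itself was not part of this section):

# SUCCESS = target word appeared at any of the 5 ranks.
ip.success <- rowSums(df.CV[,paste0("ip_rank",1:5)]) > 0
kn.success <- rowSums(df.CV[,paste0("kn_rank",1:5)]) > 0

# Accuracy Metric = Total SUCCESS / Total Rows.
ip.accuracy <- sum(ip.success) / nrow(df.CV)
kn.accuracy <- sum(kn.success) / nrow(df.CV)

# Response time in seconds per row (the timers above are in milliseconds).
ip.response <- (newTime - oldTime) / 1000 / length.short
kn.response <- (newTime.KN - oldTime.KN) / 1000 / length.short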