Model Validation

  1. Let us validate our model on 1,000 rows of randomly sampled data.
  2. We will validate it using both the Interpolation and the Kneser-Ney Smoothing methods.
  3. Validating a text-prediction model is tricky. Here is the approach:
     1. Our Shiny app makes 5 ranked predictions for the next word under each algorithm.
     2. If the target word is found in any of the 5 predictions, we count that row as a SUCCESS.
     3. Accuracy Metric = Total SUCCESS / Total Rows (see the sketch at the end of this section).
  4. Another metric to track is Response Time, measured in seconds (as a decimal) on a per-row basis.

Load Libraries

library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(qdapDictionaries)
library(RColorBrewer)
library(reshape2)
library(dplyr)
library(data.table)

Get Offensive Words Data
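
The token cleanup below removes profanity via fBufBadWords (fixed matches) and fBufBadWords.2 (regex matches), and drops purely non-alphabetic tokens via special.chars. The loading code was not included in this section, so here is a minimal sketch under stated assumptions: "badwords.txt" is a hypothetical file name, and the regex list and special-character pattern are plausible stand-ins.

# Minimal sketch -- "badwords.txt" is a hypothetical stand-in for whatever
# one-word-per-line offensive-words list was actually used.
conBad <- file("badwords.txt", "rb")
fBufBadWords <- readLines(conBad, skipNul = TRUE)
close(conBad)
# Regex variants to also catch suffixed forms of the listed words (assumption).
fBufBadWords.2 <- paste0("^", fBufBadWords, "[a-z]*$")
# Tokens consisting entirely of non-letter characters (assumption).
special.chars <- "^[^a-z]+$"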

Clean Up Token Function

cleanUpToken <- function(myToken)
{
  # fBufBadWords, fBufBadWords.2 and special.chars come from the
  # "Get Offensive Words Data" step above; remove.list is built further below
  # (before this function is first called).
  myToken <- myToken %>%
    tokens_select(pattern = fBufBadWords, selection = "remove", valuetype = "fixed", padding = TRUE) %>%
    tokens_select(pattern = fBufBadWords.2, selection = "remove", valuetype = "regex", padding = TRUE) %>%
    tokens_select(remove.list, selection = "remove", valuetype = "fixed", padding = TRUE) %>%
    tokens_select(special.chars, selection = "remove", valuetype = "regex", padding = TRUE)

  # Expand common chat abbreviations and slang.
  myToken <- myToken %>% tokens_replace("rt","right", valuetype = "fixed") %>%
  tokens_replace("lol","laugh", valuetype = "fixed") %>%
  tokens_replace("im","i'm", valuetype = "fixed") %>%
  tokens_replace("ur","your", valuetype = "fixed") %>%
  tokens_replace("wanna","want", valuetype = "fixed") %>%
  tokens_replace("omg","oh", valuetype = "fixed") %>%
  tokens_replace("bro","friend", valuetype = "fixed") %>%
  tokens_replace("yo","nice", valuetype = "fixed") %>%
  tokens_replace("thx","thanks", valuetype = "fixed") %>%
  tokens_replace("ppl","people", valuetype = "fixed") %>%
  tokens_replace("haha","funny", valuetype = "fixed") %>%
  tokens_replace("^(ha)+$","funny", valuetype = "regex") %>%
  tokens_replace("lmao","funny", valuetype = "fixed") %>%
  tokens_replace("lmfao","funny", valuetype = "fixed") %>%
  tokens_replace("smh","shocked", valuetype = "fixed") %>%
  tokens_replace("dm","contact", valuetype = "fixed") %>%
  tokens_replace("cuz","because", valuetype = "fixed") %>%
  tokens_replace("aint","isn't", valuetype = "fixed") %>%
  tokens_replace("idk","unsure", valuetype = "fixed") %>%
  tokens_replace("nite","night", valuetype = "fixed") %>%
  tokens_replace("+","and",valuetype = "fixed") %>%
  tokens_replace("y'all","everyone",valuetype = "fixed") %>%
  tokens_replace("yr","year",valuetype = "fixed") %>%
  tokens_replace("gettin","getting",valuetype = "fixed") %>%
  tokens_replace("gotta","should",valuetype = "fixed") 

  return(myToken)
}

Generate Corpus Function

set.seed(12345)
generateCorpus <- function(fileName,doctag)
{
  con <- file(fileName, "rb")
  fBuf <- readLines(con,skipNul = TRUE)
  close(con)
  fBuf.length <- length(fBuf)
  fText <- corpus(fBuf,docnames = paste0(doctag,"_",seq_len(fBuf.length)))
  fText <- fText %>% corpus_reshape(to="sentences")
  rm(fBuf)
  return(fText)
}

Generate and Merge All Corpora

myBlogCorpus <- generateCorpus("./final/en_US/en_US.blogs.txt","blog")
myNewCorpus <- generateCorpus("./final/en_US/en_US.news.txt","news")
myTwitterCorpus <- generateCorpus("./final/en_US/en_US.twitter.txt","twitter")
myMergedCorpus <- c(myBlogCorpus,myNewCorpus,myTwitterCorpus)

Sample 1% of Corpus

#Consciously setting a different seed in the hope of getting unseen data
set.seed(555)
merged.corpus.length <- length(myMergedCorpus)
mySampleCorpus <- myMergedCorpus[sample(merged.corpus.length,merged.corpus.length*0.01)]

Generate Tokens

myToken <- tokens(mySampleCorpus,remove_punct = TRUE)
my.addon.words <- NULL
is.word  <- function(x) x %in% c(GradyAugmented,my.addon.words)

remove.list <- NULL
remove.list <- c(remove.list,0:9)                                  # bare digits
remove.list <- c(remove.list,LETTERS[!(LETTERS %in% c('A','I'))])  # single uppercase letters except A and I
remove.list <- c(remove.list,letters[!(letters %in% c('a','i'))])  # single lowercase letters except a and i
remove.list <- c(remove.list, TRUE, FALSE)                         # coerced to "TRUE"/"FALSE" tokens

Clean Up Token

myToken <- myToken %>% cleanUpToken()
token.length <- length(myToken)
token.length
## [1] 80707

Generate N-Grams and DFMs

Although we have generated 6-grams, 5-grams, 4-grams and 3-grams here, we conduct the validation exercise on 5-grams: the first 4 words are the predictors and the 5th word is the target word.
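
To make this concrete, here is how one sampled 5-gram (taken from the head of pgNames below) splits into predictor words and target word, using the same gsub patterns as the validation loops further down:

mytext <- "do_we_just_embrace_the"
targetword <- gsub("^([a-z']+_)+","",mytext)
targetword
## [1] "the"
inputtext <- gsub("_"," ",gsub("_[a-z']+$","",mytext))
inputtext
## [1] "do we just embrace"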

hGramToken <- tokens_ngrams(myToken, n = 6)
hGramToken <- hGramToken %>% tokens_remove("_{2,}")

pGramToken <- tokens_ngrams(myToken, n = 5)
pGramToken <- pGramToken %>% tokens_remove("_{2,}")

qGramToken <- tokens_ngrams(myToken, n = 4)
qGramToken <- qGramToken %>% tokens_remove("_{2,}")

tGramToken <- tokens_ngrams(myToken, n = 3)
tGramToken <- tGramToken %>% tokens_remove("_{2,}")
dfmHG <- dfm(hGramToken)
dfmPG <- dfm(pGramToken)
dfmQG <- dfm(qGramToken)
dfmTG <- dfm(tGramToken)
hgNames <- colnames(dfmHG)
pgNames <- colnames(dfmPG)
qgNames <- colnames(dfmQG)
tgNames <- colnames(dfmTG)

hgNames.length <- length(hgNames)
pgNames.length <- length(pgNames)
qgNames.length <- length(qgNames)
tgNames.length <- length(tgNames)
head(hgNames)
## [1] "or_do_we_just_embrace_the"            
## [2] "do_we_just_embrace_the_role"          
## [3] "we_just_embrace_the_role_of"          
## [4] "just_embrace_the_role_of_aesthetic"   
## [5] "embrace_the_role_of_aesthetic_editing"
## [6] "the_role_of_aesthetic_editing_in"
head(pgNames)
## [1] "or_do_we_just_embrace"         "do_we_just_embrace_the"       
## [3] "we_just_embrace_the_role"      "just_embrace_the_role_of"     
## [5] "embrace_the_role_of_aesthetic" "the_role_of_aesthetic_editing"
head(qgNames)
## [1] "or_do_we_just"          "do_we_just_embrace"     "we_just_embrace_the"   
## [4] "just_embrace_the_role"  "embrace_the_role_of"    "the_object_gets_dipped"
head(tgNames)
## [1] "or_do_we"         "do_we_just"       "we_just_embrace"  "just_embrace_the"
## [5] "embrace_the_role" "the_object_gets"
pgNames.length
## [1] 598015
qgNames.length
## [1] 657067
tgNames.length
## [1] 635524
set.seed(666)
hgSample <- hgNames[sample(hgNames.length,hgNames.length*0.01)]
pgSample <- pgNames[sample(pgNames.length,pgNames.length*0.01)]
qgSample <- qgNames[sample(qgNames.length,qgNames.length*0.01)]
tgSample <- tgNames[sample(tgNames.length,tgNames.length*0.01)]
pgSample.length <- length(pgSample)
qgSample.length <- length(qgSample)
tgSample.length <- length(tgSample)
head(hgSample)
## [1] "of_stress_as_we_started_out"           
## [2] "might_draw_an_offer_because_both"      
## [3] "vision_for_the_promotion_and_marketing"
## [4] "find_a_way_to_capitalize_hitting"      
## [5] "sketchbooks_which_i_always_alter_along"
## [6] "the_white_norfork_and_other_tailwaters"
head(pgSample)
## [1] "in_funding_the_trust_so"            "legislation_recently_passed_in_the"
## [3] "stabilizes_otherwise_i_do_not"      "have_to_endure_days_twice"         
## [5] "day_and_he_cares_about"             "eighth_grade_i_turned_out"
head(qgSample)
## [1] "of_things_by_placing" "rolls_around_it_can"  "follow_me_please_i"  
## [4] "practice_and_play_in" "waits_on_them_with"   "was_not_me_who"
head(tgSample)
## [1] "can't_even_imagine" "you're_out_there"   "her_open_mouth"    
## [4] "since_the_age"      "store_played_him"   "wish_i_can"
pgSample.length
## [1] 5980
qgSample.length
## [1] 6570
tgSample.length
## [1] 6355

Let us get down to prediction now. First we load the precomputed n-gram frequency tables (unigram through 6-gram, the *_Short.rds files) and the continuation-count tables (*_CC.rds) used by Kneser-Ney smoothing, along with an extended stop-word list.

rdylgn <- brewer.pal(name="RdYlGn",n=11)
greys <- brewer.pal(name="Greys",n=9)
dfHG <- readRDS("./DSCapstone_TextPredictor/HG_Short.rds")
dfPG <- readRDS("./DSCapstone_TextPredictor/PG_Short.rds")
dfQG <- readRDS("./DSCapstone_TextPredictor/QG_Short.rds")
dfTG <- readRDS("./DSCapstone_TextPredictor/TG_Short.rds")
dfBG <- readRDS("./DSCapstone_TextPredictor/BG_Short.rds")
dfUG <- readRDS("./DSCapstone_TextPredictor/UG_Short.rds")

dfHG.CC <- readRDS("./DSCapstone_TextPredictor/HG_CC.rds")
dfPG.CC <- readRDS("./DSCapstone_TextPredictor/PG_CC.rds")
dfQG.CC <- readRDS("./DSCapstone_TextPredictor/QG_CC.rds")
dfTG.CC <- readRDS("./DSCapstone_TextPredictor/TG_CC.rds")
dfBG.CC <- readRDS("./DSCapstone_TextPredictor/BG_CC.rds")

conStop <- file("morestopwords.txt", "rb" )
fBufStop <- readLines(conStop,skipNul = TRUE)
close(conStop)
fBufStop <- union(fBufStop,stopwords("en"))

short.stop.words <- unlist(strsplit("my our your his her its their the a an and",split=" "))

substrRight <- function(x, n){
  substr(x, nchar(x)-n+1, nchar(x))
}

isMatchFound <- function(df)
{
  if (!is.null(df))
  {
    if (nrow(df) > 0)
    {
        return(TRUE)
    }
  }
  return(FALSE)
}
searchPatternInNG <- function(search.prefix, dfNames, dfIndex, short.stop.words, maskStopWords = TRUE)
{
  # Expand any "\w+" stop-word placeholders into an alternation over
  # short.stop.words, then strip the stray "+" left over from "\w+".
  search.prefix <- gsub("\\\\w+",paste0("(",paste(short.stop.words,collapse="|"),")"),search.prefix)
  search.prefix <- gsub("[+]","",search.prefix)

  # Match n-grams that start with the prefix and have exactly one more word.
  search.string <- paste0("^",search.prefix,"_[a-z']+$")
  mytext <- paste0("dfMatch <- as.data.frame(", dfNames[dfIndex], "[feature %like% search.string])")
  eval(parse(text=mytext))
  
  # Earlier revisions looked up the denominator in the (n-1)-gram table here;
  # it is now computed over the matched (and stop-word-filtered) rows below.
  dfMatch$word <- gsub(paste0("^",search.prefix,"_"),"",dfMatch$feature)
  if (maskStopWords)
  {
    dfMatch <- dfMatch[!(dfMatch$word %in% stopwords("en")),]
  }
  denom <- sum(dfMatch$value)
  
  dfMatch$probs <- dfMatch$value / denom
  dfMatch <- dfMatch[c("word","probs")]
  dfMatch <- dfMatch[order(-dfMatch$probs),]
  colnames(dfMatch)[2] <- paste0("p",dfIndex)
  dfMatch <- head(dfMatch,10)
  return(dfMatch)
}
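
To see what pattern this builds, here is a hypothetical worked example: a prefix with "the" masked as a stop-word placeholder (as returnSimpleMatch below does) expands into an alternation and is anchored to match exactly one following word.

search.prefix <- "end_of_\\w+_season"
search.prefix <- gsub("\\\\w+",paste0("(",paste(short.stop.words,collapse="|"),")"),search.prefix)
search.prefix <- gsub("[+]","",search.prefix)
paste0("^",search.prefix,"_[a-z']+$")
## [1] "^end_of_(my|our|your|his|her|its|their|the|a|an|and)_season_[a-z']+$"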

returnSimpleMatch <- function(mytext, dfNames, short.stop.words = "", maskStopWords = TRUE)
{
  nGramText <- unlist(strsplit(mytext," "))
  if (maskStopWords)
  {
    nGramText[nGramText %in% short.stop.words] <- "\\w+"
  }
  dfIndex <- length(nGramText) + 1

  if (dfIndex > 6)
  {
    #print(nGramText)
    nGramText <- nGramText[(length(nGramText)-4):length(nGramText)]
    #print(nGramText)
  }
  dfIndex <- length(nGramText) + 1
  
  max.dfIndex <- dfIndex
  search.prefix <- paste(nGramText,collapse="_")

  
  dfMatchMain <- NULL
  max.p <- 0
  while ((search.prefix != "")  & (dfIndex >= 1))
  {
    dfMatchTemp <- searchPatternInNG(search.prefix,dfNames, dfIndex, short.stop.words, maskStopWords)
    
    if (isMatchFound(dfMatchTemp))
    {
      if (is.null(dfMatchMain))
      {
        max.p <- dfIndex
        dfMatchMain <- dfMatchTemp
      }
      else
      {
        dfMatchMain <- full_join(dfMatchMain,dfMatchTemp,by="word")
      }
    }
    
    search.prefix <- gsub("^[a-z'\\+]+_?","",search.prefix)
    dfIndex <- dfIndex - 1
  }
  
  if (is.null(dfMatchMain)) return(NULL)
  match.cols <- colnames(dfMatchMain)
  for (i in max.dfIndex:2)
  {
    if (!(paste0("p", i) %in% match.cols))
    {
      mytext <- paste0("dfMatchMain$p", i, " <- rep(NA,nrow(dfMatchMain))")
      eval(parse(text=mytext))
    }
  }

  for (i in max.dfIndex:2)
  {
    mytext <- paste0("dfMatchMain$w", i, "<- rep(", i, ",nrow(dfMatchMain))")
    eval(parse(text=mytext))
  }
  # Interpolated score: weighted sum of the per-order probabilities, with
  # higher-order matches weighted more heavily (weight i for the i-gram),
  # normalized by the sum of the weights.
  dfMatchMain$p <- rep(0,nrow(dfMatchMain))
  for (i in max.dfIndex:2)
  {
    mytext <- paste0("dfMatchMain$p <- with(dfMatchMain,p + coalesce(p", i, ",0) * w", i , ")")
    eval(parse(text=mytext))
  }
  dfMatchMain$p <- dfMatchMain$p / (sum(max.dfIndex:2))
  dfMatchMain <- dfMatchMain[order(-dfMatchMain$p),]
  
  dfMatchMain <- head(dfMatchMain,5)
  rownames(dfMatchMain) <- seq_len(nrow(dfMatchMain))
  dfMatchMain$Rank <- seq_len(nrow(dfMatchMain))
  #print("Printing Interpolation Results")
  #print(dfMatchMain)
  colnames(dfMatchMain)[colnames(dfMatchMain) == "p"] <- "score"
  dfMatchMain <- dfMatchMain[c("word",paste0("p",max.dfIndex:2),paste0("w",max.dfIndex:2),"score","Rank")]
  return(dfMatchMain)
  
}
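
As a quick illustration of the output shape (a hypothetical input phrase; this assumes the *_Short.rds tables loaded above are data.tables with a feature column and a value count column, as the %like% lookups require):

# Hypothetical usage; the name vector matches dfNames as defined further below.
df.demo <- returnSimpleMatch("at the end of",
                             c("dfUG","dfBG","dfTG","dfQG","dfPG","dfHG"),
                             short.stop.words)
# df.demo holds up to 5 rows with columns word, p5..p2 (per-order
# probabilities), w5..w2 (interpolation weights), score and Rank.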

getdValue <- function(value, dfIndex, max.df.Index)
{
  # Absolute-discount constant: no discount at the highest order; below that,
  # 0.554 for singleton counts and 0.75 otherwise.
  if (dfIndex >= max.df.Index)
  {
    return(0)
  }
  dvalue <- ifelse(value==1,0.554,0.75)
  return(dvalue)
}

searchKN <- function(search.prefix,dfNames,dfCCNames, dfIndex, max.df.Index, rLevel, dfMatch, maskStopWords=TRUE)
{
  # Recursive Kneser-Ney estimate: the top-level call (rLevel == 0) discounts
  # raw n-gram counts; lower levels back off to continuation counts, and each
  # level is interpolated with a lambda weight.
  if (rLevel == 0)
  {
    search.string <- paste0("^",search.prefix,"_[a-z']+$")
    mytext <- paste0("dfMatch <- ", dfNames[dfIndex], "[feature %like% search.string]")
    eval(parse(text=mytext))
    if (nrow(dfMatch) == 0) return(NULL)
    dfMatch$word <- gsub(paste0("^",search.prefix,"_"),"",dfMatch$feature)
    if (maskStopWords)
    {
      dfMatch <- dfMatch[!(dfMatch$word %in% stopwords("en")),]
    }
    
    denom <- sum(dfMatch$value)
    
    dValue <- getdValue(dfMatch$value, dfIndex, max.df.Index)
    p <- ifelse(dfMatch$value - dValue>0,dfMatch$value - dValue,0) / denom
    lambda <- dValue * nrow(dfMatch) / denom
    dfMatch$lastword <- dfMatch$feature
    dfMatch$ngram <- rep(dfIndex,nrow(dfMatch))
    dfMatch$word <- gsub("^([a-z']+_)+","",dfMatch$feature)
    dfMatch <- dfMatch[,.(feature,lastword,word,ngram,value)]
    dfMatch$p <- p + lambda * searchKN(gsub("^[a-z']+_?","",search.prefix)
                                       , dfNames, dfCCNames, dfIndex-1,
                                       max.df.Index, rLevel+1, dfMatch,maskStopWords)$p
  }
  else
  {
    # Back-off level: shorten each candidate's context and re-score it by
    # its continuation count from the corresponding CC table.
    dfMatch$lastword <- gsub("^[a-z']+_","",dfMatch$lastword)
    mytext <- paste0("dfMatch <- merge(dfMatch,", dfCCNames[dfIndex+1], ",by='lastword',all.x=TRUE)")
    eval(parse(text=mytext))
    dfMatch$contcount <- coalesce(dfMatch$contcount,0)
    mytext <- paste0("denom <- nrow(", dfNames[dfIndex+1], ")")
    eval(parse(text=mytext))
    
    #Adjust the denominator for some smoothing; otherwise we would hardly get
    #any usable probabilities (this also guards against division by zero)
    denom <- sum(dfMatch$contcount)
    if (denom == 0) denom <- 1
    
    dValue <- getdValue(dfMatch$contcount, dfIndex, max.df.Index)
    p <- ifelse(dfMatch$contcount - dValue>0,dfMatch$contcount - dValue,0) / denom
    
    lambda <- dValue * nrow(dfMatch) / denom
    dfMatch <- dfMatch[,.(feature,lastword,word,ngram,value)]
    if (dfIndex == 1)
    {
      dfMatch$p <- p
    }
    else
    {
      dfMatch$p <- p + lambda * searchKN(gsub("^[a-z']+_?","",search.prefix)
                                         , dfNames, dfCCNames, dfIndex-1,
                                         max.df.Index, rLevel+1, dfMatch,maskStopWords)$p
    }
    
  }
  return(dfMatch)
}

getKNDF <- function(mytext,dfNames, dfCCNames, max.df.Index, maskStopWords=TRUE)
{
  nGramText <- unlist(strsplit(mytext," "))
  dfIndex <- length(nGramText) + 1
  if (dfIndex > max.df.Index)
  {
    #print(nGramText)
    nGramText <- nGramText[(length(nGramText)-(max.df.Index-2)):length(nGramText)]
    #print(nGramText)
  }
  dfIndex <- length(nGramText) + 1
  search.prefix <- paste(nGramText,collapse="_")
  original.search.prefix <- search.prefix
  
  dfMatchMain <- NULL
  while ((search.prefix != "") & (dfIndex >= 1))
  {
    dfMatchTemp <- searchKN(search.prefix,dfNames,dfCCNames, dfIndex, max.df.Index, 0, NULL, maskStopWords)
    if (isMatchFound(dfMatchTemp))
    {
      if (is.null(dfMatchMain))
      {
          dfMatchMain <- dfMatchTemp
      }
      else
      {
        dfMatchTemp <- dfMatchTemp[!(word %in% intersect(dfMatchTemp$word,dfMatchMain$word))]
        if (!is.null(dfMatchTemp))
        {
            dfMatchMain <- rbind(dfMatchMain,dfMatchTemp)
        }  
      }
    }  

    search.prefix <- gsub("^[a-z'\\+]+_?","",search.prefix)
    dfIndex <- dfIndex - 1
  }
  if (is.null(dfMatchMain)) return(NULL)
  if (nrow(dfMatchMain) == 0) return(NULL)
  dfMatchMain$word <- gsub("^([a-z']+_)+","",dfMatchMain$feature)
  dfMatchMain <- dfMatchMain[,.(word,ngram,p)]
  dfMatchMain <- dfMatchMain[order(-p)]
  dfMatchMain <- head(dfMatchMain,5)
  dfMatchMain$Rank <- seq_len(nrow(dfMatchMain))
  return(dfMatchMain)
}
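
And the analogous illustration for the Kneser-Ney lookup (again a hypothetical phrase; the two name vectors mirror dfNames and dfCCNames defined just below):

# Hypothetical usage of the Kneser-Ney predictor.
df.demo.kn <- getKNDF("at the end of",
                      c("dfUG","dfBG","dfTG","dfQG","dfPG","dfHG"),
                      c("","dfBG.CC","dfTG.CC","dfQG.CC","dfPG.CC","dfHG.CC"),
                      6)
# df.demo.kn holds up to 5 rows with columns word, ngram (the order that
# produced the match), p (smoothed probability) and Rank.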

dfNames <- c("dfUG","dfBG","dfTG","dfQG","dfPG","dfHG")
dfCCNames <- c("","dfBG.CC","dfTG.CC","dfQG.CC","dfPG.CC","dfHG.CC")
pgSample.short <- pgSample[sample(length(pgSample),1000)]
length.short <- length(pgSample.short)
df.CV <- data.frame(phrase = pgSample.short, 
                    ip_rank1 = rep(0,length.short),
                    ip_rank2 = rep(0,length.short),
                    ip_rank3 = rep(0,length.short),
                    ip_rank4 = rep(0,length.short),
                    ip_rank5 = rep(0,length.short),
                    kn_rank1 = rep(0,length.short),
                    kn_rank2 = rep(0,length.short),
                    kn_rank3 = rep(0,length.short),
                    kn_rank4 = rep(0,length.short),
                    kn_rank5 = rep(0,length.short),
                    stopword = rep(0,length.short)
                    )

Let us make predictions using the Interpolation Approach

oldTime <- as.numeric(Sys.time())*1000
for (mytext in pgSample.short)
{
  targetword <- gsub("^([a-z']+_)+","",mytext)
  inputtext <- gsub("_"," ",gsub("_[a-z']+$","",mytext))
  allowStopWords <- FALSE
  # If the target itself is a stop word, allow stop-word predictions and
  # flag the row so these cases can be analyzed separately.
  if (targetword %in% stopwords("en"))
  {
    allowStopWords <- TRUE
    df.CV[df.CV$phrase == mytext,"stopword"] <- 1
  }
  df.IP <- returnSimpleMatch(inputtext,dfNames,maskStopWords = !allowStopWords)
  if (isMatchFound(df.IP))
  {
    df.Match <- df.IP[df.IP$word == targetword,]
    ip.rank <- 0
    if (nrow(df.Match) > 0)
    {
      ip.rank <- df.Match$Rank
      df.CV[df.CV$phrase == mytext,paste0("ip_rank",ip.rank)] <- 1
    }
  }
}
newTime <- as.numeric(Sys.time())*1000

Let us make predictions using Kneser-Ney Smoothing

oldTime.KN <- as.numeric(Sys.time())*1000
for (mytext in pgSample.short)
{
  targetword <- gsub("^([a-z']+_)+","",mytext)
  inputtext <- gsub("_"," ",gsub("_[a-z']+$","",mytext))
  allowStopWords <- FALSE
  if (targetword %in% stopwords("en"))
  {
    allowStopWords <- TRUE
    df.CV[df.CV$phrase == mytext,"stopword"] <- 1
  }
  df.KN <- getKNDF(inputtext,dfNames,dfCCNames,6,maskStopWords = !allowStopWords)
  if (isMatchFound(df.KN))
  {
    df.Match <- df.KN[df.KN$word == targetword,]
    kn.rank <- 0
    if (nrow(df.Match) > 0)
    {
      kn.rank <- df.Match$Rank
      df.CV[df.CV$phrase == mytext,paste0("kn_rank",kn.rank)] <- 1
    }
  }
}
newTime.KN <- as.numeric(Sys.time())*1000
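
With both loops finished, the two metrics described at the top of this section can be derived from df.CV and the timers. A minimal sketch (the reporting code itself was not part of this section):

# SUCCESS = target word appeared at any of the 5 ranks.
ip.success <- rowSums(df.CV[,paste0("ip_rank",1:5)]) > 0
kn.success <- rowSums(df.CV[,paste0("kn_rank",1:5)]) > 0

# Accuracy Metric = Total SUCCESS / Total Rows.
ip.accuracy <- sum(ip.success) / nrow(df.CV)
kn.accuracy <- sum(kn.success) / nrow(df.CV)

# Response time in seconds per row (the timers above are in milliseconds).
ip.response <- (newTime - oldTime) / 1000 / length.short
kn.response <- (newTime.KN - oldTime.KN) / 1000 / length.short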