Description

Using three source files from Twitter, blogs, and news articles, build a text predictor that takes a phrase and returns a prediction for the next word.

Here is a link to the final app…
https://zburch.shinyapps.io/FinalProject

Here is a link to the server/ui code…
https://github.com/zburch/NLPFinalProject

Getting the data

Download files

#Main file for text mining
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip","corpus.zip")
zipPath <- paste(getwd(),"corpus.zip",sep="/")
unzip(zipPath)

#List of profane words
download.file("https://www.frontgatemedia.com/new/wp-content/uploads/2014/03/Terms-to-Block.csv","profanity.csv")

Read in portion of data, save to file

fileList <- c("en_us/en_US.twitter.txt",
              "en_us/en_US.blogs.txt",
              "en_us/en_US.news.txt")

df <- NULL
set.seed(123)
for (i in 1:length(fileList)){
  con <- file(paste("final",fileList[i],sep="/"), "r")
  text <- NULL
  while(TRUE){
    #Bring in approx. 15% of the data
    if(rbinom(1,1,prob=.15)){
      #Grab 5 lines at a time
      newSet <- readLines(con,5)
      if(length(newSet)==0){
        break
      }
      text <- c(text,newSet)
    } else {
      #Skip 5 lines
      readLines(con,5)
    }
  }
  close(con)
  #Tag each line with its source (twitter, blogs, or news), extracted from the filename
  fileName <- rep(strsplit(fileList[i],
                           split=".",
                           fixed=TRUE)[[1]][2],
                  length(text))
  newDF <- data.frame(fileName,text)
  df <- rbind(df,newDF)
}
df$fileName <- as.factor(df$fileName)
write.csv(df,"subset.csv")

Cleaning Data

Perform basic global cleaning

df <- read.csv("subset.csv",row.names=NULL)
df <- df[,2:3]
rownames(df) <- c()

#Lowercase
df$text <- tolower(df$text)
#Strip extra whitespace
df$text <- gsub("\\s+"," ",df$text)
#Trim whitespace
df$text <- trimws(df$text)
df$text <- gsub("@","at",df$text,fixed=TRUE)
df$text <- gsub("&","and",df$text,fixed=TRUE)
#remove punctuation
df$text <- gsub("[^a-z0-9# ]","",df$text)

#Split into training/test sets: hold out 2000 randomly chosen lines
split <- rep(0,length(df$text))
split[1:2000] <- 1
split <- sample(split)
training <- df[!split>0,]
test <- df[split>0,]
write.csv(test,"testing.csv",row.names=FALSE)

Build ngrams (uni-, bi-, tri-, and quad-)

library(dplyr)
library(tidytext)

#Get unigrams of the training data
unigrams <- training %>%
  unnest_tokens(unigram,text,token="ngrams",n=1)

#Get bigrams of the training data
bigrams <- training %>%
  unnest_tokens(bigram,text,token="ngrams",n=2)

#Get trigrams of the training data
trigrams <- training %>%
  unnest_tokens(trigram,text,token="ngrams",n=3)

#Get quadgrams of the training data
quadgrams <- training %>%
  unnest_tokens(quadgram,text,token="ngrams",n=4)

Clean ngrams

#Keep only ngrams made up of lowercase words separated by single spaces
unigrams <- unigrams[grepl("^[a-z]*$",unigrams$unigram),]
bigrams <- bigrams[grepl("^[a-z]* [a-z]*$",
                          bigrams$bigram),]
trigrams <- trigrams[grepl("^[a-z]* [a-z]* [a-z]*$",
                           trigrams$trigram),]
quadgrams <- quadgrams[grepl("^[a-z]* [a-z]* [a-z]* [a-z]*$",
                           quadgrams$quadgram),]

#Remove retweets
unigrams <- unigrams[!grepl("^rt$",unigrams$unigram),]
bigrams <- bigrams[!grepl("(^rt | rt$)",bigrams$bigram),]
trigrams <- trigrams[!grepl("(^rt | rt | rt$)",trigrams$trigram),]
quadgrams <- quadgrams[!grepl("(^rt | rt | rt$)",quadgrams$quadgram),]

#Bring in the profanity word list
badwords <- read.csv("profanity.csv")
#Get rid of extra header rows
badwords <- badwords[4:length(badwords[,1]),2]
#Strip whitespace
badwords <- gsub("\\s+"," ",badwords)
#Trim whitespace
badwords <- trimws(badwords)
#Lowercase
badwords <- tolower(badwords)
#Remove commas left over from the CSV entries
badwords <- gsub(",","",badwords,fixed=TRUE)
#Keep only entries that are a single lowercase word
badwords <- badwords[grepl("^[a-z]+$",badwords)]

#Remove profanity rows from ngrams
#Match any profane word as a whole word
badwordPattern <- paste0("\\<(",paste(badwords,collapse="|"),")\\>")
unigrams <- unigrams[!grepl(badwordPattern,unigrams$unigram),]
bigrams <- bigrams[!grepl(badwordPattern,bigrams$bigram),]
trigrams <- trigrams[!grepl(badwordPattern,trigrams$trigram),]
quadgrams <- quadgrams[!grepl(badwordPattern,quadgrams$quadgram),]
#Save files
write.csv(unigrams,"unigrams.csv",row.names=FALSE)
write.csv(bigrams,"bigrams.csv",row.names=FALSE)
write.csv(trigrams,"trigrams.csv",row.names=FALSE)
write.csv(quadgrams,"quadgrams.csv",row.names=FALSE)

Basic exploratory analysis

library(dplyr)
library(tidytext)

uniCt <- unigrams %>%
  count(unigram,sort=TRUE) %>%
  mutate(rsum = cumsum(n))
uniCt <- as.data.frame(uniCt)
head(uniCt)
class(uniCt)

#How many unique words needed to cover 50% of all words?

fiftyPct <- sum(uniCt$n)*.5
fiftyPctUnigrams <- uniCt[uniCt$rsum<=fiftyPct,]
length(fiftyPctUnigrams[,1])

#90%?
ninetyPct <- sum(uniCt$n)*.9
ninetyPctUnigrams <- uniCt[uniCt$rsum<=ninetyPct,]
length(ninetyPctUnigrams[,1])

Summarize ngrams

library(dplyr)
library(tidyr)
bigram <- read.csv("bigrams.csv")
trigram <- read.csv("trigrams.csv")
quadgram <- read.csv("quadgrams.csv")

#1) Remove any n-grams that only occur once, as they are not 
#     likely to generalize well and slow down the model.
#2) Order ngrams by most used
#3) Remove any ngrams that have the same (n-1)gram 
#     and are not in the top x-most used for that ngram.

#Number of candidate predictions kept per (n-1)-gram context
#(3, matching the three-word predictions scored in the Testing section)
numResults <- 3

bigram <- bigram %>% group_by(bigram) %>% 
          tally() %>% filter(n>1) %>%
          arrange(desc(n)) %>%
          separate(bigram, c('first','second'), sep=" ") %>%
          group_by(first) %>% mutate(count = row_number()) %>%
          filter(count<=numResults) %>% select(first,second,n)
trigram <- trigram %>% group_by(trigram) %>% 
           tally() %>% filter(n>1) %>%
           arrange(desc(n)) %>%
           separate(trigram, c('first','second','third'), sep=" ") %>%
           group_by(first,second) %>% mutate(count = row_number()) %>%
           filter(count<=numResults) %>% select(first,second,third,n)
quadgram <- quadgram %>% group_by(quadgram) %>% 
            tally() %>% filter(n>1) %>%
            arrange(desc(n)) %>%
            separate(quadgram, c('first','second','third','fourth'), sep=" ") %>%
            group_by(first,second,third) %>% mutate(count = row_number()) %>%
            filter(count<=numResults) %>% select(first,second,third,fourth,n)

write.csv(bigram,"biSummary.csv",row.names=FALSE)
write.csv(trigram,"triSummary.csv",row.names=FALSE)
write.csv(quadgram,"quadSummary.csv",row.names=FALSE)

Build Model

Read in data

bigrams <- read.csv("biSummary.csv",row.names=NULL)
trigrams <- read.csv("triSummary.csv",row.names=NULL)
quadgrams <- read.csv("quadSummary.csv",row.names=NULL)

Create default values for unseen n-grams

library(dplyr)
def <- bigrams[,1:2]
#Fallback predictions: the ten most common second words across all bigrams
def <- def %>% group_by(second) %>% 
  tally() %>% arrange(desc(n))
def <- def$second[1:10]
write.csv(def,"defaults.csv",row.names=FALSE)

Define the prediction function

def <- read.csv("defaults.csv",row.names=NULL)
model <- function(oldest=NULL,middle=NULL,newest){
  preds <- NULL

  head(def)
  if (is.null(oldest) & is.null(middle)){
    preds <- bigrams[bigrams$first==newest,2]
    if(length(preds)<numResults){
      preds <- unique(c(as.character(preds),as.character(def)))
    }
    preds <- preds[1:numResults]
  } else if(is.null(oldest)){
      preds <- trigrams[trigrams$first==middle & trigrams$second==newest,3]
      if (length(preds)<numResults){
        biPreds <- bigrams[bigrams$first==newest,2]
        preds <- unique(c(as.character(preds),as.character(biPreds)))
      }
      if (length(preds)<numResults){
        preds <- unique(c(as.character(preds),as.character(def)))
      }
      preds <- preds[1:numResults]
  } else {
      preds <- quadgrams[quadgrams$first==oldest & quadgrams$second==middle &
                  quadgrams$third==newest,4]
      if (length(preds)<numResults){
        triPreds <- trigrams[trigrams$first==middle & trigrams$second==newest,3]
        preds <- unique(c(as.character(preds),as.character(triPreds)))
      }
      if (length(preds)<numResults){
        biPreds <- bigrams[bigrams$first==newest,2]
        preds <- unique(c(as.character(preds),as.character(biPreds)))
      }
      if (length(preds)<numResults){
        preds <- unique(c(as.character(preds),as.character(def)))
      }
      preds <- preds[1:numResults]
  }
  return(preds)
}
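
For reference, here is a minimal sketch of how the function might be called on a raw phrase, assuming the n-gram summary tables above are loaded and the phrase contains at least three words (the deployed server code may handle input differently):

#Hypothetical example phrase; clean it the same way as the corpus
phrase <- tolower("Thanks for all of your")
phrase <- trimws(gsub("\\s+"," ",phrase))
phrase <- gsub("[^a-z0-9# ]","",phrase)
words <- strsplit(phrase," ")[[1]]
n <- length(words)
#Pass the last three words as oldest, middle, newest
model(words[n-2],words[n-1],words[n])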

Testing

Build Test quadgrams

library(dplyr)
library(tidytext)
test <- read.csv("testing.csv")

#Get quadgrams of the test data
quadgramsTest <- test %>%
  unnest_tokens(quadgram,text,token="ngrams",n=4)

Clean Test Quadgrams

#Keep only quadgrams made up of lowercase words separated by single spaces
quadgramsTest <- quadgramsTest[grepl("^[a-z]* [a-z]* [a-z]* [a-z]*$",
                           quadgramsTest$quadgram),]

#Remove retweets
quadgramsTest <- quadgramsTest[!grepl("(^rt | rt | rt$)",quadgramsTest$quadgram),]

#Bring in the profanity word list
badwords <- read.csv("profanity.csv")
#Get rid of extra header rows
badwords <- badwords[4:length(badwords[,1]),2]
#Strip whitespace
badwords <- gsub("\\s+"," ",badwords)
#Trim whitespace
badwords <- trimws(badwords)
#Lowercase
badwords <- tolower(badwords)
#Remove commas left over from the CSV entries
badwords <- gsub(",","",badwords,fixed=TRUE)
#Keep only entries that are a single lowercase word
badwords <- badwords[grepl("^[a-z]+$",badwords)]

#Remove profanity rows from ngrams
#Match any profane word as a whole word
badwordPattern <- paste0("\\<(",paste(badwords,collapse="|"),")\\>")
quadgramsTest <- quadgramsTest[!grepl(badwordPattern,quadgramsTest$quadgram),]
#Save files
write.csv(quadgramsTest,"quadgramsTest.csv",row.names=FALSE)

Summarize Test ngrams

library(dplyr)
library(tidyr)
quadgramTest <- read.csv("quadgramsTest.csv")
quadLinesTest <- length(quadgramTest$quadgram)

quadgramTest <- quadgramTest %>% group_by(quadgram) %>% 
                tally() %>% arrange(desc(n)) %>%
                separate(quadgram, c('first','second','third','fourth'), sep=" ")

write.csv(quadgramTest,"quadSummaryTest.csv",row.names=FALSE)

Run test through model

file <- read.csv("quadSummaryTest.csv")
test <- file[,1:3]
truth <- file[,4]
results <- NULL
#Predict the fourth word of each test quadgram from its first three words
for(i in 1:length(test$first)){
  results <- rbind(results,model(as.character(test$first[i]),
                             as.character(test$second[i]),
                             as.character(test$third[i])))
}
#Check whether the true fourth word appears among the predictions
validate <- NULL
for(i in 1:length(truth)){
  validate <- c(validate,truth[i] %in% results[i,])
}
correct <- sum(validate)
testCases <- length(validate)
pct <- round(sum(validate)/length(validate),4)*100

In testing quadgram performance, we find that out of 37,235 test quadgrams, the correct fourth word appeared among the model's three predicted words 8,111 times, for an accuracy of 21.78%.
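
As a quick check of that figure:

round(8111 / 37235, 4) * 100  #21.78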