Code for NLP

This is the code I wrote to build my NextWord predictor Shiny app, which can be found here:

https://eavwing.shinyapps.io/NextWord/

UI Shiny Code

library(shiny)

# Left-hand panel: the free-text box the user types a sentence into.
# Its value is exposed to the server as input$sentence.
sentence_panel <- sidebarPanel(
  textInput(inputId = "sentence", label = "Start typing here:")
)

# Right-hand panel: the predicted next word (output$value) rendered as a
# level-3 heading, followed by the plot of candidate words (output$plot).
prediction_panel <- mainPanel(
  h3(textOutput(outputId = "value")),
  plotOutput(outputId = "plot")
)

# Page definition: a title followed by the sidebar/main two-panel layout.
shinyUI(
  fluidPage(
    titlePanel(title = "Guess your next word"),
    sidebarLayout(
      sidebarPanel = sentence_panel,
      mainPanel = prediction_panel
    )
  )
)

Server Shiny Code

# Packages are attached once, when the app starts, rather than on every
# new session (code inside the server function runs once per session).
library(shiny)
library(NLP)
library(tm)
library(stringr)
library(wordcloud)

# Load the n-gram lookup table a single time at startup. The file is expected
# to define `corpusData`, which is used below through its `start`, `last` and
# `weight` columns.
load("./corpusData.Rda")

# Look up candidate next words for a partially typed sentence.
#
# The sentence is normalised (lower-cased, punctuation and digits removed,
# whitespace collapsed) and split into words. The trailing words are then
# matched against corpusData$start, trying the longest context first and
# backing off one word at a time down to a single word.
#
# Args:
#   sentence:    character string typed by the user.
#   max_context: largest number of trailing words to use as context. The
#                default of 8 is the longest context the previous
#                implementation ever looked up.
#
# Returns:
#   A data frame of matching corpusData rows, longest context first. When
#   nothing matches (or the sentence has no words) a zero-row data frame with
#   columns start, last and weight is returned.
nextword <- function(sentence, max_context = 8L) {
  no_match <- data.frame(
    start = character(),
    last = character(),
    weight = numeric()
  )

  cleaned <- tolower(sentence)
  cleaned <- removePunctuation(cleaned)
  # Digits are removed before whitespace is collapsed; doing it afterwards
  # leaves double spaces ("a 5 b" -> "a  b") and therefore empty tokens.
  cleaned <- removeNumbers(cleaned)
  cleaned <- stripWhitespace(cleaned)
  cleaned <- str_trim(cleaned)
  words <- unlist(strsplit(cleaned, " "))
  words <- words[nzchar(words)]

  # Nothing typed yet: return early instead of indexing with 1:0, which
  # would look up the literal strings "NA" and "".
  if (length(words) == 0L) {
    return(no_match)
  }

  # Context sizes from longest to shortest. Each size is tried exactly once,
  # so no match can be appended twice.
  context_sizes <- seq(min(length(words), max_context), 1L)
  matches <- lapply(context_sizes, function(k) {
    context <- paste(tail(words, k), collapse = " ")
    corpusData[which(corpusData$start == context), ]
  })

  # rbind drops zero-row arguments; if every lookup is empty the zero-row
  # template (first argument) is what comes back.
  do.call(rbind, c(list(no_match), matches))
}

shinyServer(function(input, output) {
  # Fixed-seed wrapper so the word cloud layout is stable between redraws.
  wordcloud_rep <- repeatable(wordcloud)

  # Shared lookup: both outputs read the same result, so the corpus is
  # searched once per keystroke instead of twice.
  candidates <- reactive({
    nextword(input$sentence)
  })

  # Best guess for the next word; falls back to "the" when nothing matches.
  output$value <- renderText({
    found <- candidates()
    # Column 2 is read positionally, as before.
    # NOTE(review): presumably this is the `last` column - confirm against
    # the column order of corpusData.
    best <- found[1, 2]
    if (!is.na(best)) {
      as.character(best)
    } else {
      "the"
    }
  })

  # Word cloud of up to ten candidate words, sized by weight. Nothing is
  # drawn when there are no candidates.
  output$plot <- renderPlot({
    found <- candidates()
    if (!is.na(found[1, 2])) {
      wordcloud_rep(
        found$last,
        found$weight,
        scale = c(4, 0.5),
        max.words = 10,
        colors = brewer.pal(8, "Blues")
      )
    }
  })
})

Full Code Chunk For Generating Corpus Data

## Root folder of the project. All inputs and outputs are addressed relative
## to it with file.path(), so the script no longer changes the session's
## working directory. Edit this path to run the script elsewhere.
projectDir <- "C:/Users/Steve/Data-Science-Toolbox/Capstone"

library(tm)
library(LaF)
library(RWeka)
library(NLP)
library(stringr)
library(stringi)

## Tunable settings
sampleSize <- 5000          # lines drawn from each source file per round
nRounds <- 5                # number of sample / count / merge rounds
## Minimum within-round count for an n-gram to be kept. 1 keeps every
## n-gram; raise it to 2 to drop n-grams seen only once in a round.
minTermFrequency <- 1

cleanDir <- file.path(projectDir, "clean corpus")
stoppedDir <- file.path(projectDir, "clean corpus stopped")

## Running totals of n-gram counts for the two corpora
bigList <- data.frame(ngram = factor(), frequency = numeric())
bigListStopped <- data.frame(ngram = factor(), frequency = numeric())

## Tokenizer producing every 2- to 8-word n-gram (defined once, outside
## the loop, because it does not change between rounds)
toke <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 8))

## Draw `n` random lines from <corpusDir>/<stem>.txt and write them straight
## to <corpusDir>/Samples/<stem>Sample.txt, replacing any previous sample.
writeSample <- function(corpusDir, stem, n = sampleSize) {
        src <- file.path(corpusDir, paste0(stem, ".txt"))
        dest <- file.path(corpusDir, "Samples", paste0(stem, "Sample.txt"))
        writeLines(sample_lines(src, n, determine_nlines(src)), dest)
        invisible(dest)
}

## Count the n-grams in the current samples of one corpus folder and return
## them as a data frame with columns `ngram` and `frequency`.
ngramFrequencies <- function(corpusDir) {
        ## VCorpus is used explicitly: on recent versions of tm, Corpus()
        ## with a DirSource yields a SimpleCorpus, for which custom
        ## tokenizers are ignored and only single words would be counted.
        corp <- VCorpus(DirSource(file.path(corpusDir, "Samples")))
        tdm <- TermDocumentMatrix(corp, control = list(tokenize = toke))
        kept <- findFreqTerms(tdm, lowfreq = minTermFrequency)
        counts <- rowSums(as.matrix(tdm[kept, ]))
        data.frame(ngram = as.factor(names(counts)),
                   frequency = as.numeric(counts))
}

## Append one round's counts to the running totals and re-aggregate so each
## n-gram appears once with its summed frequency.
mergeCounts <- function(totals, batch) {
        combined <- rbind(totals, batch)
        summed <- tapply(combined$frequency, combined$ngram, sum)
        data.frame(ngram = as.factor(names(summed)),
                   frequency = as.numeric(summed))
}

for (round in seq_len(nRounds)) {

        ## Make the samples for the unStopped files
        for (stem in c("1", "2", "3")) {
                writeSample(cleanDir, stem)
        }

        ## Make the samples for the Stopped files
        for (stem in c("1s", "2s", "3s")) {
                writeSample(stoppedDir, stem)
        }

        ## Count this round's n-grams and fold them into the totals
        bigList <- mergeCounts(bigList, ngramFrequencies(cleanDir))
        bigListStopped <- mergeCounts(bigListStopped,
                                      ngramFrequencies(stoppedDir))
}

## Derive the lookup columns for an n-gram frequency table and sort it.
##   n_of_gram: number of words in the n-gram
##   start:     every word except the last (the lookup context)
##   last:      the final word (the prediction)
##   weight:    log(frequency) + n_of_gram^4; the fourth-power term grows so
##              quickly that longer n-grams rank ahead of shorter ones, with
##              frequency ordering n-grams of the same length
## Rows are returned in decreasing order of weight.
addNgramFeatures <- function(counts) {
        ngram <- as.character(counts$ngram)
        counts$n_of_gram <- str_count(ngram, " ") + 1
        counts$start <- word(ngram, 1, counts$n_of_gram - 1)
        counts$last <- word(ngram, -1)
        counts$weight <- log(counts$frequency) + counts$n_of_gram^4
        counts[order(-counts$weight), ]
}

bigList <- addNgramFeatures(bigList)
bigListStopped <- addNgramFeatures(bigListStopped)

save(bigList, file = file.path(projectDir, "bigList.Rda"))
save(bigListStopped, file = file.path(projectDir, "bigListStopped.Rda"))