This is the code I wrote to build my NextWord predictor Shiny app, which can be found here:
library(shiny)
# User interface of the next-word predictor: a title, a sidebar holding the
# text box, and a main panel showing the prediction and a plot.
shinyUI(
  fluidPage(
    titlePanel(title = "Guess your next word"),
    sidebarLayout(
      # Sidebar: free-text box, exposed to the server as input$sentence.
      sidebarPanel = sidebarPanel(
        textInput(inputId = "sentence", label = "Start typing here:")
      ),
      # Main panel: the text output "value" rendered as a heading, with the
      # plot output "plot" underneath.
      mainPanel = mainPanel(
        h3(textOutput(outputId = "value")),
        plotOutput(outputId = "plot")
      )
    )
  )
)
# Server side of the next-word predictor.
#
# Packages, the n-gram table and the lookup helper are set up once, before
# shinyServer() is called, rather than again for every browser session.
library(shiny)
library(NLP)
library(tm)
library(stringr)
library(wordcloud)

# Expected to provide the data frame `corpusData`; the code below reads its
# `start`, `last` and `weight` columns and treats column 2 as the predicted
# word.
load("./corpusData.Rda")

# Look up candidate next words for a partially typed sentence.
#
# The sentence is lower-cased and stripped of punctuation and digits, then
# its trailing words are matched against corpusData$start, longest context
# first (up to eight words, as in the original lookup).
#
# sentence: character string typed by the user.
# Returns:  the matching rows of corpusData, longest context first; a
#           zero-row data frame (start, last, weight) when nothing matches or
#           the sentence holds no words.
nextword <- function(sentence) {
  no_match <- data.frame(
    start = character(),
    last = character(),
    weight = numeric()
  )

  cleaned <- tolower(sentence)
  cleaned <- removePunctuation(cleaned)
  cleaned <- removeNumbers(cleaned)
  # Collapse whitespace only after digits are gone; removing a number leaves
  # a double space behind that would otherwise become an empty "word".
  cleaned <- stripWhitespace(cleaned)
  cleaned <- str_trim(cleaned)

  words <- unlist(strsplit(cleaned, " ", fixed = TRUE))
  words <- words[nzchar(words)]

  # Nothing typed yet: return the empty frame instead of looking up the
  # contexts "NA" and "" that out-of-range indexing would produce.
  if (length(words) == 0L) {
    return(no_match)
  }

  # Each context length is tried exactly once, so rows are never duplicated.
  longest_context <- min(length(words), 8L)
  matches <- lapply(seq(longest_context, 1L), function(k) {
    context <- paste(tail(words, k), collapse = " ")
    corpusData[which(corpusData$start == context), ]
  })

  # rbind() on data frames drops zero-row arguments; when every lookup is
  # empty it returns its first argument, i.e. `no_match`.
  do.call(rbind, c(list(no_match), matches))
}

shinyServer(function(input, output) {
  # repeatable() fixes the random seed used by wordcloud(), so the same
  # candidates always produce the same layout within a session.
  wordcloud_rep <- repeatable(wordcloud)

  # Shared by both outputs so the lookup runs once per change of the input.
  predictions <- reactive({
    nextword(input$sentence)
  })

  # Best candidate, or "the" as the fallback when there is no match.
  output$value <- renderText({
    best <- predictions()[1, 2]
    if (is.na(best)) {
      "the"
    } else {
      best
    }
  })

  # Word cloud of the candidates; nothing is drawn when there is no match.
  output$plot <- renderPlot({
    candidates <- predictions()
    if (!is.na(candidates[1, 2])) {
      wordcloud_rep(
        candidates$last,
        candidates$weight,
        scale = c(4, 0.5),
        max.words = 10,
        colors = brewer.pal(8, "Blues")
      )
    }
  })
})
# Switch to the Capstone project directory: every relative path used by the
# corpus-building steps below ("./clean corpus/...", the .Rda outputs) is
# resolved against it.
# NOTE(review): this is an absolute, machine-specific path -- adjust it (or
# start R from the project directory) before running elsewhere.
setwd("C:/Users/Steve/Data-Science-Toolbox/Capstone")
library(tm)
library(LaF)
library(RWeka)
library(NLP)
library(stringr)
library(stringi)
## Running n-gram frequency tables: one for the plain corpus and one for the
## "stopped" corpus. Each has a factor column `ngram` and a numeric column
## `frequency`.
bigList <- data.frame(ngram = factor(), frequency = numeric())
bigListStopped <- data.frame(ngram = factor(), frequency = numeric())

## Number of lines drawn from each source file in every sampling round.
sampleSize <- 5000

## Tokenizer producing every 2- to 8-word n-gram.
toke <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 8))

## Write a fresh random sample of each source file into the "Samples"
## sub-directory of `corpus_dir`, overwriting the previous round's sample.
##   corpus_dir: directory holding the source files "<stem>.txt"
##   file_stems: file names without the ".txt" extension
##   n_lines:    number of lines to sample from each file
write_samples <- function(corpus_dir, file_stems, n_lines) {
  for (stem in file_stems) {
    source_path <- file.path(corpus_dir, paste0(stem, ".txt"))
    sample_path <- file.path(corpus_dir, "Samples", paste0(stem, "Sample.txt"))
    writeLines(
      sample_lines(source_path, n_lines, determine_nlines(source_path)),
      sample_path
    )
  }
}

## Count the n-grams found in all files of `sample_dir`.
## `min_freq = 1` keeps every term. (The original comment spoke of dropping
## terms that occur fewer than twice, but the threshold actually used was 1;
## that behaviour is preserved here.)
## Returns a data frame with columns `ngram` (factor) and `frequency`.
count_ngrams <- function(sample_dir, min_freq = 1) {
  tdm <- TermDocumentMatrix(
    Corpus(DirSource(sample_dir)),
    control = list(tokenize = toke)
  )
  kept_terms <- findFreqTerms(tdm, lowfreq = min_freq)
  totals <- rowSums(as.matrix(tdm[kept_terms, ]))
  data.frame(ngram = as.factor(names(totals)), frequency = as.numeric(totals))
}

## Append `fresh` counts to the `running` table and sum the frequencies of
## n-grams that appear more than once, so the table holds one row per n-gram.
merge_counts <- function(running, fresh) {
  combined <- rbind(running, fresh)
  totals <- tapply(combined$frequency, combined$ngram, sum)
  data.frame(ngram = as.factor(names(totals)), frequency = as.numeric(totals))
}

## Five sampling rounds: resample both corpora, count their n-grams and fold
## the counts into the running tables.
for (i in seq_len(5)) {
  write_samples("./clean corpus", c("1", "2", "3"), sampleSize)
  write_samples("./clean corpus stopped", c("1s", "2s", "3s"), sampleSize)

  bigList <- merge_counts(
    bigList,
    count_ngrams("./clean corpus/Samples")
  )
  bigListStopped <- merge_counts(
    bigListStopped,
    count_ngrams("./clean corpus stopped/Samples")
  )
}
## Derive the lookup columns of one n-gram table and rank its rows.
##   n_of_gram: number of words in the n-gram (spaces + 1)
##   start:     every word except the last one
##   last:      the final word
##   weight:    log(frequency) + n_of_gram^4
## Rows are returned in decreasing order of weight.
finish_ngram_table <- function(tbl) {
  tbl$n_of_gram <- str_count(tbl$ngram, ' ') + 1
  tbl$start <- word(tbl$ngram, 1, tbl$n_of_gram - 1)
  tbl$last <- word(tbl$ngram, -1)
  tbl$weight <- log(tbl$frequency) + tbl$n_of_gram^4
  tbl[order(-tbl$weight), ]
}

## Same treatment for the plain and the stopped table.
bigList <- finish_ngram_table(bigList)
bigListStopped <- finish_ngram_table(bigListStopped)

## Persist both tables in the working directory.
save(bigList, file = "bigList.Rda")
save(bigListStopped, file = "bigListStopped.Rda")