# Download and unzip the SwiftKey corpus
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip")
# Read the three English-language files; some of them contain embedded nul
# characters, so we pass skipNul = TRUE to avoid readLines() warnings
RawTwitterData <- readLines("final/en_US/en_US.twitter.txt", skipNul = TRUE)
RawBlogData <- readLines("final/en_US/en_US.blogs.txt", skipNul = TRUE)
RawNewsData <- readLines("final/en_US/en_US.news.txt", skipNul = TRUE)
length(RawTwitterData)
## [1] 2360148
length(RawBlogData)
## [1] 899288
length(RawNewsData)
## [1] 1010242
# Split proportions: 80% training, 10% dev-testing, 10% final testing
training <- 0.8
devtesting <- 0.1
testing <- 0.1
set.seed(1234) # fix the RNG seed so that the sampling below is reproducible
## TwitterData
trainingSampleSize <- floor(length(RawTwitterData) * training)
devtestingSampleSize <- floor(length(RawTwitterData) * devtesting)
testingSampleSize <- floor(length(RawTwitterData) * testing)
trainingIds <- sample(seq_along(RawTwitterData), trainingSampleSize) # for training
trainingTwitter <- RawTwitterData[trainingIds]
testingTwitter <- RawTwitterData[-trainingIds] # the held-out 20%
# further split this testing data into dev-testing and final testing data;
# devtestingSampleSize is 10% of the full corpus, i.e. half of the held-out lines
devtestingIds <- sample(seq_along(testingTwitter), devtestingSampleSize) # for dev-testing
devtestingTwitter <- testingTwitter[devtestingIds]
finaltestingTwitter <- testingTwitter[-devtestingIds]
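The same sampling scheme can be applied to the blog and news data. As a minimal sketch, the split could be wrapped in a reusable function (splitCorpus() is a hypothetical helper, not part of the code above):
# Hypothetical helper: split a character vector of lines into training,
# dev-testing, and final testing sets using the proportions defined above
splitCorpus <- function(lines, train = 0.8, devtest = 0.1) {
  trainingIds <- sample(seq_along(lines), floor(length(lines) * train))
  rest <- lines[-trainingIds] # the held-out remainder
  devIds <- sample(seq_along(rest), floor(length(lines) * devtest))
  list(training = lines[trainingIds],
       devtesting = rest[devIds],
       finaltesting = rest[-devIds])
}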
We will use the ngram package in R for the preprocessing.
Note: for the purpose of demonstration, I will use only the first 10,000 lines of the trainingTwitter dataset.
library(ngram)
SingleStringTwitterData <- paste(trainingTwitter[1:10000], collapse = " ")
PreprocessedData <- preprocess(SingleStringTwitterData, case = "lower",
                               remove.punct = TRUE, remove.numbers = TRUE,
                               fix.spacing = TRUE)
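As a quick check of the result, we can count the tokens in the cleaned string (wordcount() is exported by the ngram package):
wordcount(PreprocessedData)      # number of tokens after preprocessing
substr(PreprocessedData, 1, 80)  # peek at the first few characters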
Again, we will use the ngram package to create unigrams, bigrams, and trigrams from PreprocessedData.
unigram <- ngram(PreprocessedData, n = 1)
UnigramTable <- get.phrasetable(unigram)
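get.phrasetable() returns a data frame with columns ngrams, freq, and prop, sorted by decreasing frequency; the plots below rely on this ordering.
head(UnigramTable, 3) # the three most frequent unigrams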
library(ggplot2)
library(stringr)
plotdata <- UnigramTable[1:20, ] # 20 most frequent unigrams
p <- ggplot(data = plotdata, aes(x = ngrams, y = freq)) +
  geom_bar(stat = "identity", fill = "orange") +
  scale_x_discrete(limits = plotdata$ngrams) + # keep bars in frequency order
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
p
bigram <- ngram(PreprocessedData, n = 2)
BigramTable <- get.phrasetable(bigram)
plotdata <- BigramTable[1:20, ]
p <- ggplot(data = plotdata, aes(x = ngrams, y = freq)) +
  geom_bar(stat = "identity", fill = "blue") +
  scale_x_discrete(limits = plotdata$ngrams) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
p
trigram <- ngram(PreprocessedData, n = 3)
TrigramTable <- get.phrasetable(trigram)
plotdata <- TrigramTable[1:20, ]
p <- ggplot(data = plotdata, aes(x = ngrams, y = freq)) +
  geom_bar(stat = "identity", fill = "pink") +
  scale_x_discrete(limits = plotdata$ngrams) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
p
Case 1: the user enters only one word
If the word doesn't exist in the training data, we suggest the top 3 most frequently occurring words in the training data, which we can read directly off the unigram table.
If the word does exist in the training data, we look for the most frequently occurring bigrams starting with the entered word and take their second words. We suggest the most frequent of these as the probable next word (see the sketch below).
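A minimal sketch of this lookup, assuming the UnigramTable and BigramTable built above (predictFromOneWord() is a hypothetical helper name):
# Hypothetical helper for case 1: suggest up to n next words for a single word
predictFromOneWord <- function(word, n = 3) {
  word <- tolower(word)
  # the phrase tables are sorted by decreasing frequency; strip the trailing
  # spaces that get.phrasetable() leaves on each phrase
  unigrams <- trimws(UnigramTable$ngrams)
  bigrams <- trimws(BigramTable$ngrams)
  # Unknown word: fall back to the most frequent unigrams
  if (!(word %in% unigrams)) return(unigrams[1:n])
  # Bigrams starting with the entered word, most frequent first
  matches <- bigrams[startsWith(bigrams, paste0(word, " "))]
  if (length(matches) == 0) return(unigrams[1:n])
  # The second word of each matching bigram is a candidate suggestion
  candidates <- sapply(strsplit(matches, " "), `[`, 2)
  head(unique(candidates), n)
}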
Case 2: the user enters 2 words
If the second (last) word doesn't exist in our training data, we suggest the top 3 most frequently occurring words in the training data, which we can read directly off the unigram table.
If the first word doesn't exist in the training data but the second does, we proceed as if the user had entered only one word (the second one) and predict the next word as described in case 1.
If both words exist in the training data, we check whether they ever appear together in the entered order, i.e. whether the bigram formed by these words exists.
If the bigram does exist, we look for the most frequently occurring trigrams starting with the entered words and take their third words. We suggest the most frequent of these third words as the probable next word.
If the bigram doesn't exist, we proceed as if the user had entered only one word (the second one) and predict the next word as described in case 1 (see the sketch below).
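A sketch of case 2 under the same assumptions, reusing the hypothetical predictFromOneWord() from above together with the TrigramTable:
# Hypothetical helper for case 2: suggest up to n next words for a two-word input
predictFromTwoWords <- function(w1, w2, n = 3) {
  w1 <- tolower(w1); w2 <- tolower(w2)
  unigrams <- trimws(UnigramTable$ngrams)
  bigrams <- trimws(BigramTable$ngrams)
  trigrams <- trimws(TrigramTable$ngrams)
  # Last word unknown: fall back to the most frequent unigrams
  if (!(w2 %in% unigrams)) return(unigrams[1:n])
  # First word unknown, or the two words never occur together as a bigram:
  # back off to case 1 using the second word alone
  if (!(paste(w1, w2) %in% bigrams)) return(predictFromOneWord(w2, n))
  # Trigrams starting with the entered bigram; their third words are candidates
  matches <- trigrams[startsWith(trigrams, paste0(w1, " ", w2, " "))]
  candidates <- sapply(strsplit(matches, " "), `[`, 3)
  if (length(candidates) == 0) return(predictFromOneWord(w2, n))
  head(unique(candidates), n)
}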
Case 3: the user enters 3 words
Since trigrams are the highest-order n-grams we store, we keep only the last two words of the input and proceed as in case 2.
Case 4: the user enters more than 3 words
As in case 3, we keep only the last two words of the input and proceed as in case 2.
Case 5: the user enters an empty string
We suggest the top 3 most frequently occurring words in the training data, taken from the unigram table.
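Putting the cases together, a sketch of a hypothetical top-level predictNextWord() that dispatches on the number of words entered:
# Hypothetical dispatcher: tokenize the input and route it to the helpers above
predictNextWord <- function(input, n = 3) {
  words <- strsplit(trimws(tolower(input)), "\\s+")[[1]]
  if (length(words) == 0) return(trimws(UnigramTable$ngrams)[1:n]) # case 5
  if (length(words) == 1) return(predictFromOneWord(words, n))     # case 1
  k <- length(words)
  predictFromTwoWords(words[k - 1], words[k], n)                   # cases 2-4
}
predictNextWord("thanks for the") # example call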