The objective of this report is to show that I have loaded the appropriate data, performed exploratory data analysis, and performed basic n-gram modeling. To that end, this report is broken into the following sections:
Project overview
Data acquisition
Data cleaning
Exploratory data analysis
Modeling
Model testing
Conclusion
The structured nature of language forces sequences of words to appear time and time again. These recurring patterns make it possible to predict what users will write. Specifically, by looking at the preceding few words, one should be able to predict the following few words. Here, I used data available from HC Corpora, which includes text scraped from blogs, to begin the modeling pipeline. I first downloaded the data and loaded it into R, see below.
Download data
# fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# download.file(fileUrl)
print(paste("Downloaded: ",date(),sep=""))
## [1] "Downloaded: Wed Feb 15 23:07:56 2017"
# unzip("SwiftKey.zip")
Create connections to the data This includes a text document added to the “Coursera-SwiftKey/final/en_US” directory titled “badWords.txt” that contains a new line separated list of bad words. This list was obtained from “https://github.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en”.
conT <- file("Coursera-SwiftKey/final/en_US/en_US.twitter.txt","r") #Twitter
conB <- file("Coursera-SwiftKey/final/en_US/en_US.blogs.txt","r") #Blogs
conN <- file("Coursera-SwiftKey/final/en_US/en_US.news.txt","r") #News
conBad <- file("Coursera-SwiftKey/final/en_US/badWords.txt","r") #Bad words
Test data loading
readLines(conT, 1)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
readLines(conB, 1)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan â<U+0080><U+009C>godsâ<U+0080>."
readLines(conN, 1)
## [1] "He wasn't home alone, apparently."
# close(conT)
# close(conB)
# close(conN)
Get the number of lines in each document
system('find /v /c "" Coursera-SwiftKey/final/en_US/en_US.twitter.txt')
system('find /v /c "" Coursera-SwiftKey/final/en_US/en_US.blogs.txt')
system('find /v /c "" Coursera-SwiftKey/final/en_US/en_US.news.txt')
Load training subsets from each data source
numLines <- 10000 #Lines to read in
subsetT <- readLines(conT, numLines)
subsetB <- readLines(conB, numLines)
subsetN <- readLines(conN, numLines)
# Merge subsetted data
mergedSubset <- paste(subsetT,subsetB,subsetN)
Load testing subsets from each data source
numLines <- 1000 #Lines to read in
subsetTestT <- readLines(conT, numLines)
subsetTestB <- readLines(conB, numLines)
subsetTestN <- readLines(conN, numLines)
# Merge subsetted data
mergedSubsetTest <- paste(subsetTestT,subsetTestB,subsetTestN)
Close connections
close(conT)
close(conB)
close(conN)
Clean data
badWordData <- readLines(conBad,5000)
# Break the data into sentences
library(qdap)
library(tm)
dataProcessor <- function(textInput, badWords = badWordData) {
textInput <- sent_detect(textInput, language = "en", model = NULL)
newCorpus <- VCorpus(VectorSource(textInput))
newCorpus <- tm_map(newCorpus, removePunctuation)
newCorpus <- tm_map(newCorpus, removeNumbers)
newCorpus <- tm_map(newCorpus, content_transformer(tolower)) #Convert to lower case
newCorpus <- tm_map(newCorpus, removeWords, badWords)
newCorpus <- tm_map(newCorpus, stripWhitespace)
}
subsetCorpus <- dataProcessor(mergedSubset)
Convert corpus to data.frame
dfData <- data.frame(sentences = unlist(sapply(subsetCorpus, `[`, "content")), stringsAsFactors = FALSE)
Use RWeka to create n-grams
library(RWeka)
oneGram <- NGramTokenizer(dfData, Weka_control(min = 1, max = 1))
twoGram <- NGramTokenizer(dfData, Weka_control(min = 2, max = 2))
threeGram <- NGramTokenizer(dfData, Weka_control(min = 3, max = 3))
fourGram <- NGramTokenizer(dfData, Weka_control(min = 4, max = 4))
The objective of exploratory data analysis is to compare the frequencies of 1-, 2-, 3-, and 4-grams, as well as figure out how many unique 1-, 2-, 3-, and 4-grams are required to account for 50% of the language.
oneGramDf <- table(oneGram) %>%
data.frame()
oneGramDf <- oneGramDf[order(oneGramDf$Freq, decreasing = TRUE),]
twoGramDf <- table(twoGram) %>%
data.frame()
twoGramDf <- twoGramDf[order(twoGramDf$Freq, decreasing = TRUE),]
threeGramDf <- table(threeGram) %>%
data.frame()
threeGramDf <- threeGramDf[order(threeGramDf$Freq, decreasing = TRUE),]
fourGramDf <- table(fourGram) %>%
data.frame()
fourGramDf <- fourGramDf[order(fourGramDf$Freq, decreasing = TRUE),]
# Make Df headings uniform
headings <- c("Term", "Frequency")
names(oneGramDf) <- headings
names(twoGramDf) <- headings
names(threeGramDf) <- headings
names(fourGramDf) <- headings
# Reorder factors for plotting
oneGramDf$Term <- factor(oneGramDf$Term, as.character(oneGramDf$Term))
twoGramDf$Term <- factor(twoGramDf$Term, as.character(twoGramDf$Term))
threeGramDf$Term <- factor(threeGramDf$Term, as.character(threeGramDf$Term))
fourGramDf$Term <- factor(fourGramDf$Term, as.character(fourGramDf$Term))
Display the 1-gram results graphically
library(ggplot2)
n <- 20 #Number of terms to display
graph <- ggplot(oneGramDf[1:n,], aes(x = Term, y = Frequency),) + geom_bar(stat = "Identity", fill = "red")
graph
Display the 2-gram results graphically
library(ggplot2)
n <- 20 #Number of terms to display
graph <- ggplot(twoGramDf[1:n,], aes(x = Term, y = Frequency),) + geom_bar(stat = "Identity", fill = "red") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
graph
Display the 3-gram results graphically
library(ggplot2)
n <- 20 #Number of terms to display
graph <- ggplot(threeGramDf[1:n,], aes(x = Term, y = Frequency),) + geom_bar(stat = "Identity", fill = "red") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
graph
Display the 4-gram results graphically
library(ggplot2)
n <- 20 #Number of terms to display
graph <- ggplot(fourGramDf[1:n,], aes(x = Term, y = Frequency),) + geom_bar(stat = "Identity", fill = "red") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
graph
How many 1-gram terms are required to account for “X” % of the language?
oneGramDf$percent <- oneGramDf$Frequency / sum(oneGramDf$Frequency) * 100
oneGramDf$cumulativeCoverage <- NA
cumulativeCoverageCalculator <- function(input) {#input = 'percent' column
output <- matrix(ncol = 1, nrow = length(input))
output[1] <- input[1]
for (i in 2:length(output)) {
output[i] <- output[i-1] + input[i]
}
return(output)
}
oneGramDf$cumulativeCoverage <- cumulativeCoverageCalculator(oneGramDf$percent)
oneGramDf$numberOfTerms <- 1:nrow(oneGramDf)
graph <- ggplot(oneGramDf, aes(x = as.numeric(numberOfTerms), y = as.numeric(cumulativeCoverage))) +
geom_point() +
labs(x = "Number of Terms", y = "Cumulative Coverage (%)")
graph
How many 2-gram terms are required to account for “X” % of the language?
twoGramDf$percent <- twoGramDf$Frequency / sum(twoGramDf$Frequency) * 100
twoGramDf$cumulativeCoverage <- NA
twoGramDf$cumulativeCoverage <- cumulativeCoverageCalculator(twoGramDf$percent)
twoGramDf$numberOfTerms <- 1:nrow(twoGramDf)
graph <- ggplot(twoGramDf, aes(x = as.numeric(numberOfTerms), y = as.numeric(cumulativeCoverage))) +
geom_point() +
labs(x = "Number of Terms", y = "Cumulative Coverage (%)")
graph
How many 3-gram terms are required to account for “X” % of the language?
threeGramDf$percent <- threeGramDf$Frequency / sum(threeGramDf$Frequency) * 100
threeGramDf$cumulativeCoverage <- NA
threeGramDf$cumulativeCoverage <- cumulativeCoverageCalculator(threeGramDf$percent)
threeGramDf$numberOfTerms <- 1:nrow(threeGramDf)
graph <- ggplot(threeGramDf, aes(x = as.numeric(numberOfTerms), y = as.numeric(cumulativeCoverage))) +
geom_point() +
labs(x = "Number of Terms", y = "Cumulative Coverage (%)")
graph
How many 4-gram terms are required to account for “X” % of the language?
fourGramDf$percent <- fourGramDf$Frequency / sum(fourGramDf$Frequency) * 100
fourGramDf$cumulativeCoverage <- NA
fourGramDf$cumulativeCoverage <- cumulativeCoverageCalculator(fourGramDf$percent)
fourGramDf$numberOfTerms <- 1:nrow(fourGramDf)
graph <- ggplot(fourGramDf, aes(x = as.numeric(numberOfTerms), y = as.numeric(cumulativeCoverage))) +
geom_point() +
labs(x = "Number of Terms", y = "Cumulative Coverage (%)")
graph
Add predictive information
allGramVector <- c(oneGramDf$percent,
twoGramDf$percent,
threeGramDf$percent,
fourGramDf$percent)
names(allGramVector) <- c(as.character(oneGramDf$Term),
as.character(twoGramDf$Term),
as.character(threeGramDf$Term),
as.character(fourGramDf$Term))
twoGramSplit <- strsplit(as.character(twoGramDf$Term), " ")
twoGramDf$predictor <- sapply(twoGramSplit, "[", 1)
twoGramDf$prediction <- sapply(twoGramSplit, "[", 2)
threeGramSplit <- strsplit(as.character(threeGramDf$Term), " ")
threeGramDf$predictor <- lapply(threeGramSplit, function(item) {
paste(item[1:2], collapse = " ")
})
threeGramDf$prediction <- sapply(threeGramSplit, "[", 3)
fourGramSplit <- strsplit(as.character(fourGramDf$Term), " ")
fourGramDf$predictor <- lapply(fourGramSplit, function(item) {
paste(item[1:3], collapse = " ")
})
fourGramDf$prediction <- sapply(fourGramSplit, "[", 4)
predictorDf <- rbind(twoGramDf[,c("predictor","prediction","percent","Frequency")],
threeGramDf[,c("predictor","prediction","percent","Frequency")],
fourGramDf[,c("predictor","prediction","percent","Frequency")])
Use model to identify the top n predictions This model takes two inputs, text and an integer. The last 1, 2, and 3 words of the text are used to predict the word that will come next. The five most likely words to follow the text are output in order of decreasing likelihood. This is how it works:
The text input is processed in the same way that the training text was processed. Next, the most likely following word (given no information about preceding words) and their percent usages are appended to a data frame. Then, like a key in a HashMap the first 1, 2, or 3 words, in 2-, 3-, and 4-grams are used to determine the frequencies of the final word in the n-gram. The percent usage is recalculated to account for the additional information. After appending the predictions and percent usages for these n-grams, the data is aggregated based on the predicted word and the percent values are summed. The relative value of these summed percents gives the relative likelihood of their appearance (larger value = higher probability).
predictorFunction <- function(input, n = 5) {# n is the number of predictions returned
# Ensure input is valid
if (class(input) != "character") {
return("Invalid input!")
}
# Perform the same language processing steps performed above
inputCorpus <- dataProcessor(input)
input <- data.frame(inputCorpus)[1,"text"] #input has been converted to a valid string
# Account for possibility of length 0 input
output <- oneGramDf[1:n,c("Term","percent")]
names(output) <- c("prediction","percent")
# Split the input string based on whitespace
if (input != "") {#Account for the possibility of an empty string
splitString <- strsplit(input, " ")[[1]]
maxPredictionLength <- 3
if (length(splitString) < 3) maxPredictionLength <- length(splitString) # Account for the possibility that there are fewer available words to predict from than could be used
for (i in 1:maxPredictionLength) {
predictor <- tail(splitString, i)
predictor <- paste(predictor, collapse = " ")
matches <- which(predictorDf[,"predictor"] == predictor)
if (length(matches) == 0) next
matchedDf <- predictorDf[matches,c("prediction","Frequency")]
matchedDf$percent <- matchedDf$Frequency / sum(matchedDf$Frequency) * 100
output <- rbind(output, matchedDf[, c("prediction","percent")])
}
}
output <- aggregate(output$percent, by=list(Category = output$prediction), FUN=sum)
output <- output[order(output[,2], decreasing = TRUE), ]
return(matrix(as.character(output[1:n, "Category"]), nrow = n))
}
Examples
example1 <- "for the first"
predictorFunction(example1)
## [,1]
## [1,] "time"
## [2,] "ever"
## [3,] "to"
## [4,] "of"
## [5,] "seven"
example2 <- "for the most"
predictorFunction(example2)
## [,1]
## [1,] "part"
## [2,] "of"
## [3,] "extensive"
## [4,] "important"
## [5,] "popular"
example3 <- "when it comes"
predictorFunction(example3)
## [,1]
## [1,] "to"
## [2,] "the"
## [3,] "out"
## [4,] "earl"
## [5,] "in"
Prepare test data
testCorpus <- dataProcessor(mergedSubsetTest)
testData <- data.frame(testCorpus)[,"text"]
twoGramTest <- NGramTokenizer(testData, Weka_control(min = 2, max = 2))
threeGramTest <- NGramTokenizer(testData, Weka_control(min = 3, max = 3))
fourGramTest <- NGramTokenizer(testData, Weka_control(min = 4, max = 4))
twoGramTestSplit <- strsplit(twoGramTest, " ")
threeGramTestSplit <- strsplit(threeGramTest, " ")
fourGramTestSplit <- strsplit(fourGramTest, " ")
twoGramTestDf <- data.frame(predictor = sapply(twoGramTestSplit, "[", 1),
prediction = sapply(twoGramTestSplit, "[", 2))
threeGramTestDf <- data.frame(predictor = unlist(lapply(threeGramTestSplit, function(item) {
paste(item[1:2], collapse = " ")
})),
prediction = sapply(threeGramTestSplit, "[", 3))
fourGramTestDf <- data.frame(predictor = unlist(lapply(fourGramTestSplit, function(item) {
paste(item[1:3], collapse = " ")
})),
prediction = sapply(fourGramTestSplit, "[", 4))
testDf <- rbind(twoGramTestDf,threeGramTestDf,fourGramTestDf)
testDf$predictor <- as.character(testDf$predictor)
testDf$prediction <- as.character(testDf$prediction)
Make predictions on test data
library(dplyr)
smallTestDf <- sample_n(testDf, 500)
predictions <- lapply(smallTestDf$predictor, function(item){
predictorFunction(item, n = 1)
}) %>% unlist()
Compare predictions to real data
names(smallTestDf) <- c("predictor", "nextWord")
smallTestDf$prediction <- predictions
percentSuccess <- mean(smallTestDf$nextWord == smallTestDf$prediction) * 100
print(paste(percentSuccess,"% accuracy", sep = ""))
## [1] "10.6% accuracy"
Since the method I used here makes predictions with only moderately low accuracy despite such a small sample size, it is tempting to pursue this method further. However, there is substantial overhead involved in this method’s implementation. Specifically, while training data processing can be performed prior to utilization by the user, it is unfortunately the case that the prediction method is relatively slow and requires a large 85MB data frame. Thus, it would be impractical to use this algorithm on resource-limited devices, such as phones. My future algorithms will attempt to reduce this overhead.