Capstone Milestone Report

Getting the Data

We will utilize a collection of text documents to develop a predictive text analytics application in Shiny that will predict the n-th term in an n-gram given user input of the (n-1) terms preceding the value that we want to predict. The dataset that we will use to train our algorithm can be found in the zip file below.

https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The zip file contains separate ‘collections’ of files that represent four different languages. For the purposes of this exercise we will only utilize the collection of files that are in English. This collection contains a set of three documents:

A collection of blogs
A collection of news articles
A collection of Twitter posts

After downloading the data, we create a corpus so that we can further perform some exploratory data analysis.

# Build a corpus from the files stored in one directory.
#
# fullDirectory: path of the folder that holds the text documents.
# Returns whatever Corpus() produces for a DirSource() over that folder
# (both presumably come from the tm package -- confirm against the setup chunk).
createCorpus <- function(fullDirectory){
        documentSource <- DirSource(fullDirectory)
        Corpus(documentSource)
}

# Load the English (en_US) documents into a corpus.
# NOTE(review): this is an absolute, machine-specific path; anyone re-running
# the report has to point it at their own unzipped final/en_US folder.
en_US_corpus <- createCorpus("C:/Users/Rick/Documents/JHU Data Science Specialization/Capstone Project/final/en_US")

Initial Exploratory Analysis (Full Documents)

In our initial analysis, we review the full documents to determine approximate line counts and word counts. We can see that the Twitter document contains the largest count of lines in the dataset, but the blogs document contains the largest count of words.

# Approximate line and word counts for each document in the corpus.
# Newline matches stand in for lines and whitespace runs stand in for words,
# so both figures are estimates rather than exact counts.
#
# One data frame per document is stored in a pre-sized list and everything is
# bound once at the end, instead of growing linedf/worddf with rbind() on every
# pass. seq_along() also keeps the loop from running on an empty corpus, where
# 1:length() would iterate over c(1, 0).
lineParts <- vector("list", length(en_US_corpus))
wordParts <- vector("list", length(en_US_corpus))
for(i in seq_along(en_US_corpus)){
        documentName <- names(en_US_corpus[i])
        lineParts[[i]] <- data.frame(Document = documentName,
                                     Line_Count = str_count(en_US_corpus[[i]], "\\n"))
        wordParts[[i]] <- data.frame(Document = documentName,
                                     Word_Count = str_count(en_US_corpus[[i]], "\\s+"))
}
linedf <- do.call(rbind, lineParts)
worddf <- do.call(rbind, wordParts)

# Side-by-side bar charts of the per-document line and word counts.
# The fill legend is hidden because it would only repeat the x-axis labels.
# guides(fill = "none") is used because passing FALSE to guides() is
# deprecated in current ggplot2 releases.
lineplot <- ggplot(linedf, aes(x = Document, y = Line_Count, fill = Document)) + 
                geom_bar(stat = "identity") + 
                theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
                guides(fill = "none") + 
                scale_y_continuous(name = "Count of Lines", labels = comma)

wordplot <- ggplot(worddf, aes(x = Document, y = Word_Count, fill = Document)) + 
                geom_bar(stat = "identity") + 
                theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
                guides(fill = "none") + 
                scale_y_continuous(name = "Count of Words", labels = comma)

# Draw both charts in a single row.
grid.arrange(lineplot, wordplot, nrow = 1)

Partitioning the Dataset

For the purposes of our predictive algorithm, we will need to partition the dataset into a training set and a testing set. For this exploratory analysis, we choose to select roughly 60% of the lines from each document to use in our training set.

# Draw the training sample: every line of every document is kept independently
# with probability 0.6, so roughly (not exactly) 60% of the lines are selected.
# The seed is reset per document (8 + i) so the draw is reproducible.
#
# Per-document results are stored in a pre-sized list and bound once at the
# end rather than growing trainingLines with rbind() on every pass; seq_along()
# also avoids the c(1, 0) iteration that 1:length() produces on an empty corpus.
# NOTE(review): the lines that are not drawn (inTrain == 0) are not stored
# anywhere in this block, so no testing set is materialised here.
trainingParts <- vector("list", length(en_US_corpus))
for(i in seq_along(en_US_corpus)){
        set.seed(8 + i)
        lines <- strsplit(en_US_corpus[[i]]$content, "\\n")[[1]]
        inTrain <- rbinom(length(lines), 1, prob = .6)
        keptLines <- lines[inTrain == 1]
        trainingParts[[i]] <- data.frame(Document = rep(names(en_US_corpus[i]),
                                                        length(keptLines)),
                                         Lines = keptLines)
}
trainingLines <- do.call(rbind, trainingParts)

Further Exploratory Analysis (Training Set)

Now that we have partitioned our dataset into a training set, we can perform some further exploratory analysis of its features. First we can inspect the attributes of each line, mainly the number of characters in each line as well as the approximate number of words in each line.

# Per-line features for the training sample.
# charsPerLine counts the matches of the regex ".", i.e. one per character.
# wordsPerLine is the number of pieces left after splitting on whitespace runs,
# so it only approximates the word count (leading or trailing whitespace adds
# an empty piece).
# vapply() pins the result to an integer vector; sapply() would hand back
# list() when there are no lines at all.
trainingLines <- mutate(trainingLines,
                        charsPerLine = str_count(trainingLines$Lines, "."),
                        wordsPerLine = vapply(str_split(trainingLines$Lines, "\\s++"),
                                              length, integer(1)))

# Histograms of characters per line and words per line, one facet per document
# with its own y-axis range.
# The x-axis limits (750 characters, 225 words) cut off the long right tail;
# rows outside the limits are dropped by the scale, which is what the
# "Removed ... rows" warnings printed with the plots report.
# guides(fill = "none") replaces the deprecated guides(fill = FALSE).
charHistogram <- ggplot(trainingLines, aes(x = charsPerLine, fill = Document)) +
        geom_histogram(bins = 750) +
        facet_grid(Document ~ ., scales = "free_y") +
        guides(fill = "none") +
        scale_y_continuous(name = "Frequency") +
        scale_x_continuous(name = "Characters per Line", limits = c(0,750))

wordHistogram <- ggplot(trainingLines, aes(x = wordsPerLine, fill = Document)) +
        geom_histogram(bins = 225) +
        facet_grid(Document ~ ., scales = "free_y") +
        guides(fill = "none") +
        scale_y_continuous(name = "Frequency") +
        scale_x_continuous(name = "Words per Line", limits = c(0,225))

charHistogram

## Warning: Removed 22133 rows containing non-finite values (stat_bin).

wordHistogram

## Warning: Removed 3481 rows containing non-finite values (stat_bin).

## Warning: Removed 3 rows containing missing values (geom_bar).

Next, we can begin looking at the n-grams that exist throughout the training set. We choose to explore the most common unigrams and bigrams in the dataset.

# List every run of n consecutive tokens (n-gram) in a token vector.
#
# wordVector: vector of tokens in their original order.
# n:          size of the n-grams, a single number >= 1.
#
# Returns a character vector with length(wordVector) - n + 1 entries, each the
# n tokens joined by single spaces, or NULL when the vector holds fewer than n
# tokens. A vector with exactly n tokens yields its single n-gram; the earlier
# `<= n` test dropped that case, so e.g. one-word lines lost their unigram.
listNGrams <- function(wordVector, n){
        stopifnot(is.numeric(n), length(n) == 1, n >= 1)
        ngramCount <- length(wordVector) - n + 1
        if(ngramCount < 1){
                return(NULL)
        }
        # Build n shifted copies of the tokens (offset 1 starts at the first
        # token, offset 2 at the second, ...) and paste them element-wise.
        # This replaces growing the result with c() once per n-gram.
        shiftedWords <- lapply(seq_len(n), function(offset){
                wordVector[offset:(offset + ngramCount - 1)]
        })
        do.call(paste, c(shiftedWords, sep = " "))
}

# Split every training line on whitespace runs once, then list its n-grams.
# lapply() always returns a list, so unlist() yields a flat character vector;
# sapply() could instead simplify to a matrix when every line happens to
# produce the same number of n-grams.
tokenizedLines <- str_split(trainingLines$Lines, "\\s+")
unigrams <- unlist(lapply(tokenizedLines, listNGrams, 1))
bigrams <- unlist(lapply(tokenizedLines, listNGrams, 2))
#trigrams <- unlist(lapply(tokenizedLines, listNGrams, 3))

# Frequency tables sorted from most to least frequent, keeping at most ten rows.
# Each table is built once and reused for the ordering instead of being
# tabulated twice. head() returns only the rows that exist, whereas [1:10,]
# pads the result with NA rows when fewer than ten distinct n-grams are present.
unigramCounts <- as.data.frame(table(unigrams))
topunigrams <- head(unigramCounts[order(unigramCounts$Freq, decreasing = TRUE),], 10)
bigramCounts <- as.data.frame(table(bigrams))
topbigrams <- head(bigramCounts[order(bigramCounts$Freq, decreasing = TRUE),], 10)
#trigramCounts <- as.data.frame(table(trigrams))
#toptrigrams <- head(trigramCounts[order(trigramCounts$Freq, decreasing = TRUE),], 10)

# Bar charts of the ten most frequent unigrams and bigrams. The x-axis labels
# are rotated by 90 degrees so the n-grams stay readable.
unigramplot <- ggplot(topunigrams, aes(x = unigrams, y = Freq)) +
        geom_bar(stat = "identity") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        ggtitle("Top 10 Most Frequent Unigrams")
bigramplot <- ggplot(topbigrams, aes(x = bigrams, y = Freq)) +
        geom_bar(stat = "identity") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        ggtitle("Top 10 Most Frequent Bigrams")
#trigramplot <- ggplot(toptrigrams, aes(x = trigrams, y = Freq)) +
#        geom_bar(stat = "identity") +
#        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
#        ggtitle("Top 10 Most Frequent Trigrams")

unigramplot

bigramplot

#trigramplot

Next Steps

Additional pre-processing steps should be taken to further standardize the training data set. Punctuation, capitalization, profanity, and other features of the training set should be processed to optimize the efficacy of the machine learning algorithm to follow. Once pre-processing has taken place, various algorithms will be implemented to create a predictive model that will guess the next word in a user-provided n-gram.