Basic Summaries of Data Files

Note: The code for all results in the report will be shown in the appendix.

First, we will determine the number of lines in each file. As the line counts below show, these are very large data sets.

##   Blogs    News Twitter 
##  899288   77259 2360148

Given the large size of these data sets, we will take a random sample of each file for exploratory analysis. The rbinom function will be used to select approximately 1% of the lines from each file. The line counts of the subsetted data are shown below.

##   Blogs Subset    News Subset Twitter Subset 
##          89737           7684         236039

We will use the tm package to create a corpus from the subsetted data files and to preprocess the text. Specifically, it will be used to remove numbers, punctuation, and extra whitespace; convert the text to lowercase; and remove stopwords (commonly used words such as “and” and “the”).

Next, we will create a term document matrix containing the number of times each term appears in each document. Using the term document matrix, we can find the word counts and number of unique words in each data file. The results of this initial analysis are shown in the table below.

##         Subsetted Line Count Word Count Number of Unique Words
## Blogs                  89737     186280                  30878
## News                    7684      14683                   6535
## Twitter               236039     165463                  26188

Word Frequency Exploratory Analysis

Now let’s look at histograms of word frequency for each of the three data sources. From the graphs below, we see that the majority of words are used fewer than 100 times in each data set.

Let’s also consider the words that appear most frequently in the three data sets. From the graphs below, we see some variation in the most commonly used words across the data sources. However, certain words, such as “will”, appear among the ten most common words in all three data sets. An interesting observation is that in the news data set the word “said” is used more than twice as often as any other individual word.

Plans for Prediction Algorithm

The goal of the capstone project is to develop an algorithm that predicts the next word based on user input. For this prediction task, it will be useful to consider phrases (for example, 2-grams or 3-grams) rather than just individual words; determining how frequently words appear next to each other will be a key input to the prediction algorithm. Another potentially useful tool is the “findAssocs” function in the tm package, which finds words that are often associated with a given input word; a brief sketch of both ideas follows.
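
The sketch below is illustrative only and is not part of the results reported above: it counts 2-grams in the sampled blog lines using base R and shows a “findAssocs” call on the term document matrix built in the appendix. The word “will” and the 0.8 correlation threshold are arbitrary choices, and with only three documents in the corpus the reported associations will be coarse.

# Illustrative sketch only: subset_blogs and tdm are created by the appendix code below.
# Count adjacent word pairs (2-grams) within each sampled blog line.
bigrams <- unlist(lapply(strsplit(tolower(subset_blogs), "[^a-z']+"), function(tok) {
    tok <- tok[tok != ""]
    if (length(tok) < 2) return(character(0))
    paste(head(tok, -1), tail(tok, -1))
}))
head(sort(table(bigrams), decreasing = TRUE), 10)   # ten most frequent 2-grams

findAssocs(tdm, "will", 0.8)   # terms associated with "will" across the three documents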

Appendix of R Code

Determine and summarize the number of lines in each file.

con1 <- file("Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "r") 
blogs_length <- length(readLines(con1))
close(con1)

con2 <- file("Coursera-SwiftKey/final/en_US/en_US.news.txt", "r") 
news_length <- length(readLines(con2))
close(con2)

con3 <- file("Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "r") 
twitter_length <- length(readLines(con3))
close(con3)

Print a summary of the line counts.

lengths <- c(blogs_length, news_length, twitter_length)
names(lengths) <- c("Blogs", "News", "Twitter")
lengths

Subset each data file.

set.seed(5678)   # fix the random seed so the sample is reproducible
# Flag approximately 1% of the blog lines for the sample.
binom_blogs <- rbinom(blogs_length, 1, 0.01)
keep_blogs <- which(binom_blogs == 1)
subset_blogs <- character()
con_blogs <- file("Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "r")
for (i in seq_along(keep_blogs)) {
    # Skip ahead to the next selected line, then read that single line.
    if (i == 1) {
        skipnum <- keep_blogs[i] - 1
    } else {
        skipnum <- (keep_blogs[i] - keep_blogs[i - 1]) - 1
    }
    subset_blogs <- c(subset_blogs,
                      scan(con_blogs, what = character(), skip = skipnum, nlines = 1, sep = "\n"))
}
close(con_blogs)

output_blogs <- file("output_blogs.text")
writeLines(subset_blogs, output_blogs, sep = "\n")
close(output_blogs)

# Repeat the sampling procedure for the news file.
binom_news <- rbinom(news_length, 1, 0.01)
keep_news <- which(binom_news == 1)
subset_news <- character()
con_news <- file("Coursera-SwiftKey/final/en_US/en_US.news.txt", "r")
for (i in seq_along(keep_news)) {
    if (i == 1) {
        skipnum <- keep_news[i] - 1
    } else {
        skipnum <- (keep_news[i] - keep_news[i - 1]) - 1
    }
    subset_news <- c(subset_news,
                     scan(con_news, what = character(), skip = skipnum, nlines = 1, sep = "\n"))
}
close(con_news)

output_news <- file("output_news.text")
writeLines(subset_news, output_news, sep = "\n")
close(output_news)

# Repeat the sampling procedure for the twitter file.
binom_twitter <- rbinom(twitter_length, 1, 0.01)
keep_twitter <- which(binom_twitter == 1)
subset_twitter <- character()
con_twitter <- file("Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "r")
for (i in seq_along(keep_twitter)) {
    if (i == 1) {
        skipnum <- keep_twitter[i] - 1
    } else {
        skipnum <- (keep_twitter[i] - keep_twitter[i - 1]) - 1
    }
    subset_twitter <- c(subset_twitter,
                        scan(con_twitter, what = character(), skip = skipnum, nlines = 1, sep = "\n"))
}
close(con_twitter)

output_twitter <- file("output_twitter.text")
writeLines(subset_twitter, output_twitter, sep = "\n")
close(output_twitter)
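
The three sampling blocks above are nearly identical. A more compact alternative is sketched below, assuming each file fits in memory for a single readLines pass (as it already does when counting lines); this helper was not used to produce the results in this report.

# Hypothetical helper (not used above): keep roughly `prob` of a file's lines,
# write them to a new file, and return the number of lines kept.
sample_lines <- function(infile, outfile, prob = 0.01) {
    all_lines <- readLines(infile)
    keep <- rbinom(length(all_lines), 1, prob) == 1
    writeLines(all_lines[keep], outfile)
    sum(keep)
}
# e.g. sample_lines("Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "output_blogs.text")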

Determine and print the line counts of the subsetted data.

con1_s <- file("output_blogs.text", "r") 
blogs_length_s <- length(readLines(con1_s))
close(con1_s)

con2_s <- file("output_news.text", "r") 
news_length_s <- length(readLines(con2_s))
close(con2_s)

con3_s <- file("output_twitter.text", "r") 
twitter_length_s <- length(readLines(con3_s))
close(con3_s)

lengths_s <- c(blogs_length_s, news_length_s, twitter_length_s)
names(lengths_s) <- c("Blogs Subset", "News Subset", "Twitter Subset")
lengths_s

Perform preprocessing to remove numbers, punctuation, and extra whitespace; convert the text to lowercase; and remove English stopwords.

library(tm)
library(SnowballC)
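# The three subsetted files written above ("output_blogs.text", "output_news.text",
# "output_twitter.text") are assumed to have been placed in a "Data Subset" directory
# before the corpus is built.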
corpus <- Corpus(DirSource("Data Subset"), readerControl = list(language="en_US"))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))

Create a term document matrix containing the number of times each term appears in each document.

tdm <- TermDocumentMatrix(corpus) 
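
As an optional sanity check (not used for the reported results), the dimensions of the matrix and the terms above some overall frequency can be inspected with base R and the tm function findFreqTerms; the threshold of 1000 below is an arbitrary choice.

dim(tdm)                             # number of distinct terms x number of documents
findFreqTerms(tdm, lowfreq = 1000)   # terms appearing at least 1000 times in total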

Use the term document matrix to determine the word counts and number of unique words.

# Total number of word occurrences in each document (column sums of the term document matrix).
wordcounts <- colSums(as.matrix(tdm))
# Number of distinct words that appear at least once in each document.
uniquewords <- colSums(as.matrix(tdm) != 0)

Print a table summarizing the analysis on the subsetted data files.

summary_table <- cbind(lengths_s, wordcounts, uniquewords)
rownames(summary_table) <- c("Blogs", "News", "Twitter")
colnames(summary_table) <- c("Subsetted Line Count", "Word Count", "Number of Unique Words")
summary_table

Make histograms showing frequency of word usage.

tdm_graph <- as.matrix(tdm)
# Columns of the term document matrix correspond to the documents in file-name order: blogs, news, twitter.
blogs_nozero <- tdm_graph[, 1] != 0
news_nozero <- tdm_graph[, 2] != 0
twitter_nozero <- tdm_graph[, 3] != 0
par(mfrow = c(1,3))
hist(tdm_graph[blogs_nozero,1], xlab = "Number of usages", main = "Histogram of Blogs Data")
hist(tdm_graph[news_nozero,2], xlab = "Number of usages", main = "Histogram of News Data")
hist(tdm_graph[twitter_nozero,3], xlab = "Number of usages", main = "Histogram of Twitter Data")

Make barplots of the most common words in each data set.

high_blogs <- sort(tdm_graph[,1], decreasing=TRUE)[1:10]
high_news <- sort(tdm_graph[,2], decreasing=TRUE)[1:10]
high_twitter <- sort(tdm_graph[,3], decreasing=TRUE)[1:10]
par(mfrow = c(1,3))
barplot(high_blogs, names.arg = names(high_blogs), las = 2, ylab = "Number of Usages", main = "Most Common Words - Blogs")
barplot(high_news, names.arg = names(high_news),las = 2, ylab = "Number of Usages", main = "Most Common Words - News")
barplot(high_twitter, names.arg = names(high_twitter),las = 2, ylab = "Number of Usages", main = "Most Common Words - Twitter")