# Load necessary packages
library(dplyr) # For fast data manipulation
library(tm) # For Corpus Handling
library(RWeka) # For n-gram tokenization
library(ggplot2) # For plotting
library(wordcloud) # For visualizing word clouds
This is an exploration of the data sets for the Data Science Capstone on Coursera. The data was provided by SwiftKey and represents a large collection of text used to build a predictive text model.
The data set contains four folders, one for each of four languages. This analysis uses the English data set (en_US), which contains three files. Each file holds one type of content, with one news item, tweet, or blog post per line.
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
First we load these 3 files into memory and take a look at their properties.
# Create data connections and read each file
blogfile <- file("en_US.blogs.txt", open ="r")
blogs <- readLines(blogfile)
twitterfile <- file("en_US.twitter.txt", open="r")
twitter <- readLines(twitterfile)
newsfile <- file("en_US.news.txt", open="r")
news <- readLines(newsfile, n=-1)
# Close the data connection for each file
close(blogfile)
close(twitterfile)
close(newsfile)
The size of each file:
Number of posts/entries in each file:
Maximum post length (in characters) of the longest post in each file:
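The chunks that produced these summaries are not shown here; a minimal sketch of how they could be computed from the objects already in memory (an illustration, not the original code) is:
# Sketch: file size (MB), entry count, and longest entry for each file
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
texts <- list(blogs, news, twitter)
data.frame(file = files,
           sizeMB = round(file.size(files) / 1024^2, 1),
           entries = sapply(texts, length),
           maxChars = sapply(texts, function(x) max(nchar(x))))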
These files are very large, so we will take 10,000 entries from each file and save them together as a single file to speed up further analysis:
# Sample each file
twittersample <- sample_n(as.data.frame(twitter), 10000)
blogsample <- sample_n(as.data.frame(blogs), 10000)
newssample <- sample_n(as.data.frame(news), 10000)
# Standardize the names of each sample, for row binding
names(twittersample) <- "text"
names(blogsample) <- "text"
names(newssample) <- "text"
# Merge all three 10,000-entry samples together
alltext <- rbind(twittersample, blogsample, newssample)
dir.create("sample")
write.table(alltext, file="sample/all.txt",
            row.names=FALSE,
            col.names=FALSE)
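Note that sample_n draws a random subset, so the exact sample will differ between runs. Setting a seed before sampling (a suggestion, not part of the original code) would make the sample reproducible:
# Optional: fix the random seed before the sample_n calls for reproducibility
set.seed(1234)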
This will speed up the rest of the data exploration and model creation. In the final analysis we will use the entire data set. Let’s clean up the R environment, load the new 30,000-entry file as a corpus, and explore further.
# Clean up and garbage collect
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 542351 29.0 4547696 242.9 3936061 210.3
## Vcells 2758124 21.1 89324915 681.5 110268872 841.3
# Change the working directory to the sample folder
directory <- "~/Coursera/sample"
setwd(directory)
# Make sure strings are never globally interpreted as factors
options(stringsAsFactors = FALSE)
# Load the merged sample
# UTF-8 encoding ensures quotes are interpreted correctly
docs <- Corpus(DirSource(directory, encoding = "UTF-8"))
With the data fully loaded, we need a function to clean up corpora. In particular it needs to remove punctuation, strip extra whitespace, convert text to lower case, remove common English stop words, and stem each word:
cleanCorpus <- function(corpus) {
  # Remove punctuation
  corpus.tmp <- tm_map(corpus, removePunctuation)
  # Collapse repeated whitespace
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  # Convert to lower case
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  # Remove common English stop words
  corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
  # Stem each word
  corpus.tmp <- tm_map(corpus.tmp, stemDocument)
  return(corpus.tmp)
}
Next, we clean the corpus, build a document-term matrix (DTM), remove the sparsest terms, and compute sorted term frequencies:
# Clean corpus
docsClean <- cleanCorpus(docs)
# Create DTM
dtm <- DocumentTermMatrix(docsClean)
# Remove sparse Terms
dtmHeavy <- removeSparseTerms(dtm, 0.99)
# Create a vector of term frequency and sort
termFreq <- colSums(as.matrix(dtmHeavy))
termFreq <- sort(termFreq, decreasing = TRUE)
Let’s see the top 20 terms:
# 20 most popular terms
termFreq[1:20]
## said will year one time say new state can get also first
## 2494 1126 1053 894 688 664 662 654 627 611 602 569
## like just two last make peopl work game
## 568 545 540 529 524 499 475 464
qplot(names(termFreq[1:20]), termFreq[1:20]) + xlab("Words") + ylab("Counts")
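qplot orders the x axis alphabetically by default; as a suggested variation (not in the original), reordering the terms by frequency keeps the ranking visible:
# Same plot with terms ordered by descending frequency
qplot(reorder(names(termFreq[1:20]), -termFreq[1:20]), termFreq[1:20]) +
  xlab("Words") + ylab("Counts")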
Let’s explore the n-grams in the set.
Let’s verify the unigram counts against the term frequencies above by looking at the top 6 unigrams, and plot a word cloud:
# Create unigram
NMgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
ngram1 = DocumentTermMatrix(docsClean, control = list(tokenize = NMgramTokenizer))
ngram1Freq <- colSums(as.matrix(ngram1))
ngram1Freq <- sort(ngram1Freq, decreasing = TRUE)
head(ngram1Freq)
## said will year one time say
## 2494 1126 1053 894 688 664
# Create wordcloud
wordcloud(names(ngram1Freq)[1:50], ngram1Freq[1:50], colors=brewer.pal(6, "Dark2"))
Let’s look at the top 6 bigrams and plot a word cloud:
NMgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
ngram2 = DocumentTermMatrix(docsClean, control = list(tokenize = NMgramTokenizer))
ngram2Freq <- colSums(as.matrix(ngram2))
ngram2Freq <- sort(ngram2Freq, decreasing = TRUE)
head(ngram2Freq)
## last year new york high school year ago st loui new jersey
## 145 117 96 84 80 75
# Create wordcloud
wordcloud(names(ngram2Freq)[1:50], ngram2Freq[1:50], colors=brewer.pal(6, "Dark2"))
Let’s look at the top 6 trigrams. The wordcloud has been omitted because of limited space.
NMgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
ngram3 = DocumentTermMatrix(docsClean, control = list(tokenize = NMgramTokenizer))
ngram3Freq <- colSums(as.matrix(ngram3))
ngram3Freq <- sort(ngram3Freq, decreasing = TRUE)
head(ngram3Freq)
## presid barack obama new york citi first time sinc
## 18 12 9
## gov chris christi nation weather servic new york time
## 9 8 8
Let’s look at the top 6 four-grams. The wordcloud has been omitted because of limited space.
NMgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
ngram4 = DocumentTermMatrix(docsClean, control = list(tokenize = NMgramTokenizer))
ngram4Freq <- colSums(as.matrix(ngram4))
ngram4Freq <- sort(ngram4Freq, decreasing = TRUE)
head(ngram4Freq)
## highway traffic safeti administr nation highway traffic safeti
## 5 5
## case western reserv univers counti superior court judg
## 4 4
## <U+0097> gov chris christi assembl speaker sheila oliv
## 3 3
Using these n-grams, I believe we can create a model for predicting the next word given a set of inputs. We can predict based on up to 4 preceding words, but the most useful predictive inputs will likely be 2 words or fewer.
I need to find a package that will help me build Markov chains based on the relative frequencies of the different n-grams.
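As a rough illustration of the idea (a hypothetical helper, not part of this analysis), the next word could be looked up directly from the sorted bigram frequencies computed above:
# Hypothetical sketch: predict the next word from the sorted bigram frequencies
predictNext <- function(word, bigramFreq) {
  # Bigram names have the form "word1 word2"; keep those starting with the input word
  candidates <- bigramFreq[startsWith(names(bigramFreq), paste0(word, " "))]
  if (length(candidates) == 0) return(NA_character_)
  # bigramFreq is sorted decreasing, so the first match is the most frequent; return its second word
  strsplit(names(candidates)[1], " ")[[1]][2]
}
predictNext("new", ngram2Freq) # given the bigrams above, likely returns "york"
A real model would also need smoothing and backoff to shorter n-grams when no match is found; this lookup only demonstrates the basic frequency-based approach.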