Background

This is a Milestone Report for the Coursera Data Science Specialization Capstone project, the goal of which is to build a predictive text app. This report covers the data cleaning and exploration steps. The data consist of three U.S. English corpora: blogs, tweets, and news stories.

Reading in and exploring the data

#Reading in the data
con <- file("final/en_US/en_US.blogs.txt", "r")
con2 <- file("final/en_US/en_US.news.txt", "r")
con3 <- file("final/en_US/en_US.twitter.txt", "r")

blogs <- readLines(con)
news <- readLines(con2)
twitter <- readLines(con3)

#Close the connections once the files have been read
close(con); close(con2); close(con3)
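Note: with some downloads of this dataset, readLines() warns about embedded nul characters in the Twitter file. If that occurs, re-reading the file with skipNul = TRUE is a common workaround (shown here as a hedged aside, not something applied in the analysis above):

#Optional: drop embedded nul bytes when reading the Twitter file
twitter <- readLines("final/en_US/en_US.twitter.txt", skipNul = TRUE)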

#Understanding the dimensions of the data
sumdf <- data.frame(Dimension=c("No. of documents", "Chars. in longest document", "Average characters"),
      Blogs=c(length(blogs), max(nchar(blogs)), mean(nchar(blogs))),
      News=c(length(news), max(nchar(news)), mean(nchar(news))),
      Twitter=c(length(twitter), max(nchar(twitter)), mean(nchar(twitter))))
sumdf
##                    Dimension      Blogs       News      Twitter
## 1           No. of documents 899288.000 77259.0000 2.360148e+06
## 2 Chars. in longest document  40835.000  5760.0000 2.130000e+02
## 3         Average characters    231.696   203.0024 6.880281e+01

The Twitter corpus has the most documents, followed by blogs and then news. The blogs corpus contains the longest document, and blog documents are slightly longer on average than news documents. Twitter documents are the shortest.

Subsetting

Given the large size of the corpora, the following random subsetting was applied to produce more manageable amounts of data to work with on my laptop.

#Randomly keep each document with probability 1% (blogs, news) or 0.1% (twitter)
sample.b <- blogs[rbinom(n = length(blogs), size = 1, prob = .01) == 1]
sample.n <- news[rbinom(n = length(news), size = 1, prob = .01) == 1]
sample.t <- twitter[rbinom(n = length(twitter), size = 1, prob = .001) == 1]

sample.sumdf <- data.frame(Dimension=c("No. of documents", "Chars. in longest document", "Average characters"),
      Blogs=c(length(sample.b), max(nchar(sample.b)), mean(nchar(sample.b))),
      News=c(length(sample.n), max(nchar(sample.n)), mean(nchar(sample.n))),
      Twitter=c(length(sample.t), max(nchar(sample.t)), mean(nchar(sample.t))))
sample.sumdf
##                    Dimension     Blogs      News    Twitter
## 1           No. of documents 9077.0000  794.0000 2369.00000
## 2 Chars. in longest document 3119.0000 1178.0000  141.00000
## 3         Average characters  228.4832  202.8552   69.54749
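Because rbinom() draws a fresh random sample on each run, the counts above will vary slightly between runs. A minimal sketch of how the subsetting could be made reproducible (the seed value 1234 is arbitrary and not part of the original analysis):

set.seed(1234)  #arbitrary seed; fixes the subsample drawn by rbinom()
sample.b <- blogs[rbinom(n = length(blogs), size = 1, prob = .01) == 1]
sample.n <- news[rbinom(n = length(news), size = 1, prob = .01) == 1]
sample.t <- twitter[rbinom(n = length(twitter), size = 1, prob = .001) == 1]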

Cleaning the data

In this section I convert the text samples to corpora and use the tm package to remove numbers, strip extra whitespace, remove punctuation and profanity, and convert characters to lower case. (The profanity word list is from bannedwords.com.) Stemming is not performed, so that the full meaning of words stays intact.

Here I also remove stopwords to get a better picture of word frequency; however, when building the predictive text app I will keep the stopwords in.

Finally, I create a document-term matrix, in which documents are rows, words are columns, and each cell holds the frequency with which a word appears in a document.
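As a toy illustration of this structure (not part of the analysis), a two-document corpus produces a small matrix that inspect() prints with documents as rows and terms as columns:

library(tm)
toy <- Corpus(VectorSource(c("the cat sat", "the dog sat on the mat")))
inspect(DocumentTermMatrix(toy))  #rows = documents, columns = terms, cells = counts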

library(tm)
## Loading required package: NLP
profanity <- readLines("bannedWords.txt")
#Blogs
text.b <- VectorSource(sample.b)
b <- Corpus(text.b)
b <- tm_map(b, removeNumbers)
b <- tm_map(b, stripWhitespace)
b <- tm_map(b, removePunctuation)
b <- tm_map(b, content_transformer(tolower))  #content_transformer keeps the corpus structure intact
b <- tm_map(b, removeWords, profanity)
b2 <- tm_map(b, removeWords, stopwords("english")) #optional
dtm.b <- DocumentTermMatrix(b2)
#calculating word frequencies - blogs
m <- as.matrix(dtm.b)
v <- sort(colSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)

#News
text.n <- VectorSource(sample.n)
n <- Corpus(text.n)
n <- tm_map(n, removeNumbers)
n <- tm_map(n, stripWhitespace)
n <- tm_map(n, removePunctuation)
n <- tm_map(n, content_transformer(tolower))
n <- tm_map(n, removeWords, profanity) 
n2 <- tm_map(n, removeWords, stopwords("english")) #optional
dtm.n <- DocumentTermMatrix(n2)
#calculating word frequencies - news
m.n <- as.matrix(dtm.n)
v.n <- sort(colSums(m.n),decreasing=TRUE)
d.n <- data.frame(word = names(v.n),freq=v.n)


#Twitter
text.t <- VectorSource(sample.t)
tw <- Corpus(text.t)  #named "tw" to avoid masking base::t()
tw <- tm_map(tw, removeNumbers)
tw <- tm_map(tw, stripWhitespace)
tw <- tm_map(tw, removePunctuation)
tw <- tm_map(tw, content_transformer(tolower))
tw <- tm_map(tw, removeWords, profanity)
tw2 <- tm_map(tw, removeWords, stopwords("english")) #optional
dtm.t <- DocumentTermMatrix(tw2)
#calculating word frequencies - twitter
m.t <- as.matrix(dtm.t)
v.t <- sort(colSums(m.t),decreasing=TRUE)
d.t <- data.frame(word = names(v.t),freq=v.t)
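A note on memory: as.matrix() turns the sparse document-term matrix into a dense matrix, which can exhaust RAM on larger samples. As a hedged alternative (not what was run above), the slam package that tm builds on can compute the same column sums directly on the sparse representation:

library(slam)
#column sums computed on the sparse matrix, avoiding the dense as.matrix() step
v.sparse <- sort(col_sums(dtm.b), decreasing = TRUE)
d.sparse <- data.frame(word = names(v.sparse), freq = v.sparse)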

Exploring the text data

Next I look at the distribution of words in each corpus and at the relationships between words. To examine the distribution of words, I create a bar chart of the most frequent terms in each corpus.
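The plotting code is not reproduced here; below is a minimal sketch of such a bar chart for the blogs sample, using the frequency data frame d built above (the top-20 cut-off is arbitrary):

#bar chart of the 20 most frequent blog terms (stopwords removed)
barplot(d$freq[1:20], names.arg = as.character(d$word[1:20]), las = 2,
        main = "Most frequent terms - blogs")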

To explore the relationships between words, I look at bigrams. Two popular methods did not work for me (they produced only unigrams):

#Method 1:
#BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
#dtm.g <- DocumentTermMatrix(n, control = list(tokenize = BigramTokenizer))
#Method 2:
#library(RWeka)
#BigramTokenizerW <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
#dtm.g <- DocumentTermMatrix(n, control = list(tokenize = BigramTokenizerW))
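A likely cause (an assumption on my part, not something verified here) is that recent versions of tm return a SimpleCorpus from Corpus(VectorSource(...)), and DocumentTermMatrix ignores custom tokenizers for SimpleCorpus objects. A commonly suggested workaround is to build a VCorpus instead; a sketch of Method 1 for the news sample:

#Sketch: a VCorpus honours the custom tokenizer passed via control
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
n.v <- VCorpus(VectorSource(sample.n))
dtm.g2 <- DocumentTermMatrix(n.v, control = list(tokenize = BigramTokenizer))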

Instead I use the tidytext package to tabulate and plot the most frequent bigrams, leaving the stopwords in.

library(tidytext)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#blogs
b.df <- data.frame(sample.b, stringsAsFactors = FALSE)  #keep text as character, not factor
blogbigrams <- unnest_tokens(b.df, bigram, sample.b, token = "ngrams", n = 2)
count.b <- count(blogbigrams, bigram, sort = TRUE)

#news
n.df <- data.frame(sample.n, stringsAsFactors = FALSE)
newsbigrams <- unnest_tokens(n.df, bigram, sample.n, token = "ngrams", n = 2)
count.n <- count(newsbigrams, bigram, sort = TRUE)

#twitter
t.df <- data.frame(sample.t, stringsAsFactors = FALSE)
twitbigrams <- unnest_tokens(t.df, bigram, sample.t, token = "ngrams", n = 2)
count.t <- count(twitbigrams, bigram, sort = TRUE)
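The bigram plots themselves are not reproduced here; a minimal sketch of one such frequency plot for the blogs bigrams, using the counts computed above (ggplot2 and the top-15 cut-off are my choices, not necessarily what the original figures used):

library(ggplot2)
#bar chart of the 15 most frequent blog bigrams (stopwords retained);
#NA bigrams, produced for very short documents, are dropped first
top.b <- head(filter(count.b, !is.na(bigram)), 15)
ggplot(top.b, aes(x = reorder(bigram, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Bigram", y = "Frequency", title = "Most frequent bigrams - blogs")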

Next steps

The next steps for building the prediction models and app are as follows: