This is a Milestone Report for the Coursera Data Science Specialization Capstone project, the goal of which is to build a predictive text app. This report documents the data cleaning and exploratory analysis steps. The data consist of three corpora of U.S. English text: blogs, tweets, and news stories.
#Reading in the data
con <- file("final/en_US/en_US.blogs.txt", "r")
con2 <- file("final/en_US/en_US.news.txt", "r")
con3 <- file("final/en_US/en_US.twitter.txt", "r")
blogs <- readLines(con)
news <- readLines(con2)
twitter <- readLines(con3)
close(con); close(con2); close(con3) #close the connections once the files are read
#Understanding the dimensions of the data
sumdf <- data.frame(Dimension=c("No. of documents", "Chars. in longest document", "Average characters"),
Blogs=c(length(blogs), max(nchar(blogs)), mean(nchar(blogs))),
News=c(length(news), max(nchar(news)), mean(nchar(news))),
Twitter=c(length(twitter), max(nchar(twitter)), mean(nchar(twitter))))
sumdf
## Dimension Blogs News Twitter
## 1 No. of documents 899288.000 77259.0000 2.360148e+06
## 2 Chars. in longest document 40835.000 5760.0000 2.130000e+02
## 3 Average characters 231.696 203.0024 6.880281e+01
The twitter corpus has the most documents, followed by blogs and then news. The blogs corpus contains the single longest document, and blog documents are slightly longer on average than news documents. Twitter documents are the shortest.
Given the large size of the corpora, I applied the following random subsetting to obtain more manageable amounts of data that can be processed on my laptop.
sample.b <- blogs[rbinom(n = length(blogs), size = 1, prob = .01) == 1]
sample.n <- news[rbinom(n = length(news), size = 1, prob = .01) == 1]
sample.t <- twitter[rbinom(n = length(twitter), size = 1, prob = .001) == 1]
sample.sumdf <- data.frame(Dimension=c("No. of documents", "Chars. in longest document", "Average characters"),
Blogs=c(length(sample.b), max(nchar(sample.b)), mean(nchar(sample.b))),
News=c(length(sample.n), max(nchar(sample.n)), mean(nchar(sample.n))),
Twitter=c(length(sample.t), max(nchar(sample.t)), mean(nchar(sample.t))))
sample.sumdf
## Dimension Blogs News Twitter
## 1 No. of documents 9077.0000 794.0000 2369.00000
## 2 Chars. in longest document 3119.0000 1178.0000 141.00000
## 3 Average characters 228.4832 202.8552 69.54749
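Note that the subsetting above draws a fresh random sample on every run because no seed is set. A minimal sketch of a reproducible version is below; the seed value 1234 is arbitrary and chosen here only for illustration.
set.seed(1234) #arbitrary seed; fixes the random draws so the subsets are reproducible
sample.b <- blogs[rbinom(n = length(blogs), size = 1, prob = .01) == 1]
sample.n <- news[rbinom(n = length(news), size = 1, prob = .01) == 1]
sample.t <- twitter[rbinom(n = length(twitter), size = 1, prob = .001) == 1]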
In this section I convert the sampled text into tm corpora and use the tm package to remove numbers, strip extra white space, remove punctuation and profanity, and convert characters to lower case. (The profanity word list comes from bannedwords.com.) Stemming is not performed, so that the full meaning of words stays intact.
Here I also remove stopwords to get a better understanding of word frequency; in building the predictive text app, however, I will keep the stopwords.
Finally, I create a document-term matrix, in which documents are the rows, words are the columns, and each cell holds the number of times a word appears in a document.
library(tm)
## Loading required package: NLP
profanity <- readLines("bannedWords.txt")
#Blogs
text.b <- VectorSource(sample.b)
b <- Corpus(text.b)
b <- tm_map(b, removeNumbers)
b <- tm_map(b, stripWhitespace)
b <- tm_map(b, removePunctuation)
b <- tm_map(b, content_transformer(tolower)) #content_transformer keeps the corpus structure intact
b <- tm_map(b, removeWords, profanity)
b2 <- tm_map(b, removeWords, stopwords("english")) #optional
dtm.b <- DocumentTermMatrix(b2)
#calculating word frequencies - blogs
m <- as.matrix(dtm.b)
v <- sort(colSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
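To verify the structure described earlier (documents as rows, terms as columns, counts in the cells), the matrix and the frequency table can be checked directly; a quick illustrative look, output not shown:
inspect(dtm.b[1:2, 1:5]) #first two blog documents and five terms of the DTM
head(d, 10) #ten most frequent non-stopword terms in the blogs sample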
#News
text.n <- VectorSource(sample.n)
n <- Corpus(text.n)
n <- tm_map(n, removeNumbers)
n <- tm_map(n, stripWhitespace)
n <- tm_map(n, removePunctuation)
n <- tm_map(n, content_transformer(tolower)) #content_transformer keeps the corpus structure intact
n <- tm_map(n, removeWords, profanity)
n2 <- tm_map(n, removeWords, stopwords("english")) #optional
dtm.n <- DocumentTermMatrix(n2)
#calculating word frequencies - news
m.n <- as.matrix(dtm.n)
v.n <- sort(colSums(m.n),decreasing=TRUE)
d.n <- data.frame(word = names(v.n),freq=v.n)
#Twitter
text.t <- VectorSource(sample.t)
tw <- Corpus(text.t) #named tw rather than t to avoid masking base::t()
tw <- tm_map(tw, removeNumbers)
tw <- tm_map(tw, stripWhitespace)
tw <- tm_map(tw, removePunctuation)
tw <- tm_map(tw, content_transformer(tolower)) #content_transformer keeps the corpus structure intact
tw <- tm_map(tw, removeWords, profanity)
tw2 <- tm_map(tw, removeWords, stopwords("english")) #optional
dtm.t <- DocumentTermMatrix(tw2)
#calculating word frequencies - twitter
m.t <- as.matrix(dtm.t)
v.t <- sort(colSums(m.t),decreasing=TRUE)
d.t <- data.frame(word = names(v.t),freq=v.t)
Next I look at the distribution of words in each corpus and at the relationships between words. To look at the distribution of words, I created bar charts of the most frequent terms.
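The plotting code for those charts is not reproduced in this report; a minimal sketch of such a bar chart for the blogs sample, using the frequency data frame d built above (news and twitter are analogous, and the object name top.b is purely illustrative):
#Bar chart of the 20 most frequent terms in the blogs sample (stopwords removed)
top.b <- head(d, 20)
barplot(top.b$freq, names.arg = as.character(top.b$word), las = 2,
        main = "Most frequent terms - blogs sample", ylab = "Frequency")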
To look at the relationship between words, I look at bigrams. Two popular methods did not work for me (they only produced unigrams):
#Method 1:
#BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
#dtm.g <- DocumentTermMatrix(n, control = list(tokenize = BigramTokenizer))
#Method 2:
#library(RWeka)
#BigramTokenizerW <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
#dtm.g <- DocumentTermMatrix(n, control = list(tokenize = BigramTokenizerW))
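A likely cause is that Corpus() returns a SimpleCorpus, for which the tokenize option in the control list of DocumentTermMatrix() is ignored. The sketch below shows how Method 1 could be made to work under that assumption by building a VCorpus instead; it is illustrative only, and the object names vc and dtm.bigram are not part of this report's pipeline.
#Sketch only: bigram document-term matrix via a VCorpus, which honours custom tokenizers
library(tm) #also loads NLP, which provides ngrams() and words()
vc <- VCorpus(VectorSource(sample.n))
BigramTokenizer <- function(x)
    unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
dtm.bigram <- DocumentTermMatrix(vc, control = list(tokenize = BigramTokenizer))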
Instead, I use the tidytext package to create bar charts of the most frequent bigrams, leaving in stopwords.
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#blogs
b.df <- data.frame(sample.b, stringsAsFactors = FALSE) #keep the text as character, not factor
blogbigrams <- unnest_tokens(b.df, bigram, sample.b, token = "ngrams", n = 2)
count.b <- count(blogbigrams, bigram, sort = TRUE)
#news
n.df <- data.frame(sample.n, stringsAsFactors = FALSE) #keep the text as character, not factor
newsbigrams <- unnest_tokens(n.df, bigram, sample.n, token = "ngrams", n = 2)
count.n <- count(newsbigrams, bigram, sort = TRUE)
#twitter
t.df <- data.frame(sample.t, stringsAsFactors = FALSE) #keep the text as character, not factor
twitbigrams <- unnest_tokens(t.df, bigram, sample.t, token = "ngrams", n = 2)
count.t <- count(twitbigrams, bigram, sort = TRUE)
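As with the unigram charts, the bigram plots themselves are not reproduced here; a minimal sketch for the blogs sample using count.b from above (news and twitter are analogous; the object name top.bigrams is illustrative):
#Bar chart of the 15 most frequent bigrams in the blogs sample (stopwords kept)
top.bigrams <- head(count.b, 15)
barplot(top.bigrams$n, names.arg = top.bigrams$bigram, las = 2,
        main = "Most frequent bigrams - blogs sample", ylab = "Count")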
The next steps for building the prediction models and app are as follows: