Summary

Aim of the capstone project is a word predicting app. The prediction is based on the usage of phrases in texts from three different sources: blogs, news and tweets. This report describes the exploration of the given text sources.

Preparation Steps

Loading Libs

library(parallel)
library(tm)
library(RWeka)
library(ggplot2)

Loading Data

The data is provided under https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip by Coursera for the capstone course, which is the final element of the data science specialization.

The data contains text files with example texts from blogs, news and twitter in different languages. In the following we only consider the english text files.

datapath <- "data/final/en_US"

blogfile    <- "en_US.blogs.txt"
newsfile    <- "en_US.news.txt"
twitterfile <- "en_US.twitter.txt"

bcon <- file(paste(datapath,blogfile,sep="/"),"r", blocking=FALSE)
blog <- readLines(bcon)
close(bcon)
ncon <- file(paste(datapath,newsfile,sep="/"),"r", blocking=FALSE)
news <- readLines(ncon)
close(ncon)
tcon <- file(paste(datapath,twitterfile,sep="/"),"r", blocking=FALSE)
twit <- readLines(tcon)
close(tcon)

Building Sample Dataset

A first look at the data shows differences in the file sizes, but nearly the same number of words or word like entries (+-10%). So twitter has short text entries with short words or abbreviations, while blogs have the longest texts.

syssizes <- system(
    sprintf('cd %s; wc -lwm *.txt | tr -s \" \" | awk \'{print $4,$1,$2,$3}\'',datapath),intern=TRUE)
sizes <- strsplit(syssizes, "\\s+",perl=TRUE)
sizematrix <- matrix(unlist(sizes), ncol=4, byrow=TRUE)
colnames(sizematrix) <- c("file","# lines","# words","size")
sizematrix[4,1] <- "total"

sizematrix
##      file                # lines   # words     size       
## [1,] "en_US.blogs.txt"   "899288"  "37334114"  "208623081"
## [2,] "en_US.news.txt"    "1010242" "34365936"  "205243643"
## [3,] "en_US.twitter.txt" "2360148" "30359804"  "166816544"
## [4,] "total"             "4269678" "102059854" "580683268"

For exploration purposes it is handier to look only onto a smaller part of the data. A sample of 1% should be sufficient.

set.seed(9182)
bsample <- sample(blog,size = round(length(blog) * 0.01))
nsample <- sample(news,size = round(length(news) * 0.01))
tsample <- sample(twit,size = round(length(twit) * 0.01))

Cleaning Data

To simplify datahandling each text is converted to a corpus object.

bcorpus <- Corpus(VectorSource(bsample), readerControl=list(reader=readPlain, language="en_US", load=TRUE))
ncorpus <- Corpus(VectorSource(nsample), readerControl=list(reader=readPlain, language="en_US", load=TRUE))
tcorpus <- Corpus(VectorSource(tsample), readerControl=list(reader=readPlain, language="en_US", load=TRUE))

The corpora are then cleaned with the following steps:

Stemming is not performed, because in the end we will predict full words. So we have to look at complete words and word combinations. The bad word list (Bad-Words-to-Block-on-Facebook.txt) is taken from http://ramseymohsen.com/pictures/blog/Bad-Words-to-Block-on-Facebook.docx.

badWords <- scan("data/final/Bad-Words-to-Block-on-Facebook.txt", "")

bcorpus <- tm_map(bcorpus, content_transformer(tolower))
bcorpus <- tm_map(bcorpus, stripWhitespace)
bcorpus <- tm_map(bcorpus, removePunctuation)
bcorpus <- tm_map(bcorpus, removeNumbers)
bcorpus <- tm_map(bcorpus, removeWords, badWords)

ncorpus <- tm_map(ncorpus, content_transformer(tolower))
ncorpus <- tm_map(ncorpus, stripWhitespace)
ncorpus <- tm_map(ncorpus, removePunctuation)
ncorpus <- tm_map(ncorpus, removeNumbers)
ncorpus <- tm_map(ncorpus, removeWords, badWords)

tcorpus <- tm_map(tcorpus, content_transformer(tolower))
tcorpus <- tm_map(tcorpus, stripWhitespace)
tcorpus <- tm_map(tcorpus, removePunctuation)
tcorpus <- tm_map(tcorpus, removeNumbers)
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
tcorpus <- tm_map(tcorpus, content_transformer(removeURL))
tcorpus <- tm_map(tcorpus, removeWords, badWords)

Tokenize data

With the tokenizer from the RWeka package three n-gram types are built to explore the word frequencies:

First the blogs:

unigramTknzr <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTknzr <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTknzr <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

b1words <- TermDocumentMatrix(bcorpus, control = list(tokenize = unigramTknzr))
b1words <-  removeSparseTerms(b1words, 0.95)
b2words <- TermDocumentMatrix(bcorpus, control = list(tokenize = bigramTknzr))
b2words <-  removeSparseTerms(b2words, 0.999)
b3words <- TermDocumentMatrix(bcorpus, control = list(tokenize = trigramTknzr))
b3words <-  removeSparseTerms(b3words, 0.999)

b1wordsfreq <- sort(rowSums(as.matrix(b1words)), decreasing = TRUE)
b1wordsfreqdf <- data.frame("Term"=names(head(b1wordsfreq,30)), "Frequency"=head(b1wordsfreq,30))
b1wordsfreqdf$TermOrd <- reorder(b1wordsfreqdf$Term, b1wordsfreqdf$Frequency)
b2wordsfreq <- rowSums(as.matrix(b2words))
b2wordsfreq <- sort(b2wordsfreq, decreasing = TRUE)
b2wordsfreqdf <- data.frame("Term"=names(head(b2wordsfreq,30)), "Frequency"=head(b2wordsfreq,30))
b2wordsfreqdf$TermOrd <- reorder(b2wordsfreqdf$Term, b2wordsfreqdf$Frequency)
b3wordsfreq <- rowSums(as.matrix(b3words))
b3wordsfreq <- sort(b3wordsfreq, decreasing = TRUE)
b3wordsfreqdf <- data.frame("Term"=names(head(b3wordsfreq,30)), "Frequency"=head(b3wordsfreq,30))
b3wordsfreqdf$TermOrd <- reorder(b3wordsfreqdf$Term, b3wordsfreqdf$Frequency)

then the news:

# n1words <- TermDocumentMatrix(ncorpus, control = list(tokenize = unigramTknzr))
# n1words <-  removeSparseTerms(n1words, 0.9)
# n2words <- TermDocumentMatrix(ncorpus, control = list(tokenize = bigramTknzr))
# n2words <-  removeSparseTerms(n2words, 0.999)
n3words <- TermDocumentMatrix(ncorpus, control = list(tokenize = trigramTknzr))
n3words <-  removeSparseTerms(n3words, 0.999)

# n1wordsfreq <- sort(rowSums(as.matrix(n1words)), decreasing = TRUE)
# n1wordsfreqdf <- data.frame("Term"=names(head(n1wordsfreq,30)), "Frequency"=head(n1wordsfreq,30))
# n1wordsfreqdf$TermOrd <- reorder(n1wordsfreqdf$Term, n1wordsfreqdf$Frequency)
# n2wordsfreq <- rowSums(as.matrix(n2words))
# n2wordsfreq <- sort(n2wordsfreq, decreasing = TRUE)
# n2wordsfreqdf <- data.frame("Term"=names(head(n2wordsfreq,30)), "Frequency"=head(n2wordsfreq,30))
# n2wordsfreqdf$TermOrd <- reorder(n2wordsfreqdf$Term, n2wordsfreqdf$Frequency)
n3wordsfreq <- rowSums(as.matrix(n3words))
n3wordsfreq <- sort(n3wordsfreq, decreasing = TRUE)
n3wordsfreqdf <- data.frame("Term"=names(head(n3wordsfreq,30)), "Frequency"=head(n3wordsfreq,30))
n3wordsfreqdf$TermOrd <- reorder(n3wordsfreqdf$Term, n3wordsfreqdf$Frequency)

and last tweets:

# t1words <- TermDocumentMatrix(tcorpus, control = list(tokenize = unigramTknzr))
# t1words <-  removeSparseTerms(t1words, 0.9)
# t2words <- TermDocumentMatrix(tcorpus, control = list(tokenize = bigramTknzr))
# t2words <-  removeSparseTerms(t2words, 0.999)
t3words <- TermDocumentMatrix(tcorpus, control = list(tokenize = trigramTknzr))
t3words <-  removeSparseTerms(t3words, 0.999)

# t1wordsfreq <- sort(rowSums(as.matrix(t1words)), decreasing = TRUE)
# t1wordsfreqdf <- data.frame("Term"=names(head(t1wordsfreq,30)), "Frequency"=head(t1wordsfreq,30))
# t1wordsfreqdf$TermOrd <- reorder(t1wordsfreqdf$Term, t1wordsfreqdf$Frequency)
# t2wordsfreq <- rowSums(as.matrix(t2words))
# t2wordsfreq <- sort(t2wordsfreq, decreasing = TRUE)
# t2wordsfreqdf <- data.frame("Term"=names(head(t2wordsfreq,30)), "Frequency"=head(t2wordsfreq,30))
# t2wordsfreqdf$TermOrd <- reorder(t2wordsfreqdf$Term, t2wordsfreqdf$Frequency)
t3wordsfreq <- rowSums(as.matrix(t3words))
t3wordsfreq <- sort(t3wordsfreq, decreasing = TRUE)
t3wordsfreqdf <- data.frame("Term"=names(head(t3wordsfreq,30)), "Frequency"=head(t3wordsfreq,30))
t3wordsfreqdf$TermOrd <- reorder(t3wordsfreqdf$Term, t3wordsfreqdf$Frequency)

Data exploration

The frequencies of term in blogs are visualized with histograms, one for each n-gram.

Blogs

bp1 <- ggplot(b1wordsfreqdf, aes(x = TermOrd, y = Frequency)) +
      geom_bar(stat = "identity", color="white", fill="blue") +
      geom_text(data=b1wordsfreqdf,aes(x=TermOrd,y=-2000,label=Frequency),vjust=0, size=3) +
      xlab("Uni-gram Terms") + ylab("Count") + ggtitle("Top 30 Uni-grams of Blogs") +
      theme(plot.title = element_text(lineheight=.8, face="bold")) +
      coord_flip()
bp2 <- ggplot(b2wordsfreqdf, aes(x = TermOrd, y = Frequency)) +
      geom_bar(stat = "identity", color="white", fill="orange") +
      geom_text(data=b2wordsfreqdf,aes(x=TermOrd,y=-100,label=Frequency),vjust=0, size=3) +
      xlab("Bi-gram Terms") + ylab("Count") + ggtitle("Top 30 Bi-grams of Blogs") +
      theme(plot.title = element_text(lineheight=.8, face="bold")) +
      coord_flip()
bp3 <- ggplot(b3wordsfreqdf, aes(x = TermOrd, y = Frequency)) +
      geom_bar(stat = "identity", color="white", fill="red") +
      geom_text(data=b3wordsfreqdf,aes(x=TermOrd,y=-10,label=Frequency),vjust=0, size=3) +
      xlab("Tri-gram Terms") + ylab("Count") + ggtitle("Top 30 Tri-grams of Blogs") +
      theme(plot.title = element_text(lineheight=.8, face="bold")) +
      coord_flip()

Comparison of News and Twitter Tri-grams

The comparison of the tri-grams gives a good insight about the difference of three text sources. The most frequent tri-grams of twitter are significantly more personal than those of the other two sources.

np3 <- ggplot(n3wordsfreqdf, aes(x = TermOrd, y = Frequency)) +
      geom_bar(stat = "identity", color="white", fill="green") +
      geom_text(data=n3wordsfreqdf,aes(x=TermOrd,y=-10,label=Frequency),vjust=0, size=3) +
      xlab("Tri-gram Terms") + ylab("Count") + ggtitle("Top 30 from News") +
      theme(plot.title = element_text(lineheight=.8, face="bold")) +
      coord_flip()
tp3 <- ggplot(t3wordsfreqdf, aes(x = TermOrd, y = Frequency)) +
      geom_bar(stat = "identity", color="white", fill="black") +
      geom_text(data=t3wordsfreqdf,aes(x=TermOrd,y=-10,label=Frequency),vjust=0, size=3) +
      xlab("Tri-gram Terms") + ylab("Count") + ggtitle("Top 30 from Twitter") +
      theme(plot.title = element_text(lineheight=.8, face="bold")) +
      coord_flip()
# multiplot function from cookbook for R
multiplot(np3,tp3, cols=2)

Preview

Based on the n-gram models an algorithm has to be implemented to predict next words in a sentence. It can be based on a lookup table to fasten the execution. The algorithm will search in n-grams with decreasing length until a match is found. If there is no match, a best guess based on word frequency is performed. At last a shiny app is to develop to demonstrate the usage of the algorithm.