Introduction

This milestone report is based on exploratory data analysis of the SwiftKey data provided in the context of the Coursera Data Science Capstone. The data consist of three text files containing text from three different sources (blogs, news & twitter).

Basic Document Summary

Set directory

setwd("./1Janus-Doc/DataScience/Capstone/en_US")

Check file size

file.info("./en_US.blogs.txt")$size / (1024*1024)
## [1] 200.4242
file.info("./en_US.news.txt")$size / (1024*1024)
## [1] 196.2775
file.info("./en_US.twitter.txt")$size / (1024*1024)
## [1] 159.3641
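
Since file.info() is vectorized, the three calls above can also be collapsed into one; a minimal sketch:

files <- c("./en_US.blogs.txt", "./en_US.news.txt", "./en_US.twitter.txt")
round(file.info(files)$size / (1024 * 1024), 2)  # sizes in MB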

Sampling the Data

Given the large amount of text and limited computational resources, the data are sampled: 10,000 lines per file are randomly drawn and saved to disk.

twitter <- readLines('./en_US.twitter.txt', encoding = 'UTF-8')
news <- readLines('./en_US.news.txt', encoding = 'UTF-8')
blogs <- readLines('./en_US.blogs.txt', encoding = 'UTF-8')

Required R package: R.utils

library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.0 (2015-02-19) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.19.0 (2015-02-27) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save
## R.utils v2.1.0 (2015-05-27) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
## 
##     timestamp
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings
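
R.utils is loaded chiefly for countLines(); a minimal sketch of how it can report the number of lines per file before sampling (output omitted here):

countLines("./en_US.blogs.txt")
countLines("./en_US.news.txt")
countLines("./en_US.twitter.txt")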

set.seed(39)
sampleTwitter <- twitter[sample(1:length(twitter), 10000)]
sampleNews <- news[sample(1:length(news), 10000)]
sampleBlogs <- blogs[sample(1:length(blogs), 10000)]
sampleData <- c(sampleTwitter, sampleNews, sampleBlogs)
writeLines(sampleData, "./sample1/sampleData.txt")

Remove temporary variables
rm(twitter,news,blogs,sampleTwitter,sampleNews,sampleBlogs,sampleData)

Create and Clean Corpus

Using the tm package, a corpus is created from the sampled data. Subsequently, the following transformations are performed:
- convert to lowercase
- replace the characters /, @ and | with spaces
- remove common punctuation
- remove numbers
- strip whitespace
- remove English stop words
- stemming (Porter's stemmer)

Required R package: tm

library(tm)
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3

cname <- file.path(".", "sample1")
docs <- Corpus(DirSource(cname))

docs <- tm_map(docs, content_transformer(tolower))       # convert to lowercase
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/|@|\\|")                  # replace /, @ and | with spaces
docs <- tm_map(docs, removePunctuation)                   # remove punctuation
docs <- tm_map(docs, removeNumbers)                       # remove numbers
docs <- tm_map(docs, stripWhitespace)                     # collapse repeated whitespace
docs <- tm_map(docs, removeWords, stopwords("english"))   # remove English stop words
docs <- tm_map(docs, stemDocument)                        # Porter stemming (stemmed terms such as "happi" appear in the output below)

N-gram Tokenization

N-gram models are created to explore word frequencies. Using the RWeka package, unigrams, bigrams and trigrams are created.

Required R package: RWeka

library(RWeka)
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unidtm <- DocumentTermMatrix(docs, 
                             control = list(tokenize = UnigramTokenizer))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bidtm <- DocumentTermMatrix(docs, 
                            control = list(tokenize = BigramTokenizer))

TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tridtm <- DocumentTermMatrix(docs, 
                             control = list(tokenize = TrigramTokenizer))
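
The three tokenizers differ only in the n-gram order, so they could also be generated from a single helper; a minimal sketch (makeTokenizer is a hypothetical name, not part of RWeka):

makeTokenizer <- function(n) {
  force(n)  # fix n before the closure is called by DocumentTermMatrix
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}
unidtm <- DocumentTermMatrix(docs, control = list(tokenize = makeTokenizer(1)))
bidtm  <- DocumentTermMatrix(docs, control = list(tokenize = makeTokenizer(2)))
tridtm <- DocumentTermMatrix(docs, control = list(tokenize = makeTokenizer(3)))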

Exploratory Data Analysis

Top Frequencies

Below, you can see the top 5 unigrams, bigrams and trigrams with the highest frequencies.
tm_unifreq <- sort(colSums(as.matrix(unidtm)), decreasing=TRUE)
tm_uniwordfreq <- data.frame(word=names(tm_unifreq), freq=tm_unifreq)
paste("Unigrams - Top 5 highest frequencies")
## [1] "Unigrams - Top 5 highest frequencies"
head(tm_uniwordfreq,5)
##      word freq
## said said 3051
## will will 2897
## one   one 2606
## like like 2354
## just just 2294
tm_bifreq <- sort(colSums(as.matrix(bidtm)), decreasing=TRUE)
tm_biwordfreq <- data.frame(word=names(tm_bifreq), freq=tm_bifreq)
paste("Bigrams - Top 5 highest frequencies")
## [1] "Bigrams - Top 5 highest frequencies"
head(tm_biwordfreq,5)
##                word freq
## last year last year  216
## new york   new york  177
## year ago   year ago  166
## look like look like  153
## right now right now  151
tm_trifreq <- sort(colSums(as.matrix(tridtm)), decreasing=TRUE)
tm_triwordfreq <- data.frame(word=names(tm_trifreq), freq=tm_trifreq)
paste("Trigrams - Top 5 highest frequencies")
## [1] "Trigrams - Top 5 highest frequencies"
head(tm_triwordfreq,5)
##                                    word freq
## happi mother day       happi mother day   21
## cant wait see             cant wait see   19
## new york citi             new york citi   17
## presid barack obama presid barack obama   15
## world war ii               world war ii   15
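
Note that as.matrix() converts the sparse document-term matrix to a dense one, which can exhaust memory on larger samples. The slam package (installed as a dependency of tm) can sum the sparse matrix directly; a sketch of the equivalent unigram computation:

library(slam)
tm_unifreq <- sort(col_sums(unidtm), decreasing = TRUE)  # no dense conversion needed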

Explore Frequencies

Based on the diagrams below, you can explore the N-grams by frequency.
Required R packages: ggplot2, dplyr
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Presentation: Unigrams with frequency > 1000
tm_uniwordfreq %>% 
  filter(freq > 1000) %>%
  ggplot(aes(word,freq)) +
  geom_bar(stat="identity") +
  ggtitle("Unigrams with frequencies > 1000") +
  xlab("Unigrams") + ylab("Frequency") +
  theme(axis.text.x=element_text(angle=45, hjust=1))

Presentation: Bigrams with frequency > 100
tm_biwordfreq %>% 
  filter(freq > 100) %>%
  ggplot(aes(word,freq)) +
  geom_bar(stat="identity") +
  ggtitle("Bigrams with frequencies > 100") +
  xlab("Bigrams") + ylab("Frequency") +
  theme(axis.text.x=element_text(angle=45, hjust=1))

Presentation: Trigrams with frequency > 10
tm_triwordfreq %>% 
  filter(freq > 10) %>%
  ggplot(aes(word,freq)) +
  geom_bar(stat="identity") +
  ggtitle("Trigrams with frequencies > 10") +
  xlab("Trigrams") + ylab("Frequency") +
  theme(axis.text.x=element_text(angle=45, hjust=1))
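
By default ggplot2 orders the bars alphabetically; reorder() sorts them by frequency instead. A variant of the unigram plot (the same change applies to the bigram and trigram plots):

tm_uniwordfreq %>% 
  filter(freq > 1000) %>%
  ggplot(aes(reorder(word, -freq), freq)) +
  geom_bar(stat="identity") +
  ggtitle("Unigrams with frequencies > 1000, sorted") +
  xlab("Unigrams") + ylab("Frequency") +
  theme(axis.text.x=element_text(angle=45, hjust=1))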

Required R package: wordcloud
library(wordcloud)
## Loading required package: RColorBrewer
Presentation: Wordcloud Top 200 Unigrams
set.seed(39)
wordcloud(names(tm_unifreq), tm_unifreq, max.words=200, scale=c(5, .5), colors=brewer.pal(6, "Dark2"))

Presentation: Wordcloud Top 200 Bigrams
wordcloud(names(tm_bifreq), tm_bifreq, max.words=200, scale=c(5, .5), colors=brewer.pal(6, "Dark2"))

Presentation: Wordcloud Top 200 Trigrams
wordcloud(names(tm_trifreq), tm_trifreq, max.words=200, scale=c(5, .5), colors=brewer.pal(6, "Dark2"))