1. Executive Summary

The goal of this project is to do an exploratory data analysis on text files as part of Week 2 activities from Data Science Specialization SwiftKey Capstone. Data for the analysis can be downloaded from the link below:

https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

2. Preparing Environment

2.1. Loading Libraries

Loading required packages:

set.seed(500)
library(ggplot2)
library(knitr)
library(RWeka)
library(SnowballC)
library(tm)
library(wordcloud)

Complementary information:

sessionInfo()

## R version 3.4.1 (2017-06-30)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 15063)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Portuguese_Brazil.1252  LC_CTYPE=Portuguese_Brazil.1252   
## [3] LC_MONETARY=Portuguese_Brazil.1252 LC_NUMERIC=C                      
## [5] LC_TIME=Portuguese_Brazil.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] wordcloud_2.5        RColorBrewer_1.1-2   tm_0.7-1            
## [4] NLP_0.1-11           SnowballC_0.5.1      RWeka_0.4-34        
## [7] knitr_1.17           ggplot2_2.2.1        RevoUtilsMath_10.0.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.12      magrittr_1.5      RWekajars_3.9.1-3
##  [4] munsell_0.4.3     colorspace_1.3-2  rlang_0.1.2      
##  [7] stringr_1.2.0     plyr_1.8.4        tools_3.4.1      
## [10] parallel_3.4.1    grid_3.4.1        gtable_0.2.0     
## [13] htmltools_0.3.6   yaml_2.1.14       lazyeval_0.2.0   
## [16] rprojroot_1.2     digest_0.6.12     tibble_1.3.4     
## [19] rJava_0.9-8       slam_0.1-40       evaluate_0.10.1  
## [22] rmarkdown_1.6     stringi_1.1.5     compiler_3.4.1   
## [25] RevoUtils_10.0.5  scales_0.5.0      backports_1.1.0

2.2. Loading Datasets

# Read text files
Blogs <- readLines("./source/en_US.blogs.txt")
News <- readLines("./source/en_US.news.txt")
Twitter <- readLines("./source/en_US.twitter.txt")

2.2.1. Basic summaries of the three files

Blogs_Summary <- c(sum(nchar(Blogs)), 
                   length(unlist(strsplit(Blogs, " "))),
                   format(object.size(Blogs), units = "Mb"))

News_Summary <- c(sum(nchar(News)), 
                  length(unlist(strsplit(News, " "))), 
                  format(object.size(News), units = "Mb"))

Twitter_Summary <- c(sum(nchar(Twitter)), 
                  length(unlist(strsplit(Twitter, " "))), 
                  format(object.size(Twitter), units = "Mb"))

var_names <- c("Characters", "Words", "Size")
summary_files <- data.frame(Blogs_Summary, News_Summary, Twitter_Summary, row.names = var_names)
names(summary_files) <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
kable(summary_files, align = "c")

	en_US.blogs.txt	en_US.news.txt	en_US.twitter.txt
Characters	208361438	15683765	162384825
Words	37334131	2643969	30373543
Size	248.5 Mb	19.2 Mb	301.4 Mb

2.3. Preparing Data

2.3.1. Sampling and Corpus

Since the source files are large, a sample will be taken from each one to do the analysis:

Sample_Text <- rbind(   sample(Blogs,10000),
                                  sample(News, 10000),
                                  sample(Twitter, 10000))

# Delete no longer needed large data
rm(Blogs, News, Twitter)

Now create a corpus (collection of text documents) from the sample texts:

Corpus_ST <- Corpus(VectorSource(Sample_Text))

2.3.2. Clean and prep data for analysis

Corpus_ST <- tm_map(Corpus_ST, removeWords, stopwords("english"))
Corpus_ST <- tm_map(Corpus_ST, removePunctuation) 
Corpus_ST <- tm_map(Corpus_ST, removeNumbers) 
Corpus_ST <- tm_map(Corpus_ST, stripWhitespace)
Corpus_ST <- tm_map(Corpus_ST, tolower)
Corpus_ST <- tm_map(Corpus_ST, stemDocument)

3. Exploratory Data Analysis

3.1. Finding n-grams

# Function for tokenizing the Corpus
f_tokenizer <- function (corpus, i) {
  temp <- c()
  ngram <-c()
    temp <- NGramTokenizer(corpus, Weka_control(min=i,max=i))
    ngram <- data.frame(table(temp))
    return(ngram)
}

# Find n-grams
ngram_US_2 <- f_tokenizer(Corpus_ST, 2)
ngram_US_4 <- f_tokenizer(Corpus_ST, 4)

3.1.1. Most used sequences of 2 and 4 words

ngram_US_2 <- ngram_US_2[order(ngram_US_2$Freq, decreasing = TRUE),]
ngram_US_4 <- ngram_US_4[order(ngram_US_4$Freq, decreasing = TRUE),]

head(ngram_US_2, 10)

##             temp Freq
## 168895   i think  543
## 168103    i know  394
## 168164    i love  326
## 169016    i want  314
## 167449     i can  308
## 169058    i will  273
## 168083    i just  236
## 194669 last year  231
## 402541  year ago  186
## 168141    i like  175

head(ngram_US_4, 10)

##                       temp Freq
## 283048         me me me me   36
## 207375       i feel like i   16
## 479751 ugli ugli ugli ugli   14
## 207482       i felt like i    7
## 209752       i know i know    7
## 214658       i think i can    7
## 451947   the new york time    7
## 206733    i donâ<U+0080><U+0099>t know i    5
## 214729       i think im go    5
## 208866        i hope i can    4

3.1.2. Plot most used sequences of 2 words

Bigrams <- ngram_US_2[order(ngram_US_2$Freq,decreasing = TRUE),]
colnames(Bigrams)<-c("Bigram","Frequency" )
Bigrams<- Bigrams[1:10,]

barplot(Bigrams$Frequency, las = 2, 
        names.arg = Bigrams$Bigram,
        col ="lightgreen", main ="Top 10 Bigrams",
        ylab = "Frequency")

3.2.3. Plot most used sequences of 4 words

Quadgrams <- ngram_US_4[order(ngram_US_4$Freq,decreasing = TRUE),]
colnames(Quadgrams)<-c("Quadgram","Frequency" )
Quadgrams<- Quadgrams[1:10,]

barplot(Quadgrams$Frequency, las = 2, 
        names.arg = Quadgrams[1:10,]$Quadgram,
        col ="lightblue", main ="Top 10 Quadgrams",
        ylab = "Frequency")

3.2. Most Common Words

3.2.1. Top 50 words used in the texts

Matrix_US <- DocumentTermMatrix(Corpus_ST)
Matrix_US <- removeSparseTerms(Matrix_US, 0.99)
frequency <- colSums(as.matrix(Matrix_US))
order_freq <- order(frequency, decreasing=TRUE)
frequency[head(order_freq,50)]

##    the    one   will   said    get   like   just   time    can   year 
##   4988   2854   2848   2838   2282   2245   2225   2182   2093   2037 
##   make    day    new   work   know    now   good   love    say  peopl 
##   1775   1641   1547   1528   1418   1359   1352   1337   1311   1302 
##   want  think   also    use    but   look  first    see  thing   back 
##   1297   1277   1267   1244   1199   1190   1186   1186   1156   1150 
##    two    and   need   come   last   take   even    way   much   this 
##   1147   1142   1127   1126   1124   1086   1072   1057    957    956 
##   week  state  start realli   well  right  still  great   play   game 
##    924    919    918    910    904    872    864    823    818    816

3.2.2. Word Cloud

colors = c("blue", "red", "orange", "green")
wordcloud(names(frequency), frequency, max.words=50, min.freq=2, colors=colors)

4. Future Actions

My goal for the eventual app and algorithm is to create a “Shiny version” of a word prediction/completios apps available for cell phones.

Data Science Capstone - Week 2 Milestone - Exploratory Data Analysis on Text Files

Leandro Freitas

10/26/2017