Synopsis

This is the Milestone Report for Week 2 of the Capstone Project of the Data Science Specialization offered by Coursera and JHU. In this report we present the exploratory analysis we performed in order to understand, as well as possible, the nature of the data sets that will be used for building the prediction algorithm and the final Shiny app. We show how we downloaded the data sets and loaded them into R, and we present some statistics about the data that will be useful later when building the prediction algorithm and the final app.

Download and load data

We use the link provided in the syllabus of the Capstone project to download the SwiftKey zip file containing the data sets that will be used for our project. We unzip it and use only the files from the English database, as explicitly noted in “Task 1 - Getting and cleaning the data”. The English database consists of three separate text files named en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt.

# Get the directory where the script that is executed is located 
# and set it as working directory
script.dir <- dirname(sys.frame(1)$ofile)

setwd(script.dir)

library(downloader)
suppressMessages(library(sqldf))

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"


# Make sure the data directory exists before downloading into it.
if(!dir.exists("./data")){
     dir.create("./data")
}

if(!file.exists("./data/Coursera-SwiftKey.zip")){
     setwd("./data")
     download(url, dest="Coursera-SwiftKey.zip", mode="wb")
}

if (getwd() != paste(script.dir,"data",sep="/")){
     setwd(paste(script.dir,"data",sep="/"))
}

txtFiles <- unzip("Coursera-SwiftKey.zip",list=TRUE)
txtFiles <- as.vector(t(sqldf("select Name from txtFiles where Name LIKE ('final/en_US/en%')")))

if(!dir.exists("./data/final")){
     unzip("Coursera-SwiftKey.zip",files=txtFiles,overwrite = TRUE)     
} else {
     for(txtFile in txtFiles){
          if(!file.exists(txtFile)){
               unzip("Coursera-SwiftKey.zip",files=txtFile,overwrite = TRUE) 
          }
     }
}

rm(list=ls(all=TRUE))

After downloading the data sets, we load them into R using the readLines() function.

con <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(con,skipNul = TRUE ,warn = FALSE, encoding = "UTF-8")
close(con)

con <- file("./final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(con,skipNul = TRUE, warn = FALSE)
close(con)

con <- file("./final/en_US/en_US.news.txt", "rb")
news <- readLines(con,skipNul = TRUE, warn = FALSE)
close(con)

Preprocessing

The size of each downloaded file in megabytes and its row count are:

File Name            Size        Row Count
en_US.blogs.txt      200.42 MB      899288
en_US.news.txt       196.28 MB     1010242
en_US.twitter.txt    159.36 MB     2360148

The script that was used to find file size and row count can be found in the Appendix (fileStats.R).

Because the files are large, which makes computations on the full data sets slow, we create three separate sample files of 10000 rows each, one for every text file, and use them to build our corpus and do all the preprocessing and calculations. The files are called sample_blogs.txt, sample_news.txt and sample_twitter.txt.

# Fix the random seed so the samples (and the statistics derived from them) are reproducible.
set.seed(12345)

sampleTwitter <- sample(twitter, 10000, replace = FALSE)
sampleBlogs <- sample(blogs, 10000, replace = FALSE)
sampleNews <- sample(news, 10000, replace = FALSE)

repo <- "./final/en_US/sample"

if(!dir.exists(repo)){
     dir.create(repo)
}

# Write each sample to its own file so the corpus reader can pick all three up.
fileConn <- file(paste(repo, "sample_twitter.txt", sep = "/"))
writeLines(sampleTwitter, fileConn)
close(fileConn)

fileConn <- file(paste(repo, "sample_news.txt", sep = "/"))
writeLines(sampleNews, fileConn)
close(fileConn)

fileConn <- file(paste(repo, "sample_blogs.txt", sep = "/"))
writeLines(sampleBlogs, fileConn)
close(fileConn)

We create our corpus from the sample files and perform all the preprocessing needed to cleanse our data. We name it myCorpus.

library(tm)
library(SnowballC)

# I have found a really good list of "bad words" at this link
# (https://github.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words),
# which I've used to cleanse the corpus of them.
con <- file("./profanity.txt", "rb")
profanity <- readLines(con,skipNul = TRUE, warn = FALSE)
close(con)

repo <- "./final/en_US/sample"

myCorpus <- VCorpus(
                     DirSource(repo, pattern = 'txt', encoding = 'UTF-8'),
                     readerControl = list(language = 'en')
                     )

# Custom function used to remove non ASCII characters.
removeNonASCII <- content_transformer(function(x)
     gsub("[^\x20-\x7E]","", x))
# Custom function used to remove URLs.
removeURLs <- content_transformer(function(x)
     gsub("(f|ht)tp(s?):(\\s*?)//(.*)[.][a-z]+(/?)", "", x))


# Data cleansing and transformations.
myCorpus <- tm_map(myCorpus, removeNonASCII)
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removeURLs)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))
myCorpus <- tm_map(myCorpus, removeWords, stopwords("SMART"))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, stripWhitespace)
myCorpus <- tm_map(myCorpus, removeWords, profanity)
myCorpus <- tm_map(myCorpus, stemDocument)
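
As a quick sanity check (a minimal sketch; the exact lines shown depend on the random sample drawn above), we can print a few lines of the first cleansed document to verify that the transformations have been applied:

# Print the first three lines of the first document in the cleaned corpus;
# they should be lower case with no punctuation, numbers or stop words left.
writeLines(head(as.character(myCorpus[[1]]), 3))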

To proceed, we create a document term matrix. This is what we will be using from this point on:

dtm <- DocumentTermMatrix(myCorpus)   
dtm   
## <<DocumentTermMatrix (documents: 3, terms: 34659)>>
## Non-/sparse entries: 51691/52286
## Sparsity           : 50%
## Maximal term length: 74
## Weighting          : term frequency (tf)

We refine our document term matrix by removing sparse terms:

dtms <- removeSparseTerms(dtm, 0.1)    
dtms  
## <<DocumentTermMatrix (documents: 3, terms: 5359)>>
## Non-/sparse entries: 16077/0
## Sparsity           : 0%
## Maximal term length: 13
## Weighting          : term frequency (tf)
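
Another quick way to explore the refined matrix is tm's findFreqTerms() function. As a small sketch (the exact terms returned depend on the sample drawn), the following call lists every term that appears at least 1000 times across the three sample documents:

# Terms in dtms with a total frequency of 1000 or more.
findFreqTerms(dtms, lowfreq = 1000)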

Exploratory analysis

We organize our terms so that they are convenient for the calculations and plotting that we will do. First, we sort all the terms in dtms by frequency in descending order:

freq <- sort(colSums(as.matrix(dtms)), decreasing=TRUE)    

The ten most frequent words are:

head(freq,10)    
##  year  time   day  make  work  good  love peopl  back thing 
##  2244  2219  1719  1649  1422  1404  1386  1366  1199  1148

The ten least frequent words are:

tail(freq,10)    
##      zulkif      zumbar      zunami        zuni      zurich zurichswiss 
##           1           1           1           1           1           1 
##  zwillingen      zydeco     zylroux       zynga 
##           1           1           1           1

We can find the associations (terms that correlate) of the most or least frequent words using the findAssocs() function. For example, the terms correlated with the word “love”, using a correlation limit of 0.99, are the following:

findAssocs(dtms,"love",corlimit = 0.99)
## $love
##      amaz     aveng     blast      bore    classi     coffe      cool 
##      1.00      1.00      1.00      1.00      1.00      1.00      1.00 
##      crap     crazi     dairi       est  facebook    forgot      hell 
##      1.00      1.00      1.00      1.00      1.00      1.00      1.00 
##     hurri      hurt      kiss      ladi       mad    newest      rain 
##      1.00      1.00      1.00      1.00      1.00      1.00      1.00 
##  sandwich     sleep      song    stupid     super      till      wait 
##      1.00      1.00      1.00      1.00      1.00      1.00      1.00 
##     weird    youtub   amateur     awhil     beast     check  daylight 
##      1.00      1.00      0.99      0.99      0.99      0.99      0.99 
##     email      fool     funni     guess      idol multitask   penguin 
##      0.99      0.99      0.99      0.99      0.99      0.99      0.99 
##    pillow      quit     spell       tan      whoa 
##      0.99      0.99      0.99      0.99      0.99

Plotting

We plot the word frequencies in our data. The script used for creating the word frequency plot can be found in the Appendix (barplotCode.R).

We use the wordcloud package to take another look at the most frequent words of our corpus. The script used for creating the wordcloud can be found in the Appendix (wordCloud.R).

We can clearly see that certain words, such as year, time or love, are repeated many times in our corpus, as we have already confirmed in the previous steps.
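
These counts can also be read directly off the freq vector (a minimal check; the numbers below are the ones already shown in the head(freq, 10) output):

# Look up the frequencies of a few of the most prominent words.
freq[c("year", "time", "love")]
## year time love 
## 2244 2219 1386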

Tokenization

We proceed with tokenization in order to create some n-grams, which will help us find the word combinations that occur most often. We create two kinds of n-grams, a two-word (bigram) and a three-word (trigram) model, and in the following plot we show the ten most frequent word combinations of each kind. The script used for creating the n-grams and the n-gram frequency plot can be found in the Appendix (ngrams.R).
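
To illustrate what the tokenizers used in ngrams.R actually produce, here is a minimal standalone sketch (the sentence is made up purely for illustration) applying RWeka's NGramTokenizer() to a short string:

library(RWeka)

# With min = max = 2 the tokenizer returns every consecutive two-word
# combination, e.g. "thanks for", "for the", "the follow".
NGramTokenizer("thanks for the follow", Weka_control(min = 2, max = 2))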

Appendix

fileStats.R

paste("en_US.blogs:",format(round(file.size("./final/en_US/en_US.blogs.txt")/1024^2,2)),"MB",sep=" ")
paste("en_US.news:",format(round(file.size("./final/en_US/en_US.news.txt")/1024^2,2)),"MB",sep=" ")
paste("en_US.twitter:",format(round(file.size("./final/en_US/en_US.twitter.txt")/1024^2,2)),"MB",sep=" ")

paste("en_US.blogs:",length(blogs),sep=" ")
paste("en_US.news:",length(news),sep=" ")
paste("en_US.twitter:",length(twitter),sep=" ")

barplotCode.R

library(ggplot2)
df <- data.frame(word=names(freq),freq=freq)
p <- ggplot(subset(df, freq>500), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1)) 
p <- p + xlab("Word")
p <- p + ylab("Frequency")
p <- p + ggtitle("Words with more than 500 appearences") 
p

wordCloud.R

library(wordcloud)
library(RColorBrewer)

set.seed(12345)

pal <- brewer.pal(8,"Dark2")

tmp<-data.frame(word = names(freq), freq = freq)
row.names(tmp)<-NULL

tmp<-head(tmp,100)

wordcloud(words=tmp$word,freq=tmp$freq,random.order = FALSE, colors=pal)

ngrams.R

library("RWeka")
library("tm")
library("ggplot2")
library("grid")
library("gridExtra")

BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Build bigram and trigram document term matrices directly from the cleaned corpus.
token2 <- DocumentTermMatrix(myCorpus,
                             control = list(weighting = weightTf, 
                                            tokenize = BigramTokenizer))

token3 <- DocumentTermMatrix(myCorpus,
                             control = list(weighting = weightTf, 
                                            tokenize = TrigramTokenizer))

token2freq <-  sort(colSums(as.matrix(token2)),decreasing = TRUE)
two_words <- data.frame(word = names(token2freq), freq = token2freq)
rownames(two_words) <- NULL

token3freq <-  sort(colSums(as.matrix(token3)),decreasing = TRUE)
three_words <- data.frame(word = names(token3freq), freq = token3freq)
rownames(three_words) <- NULL

p2 <- ggplot(head(two_words,10), aes(reorder(word, -freq), freq))
p2 <- p2 + geom_bar(stat="identity")
p2 <- p2 + theme(axis.text.x=element_text(angle=45, hjust=1)) 
p2 <- p2 + xlab("Bigram")
p2 <- p2 + ylab("Frequency")
p2 <- p2 + ggtitle("Top 10 Bigrams Frequency ") 

p3 <- ggplot(head(three_words,10), aes(reorder(word, -freq), freq))
p3 <- p3 + geom_bar(stat="identity")
p3 <- p3 + theme(axis.text.x=element_text(angle=45, hjust=1)) 
p3 <- p3 + xlab("Trigram")
p3 <- p3 + ylab("Frequency")
p3 <- p3 + ggtitle("Top 10 Trigrams Frequency ") 

grid.arrange(p2,p3,ncol=2)