This is the Milestone Report for Week 2 of the Capstone Project of the Data Science Specialization offered by Coursera and JHU. In this report we present the exploratory analysis carried out to understand the nature of the data sets that will be used to build the prediction algorithm and the final Shiny app. We show how we downloaded the data sets, how we loaded them into R, and some statistics about the data that will be useful later when creating the prediction algorithm and the final app.
We use the link provided in the syllabus of the Capstone project to download the SwiftKey zip file containing the data sets that will be used for our project. We unzip it and use only the files from the English database, as explicitly noted in “Task 1 - Getting and cleaning the data”. The English database consists of three separate text files named en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt.
# Get the directory where the script that is executed is located
# and set it as working directory
script.dir <- dirname(sys.frame(1)$ofile)
setwd(script.dir)
library(downloader)
suppressMessages(library(sqldf))
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Download the zip file into ./data if it is not already there
if(!file.exists("./data/Coursera-SwiftKey.zip")){
  if(!dir.exists("./data")){
    dir.create("./data")
  }
  setwd("./data")
  download(url, dest="Coursera-SwiftKey.zip", mode="wb")
}
if(getwd() != paste(script.dir,"data",sep="/")){
  setwd(paste(script.dir,"data",sep="/"))
}
# Keep only the English (en_US) files from the archive
txtFiles <- unzip("Coursera-SwiftKey.zip",list=TRUE)
txtFiles <- as.vector(t(sqldf("select Name from txtFiles where Name LIKE ('final/en_US/en%')")))
# Extract the files only if they are not already present; the working
# directory is now ./data, so the archive unpacks into ./data/final
if(!dir.exists("./final")){
  unzip("Coursera-SwiftKey.zip",files=txtFiles,overwrite = TRUE)
} else {
  for(txtFile in txtFiles){
    if(!file.exists(txtFile)){
      unzip("Coursera-SwiftKey.zip",files=txtFile,overwrite = TRUE)
    }
  }
}
# Clean up the helper objects used for downloading and unzipping
rm(list=ls(all=TRUE))
After downloading the data sets, we load them into R using the readLines() function.
con <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(con,skipNul = TRUE ,warn = FALSE, encoding = "UTF-8")
close(con)
con <- file("./final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(con,skipNul = TRUE, warn = FALSE)
close(con)
con <- file("./final/en_US/en_US.news.txt", "rb")
news <- readLines(con,skipNul = TRUE, warn = FALSE)
close(con)
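As a quick sanity check that the files were read in completely, the line counts can be computed directly (a minimal sketch; the values should match the row counts in the table below):
sapply(list(blogs = blogs, news = news, twitter = twitter), length)  # lines read from each file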
The size of each downloaded file (in megabytes) and its row count are:
| File Name | Size | Row Count |
|---|---|---|
| en_US.blogs.txt | 200.42 MB | 899288 |
| en_US.news.txt | 196.28 MB | 1010242 |
| en_US.twitter.txt | 159.36 MB | 2360148 |
The script that was used to find file size and row count can be found in the Appendix (fileStats.R).
Because the size of the files makes calculations on the full data sets difficult and time consuming, we create three separate sample files of 10000 rows each, one for every text file, and use them to build our corpus and do all the preprocessing and calculations. The files are called sample_blogs.txt, sample_news.txt and sample_twitter.txt.
sampleTwitter <- sample(twitter,10000,replace=FALSE)
sampleBlogs <- sample(blogs,10000,replace=FALSE)
sampleNews <- sample(news,10000,replace=FALSE)
repo <- "./final/en_US/sample"
if(!dir.exists(repo)){
  dir.create(repo)
}
# Write each sample to its own file inside the sample directory
fileConn <- file(paste(repo,"sample_twitter.txt",sep="/"))
writeLines(sampleTwitter, fileConn)
close(fileConn)
fileConn <- file(paste(repo,"sample_news.txt",sep="/"))
writeLines(sampleNews, fileConn)
close(fileConn)
fileConn <- file(paste(repo,"sample_blogs.txt",sep="/"))
writeLines(sampleBlogs, fileConn)
close(fileConn)
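The sampling above is not seeded, so every run produces different sample files. A minimal sketch of a reproducible variant (assumption: any fixed seed works; 12345 simply matches the seed used later for the word cloud):
set.seed(12345)  # fix the RNG state so the same samples can be regenerated
sampleTwitter <- sample(twitter, 10000, replace = FALSE)
sampleBlogs <- sample(blogs, 10000, replace = FALSE)
sampleNews <- sample(news, 10000, replace = FALSE)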
We create our corpus from the sample files and perform all the preprocessing needed to cleanse the data. We name it myCorpus.
library(tm)
library(SnowballC)
# I have found a really good list of "bad words" at this link
# (https://github.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words),
# which I've used to cleanse the corpus of profanity.
con <- file("./profanity.txt", "rb")
profanity <- readLines(con,skipNul = TRUE, warn = FALSE)
close(con)
repo <- "./final/en_US/sample"
myCorpus <- VCorpus(
DirSource(repo, pattern = 'txt', encoding = 'UTF-8'),
readerControl = list(language = 'en')
)
# Custom function used to remove non ASCII characters.
removeNonASCII <- content_transformer(function(x)
gsub("[^\x20-\x7E]","", x))
# Custom function used to remove URLs.
removeURLs <- content_transformer(function(x)
gsub("(f|ht)tp(s?):(\\s*?)//(.*)[.][a-z]+(/?)", "", x))
# Data cleansing and transformations.
myCorpus <- tm_map(myCorpus, removeNonASCII)
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removeURLs)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))
myCorpus <- tm_map(myCorpus, removeWords, stopwords("SMART"))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, stripWhitespace)
myCorpus <- tm_map(myCorpus, removeWords, profanity)
myCorpus <- tm_map(myCorpus, stemDocument)
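To verify that the transformations worked as intended, we can print the first few lines of one cleaned document (a minimal sketch; the document index and number of lines are arbitrary):
writeLines(head(content(myCorpus[[1]]), 3))  # show a few cleaned lines from the first sample file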
To proceed, we create a document-term matrix. This is what we will be working with from this point on:
dtm <- DocumentTermMatrix(myCorpus)
dtm
## <<DocumentTermMatrix (documents: 3, terms: 34659)>>
## Non-/sparse entries: 51691/52286
## Sparsity : 50%
## Maximal term length: 74
## Weighting : term frequency (tf)
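Before trimming the matrix, we can also query it directly for frequent terms; a minimal sketch (the 1000-occurrence threshold is an arbitrary choice for illustration):
findFreqTerms(dtm, lowfreq = 1000)  # terms that occur at least 1000 times in total across the three documents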
We refine our document-term matrix by removing sparse terms, i.e. terms that are missing from at least 10% of the documents (sparse threshold 0.1); with only three documents this keeps only the terms that appear in all three:
dtms <- removeSparseTerms(dtm, 0.1)
dtms
## <<DocumentTermMatrix (documents: 3, terms: 5359)>>
## Non-/sparse entries: 16077/0
## Sparsity : 0%
## Maximal term length: 13
## Weighting : term frequency (tf)
We now organize our terms so that they are useful for the calculations and the plotting that we will do. First, we sort all the terms in dtms by frequency in descending order:
freq <- sort(colSums(as.matrix(dtms)), decreasing=TRUE)
The ten most frequent words are:
head(freq,10)
## year time day make work good love peopl back thing
## 2244 2219 1719 1649 1422 1404 1386 1366 1199 1148
The ten least frequent words are:
tail(freq,10)
## zulkif zumbar zunami zuni zurich zurichswiss
## 1 1 1 1 1 1
## zwillingen zydeco zylroux zynga
## 1 1 1 1
We can find the associations (terms that correlate) of the most or least frequent words using the findAssocs() function. For example, the terms correlated with the word “love”, using a correlation limit of 0.99, are the following (note that these correlations are computed across only three documents, which is why so many terms reach a perfect correlation of 1):
findAssocs(dtms,"love",corlimit = 0.99)
## $love
## amaz aveng blast bore classi coffe cool
## 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## crap crazi dairi est facebook forgot hell
## 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## hurri hurt kiss ladi mad newest rain
## 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## sandwich sleep song stupid super till wait
## 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## weird youtub amateur awhil beast check daylight
## 1.00 1.00 0.99 0.99 0.99 0.99 0.99
## email fool funni guess idol multitask penguin
## 0.99 0.99 0.99 0.99 0.99 0.99 0.99
## pillow quit spell tan whoa
## 0.99 0.99 0.99 0.99 0.99
We plot the word frequencies in our data:
The script used for creating the word frequencies plot can be found in the Appendix (barplotCode.R).
We use the wordcloud package to take another look at the most frequent words in our corpus. The script used for creating the word cloud can be found in the Appendix (wordCloud.R).
We can clearly see that certain words, such as year, time or love, are repeated many times in our corpus, as we have already confirmed in the prior steps.
We proceed with tokenization in order to create some n-grams, which will help us find the word combinations that occur most often. We create two kinds of n-grams, bigrams (two words) and trigrams (three words), and in the following plot we show the ten most frequent word combinations of each kind. The script used for creating the n-grams and the n-gram frequency plot can be found in the Appendix (ngrams.R).
fileStats.R
paste("en_US.blogs:",format(round(file.size("./final/en_US/en_US.blogs.txt")/1024^2,2)),"MB",sep=" ")
paste("en_US.news:",format(round(file.size("./final/en_US/en_US.news.txt")/1024^2,2)),"MB",sep=" ")
paste("en_US.twitter:",format(round(file.size("./final/en_US/en_US.twitter.txt")/1024^2,2)),"MB",sep=" ")
paste("en_US.blogs:",length(blogs),sep=" ")
paste("en_US.news:",length(news),sep=" ")
paste("en_US.twitter:",length(twitter),sep=" ")
barplotCode.R
library(ggplot2)
df <- data.frame(word=names(freq),freq=freq)
p <- ggplot(subset(df, freq>500), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + xlab("Word")
p <- p + ylab("Frequency")
p <- p + ggtitle("Words with more than 500 appearences")
p
wordCloud.R
library(wordcloud)
library(RColorBrewer)
set.seed(12345)
pal <- brewer.pal(8,"Dark2")
tmp<-data.frame(word = names(freq), freq = freq)
row.names(tmp)<-NULL
tmp<-head(tmp,100)
wordcloud(words=tmp$word,freq=tmp$freq,random.order = FALSE, colors=pal)
ngrams.R
library("RWeka")
library("tm")
library("ggplot2")
library("grid")
library("gridExtra")
# Tokenizers for two- and three-word sequences (RWeka)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Build document-term matrices of bigrams and trigrams from the cleaned corpus
token2 <- DocumentTermMatrix(Corpus(VectorSource(myCorpus)),
                             control = list(weighting = weightTf,
                                            tokenize = BigramTokenizer))
token3 <- DocumentTermMatrix(Corpus(VectorSource(myCorpus)),
                             control = list(weighting = weightTf,
                                            tokenize = TrigramTokenizer))
token2freq <- sort(colSums(as.matrix(token2)),decreasing = TRUE)
two_words <- data.frame(word = names(token2freq), freq = token2freq)
rownames(two_words) <- NULL
token3freq <- sort(colSums(as.matrix(token3)),decreasing = TRUE)
three_words <- data.frame(word = names(token3freq), freq = token3freq)
rownames(three_words) <- NULL
p2 <- ggplot(head(two_words,10), aes(reorder(word, -freq), freq))
p2 <- p2 + geom_bar(stat="identity")
p2 <- p2 + theme(axis.text.x=element_text(angle=45, hjust=1))
p2 <- p2 + xlab("Bigram")
p2 <- p2 + ylab("Frequency")
p2 <- p2 + ggtitle("Top 10 Bigrams by Frequency")
p3 <- ggplot(head(three_words,10), aes(reorder(word, -freq), freq))
p3 <- p3 + geom_bar(stat="identity")
p3 <- p3 + theme(axis.text.x=element_text(angle=45, hjust=1))
p3 <- p3 + xlab("Trigram")
p3 <- p3 + ylab("Frequency")
p3 <- p3 + ggtitle("Top 10 Trigrams by Frequency")
grid.arrange(p2,p3,ncol=2)