In this report the data set is downloaded, loaded and analyzed, and a basic report of summary statistics is presented, using tables and plots to illustrate important findings of the data set.
The SwiftKey zip file contains the data sets used for this project.
The data was downloaded from the link below:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
The files were unzipped and stored in a local directory. Only the English files are used in this project.
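The download and extraction steps can be reproduced with a short sketch like the one below (a minimal sketch; the local paths and the download mode are assumptions, not part of the original run):
# Download and unzip the SwiftKey data set (run once; paths are illustrative)
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("./data/Coursera-SwiftKey.zip")) {
  dir.create("./data", showWarnings = FALSE)
  download.file(url, destfile = "./data/Coursera-SwiftKey.zip", mode = "wb")
  unzip("./data/Coursera-SwiftKey.zip", exdir = "./data")
}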
There are three separate text files named en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt.
File statistics: size in MB, line count and word count of each file:
# File size in MB
blogs.size <- format(round(file.size("./data/en_US/en_US.blogs.txt")/1024^2,1))
news.size <- format(round(file.size("./data/en_US/en_US.news.txt")/1024^2,1))
twitter.size <- format(round(file.size("./data/en_US/en_US.twitter.txt")/1024^2,1))
blogs.file <- file("./data/en_US/en_US.blogs.txt", "rb")
news.file <- file("./data/en_US/en_US.news.txt", "rb")
twitter.file <- file("./data/en_US/en_US.twitter.txt", "rb")
# Loading
blogs <- readLines(blogs.file, skipNul = TRUE, encoding = "UTF-8")
news <- readLines(news.file, skipNul = TRUE, encoding = "UTF-8")
twitter <- readLines(twitter.file, skipNul = TRUE, encoding = "UTF-8")
# Number of lines
blogs.len <- length(blogs)
news.len <- length(news)
twitter.len <- length(twitter)
# Number of words
blogs.wcount <- length(unlist(strsplit(blogs, " ")))
news.wcount <- length(unlist(strsplit(news, " ")))
twitter.wcount <- length(unlist(strsplit(twitter, " ")))
paste("en_US.blogs.txt size:", blogs.size,"MB | row count: ", blogs.len, "lines | word count: ", blogs.wcount, "words")
## [1] "en_US.blogs.txt size: 200.4 MB | row count: 899288 lines | word count: 37334131 words"
paste("en_US.news.txt size:", news.size,"MB | row count: ", news.len, "lines | word count: ", news.wcount, "words")
## [1] "en_US.news.txt size: 196.3 MB | row count: 1010242 lines | word count: 34372530 words"
paste("en_US.twitter.txt size:", twitter.size,"MB | row count: ", twitter.len, "lines | word count: ", twitter.wcount, "words")
## [1] "en_US.twitter.txt size: 159.4 MB | row count: 2360148 lines | word count: 30373583 words"
close(blogs.file); rm(blogs.file)
close(twitter.file); rm(twitter.file)
close(news.file); rm(news.file)
For this analysis, a 0.5% random sample was drawn from each file.
# Data samples
set.seed(1970)
blogs.sample <- sample(blogs,blogs.len*0.005,replace = FALSE)
news.sample <- sample(news,news.len*0.005,replace = FALSE)
twitter.sample <- sample(twitter,twitter.len*0.005,replace = FALSE)
# Combining data samples
sampleData <- c(blogs.sample,news.sample,twitter.sample)
# Remove objects
rm(blogs, news, twitter)
The sample data is cleaned by removing invalid characters, converting the text to lowercase, stripping extra whitespace, and removing punctuation, numbers and English stop words; the remaining words are then stemmed.
library(NLP)
library(tm)
corpus <- Corpus(VectorSource(sampleData))
rm.spchar <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, rm.spchar, "\"|/|@|\\|")
corpus <- tm_map(corpus, rm.spchar, "[^[:graph:]]")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, stemDocument)
rm(sampleData, twitter.sample, blogs.sample, news.sample)
library("RWeka")
tokenizer.uni <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
dtm.uni <- DocumentTermMatrix(corpus, control = list(tokenize = tokenizer.uni))
dtms.uni <- removeSparseTerms(dtm.uni, 0.99)
freq.uni <- sort(colSums(as.matrix(dtms.uni)), decreasing = TRUE)
barplot(freq.uni[1:20],ylab = "Frequency",las = 2, main = "Top 20 Unigrams")
tokenizer.bi <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm.bi <- DocumentTermMatrix(corpus, control = list(tokenize = tokenizer.bi))
dtms.bi <- removeSparseTerms(dtm.bi, 0.9999)
freq.bi <- sort(colSums(as.matrix(dtms.bi)), decreasing = TRUE)
barplot(freq.bi[1:20],ylab = "Frequency",las = 2, main = "Top 20 Bigrams")
tokenizer.tri <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtm.tri <- DocumentTermMatrix(corpus, control = list(tokenize = tokenizer.tri))
dtms.tri <- removeSparseTerms(dtm.tri, 0.9999)
freq.tri <- sort(colSums(as.matrix(dtms.tri)), decreasing = TRUE)
# Top 20 most common trigrams
par(mar = c(9,4,2,1))
barplot(freq.tri[1:20],ylab = "Frequency",las = 2, main = "Top 20 Trigrams")
library(RColorBrewer)
library(wordcloud)
set.seed(1970)
wordcloud(words = names(freq.uni),freq = freq.uni, max.words = 30, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
Future plans involve further data cleansing, a better understanding of what constitutes a good sample size, and splitting the data into training and test sets to choose the most efficient predictive model. In addition, a Shiny application will be built.
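As a first step toward model selection, the sampled data could be split into training and test sets along these lines (a minimal sketch; the 80/20 ratio and the object names are assumptions, and it assumes the combined sample sampleData is still in memory, i.e. the split is done before it is removed):
# Illustrative 80/20 train/test split of the sampled lines
set.seed(1970)
train.idx <- sample(seq_along(sampleData), size = floor(0.8 * length(sampleData)))
train.data <- sampleData[train.idx]
test.data  <- sampleData[-train.idx]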