Swiftkey Dataset Exploratory Analysis

Synopsis

The objective of this report is to present some simple exploratory analysis of the Swiftkey dataset and to outline the strategy used to build the final predictive model, algorithm and app.
The raw zip dataset can be downloaded here.
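For reproducibility, the download and extraction can be scripted as well. The sketch below is only an illustration: the URL is an assumed Coursera mirror of the Swiftkey zip and should be replaced by the actual link if it differs.

# Minimal sketch: fetch and unpack the raw data (URL is an assumed mirror, not taken from this report)
zipUrl  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFile)) {
  download.file(zipUrl, destfile = zipFile, mode = "wb")
}
if (!dir.exists("final")) {
  unzip(zipFile)   # unpacks a "final" folder with one sub-folder per language (e.g. en_US)
}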

Data Processing

The data comes from Swiftkey and contains three extractions for each of four languages. The extractions are from:
* Blogs
* News
* Twitter

The raw dataset contains the following basic information:

library("tm")
## Loading required package: NLP
setwd("C:\\Users\\lc43922\\Coursera_Final_Project\\final")

blog_us <- readLines("en_US/en_US.blogs.txt", warn = FALSE)
news_us <- readLines("en_US/en_US.news.txt", warn = FALSE)
twitter_us <- readLines("en_US/en_US.twitter.txt", warn = FALSE)
blog_us <- iconv(blog_us, "latin1", "ASCII", sub="")
news_us <- iconv(news_us, "latin1", "ASCII", sub="")
twitter_us <- iconv(twitter_us, "latin1", "ASCII", sub="")


# do some simple exploratory analysis
# line count
blogsLineCount <- length(blog_us)
newsLineCount <- length(news_us)
twitterLineCount <- length(twitter_us)
# approximate word count: one more word than the number of non-word separators on each line
blogsWordCount <- sum(sapply(gregexpr("\\W+", blog_us), length) + 1)
newsWordCount <- sum(sapply(gregexpr("\\W+", news_us), length) + 1)
twitterWordCount <- sum(sapply(gregexpr("\\W+", twitter_us), length) + 1)


exploratoryAnalysis <- data.frame(File = c("Blogs","News","Twitter"),
                                  Line_Count = c(blogsLineCount, newsLineCount, twitterLineCount),
                                  Word_Count = c(blogsWordCount, newsWordCount, twitterWordCount))
exploratoryAnalysis
##      File Line_Count Word_Count
## 1   Blogs     899288   38704754
## 2    News      77259    2815456
## 3 Twitter    2360148   32766379

Clean the data and tokenize

To transform the raw dataset into individual word tokens we must tokenize it. We will do this with the tm package (plus RWeka) and construct n-grams, where n is the number of grouped words. For example, for the string “THIS IS A TEST”, the 1-grams are “THIS”, “IS”, “A”, “TEST”; the 2-grams are “THIS IS”, “IS A”, “A TEST”; and so on.
Before this transformation we must clean the dataset: lower-case the text, remove punctuation and numbers, strip extra whitespace, and so on.
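As a quick illustration of this tokenization (a minimal sketch; it assumes the RWeka package and its Java dependency are installed), the NGramTokenizer used later in this report can be applied directly to the example string:

library("RWeka")
# 1-grams of the example string: expected "THIS" "IS" "A" "TEST"
NGramTokenizer("THIS IS A TEST", Weka_control(min = 1, max = 1))
# 2-grams of the example string: expected "THIS IS" "IS A" "A TEST"
NGramTokenizer("THIS IS A TEST", Weka_control(min = 2, max = 2))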

To speed up the things we will get a sample of 1% of the three datasets (blogs, news and twitter).

library("RWeka")
# lower-case the text, remove punctuation and numbers, strip extra whitespace
dealWithWords <- function(charInput){
  # charInput --> large character vector
  charInputVectorized <- VCorpus(VectorSource(charInput))
  charInputVectorized <- tm_map(charInputVectorized, content_transformer(tolower))
  charInputVectorized <- tm_map(charInputVectorized, removePunctuation)
  charInputVectorized <- tm_map(charInputVectorized, removeNumbers)
  charInputVectorized <- tm_map(charInputVectorized, stripWhitespace)
  charInputVectorized
}

nGramMaker1 <- function(charVectorized, n = 1){
  # charVectorized --> a tm document / character vector to tokenize
  # n              --> the size of the n-gram
  NGramTokenizer(charVectorized, Weka_control(min = n, max = n))
}

nGramMaker2 <- function(charVectorized, n = 2){
  # charVectorized --> a tm document / character vector to tokenize
  # n              --> the size of the n-gram
  NGramTokenizer(charVectorized, Weka_control(min = n, max = n))
}

nGramMaker3 <- function(charVectorized, n = 3){
  # charVectorized --> a tm document / character vector to tokenize
  # n              --> the size of the n-gram
  NGramTokenizer(charVectorized, Weka_control(min = n, max = n))
}

wordCountVector <- function(vector, searchOn){
  # this function receives a vector of words or n-grams and counts, for each one,
  # how many lines of "searchOn" contain it
  # vector   --> a vector of words or n-grams (already tokenized)
  # searchOn --> character vector (the raw sample) in which to count the occurrences
  # return   --> a data frame with 2 columns: word and ocurrencies
  wordCountDF <- data.frame(word = character(length(vector)), 
                            ocurrencies = integer(length(vector)), stringsAsFactors = FALSE)
  for (i in 1:length(vector)){
    wordVector <- vector[i]
    wordCountDF$word[i] <- wordVector
    # match the token as a whole word (\< and \> are word boundaries)
    pattern <- paste("\\<", wordVector, "\\>", sep = '')
    wordCountDF$ocurrencies[i] <- length(grep(pattern, searchOn))
  }
  return(wordCountDF)
}
set.seed(999)
sampleEnglishOnly <- c(sample(blog_us, length(blog_us) * 0.01),
                       sample(news_us, length(news_us) * 0.01),
                       sample(twitter_us, length(twitter_us) * 0.01))
englishWordsVectorized <- dealWithWords(sampleEnglishOnly)
gram1 <- TermDocumentMatrix(englishWordsVectorized, control = list(tokenize = nGramMaker1))
gram2 <- TermDocumentMatrix(englishWordsVectorized, control = list(tokenize = nGramMaker2))
gram3 <- TermDocumentMatrix(englishWordsVectorized, control = list(tokenize = nGramMaker3))
# keep only terms that appear at least 100 times
gram1Filtered <- findFreqTerms(gram1, lowfreq = 100)
gram2Filtered <- findFreqTerms(gram2, lowfreq = 100)
gram3Filtered <- findFreqTerms(gram3, lowfreq = 100)

wordCountGram1 <- wordCountVector(gram1Filtered, sampleEnglishOnly)
wordCountGram2 <- wordCountVector(gram2Filtered, sampleEnglishOnly)
wordCountGram3 <- wordCountVector(gram3Filtered, sampleEnglishOnly)
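As a quick sanity check, and as a faster alternative to the grep-based counting above, the frequencies can also be read straight from the term-document matrices. This is only a sketch and assumes the slam package (a dependency of tm) is available; note that it counts total occurrences in the cleaned corpus, so the numbers will not match the grep-based line counts exactly.

library(slam)
# total frequency of each 1-gram across the sampled corpus, taken directly from the TDM
gram1Freq <- sort(slam::row_sums(gram1), decreasing = TRUE)
head(gram1Freq, 10)   # the ten most frequent unigrams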

Plots

We can make some plots to visualize the distribution and frequencies of the tokens found.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
wordCountGram1Top10 <- head(arrange(wordCountGram1, desc(ocurrencies)), n = 10)
wordCountGram2Top10 <- head(arrange(wordCountGram2, desc(ocurrencies)), n = 10)
wordCountGram3Top10 <- head(arrange(wordCountGram3, desc(ocurrencies)), n = 10)
#construct the plot using ggplot2
g1 <- ggplot(wordCountGram1Top10, aes(x=reorder(word, -ocurrencies),y=ocurrencies)) + 
        geom_bar(stat="identity", fill="blue") + 
        ggtitle("Unigram") + 
        xlab("Unigrams") + ylab("Count") + 
        theme(axis.text.x=element_text(angle=90, hjust=1))
g2 <- ggplot(wordCountGram2Top10, aes(x=reorder(word, -ocurrencies),y=ocurrencies)) + 
        geom_bar(stat="identity", fill="red") + 
        ggtitle("Bigram") + 
        xlab("Bigrams") + ylab("Count") + 
        theme(axis.text.x=element_text(angle=90, hjust=1))
g3 <- ggplot(wordCountGram3Top10, aes(x=reorder(word, -ocurrencies),y=ocurrencies)) + 
        geom_bar(stat="identity", fill="black") + 
        ggtitle("Trigram") + 
        xlab("Trigrams") + ylab("Count") + 
        theme(axis.text.x=element_text(angle=90, hjust=1))
gridExtra::grid.arrange(g1, g2, g3, ncol = 3)

Next Steps

Now we are ready to develop the model itself, using the tokens and the corpus we have already built.
Once the model is in place, we will develop the data product as a Shiny application and publish it for everyone to use.
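As a rough illustration of where this is heading (a minimal sketch only, not the final algorithm), a simple frequency lookup over the n-gram tables built above can already suggest a next word. The predictNextWord helper below is hypothetical and assumes the wordCountGram2 data frame from this report; the real model would likely back off from trigrams to bigrams to unigrams when no match is found.

# Hypothetical sketch: suggest a next word from the bigram table built above.
# wordCountGram2 is assumed to hold bigrams in "word" and their counts in "ocurrencies".
predictNextWord <- function(lastWord, bigramCounts = wordCountGram2){
  prefix     <- paste0("^", tolower(lastWord), " ")
  candidates <- bigramCounts[grepl(prefix, bigramCounts$word), ]
  if (nrow(candidates) == 0) return(NA_character_)
  best <- candidates$word[which.max(candidates$ocurrencies)]
  # keep only the second word of the best-scoring bigram
  strsplit(best, " ")[[1]][2]
}
predictNextWord("in")   # might return "the", depending on the sample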