The objective of this report is to present some simple exploratory analysis of the data and to outline the strategy for constructing the final predictive algorithm and model app.
The raw zip dataset can be downloaded here.
The data comes from SwiftKey and contains three text extractions for each of four languages. The extractions come from:
* Blogs
* News
* Twitter
The raw dataset has the following basic characteristics:
library("tm")
## Loading required package: NLP
setwd("C:\\Users\\lc43922\\Coursera_Final_Project\\final")
# read the three English (en_US) files
blog_us <- readLines("en_US/en_US.blogs.txt", warn = FALSE)
news_us <- readLines("en_US/en_US.news.txt", warn = FALSE)
twitter_us <- readLines("en_US/en_US.twitter.txt", warn = FALSE)
# drop characters that cannot be converted to ASCII
blog_us <- iconv(blog_us, "latin1", "ASCII", sub="")
news_us <- iconv(news_us, "latin1", "ASCII", sub="")
twitter_us <- iconv(twitter_us, "latin1", "ASCII", sub="")
# do some simple exploratory analysis
# line count
blogsLineCount <- length(blog_us)
newsLineCount <- length(news_us)
twitterLineCount <- length(twitter_us)
# approximate word count: number of non-word character runs plus one
blogsWordCount <- sum(sapply(gregexpr("\\W+", blog_us), length) + 1)
newsWordCount <- sum(sapply(gregexpr("\\W+", news_us), length) + 1)
twitterWordCount <- sum(sapply(gregexpr("\\W+", twitter_us), length) + 1)
exploratoryAnalysis <- data.frame(File = c("Blogs","News","Twitter"),
Line_Count = c(blogsLineCount, newsLineCount, twitterLineCount),
Word_Count = c(blogsWordCount, newsWordCount, twitterWordCount))
exploratoryAnalysis
## File Line_Count Word_Count
## 1 Blogs 899288 38704754
## 2 News 77259 2815456
## 3 Twitter 2360148 32766379
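Note that these word counts are only rough approximations: the expression counts runs of non-word characters and adds one, so a line that ends in punctuation is counted slightly high. A toy check (not part of the corpus) illustrates this:
# "hello, world!" contains 2 words, but two non-word runs (", " and "!")
# are found, so the approximation below reports 3
sapply(gregexpr("\\W+", "hello, world!"), length) + 1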
To transform the raw dataset into separate word variables we must tokenize it. We will do this with the tm and RWeka libraries and construct n-grams, where n is the number of grouped words. For example, for the string “THIS IS A TEST” the 1-grams are “THIS”, “IS”, “A”, “TEST”; the 2-grams are “THIS IS”, “IS A”, “A TEST”; and so on.
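As a quick standalone illustration (it only assumes the RWeka package that is loaded further below), the same tokenizer used later in the pipeline reproduces these n-grams from the example string:
library("RWeka")
# unigrams: "THIS", "IS", "A", "TEST"
NGramTokenizer("THIS IS A TEST", Weka_control(min = 1, max = 1))
# bigrams: "THIS IS", "IS A", "A TEST"
NGramTokenizer("THIS IS A TEST", Weka_control(min = 2, max = 2))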
Before this transformation we must clean the dataset in order to remove punctuation, replace non-English characters, strip extra whitespace, and so on.
To speed things up we will take a 1% sample of each of the three datasets (blogs, news, and Twitter).
library("RWeka")
# remove punctuation, convert to lower case, remove numbers, strip whitespace ...
dealWithWords <- function(charInput){
  # charInput --> character vector with the raw text lines
  charInputVectorized <- VCorpus(VectorSource(charInput))
  # base functions such as tolower must be wrapped in content_transformer()
  # so that the corpus elements stay PlainTextDocument objects
  charInputVectorized <- tm_map(charInputVectorized, content_transformer(tolower))
  charInputVectorized <- tm_map(charInputVectorized, removePunctuation)
  charInputVectorized <- tm_map(charInputVectorized, removeNumbers)
  charInputVectorized <- tm_map(charInputVectorized, stripWhitespace)
  return(charInputVectorized)
}
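A quick sanity check on a made-up two-element vector (hypothetical strings, not part of the corpus) shows what the cleaning produces:
# hypothetical toy input to verify the cleaning steps
cleanedToy <- dealWithWords(c("Hello, World!! 123", "  Another   LINE...  "))
content(cleanedToy[[1]])  # lower-cased, punctuation and numbers removed
content(cleanedToy[[2]])  # runs of whitespace collapsed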
nGramMaker1 <- function(charVectorized, n = 1){
  # charVectorized --> the text to tokenize
  # n --> the size of the n-gram
  NGramTokenizer(charVectorized, Weka_control(min = n, max = n))
}
nGramMaker2 <- function(charVectorized, n = 2){
  # charVectorized --> the text to tokenize
  # n --> the size of the n-gram
  NGramTokenizer(charVectorized, Weka_control(min = n, max = n))
}
nGramMaker3 <- function(charVectorized, n = 3){
  # charVectorized --> the text to tokenize
  # n --> the size of the n-gram
  NGramTokenizer(charVectorized, Weka_control(min = n, max = n))
}
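Three nearly identical wrappers are defined because the tokenize control option of TermDocumentMatrix expects a function that takes a single argument (the document), so the n-gram size has to be fixed through the default value of n in each wrapper.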
wordCountVector <- function(vector, searchOn){
  # this function receives a vector of words or n-grams and counts the occurrences in searchOn
  # vector --> a vector of words or n-grams (already tokenized)
  # searchOn --> character vector of text lines in which to search for each term
  # return --> a data frame with 2 columns: word and ocurrencies
  # note: grep() counts the number of lines that contain the term at least once
  wordCountDF <- data.frame(word = character(length(vector)),
                            ocurrencies = integer(length(vector)), stringsAsFactors = FALSE)
  for (i in 1:length(vector)){
    wordVector <- vector[i]
    wordCountDF$word[i] <- wordVector
    numOcurrences <- length(grep(paste("\\<", wordVector, "\\>", sep = ''), searchOn))
    wordCountDF$ocurrencies[i] <- numOcurrences
  }
  return(wordCountDF)
}
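A toy call (hypothetical strings, not the sampled corpus) shows the shape of the result; the counts are the number of lines containing each term:
# hypothetical example: two terms searched in three short lines
toyLines <- c("this is a test", "a test of the test", "nothing here")
wordCountVector(c("test", "of the"), toyLines)
# "test" appears in 2 of the 3 lines, "of the" in 1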
set.seed(999)
sampleEnglishOnly <- c(sample(blog_us, length(blog_us) * 0.01),
sample(news_us, length(news_us) * 0.01),
sample(twitter_us, length(twitter_us) * 0.01))
englishWordsVectorized <- dealWithWords(sampleEnglishOnly)
gram1 <- TermDocumentMatrix(englishWordsVectorized, control = list(tokenize = nGramMaker1))
gram2 <- TermDocumentMatrix(englishWordsVectorized, control = list(tokenize = nGramMaker2))
gram3 <- TermDocumentMatrix(englishWordsVectorized, control = list(tokenize = nGramMaker3))
# keep only terms that appear at least 100 times
gram1Filtered <- findFreqTerms(gram1, lowfreq = 100)
gram2Filtered <- findFreqTerms(gram2, lowfreq = 100)
gram3Filtered <- findFreqTerms(gram3, lowfreq = 100)
wordCountGram1 <- wordCountVector(gram1Filtered, sampleEnglishOnly)
wordCountGram2 <- wordCountVector(gram2Filtered, sampleEnglishOnly)
wordCountGram3 <- wordCountVector(gram3Filtered, sampleEnglishOnly)
We can make some plots to visualize the distribution and frequencies of the tokens found.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
wordCountGram1Top10 <- head(arrange(wordCountGram1, desc(ocurrencies)), n = 10)
wordCountGram2Top10 <- head(arrange(wordCountGram2, desc(ocurrencies)), n = 10)
wordCountGram3Top10 <- head(arrange(wordCountGram3, desc(ocurrencies)), n = 10)
# construct the plots using ggplot2
g1 <- ggplot(wordCountGram1Top10, aes(x=reorder(word, -ocurrencies),y=ocurrencies)) +
geom_bar(stat="identity", fill="blue") +
ggtitle("Unigram") +
xlab("Unigrams") + ylab("Count") +
theme(axis.text.x=element_text(angle=90, hjust=1))
g2 <- ggplot(wordCountGram2Top10, aes(x=reorder(word, -ocurrencies),y=ocurrencies)) +
geom_bar(stat="identity", fill="red") +
ggtitle("Bigram") +
xlab("Bigrams") + ylab("Count") +
theme(axis.text.x=element_text(angle=90, hjust=1))
g3 <- ggplot(wordCountGram3Top10, aes(x=reorder(word, -ocurrencies),y=ocurrencies)) +
geom_bar(stat="identity", fill="black") +
ggtitle("Trigram") +
xlab("Trigrams") + ylab("Count") +
theme(axis.text.x=element_text(angle=90, hjust=1))
gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
Now we are ready to develop the model itself, using the tokens and the corpus that we have already built.
Once the model is ready we will develop the data product with Shiny and publish it for the world to use.
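As a preview of how the prediction step could work, here is a minimal, hypothetical sketch (not the final algorithm) of a simple back-off lookup over the n-gram data frames built above: it searches the trigram table using the last two words typed, falls back to the bigram table with the last word, and finally returns the most frequent unigram.
# hypothetical back-off lookup over the frequency data frames built above
predictNextWord <- function(phrase, gram1DF, gram2DF, gram3DF){
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  nWords <- length(words)
  # 1. try trigrams that start with the last two words
  if (nWords >= 2){
    prefix <- paste(words[nWords - 1], words[nWords])
    hits <- gram3DF[grep(paste0("^", prefix, " "), gram3DF$word), ]
    if (nrow(hits) > 0){
      best <- hits$word[which.max(hits$ocurrencies)]
      return(tail(unlist(strsplit(best, " ")), 1))
    }
  }
  # 2. fall back to bigrams that start with the last word
  if (nWords >= 1){
    hits <- gram2DF[grep(paste0("^", words[nWords], " "), gram2DF$word), ]
    if (nrow(hits) > 0){
      best <- hits$word[which.max(hits$ocurrencies)]
      return(tail(unlist(strsplit(best, " ")), 1))
    }
  }
  # 3. last resort: the single most frequent unigram
  gram1DF$word[which.max(gram1DF$ocurrencies)]
}
# example call (result depends on the sampled corpus):
# predictNextWord("one of the", wordCountGram1, wordCountGram2, wordCountGram3)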