Overview

This is the milestone report for the Data Science Capstone project. In this documentation, we explore the data provided by Coursera in order to help in building a predictive text model.

Reading the Data

We read the 3 separate data files provided.

# Read every line of a UTF-8 text file into a character vector.
#
# The connection is opened in binary mode ("rb") rather than text mode.
# On Windows, a text-mode read can treat an embedded Ctrl-Z (SUB) byte as
# end-of-file and silently stop early; reading in binary mode avoids that,
# and readLines() still accepts LF, CRLF and CR line endings.
#
# path: path to the text file.
# Returns a character vector with one element per line (embedded NULs skipped).
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  # Always release the file handle, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines("final/en_US/en_US.blogs.txt")
news <- read_text_lines("final/en_US/en_US.news.txt")
twitter <- read_text_lines("final/en_US/en_US.twitter.txt")

We then take a look at the data we've been given. For each text file, we want to find out how many lines it contains (line count), how many words it contains in total (word count), the maximum and minimum number of words on a single line and, lastly, the average number of words per line.

library("stringi")
## Warning: package 'stringi' was built under R version 3.5.3
# Per-line word counts, computed once per source and reused for every
# statistic below (stri_count_words() scans every line of its input, so
# repeating the call for each statistic multiplies the run time).
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)

source <- c("blogs", "news", "twitter")
# All per-line counts concatenated in source order: blogs, news, twitter.
word.count.per.line <- c(blogs.words, news.words, twitter.words)
line.count <- c(length(blogs), length(news), length(twitter))
word.count <- c(sum(blogs.words), sum(news.words), sum(twitter.words))
word.max <- c(max(blogs.words), max(news.words), max(twitter.words))
word.min <- c(min(blogs.words), min(news.words), min(twitter.words))
word.mean <- c(mean(blogs.words), mean(news.words), mean(twitter.words))
# cbind() on mixed character/numeric vectors yields a character matrix,
# which is why every value prints in quotes below.
cbind(source, line.count, word.count, word.max, word.min, word.mean)
##      source    line.count word.count word.max word.min word.mean         
## [1,] "blogs"   "899288"   "37546239" "6726"   "0"      "41.7510730711407"
## [2,] "news"    "77259"    "2674536"  "1123"   "1"      "34.617792101891" 
## [3,] "twitter" "2360148"  "30093413" "47"     "1"      "12.7506465696219"

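Here, we can see that the twitter file has by far the most lines (about 2.4 million), followed by blogs with around 900,000 and news with only about 77,000. Blog entries are the longest on average (about 42 words per line), while tweets are the shortest (about 13 words per line).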

Cleaning the Data

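Now, we move on to cleaning the data. First, we draw a 2% random sample from each source file; then we build the corpus to be used for our text predictor and normalise it by lower-casing the text and removing punctuation, numbers and extra whitespace.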

library(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Loading required package: NLP
library(SnowballC)

# Fix the RNG seed so the 2% random sample below is reproducible.
set.seed(123)
compiled <- c(
  sample(blogs, length(blogs) * .02),
  sample(news, length(news) * .02),
  sample(twitter, length(twitter) * .02)
)

corpus <- VCorpus(VectorSource(compiled))
# Helper transformer that replaces a regex pattern with a single space.
# It is defined here but not applied anywhere in this section.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# tolower() is a plain character function, so it must be wrapped in
# content_transformer() to keep each document a proper text document;
# applying it bare would turn the documents into character vectors.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
# No PlainTextDocument re-wrap is needed: every step above preserves the
# document class.

Exploring the data

We now explore the contents of the data. We look at the most frequently used terms by finding the top unigrams (single words), bigrams (sequences of two consecutive words) and trigrams (sequences of three consecutive words).

library(RWeka)
# One tokenizer per n-gram size; each returns the n-grams of exactly that size.
uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Build a term-document matrix for `corpus` using the given tokenizer.
#
# wordLengths is set to c(1, Inf) because tm's default of c(3, Inf) discards
# every term shorter than three characters, which would silently drop common
# unigrams such as "to", "of", "a" and "i".
build_ngram_tdm <- function(corpus, tokenizer) {
  TermDocumentMatrix(
    corpus,
    control = list(tokenize = tokenizer, wordLengths = c(1, Inf))
  )
}

uni_matrix <- build_ngram_tdm(corpus, uni_tokenizer)
bi_matrix <- build_ngram_tdm(corpus, bi_tokenizer)
tri_matrix <- build_ngram_tdm(corpus, tri_tokenizer)

# Keep only the terms that occur at least 50 times across the corpus.
uni_corpus <- findFreqTerms(uni_matrix, lowfreq = 50)
bi_corpus <- findFreqTerms(bi_matrix, lowfreq = 50)
tri_corpus <- findFreqTerms(tri_matrix, lowfreq = 50)

# Total frequency of each of `terms` in `tdm`, as a data frame with columns
# `word` (character) and `frequency` (numeric).
#
# NOTE(review): as.matrix() builds a dense terms-by-documents matrix, which can
# need a lot of memory for large samples; a sparse row sum would avoid that.
term_frequency_table <- function(tdm, terms) {
  totals <- rowSums(as.matrix(tdm[terms, ]))
  data.frame(
    word = as.character(names(totals)),
    frequency = totals,
    # Keep `word` as character regardless of the R version's default.
    stringsAsFactors = FALSE
  )
}

uni_corpus_freq <- term_frequency_table(uni_matrix, uni_corpus)
bi_corpus_freq <- term_frequency_table(bi_matrix, bi_corpus)
tri_corpus_freq <- term_frequency_table(tri_matrix, tri_corpus)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Top 20 terms of each n-gram size, ordered from most to least frequent.
desc.uni <- uni_corpus_freq %>%
  arrange(desc(frequency)) %>%
  head(n = 20)
desc.bi <- bi_corpus_freq %>%
  arrange(desc(frequency)) %>%
  head(n = 20)
desc.tri <- tri_corpus_freq %>%
  arrange(desc(frequency)) %>%
  head(n = 20)

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the script is run non-interactively.
if (interactive()) {
  View(desc.bi)
  View(desc.tri)
}

Let us make a bar plot for the top 20 unigrams, bigrams and trigrams.

# Draw one bar chart per n-gram size (unigrams, bigrams, trigrams), in that
# order. las = 2 rotates the axis labels so they are perpendicular to the axis.
for (top_terms in list(desc.uni, desc.bi, desc.tri)) {
  barplot(
    top_terms$frequency,
    names.arg = top_terms$word,
    col = "lightblue",
    las = 2
  )
}

Next Steps

This concludes our milestone report. We have cleaned and explored our data. The next steps are to build the predictive algorithm and the Shiny app that will serve it.