SYNOPSIS

This is the milestone report for the Capstone Project of the Data Science Specialization. It will serve as the basis for the Shiny app to be built in the next assignment of the same course. The objective of this report is to show that the dataset has been downloaded and basic summary statistics have been computed. A corpus has been created and cleaned of unnecessary words, whitespace and punctuation, and it has been used to find frequently occurring words and phrases. Further code will build on this corpus to create a text prediction app.

LOADING THE REQUIRED LIBRARIES

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringi)
library(knitr)
library(ggplot2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(ngram)
library(NLP)
library(tesseract)
library(corpus)
library(wordcloud)
## Loading required package: RColorBrewer
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(RWeka)

GETTING THE DATA

Download the dataset from the given URL and unzip the file. The dataset contains blog, news and Twitter data in four languages, including English. Here, the English data is read using the readLines() function as shown below.

## Download the dataset and unzip it.
if(!file.exists("Coursera-SwiftKey.zip"))
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip")
## Read the twitter, blogs and news data from the English dataset into R.
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding="UTF-8", skipNul=TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding="UTF-8", skipNul=TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul =
## TRUE): incomplete final line found on 'final/en_US/en_US.news.txt'
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding="UTF-8", skipNul=TRUE)
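
The warning about an incomplete final line is harmless, but it can be avoided. As an optional sketch, the news file could instead be read through a connection opened in binary mode:

## Optional: read the news file through a binary connection, which avoids
## the "incomplete final line" warning for this file.
con <- file("final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)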

BASIC SUMMARIES AND STATISTICS

After reading the data from the three .txt files, we compute basic summaries for each: file size, number of lines, number of words, total number of characters and mean words per line.

## Find the size of each file.
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
## Find the word count in each file.
blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
## Find the line count in each file.
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
total_lines <- length(blogs)+length(news)+length(twitter)
## Find the total character count and characters per line in each file.
### Characters per line
blogs_nchar   <- nchar(blogs)
news_nchar    <- nchar(news)
twitter_nchar <- nchar(twitter)
### Total number of characters
blogs_nchar_tot   <- sum(blogs_nchar)
news_nchar_tot    <- sum(news_nchar)
twitter_nchar_tot <- sum(twitter_nchar)
## Basic Summary
data_summary <- data.frame(data_filename = c("blogs","news","twitter"),
           file_size_MB = c(blogs_size,news_size,twitter_size),
           line_count = c(length(blogs),length(news),length(twitter)),
           words_count = c(sum(blogs_words),sum(news_words),sum(twitter_words)),
           char_count = c(blogs_nchar_tot, news_nchar_tot, twitter_nchar_tot),
           wordsperlinemean = c(mean(blogs_words),mean(news_words),mean(twitter_words)))
## Create a table of the data_summary.
kable(data_summary, caption ="Data Summary (en_US)")
Data Summary (en_US)

data_filename   file_size_MB   line_count   words_count   char_count   wordsperlinemean
blogs               200.4242       899288      37546239    206824505           41.75107
news                196.2775        77259       2674536     15639408           34.61779
twitter             159.3641      2360148      30093413    162096241           12.75065
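
The per-line character counts computed above can also be used for additional checks, for example the length of the longest line in each file (an optional sketch using the variables already created):

## Optional: length of the longest line in each file.
c(blogs = max(blogs_nchar), news = max(news_nchar), twitter = max(twitter_nchar))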

CLEANING THE SAMPLE AND CREATING A CORPUS

For the training set we create a sample corpus using 1% of the data in each text file. We then create a volatile corpus (VCorpus) from the sample and clean it of punctuation marks, extra whitespace, stopwords, numbers, etc. All text is converted to lowercase for consistent processing.

## Set the seed for reproducibility.
set.seed(1000)
## Create a sample containing 1% of each dataset.
sample_data <- c(sample(blogs, length(blogs)*0.01),
                 sample(news, length(news)*0.01),
                 sample(twitter, length(twitter)*0.01))
## Create a corpus and print it. 
clean_corpus <- VCorpus(VectorSource(sample_data))
print(clean_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 33365
## Clean the newly created corpus of white spaces, numbers, punctuation marks, etc.
clean_corpus <- tm_map(clean_corpus, content_transformer(tolower))
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("en"))
clean_corpus <- tm_map(clean_corpus, removePunctuation)
clean_corpus <- tm_map(clean_corpus, removeNumbers)
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
clean_corpus <- tm_map(clean_corpus, PlainTextDocument)
print(clean_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 33365
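
As a quick sanity check (an optional sketch, not part of the original processing), the content of a cleaned document can be printed to confirm that lowercasing, stopword removal and punctuation removal worked as intended:

## Optional check: print the content of the first cleaned document.
content(clean_corpus[[1]])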

N-GRAM TOKENIZATION AND EXPLORATORY DATA ANALYSIS

N-grams such as unigrams, bigrams and trigrams can be tokenized from the cleaned corpus to find the frequency of occurrence of each n-gram. We also create bar plots to visualize the 10 most frequent n-grams of each type. This will help in building the prediction model required for the Shiny app.

## Set mc.cores = 1 so that tm works reliably with the RWeka tokenizers.
options(mc.cores=1)
## Create a function to find the frequency of specified n-gram words and return in the form of a table.
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
## Create functions to tokenize bi/tri-grams.
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
## Create term-document matrices of the clean corpus to find uni-, bi- and tri-gram frequencies, and remove sparse terms.
dtm_corpus <- TermDocumentMatrix(clean_corpus)
dtm_corpus
## <<TermDocumentMatrix (terms: 45528, documents: 33365)>>
## Non-/sparse entries: 341824/1518699896
## Sparsity           : 100%
## Maximal term length: 250
## Weighting          : term frequency (tf)
freq1 <- getFreq(removeSparseTerms(dtm_corpus, 0.9999))
dtm_bigram <- TermDocumentMatrix(clean_corpus, control = list(tokenize = bigram))
dtm_bigram
## <<TermDocumentMatrix (terms: 293610, documents: 33365)>>
## Non-/sparse entries: 343546/9795954104
## Sparsity           : 100%
## Maximal term length: 172
## Weighting          : term frequency (tf)
freq2 <- getFreq(removeSparseTerms(dtm_bigram, 0.9999))
dtm_trigram <- TermDocumentMatrix(clean_corpus, control = list(tokenize = trigram))
dtm_trigram
## <<TermDocumentMatrix (terms: 311093, documents: 33365)>>
## Non-/sparse entries: 313338/10379304607
## Sparsity           : 100%
## Maximal term length: 181
## Weighting          : term frequency (tf)
freq3 <- getFreq(removeSparseTerms(dtm_trigram, 0.9999))
## Define a function to plot the top 10 most frequent n-grams as a bar chart.
makePlot <- function(data, label, color) {
  ggplot(data[1:10,], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I(color))
}
## Use the defined function to plot the top 10 uni-, bi- and tri-grams.
makePlot(freq1, "Top 10 Frequent Unigrams","red")

makePlot(freq2, "Top 10 Frequent Bigrams","pink")

makePlot(freq3, "Top 10 Frequent Trigrams", "yellow")
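
The wordcloud package loaded earlier has not been used yet; as an optional sketch, the unigram frequency table can also be visualized as a word cloud:

## Optional: word cloud of the most frequent unigrams.
set.seed(1000)
wordcloud(words = freq1$word, freq = freq1$freq, max.words = 100,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))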

PREDICTION ALGORITHM AND SHINY APP (PLAN)

We have computed basic summary statistics, cleaned the data and performed exploratory analysis on the downloaded corpus, and created a training dataset. This dataset will be used in the next assignment to train a prediction algorithm using machine learning concepts. That algorithm will then power a Shiny app similar to SwiftKey's text prediction app.
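
As an illustration of the planned approach only (a minimal sketch, not the final model), a simple frequency-based backoff predictor could look up the last one or two words of the input in the bigram and trigram tables computed above; the helper predict_next() below is hypothetical and not part of the original code.

## Minimal sketch of a backoff predictor using the freq1/freq2/freq3 tables
## created above (illustrative only; assumes a non-empty input string).
predict_next <- function(input, n = 3) {
  words <- unlist(strsplit(tolower(input), "\\s+"))
  len <- length(words)
  ## Try trigrams first: n-grams whose first two words match the last two input words.
  if (len >= 2) {
    prefix <- paste(words[len - 1], words[len], "")
    hits <- freq3[startsWith(as.character(freq3$word), prefix), ]
    if (nrow(hits) > 0)
      return(head(unique(sapply(strsplit(as.character(hits$word), " "), tail, 1)), n))
  }
  ## Back off to bigrams: match on the last input word only.
  hits <- freq2[startsWith(as.character(freq2$word), paste(words[len], "")), ]
  if (nrow(hits) > 0)
    return(head(unique(sapply(strsplit(as.character(hits$word), " "), tail, 1)), n))
  ## Fall back to the most frequent unigrams overall.
  head(as.character(freq1$word), n)
}
## Example: suggest up to three possible next words.
predict_next("thanks for the")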