The goal of this project is to demonstrate that we have become familiar with the data and that we are on track to create a prediction algorithm. This report will be published on RPubs (http://rpubs.com/) and explains the exploratory analysis as well as the goals for the eventual app and algorithm.
# Preload the necessary R libraries
library(dplyr)
library(doParallel)
library(stringi)
library(SnowballC)
library(tm)
# To avoid rJava issues when loading it or RWeka, clear JAVA_HOME (or point
# options(java.home = ...) at your Java installation) before loading the library:
if (Sys.getenv("JAVA_HOME") != "")
  Sys.setenv(JAVA_HOME = "")
#options(java.home="C:\\Program Files\\Java\\jre1.8.0_171\\")
#library(rJava)
library(RWeka)
library(ggplot2)
The data comes from HC Corpora and covers four languages, but only English will be used. The English dataset consists of three files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
## Downloading the data
if(!file.exists("Coursera-SwiftKey.zip")) {
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip")
}
# Read the blogs and twitter files using readLines
blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
# Read the news file in binary mode, as it contains special characters
con <- file("final/en_US/en_US.news.txt", open="rb")
news <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)
Calculate some summary statistics for each file: size in megabytes, number of entries (lines), total characters, total words, and the length of the longest entry.
# Get file sizes
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
pop_summary <- data.frame('File' = c("Blogs","News","Twitter"),
"FileSizeinMB" = c(blogs_size, news_size, twitter_size),
'NumberofLines' = sapply(list(blogs, news, twitter), function(x){length(x)}),
'TotalCharacters' = sapply(list(blogs, news, twitter), function(x){sum(nchar(x))}),
TotalWords = sapply(list(blogs,news,twitter),stri_stats_latex)[4,],
'MaxCharacters' = sapply(list(blogs, news, twitter), function(x){max(unlist(lapply(x, function(y) nchar(y))))})
)
pop_summary
## File FileSizeinMB NumberofLines TotalCharacters TotalWords MaxCharacters
## 1 Blogs 200.4242 899288 206824505 37570839 40833
## 2 News 196.2775 1010242 203223159 34494539 11384
## 3 Twitter 159.3641 2360148 162096031 30451128 140
The data is too big for the initial tests and analysis, so we will sample it to make handling easier. Building the models does not require loading the whole data sets, so we will use a 5% subset for all the calculations.
set.seed(10)
# Remove all non-English characters as they cause issues
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")
# Binomial sampling of the data: keep each line with the given probability.
# The function is named sample_lines to avoid masking base::sample.
sample_lines <- function(population, percentage) {
return(population[as.logical(rbinom(length(population), 1, percentage))])
}
# Set sample percentage
percent <- 0.05 # If memory issues arise, reduce this further
samp_blogs <- sample_lines(blogs, percent)
samp_news <- sample_lines(news, percent)
samp_twitter <- sample_lines(twitter, percent)
dir.create("sample", showWarnings = FALSE)
#write(samp_blogs, "sample/sample.blogs.txt")
#write(samp_news, "sample/sample.news.txt")
#write(samp_twitter, "sample/sample.twitter.txt")
samp_data <- c(samp_blogs,samp_news,samp_twitter)
write(samp_data, "sample/sampleData.txt")
Calculate the same summary statistics for the sampled data.
samp_summary <- data.frame(
File = c("blogs","news","twitter"),
t(rbind(sapply(list(samp_blogs,samp_news,samp_twitter),stri_stats_general),
TotalWords = sapply(list(samp_blogs,samp_news,samp_twitter),stri_stats_latex)[4,]))
)
samp_summary
## File Lines LinesNEmpty Chars CharsNWhite TotalWords
## 1 blogs 44820 44813 10305669 8481988 1862344
## 2 news 50480 50480 10178559 8504440 1726424
## 3 twitter 117802 117802 8081879 6683738 1517599
# remove temporary variables
rm(blogs, news, twitter, samp_blogs, samp_news, samp_twitter, samp_data, pop_summary, samp_summary)
The data can be cleaned using techniques such as removing whitespace, numbers, URLs, punctuation, and profanity.
directory <- file.path(".", "sample")
#sample_data <- Corpus(DirSource(directory))
sample_data <- VCorpus(DirSource(directory)) # load the data as a corpus
sample_data <- tm_map(sample_data, content_transformer(tolower))
# Removing profanity words using one of the available dictionaries of 1384 words,
# after first removing from it some words which we do not consider profanity.
profanity_words = readLines("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt")
profanity_words = profanity_words[!profanity_words %in% c("refugee","reject","remains","screw","welfare","sweetness","shoot","sick","shooting","servant","sex","radical","racial","racist","republican","public","molestation","mexican","looser","lesbian","liberal","kill","killing","killer","heroin","fraud","fire","fight","fairy","^die","death","desire","deposit","crash","^crim","crack","^color","cigarette","church","^christ","canadian","cancer","^catholic","cemetery","buried","burn","breast","^bomb","^beast","attack","australian","balls","baptist","^addict","abuse","abortion","amateur","asian","aroused","angry","arab","bible")]
sample_data <- tm_map(sample_data,removeWords, profanity_words)
# Removing URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
sample_data <- tm_map(sample_data, content_transformer(removeURL))
#sample_data[[1]]$content
# Replacing special chars with space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
sample_data <- tm_map(sample_data, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
sample_data <- tm_map(sample_data, toSpace, "@[^\\s]+")
sample_data <- tm_map(sample_data, content_transformer(tolower)) # ensure lowercase (already applied above)
#sample_data <- tm_map(sample_data, removeWords, stopwords("en"))#remove english stop words
sample_data <- tm_map(sample_data, removePunctuation) # remove punctuation
sample_data <- tm_map(sample_data, removeNumbers) # remove numbers
sample_data <- tm_map(sample_data, stripWhitespace) # remove extra whitespaces
#sample_data <- tm_map(sample_data, stemDocument) # initiate stemming
sample_data <- tm_map(sample_data, PlainTextDocument)
sample_corpus <- data.frame(text=unlist(sapply(sample_data,'[',"content")),stringsAsFactors = FALSE)
head(sample_corpus)
## text
## character(0).content1 its a sickening feeling
## character(0).content2 even if you dont like the so called screwball comedy that some critic also called sex comedy without sex whose trouble in paradise gives a perfect example you could enjoy two things from this movie the typical art deco interior design in mme colet house and the beautiful gowns designed by travis banton one of the most famous costume designer that show at its best this style
## character(0).content3 cat is looking for more pictures of cute animals with their tongues sticking out email cuteanimaltongues at gmail dot com with yours
## character(0).content4 sunday the festivities continued i went back to shadowbox to see back to the garden again told ya i love that place then after that was over i had to work at the store it was survivor finale night and we do a special survivor crop each season so i had to go host that
## character(0).content5 they are both chunky knits and were a complete bargainthe green was and the multi colour knit was the charity shops have now started putting out their winter stocks so using these knits as inspiration why dont you go and hunt down a stylish cosy bargain for much less than the high street or designer versions
## character(0).content6 its official i made the spellbinders team its been an amazing year and i am so glad that it doesnt have to end i love this company their products their values and the people who make spellbinders what it is
After the above transformations, the first document looks like this:
inspect(sample_data[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 18682810
review_dtm <- DocumentTermMatrix(sample_data)
review_dtm
## <<DocumentTermMatrix (documents: 1, terms: 104463)>>
## Non-/sparse entries: 104463/0
## Sparsity : 0%
## Maximal term length: 110
## Weighting : term frequency (tf)
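Because the sample was written to a single file, the corpus holds one large document. As a quick sanity check, tm's findFreqTerms() can list high-frequency terms straight from the matrix; a minimal sketch (the 1000-occurrence threshold is an arbitrary choice for illustration):
# Terms appearing at least 1000 times in the sample (threshold is illustrative)
head(findFreqTerms(review_dtm, lowfreq = 1000), 20)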
Unigram analysis shows which words are the most frequent and what their frequency is. A unigram is an individual word.
unigramTokenizer <- function(x) {
NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
#unigrams <- TermDocumentMatrix(sample_data, control = list(tokenize = unigramTokenizer))
unigrams <- DocumentTermMatrix(sample_data, control = list(tokenize = unigramTokenizer))
Bigram analysis shows which word pairs are the most frequent and what their frequency is. A bigram is a two-word combination.
BigramTokenizer <- function(x) {
NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
bigrams <- DocumentTermMatrix(sample_data, control = list(tokenize = BigramTokenizer))
Trigram analysis shows which word triples are the most frequent and what their frequency is. A trigram is a three-word combination.
trigramTokenizer <- function(x) {
NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
#trigrams <- TermDocumentMatrix(sample_data, control = list(tokenize = trigramTokenizer))
trigrams <- DocumentTermMatrix(sample_data, control = list(tokenize = trigramTokenizer))
Quadgram analysis shows which four-word sequences are the most frequent and what their frequency is. A quadgram is a four-word combination.
quadgramTokenizer <- function(x) {
NGramTokenizer(x, Weka_control(min = 4, max = 4))
}
#quadgrams <- TermDocumentMatrix(sample_data, control = list(tokenize = quadgramTokenizer))
quadgrams <- DocumentTermMatrix(sample_data, control = list(tokenize = quadgramTokenizer))
Now we can perform exploratory analysis on the tidy data. For each document-term matrix, we list the most common unigrams, bigrams, trigrams and quadgrams. It is interesting and helpful to find the most frequently occurring words in the data.
unigrams_frequency <- sort(colSums(as.matrix(unigrams)),decreasing = TRUE)
unigrams_freq_df <- data.frame(word = names(unigrams_frequency), frequency = unigrams_frequency)
head(unigrams_freq_df, 10)
## word frequency
## the the 183506
## and and 95477
## that that 39230
## for for 34120
## with with 26331
## was was 24765
## you you 19432
## this this 18600
## have have 17514
## but but 17120
unigrams_freq_df %>%
filter(frequency > 3000) %>%
ggplot(aes(reorder(word,-frequency), frequency)) +
geom_bar(stat = "identity", colour= "lightblue", fill= "darkblue") +
ggtitle("Unigrams with frequencies > 3000") +
xlab("Unigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
bigrams_frequency <- sort(colSums(as.matrix(bigrams)),decreasing = TRUE)
bigrams_freq_df <- data.frame(word = names(bigrams_frequency), frequency = bigrams_frequency)
head(bigrams_freq_df, 10)
## word frequency
## of the of the 18089
## in the in the 15841
## to the to the 8080
## on the on the 7148
## for the for the 6068
## to be to be 5567
## and the and the 5467
## at the at the 5038
## in a in a 4636
## with the with the 4303
Here we create a generic function to plot the top 50 frequencies; it will be reused for the bigrams, trigrams and quadgrams.
hist_plot <- function(data, label) {
ggplot(data[1:50,], aes(reorder(word, -frequency), frequency)) +
labs(x = label, y = "Frequency") +
theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
geom_bar(stat = "identity",colour= "lightblue", fill= "darkblue")
}
hist_plot(bigrams_freq_df, "50 Most Common Bigrams")
trigrams_frequency <- sort(colSums(as.matrix(trigrams)),decreasing = TRUE)
trigrams_freq_df <- data.frame(word = names(trigrams_frequency), frequency = trigrams_frequency)
head(trigrams_freq_df, 10)
## word frequency
## one of the one of the 1420
## a lot of a lot of 1175
## as well as as well as 678
## to be a to be a 596
## out of the out of the 580
## some of the some of the 576
## the end of the end of 572
## it was a it was a 546
## part of the part of the 545
## be able to be able to 500
hist_plot(trigrams_freq_df, "50 Most Common Trigrams")
quadgrams_frequency <- sort(colSums(as.matrix(quadgrams)),decreasing = TRUE)
quadgrams_freq_df <- data.frame(word = names(quadgrams_frequency), frequency = quadgrams_frequency)
head(quadgrams_freq_df, 10)
## word frequency
## the end of the the end of the 311
## at the end of at the end of 257
## the rest of the the rest of the 252
## for the first time for the first time 214
## at the same time at the same time 191
## one of the most one of the most 172
## is one of the is one of the 170
## when it comes to when it comes to 163
## to be able to to be able to 146
## in the middle of in the middle of 141
hist_plot(quadgrams_freq_df, "50 Most Common Quadgrams")
Building the n-grams takes some time, even with a 5% sample. Caching speeds up subsequent runs (cache = TRUE).
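A simple alternative to knitr's chunk caching is to save the term matrices to disk and reload them on later runs; a minimal sketch, with illustrative file and variable names:
# Cache the n-gram matrices on disk to avoid rebuilding them on every run
if (file.exists("cache/ngrams.rds")) {
ngram_list <- readRDS("cache/ngrams.rds")
} else {
ngram_list <- list(uni = unigrams, bi = bigrams, tri = trigrams, quad = quadgrams)
dir.create("cache", showWarnings = FALSE)
saveRDS(ngram_list, "cache/ngrams.rds")
}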
The longer the n-grams, the lower their abundance: the most frequent bigram occurs 18089 times, the most frequent trigram 1420 times, and the most frequent quadgram only 311 times.
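These top frequencies can be read straight off the sorted frequency tables, for example:
# Highest frequency for each n-gram order (tables are sorted decreasing)
sapply(list(bigrams = bigrams_freq_df,
trigrams = trigrams_freq_df,
quadgrams = quadgrams_freq_df),
function(df) df$frequency[1])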
For the Shiny application, the plan is to create an app with a simple interface where the user can enter a string of text. Our prediction model will then suggest a list of likely next words.
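As a preview, here is a minimal backoff-style sketch of such a lookup against the frequency tables built above; the function name and details are illustrative, not the final algorithm:
# Illustrative sketch only: look up the last words of the input in the
# quadgram/trigram/bigram tables and back off to shorter n-grams on a miss.
predict_next <- function(input, n = 3) {
words <- unlist(strsplit(tolower(input), "\\s+"))
tables <- list(quadgrams_freq_df, trigrams_freq_df, bigrams_freq_df)
context <- c(3, 2, 1) # number of context words each table needs
for (i in seq_along(tables)) {
k <- context[i]
if (length(words) < k) next
prefix <- paste0(paste(tail(words, k), collapse = " "), " ")
ngrams <- as.character(tables[[i]]$word)
hits <- ngrams[startsWith(ngrams, prefix)]
if (length(hits) > 0) # tables are sorted, so hits are ordered by frequency
return(head(unique(sapply(strsplit(hits, " "), tail, 1)), n))
}
head(as.character(unigrams_freq_df$word), n) # fall back to top unigrams
}
predict_next("one of") # e.g. suggests "the", given the trigram counts above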