Exploratory Data Analysis for Text Data

This report is part of the Data Science Capstone project and covers the exploratory data analysis of the three documents provided (blogs, news, twitter). A corpus is created and pre-processed, and the word and phrase frequencies are then computed and examined before building a model.

Import Libraries

#load library
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringi)
library(corpus)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.1.3
## java.home option:
## JAVA_HOME environment variable: .;C:\Program Files\Java\jre1.8.0_311\bin;
## Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
## Please do NOT set it unless you want to override system settings.

Load Data

The data files are downloaded and unzipped in the working directory. In this project, we will be working on three English text files, namely the ‘blogs’, ‘news’ and ‘twitter’ files.
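The download and unzip steps are not shown in this report; a minimal sketch (run once, with the placeholder replaced by the dataset link provided in the course) could look like this:

#download and unzip the dataset (run once; replace the placeholder with the course link)
zipurl<- "<course-provided Coursera-SwiftKey zip link>"
if(!file.exists("Coursera-SwiftKey.zip")){
  download.file(zipurl, destfile= "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}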

folder<- "C:\\Users\\think\\Documents\\R\\coursera-SwiftKey\\final\\en_US"
filelist<- list.files(path=folder, pattern="*.txt")
filepaths<- paste(folder, "\\", filelist, sep= "")
filepaths
## [1] "C:\\Users\\think\\Documents\\R\\coursera-SwiftKey\\final\\en_US\\en_US.blogs.txt"  
## [2] "C:\\Users\\think\\Documents\\R\\coursera-SwiftKey\\final\\en_US\\en_US.news.txt"   
## [3] "C:\\Users\\think\\Documents\\R\\coursera-SwiftKey\\final\\en_US\\en_US.twitter.txt"
#read each file line by line
files<- lapply(filepaths, FUN=readLines)
## Warning in FUN(X[[i]], ...): incomplete final line found on 'en_US.news.txt'
## Warning in FUN(X[[i]], ...): line 167155 appears to contain an embedded nul
## Warning in FUN(X[[i]], ...): line 268547 appears to contain an embedded nul
## Warning in FUN(X[[i]], ...): line 1274086 appears to contain an embedded nul
## Warning in FUN(X[[i]], ...): line 1759032 appears to contain an embedded nul

Basic Summary

A basic summary will be generated by checking the total line count, the total word count, and the average and maximum word count per entry for all three files. The summary is shown below the code:

summary<- data.frame()
for(file in files){
  linecount<- length(file)
  wordcount<- stri_count_words(file)
  tot_wordcount<- sum(wordcount)
  avg_wordcount<- mean(wordcount)
  max_wordcount<- max(wordcount)
  output<- c(linecount, tot_wordcount, avg_wordcount, max_wordcount)
  summary<- rbind(summary, output)
}
colnames(summary)<- c('linecount', 'total wordcount', 'average wordcount', 'max wordcount')
rownames(summary)<- c('Blog', 'News', 'Twitter')
summary
##         linecount total wordcount average wordcount max wordcount
## Blog       899288        38154238          42.42716          6726
## News        77259         2693898          34.86840          1123
## Twitter   2360148        30218125          12.80349            60

From the summary above, the twitter file has the highest line count, followed by the blog file. However, the blog file has the highest total, average and maximum word count per entry, while the twitter file has the lowest average and maximum word count per entry despite having the highest line count.

Sampling

As each file is quite large, only a random 3% sample of each file is carried forward to the next steps; the samples are then combined into a corpus for further data processing and analysis.
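To put "quite large" in perspective, the raw file sizes can be checked in megabytes (a quick sketch, assuming the filepaths object created in the Load Data step):

#file size of each text file in megabytes
round(file.size(filepaths)/1024^2, 1)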

library(tm)
## Loading required package: NLP
set.seed(1)
corpus_samples<- list()
for (file in files){
  #randomly sample 3% of the lines and collapse them into one document
  sample<- sample(file, size= round(length(file)*0.03))
  sample<- paste(sample, collapse= " ")
  corpus_samples<- append(corpus_samples, sample)
}
#create corpus
corpus_text<- VCorpus(VectorSource(corpus_samples))

Data Preprocessing

The files will be cleaned by converting all letters to lower case and then removing punctuation, symbols, numbers, stopwords, stray single characters and extra whitespace.

library(gsubfn)
## Loading required package: proto
#transformer that replaces a matched pattern with a space
toSpace <- content_transformer(function(x, pattern){
            return (gsub(pattern, " ", x))})
#replace hyphens, apostrophes and mis-encoded (mojibake) characters with spaces
corpus_text<- tm_map(corpus_text, toSpace, "-")
corpus_text<- tm_map(corpus_text, toSpace, "'")
corpus_text<- tm_map(corpus_text, toSpace, "â€")
corpus_text<- tm_map(corpus_text, toSpace, "Å“")
corpus_text<- tm_map(corpus_text, toSpace, "â„¢")
corpus_text<- tm_map(corpus_text, toSpace, "¦ ¦ ¦")

#change to lower case first so that capitalised stopwords are also caught
corpus_text<- tm_map(corpus_text, content_transformer(tolower))
#remove punctuation
corpus_text<- tm_map(corpus_text, removePunctuation)
#remove digits
corpus_text<- tm_map(corpus_text, removeNumbers)
#remove stopwords
corpus_text<- tm_map(corpus_text, removeWords, stopwords("english"))
#remove single characters
corpus_text<- tm_map(corpus_text, toSpace, "\\b[a-z]\\b")
#remove extra white space
corpus_text<- tm_map(corpus_text, stripWhitespace)
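A quick spot check on the cleaned text helps confirm the transformations behaved as intended; a minimal sketch (output not shown):

#inspect the first 300 characters of the cleaned blog sample
substr(as.character(corpus_text[[1]]), 1, 300)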

Exploratory Data Analysis

According to Wikipedia, an n-gram is ‘a contiguous sequence of n items from a given sample of text or speech’. In the following, the frequencies of uni-grams (single terms), 2-grams (2-term phrases) and 3-grams (3-term phrases) will be analyzed.
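As a quick illustration (a toy example, not part of the analysis), the NGramTokenizer from RWeka used below splits text into overlapping phrases:

#toy example: 2-grams of a short sentence
NGramTokenizer("thanks for the follow", Weka_control(min=2, max=2))
#should return "thanks for" "for the" "the follow"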

We will first generate a term document matrix using the relevant tokenizer and remove sparse terms at a chosen sparsity threshold. The frequency of each term is then collected in a data frame, and bar charts of the top 15 single words, 2-term phrases and 3-term phrases are plotted for better visualization.

#set tokenizer functions for 1-gram, 2-gram and 3-gram
UniGramTokenizer<- function(x) NGramTokenizer(x, control= Weka_control(min=1, max=1))
TwoGramTokenizer<- function(x) NGramTokenizer(x, control= Weka_control(min=2, max=2))
ThreeGramTokenizer<- function(x) NGramTokenizer(x, control= Weka_control(min=3, max=3))

#set function to create the output as dataframe and sort in descending order
Ngram_freq<- function(tdm, sparsity){
  m<- as.matrix(removeSparseTerms(tdm, sparsity))
  freq<- sort(rowSums(m), decreasing=TRUE)
  return(data.frame(word= names(freq), freq=freq))
}

#1-gram 
OneGram_tdm<- TermDocumentMatrix(corpus_text,
                                 control=list(tokenize=UniGramTokenizer))
OneGram_freq<- Ngram_freq(OneGram_tdm, 0.99)

#2-gram
TwoGram_tdm<- TermDocumentMatrix(corpus_text, 
                                  control= list(tokenize= TwoGramTokenizer))
TwoGram_freq<- Ngram_freq(TwoGram_tdm, 0.99)

#3-gram
ThreeGram_tdm<- TermDocumentMatrix(corpus_text, 
                                  control= list(tokenize= ThreeGramTokenizer))
ThreeGram_freq<- Ngram_freq(ThreeGram_tdm, 0.99)

Data Visualization

The top 15 single words, 2-term phrases and 3-term phrases are as follows:

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(viridis)
## Loading required package: viridisLite
#bar plot to visualize the top 15 unigram, 2-gram and 3-gram phrases
ngram_barplot<- function(data, title){
  ggplot(head(data,15), aes(x=reorder(word, -freq), y=freq))+
    geom_bar(stat='identity', aes(fill= freq))+
    scale_fill_viridis(option = "D")+
    labs(x=' ', y='frequency', title=title)+
    theme(axis.text.x= element_text(angle=45, hjust=1))
}

OneGram_plot<- ngram_barplot(OneGram_freq, 'Top 15 Single Words')
TwoGram_plot<- ngram_barplot(TwoGram_freq, 'Top 15 2-gram Phrases')
ThreeGram_plot<- ngram_barplot(ThreeGram_freq, 'Top 15 3-gram Phrases')

OneGram_plot

TwoGram_plot

ThreeGram_plot
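Since the wordcloud package was loaded earlier, a word cloud offers a complementary view of the most frequent single words; a minimal sketch:

#word cloud of the 50 most frequent single words
wordcloud(words= OneGram_freq$word, freq= OneGram_freq$freq,
          max.words= 50, random.order= FALSE, colors= brewer.pal(8, "Dark2"))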

Next Step

The next step is to develop a prediction model based on the n-gram frequencies explored here.
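As a rough illustration of the direction (a naive sketch only, not the final model), the 3-gram frequency table could already serve as a simple lookup: given the last two words typed, return the last word of the most frequent matching 3-gram.

#naive next-word lookup based on the 3-gram frequency table (illustrative only)
predict_next<- function(two_words){
  pattern<- paste0("^", tolower(two_words), " ")
  matches<- ThreeGram_freq[grepl(pattern, ThreeGram_freq$word), ]
  if(nrow(matches) == 0) return(NA_character_)
  #ThreeGram_freq is already sorted by frequency, so the first match is the best guess
  tail(strsplit(as.character(matches$word[1]), " ")[[1]], 1)
}
predict_next("happy new")   #likely "year" if the phrase appears in the sample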