Introduction

The goal of this milestone report is to demonstrate familiarity with the data and to show that the project is on track toward building the word-prediction algorithm.

Loading the libraries

library(stringi)
library(knitr)

Loading the data

setwd("D:/KAW_DOC/CERTS/Coursera/Coursera_Courses/10_DataScience_Capstone/final/en_US")
blogs_file <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news_file <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter_file <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
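
Note: on some platforms readLines() stops early on en_US.news.txt because the file contains embedded control characters; the comparatively low news line count reported below suggests that may have happened here. An optional workaround (not applied in this report, so the counts below reflect the plain readLines() call) is to read the file through a binary connection:

# Optional workaround (assumption: embedded control characters truncate a plain
# readLines() call on the news file); not used for the counts reported below.
con <- file("en_US.news.txt", open = "rb")
news_file <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)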

Summary Statistics

The table below shows summary statistics for the three files:

  1. en_US.blogs.txt
  2. en_US.news.txt
  3. en_US.twitter.txt
size_blogs <- file.info("en_US.blogs.txt")$size/(1024^2)    # file size in MB
len_blogs  <- length(blogs_file)                             # number of lines
word_blogs <- sum(sapply(strsplit(blogs_file, " "), length)) # space-split word count

size_news  <- file.info("en_US.news.txt")$size/(1024^2)
len_news   <- length(news_file)
word_news  <- sum(sapply(strsplit(news_file, " "), length))

size_twitter <- file.info("en_US.twitter.txt")$size/(1024^2)
len_twitter  <- length(twitter_file)
word_twitter <- sum(sapply(strsplit(twitter_file, " "), length))

Table_Summary <- data.frame(
  Files    = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  Lines    = c(len_blogs, len_news, len_twitter),
  SizeInMB = c(size_blogs, size_news, size_twitter),
  Words    = c(word_blogs, word_news, word_twitter)
)

kable(Table_Summary,caption = "Files Summary")
Files Summary

Files            Lines   SizeInMB       Words
en_US.blogs     899288   200.4242    37334131
en_US.news       77259   196.2775     2643969
en_US.twitter  2360148   159.3641    30373543
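
Since stringi is loaded above but not otherwise used, its word counter can serve as an optional cross-check of the space-split word counts, and nchar() gives true character counts; a brief sketch for the blogs file:

# Optional cross-check (sketch): stri_count_words() uses proper word boundaries,
# and nchar() counts characters rather than words.
words_blogs_check <- sum(stri_count_words(blogs_file))
chars_blogs       <- sum(nchar(blogs_file, type = "chars"))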

Data Sampling

Data sampling is performed by selecting 2% of the lines from each of the blogs, news, and twitter datasets and combining them into a new dataset named sampleData.

# set.seed() could be called first to make the sample reproducible
sampleData <- c(sample(blogs_file,   length(blogs_file)   * 0.02),
                sample(news_file,    length(news_file)    * 0.02),
                sample(twitter_file, length(twitter_file) * 0.02))
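
Optionally, the sample can be written to disk so that later runs reuse exactly the same lines:

# Optional: persist the sample for reuse across sessions.
writeLines(sampleData, "sampleData.txt")
# sampleData <- readLines("sampleData.txt", warn = FALSE, encoding = "UTF-8")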

Clean and Build the Corpus

Performing data cleansing on the sample:
1) Convert all text to lowercase
2) Remove punctuation
3) Remove numbers
4) Strip extra whitespace
5) Remove English stop words

Removing non-English words and profanity is deferred; a profanity-filter sketch is included after the code below.

library(tm)
library(NLP)
corpus <- VCorpus(VectorSource(sampleData))
corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase while keeping the document structure
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
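
Profanity filtering is not part of the cleaning code above; a minimal sketch of how it could be added with the same removeWords transformation is shown here, assuming a plain-text word list (the file name profanity_words.txt is a placeholder):

# Sketch only: remove profanity with tm::removeWords, given a word list
# (profanity_words.txt is a hypothetical file name).
profanity <- readLines("profanity_words.txt", warn = FALSE, encoding = "UTF-8")
corpus <- tm_map(corpus, removeWords, profanity)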

Ngram Tokenization

In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n items from a given sample of text or speech. N-gram tokenization is performed by applying the NGramTokenizer function from the RWeka package to create:
1) Unigram
2) Bigrams
3) Trigrams

library(RWeka)
UniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiGramsTokenizer  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriGramsTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

UniGram  <- TermDocumentMatrix(corpus, control = list(tokenize = UniGramTokenizer))
BiGrams  <- TermDocumentMatrix(corpus, control = list(tokenize = BiGramsTokenizer))
TriGrams <- TermDocumentMatrix(corpus, control = list(tokenize = TriGramsTokenizer))
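
The bigram and trigram term-document matrices can become large even for a 2% sample. As an optional step (not applied here), very sparse terms could be dropped with tm's removeSparseTerms; the sparsity threshold below is an assumption:

# Optional sketch: drop terms that occur in almost no documents to shrink the matrices.
BiGrams_small  <- removeSparseTerms(BiGrams, 0.9999)
TriGrams_small <- removeSparseTerms(TriGrams, 0.9999)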

Exploratory Data Analysis on the N-grams

Finding the top 10 most frequent terms and visualizing them with bar charts for each of the N-grams:
1) Unigram
2) Bigrams
3) Trigrams

Unigram

library(ggplot2)
UniGram_num  <-findFreqTerms(UniGram,lowfreq=1000)
UniGram_num_freq <- rowSums(as.matrix(UniGram[UniGram_num,]))
UniGram_num_freq <- data.frame(UniGram=names(UniGram_num_freq), Frequency=UniGram_num_freq)
UniGram_num_freq <- UniGram_num_freq[order(-UniGram_num_freq$Frequency),]
kable(head(UniGram_num_freq,10),caption = "Top 10 frequencies for the Unigram")
Top 10 frequencies for the Unigram

UniGram   Frequency
said           5300
will           4990
one            4558
just           4066
like           3751
can            3624
time           3278
get            2936
new            2817
now            2456
UniGram_plot <-ggplot(UniGram_num_freq[1:10,],aes(x=reorder(UniGram, Frequency),y=Frequency,fill=Frequency))
UniGram_plot <- UniGram_plot + geom_bar(stat="identity") + coord_flip() + 
                xlab("Unigram") + ylab("Frequency") + labs(title = "Top 10 frequencies for the Unigrams")
UniGram_plot

Bigrams

BiGrams_num  <-findFreqTerms(BiGrams,lowfreq=80)
BiGrams_num_freq <- rowSums(as.matrix(BiGrams[BiGrams_num,]))
BiGrams_num_freq <- data.frame(BiGrams=names(BiGrams_num_freq), Frequency=BiGrams_num_freq)
BiGrams_num_freq <- BiGrams_num_freq[order(-BiGrams_num_freq$Frequency),]
kable(head(BiGrams_num_freq,10),caption = "Top 10 frequencies for the Bigrams")
Top 10 frequencies for the Bigrams

BiGrams        Frequency
last year            320
new york             306
right now            293
high school          264
years ago            243
last week            198
first time           191
dont know            183
st louis             174
last night           162
BiGrams_plot <-ggplot(BiGrams_num_freq[1:10,],aes(x=reorder(BiGrams, Frequency),y=Frequency,fill=Frequency))
BiGrams_plot <- BiGrams_plot + geom_bar(stat="identity") + coord_flip() + 
  xlab("BiGrams") + ylab("Frequency") + labs(title = "Top 10 frequencies for the BiGrams")
BiGrams_plot

Trigrams

TriGrams_num  <-findFreqTerms(TriGrams,lowfreq=10)
TriGrams_num_freq <- rowSums(as.matrix(TriGrams[TriGrams_num,]))
TriGrams_num_freq <- data.frame(TriGrams=names(TriGrams_num_freq), Frequency=TriGrams_num_freq)
TriGrams_num_freq <- TriGrams_num_freq[order(-TriGrams_num_freq$Frequency),]
kable(head(TriGrams_num_freq,10),caption = "Top 10 frequencies for the Trigrams")
Top 10 frequencies for the Trigrams

TriGrams                 Frequency
new york city                   38
happy mothers day               24
cant wait see                   23
president barack obama          23
two years ago                   23
dont even know                  21
world war ii                    21
happy new year                  20
let us know                     19
cinco de mayo                   17
TriGrams_plot <-ggplot(TriGrams_num_freq[1:10,],aes(x=reorder(TriGrams, Frequency),y=Frequency,fill=Frequency))
TriGrams_plot <- TriGrams_plot + geom_bar(stat="identity") + coord_flip() + 
  xlab("TriGrams") + ylab("Frequency") + labs(title = "Top 10 frequencies for the TriGrams")
TriGrams_plot

Findings

  1. A 2% sample was drawn from each of the original datasets to keep the analysis efficient.
  2. English stop words were removed during cleaning to improve the quality of the frequency counts; a profanity filter can be added in the same way (see the sketch in the corpus section).
  3. In the Bigrams and Trigrams, the phrase "new york" ranks among the most frequent bigrams and "new york city" is the most frequent trigram.

However, the sample represents only 2% of the original datasets; more data will be needed to train the prediction model.

Planning

Next, an n-gram based prediction model will be built and deployed as a Shiny app that takes an input phrase and predicts the next word.
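
As a preview of that model (a minimal sketch only, not the final implementation), the frequency tables built above can already drive a naive next-word lookup: search the trigrams for the last two words of the input, then back off to the bigrams. The function name predict_next_word is a placeholder.

# Minimal sketch of a next-word lookup based on the frequency tables above.
# Illustration only; the final model will use a fuller back-off scheme.
predict_next_word <- function(phrase) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  # Try trigrams whose first two words match the end of the input phrase
  tri_hits <- TriGrams_num_freq[grepl(paste0("^", paste(words, collapse = " "), " "),
                                      TriGrams_num_freq$TriGrams), ]
  if (nrow(tri_hits) > 0) {
    return(tail(strsplit(as.character(tri_hits$TriGrams[1]), " ")[[1]], 1))
  }
  # Back off to bigrams whose first word matches the last input word
  bi_hits <- BiGrams_num_freq[grepl(paste0("^", tail(words, 1), " "),
                                    BiGrams_num_freq$BiGrams), ]
  if (nrow(bi_hits) > 0) {
    return(tail(strsplit(as.character(bi_hits$BiGrams[1]), " ")[[1]], 1))
  }
  NA_character_
}

# Example call (the result depends on the sampled data):
# predict_next_word("new york")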