Introduction

The goal of this milestone report is to demonstrate familiarity with the data and to show that the project is on track toward building the word-prediction algorithm.

Loading the libraries

library(stringi)
library(knitr)

Loading the data

setwd("D:/KAW_DOC/CERTS/Coursera/Coursera_Courses/10_DataScience_Capstone/final/en_US")
blogs_file <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news_file <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter_file <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
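
Note: on some platforms readLines() stops early on en_US.news.txt because the file contains embedded control characters; the comparatively low news line count reported below suggests that may have happened here. An optional workaround (not applied in this report, so the counts below reflect the plain readLines() call) is to read the file through a binary connection:

# Optional workaround (assumption: embedded control characters truncate a plain
# readLines() call on the news file); not used for the counts reported below.
con <- file("en_US.news.txt", open = "rb")
news_file <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)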

Summary Statistics

The table below shows summary statistics for the three files:

  1. en_US.blogs.txt
  2. en_US.news.txt
  3. en_US.twitter.txt
size_blogs <- file.info("en_US.blogs.txt")$size/(1024^2)    # file size in MB
len_blogs  <- length(blogs_file)                             # number of lines
word_blogs <- sum(sapply(strsplit(blogs_file, " "), length)) # space-split word count

size_news  <- file.info("en_US.news.txt")$size/(1024^2)
len_news   <- length(news_file)
word_news  <- sum(sapply(strsplit(news_file, " "), length))

size_twitter <- file.info("en_US.twitter.txt")$size/(1024^2)
len_twitter  <- length(twitter_file)
word_twitter <- sum(sapply(strsplit(twitter_file, " "), length))

Table_Summary <- data.frame(
  Files    = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  Lines    = c(len_blogs, len_news, len_twitter),
  SizeInMB = c(size_blogs, size_news, size_twitter),
  Words    = c(word_blogs, word_news, word_twitter)
)

kable(Table_Summary,caption = "Files Summary")
Files Summary

Files            Lines   SizeInMB       Words
en_US.blogs     899288   200.4242    37334131
en_US.news       77259   196.2775     2643969
en_US.twitter  2360148   159.3641    30373543
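
Since stringi is loaded above but not otherwise used, its word counter can serve as an optional cross-check of the space-split word counts, and nchar() gives true character counts; a brief sketch for the blogs file:

# Optional cross-check (sketch): stri_count_words() uses proper word boundaries,
# and nchar() counts characters rather than words.
words_blogs_check <- sum(stri_count_words(blogs_file))
chars_blogs       <- sum(nchar(blogs_file, type = "chars"))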

Data Sampling

Data sampling is performed by selecting 2% of the lines from each of the blogs, news, and twitter datasets and combining them into a new dataset named sampleData.

# set.seed() could be called first to make the sample reproducible
sampleData <- c(sample(blogs_file,   length(blogs_file)   * 0.02),
                sample(news_file,    length(news_file)    * 0.02),
                sample(twitter_file, length(twitter_file) * 0.02))
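
Optionally, the sample can be written to disk so that later runs reuse exactly the same lines:

# Optional: persist the sample for reuse across sessions.
writeLines(sampleData, "sampleData.txt")
# sampleData <- readLines("sampleData.txt", warn = FALSE, encoding = "UTF-8")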

Clean and Build the Corpus

Performing data cleansing on the sample:
1) Convert all text to lowercase
2) Remove punctuation
3) Remove numbers
4) Strip extra whitespace
5) Remove English stop words

Removing non-English words and profanity is deferred; a profanity-filter sketch is included after the code below.

library(tm)
library(NLP)
corpus <- VCorpus(VectorSource(sampleData))
corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase while keeping the document structure
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
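
Profanity filtering is not part of the cleaning code above; a minimal sketch of how it could be added with the same removeWords transformation is shown here, assuming a plain-text word list (the file name profanity_words.txt is a placeholder):

# Sketch only: remove profanity with tm::removeWords, given a word list
# (profanity_words.txt is a hypothetical file name).
profanity <- readLines("profanity_words.txt", warn = FALSE, encoding = "UTF-8")
corpus <- tm_map(corpus, removeWords, profanity)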

Ngram Tokenization

In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n items from a given sample of text or speech. N-gram tokenization is performed by applying the NGramTokenizer function from the RWeka package to create:
1) Unigram
2) Bigrams
3) Trigrams

library(RWeka)
UniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiGramsTokenizer  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriGramsTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

UniGram  <- TermDocumentMatrix(corpus, control = list(tokenize = UniGramTokenizer))
BiGrams  <- TermDocumentMatrix(corpus, control = list(tokenize = BiGramsTokenizer))
TriGrams <- TermDocumentMatrix(corpus, control = list(tokenize = TriGramsTokenizer))
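
The bigram and trigram term-document matrices can become large even for a 2% sample. As an optional step (not applied here), very sparse terms could be dropped with tm's removeSparseTerms; the sparsity threshold below is an assumption:

# Optional sketch: drop terms that occur in almost no documents to shrink the matrices.
BiGrams_small  <- removeSparseTerms(BiGrams, 0.9999)
TriGrams_small <- removeSparseTerms(TriGrams, 0.9999)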

Exploratory Data Analysis on the N-grams

Finding the top 10 most frequent terms and visualizing them with bar charts for each of the N-grams:
1) Unigram
2) Bigrams
3) Trigrams

Unigram

library(ggplot2)
UniGram_num  <-findFreqTerms(UniGram,lowfreq=1000)
UniGram_num_freq <- rowSums(as.matrix(UniGram[UniGram_num,]))
UniGram_num_freq <- data.frame(UniGram=names(UniGram_num_freq), Frequency=UniGram_num_freq)
UniGram_num_freq <- UniGram_num_freq[order(-UniGram_num_freq$Frequency),]
kable(head(UniGram_num_freq,10),caption = "Top 10 frequencies for the Unigram")
Top 10 frequencies for the Unigram

UniGram   Frequency
said           5300
will           4990
one            4558
just           4066
like           3751
can            3624
time           3278
get            2936
new            2817
now            2456
UniGram_plot <-ggplot(UniGram_num_freq[1:10,],aes(x=reorder(UniGram, Frequency),y=Frequency,fill=Frequency))
UniGram_plot <- UniGram_plot + geom_bar(stat="identity") + coord_flip() + 
                xlab("Unigram") + ylab("Frequency") + labs(title = "Top 10 frequencies for the Unigrams")
UniGram_plot

Bigrams

BiGrams_num  <-findFreqTerms(BiGrams,lowfreq=80)
BiGrams_num_freq <- rowSums(as.matrix(BiGrams[BiGrams_num,]))
BiGrams_num_freq <- data.frame(BiGrams=names(BiGrams_num_freq), Frequency=BiGrams_num_freq)
BiGrams_num_freq <- BiGrams_num_freq[order(-BiGrams_num_freq$Frequency),]
kable(head(BiGrams_num_freq,10),caption = "Top 10 frequencies for the Bigrams")
Top 10 frequencies for the Bigrams

BiGrams        Frequency
last year            320
new york             306
right now            293
high school          264
years ago            243
last week            198
first time           191
dont know            183
st louis             174
last night           162
BiGrams_plot <-ggplot(BiGrams_num_freq[1:10,],aes(x=reorder(BiGrams, Frequency),y=Frequency,fill=Frequency))
BiGrams_plot <- BiGrams_plot + geom_bar(stat="identity") + coord_flip() + 
  xlab("BiGrams") + ylab("Frequency") + labs(title = "Top 10 frequencies for the BiGrams")
BiGrams_plot

Trigrams

TriGrams_num  <-findFreqTerms(TriGrams,lowfreq=10)
TriGrams_num_freq <- rowSums(as.matrix(TriGrams[TriGrams_num,]))
TriGrams_num_freq <- data.frame(TriGrams=names(TriGrams_num_freq), Frequency=TriGrams_num_freq)
TriGrams_num_freq <- TriGrams_num_freq[order(-TriGrams_num_freq$Frequency),]
kable(head(TriGrams_num_freq,10),caption = "Top 10 frequencies for the Trigrams")
Top 10 frequencies for the Trigrams

TriGrams                 Frequency
new york city                   38
happy mothers day               24
cant wait see                   23
president barack obama          23
two years ago                   23
dont even know                  21
world war ii                    21
happy new year                  20
let us know                     19
cinco de mayo                   17
TriGrams_plot <-ggplot(TriGrams_num_freq[1:10,],aes(x=reorder(TriGrams, Frequency),y=Frequency,fill=Frequency))
TriGrams_plot <- TriGrams_plot + geom_bar(stat="identity") + coord_flip() + 
  xlab("TriGrams") + ylab("Frequency") + labs(title = "Top 10 frequencies for the TriGrams")
TriGrams_plot

Findings

  1. A 2% sample was drawn from each of the original datasets to keep the analysis efficient.
  2. English stop words were removed during cleaning to improve the quality of the frequency counts; a profanity filter can be added in the same way (see the sketch in the corpus section).
  3. In the Bigrams and Trigrams, the phrase "new york" ranks among the most frequent bigrams and "new york city" is the most frequent trigram.

However, the sample represents only 2% of the original datasets; more data will be needed to train the prediction model.

Planning

Next, an n-gram based prediction model will be built and deployed as a Shiny app that takes an input phrase and predicts the next word.
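
As a preview of that model (a minimal sketch only, not the final implementation), the frequency tables built above can already drive a naive next-word lookup: search the trigrams for the last two words of the input, then back off to the bigrams. The function name predict_next_word is a placeholder.

# Minimal sketch of a next-word lookup based on the frequency tables above.
# Illustration only; the final model will use a fuller back-off scheme.
predict_next_word <- function(phrase) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  # Try trigrams whose first two words match the end of the input phrase
  tri_hits <- TriGrams_num_freq[grepl(paste0("^", paste(words, collapse = " "), " "),
                                      TriGrams_num_freq$TriGrams), ]
  if (nrow(tri_hits) > 0) {
    return(tail(strsplit(as.character(tri_hits$TriGrams[1]), " ")[[1]], 1))
  }
  # Back off to bigrams whose first word matches the last input word
  bi_hits <- BiGrams_num_freq[grepl(paste0("^", tail(words, 1), " "),
                                    BiGrams_num_freq$BiGrams), ]
  if (nrow(bi_hits) > 0) {
    return(tail(strsplit(as.character(bi_hits$BiGrams[1]), " ")[[1]], 1))
  }
  NA_character_
}

# Example call (the result depends on the sampled data):
# predict_next_word("new york")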