The goal of this milestone report is to demonstrate familiarity with the data and to show progress toward building the prediction algorithm.
library(stringi)
library(knitr)
setwd("D:/KAW_DOC/CERTS/Coursera/Coursera_Courses/10_DataScience_Capstone/final/en_US")
blogs_file <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news_file <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter_file <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
The table below shows summary statistics (line count, file size in MB, and word count) for each file:
# File size (MB), line count, and word count for each dataset
size_blogs <- file.info("en_US.blogs.txt")$size/(1024^2)
len_blogs <- length(blogs_file)
words_blogs <- sum(sapply(strsplit(blogs_file, " "), length))
size_news <- file.info("en_US.news.txt")$size/(1024^2)
len_news <- length(news_file)
words_news <- sum(sapply(strsplit(news_file, " "), length))
size_twitter <- file.info("en_US.twitter.txt")$size/(1024^2)
len_twitter <- length(twitter_file)
words_twitter <- sum(sapply(strsplit(twitter_file, " "), length))
Table_Summary <- data.frame(
  Files = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  Lines = c(len_blogs, len_news, len_twitter),
  SizeInMB = c(size_blogs, size_news, size_twitter),
  Words = c(words_blogs, words_news, words_twitter)
)
kable(Table_Summary, caption = "Files Summary")
| Files | Lines | SizeInMB | Words |
|---|---|---|---|
| en_US.blogs | 899288 | 200.4242 | 37334131 |
| en_US.news | 77259 | 196.2775 | 2643969 |
| en_US.twitter | 2360148 | 159.3641 | 30373543 |
Sample 2% of the lines from each of the blogs, news, and Twitter datasets and combine them into a new dataset named sampleData:
set.seed(1234)  # make the sampling reproducible
sampleData <- c(sample(blogs_file, length(blogs_file) * 0.02),
                sample(news_file, length(news_file) * 0.02),
                sample(twitter_file, length(twitter_file) * 0.02))
Perform data cleansing on the sample by:
1) Converting all text to lowercase
2) Removing punctuation
3) Removing numbers
4) Stripping extra whitespace
5) Removing English stop words
library(tm)
library(NLP)
corpus <- VCorpus(VectorSource(sampleData))
# Wrap base R functions in content_transformer() so the documents remain
# PlainTextDocuments after each transformation
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
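To sanity-check the cleansing, one document can be compared before and after the transformations; the index 1 below is an arbitrary choice, not part of the analysis:
# Spot-check: compare a raw sampled line with its cleaned counterpart
sampleData[1]
as.character(corpus[[1]])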
In Natural Language Processing (NLP), an n-gram is a contiguous sequence of n items from a given sample of text or speech. N-gram tokenization is performed with the NGramTokenizer function from the RWeka package to create:
1) Unigrams
2) Bigrams
3) Trigrams
library(RWeka)
# Tokenizers that split text into n-grams of fixed length n
UniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiGramsTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriGramsTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Build one term-document matrix per n-gram length
UniGram <- TermDocumentMatrix(corpus, control = list(tokenize = UniGramTokenizer))
BiGrams <- TermDocumentMatrix(corpus, control = list(tokenize = BiGramsTokenizer))
TriGrams <- TermDocumentMatrix(corpus, control = list(tokenize = TriGramsTokenizer))
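As a quick check before computing frequencies, the dimensions of each matrix show how many distinct n-grams were extracted from the sample (output not shown here):
# Rows = distinct n-grams, columns = documents in the sample
dim(UniGram)
dim(BiGrams)
dim(TriGrams)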
Find the 10 most frequent terms in each n-gram set and visualize them with bar charts:
1) Unigrams
2) Bigrams
3) Trigrams
library(ggplot2)
# Unigrams that appear at least 1000 times, sorted by frequency
UniGram_num <- findFreqTerms(UniGram, lowfreq = 1000)
UniGram_num_freq <- rowSums(as.matrix(UniGram[UniGram_num,]))
UniGram_num_freq <- data.frame(UniGram = names(UniGram_num_freq), Frequency = UniGram_num_freq)
UniGram_num_freq <- UniGram_num_freq[order(-UniGram_num_freq$Frequency),]
kable(head(UniGram_num_freq, 10), row.names = FALSE, caption = "Top 10 frequencies for the Unigrams")
| UniGram | Frequency |
|---|---|
| said | 5300 |
| will | 4990 |
| one | 4558 |
| just | 4066 |
| like | 3751 |
| can | 3624 |
| time | 3278 |
| get | 2936 |
| new | 2817 |
| now | 2456 |
UniGram_plot <- ggplot(UniGram_num_freq[1:10,], aes(x = reorder(UniGram, Frequency), y = Frequency, fill = Frequency)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Unigram") + ylab("Frequency") + labs(title = "Top 10 frequencies for the Unigrams")
UniGram_plot
# Bigrams that appear at least 80 times, sorted by frequency
BiGrams_num <- findFreqTerms(BiGrams, lowfreq = 80)
BiGrams_num_freq <- rowSums(as.matrix(BiGrams[BiGrams_num,]))
BiGrams_num_freq <- data.frame(BiGrams = names(BiGrams_num_freq), Frequency = BiGrams_num_freq)
BiGrams_num_freq <- BiGrams_num_freq[order(-BiGrams_num_freq$Frequency),]
kable(head(BiGrams_num_freq, 10), row.names = FALSE, caption = "Top 10 frequencies for the Bigrams")
| BiGrams | Frequency |
|---|---|
| last year | 320 |
| new york | 306 |
| right now | 293 |
| high school | 264 |
| years ago | 243 |
| last week | 198 |
| first time | 191 |
| dont know | 183 |
| st louis | 174 |
| last night | 162 |
BiGrams_plot <- ggplot(BiGrams_num_freq[1:10,], aes(x = reorder(BiGrams, Frequency), y = Frequency, fill = Frequency)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("BiGrams") + ylab("Frequency") + labs(title = "Top 10 frequencies for the BiGrams")
BiGrams_plot
# Trigrams that appear at least 10 times, sorted by frequency
TriGrams_num <- findFreqTerms(TriGrams, lowfreq = 10)
TriGrams_num_freq <- rowSums(as.matrix(TriGrams[TriGrams_num,]))
TriGrams_num_freq <- data.frame(TriGrams = names(TriGrams_num_freq), Frequency = TriGrams_num_freq)
TriGrams_num_freq <- TriGrams_num_freq[order(-TriGrams_num_freq$Frequency),]
kable(head(TriGrams_num_freq, 10), row.names = FALSE, caption = "Top 10 frequencies for the Trigrams")
| TriGrams | Frequency |
|---|---|
| new york city | 38 |
| happy mothers day | 24 |
| cant wait see | 23 |
| president barack obama | 23 |
| two years ago | 23 |
| dont even know | 21 |
| world war ii | 21 |
| happy new year | 20 |
| let us know | 19 |
| cinco de mayo | 17 |
TriGrams_plot <- ggplot(TriGrams_num_freq[1:10,], aes(x = reorder(TriGrams, Frequency), y = Frequency, fill = Frequency)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("TriGrams") + ylab("Frequency") + labs(title = "Top 10 frequencies for the TriGrams")
TriGrams_plot
However, the sample represents only 2% of the original datasets; building the prediction model will require training on a larger sample. As the next step, a Shiny app will be created that takes an input phrase and predicts the next word.
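As a preview of how the frequency tables above could drive that prediction, below is a minimal sketch of a next-word lookup with a simple backoff from trigrams to bigrams. The function name predict_next_word is an illustrative assumption, and the lookup uses the small sampled tables built above rather than the final model's tables:
# Illustrative sketch (not the final model): predict the next word by
# matching the input's last two words against the trigram table, then
# backing off to the bigram table if there is no trigram match.
predict_next_word <- function(phrase) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  # Try trigrams: match "w1 w2 ?" against the first two words of each trigram
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- TriGrams_num_freq[startsWith(as.character(TriGrams_num_freq$TriGrams),
                                         paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      best <- as.character(hits$TriGrams[which.max(hits$Frequency)])
      return(tail(strsplit(best, " ")[[1]], 1))
    }
  }
  # Back off to bigrams: match "w ?" against the first word of each bigram
  hits <- BiGrams_num_freq[startsWith(as.character(BiGrams_num_freq$BiGrams),
                                      paste0(words[n], " ")), ]
  if (nrow(hits) > 0) {
    best <- as.character(hits$BiGrams[which.max(hits$Frequency)])
    return(tail(strsplit(best, " ")[[1]], 1))
  }
  NA_character_  # no match found in the sampled tables
}

predict_next_word("happy new")  # should return "year" given the trigram table above
The Shiny app would wrap a more robust version of this lookup, trained on a larger sample and with smoothing, behind a text-input interface.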