knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringi)
library(knitr)
library(ggplot2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(ngram)
library(NLP)
library(tesseract)
library(corpus)
library(wordcloud)
## Loading required package: RColorBrewer
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(RWeka)
## Download the dataset and unzip it.
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                "Coursera-SwiftKey.zip")
}
unzip("Coursera-SwiftKey.zip")
## Read the blogs, news, and twitter data from the English (en_US) dataset into R.
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding="UTF-8", skipNul=TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding="UTF-8", skipNul=TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul =
## TRUE): incomplete final line found on 'final/en_US/en_US.news.txt'
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding="UTF-8", skipNul=TRUE)
## Find the size of each file.
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
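## The same sizes could also be computed in one call (an optional, equivalent sketch):
sapply(c("blogs", "news", "twitter"),
       function(f) file.info(sprintf("final/en_US/en_US.%s.txt", f))$size / 1024^2)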
## Find the word count in each file.
blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
## Find the line count in each file.
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
total_lines <- length(blogs)+length(news)+length(twitter)
## Find the total character count and characters per line in each file.
### Characters per line
blogs_nchar <- nchar(blogs)
news_nchar <- nchar(news)
twitter_nchar <- nchar(twitter)
### Total number of characters
blogs_nchar_tot <- sum(blogs_nchar)
news_nchar_tot <- sum(news_nchar)
twitter_nchar_tot <- sum(twitter_nchar)
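## Optional: a quick look at the distribution of characters per line in each file
## (illustrative summaries, not used further below).
summary(blogs_nchar)
summary(news_nchar)
summary(twitter_nchar)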
## Basic Summary
data_summary <- data.frame(data_filename = c("blogs", "news", "twitter"),
                           file_size_MB = c(blogs_size, news_size, twitter_size),
                           line_count = c(length(blogs), length(news), length(twitter)),
                           words_count = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
                           char_count = c(blogs_nchar_tot, news_nchar_tot, twitter_nchar_tot),
                           wordsperlinemean = c(mean(blogs_words), mean(news_words), mean(twitter_words)))
## Create a table of the data_summary.
kable(data_summary, caption ="Data Summary (en_US)")
| data_filename | file_size_MB | line_count | words_count | char_count | wordsperlinemean |
|---|---|---|---|---|---|
| blogs | 200.4242 | 899288 | 37546239 | 206824505 | 41.75107 |
| news | 196.2775 | 77259 | 2674536 | 15639408 | 34.61779 |
| twitter | 159.3641 | 2360148 | 30093413 | 162096241 | 12.75065 |
## Set the seed for reproducibility.
set.seed(1000)
## Create a sample containing 1% of the lines from each file.
sample_data <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
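## Optional sketch: write the sample to disk so it can be reused without
## re-reading the full files. The file name "sample_data.txt" is an assumption.
writeLines(sample_data, "sample_data.txt")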
## Create a corpus and print it.
clean_corpus <- VCorpus(VectorSource(sample_data))
print(clean_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 33365
## Clean the newly created corpus: convert to lowercase, then remove English stopwords, punctuation, numbers, and extra whitespace.
clean_corpus <- tm_map(clean_corpus, content_transformer(tolower))
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("en"))
clean_corpus <- tm_map(clean_corpus, removePunctuation)
clean_corpus <- tm_map(clean_corpus, removeNumbers)
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
clean_corpus <- tm_map(clean_corpus, PlainTextDocument)
print(clean_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 33365
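## Optional extra cleaning step (a sketch, not in the original pipeline): a
## regex-based transformer that strips URLs could be applied in the same way.
removeURL <- function(x) gsub("(f|ht)tps?://\\S+", "", x)
## e.g. clean_corpus <- tm_map(clean_corpus, content_transformer(removeURL))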
## Set mc.cores = 1 so tokenization runs single-threaded; the Java-based RWeka
## tokenizers can fail when tm parallelizes over forked workers.
options(mc.cores=1)
## Define a function that computes term frequencies from a term-document matrix
## and returns them as a data frame sorted in decreasing order of frequency.
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
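## Illustrative usage on a hypothetical two-document toy corpus (not part of the
## analysis): the most frequent terms appear first in the returned data frame.
toy_tdm <- TermDocumentMatrix(VCorpus(VectorSource(c("the cat sat", "the dog sat"))))
head(getFreq(toy_tdm))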
## Define RWeka tokenizer functions for bigrams and trigrams.
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
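## Quick illustrative check of the tokenizers on a hypothetical sentence:
## bigram() returns overlapping two-word chunks (e.g. "to be", "be or", ...)
## and trigram() returns three-word chunks.
bigram("to be or not to be")
trigram("to be or not to be")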
## Create term-document matrices from the clean corpus to obtain unigram, bigram,
## and trigram frequencies, and remove very sparse terms from each matrix.
dtm_corpus <- TermDocumentMatrix(clean_corpus)
dtm_corpus
## <<TermDocumentMatrix (terms: 45528, documents: 33365)>>
## Non-/sparse entries: 341824/1518699896
## Sparsity : 100%
## Maximal term length: 250
## Weighting : term frequency (tf)
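## Optional peek (a sketch): list terms that occur at least 1000 times in the
## sample; the threshold of 1000 is an arbitrary illustration.
findFreqTerms(dtm_corpus, lowfreq = 1000)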
freq1 <- getFreq(removeSparseTerms(dtm_corpus, 0.9999))
dtm_bigram <- TermDocumentMatrix(clean_corpus, control = list(tokenize = bigram))
dtm_bigram
## <<TermDocumentMatrix (terms: 293610, documents: 33365)>>
## Non-/sparse entries: 343546/9795954104
## Sparsity : 100%
## Maximal term length: 172
## Weighting : term frequency (tf)
freq2 <- getFreq(removeSparseTerms(dtm_bigram, 0.9999))
dtm_trigram <- TermDocumentMatrix(clean_corpus, control = list(tokenize = trigram))
dtm_trigram
## <<TermDocumentMatrix (terms: 311093, documents: 33365)>>
## Non-/sparse entries: 313338/10379304607
## Sparsity : 100%
## Maximal term length: 181
## Weighting : term frequency (tf)
freq3 <- getFreq(removeSparseTerms(dtm_trigram, 0.9999))
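## Optional sketch: save the n-gram frequency tables for the later prediction
## model. The .rds file names are assumptions.
saveRDS(freq1, "unigram_freq.rds")
saveRDS(freq2, "bigram_freq.rds")
saveRDS(freq3, "trigram_freq.rds")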
## Define a function that draws a bar chart of the ten most frequent n-grams.
makePlot <- function(data, label, color) {
  ggplot(data[1:10, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I(color))
}
## Use the plotting function to show the top 10 most frequent unigrams, bigrams, and trigrams.
makePlot(freq1, "Top 10 Frequent Unigrams","red")
makePlot(freq2, "Top 10 Frequent Bigrams","pink")
makePlot(freq3, "Top 10 Frequent Trigrams", "yellow")