This report summarizes the exploratory analysis of the data that will feed the eventual app and algorithm (a predictive model for the most likely next word in a sequence of words).
Download the data from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip and unzip it.
# Set working directory
setwd("C:/Coursera")
# Download the data if not already available, then unzip it
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}
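The archive unpacks into final/<locale> folders; a quick, purely illustrative check that the English files are in place:
list.files("final/en_US")  # should list en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt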
blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
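Note: on some systems readLines() can stop early on en_US.news.txt because of an embedded control character. If the news line count below looks low, reading the file through a binary connection is a common workaround (a sketch, using the same path and options as above):
con <- file("final/en_US/en_US.news.txt", open = "rb")  # binary mode avoids the early stop
news <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)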
Data overview for each file: object size in memory (MB), number of lines (entries), total characters, and the length of the longest line.
summary <- data.frame(
  "FileName" = c("Blogs", "News", "Twitter"),
  "File Size" = sapply(list(blogs, news, twitter), function(x) format(object.size(x), "MB")),
  "Line" = sapply(list(blogs, news, twitter), length),
  "TotalCharacters" = sapply(list(blogs, news, twitter), function(x) sum(nchar(x))),
  "MaxCharacters" = sapply(list(blogs, news, twitter), function(x) max(nchar(x)))
)
summary
## FileName File.Size Line TotalCharacters MaxCharacters
## 1 Blogs 248.5 Mb 899288 206824505 40833
## 2 News 19.2 Mb 77259 15639408 5760
## 3 Twitter 301.4 Mb 2360148 162096031 140
The data sets are large, so we take a 1% random sample from each file and convert the combined sample into a corpus.
set.seed(7890)  # for reproducibility
sample_size <- 0.01  # 1% sample from each file
sample_data <- c(sample(blogs, length(blogs) * sample_size),
                 sample(news, length(news) * sample_size),
                 sample(twitter, length(twitter) * sample_size))
# Remove non-ASCII characters
sample_data <- iconv(sample_data, "latin1", "ASCII", sub = "")
library(tm) # Load Text Mining library
# Build a corpus from the sampled data
corpus <- VCorpus(VectorSource(sample_data))
# Clean the corpus data
corpus <- tm_map(corpus, removePunctuation) # Remove punctuation
corpus <- tm_map(corpus, stripWhitespace) # Remove unnecessary white space
corpus <- tm_map(corpus, content_transformer(tolower)) # Convert to lowercase
corpus <- tm_map(corpus, removeNumbers) # Remove numbers
corpus <- tm_map(corpus, PlainTextDocument) # Plain text
A thorough exploratory analysis of the data is performed to understand the distribution of words and the relationships between words in the corpora.
The cleaned sample data is converted into N-grams using the RWeka package. The N-gram representation of a text lists all N-tuples of words that appear in it. The simplest case is the unigram (single words), followed by the bigram (pairs of adjacent words), the trigram (triples of adjacent words), and so on.
library(RWeka) # Weka is a collection of machine learning algorithms for data mining
unigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
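As a quick, purely illustrative check of what these tokenizers produce on a toy sentence:
bigram_tokenizer("the quick brown fox")  # expected to yield the pairs "the quick", "quick brown", "brown fox"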
Unigrams <- TermDocumentMatrix(corpus, control = list(tokenize = unigram_tokenizer))
Bigrams <- TermDocumentMatrix(corpus, control = list(tokenize = bigram_tokenizer))
Trigrams <- TermDocumentMatrix(corpus, control = list(tokenize = trigram_tokenizer))
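A TermDocumentMatrix stores terms as rows and the sampled documents as columns, which is why the row sums below give total term frequencies; a quick dimension check (illustrative only):
dim(Unigrams)  # number of distinct unigrams x number of documents in the sample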
# Exclude words/phrases that appear fewer than 100 times in the sample
unigrams_corpus <- findFreqTerms(Unigrams, lowfreq = 100)
bigrams_corpus <- findFreqTerms(Bigrams, lowfreq = 100)
trigrams_corpus <- findFreqTerms(Trigrams, lowfreq = 100)
unigrams_freq <- rowSums(as.matrix(Unigrams[unigrams_corpus,]))
unigrams_freq <- data.frame(word = names(unigrams_freq), frequency = unigrams_freq)
bigrams_freq <- rowSums(as.matrix(Bigrams[bigrams_corpus,]))
bigrams_freq <- data.frame(word = names(bigrams_freq), frequency = bigrams_freq)
trigrams_freq <- rowSums(as.matrix(Trigrams[trigrams_corpus,]))
trigrams_freq <- data.frame(word = names(trigrams_freq), frequency = trigrams_freq)
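As an optional, rough look at the word distribution beyond the bar charts, the frequent-unigram table built above can be used to sketch coverage (an approximation only, since it ignores words below the 100-count cutoff):
cum_cover <- cumsum(sort(unigrams_freq$frequency, decreasing = TRUE)) / sum(unigrams_freq$frequency)
which(cum_cover >= 0.5)[1]  # number of frequent words needed to cover half of their combined occurrences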
Plot the top 20 most frequent terms as bar charts for the unigrams, bigrams, and trigrams.
library(ggplot2)
plot_ngrams <- function(data, chart_title, top) {
  df <- data[order(-data$frequency), ][1:top, ]
  ggplot(df, aes(x = reorder(word, -frequency), y = frequency)) +
    geom_bar(stat = "identity") +
    ggtitle(paste("Top", top, chart_title)) +
    xlab("Words") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
plot_ngrams(unigrams_freq, "One-Word Usage", 20)
plot_ngrams(bigrams_freq, "Two-Word Usage", 20)
plot_ngrams(trigrams_freq, "Three-Word Usage", 20)