Summary

This is the milestone report for the Capstone project of Coursera's Data Science Specialization. The end goal is to build a predictive text application from a text corpus, using Natural Language Processing (NLP) to analyze the data.

# Setting Libraries
library(tm)
library(RColorBrewer)
library(wordcloud)
library(RWeka)
library(NLP)
library(ggplot2)
library(dplyr)
library(stringi)

# Downloading and getting data
# Setting Working Directory
setwd("/Users/Amit/Documents/Hadoop/Coursera/Data Science1/10. Capstone/Week 2")

# Download and extract zip file from Coursera: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
# (Already downloaded, so commented to save time)
# download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip","/Users/Amit/Documents/Hadoop/Coursera/Data Science1/10. Capstone/Week 2/Coursera")
# unzip("Coursera", exdir = "./")

# Reading the blogs, news, and twitter files as UTF-8 text, skipping embedded nulls
Blogs <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
News <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE) 
Twitter <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE) 

# Getting file sizes in bytes
Blogs.size <- file.info("./final/en_US/en_US.blogs.txt")$size
News.size <- file.info("./final/en_US/en_US.news.txt")$size
Twitter.size <- file.info("./final/en_US/en_US.twitter.txt")$size

# Counting the number of words in each dataset (stringi is already loaded above)
Blogs_words <- stri_count_words(Blogs)
News_words <- stri_count_words(News)
Twitter_words <- stri_count_words(Twitter)

# Summary of the three datasets (file sizes are in bytes)
data.frame(source = c("Blogs", "News", "Twitter"),
           file.size.bytes = c(Blogs.size, News.size, Twitter.size),
           num.lines = c(length(Blogs), length(News), length(Twitter)),
           num.words = c(sum(Blogs_words), sum(News_words), sum(Twitter_words)),
           mean.num.words = c(mean(Blogs_words), mean(News_words), mean(Twitter_words)))
##    source file.size.bytes num.lines num.words mean.num.words
## 1   Blogs       210160014    899288  37546246       41.75108
## 2    News       205811889   1010242  34762395       34.40997
## 3 Twitter       167105338   2360148  30093410       12.75065

Cleaning Data

# Let's clean the dataset first by removing URLs, Twitter handles, numbers, punctuation, stopwords, and extra whitespace, and by converting the text to lower case. We will work with a 1% sample of the data to keep processing time manageable.

set.seed(1234)

# Blog Sample cleaning
Data_sample_Blogs <- sample(Blogs, round(length(Blogs) * 0.01))
Corpus_Blogs <- VCorpus(VectorSource(Data_sample_Blogs))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
Corpus_Blogs <- tm_map(Corpus_Blogs, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
Corpus_Blogs <- tm_map(Corpus_Blogs, toSpace, "@[^\\s]+")
Corpus_Blogs <- tm_map(Corpus_Blogs, content_transformer(tolower))
Corpus_Blogs <- tm_map(Corpus_Blogs, removeWords, stopwords("en"))
Corpus_Blogs <- tm_map(Corpus_Blogs, removePunctuation)
Corpus_Blogs <- tm_map(Corpus_Blogs, removeNumbers)
Corpus_Blogs <- tm_map(Corpus_Blogs, stripWhitespace)

# News Sample cleaning
Data_sample_News <- sample(News, round(length(News) * 0.01))
Corpus_News <- VCorpus(VectorSource(Data_sample_News))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
Corpus_News <- tm_map(Corpus_News, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
Corpus_News <- tm_map(Corpus_News, toSpace, "@[^\\s]+")
Corpus_News <- tm_map(Corpus_News, content_transformer(tolower))
Corpus_News <- tm_map(Corpus_News, removeWords, stopwords("en"))
Corpus_News <- tm_map(Corpus_News, removePunctuation)
Corpus_News <- tm_map(Corpus_News, removeNumbers)
Corpus_News <- tm_map(Corpus_News, stripWhitespace)

# Twitter Sample Cleaning
Data_sample_Twitter <- sample(Twitter, round(length(Twitter) * 0.01))
Corpus_Twitter <- VCorpus(VectorSource(Data_sample_Twitter))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
Corpus_Twitter <- tm_map(Corpus_Twitter, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
Corpus_Twitter <- tm_map(Corpus_Twitter, toSpace, "@[^\\s]+")
Corpus_Twitter <- tm_map(Corpus_Twitter, content_transformer(tolower))
Corpus_Twitter <- tm_map(Corpus_Twitter, removeWords, stopwords("en"))
Corpus_Twitter <- tm_map(Corpus_Twitter, removePunctuation)
Corpus_Twitter <- tm_map(Corpus_Twitter, removeNumbers)
Corpus_Twitter <- tm_map(Corpus_Twitter, stripWhitespace)

# Combined sample from all three sources
Data_sample <- c(Data_sample_Blogs, Data_sample_News, Data_sample_Twitter)
Corpus <- VCorpus(VectorSource(Data_sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
Corpus <- tm_map(Corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
Corpus <- tm_map(Corpus, toSpace, "@[^\\s]+")
Corpus <- tm_map(Corpus, content_transformer(tolower))
Corpus <- tm_map(Corpus, removeWords, stopwords("en"))
Corpus <- tm_map(Corpus, removePunctuation)
Corpus <- tm_map(Corpus, removeNumbers)
Corpus <- tm_map(Corpus, stripWhitespace)

Exploratory Data Analysis

# Let's look at the most frequently occurring words in each corpus using word clouds

# Top 100 words in Blogs
wordcloud(Corpus_Blogs, max.words = 100, min.freq = 50, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8,"Dark2"))
text(x=0.5, y=0, "Corpus_Blogs")

# Top 100 words in News
wordcloud(Corpus_News, max.words = 100, min.freq = 50, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8,"Dark2"))
text(x=0.5, y=0, "Corpus_News")

# Top 100 words in Twitter
wordcloud(Corpus_Twitter, max.words = 100, min.freq = 50, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(6,"Dark2"))
text(x=0.5, y=0, "Corpus_Twitter")

# Top 200 words in all
wordcloud(Corpus, max.words = 200, min.freq = 50, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(6,"Dark2"))
text("Corpus_all")

# Let's plot unigram, bigram, trigram, and 4-gram frequencies using helper functions
options(mc.cores=1)

getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
fgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))

# Using top 30 words 
makePlot <- function(data, label) {
  ggplot(data[1:30, ], aes(reorder(word, -freq), freq)) +
    geom_bar(stat = "identity", fill = I("darkblue")) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1))
}

# Get frequencies of most common n-grams in data sample
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(Corpus), 0.9999))
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(Corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(Corpus, control = list(tokenize = trigram)), 0.9999))
freq4 <- getFreq(removeSparseTerms(TermDocumentMatrix(Corpus, control = list(tokenize = fgram)), 0.9999))

# Histogram of the 30 most common unigrams.
makePlot(freq1, "30 Most Common Unigrams")

# Histogram of the 30 most common bigrams.
makePlot(freq2, "30 Most Common Bigrams")

# Histogram of the 30 most common trigrams.
makePlot(freq3, "30 Most Common Trigrams")

# Histogram of the 30 most common 4grams.
makePlot(freq4, "30 Most Common 4grams")

Plan for Creating Prediction Algorithm and Application

We have completed the exploratory analysis. The next steps are to build the prediction algorithm and deploy it as a Shiny application.

Based on our findings in the exploratory analysis, we should be able to predict the next word from the words that precede it: given the previous three words we can look up the most frequent matching 4-gram, and back off to the trigram or bigram tables when no longer match is found. A rough sketch of this backoff idea is shown below.
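
The sketch below is only illustrative. It assumes the freq2, freq3, and freq4 data frames computed above (each row holding a space-separated n-gram and its frequency, already sorted by frequency), and the helper name predict_next is hypothetical, not the final algorithm.

# Hypothetical sketch of a simple backoff predictor built on the n-gram
# frequency tables above; the function name and details are illustrative only.
predict_next <- function(phrase, freq4, freq3, freq2) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 3)

  lookup <- function(freq, context) {
    # Rows are sorted by frequency, so the first n-gram whose leading words
    # match the context gives the most likely next word.
    hits <- freq[startsWith(as.character(freq$word), paste0(context, " ")), ]
    if (nrow(hits) == 0) return(NULL)
    tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1)
  }

  # Try 4-grams (three words of context) first, then back off to shorter n-grams
  if (length(words) >= 3) {
    res <- lookup(freq4, paste(words, collapse = " "))
    if (!is.null(res)) return(res)
  }
  if (length(words) >= 2) {
    res <- lookup(freq3, paste(tail(words, 2), collapse = " "))
    if (!is.null(res)) return(res)
  }
  lookup(freq2, tail(words, 1))
}

# Example call (hypothetical): predict_next("happy new", freq4, freq3, freq2)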

We will build a Shiny app that takes the user's phrase as input and returns the predicted next word produced by our algorithm.
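
As a bare-bones starting point for deployment, a Shiny skeleton might look like the sketch below. The layout and labels are placeholders, and it assumes a predict_next() function such as the hypothetical one sketched above, plus the n-gram frequency tables, are available in the app's environment.

# Minimal Shiny skeleton (placeholders only); assumes predict_next() and the
# n-gram frequency tables are loaded in the app's environment.
library(shiny)

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  h4("Predicted next word:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(trimws(input$phrase)) == 0) return("")
    predict_next(input$phrase, freq4, freq3, freq2)
  })
}

shinyApp(ui = ui, server = server)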