# Loading Libraries
library(tm)
library(RColorBrewer)
library(wordcloud)
library(RWeka)
library(NLP)
library(ggplot2)
library(dplyr)
library(stringi)
# Downloading and loading the data
# Setting Working Directory
setwd("/Users/Amit/Documents/Hadoop/Coursera/Data Science1/10. Capstone/Week 2")
# Download and extract zip file from Coursera: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
# (Already downloaded, so commented to save time)
# download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip","/Users/Amit/Documents/Hadoop/Coursera/Data Science1/10. Capstone/Week 2/Coursera")
# unzip("Coursera", exdir = "./")
# Reading the blogs, news and Twitter files as UTF-8 text (skipping embedded nulls)
Blogs <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
News <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
Twitter <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Getting file sizes in bytes
Blogs.size <- file.info("./final/en_US/en_US.blogs.txt")$size
News.size <- file.info("./final/en_US/en_US.news.txt")$size
Twitter.size <- file.info("./final/en_US/en_US.twitter.txt")$size
# Counting the number of words in each dataset
Blogs_words <- stri_count_words(Blogs)
News_words <- stri_count_words(News)
Twitter_words <- stri_count_words(Twitter)
# Summary of the data sets
data.frame(source = c("Blogs", "News", "Twitter"),
           file.size.bytes = c(Blogs.size, News.size, Twitter.size),
           num.lines = c(length(Blogs), length(News), length(Twitter)),
           num.words = c(sum(Blogs_words), sum(News_words), sum(Twitter_words)),
           mean.num.words = c(mean(Blogs_words), mean(News_words), mean(Twitter_words)))
##    source file.size.bytes num.lines num.words mean.num.words
## 1   Blogs       210160014    899288  37546246       41.75108
## 2    News       205811889   1010242  34762395       34.40997
## 3 Twitter       167105338   2360148  30093410       12.75065
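# Note: file.info()$size is reported in bytes; for readability the sizes above could be
# converted to megabytes, for example:
round(c(Blogs = Blogs.size, News = News.size, Twitter = Twitter.size) / 1024^2, 1)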
# Let's clean the data first by removing URLs, @mentions, numbers, punctuation, stopwords
# and extra whitespace, and by converting everything to lower case. We will use a 1% sample
# of each dataset for this purpose.
set.seed(1234)  # for reproducible sampling
# Blog sample cleaning
Data_sample_Blogs <- sample(Blogs, length(Blogs) * 0.01)
Corpus_Blogs <- VCorpus(VectorSource(Data_sample_Blogs))
# Transformer that replaces every match of a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
Corpus_Blogs <- tm_map(Corpus_Blogs, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # URLs
Corpus_Blogs <- tm_map(Corpus_Blogs, toSpace, "@[^\\s]+")                      # @mentions
Corpus_Blogs <- tm_map(Corpus_Blogs, content_transformer(tolower))
Corpus_Blogs <- tm_map(Corpus_Blogs, removeWords, stopwords("en"))
Corpus_Blogs <- tm_map(Corpus_Blogs, removePunctuation)
Corpus_Blogs <- tm_map(Corpus_Blogs, removeNumbers)
Corpus_Blogs <- tm_map(Corpus_Blogs, stripWhitespace)
# News sample cleaning
Data_sample_News <- sample(News, length(News) * 0.01)
Corpus_News <- VCorpus(VectorSource(Data_sample_News))
Corpus_News <- tm_map(Corpus_News, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
Corpus_News <- tm_map(Corpus_News, toSpace, "@[^\\s]+")
Corpus_News <- tm_map(Corpus_News, content_transformer(tolower))
Corpus_News <- tm_map(Corpus_News, removeWords, stopwords("en"))
Corpus_News <- tm_map(Corpus_News, removePunctuation)
Corpus_News <- tm_map(Corpus_News, removeNumbers)
Corpus_News <- tm_map(Corpus_News, stripWhitespace)
# Twitter sample cleaning
Data_sample_Twitter <- sample(Twitter, length(Twitter) * 0.01)
Corpus_Twitter <- VCorpus(VectorSource(Data_sample_Twitter))
Corpus_Twitter <- tm_map(Corpus_Twitter, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
Corpus_Twitter <- tm_map(Corpus_Twitter, toSpace, "@[^\\s]+")
Corpus_Twitter <- tm_map(Corpus_Twitter, content_transformer(tolower))
Corpus_Twitter <- tm_map(Corpus_Twitter, removeWords, stopwords("en"))
Corpus_Twitter <- tm_map(Corpus_Twitter, removePunctuation)
Corpus_Twitter <- tm_map(Corpus_Twitter, removeNumbers)
Corpus_Twitter <- tm_map(Corpus_Twitter, stripWhitespace)
# Combined sample from all three sources
Data_sample <- c(Data_sample_Blogs, Data_sample_News, Data_sample_Twitter)
Corpus <- VCorpus(VectorSource(Data_sample))
Corpus <- tm_map(Corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
Corpus <- tm_map(Corpus, toSpace, "@[^\\s]+")
Corpus <- tm_map(Corpus, content_transformer(tolower))
Corpus <- tm_map(Corpus, removeWords, stopwords("en"))
Corpus <- tm_map(Corpus, removePunctuation)
Corpus <- tm_map(Corpus, removeNumbers)
Corpus <- tm_map(Corpus, stripWhitespace)
# Word clouds of the most frequently occurring words
# Top 100 words in Blogs
wordcloud(Corpus_Blogs, max.words = 100, min.freq = 50, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8,"Dark2"))
text(x=0.5, y=0, "Corpus_Blogs")
# Top 100 words in News
wordcloud(Corpus_News, max.words = 100, min.freq = 50, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8,"Dark2"))
text(x=0.5, y=0, "Corpus_News")
# Top 100 words in Twitter
wordcloud(Corpus_Twitter, max.words = 100, min.freq = 50, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(6,"Dark2"))
text(x=0.5, y=0, "Corpus_Twitter")
# Top 200 words in all
wordcloud(Corpus, max.words = 200, min.freq = 50, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(6,"Dark2"))
text("Corpus_all")
# Let's look at unigram, bigram, trigram and 4-gram frequencies using helper functions
options(mc.cores = 1)  # RWeka's Java-based tokenizers are not fork-safe, so stay single-core
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(freq), freq = freq)
}
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
fgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
# Bar chart of the 30 most frequent terms
makePlot <- function(data, label) {
  ggplot(data[1:30, ], aes(reorder(word, -freq), freq)) +
    geom_bar(stat = "identity", fill = I("darkblue")) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1))
}
# Get frequencies of most common n-grams in data sample
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(Corpus), 0.9999))
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(Corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(Corpus, control = list(tokenize = trigram)), 0.9999))
freq4 <- getFreq(removeSparseTerms(TermDocumentMatrix(Corpus, control = list(tokenize = fgram)), 0.9999))
# Bar chart of the 30 most common unigrams
makePlot(freq1, "30 Most Common Unigrams")
# Bar chart of the 30 most common bigrams
makePlot(freq2, "30 Most Common Bigrams")
# Bar chart of the 30 most common trigrams
makePlot(freq3, "30 Most Common Trigrams")
# Bar chart of the 30 most common 4-grams
makePlot(freq4, "30 Most Common 4-grams")
We have completed the exploratory analysis. The next steps are to build a predictive algorithm and deploy it as a Shiny app.
Based on the findings of the exploratory analysis, we should be able to predict the next word: use the observed 4-gram frequencies to predict a fourth word from the previous three, and back off to trigrams and bigrams when no match is found. A rough sketch of this backoff idea is shown below.
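As an illustration only, a minimal backoff lookup over the freq2, freq3 and freq4 tables built above might look like the sketch below. predict_next_word is a hypothetical helper, not part of the analysis above, and a usable predictor would need much larger n-gram tables than a 1% sample pruned with removeSparseTerms.
# Hypothetical sketch of a backoff next-word predictor, assuming the freq1..freq4
# data frames built above (columns: word = full n-gram, freq = count).
# Note: stopwords were removed from the corpora, so real input should be cleaned the same way.
predict_next_word <- function(phrase) {
  words <- stri_extract_all_words(stri_trans_tolower(phrase))[[1]]
  # Try 4-grams (last 3 words), then trigrams (last 2), then bigrams (last 1)
  for (t in list(list(freq4, 3), list(freq3, 2), list(freq2, 1))) {
    tbl <- t[[1]]
    k <- t[[2]]
    if (length(words) < k) next
    prefix <- paste(tail(words, k), collapse = " ")
    hits <- tbl[startsWith(as.character(tbl$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      best <- as.character(hits$word[which.max(hits$freq)])
      return(tail(strsplit(best, " ")[[1]], 1))  # last word of the best matching n-gram
    }
  }
  as.character(freq1$word[1])  # fall back to the most frequent unigram
}
# Example usage (output depends on the sampled data):
# predict_next_word("happy new")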
We will build a Shiny app that takes a phrase as input and returns the predicted next word from our algorithm; a minimal skeleton is sketched below.
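A possible skeleton for that input/output flow, assuming the hypothetical predict_next_word helper sketched above, is:
library(shiny)
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  verbatimTextOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(trimws(input$phrase)) == 0) return("")
    predict_next_word(input$phrase)  # hypothetical predictor sketched above
  })
}
shinyApp(ui = ui, server = server)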