This is the milestone report for week 2 of the Data Science Specialization capstone. The overall goal is to build a model that predicts the most likely next word when a user enters a word or phrase.
The goal of this assignment is to understand the dataset and perform exploratory data analysis on three text files: blogs, news and twitter. This analysis also helps us choose a prediction strategy later in the project.
setwd("F:/Knowledge/Coursera/Data Science Specalization/Capstone Project")
In this step, we download the data from the URL, unzip the contents and read the files into R.
if (!file.exists("Coursera-SwiftKey.zip")){
download.file(url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", destfile = "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip")
}
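Before reading the files, it is worth confirming that the archive unpacked where we expect it. A minimal check, added here for illustration and assuming the zip extracts to ./final/en_US as used below:
list.files("./final/en_US")  # should list en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt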
blogs <- readLines("./final/en_US/en_US.blogs.txt",
                   encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news <- readLines("./final/en_US/en_US.news.txt",
                  encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
twitter <- readLines("./final/en_US/en_US.twitter.txt",
                     encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
In this step, we generate file statistics such as size in memory, number of lines, characters and words.
library(stringi)
file_stats <- data.frame(
  FileName = c("Blogs", "News", "Twitter"),
  FileSize = sapply(list(blogs, news, twitter),
                    function(x) { format(object.size(x), "MB") }),
  t(rbind(sapply(list(blogs, news, twitter), stri_stats_general),
          Words = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]))
)
file_stats
## FileName FileSize Lines LinesNEmpty Chars CharsNWhite Words
## 1 Blogs 255.4 Mb 899288 899288 206824382 170389539 37570839
## 2 News 19.8 Mb 77259 77259 15639408 13072698 2651432
## 3 Twitter 319 Mb 2360148 2360148 162096241 134082806 30451170
From the above statistics, we see that the files are very large. We will sample 1% of the data and check the size of the resulting VCorpus loaded into memory.
set.seed(20210103)
sampleSize <- 0.01
blogs_sample <- sample(blogs, length(blogs) * sampleSize)
news_sample <- sample(news, length(news) * sampleSize)
twitter_sample <- sample(twitter, length(twitter) * sampleSize)
sampleData <- c(blogs_sample,news_sample,twitter_sample)
We generate the same statistics for the sampled data to make sure the sample is small enough to work with.
samplefile_stats <- data.frame(
  FileName = c("Blogs", "News", "Twitter", "Sample"),
  FileSize = sapply(list(blogs_sample, news_sample, twitter_sample, sampleData),
                    function(x) { format(object.size(x), "MB") }),
  t(rbind(sapply(list(blogs_sample, news_sample, twitter_sample, sampleData), stri_stats_general),
          Words = sapply(list(blogs_sample, news_sample, twitter_sample, sampleData), stri_stats_latex)[4, ]))
)
samplefile_stats
## FileName FileSize Lines LinesNEmpty Chars CharsNWhite Words
## 1 Blogs 2.6 Mb 8992 8992 2070449 1705825 377231
## 2 News 0.2 Mb 772 772 147695 123507 24881
## 3 Twitter 3.2 Mb 23601 23601 1618885 1339006 304234
## 4 Sample 6 Mb 33365 33365 3837029 3168338 706346
In this step, we build the corpus and clean the data. We perform the following cleanup: convert all text to lower case, remove punctuation, remove numbers, strip extra whitespace, and convert the documents back to plain text.
library(tm)
## Loading required package: NLP
library(pryr)
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
##
## Attaching package: 'pryr'
## The following object is masked from 'package:tm':
##
## inspect
sample_corpus <- VCorpus(VectorSource(sampleData))
object_size(sample_corpus)
## 77.8 MB
sample_corpus <- tm_map(sample_corpus, content_transformer(tolower))  # convert to lower case
sample_corpus <- tm_map(sample_corpus, removePunctuation)             # remove punctuation
sample_corpus <- tm_map(sample_corpus, removeNumbers)                 # remove numbers
sample_corpus <- tm_map(sample_corpus, stripWhitespace)               # collapse extra whitespace
sample_corpus <- tm_map(sample_corpus, PlainTextDocument)             # store as plain text documents
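To confirm the transformations behaved as intended, we can peek at one cleaned document. This is just a quick spot check added for illustration; the exact content depends on the random sample:
substr(as.character(sample_corpus[[1]]), 1, 200)  # first 200 characters of the first cleaned document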
We need to tokenize the cleaned data and construct the sets of N-grams. We will build unigrams (single words), bigrams (two-word sequences) and trigrams (three-word sequences):
library(RWeka)
unigram <- function(x) { NGramTokenizer(x, Weka_control(min = 1, max = 1)) }
bigram  <- function(x) { NGramTokenizer(x, Weka_control(min = 2, max = 2)) }
trigram <- function(x) { NGramTokenizer(x, Weka_control(min = 3, max = 3)) }
uni_mat <- TermDocumentMatrix(sample_corpus, control = list(tokenize = unigram))
bi_mat  <- TermDocumentMatrix(sample_corpus, control = list(tokenize = bigram))
tri_mat <- TermDocumentMatrix(sample_corpus, control = list(tokenize = trigram))
Once the term-document matrices are formed, we calculate the N-gram frequencies. To keep the tables manageable, we only keep terms that appear at least 30 times (unigrams), 25 times (bigrams) or 20 times (trigrams).
uni_ft <- findFreqTerms(uni_mat,lowfreq=30)
uniFreq <- rowSums(as.matrix(uni_mat[uni_ft,]))
uniFreq <- data.frame(word = names(uniFreq), frequency = uniFreq)
bi_ft <- findFreqTerms(bi_mat, lowfreq=25)
biFreq <- rowSums(as.matrix(bi_mat[bi_ft,]))
biFreq <- data.frame(word = names(biFreq), frequency = biFreq)
tri_ft <- findFreqTerms(tri_mat, lowfreq=20)
triFreq <- rowSums(as.matrix(tri_mat[tri_ft,]))
triFreq <- data.frame(word = names(triFreq), frequency = triFreq)
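As a quick check on how much the frequency cut-offs trim the vocabulary, we can count the N-grams retained in each table (an illustrative check; the counts depend on the sample):
sapply(list(unigrams = uniFreq, bigrams = biFreq, trigrams = triFreq), nrow)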
Now we can look at the N-grams and their frequencies.
head(uniFreq)
## word frequency
## “the “the 89
## ability ability 43
## able able 231
## about about 2041
## above above 112
## absolutely absolutely 70
head(biFreq)
## word frequency
## – and – and 27
## – the – the 26
## a bad a bad 40
## a beautiful a beautiful 45
## a better a better 48
## a big a big 94
head(triFreq)
## word frequency
## a bit of a bit of 45
## a bunch of a bunch of 32
## a chance to a chance to 31
## a couple of a couple of 83
## a few days a few days 22
## a few weeks a few weeks 20
For visualization purposes, we re-arrange each table in descending order of frequency.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
uniFreqDesc <- arrange(uniFreq,desc(frequency))
biFreqDesc <- arrange(biFreq,desc(frequency))
triFreqDesc <- arrange(triFreq,desc(frequency))
Now, we will generate bar charts of the top 20 unigrams, bigrams and trigrams.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
ggplot(data=uniFreqDesc[1:20,], aes(x=reorder(word,-frequency), y = frequency))+
geom_bar(stat = "identity", fill="orange")+
xlab("Words")+
ylab("Frequency")+
ggtitle("Top 20 Unigrams")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplot(data=biFreqDesc[1:20,], aes(x=reorder(word,-frequency), y = frequency))+
geom_bar(stat = "identity", fill="green")+
xlab("Words")+
ylab("Frequency")+
ggtitle("Top 20 Bigrams")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplot(data=triFreqDesc[1:20,], aes(x=reorder(word,-frequency), y = frequency))+
geom_bar(stat = "identity", fill="blue")+
xlab("Words")+
ylab("Frequency")+
ggtitle("Top 20 Trigrams")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x=element_text(angle=45, hjust=1))
We also generate word clouds as an alternative visualization; the most frequently used terms stand out clearly in a word cloud.
library(wordcloud)
## Loading required package: RColorBrewer
uniCloud <- wordcloud(uniFreq$word, uniFreq$frequency, scale = c(4, 0.5),
max.words = 100, random.order = FALSE, rot.per = 0.25,
use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
biCloud <- wordcloud(biFreq$word, biFreq$frequency, scale = c(4, 0.5),
max.words = 100, random.order = FALSE, rot.per = 0.25,
use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
triCloud <- wordcloud(triFreq$word, triFreq$frequency, scale = c(2, 0.5),
max.words = 100, random.order = FALSE, rot.per = 0.25,
use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))