This is the Week 2 peer-graded assignment for Coursera's Data Science Specialization Capstone course. The goal of this assignment is to understand the dataset and perform an exploratory data analysis of each of the given files: en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt. We also identify important features of the data and outline the plan for the prediction algorithm we will develop later. We use plots and graphs to illustrate the exploratory analysis.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(stringi)
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.2
library(wordcloud)
## Loading required package: RColorBrewer
The data has already been downloaded for the project from [SwiftKey](https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip).
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if(!file.exists("Coursera-SwiftKey.zip")) {
download.file(url, "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip", exdir = "Coursera-SwiftKey")
}
myPath <- "C:/Users/imdad/Desktop/coursera/Data Science Capstone/final/en_US/"
blogsPath <- paste(myPath, "en_US.blogs.txt", sep="")
twitterPath <- paste(myPath, "en_US.twitter.txt", sep="")
newsPath <- paste(myPath, "en_US.news.txt", sep="")
con <- file(blogsPath, open="r")
blogsFile <- readLines(con)
close(con)
con <- file(twitterPath, open="r")
twitterFile <- readLines(con)
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
close(con)
con <- file(newsPath, open="r")
newsFile <- readLines(con)
## Warning in readLines(con): incomplete final line found on 'C:/Users/imdad/
## Desktop/coursera/Data Science Capstone/final/en_US/en_US.news.txt'
close(con)
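The warnings above are expected: the Twitter file contains a few embedded nul characters, and the news file lacks a final newline (when read in text mode on Windows it is also likely truncated at an embedded SUB character, which would explain its low line count in the summary below). This report keeps the reads as they are; a minimal sketch of a more robust read, assuming the same paths, would pass skipNul = TRUE and open the news file in binary mode (newsFileFull and twitterFileClean are illustrative names).
con <- file(newsPath, open = "rb")           # binary mode avoids the text-mode truncation
newsFileFull <- readLines(con, skipNul = TRUE, warn = FALSE)
close(con)
con <- file(twitterPath, open = "r")
twitterFileClean <- readLines(con, skipNul = TRUE)  # silently drop embedded nuls
close(con)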
Summary of the blogs, news, and Twitter files:
data_stats <- data.frame(File_Name = c("US_blogs", "US_news", "US_twitter"),
                         FileSizeBytes = c(file.info(blogsPath)$size,
                                           file.info(newsPath)$size,
                                           file.info(twitterPath)$size),
                         WordCount = sapply(list(blogsFile, newsFile, twitterFile),
                                            stri_stats_latex)[4, ],
                         t(sapply(list(blogsFile, newsFile, twitterFile),
                                  stri_stats_general)[c('Lines', 'Chars'), ]))
print(data_stats)
## File_Name FileSizeBytes WordCount Lines Chars
## 1 US_blogs 210160014 37865888 899288 208361438
## 2 US_news 205811889 2665742 77259 15683765
## 3 US_twitter 167105338 30578891 2360148 162384825
Since the dataset is huge, we sample it and train our models on a smaller sampled dataset; here we take a 0.5% sample of each file. Once the data is sampled, we clean it using the tm package: everything is converted to lower case, and extra whitespace, punctuation and numbers are removed.
set.seed(12345)
test_data <- c(sample(blogsFile, length(blogsFile) * 0.005),
               sample(twitterFile, length(twitterFile) * 0.005),
               sample(newsFile, length(newsFile) * 0.005))
testdata <- iconv(test_data, "UTF-8", "ASCII", sub="")
sample_corpus <- VCorpus(VectorSource(testdata))
sample_corpus <- tm_map(sample_corpus, content_transformer(tolower))
sample_corpus <- tm_map(sample_corpus, stripWhitespace)
sample_corpus <- tm_map(sample_corpus, removePunctuation)
sample_corpus <- tm_map(sample_corpus, removeNumbers)
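The cleaning above keeps stop words, which is why very common words dominate the n-gram counts below. If stop-word (or profanity) removal is wanted, tm's removeWords transformation can be applied to the same corpus. A minimal sketch, not run in this report; corpus_nostop and badWords are illustrative names, and the profanity list would have to be supplied separately.
corpus_nostop <- tm_map(sample_corpus, removeWords, stopwords("english"))
# badWords <- readLines("profanity.txt")   # hypothetical profanity word list
# corpus_nostop <- tm_map(corpus_nostop, removeWords, badWords)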
Now that the data is cleaned and preprocessed, we can build basic unigrams, bigrams and trigrams. We use the RWeka package for this purpose.
unigram <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
unidtf <- TermDocumentMatrix(sample_corpus, control=list(tokenize=unigram))
bidtf <- TermDocumentMatrix(sample_corpus, control=list(tokenize=bigram))
tridtf <- TermDocumentMatrix(sample_corpus, control=list(tokenize=trigram))
uni_tf <- findFreqTerms(unidtf, lowfreq = 50 )
bi_tf <- findFreqTerms(bidtf, lowfreq = 50 )
tri_tf <- findFreqTerms(tridtf, lowfreq = 10 )
uni_freq <- rowSums(as.matrix(unidtf[uni_tf, ]))
uni_freq <- data.frame(words=names(uni_freq), frequency=uni_freq)
bi_freq <- rowSums(as.matrix(bidtf[bi_tf, ]))
bi_freq <- data.frame(words=names(bi_freq), frequency=bi_freq)
tri_freq <- rowSums(as.matrix(tridtf[tri_tf, ]))
tri_freq <- data.frame(words=names(tri_freq), frequency=tri_freq)
head(tri_freq)
## words frequency
## a bit of a bit of 16
## a bunch of a bunch of 16
## a chance to a chance to 19
## a couple of a couple of 35
## a few days a few days 24
## a few weeks a few weeks 14
Once we have created the corresponding n-grams, we can plot their frequencies. It is also nice to see a pictorial view of the word frequencies using the wordcloud package.
wordcloud(words = uni_freq$words, freq = uni_freq$frequency,
          max.words = 100, colors = brewer.pal(8, "Dark2"))
plot_freq <- ggplot(data = uni_freq[order(-uni_freq$frequency), ][1:15, ],
                    aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "blue") +
  ggtitle("Top Unigram") + xlab("words") + ylab("frequency")
plot_freq
plot_freq <- ggplot(data = bi_freq[order(-bi_freq$frequency), ][1:15, ],
                    aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Top Bigram") + xlab("words") + ylab("frequency")
plot_freq
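Following the same pattern, a bar chart of the most frequent trigrams can be produced; the sketch below mirrors the bigram plot above (the fill colour is arbitrary).
plot_freq <- ggplot(data = tri_freq[order(-tri_freq$frequency), ][1:15, ],
                    aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Top Trigram") + xlab("words") + ylab("frequency")
plot_freq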