Introduction

This is the Week 2 peer-graded assignment for Coursera's Data Science Specialization Capstone course. The goal of this assignment is to understand the dataset and perform an exploratory data analysis of each of the given files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt. We also identify important features of the data and outline the plan for the prediction algorithm that we will develop later. Plots and graphs are used to illustrate the exploratory analysis.

Libraries Used

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(stringi)
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.2
library(wordcloud)
## Loading required package: RColorBrewer

Data Load

The data for the project was already downloaded from [SwiftKey](https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip); the code below downloads and unzips it if it is not already present.

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if(!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip", exdir = "Coursera-SwiftKey")
}
myPath <- "C:/Users/imdad/Desktop/coursera/Data Science Capstone/final/en_US/"
blogsPath <- paste(myPath, "en_US.blogs.txt", sep="")
twitterPath <- paste(myPath, "en_US.twitter.txt", sep="")
newsPath <- paste(myPath, "en_US.news.txt", sep="")

con <- file(blogsPath, open="r")
blogsFile <- readLines(con)
close(con) 

con <- file(twitterPath, open="r")
twitterFile <- readLines(con)
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
close(con)

con <- file(newsPath, open="r")
newsFile <- readLines(con)
## Warning in readLines(con): incomplete final line found on 'C:/Users/imdad/
## Desktop/coursera/Data Science Capstone/final/en_US/en_US.news.txt'
close(con)
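
The warnings above show that readLines skipped embedded nul characters in the twitter file and stopped early on the news file ("incomplete final line"), most likely because of an embedded control character that text-mode reading on Windows treats as end-of-file. A more robust read, shown here only as a minimal sketch (newsFileFull is illustrative and is not used in the analysis below), opens the connection in binary mode and passes skipNul=TRUE:

con <- file(newsPath, open="rb")               # binary mode avoids the premature end-of-file
newsFileFull <- readLines(con, skipNul=TRUE)   # illustrative only; also drops embedded nuls
close(con)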

Basic Summary

Summary of the blogs, news, and twitter files.

data_stats <- data.frame(File_Name=c("US_blogs", "US_news", "US_twitter"), 
                         FileSize=c(file.info(blogsPath)$size,      # file size in bytes
                                    file.info(newsPath)$size,
                                    file.info(twitterPath)$size),
                         WordCount=sapply(list(blogsFile, newsFile, twitterFile),
                                          stri_stats_latex)[4,], 
                         t(rbind(sapply(list(blogsFile, newsFile, twitterFile), stri_stats_general)[c('Lines','Chars'),]
                         )))
print(data_stats)
##    File_Name  FileSize WordCount   Lines     Chars
## 1   US_blogs 210160014  37865888  899288 208361438
## 2    US_news 205811889   2665742   77259  15683765
## 3 US_twitter 167105338  30578891 2360148 162384825

Cleaning and Sampling of Data

Since the data is huge, we sample it and train our models on a smaller dataset; here we take a 0.5% sample of each file. Once the data is sampled, we clean it with the tm package: everything is converted to lower case, and extra whitespace, punctuation, and numbers are removed. Stop-word or profanity filtering could be added in the same way (see the sketch after the code block).

set.seed(12345)
test_data <- c(sample(blogsFile, length(blogsFile) * 0.005),
              sample(twitterFile, length(twitterFile) * 0.005),
              sample(newsFile, length(newsFile) * 0.005)
          )
          
testdata <- iconv(test_data, "UTF-8", "ASCII", sub="")       # drop non-ASCII characters
sample_corpus <- VCorpus(VectorSource(testdata))
sample_corpus <- tm_map(sample_corpus, tolower)              # convert to lower case
sample_corpus <- tm_map(sample_corpus, stripWhitespace)      # collapse extra whitespace
sample_corpus <- tm_map(sample_corpus, removePunctuation)    # remove punctuation
sample_corpus <- tm_map(sample_corpus, removeNumbers)        # remove digits
sample_corpus <- tm_map(sample_corpus, PlainTextDocument)    # wrap back into PlainTextDocument objects
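
If stop words (or a profanity list) also needed to be filtered, the same tm pipeline could be extended. A minimal sketch using tm's built-in English stop-word list (corpus_nostop is illustrative; it is not applied to the corpus used below):

corpus_nostop <- tm_map(sample_corpus, removeWords, stopwords("english"))   # optional: drop common English stop words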

Creating N-grams from the Data

Now that the data has been cleaned and pre-processed, we can build our basic unigrams, bigrams, and trigrams. We use the RWeka package for tokenization.

unigram <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))

unidtf <- TermDocumentMatrix(sample_corpus, control=list(tokenize=unigram))
bidtf <- TermDocumentMatrix(sample_corpus, control=list(tokenize=bigram))
tridtf <- TermDocumentMatrix(sample_corpus, control=list(tokenize=trigram))
                             
uni_tf <- findFreqTerms(unidtf, lowfreq = 50)   # unigrams appearing at least 50 times
bi_tf <- findFreqTerms(bidtf, lowfreq = 50)     # bigrams appearing at least 50 times
tri_tf <- findFreqTerms(tridtf, lowfreq = 10)   # trigrams appearing at least 10 times

uni_freq <- rowSums(as.matrix(unidtf[uni_tf, ]))
uni_freq <- data.frame(words=names(uni_freq), frequency=uni_freq)

bi_freq <- rowSums(as.matrix(bidtf[bi_tf, ]))
bi_freq <- data.frame(words=names(bi_freq), frequency=bi_freq)

tri_freq <- rowSums(as.matrix(tridtf[tri_tf, ]))
tri_freq <- data.frame(words=names(tri_freq), frequency=tri_freq)

head(tri_freq)
##                   words frequency
## a bit of       a bit of        16
## a bunch of   a bunch of        16
## a chance to a chance to        19
## a couple of a couple of        35
## a few days   a few days        24
## a few weeks a few weeks        14

Plotting N-gram Data

Once the corresponding n-grams have been created, we can plot their frequencies. It is also nice to see a pictorial view of the word frequencies using the wordcloud package.

wordcloud(words=uni_freq$words, freq=uni_freq$frequency, 
          max.words=100, colors = brewer.pal(8, "Dark2"))

plot_freq <- ggplot(data = uni_freq[order(-uni_freq$frequency),][1:15, ], 
                    aes(x = reorder(words, -frequency), y=frequency)) +
              geom_bar(stat="identity", fill="blue") + 
              ggtitle("Top Unigram") + xlab("words") +  ylab("frequency")

plot_freq

plot_freq <- ggplot(data = bi_freq[order(-bi_freq$frequency),][1:15, ], 
                    aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity", fill="red") + 
  theme(axis.text.x = element_text(angle = 45)) + 
  ggtitle("Top Bigram") + xlab("words") +  ylab("frequency")
  
plot_freq
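
The same pattern extends to the trigram frequencies. A minimal sketch that reuses tri_freq from above and follows the bigram plot exactly (the fill colour is an arbitrary choice):

plot_freq <- ggplot(data = tri_freq[order(-tri_freq$frequency),][1:15, ], 
                    aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity", fill="green") + 
  theme(axis.text.x = element_text(angle = 45)) + 
  ggtitle("Top Trigram") + xlab("words") +  ylab("frequency")

plot_freq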