Download DataSet
- Download the zip file from the link given in the assignment brief.
- Unzip it; three files, namely en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt, can be found in the folder en_US.
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
              destfile = "Coursera-SwiftKey.zip", method = "auto", quiet = FALSE)
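The unzip step is not shown as code; a minimal sketch, assuming the archive was saved as Coursera-SwiftKey.zip in the working directory:
unzip("Coursera-SwiftKey.zip")   # extracts the folder containing en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt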
Before starting the primary data exploration, load the required libraries.
Import Libraries
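The original library chunk is not reproduced here; a minimal sketch of the packages assumed by the code in this report (stringi for word counts, knitr for tables, tm and RWeka for the corpus and tokenization, wordcloud2 for the word clouds):
library(stringi)     # stri_count_words()
library(knitr)       # kable()
library(tm)          # Corpus(), tm_map(), TermDocumentMatrix()
library(RWeka)       # NGramTokenizer(), Weka_control()
library(wordcloud2)  # wordcloud2()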
Explore Files
Get a basic understanding of each file by counting its number of lines and words.
blogs <- readLines("en_US.blogs.txt")
news <- readLines("en_US.news.txt")
twitter <- readLines("en_US.twitter.txt")
lenblogs<-length(blogs)
lennews<-length(news)
lentwitter<-length(twitter)
wordblogs <-sum(stri_count_words(blogs))
wordnews <-sum(stri_count_words(news))
wordtwitter <-sum(stri_count_words(twitter))
tab <- data.frame(File = c('blogs', 'news', 'twitter'),
                  Lines = c(lenblogs, lennews, lentwitter),
                  Words = c(wordblogs, wordnews, wordtwitter))
kable(tab, col.names = c('File', 'No of Lines', 'No of Words'))

| File | No of Lines | No of Words |
|---|---|---|
| blogs | 899288 | 38154238 |
| news | 77259 | 2693898 |
| twitter | 2360148 | 30218125 |
Build Corpus
10% of the data from each file is sampled to build the corpus; the sampled line and word counts are shown in the table below.
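The sampling chunk itself is not shown in this report; a minimal sketch, assuming a simple random 10% sample of lines from each file (the seed and the in-place reassignment of blogs, news, and twitter are assumptions, not the original code):
set.seed(1234)  # assumed seed, only so the sample is reproducible
blogs   <- sample(blogs,   floor(0.1 * length(blogs)))
news    <- sample(news,    floor(0.1 * length(news)))
twitter <- sample(twitter, floor(0.1 * length(twitter)))
Recomputing the line and word counts on the sample gives the table below.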
| File | No of Lines | No of Words |
|---|---|---|
| blogs | 89928 | 3800858 |
| news | 7725 | 269492 |
| twitter | 236014 | 3019330 |
Clean the Data
The text is cleaned with the helper transformations below: apostrophes, numbers, URLs, punctuation, non-ASCII characters, repeated letters, extra whitespace, and English stop words are removed, and the whole text is converted to lowercase.
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)              # strip URLs
removeSign <- function(x) gsub("[[:punct:]]", "", x)                  # strip punctuation
removeNum <- function(x) gsub("[[:digit:]]", "", x)                   # strip digits
removeapo <- function(x) gsub("'", "", x)                             # strip apostrophes
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")   # drop non-ASCII characters
removerepeat <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x) # collapse letters repeated 3+ times down to two
toLowerCase <- function(x) sapply(x, tolower)                         # convert to lowercase
removeSpace <- function(x) gsub("\\s+", " ", x)                       # collapse runs of whitespace to a single space
removeTh <- function(x) gsub(" th", "", x)                            # remove every literal " th" substring
corp <- Corpus(VectorSource(c(blogs, news, twitter)))
corp<-tm_map(corp,content_transformer(removeapo))
corp<-tm_map(corp,content_transformer(removeNum))
corp<-tm_map(corp,content_transformer(removeURL))
corp<-tm_map(corp,content_transformer(removeSign))
corp<-tm_map(corp,content_transformer(removeNonASCII))
corp<-tm_map(corp,content_transformer(toLowerCase))
corp<-tm_map(corp,content_transformer(removerepeat))
corp<-tm_map(corp,content_transformer(removeSpace))
corp<-tm_map(corp,removeWords,stopwords("english"))
#corp<-tm_map(corp,removeWords,profanity)
corp<-tm_map(corp,content_transformer(removeTh))
Data Analysis
1-gram
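The unigram chunk is not reproduced here; a minimal sketch by analogy with the trigram chunk shown under 3-gram (the object names ug, tdm1, and d1 are assumptions):
ug <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))   # unigram tokenizer
tdm1 <- TermDocumentMatrix(corp, control = list(tokenize = ug))
freq1 <- sort(rowSums(as.matrix(tdm1)), decreasing = TRUE)
d1 <- data.frame(word = names(freq1), freq = freq1)
wordcloud2(data = d1[1:20, ], size = 0.3, shape = 'diamond', backgroundColor = "black")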
2-gram
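Likewise for bigrams; only the tokenizer changes, and the remaining steps mirror the trigram chunk below (bg and tdm2 are assumed names):
bg <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))   # bigram tokenizer
tdm2 <- TermDocumentMatrix(corp, control = list(tokenize = bg))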
3-gram
tg <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))   # trigram tokenizer
tdm <- TermDocumentMatrix(corp, control = list(tokenize = tg))
wordMatrix <- as.data.frame(as.matrix(tdm))
s <- sort(rowSums(wordMatrix), decreasing = TRUE)   # total frequency of each trigram
d <- data.frame(word = names(s), freq = s)
plotd3 <- d[1:20, ]                                 # top 20 trigrams
wordcloud2(data = plotd3, size = 0.3, shape = 'diamond', backgroundColor = "black")