Exploratory Analysis of Data Sets

We explore the following:

1. A basic report of summary statistics for the data sets: word counts, line counts, and basic data tables
2. Major features of the data sets, illustrated with histograms and other graphs
3. A plan for the prediction algorithm and the Shiny app

The Coursera SwiftKey data set is downloaded into the working directory.

## Create a data directory if it does not already exist
if (!file.exists("./textData")) {
        dir.create("./textData")
}
swifturl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
## Download the zip file only if it has not been downloaded already
if (!file.exists("./textData/Swiftkey Dataset.zip")) {
        download.file(swifturl, destfile = "./textData/Swiftkey Dataset.zip")
}
## Unzip the file into the data directory if it has not been extracted yet
if (!file.exists("./textData/final")) {
        unzip("./textData/Swiftkey Dataset.zip", exdir = "./textData")
}
rm(swifturl) # remove the URL object to free memory

Reading the text data from en_US files

library(dplyr, warn.conflicts = FALSE)

con_tweets <- file("/Users/rizwanmohamed/Coursera/textData/final/en_US/en_US.twitter.txt", "r")
con_news   <- file("/Users/rizwanmohamed/Coursera/textData/final/en_US/en_US.news.txt", "r")
con_blogs  <- file("/Users/rizwanmohamed/Coursera/textData/final/en_US/en_US.blogs.txt", "r")
tweetdata <- readLines(con_tweets, encoding = "UTF-8", skipNul = TRUE); close(con_tweets)
newsdata  <- readLines(con_news,   encoding = "UTF-8", skipNul = TRUE); close(con_news)
blogdata  <- readLines(con_blogs,  encoding = "UTF-8", skipNul = TRUE); close(con_blogs)

Basic statistics and data table of the text data

library(stringi)
# number of lines in each file (in millions)
lt <- length(tweetdata) / 1000000
ln <- length(newsdata) / 1000000
lb <- length(blogdata) / 1000000

# number of words in each file (in millions)
wt <- sum(stri_count(tweetdata, regex = "\\S+")) / 1000000
wn <- sum(stri_count(newsdata, regex = "\\S+")) / 1000000
wb <- sum(stri_count(blogdata, regex = "\\S+")) / 1000000

# number of characters in each file (in millions)
cht <- sum(nchar(tweetdata, type = "chars")) / 1000000
chn <- sum(nchar(newsdata, type = "chars")) / 1000000
chb <- sum(nchar(blogdata, type = "chars")) / 1000000

# Size of the files in MB
size_MB <- c(
        file.info("/Users/rizwanmohamed/Coursera/textData/final/en_US/en_US.twitter.txt")$size / 1024^2,
        file.info("/Users/rizwanmohamed/Coursera/textData/final/en_US/en_US.news.txt")$size / 1024^2,
        file.info("/Users/rizwanmohamed/Coursera/textData/final/en_US/en_US.blogs.txt")$size / 1024^2
)

# Summary of the data sets
TextTable <- data.frame(
        source = c("en_US.twitter file", "en_US.news file", "en_US.blog file"),
        Total_lines_in_million = c(lt, ln, lb),
        Total_words_in_million = c(wt, wn, wb),
        Total_characters_in_million = c(cht, chn, chb),
        mean_num_of_words_per_line = round(c(wt/lt, wn/ln, wb/lb), 2),
        size_MB
)
                        
print(TextTable)
##               source Total_lines_in_million Total_words_in_million
## 1 en_US.twitter file               2.360148               30.37358
## 2    en_US.news file               1.010242               34.37253
## 3    en_US.blog file               0.899288               37.33413
##   Total_characters_in_million mean_num_of_words_per_line  size_MB
## 1                    162.0962                      12.87 159.3641
## 2                    203.2232                      34.02 196.2775
## 3                    206.8245                      41.52 200.4242
rm(TextTable) # remove the summary data frame to free memory

# There are more than 2 million lines in the twitter file and about 0.9 million lines in the blog file. Lines in the blog file are considerably longer on average than those in the other files (about 42 words per line versus roughly 13 for tweets). The total size of the three files is more than 550 MB, and the blog file alone accounts for roughly one third of it.
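
As a quick check of the line-length claim above, the per-line word counts can be summarised directly with stringi (a sketch, assuming blogdata and tweetdata are still in memory; this check is not part of the original analysis):

summary(stri_count(blogdata, regex = "\\S+"))   # words per line in the blog file
summary(stri_count(tweetdata, regex = "\\S+"))  # words per line in the twitter file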

# It is computationally expensive to explore the files at their full size. We therefore take a sample of 25,000 lines from each of the three files (75,000 lines in total) to build a sample text file for exploratory data analysis. Line indices are drawn from a binomial distribution with rbinom, and 25,000 lines are then sampled without replacement from the selected lines of each file.

Sampling of the text data

set.seed(201911)
# Draw binomially distributed line indices, then sample 25,000 lines from each of the three files
blogsample  <- rbinom(length(blogdata),  100000, 0.5)
newssample  <- rbinom(length(newsdata),  100000, 0.5)
tweetsample <- rbinom(length(tweetdata), 100000, 0.5)

text_directory <- c(sample(blogdata[blogsample],   25000, replace = FALSE),
                    sample(newsdata[newssample],   25000, replace = FALSE),
                    sample(tweetdata[tweetsample], 25000, replace = FALSE)) # sampled text data

# sample sentences from the blogs
text_directory[1]
## [1] "THE SHAMROCK TSUNAMI"
#  sample sentences from the news 
text_directory[25001]
## [1] "This is fuzzy, to say the least. If the jury doesn't believe that Bonds perjured himself, then what, pray tell, did he obstruct?"
# sample sentences from the tweets
text_directory[75000]
## [1] "All Hail the King! with an EPIC 39 Save Shutout!!!"

Saving and loading the text sample

con <- file("textsample.txt", "w+", encoding = "UTF-8")
cat(text_directory, file = con, sep = "\n")
close(con)

con <- file("textsample.txt", encoding = "UTF-8")
textsample <- readLines(con)
close(con)
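
As a quick sanity check (a sketch; the expected count assumes the write above completed without errors), the reloaded sample should contain the 75,000 lines that were written out:

length(textsample)  # expected to be 75000 (25,000 lines from each source)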

Cleaning the sampled text data (textsample) to create a corpus of words

set.seed(201911)
library(NLP)
library(tm) # tm package for text mining in R

corp <- VCorpus(VectorSource(textsample), readerControl = list(reader = readPlain, language = "en"))
corp <- tm_map(corp, content_transformer(tolower))
# content transformer that replaces every match of a pattern with a single space
reg_exp <- content_transformer(function(x, pattern) gsub(pattern, replacement = " ", x))
corp <- tm_map(corp, reg_exp, "[[:alnum:]]+\\@[[:alpha:]]+\\.com") # e-mail addresses
corp <- tm_map(corp, reg_exp, "[^a-z\\s]+")                        # anything other than lowercase letters
corp <- tm_map(corp, reg_exp, "\\(.*?\\)")                         # any remaining parenthesised text
corp <- tm_map(corp, reg_exp, "[[:punct:]]+")                      # any remaining punctuation
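
To confirm that the transformations behave as intended, the first couple of cleaned documents can be printed, following the usual tm inspection pattern (a sketch, not part of the original analysis):

# Print the first two cleaned documents to verify the regex substitutions
lapply(corp[1:2], as.character)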

Removing stopwords and profanity

library(stopwords, warn.conflicts = FALSE)
set.seed(201911)
corp <- tm_map(corp, removeWords, stopwords(language = "en", source = "smart"))
con <- file("/Users/rizwanmohamed/Coursera/bad-words.txt", "r")
profanityWords <- readLines(con)
close(con)
corp <- tm_map(corp, removeWords, profanityWords)
rm(profanityWords)

TermDocumentMatrix and unigram TDM

options(mc.cores = 1)
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, stripWhitespace)
corp <- tm_map(corp, PlainTextDocument)
tdm <- TermDocumentMatrix(corp)
Onegram <- removeSparseTerms(tdm, 0.99999) # explicit tokenization is not needed for the unigram model
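
For a quick look at high-frequency terms before plotting, tm's findFreqTerms lists every term occurring at least a given number of times; the threshold of 1,000 below is an arbitrary illustration:

# Terms that occur at least 1,000 times in the sampled corpus (illustrative threshold)
findFreqTerms(Onegram, lowfreq = 1000)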

Word cloud of the 100 most frequent unigrams in the text sample

library(RColorBrewer)
library(wordcloud)
v <- sort(rowSums(as.matrix(Onegram)), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

pal <- brewer.pal(9, 'Spectral')
# pass the palette through the named 'colors' argument; an unnamed trailing argument
# would be matched positionally to min.freq instead
wordcloud(d$word, d$freq, scale = c(2, .9), max.words = 100, random.order = FALSE,
          rot.per = .25, use.r.layout = FALSE, colors = pal)

# Tokenizing words of the Term Document Matrix
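
As a sketch of that tokenization step (the exact tokenizer used later in the project is not shown in this section), a bigram term-document matrix can be built with a custom tokenizer based on NLP::ngrams, following the pattern from the tm FAQ; BigramTokenizer and Bigram_tdm are illustrative names:

# Sketch: bigram tokenizer built on NLP::ngrams, applied to the cleaned corpus
BigramTokenizer <- function(x)
        unlist(lapply(NLP::ngrams(NLP::words(x), 2), paste, collapse = " "),
               use.names = FALSE)
Bigram_tdm <- TermDocumentMatrix(corp, control = list(tokenize = BigramTokenizer))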

Exploratory Analysis

# Here is a histogram of the 30 most common unigrams in the data sample.
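
A minimal sketch of that plot, reusing the frequency table d built for the word cloud above (top30 is an illustrative name; base graphics is used to avoid extra dependencies):

# Sketch: bar chart of the 30 most frequent unigrams in the sample
top30 <- d[1:30, ]
barplot(top30$freq, names.arg = top30$word, las = 2, cex.names = 0.7,
        main = "30 most common unigrams", ylab = "Frequency")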