Synopsis

This report provides a short overview of the exploratory analysis of the text data to be used for the Capstone project for the Data Science Specialization.

The motivation for this project is to:
  1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings amassed so far.
  4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Data loading

library("stringi")

file.list <- c("C:/Users/gehad/Desktop/final/en_US/en_US.blogs.txt",
               "C:/Users/gehad/Desktop/final/en_US/en_US.news.txt",
               "C:/Users/gehad/Desktop/final/en_US/en_US.twitter.txt")

Summary table for the blogs, news and Twitter files

text <- list(blogs = "", news = "", twitter = "")

matrix.summary <- matrix(0, nrow = 3, ncol = 3,
                         dimnames = list(c("blogs", "news", "twitter"),
                                         c("file size, Mb", "lines", "words")))
for (i in 1:3) {
  con <- file(file.list[i], "rb")
  text[[i]] <- readLines(con, encoding = "UTF-8",skipNul = TRUE)
  close(con)
  matrix.summary[i,1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  matrix.summary[i,2] <- length(text[[i]])
  matrix.summary[i,3] <- sum(stri_count_words(text[[i]]))
}
library(kableExtra)

matrix.summary %>%
  kable() %>%
  kable_styling()
          file size, Mb     lines      words
blogs            200.42    899288   37546239
news             196.28   1010242   34762395
twitter          159.36   2360148   30093413

The data sets are too large to process in full, so I’ll draw a 0.3% random sample of lines from each (roughly 2,700 blog, 3,000 news, and 7,100 Twitter lines) for the following analysis.

set.seed(50)
blogs_sample <- sample(text$blogs, 0.003*length(text$blogs))
news_sample <- sample(text$news, 0.003*length(text$news))
twitter_sample <- sample(text$twitter, 0.003*length(text$twitter))
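
Once the samples exist, the full texts are no longer needed; as an optional aside (a suggestion on my part, not a step in the original analysis), they can be dropped to keep memory usage manageable:

# Optional: release the full texts and reclaim memory after sampling
rm(text)
invisible(gc())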

Data Preprocessing

Let’s build a corpus for each sample and do some cleaning: convert to lower case, remove punctuation marks, remove numbers, remove English stop words, and strip extra whitespace. The same pipeline is applied to all three samples, as sketched in the helper below.
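
Since the three sections that follow apply an identical pipeline, it can be captured once in a helper. A minimal sketch (the name clean_corpus is my own; the tm calls mirror those used below):

# Hypothetical helper wrapping the tm cleaning pipeline applied to each sample
clean_corpus <- function(x) {
  corpus <- Corpus(VectorSource(x))
  corpus <- tm_map(corpus, content_transformer(tolower))       # to lower case
  corpus <- tm_map(corpus, removePunctuation)                  # drop punctuation
  corpus <- tm_map(corpus, removeNumbers)                      # drop digits
  corpus <- tm_map(corpus, removeWords, stopwords("english"))  # drop stop words
  tm_map(corpus, stripWhitespace)                              # collapse spaces
}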

1- Blogs Data

library("tm")
## Loading required package: NLP
# Create corpus
corpus_Blogs <- Corpus(VectorSource(blogs_sample))
# To lower case
corpus_Blogs <- tm_map(corpus_Blogs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus_Blogs, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation marks
corpus_Blogs <- tm_map(corpus_Blogs, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus_Blogs, removePunctuation):
## transformation drops documents
# Remove numbers
corpus_Blogs <- tm_map(corpus_Blogs, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus_Blogs, removeNumbers): transformation
## drops documents
#remove stop words
corpus_Blogs <- tm_map(corpus_Blogs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus_Blogs, removeWords,
## stopwords("english")): transformation drops documents
#Remove whitespaces
corpus_Blogs <- tm_map(corpus_Blogs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus_Blogs, stripWhitespace):
## transformation drops documents

Let’s plot the 10 most frequent words in the blogs sample

frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus_Blogs))),decreasing=TRUE), 10)

barplot(frequentWords, 
        main = "Blogs Data: Most Frequent Words", 
        xlab="Word", 
        ylab = "Count")

2- News Data

# Create corpus
corpus_News <- Corpus(VectorSource(news_sample))
# Convert to lower case
corpus_News <- tm_map(corpus_News, content_transformer(tolower))
# Remove punctuation marks
corpus_News <- tm_map(corpus_News, removePunctuation)
# Remove numbers
corpus_News <- tm_map(corpus_News, removeNumbers)
# Remove stop words
corpus_News <- tm_map(corpus_News, removeWords, stopwords("english"))
# Strip extra whitespace
corpus_News <- tm_map(corpus_News, stripWhitespace)

Let’s plot the 10 most frequent words in the news sample

frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus_News))),decreasing=TRUE), 10)

barplot(frequentWords, 
        main = "News Data: Most Frequent Words", 
        xlab="Word", 
        ylab = "Count")

3- Twitter Data

# Create corpus
corpus_Twitter <- Corpus(VectorSource(twitter_sample))
# Convert to lower case
corpus_Twitter <- tm_map(corpus_Twitter, content_transformer(tolower))
# Remove punctuation marks
corpus_Twitter <- tm_map(corpus_Twitter, removePunctuation)
# Remove numbers
corpus_Twitter <- tm_map(corpus_Twitter, removeNumbers)
# Remove stop words
corpus_Twitter <- tm_map(corpus_Twitter, removeWords, stopwords("english"))
# Strip extra whitespace
corpus_Twitter <- tm_map(corpus_Twitter, stripWhitespace)

Let’s plot the 10 most frequent words in the Twitter sample

frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus_Twitter))),decreasing=TRUE), 10)

barplot(frequentWords, 
        main = "Twitter Data: Most Frequent Words", 
        xlab="Word", 
        ylab = "Count")

Summary

1- The data sets are very large; processing them in full requires considerable time and computing resources, which is why a small random sample was used.
2- Before cleaning, the most frequent words are stop words, so they were removed ahead of the word-frequency analysis.
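
As groundwork for the prediction algorithm mentioned in the motivation, the same frequency analysis extends naturally from single words to n-grams. A minimal base-R sketch of bigram counting on one of the cleaned samples (the helper name count_bigrams is my own invention):

# Hypothetical sketch: count adjacent word pairs in a vector of text lines
count_bigrams <- function(lines) {
  words <- strsplit(tolower(lines), "\\s+")          # split each line on whitespace
  pairs <- unlist(lapply(words, function(w) {
    if (length(w) < 2) return(character(0))          # a one-word line has no bigram
    paste(head(w, -1), tail(w, -1))                  # pair each word with its successor
  }))
  sort(table(pairs), decreasing = TRUE)
}

head(count_bigrams(blogs_sample), 10)                # ten most frequent bigrams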