This report gives a short overview of an exploratory analysis of the text data to be used for the Data Science Specialization Capstone project.
library("stringi")
# Paths to the three English text files
file.list <- c("C:/Users/gehad/Desktop/final/en_US/en_US.blogs.txt",
               "C:/Users/gehad/Desktop/final/en_US/en_US.news.txt",
               "C:/Users/gehad/Desktop/final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
# Summary table: file size, line count and word count for each source
matrix.summary <- matrix(0, nrow = 3, ncol = 3,
                         dimnames = list(c("blogs", "news", "twitter"),
                                         c("file size, Mb", "lines", "words")))
for (i in 1:3) {
  con <- file(file.list[i], "rb")
  text[[i]] <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  matrix.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  matrix.summary[i, 2] <- length(text[[i]])
  matrix.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}
library(kableExtra)
matrix.summary %>%
  kable() %>%
  kable_styling()
|   | file size, Mb | lines | words |
|---|---|---|---|
| blogs | 200.42 | 899288 | 37546239 |
| news | 196.28 | 1010242 | 34762395 |
| twitter | 159.36 | 2360148 | 30093413 |
The full data set is too large to process quickly, so for the following analysis I'll take a 0.3% random sample of the lines from each file (roughly 2,700 blog lines, 3,000 news lines and 7,000 tweets).
set.seed(50)
blogs_sample <- sample(text$blogs, 0.003*length(text$blogs))
news_sample <- sample(text$news, 0.003*length(text$news))
twitter_sample <- sample(text$twitter, 0.003*length(text$twitter))
Let's build a corpus for each sample and do some cleaning: convert to lower case, remove punctuation, remove numbers, remove stop words, and strip extra whitespace.
library("tm")
## Loading required package: NLP
# Create corpus
corpus_Blogs <- Corpus(VectorSource(blogs_sample))
# To lower case
corpus_Blogs <- tm_map(corpus_Blogs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus_Blogs, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation marks
corpus_Blogs <- tm_map(corpus_Blogs, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus_Blogs, removePunctuation):
## transformation drops documents
# Remove numbers
corpus_Blogs <- tm_map(corpus_Blogs, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus_Blogs, removeNumbers): transformation
## drops documents
# Remove stop words
corpus_Blogs <- tm_map(corpus_Blogs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus_Blogs, removeWords,
## stopwords("english")): transformation drops documents
# Strip extra whitespace
corpus_Blogs <- tm_map(corpus_Blogs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus_Blogs, stripWhitespace):
## transformation drops documents
Let's plot the 10 most frequent words in the blogs sample.
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus_Blogs))),decreasing=TRUE), 10)
barplot(frequentWords,
        main = "Blogs Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
# Create corpus
corpus_News <- Corpus(VectorSource(news_sample))
# To lower case
corpus_News <- tm_map(corpus_News, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus_News, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation marks
corpus_News <- tm_map(corpus_News, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus_News, removePunctuation):
## transformation drops documents
# Remove numbers
corpus_News <- tm_map(corpus_News, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus_News, removeNumbers): transformation
## drops documents
# Remove stop words
corpus_News <- tm_map(corpus_News, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus_News, removeWords,
## stopwords("english")): transformation drops documents
# Strip extra whitespace
corpus_News <- tm_map(corpus_News, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus_News, stripWhitespace):
## transformation drops documents
Let's plot the 10 most frequent words in the news sample.
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus_News))),decreasing=TRUE), 10)
barplot(frequentWords,
        main = "News Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
# Create corpus
corpus_Twitter <- Corpus(VectorSource(twitter_sample))
# To lower case
corpus_Twitter <- tm_map(corpus_Twitter, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus_Twitter,
## content_transformer(tolower)): transformation drops documents
# Remove punctuation marks
corpus_Twitter <- tm_map(corpus_Twitter, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus_Twitter, removePunctuation):
## transformation drops documents
# Remove numbers
corpus_Twitter <- tm_map(corpus_Twitter, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus_Twitter, removeNumbers):
## transformation drops documents
# Remove stop words
corpus_Twitter <- tm_map(corpus_Twitter, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus_Twitter, removeWords,
## stopwords("english")): transformation drops documents
# Strip extra whitespace
corpus_Twitter <- tm_map(corpus_Twitter, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus_Twitter, stripWhitespace):
## transformation drops documents
Let's plot the 10 most frequent words in the Twitter sample.
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus_Twitter))),decreasing=TRUE), 10)
barplot(frequentWords,
        main = "Twitter Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
Two findings from this exploratory analysis:

1. The data sets are very large; processing them in full requires considerable time and computing resources, so sampling is needed.
2. The most frequent words in the raw text are stop words, so they must be removed before looking at word frequencies (see the sketch below).
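To illustrate the second point, here is a minimal sketch (my own illustration, reusing blogs_sample from above) that lists the top terms when stop words are kept.

# Build a corpus from the raw blogs sample, lower-cased and with
# punctuation removed, but WITHOUT removing stop words
raw_blogs <- Corpus(VectorSource(blogs_sample))
raw_blogs <- tm_map(raw_blogs, content_transformer(tolower))
raw_blogs <- tm_map(raw_blogs, removePunctuation)
# The top 10 terms are dominated by stop words such as "the" and "and";
# compare with the barplots above, where content words appear instead
head(sort(rowSums(as.matrix(TermDocumentMatrix(raw_blogs))), decreasing = TRUE), 10)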