Introduction
The goal of this project is to present the main findings from exploring and cleaning the Capstone Dataset. We will present histograms of the most frequent words and pairs of words in the sample data.
Basic summary
This report provides a short overview of the exploratory analysis of the text data to be used for the Capstone project for the Data Science Specialization along with a description of plans for the word prediction algorithm.
list.of.packages <- c("stringi", "tm", "wordcloud", "RColorBrewer")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages, repos="http://cran.rstudio.com/")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
library(stringi)
I have downloaded and used the Blogs, News and Twitter files from the Capstone Dataset for the data analysis below.
file.list = c("C:/Users/Solomon/Documents/Coursera_2_Data_Science_Capstone/en_US/en_US.blogs.txt",
"C:/Users/Solomon/Documents/Coursera_2_Data_Science_Capstone/en_US/en_US.news.txt",
"C:/Users/Solomon/Documents/Coursera_2_Data_Science_Capstone/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
table.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("Blogs", "News", "Twitter"),c("File size (Mb)", "No of Lines", "Wordcount")))
for (i in 1:3) {
    # Open in binary mode so readLines is not cut short by embedded control characters
    con <- file(file.list[i], "rb")
    text[[i]] <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
    close(con)
    # Record file size (MB), number of lines, and word count for each source
    table.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
    table.summary[i, 2] <- length(text[[i]])
    table.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}
library(knitr)
## Warning: package 'knitr' was built under R version 4.0.2
kable(table.summary)
|         | File size (Mb) | No of Lines | Wordcount |
|---------|----------------|-------------|-----------|
| Blogs   | 200.42         | 899288      | 37546239  |
| News    | 196.28         | 1010242     | 34762395  |
| Twitter | 159.36         | 2360148     | 30093413  |
Since the full datasets are too big to analyse and would require a large amount of memory and processing time, I have generated sample datasets using the set.seed() and sample() functions:
- blogs_sample
- twitter_sample
- news_sample
set.seed(100)
blogs_sample <- sample(text$blogs, 0.01*length(text$blogs))
news_sample <- sample(text$news, 0.001*length(text$news))
twitter_sample <- sample(text$twitter, 0.00001*length(text$twitter))
sampled_data <- c(blogs_sample, news_sample, twitter_sample)
sum <- sum(stri_count_words(sampled_data))
sum
## [1] 411197
The sample datasets consist of 411197 words in total.
Build the corpus and create a basic report of summary statistics about the datasets
library(tm)
## Warning: package 'tm' was built under R version 4.0.2
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.2
## Loading required package: RColorBrewer
Creating corpus for Blogs Sample Dataset
library(RColorBrewer)
# Remove non-ASCII characters such as emoticons (non-convertible strings become NA)
sampled_data <- iconv(sampled_data, 'UTF-8', 'ASCII')
# Create corpus
corpusBlogsSample <- Corpus(VectorSource(blogs_sample))
# To lower case
corpusBlogsSample <- tm_map(corpusBlogsSample, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpusBlogsSample, content_transformer(tolower)):
## transformation drops documents
Data Cleansing for sample Blogs dataset - corpusBlogsSample
corpusBlogsSample <- tm_map(corpusBlogsSample, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpusBlogsSample, removePunctuation):
## transformation drops documents
corpusBlogsSample <- tm_map(corpusBlogsSample, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpusBlogsSample, removeNumbers): transformation
## drops documents
corpusBlogsSample <- tm_map(corpusBlogsSample, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpusBlogsSample, removeWords,
## stopwords("english")): transformation drops documents
corpusBlogsSample <- tm_map(corpusBlogsSample, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpusBlogsSample, stripWhitespace):
## transformation drops documents
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpusBlogsSample))),decreasing=TRUE), 10)
barplot(frequentWords,
main = "Blogs Data: Most Frequent Words",
xlab="Word",
ylab = "Count")
term.doc.matrix1 <- TermDocumentMatrix(corpusBlogsSample)
term.doc.matrix1 <- as.matrix(term.doc.matrix1)
word.freqs1 <- sort(rowSums(term.doc.matrix1), decreasing=TRUE)
dm1 <- data.frame(word=names(word.freqs1), freq=word.freqs1)
Word cloud plot of the most common words in the corpusBlogsSample corpus
wordcloud(dm1$word, dm1$freq, min.freq= 150,scale=c(4,.5), random.order=TRUE, rot.per=.15, colors=brewer.pal(8, "Dark2"))
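The introduction also mentions histograms of the most frequent pairs of words. As a minimal sketch (not part of the original output; the object names words.per.line, bigrams and bigram.freqs are illustrative), bigram frequencies for the blogs sample could be computed with base R and stringi as follows:
# Split each sampled blog line into lower-case word tokens
words.per.line <- stri_split_regex(tolower(blogs_sample), "[^a-z']+", omit_empty = TRUE)
# Form adjacent word pairs (bigrams) within each line
bigrams <- unlist(lapply(words.per.line, function(w) {
    if (length(w) < 2) return(character(0))
    paste(head(w, -1), tail(w, -1))
}))
# Plot the ten most frequent pairs of words
bigram.freqs <- head(sort(table(bigrams), decreasing = TRUE), 10)
barplot(bigram.freqs,
        main = "Blogs Data: Most Frequent Word Pairs",
        xlab = "Bigram",
        ylab = "Count",
        las = 2)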
Creating corpus for News Sample dataset
corpusNewsSample <- Corpus(VectorSource(news_sample))
Data Cleansing for sample News dataset
corpusNewsSample <- tm_map(corpusNewsSample, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpusNewsSample, content_transformer(tolower)):
## transformation drops documents
corpusNewsSample <- tm_map(corpusNewsSample, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpusNewsSample, removePunctuation):
## transformation drops documents
corpusNewsSample <- tm_map(corpusNewsSample, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpusNewsSample, removeNumbers): transformation
## drops documents
corpusNewsSample <- tm_map(corpusNewsSample, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpusNewsSample, removeWords,
## stopwords("english")): transformation drops documents
corpusNewsSample <- tm_map(corpusNewsSample, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpusNewsSample, stripWhitespace):
## transformation drops documents
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpusNewsSample))),decreasing=TRUE), 10)
barplot(frequentWords,
main = "News Data: Most Frequent Words",
xlab="Word",
ylab = "Count")
term.doc.matrix2 <- TermDocumentMatrix(corpusNewsSample)
term.doc.matrix2 <- as.matrix(term.doc.matrix2)
word.freqs2 <- sort(rowSums(term.doc.matrix2), decreasing=TRUE)
dm2 <- data.frame(word=names(word.freqs2), freq=word.freqs2)
Word cloud plot of the most common words in the corpusNewsSample corpus
wordcloud(dm2$word, dm2$freq, min.freq= 10, random.order=TRUE, rot.per=.25, colors=brewer.pal(8, "Dark2"))
Creating corpus for Twitter Data
# Create corpus
corpusTwitterSample <- Corpus(VectorSource(twitter_sample))
## Convert Character Vector between Encodings
corpusTwitterSample <- tm_map(corpusTwitterSample, content_transformer(function(x)
iconv(x, to = "UTF-8", sub = "byte")))
## Warning in tm_map.SimpleCorpus(corpusTwitterSample,
## content_transformer(function(x) iconv(x, : transformation drops documents
Data Cleansing for sample Twitter dataset - corpusTwitterSample
# To lower case
corpusTwitterSample <- tm_map(corpusTwitterSample, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpusTwitterSample,
## content_transformer(tolower)): transformation drops documents
corpusTwitterSample <- tm_map(corpusTwitterSample, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpusTwitterSample, removePunctuation):
## transformation drops documents
corpusTwitterSample <- tm_map(corpusTwitterSample, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpusTwitterSample, removeNumbers):
## transformation drops documents
corpusTwitterSample <- tm_map(corpusTwitterSample, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpusTwitterSample, removeWords,
## stopwords("english")): transformation drops documents
corpusTwitterSample <- tm_map(corpusTwitterSample, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpusTwitterSample, stripWhitespace):
## transformation drops documents
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpusTwitterSample))),decreasing=TRUE), 10)
barplot(frequentWords,
main = "Twitter Data: Most Frequent Words",
xlab="Word",
ylab = "Count")
term.doc.matrix3 <- TermDocumentMatrix(corpusTwitterSample)
term.doc.matrix3 <- as.matrix(term.doc.matrix3)
word.freqs3 <- sort(rowSums(term.doc.matrix3), decreasing=TRUE)
dm3 <- data.frame(word=names(word.freqs3), freq=word.freqs3)
wordcloud(dm3$word, dm3$freq, min.freq= 10,scale=c(2,.5), random.order=FALSE, rot.per=.15, colors=brewer.pal(8, "Dark2"))
Summary and Conclusion for Milestone Report
1. The “blogs” and “news” datasets have fewer lines than the “twitter” dataset but more words per line, since tweets are short.
2. The full datasets are very large; as a result, processing them requires substantial memory and computing resources, which is why sampled subsets were used.
3. Most of the top-ranking n-grams contain English stop words.
4. Using the n-grams we can conceive a crude algorithm to suggest the next word in a text editor. For example, the probability of an untyped word can be estimated from the corpus frequencies of the n-grams containing that word in the last position, conditioned on the last typed word(s) appearing as the first n - 1 words of the n-gram (see the sketch after this list).
5. Finally, it is important to note that each of these steps is crucial and needs to be re-evaluated continuously so that the predictive text app works properly and accurately. I am looking forward to the next report on the predictive model and Shiny app I am going to build.
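To make point 4 concrete, here is a minimal, hedged sketch of such a next-word suggester built on bigram counts from sampled_data (the helper name predict_next_word is illustrative and not part of the report's code):
# Tokenise the sampled data and count adjacent word pairs (bigrams)
tokens <- stri_split_regex(tolower(sampled_data), "[^a-z']+", omit_empty = TRUE)
bigram.counts <- table(unlist(lapply(tokens, function(w) {
    if (length(w) < 2) return(character(0))
    paste(head(w, -1), tail(w, -1))
})))

# Suggest the n words most frequently observed after the last typed word
predict_next_word <- function(last_word, n = 3) {
    prefix <- paste0(tolower(last_word), " ")
    candidates <- bigram.counts[startsWith(names(bigram.counts), prefix)]
    if (length(candidates) == 0) return(character(0))
    top <- head(sort(candidates, decreasing = TRUE), n)
    sub("^\\S+ ", "", names(top))  # drop the typed word, keep the suggestions
}

predict_next_word("in")  # e.g. frequent continuations such as "the"
This is only a frequency-based baseline; the actual prediction model for the Shiny app will need smoothing and back-off to handle unseen word combinations.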