This report provides a short overview of the exploratory analysis of the text data to be used for the Capstone project for the Data Science Specialization along with a description of plans for the word prediction algorithm.
The data set was downloaded from Coursera: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
# Install any package this report loads that is not yet available locally.
# The list covers every library() call visible in the report (dplyr, ggplot2
# and knitr were previously missing), so a fresh machine can run it unattended.
list.of.packages <- c(
  "dplyr", "ggplot2", "stringi", "knitr",
  "tm", "wordcloud", "RColorBrewer"
)
# system.file() returns "" for a package that is not installed; this avoids
# scanning the whole library with installed.packages() just to test presence.
is.installed <- vapply(
  list.of.packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
new.packages <- list.of.packages[!is.installed]
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = "https://cran.rstudio.com/")
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringi)
# Read the three en_US source files into `text` and record, per file, its size
# on disk (in Mb), its number of lines and its number of words.
data.dir <- "C:/Users/be174.BARQSYSTEMS/Desktop/Data Science Capstone/final/en_US"
source.names <- c("blogs", "news", "twitter")
file.list <- file.path(data.dir, paste0("en_US.", source.names, ".txt"))

# Fail early and name the offending path(s) instead of surfacing an opaque
# "cannot open the connection" error from inside the loop.
missing.files <- file.list[!file.exists(file.list)]
if (length(missing.files) > 0) {
  stop(
    "Input file(s) not found: ", paste(missing.files, collapse = ", "),
    call. = FALSE
  )
}

# One list slot per source, filled by the loop below.
text <- setNames(vector("list", length(file.list)), source.names)
matrix.summary <- matrix(
  0,
  nrow = length(file.list), ncol = 3,
  dimnames = list(source.names, c("file size, Mb", "lines", "words"))
)

for (i in seq_along(file.list)) {
  # The connection is opened in binary mode and embedded NULs are skipped
  # (presumably so stray control bytes in the raw files do not truncate or
  # break the read - confirm). tryCatch(finally = ) guarantees the connection
  # is closed even when readLines() fails.
  con <- file(file.list[i], "rb")
  text[[i]] <- tryCatch(
    readLines(con, encoding = "UTF-8", skipNul = TRUE),
    finally = close(con)
  )
  # File size is converted from bytes to Mb and rounded to two decimals.
  matrix.summary[i, "file size, Mb"] <-
    round(file.info(file.list[i])$size / 1024^2, 2)
  matrix.summary[i, "lines"] <- length(text[[i]])
  matrix.summary[i, "words"] <- sum(stri_count_words(text[[i]]))
}

library(knitr)
kable(matrix.summary)
| | file size, Mb | lines | words |
|---|---|---|---|
| blogs | 200.42 | 899288 | 37546239 |
| news | 196.28 | 1010242 | 34762395 |
| twitter | 159.36 | 2360148 | 30093413 |
These datasets are rather large, so I will proceed with the analysis using a small random sample of each. For example, the news file is 196 MB in size and has 1,010,242 lines. I will sample 1% of the lines of each file (about 10k random lines in the case of news) for the analysis.
# Draw a reproducible 1% random sample of lines from each source, pool the
# three samples, and report the pooled word count.
set.seed(123)
blogs_sample <- sample(
  text[["blogs"]],
  size = 0.01 * length(text[["blogs"]])
)
news_sample <- sample(
  text[["news"]],
  size = 0.01 * length(text[["news"]])
)
twitter_sample <- sample(
  text[["twitter"]],
  size = 0.01 * length(text[["twitter"]])
)
sampled_data <- c(blogs_sample, news_sample, twitter_sample)
# NOTE: this variable shares its name with base::sum(). Calls such as sum(x)
# still resolve to the function, but the name is easy to misread.
sum <- sum(stri_count_words(sampled_data))
sum
## [1] 1023563
The sampled data set consists of 1,023,563 words.
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Remove emoticons and any other non-ASCII characters from the pooled sample.
# sub = "" drops only the bytes that cannot be represented in ASCII; without
# it iconv() returns NA for every element that contains such a character,
# discarding the whole line instead of just the emoticon.
# NOTE(review): the corpora below are built from blogs_sample, news_sample and
# twitter_sample, not from sampled_data, so this cleaning does not reach them.
sampled_data <- iconv(sampled_data, "UTF-8", "ASCII", sub = "")
# Blogs: build a tm corpus from the sampled blog lines and normalise its text
# step by step. Each step overwrites corpus1 with the transformed corpus.
# The "## Warning" lines appear to be the output recorded when the report was
# run; they are kept next to the statement that produced them.
# Create corpus (one document per sampled line)
corpus1 <- Corpus(VectorSource(blogs_sample))
# To lower case; tolower is a base function, so it is wrapped with
# content_transformer() before being handed to tm_map()
corpus1 <- tm_map(corpus1, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus1, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation marks
# NOTE(review): punctuation is stripped before stop words are removed; if the
# stop-word list contains contractions written with an apostrophe, they can no
# longer match after this step - confirm the intended order.
corpus1 <- tm_map(corpus1, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus1, removePunctuation): transformation
## drops documents
# Remove numbers
corpus1 <- tm_map(corpus1, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus1, removeNumbers): transformation
## drops documents
# Remove English stop words (the list returned by stopwords("english"))
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus1, removeWords, stopwords("english")):
## transformation drops documents
# Strip the extra whitespace left behind by the removals above
corpus1 <- tm_map(corpus1, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus1, stripWhitespace): transformation
## drops documents
# Blogs: term frequencies, bar chart of the ten most frequent terms, and the
# word/frequency table used for the word cloud.
# The term-document matrix is built and densified once and then reused; the
# original computed it twice, doubling the most expensive step.
term.doc.matrix1 <- as.matrix(TermDocumentMatrix(corpus1))
# Total count of each term across all documents, most frequent first.
word.freqs1 <- sort(rowSums(term.doc.matrix1), decreasing = TRUE)
frequentWords <- head(word.freqs1, 10)
barplot(
  frequentWords,
  main = "Blogs Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)
dm1 <- data.frame(word = names(word.freqs1), freq = word.freqs1)
Word cloud of the most common words in the blogs corpus:
# Blogs word cloud: only terms that occur at least 150 times are drawn,
# placed in random order, with 15% of them rotated.
wordcloud(
  words = dm1$word,
  freq = dm1$freq,
  min.freq = 150,
  scale = c(4, 0.5),
  random.order = TRUE,
  rot.per = 0.15,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(dm1$word, dm1$freq, min.freq = 150, scale = c(4,
## 0.5), : know could not be fit on page. It will not be plotted.
## Warning in wordcloud(dm1$word, dm1$freq, min.freq = 100, random.order =
## TRUE, : can could not be fit on page. It will not be plotted.
# News: build a tm corpus from the sampled news lines and normalise its text
# step by step. Each step overwrites corpus2 with the transformed corpus.
# The "## Warning" lines appear to be the output recorded when the report was
# run; they are kept next to the statement that produced them.
# Create corpus (one document per sampled line)
corpus2 <- Corpus(VectorSource(news_sample))
# To lower case; tolower is a base function, so it is wrapped with
# content_transformer() before being handed to tm_map()
corpus2 <- tm_map(corpus2, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation marks
# NOTE(review): punctuation is stripped before stop words are removed; if the
# stop-word list contains contractions written with an apostrophe, they can no
# longer match after this step - confirm the intended order.
corpus2 <- tm_map(corpus2, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents
# Remove numbers
corpus2 <- tm_map(corpus2, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus2, removeNumbers): transformation
## drops documents
# Remove English stop words (the list returned by stopwords("english"))
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents
# Strip the extra whitespace left behind by the removals above
corpus2 <- tm_map(corpus2, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents
# News: term frequencies, bar chart of the ten most frequent terms, and a word
# cloud of the terms that occur at least 100 times.
# The term-document matrix is built and densified once and then reused; the
# original computed it twice, doubling the most expensive step.
term.doc.matrix2 <- as.matrix(TermDocumentMatrix(corpus2))
# Total count of each term across all documents, most frequent first.
word.freqs2 <- sort(rowSums(term.doc.matrix2), decreasing = TRUE)
frequentWords <- head(word.freqs2, 10)
barplot(
  frequentWords,
  main = "News Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)
dm2 <- data.frame(word = names(word.freqs2), freq = word.freqs2)
wordcloud(
  dm2$word, dm2$freq,
  min.freq = 100,
  random.order = TRUE,
  rot.per = 0.25,
  colors = brewer.pal(8, "Dark2")
)
# Twitter: build a tm corpus from the sampled tweets and normalise its text
# step by step. Each step overwrites corpus3 with the transformed corpus.
# The "## Warning" lines appear to be the output recorded when the report was
# run; they are kept next to the statement that produced them.
# Create corpus (one document per sampled line)
corpus3 <- Corpus(VectorSource(twitter_sample))
# Re-encode every document to UTF-8 before the other transformations.
# sub = "byte" makes iconv() substitute bytes it cannot convert instead of
# turning the whole element into NA.
corpus3 <- tm_map(corpus3, content_transformer(function(x)
iconv(x, to = "UTF-8", sub = "byte")))
## Warning in tm_map.SimpleCorpus(corpus3, content_transformer(function(x)
## iconv(x, : transformation drops documents
# To lower case; tolower is a base function, so it is wrapped with
# content_transformer() before being handed to tm_map()
corpus3 <- tm_map(corpus3, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus3, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation marks
# NOTE(review): punctuation is stripped before stop words are removed; if the
# stop-word list contains contractions written with an apostrophe, they can no
# longer match after this step - confirm the intended order.
corpus3 <- tm_map(corpus3, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus3, removePunctuation): transformation
## drops documents
# Remove numbers
corpus3 <- tm_map(corpus3, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus3, removeNumbers): transformation
## drops documents
# Remove English stop words (the list returned by stopwords("english"))
corpus3 <- tm_map(corpus3, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus3, removeWords, stopwords("english")):
## transformation drops documents
# Strip the extra whitespace left behind by the removals above
corpus3 <- tm_map(corpus3, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus3, stripWhitespace): transformation
## drops documents
# Twitter: term frequencies, bar chart of the ten most frequent terms, and a
# word cloud of the terms that occur at least 150 times.
# The term-document matrix is built and densified once and then reused; the
# original computed it twice, doubling the most expensive step.
term.doc.matrix3 <- as.matrix(TermDocumentMatrix(corpus3))
# Total count of each term across all documents, most frequent first.
word.freqs3 <- sort(rowSums(term.doc.matrix3), decreasing = TRUE)
frequentWords <- head(word.freqs3, 10)
barplot(
  frequentWords,
  main = "Twitter Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)
dm3 <- data.frame(word = names(word.freqs3), freq = word.freqs3)
# random.order = FALSE plots the words in decreasing order of frequency.
wordcloud(
  dm3$word, dm3$freq,
  min.freq = 150,
  scale = c(4, 0.5),
  random.order = FALSE,
  rot.per = 0.15,
  colors = brewer.pal(8, "Dark2")
)