Basic summary

This report provides a short overview of the exploratory analysis of the text data to be used for the Capstone project for the Data Science Specialization along with a description of plans for the word prediction algorithm.

Tasks to accomplish

  1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings that you amassed so far.
  4. Get feedback on your plans for creating a prediction algorithm and Shiny app

Data loading and Analysis

After downloading the data file from Coursera (https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip), the analysis proceeds as follows:

  1. Load the R packages necessary for running the analysis
# Install any package this report needs that is not available yet.
# The list covers every package attached with library() in the report
# (dplyr, ggplot2 and knitr included), so a fresh R installation can run it
# from top to bottom.
list.of.packages <- c(
  "stringi", "tm", "wordcloud", "RColorBrewer",
  "dplyr", "ggplot2", "knitr"
)
new.packages <- list.of.packages[
  !(list.of.packages %in% installed.packages()[, "Package"])
]
# Only contact CRAN when something is actually missing.
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = "https://cran.rstudio.com/")
}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringi)
  2. Load the data
# Paths of the three en_US text files, in the order blogs, news, twitter.
# The directory is written once and file.path() joins the pieces with "/",
# so the resulting strings are the same absolute paths as before.
data_dir <- "C:/Users/be174.BARQSYSTEMS/Desktop/Data Science Capstone/final/en_US"
file.list <- file.path(
  data_dir,
  c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
)
  3. Build a summary table
# Read the three text files and build a summary table.
#   text           - list holding one character vector of lines per source
#   matrix.summary - 3 x 3 matrix: file size in Mb, number of lines,
#                    number of words (rows: blogs, news, twitter)
text <- list(blogs = "", news = "", twitter = "")

matrix.summary <- matrix(
  0,
  nrow = 3, ncol = 3,
  dimnames = list(names(text), c("file size, Mb", "lines", "words"))
)

# The loop fills one row / list slot per file, so the counts must agree.
stopifnot(length(file.list) == length(text))

# Read all lines of one file as UTF-8, skipping embedded NUL bytes.
# The connection is opened in binary mode ("rb") and is closed by on.exit()
# even when readLines() fails, so no connection is leaked.
read_text_lines <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

for (i in seq_along(file.list)) {
  text[[i]] <- read_text_lines(file.list[i])
  # file.info()$size is in bytes; 1024^2 bytes per Mb.
  matrix.summary[i, "file size, Mb"] <-
    round(file.info(file.list[i])$size / 1024^2, 2)
  matrix.summary[i, "lines"] <- length(text[[i]])
  matrix.summary[i, "words"] <- sum(stri_count_words(text[[i]]))
}
library(knitr)
kable(matrix.summary)
|        | file size, Mb|   lines|    words|
|:-------|-------------:|-------:|--------:|
|blogs   |        200.42|  899288| 37546239|
|news    |        196.28| 1010242| 34762395|
|twitter |        159.36| 2360148| 30093413|

These data sets are rather large, so I will proceed with the analysis using a small random sample of each. For example, the news file is 196 MB in size and has 1,010,242 lines; sampling 1% of its lines gives roughly 10,000 lines for analysis.

# Draw a reproducible random sample of lines from each source and count the
# words in the combined sample.
set.seed(123)
# Share of lines kept from every file (1%).
sample_fraction <- 0.01
# floor() makes the integer sample size explicit. The three draws stay in the
# order blogs, news, twitter so the random number stream is consumed as before.
blogs_sample <- sample(
  text$blogs, floor(sample_fraction * length(text$blogs))
)
news_sample <- sample(
  text$news, floor(sample_fraction * length(text$news))
)
twitter_sample <- sample(
  text$twitter, floor(sample_fraction * length(text$twitter))
)
sampled_data <- c(blogs_sample, news_sample, twitter_sample)
# Not named `sum`: that would mask base::sum() in the workspace.
sampled_word_count <- sum(stri_count_words(sampled_data))
sampled_word_count
## [1] 1023563

The sampled data set contains 1,023,563 words.

Build the corpus

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Remove emoticons and any other non-ASCII characters.
# sub = "" drops only the bytes that cannot be converted; with the default
# (sub = NA) iconv() returns NA for every line that contains such a byte.
# NOTE(review): the corpora in this report are built from blogs_sample,
# news_sample and twitter_sample, not from sampled_data -- confirm where this
# cleaned vector is meant to be used.
sampled_data <- iconv(sampled_data, "UTF-8", "ASCII", sub = "")
# Build and clean the blogs corpus.
# Stop words are removed BEFORE punctuation: the English stop word list
# contains contractions such as "don't" and "i'm", which can no longer match
# once the apostrophes have been stripped.
# Create corpus
corpus1 <- Corpus(VectorSource(blogs_sample))
# To lower case (the stop word list is lower case)
corpus1 <- tm_map(corpus1, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus1, content_transformer(tolower)):
## transformation drops documents
# Remove stop words
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus1, removeWords, stopwords("english")):
## transformation drops documents
# Remove punctuation marks
corpus1 <- tm_map(corpus1, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus1, removePunctuation): transformation
## drops documents
# Remove numbers
corpus1 <- tm_map(corpus1, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus1, removeNumbers): transformation
## drops documents
# Collapse the runs of whitespace left behind by the removals above
corpus1 <- tm_map(corpus1, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus1, stripWhitespace): transformation
## drops documents
# Term frequencies for the blogs corpus.
# The term-document matrix is built and converted to a dense matrix only
# once; both the top-10 bar plot and the word/frequency table reuse it.
term.doc.matrix1 <- as.matrix(TermDocumentMatrix(corpus1))
# Total count of each term over all documents, most frequent first.
word.freqs1 <- sort(rowSums(term.doc.matrix1), decreasing = TRUE)
dm1 <- data.frame(word = names(word.freqs1), freq = word.freqs1)

# Ten most frequent terms.
frequentWords <- head(word.freqs1, 10)

barplot(frequentWords,
        main = "Blogs Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")

Word cloud plot of the most common words in the corpus

# Word cloud of the blog terms; terms with a frequency below 150 are not
# plotted.
wordcloud(
  words = dm1$word,
  freq = dm1$freq,
  scale = c(4, 0.5),
  min.freq = 150,
  random.order = TRUE,
  rot.per = 0.15,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(dm1$word, dm1$freq, min.freq = 150, scale = c(4,
## 0.5), : know could not be fit on page. It will not be plotted.

## Warning in wordcloud(dm1$word, dm1$freq, min.freq = 100, random.order =
## TRUE, : can could not be fit on page. It will not be plotted.

News Data

# Build and clean the news corpus.
# Stop words are removed BEFORE punctuation: the English stop word list
# contains contractions such as "don't" and "i'm", which can no longer match
# once the apostrophes have been stripped.
# Create corpus
corpus2 <- Corpus(VectorSource(news_sample))
# To lower case (the stop word list is lower case)
corpus2 <- tm_map(corpus2, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents
# Remove stop words
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents
# Remove punctuation marks
corpus2 <- tm_map(corpus2, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents
# Remove numbers
corpus2 <- tm_map(corpus2, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus2, removeNumbers): transformation
## drops documents
# Collapse the runs of whitespace left behind by the removals above
corpus2 <- tm_map(corpus2, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents
# Term frequencies for the news corpus.
# The term-document matrix is built and converted to a dense matrix only
# once; both the top-10 bar plot and the word/frequency table reuse it.
term.doc.matrix2 <- as.matrix(TermDocumentMatrix(corpus2))
# Total count of each term over all documents, most frequent first.
word.freqs2 <- sort(rowSums(term.doc.matrix2), decreasing = TRUE)
dm2 <- data.frame(word = names(word.freqs2), freq = word.freqs2)

# Ten most frequent terms.
frequentWords <- head(word.freqs2, 10)

barplot(frequentWords,
        main = "News Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")

Most common words in the corpus

# Word cloud of the news terms; terms with a frequency below 100 are not
# plotted.
wordcloud(
  words = dm2$word,
  freq = dm2$freq,
  min.freq = 100,
  random.order = TRUE,
  rot.per = 0.25,
  colors = brewer.pal(8, "Dark2")
)

Twitter Data

# Build and clean the twitter corpus.
# Stop words are removed BEFORE punctuation: the English stop word list
# contains contractions such as "don't" and "i'm", which can no longer match
# once the apostrophes have been stripped.
# Create corpus
corpus3 <- Corpus(VectorSource(twitter_sample))

# Re-encode to UTF-8; bytes that cannot be converted are replaced by their
# hex code (sub = "byte") rather than turning the whole tweet into NA
corpus3 <- tm_map(corpus3, content_transformer(function(x)
  iconv(x, to = "UTF-8", sub = "byte")))
## Warning in tm_map.SimpleCorpus(corpus3, content_transformer(function(x)
## iconv(x, : transformation drops documents
# To lower case (the stop word list is lower case)
corpus3 <- tm_map(corpus3, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus3, content_transformer(tolower)):
## transformation drops documents
# Remove stop words
corpus3 <- tm_map(corpus3, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus3, removeWords, stopwords("english")):
## transformation drops documents
# Remove punctuation marks
corpus3 <- tm_map(corpus3, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus3, removePunctuation): transformation
## drops documents
# Remove numbers
corpus3 <- tm_map(corpus3, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus3, removeNumbers): transformation
## drops documents
# Collapse the runs of whitespace left behind by the removals above
corpus3 <- tm_map(corpus3, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus3, stripWhitespace): transformation
## drops documents
# Term frequencies for the twitter corpus.
# The term-document matrix is built and converted to a dense matrix only
# once; the top-10 bar plot, the word/frequency table and the word cloud all
# reuse it.
term.doc.matrix3 <- as.matrix(TermDocumentMatrix(corpus3))
# Total count of each term over all documents, most frequent first.
word.freqs3 <- sort(rowSums(term.doc.matrix3), decreasing = TRUE)
dm3 <- data.frame(word = names(word.freqs3), freq = word.freqs3)

# Ten most frequent terms.
frequentWords <- head(word.freqs3, 10)

barplot(frequentWords,
        main = "Twitter Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")

# Word cloud of the twitter terms; terms with a frequency below 150 are not
# plotted, and random.order = FALSE plots words in decreasing frequency.
wordcloud(
  words = dm3$word,
  freq = dm3$freq,
  scale = c(4, 0.5),
  min.freq = 150,
  random.order = FALSE,
  rot.per = 0.15,
  colors = brewer.pal(8, "Dark2")
)

Summary

  1. The data sets are quite large, and processing them requires considerable time and computing resources.
  2. Most of the top-ranking n-grams contain English stop words.
  3. Using the n-grams, we can devise a crude algorithm to suggest the next word in a text editor. For example, the probability of an untyped word can be estimated from the corpus frequencies of the n-grams that contain that word in the last position, conditioned on the presence of the last typed word(s) as the first n - 1 words of the n-gram.
  4. Alternatively, we could use a pre-built R algorithm, such as one based on a Hidden Markov model, together with the n-grams calculated from the data sets provided in this class.