Peer-graded Assignment: Milestone Report

Basic summary

This report provides a short overview of the exploratory analysis of the text data to be used for the Capstone project for the Data Science Specialization along with a description of plans for the word prediction algorithm.

Tasks to accomplish

Demonstrate that you’ve downloaded the data and have successfully loaded it in.
Create a basic report of summary statistics about the data sets.
Report any interesting findings that you amassed so far.
Get feedback on your plans for creating a prediction algorithm and Shiny app

Data loading and Analysis

First, download the data set from Coursera: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

Load the R packages necessary for running the analysis

# Packages this report attaches with library(); any that are not yet
# installed are installed first so the report runs on a fresh machine.
list.of.packages <- c("dplyr", "ggplot2", "knitr", "stringi", "tm", "wordcloud", "RColorBrewer")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  # Use the https mirror so packages are not fetched over plain http.
  install.packages(new.packages, repos = "https://cran.rstudio.com/")
}
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(stringi)

Load the data

# Directory that holds the unzipped en_US text files.
data.dir <- "C:/Users/be174.BARQSYSTEMS/Desktop/Data Science Capstone/final/en_US"
# Full paths of the blogs, news and twitter files, in that order.
file.list <- file.path(data.dir, c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))

Building a table

# Holds the raw lines of each file; element order matches file.list
# (blogs, news, twitter).
text <- list(blogs = "", news = "", twitter = "")

# Summary table: one row per file; columns are the size on disk in MB,
# the number of lines and the number of words.
matrix.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("blogs", "news", "twitter"), c("file size, Mb", "lines", "words")))

# Fail early with a clear message instead of an obscure connection error.
stopifnot(length(file.list) == length(text))
missing.files <- file.list[!file.exists(file.list)]
if (length(missing.files) > 0) {
  stop("Input file(s) not found: ", paste(missing.files, collapse = ", "), call. = FALSE)
}

for (i in seq_along(file.list)) {
  # Read through a binary-mode connection; embedded NULs are skipped.
  con <- file(file.list[i], "rb")
  # Close the connection even if reading fails.
  text[[i]] <- tryCatch(
    readLines(con, encoding = "UTF-8", skipNul = TRUE),
    finally = close(con)
  )
  matrix.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  matrix.summary[i, 2] <- length(text[[i]])
  matrix.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}
library(knitr)
kable(matrix.summary)

	file size, Mb	lines	words
blogs	200.42	899288	37546239
news	196.28	1010242	34762395
twitter	159.36	2360148	30093413

These data sets are rather large, so I will proceed with the analysis using a small random sample of each. For example, the news file is 196 MB in size and has 1,010,242 lines. I will use a 1% random sample of the lines of each file (about 10,000 lines for the news file) for the analysis.

# Fix the RNG seed so the random samples are reproducible.
set.seed(123)
# Fraction of the lines drawn from each source file.
sample.fraction <- 0.01
# The sample size must be a whole number, so round the fraction down.
blogs_sample <- sample(text$blogs, floor(sample.fraction * length(text$blogs)))
news_sample <- sample(text$news, floor(sample.fraction * length(text$news)))
twitter_sample <- sample(text$twitter, floor(sample.fraction * length(text$twitter)))
# Combined sample across the three sources.
sampled_data <- c(blogs_sample, news_sample, twitter_sample)
# Total number of words in the combined sample. Stored under its own name
# rather than `sum`, so the base function sum() is not shadowed.
sampled_word_count <- sum(stri_count_words(sampled_data))
sampled_word_count

## [1] 1023563

The sampled data set consists of 1,023,563 words.

Build the corpus

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)
# Remove emoticons and other characters that have no ASCII equivalent from
# the combined sample. sub = "" drops only the offending characters; without
# it iconv() turns every line containing such a character into NA.
sampled_data <- iconv(sampled_data, from = "UTF-8", to = "ASCII", sub = "")
# NOTE(review): corpus1 is built from blogs_sample, not from the cleaned
# sampled_data above, so the ASCII conversion does not affect this corpus.
# Create the blogs corpus
corpus1 <- Corpus(VectorSource(blogs_sample))
# Convert to lower case
corpus1 <- tm_map(corpus1, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus1, content_transformer(tolower)):
## transformation drops documents

# Remove punctuation marks
corpus1 <- tm_map(corpus1, removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus1, removePunctuation): transformation
## drops documents

# Remove numbers
corpus1 <- tm_map(corpus1, removeNumbers)

## Warning in tm_map.SimpleCorpus(corpus1, removeNumbers): transformation
## drops documents

# Remove English stop words
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(corpus1, removeWords, stopwords("english")):
## transformation drops documents

# Collapse runs of whitespace left behind by the removals above
corpus1 <- tm_map(corpus1, stripWhitespace)

## Warning in tm_map.SimpleCorpus(corpus1, stripWhitespace): transformation
## drops documents

# Build the term-document matrix once and reuse it for both the bar plot and
# the word-cloud data, instead of constructing and densifying it twice.
term.doc.matrix1 <- TermDocumentMatrix(corpus1)
term.doc.matrix1 <- as.matrix(term.doc.matrix1)
# Total count of each term across all documents, most frequent first.
word.freqs1 <- sort(rowSums(term.doc.matrix1), decreasing = TRUE)

# Ten most frequent terms in the blogs sample.
frequentWords <- head(word.freqs1, 10)

barplot(frequentWords,
        main = "Blogs Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")

# Word/frequency table used by the word cloud.
dm1 <- data.frame(word = names(word.freqs1), freq = word.freqs1)

Word cloud plot of the most common words in the corpus

# Word cloud of the terms in dm1 that occur at least 150 times; words are
# placed in random order and 15% of them are rotated.
wordcloud(
  words = dm1$word,
  freq = dm1$freq,
  scale = c(4, .5),
  min.freq = 150,
  random.order = TRUE,
  rot.per = .15,
  colors = brewer.pal(8, "Dark2")
)

## Warning in wordcloud(dm1$word, dm1$freq, min.freq = 150, scale = c(4,
## 0.5), : know could not be fit on page. It will not be plotted.

## Warning in wordcloud(dm1$word, dm1$freq, min.freq = 100, random.order =
## TRUE, : can could not be fit on page. It will not be plotted.

News Data

# Build the news corpus from the sampled news lines
corpus2 <- Corpus(VectorSource(news_sample))
# Lower-case all text
corpus2 <- tm_map(x = corpus2, FUN = content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents

# Strip punctuation
corpus2 <- tm_map(x = corpus2, FUN = removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents

# Strip digits
corpus2 <- tm_map(x = corpus2, FUN = removeNumbers)

## Warning in tm_map.SimpleCorpus(corpus2, removeNumbers): transformation
## drops documents

# Drop English stop words
corpus2 <- tm_map(x = corpus2, FUN = removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents

# Collapse repeated whitespace
corpus2 <- tm_map(x = corpus2, FUN = stripWhitespace)

## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents

# Build the term-document matrix once and reuse it for both the bar plot and
# the word-cloud data, instead of constructing and densifying it twice.
term.doc.matrix2 <- TermDocumentMatrix(corpus2)
term.doc.matrix2 <- as.matrix(term.doc.matrix2)
# Total count of each term across all documents, most frequent first.
word.freqs2 <- sort(rowSums(term.doc.matrix2), decreasing = TRUE)

# Ten most frequent terms in the news sample.
frequentWords <- head(word.freqs2, 10)

barplot(frequentWords,
        main = "News Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")

# Word/frequency table used by the word cloud.
dm2 <- data.frame(word = names(word.freqs2), freq = word.freqs2)

Most common words in the corpus

# Word cloud of the terms in dm2 that occur at least 100 times; words are
# placed in random order and 25% of them are rotated.
wordcloud(
  words = dm2$word,
  freq = dm2$freq,
  min.freq = 100,
  random.order = TRUE,
  rot.per = .25,
  colors = brewer.pal(8, "Dark2")
)

Twitter Data

# Create the twitter corpus
corpus3 <- Corpus(VectorSource(twitter_sample))

# Sanitise the encoding: treat the input as UTF-8 (the files are read with
# encoding = "UTF-8") and replace any byte that is not valid UTF-8 with its
# hex code. Without an explicit `from`, iconv() assumes the native locale
# encoding, which is not UTF-8 on every platform.
corpus3 <- tm_map(corpus3, content_transformer(function(x)
  iconv(x, from = "UTF-8", to = "UTF-8", sub = "byte")))

## Warning in tm_map.SimpleCorpus(corpus3, content_transformer(function(x)
## iconv(x, : transformation drops documents

# Convert to lower case
corpus3 <- tm_map(corpus3, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus3, content_transformer(tolower)):
## transformation drops documents

# Remove punctuation marks
corpus3 <- tm_map(corpus3, removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus3, removePunctuation): transformation
## drops documents

# Remove numbers
corpus3 <- tm_map(corpus3, removeNumbers)

## Warning in tm_map.SimpleCorpus(corpus3, removeNumbers): transformation
## drops documents

# Remove English stop words
corpus3 <- tm_map(corpus3, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(corpus3, removeWords, stopwords("english")):
## transformation drops documents

# Collapse runs of whitespace left behind by the removals above
corpus3 <- tm_map(corpus3, stripWhitespace)

## Warning in tm_map.SimpleCorpus(corpus3, stripWhitespace): transformation
## drops documents

# Build the term-document matrix once and reuse it for both the bar plot and
# the word-cloud data, instead of constructing and densifying it twice.
term.doc.matrix3 <- TermDocumentMatrix(corpus3)
term.doc.matrix3 <- as.matrix(term.doc.matrix3)
# Total count of each term across all documents, most frequent first.
word.freqs3 <- sort(rowSums(term.doc.matrix3), decreasing = TRUE)

# Ten most frequent terms in the twitter sample.
frequentWords <- head(word.freqs3, 10)

barplot(frequentWords,
        main = "Twitter Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")

# Word/frequency table used by the word cloud.
dm3 <- data.frame(word = names(word.freqs3), freq = word.freqs3)

# Word cloud of the terms in dm3 that occur at least 150 times; words are
# placed in decreasing order of frequency and 15% of them are rotated.
wordcloud(
  words = dm3$word,
  freq = dm3$freq,
  scale = c(4, .5),
  min.freq = 150,
  random.order = FALSE,
  rot.per = .15,
  colors = brewer.pal(8, "Dark2")
)

Summary

The data sets are large, and processing them requires considerable time and computing resources.
Most of the top-ranking n-grams contain English stop words.
Using the n-grams, we can devise a simple algorithm to suggest the next word in a text editor. For example, the probability of an untyped word can be estimated from the corpus frequencies of the n-grams that end with that word, conditioned on the last typed word(s) appearing as the first n - 1 words of the n-gram.
Alternatively, we can use a pre-built R algorithm, such as one based on a hidden Markov model, together with the n-grams calculated from the data sets provided in this class.