Introduction

This is the Milestone Report for the Coursera Data Science Capstone project. The goal of the capstone is to build a predictive text model from a large corpus of documents, using natural language processing techniques to analyze the data and construct the model.

This milestone report describes the major features of the training data with our exploratory data analysis and summarizes our plans for creating the predictive model.

Data source

Location: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The data sets consist of text from 3 different sources:

  1. News,
  2. Blogs and
  3. Twitter feeds.

The text data are provided in 4 different languages:

  1. German,
  2. English - United States,
  3. Finnish and
  4. Russian.

In this project, we will focus only on the English - United States data sets.
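
For reproducibility, the archive can be downloaded and unpacked directly from R before setting the working directory. A minimal sketch using the URL above (the destination file name is arbitrary):

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")  # extracts into final/<locale>/ directories
}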

Set up libraries and working directory

library(tm)
## Loading required package: NLP
library(stringi)
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer

setwd("C:\\Users\\mahajvi1\\Desktop\\Coursera_capstone\\final\\en_US\\")

Read the blogs and Twitter data into R

We examine the three data sets and summarize our findings below: file sizes, line counts, word counts, and mean words per line.

blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)

# Get file sizes in megabytes

blogs.size <- file.info("en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info("en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info("en_US.twitter.txt")$size / 1024 ^ 2

# Count words per line in each file

blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)

# Summary of the data sets

data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(blogs.size, news.size, twitter.size),
           num.lines = c(length(blogs), length(news), length(twitter)),
           num.words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
           mean.num.words = c(mean(blogs.words), mean(news.words), mean(twitter.words)))
##    source file.size.MB num.lines num.words mean.num.words
## 1   blogs     200.4242    899288  38154238       42.42716
## 2    news     196.2775     77259   2693898       34.86840
## 3 twitter     159.3641   2360148  30218166       12.80350
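
As expected, the Twitter lines are much shorter on average (about 13 words) than the blog (about 42) or news (about 35) lines. The comparatively small line count for the news file likely reflects an embedded control character that stops a text-mode read on Windows; reading the file through a binary connection, e.g. readLines(file("en_US.news.txt", open = "rb"), ...), would recover the remaining lines.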

Use limited data for testing

# Combine the first 5,000 lines from each source, pasting corresponding lines together
merged <- paste(news[1:5000], blogs[1:5000], twitter[1:5000])
corpus <- VCorpus(VectorSource(merged))

# Remove large files to clean up memory
rm(blogs.words, news.words, twitter.words)
rm(blogs, news, twitter)
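
Note that this takes the first 5,000 lines of each file rather than a random sample, so any ordering in the source files carries into the corpus. A random sample would avoid that bias; a sketch of the alternative, to be run before the rm() calls above (sampleLines is a hypothetical helper, not part of the original code):

set.seed(1234)  # for reproducibility
sampleLines <- function(x, n = 5000) x[sample(length(x), n)]
merged <- paste(sampleLines(news), sampleLines(blogs), sampleLines(twitter))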

Clean the data

Before performing exploratory analysis, we first clean the data.

This involves converting the text to lower case and removing
(1) URLs,
(2) special characters,
(3) punctuation,
(4) numbers,
(5) excess whitespace, and
(6) stopwords.

corpus <- tm_map(corpus, content_transformer(tolower))
# Remove URLs with a custom transformer
removeURL <- content_transformer(function(x) gsub("(f|ht)tp\\S+|www\\.\\S+", "", x))
corpus <- tm_map(corpus, removeURL)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords())
corpus <- tm_map(corpus, stripWhitespace)

# Flatten the corpus back into a character data frame for the RWeka tokenizer
corpusDf <- data.frame(text = unlist(sapply(corpus, `[`, "content")),
                       stringsAsFactors = FALSE)

findNGrams <- function(corp, grams) {
  ngram <- NGramTokenizer(corp, Weka_control(min = grams, max = grams,
                                             delimiters = " \\r\\n\\t.,;:\"()?!"))
  ngram2 <- data.frame(table(ngram))
  # keep only the 100 most frequent n-grams
  ngram3 <- ngram2[order(ngram2$Freq, decreasing = TRUE), ][1:100, ]
  colnames(ngram3) <- c("String","Count")
  ngram3
}

TwoGrams <- findNGrams(corpusDf, 2)
ThreeGrams <- findNGrams(corpusDf, 3)
FourGrams <- findNGrams(corpusDf, 4)
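
As a quick sanity check before plotting, the top of each table can be inspected; the exact n-grams and counts will vary with the sampled lines:

head(TwoGrams, 5)
head(ThreeGrams, 5)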

Plot word clouds and bar charts

require(RColorBrewer)

par(mfrow = c(1, 3))
palette <- brewer.pal(8,"Dark2")

wordcloud(TwoGrams[, 1], TwoGrams[, 2], min.freq = 1,
          random.order = FALSE, ordered.colors = FALSE, colors = palette)
text(x = 0.5, y = 0, "2-gram cloud")

wordcloud(ThreeGrams[, 1], ThreeGrams[, 2], min.freq = 1,
          random.order = FALSE, ordered.colors = FALSE, colors = palette)
text(x = 0.5, y = 0, "3-gram cloud")

wordcloud(FourGrams[, 1], FourGrams[, 2], min.freq = 1,
          random.order = FALSE, ordered.colors = FALSE, colors = palette)
text(x = 0.5, y = 0, "4-gram cloud")

par(mfrow = c(1, 1))

barplot(TwoGrams[1:20,2], 
        cex.names=0.5, 
        names.arg=TwoGrams[1:20,1], 
        col="red", 
        main="2-Grams", 
        las=2)

barplot(ThreeGrams[1:20,2], 
        cex.names=0.5, 
        names.arg=ThreeGrams[1:20,1], 
        col="green", 
        main="3-Grams", 
        las=2)

barplot(FourGrams[1:20,2], 
        cex.names=0.5, 
        names.arg=FourGrams[1:20,1], 
        col="blue", 
        main="4-Grams", 
        las=2)
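
Next steps

These n-gram frequency tables are the building blocks of the planned predictive model: given the last words typed, look up the highest-count n-gram that begins with them and suggest its final word, backing off to shorter n-grams when no match is found. A minimal sketch against the tables built above (predictNext is illustrative, not the final model):

predictNext <- function(phrase, tables = list(FourGrams, ThreeGrams, TwoGrams)) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  for (tbl in tables) {
    tbl <- tbl[!is.na(tbl$String), ]
    # context length = n-gram length minus the word to be predicted
    n <- length(strsplit(as.character(tbl$String[1]), " ")[[1]]) - 1
    if (length(words) >= n) {
      context <- paste(tail(words, n), collapse = " ")
      hits <- tbl[startsWith(as.character(tbl$String), paste0(context, " ")), ]
      if (nrow(hits) > 0)
        return(sub(".* ", "", as.character(hits$String[1])))  # last word of top match
    }
  }
  NA_character_
}

Because stopwords were stripped during cleaning, these particular tables would predict poorly for everyday phrases; the final model will likely retain stopwords, since they are exactly the words a keyboard most often needs to suggest.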