Goal and Motivation

The goal of this project is simply to demonstrate that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on RPubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.

The motivation for this project is to:

  1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings that you have amassed so far.
  4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

The data provided for NLP (Natural Language Processing) consists of three corpora per language (blogs, news and Twitter text) in four languages: German, English, Finnish and Russian. This report works with the English (en_US) files.

Load Requisite Libraries, Download/Extract Data, Read Data

# Load libraries and suppress messages for ease of reading report
suppressMessages(library(dplyr)) 
suppressMessages(library(ggplot2)) 
suppressMessages(library(LaF)) 
suppressMessages(library(quanteda)) 
suppressMessages(library(RColorBrewer)) 
suppressMessages(library(RWeka)) 
suppressMessages(library(SnowballC))
suppressMessages(library(tau)) 
suppressMessages(library(tm)) 
suppressMessages(library(wordcloud))

Data: Download, Extract and Read

# Download and extract data
source_file <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destination_file <- "Coursera-SwiftKey.zip"
download.file(source_file, destination_file)
unzip(destination_file)

# List the contents of the zip archive
unzip(destination_file, list = TRUE )
##                             Name    Length                Date
## 1                         final/         0 2014-07-22 10:10:00
## 2                   final/de_DE/         0 2014-07-22 10:10:00
## 3  final/de_DE/de_DE.twitter.txt  75578341 2014-07-22 10:11:00
## 4    final/de_DE/de_DE.blogs.txt  85459666 2014-07-22 10:11:00
## 5     final/de_DE/de_DE.news.txt  95591959 2014-07-22 10:11:00
## 6                   final/ru_RU/         0 2014-07-22 10:10:00
## 7    final/ru_RU/ru_RU.blogs.txt 116855835 2014-07-22 10:12:00
## 8     final/ru_RU/ru_RU.news.txt 118996424 2014-07-22 10:12:00
## 9  final/ru_RU/ru_RU.twitter.txt 105182346 2014-07-22 10:12:00
## 10                  final/en_US/         0 2014-07-22 10:10:00
## 11 final/en_US/en_US.twitter.txt 167105338 2014-07-22 10:12:00
## 12    final/en_US/en_US.news.txt 205811889 2014-07-22 10:13:00
## 13   final/en_US/en_US.blogs.txt 210160014 2014-07-22 10:13:00
## 14                  final/fi_FI/         0 2014-07-22 10:10:00
## 15    final/fi_FI/fi_FI.news.txt  94234350 2014-07-22 10:11:00
## 16   final/fi_FI/fi_FI.blogs.txt 108503595 2014-07-22 10:12:00
## 17 final/fi_FI/fi_FI.twitter.txt  25331142 2014-07-22 10:10:00
# Load the en_US data
dataBlogs <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
dataNews <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
dataTwitter <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

# Convert to ASCII (non-convertible characters are replaced with their byte codes)
dataNews <- iconv(dataNews, 'UTF-8', 'ASCII', "byte")
dataBlogs <- iconv(dataBlogs, 'UTF-8', 'ASCII', "byte")
dataTwitter <- iconv(dataTwitter, 'UTF-8', 'ASCII', "byte")

Exploratory Analysis, inc. File Sizes, Line Counts and Line Lengths

# Assess the size (in bytes) of the blogs, news and Twitter files
dataBlogs.filesize <- file.size("./final/en_US/en_US.blogs.txt")
dataNews.filesize <- file.size("./final/en_US/en_US.news.txt")
dataTwitter.filesize <- file.size("./final/en_US/en_US.twitter.txt")
# Determine the number of characters in each line and the length of the longest line
dataBlogs.lineChars <- nchar(dataBlogs)
tmax <- which.max(dataBlogs.lineChars)
dataBlogs.longestLine <- nchar(dataBlogs[tmax])

dataNews.lineChars <- nchar(dataNews)
tmax <- which.max(dataNews.lineChars)
dataNews.longestLine <- nchar(dataNews[tmax])

dataTwitter.lineChars <- nchar(dataTwitter)
tmax <- which.max(dataTwitter.lineChars)
dataTwitter.longestLine <- nchar(dataTwitter[tmax])
# Combine file size, line count and longest line into a data frame
dataframe.blogs <- c(dataBlogs.filesize, length(dataBlogs.lineChars), dataBlogs.longestLine)
dataframe.news <- c(dataNews.filesize, length(dataNews.lineChars), dataNews.longestLine)
dataframe.twitter <- c(dataTwitter.filesize, length(dataTwitter.lineChars), dataTwitter.longestLine)

Create Table: File Size, Line Count and Longest Line

info <- data.frame(rbind(dataframe.blogs, dataframe.news, dataframe.twitter))
names(info) <- c("File Size (bytes)", "Line Count", "Longest Line (chars)")
row.names(info) <- c("Blogs", "News", "Twitter")

# Showcase table
info
##         File Size (bytes) Line Count Longest Line (chars)
## Blogs            210160014     899288                 40844
## News             205811889    1010242                 11384
## Twitter          167105338    2360148                   589
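Note that the middle column is a count of lines, not words. Approximate word counts could be obtained by splitting each line on whitespace; a sketch (object names are illustrative, output not shown):

# Approximate word counts: split every line on whitespace and sum the number of pieces
dataBlogs.wordTotal   <- sum(sapply(strsplit(dataBlogs, "\\s+"), length))
dataNews.wordTotal    <- sum(sapply(strsplit(dataNews, "\\s+"), length))
dataTwitter.wordTotal <- sum(sapply(strsplit(dataTwitter, "\\s+"), length))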

Sampling Exercise: Word Frequency

# Assess maximum number of characters in a line of the files
summary(nchar(dataBlogs))[6] 
##  Max. 
## 40840
summary(nchar(dataNews))[6] 
##  Max. 
## 11380
summary(nchar(dataTwitter))[6] 
## Max. 
##  589
# Sample 5% of the lines in each file, since the full files are too large to process quickly
dataBlogs_sample_size   <- round(.05 * length(dataBlogs), 0)
dataNews_sample_size    <- round(.05 * length(dataNews), 0) 
dataTwitter_sample_size <- round(.05 * length(dataTwitter), 0)

# Draw a random sample of approximately 5% of the lines from each file with LaF::sample_lines
dataBlogs_sample <- sample_lines("./final/en_US/en_US.blogs.txt", n = dataBlogs_sample_size, nlines = NULL) 
dataNews_sample <- sample_lines("./final/en_US/en_US.news.txt", n = dataNews_sample_size , nlines = NULL) 
dataTwitter_sample <- sample_lines("./final/en_US/en_US.twitter.txt", n = dataTwitter_sample_size, nlines = NULL)

# Build a quanteda document-feature matrix (word frequencies) for each of the three samples
# (newer quanteda releases require tokenising first, i.e. dfm(tokens(x)))
dataBlogs_word_freq <- dfm(dataBlogs_sample, verbose = FALSE)
dataNews_word_freq <- dfm(dataNews_sample, verbose = FALSE)
dataTwitter_word_freq <- dfm(dataTwitter_sample, verbose = FALSE)

docfreq(dataBlogs_word_freq)[1:11]
##                 folks               hanging                   out 
##                   131                   100                  4548 
##                    in                   the                  park 
##                 16949                 27534                   204 
##             applauded                  free                  show 
##                     5                   532                   752 
##                    of precisely-coordinated 
##                 20470                     1
docfreq(dataNews_word_freq)[1:11]
##         for     traders  treasuries represented          an     obvious 
##       13924          19           3          64        5462          56 
##        safe       haven        from      stocks         and 
##         156          19        6587          79       26525
docfreq(dataTwitter_word_freq)[1:11]
##   drunk     ppl     are    very    loud   bruhh     too fucking    late 
##     126     257    7533    1224     125       1    2827     352     399 
##    lmao     i'm 
##     423    6082
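The docfreq() calls above report document frequencies for the first eleven features encountered, not for the most common words. quanteda's topfeatures() gives the overall most frequent terms; a short sketch (output not shown):

# Ten most frequent terms (total counts) in each sampled source
topfeatures(dataBlogs_word_freq, 10)
topfeatures(dataNews_word_freq, 10)
topfeatures(dataTwitter_word_freq, 10)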

Clean Corpus & Incorporate “Bad Words”

require(tm)
require(SnowballC)
require(RWeka)
require(slam)
## Loading required package: slam
require(ggplot2)
# Set CleanR function: a reusable tm cleaning pipeline (numbers, punctuation, case, whitespace, stemming)
CleanR <- function(corpus){
        tm_map(corpus, removeNumbers) %>%
                tm_map(removePunctuation) %>%
                tm_map(content_transformer(tolower)) %>%
                tm_map(stripWhitespace) %>%
                tm_map(stemDocument)
}
# Combine the blogs, news and Twitter data and save to .RData
all <- c(dataBlogs, dataNews, dataTwitter)
save(all, file="all.RData")

# Take a 2% sample of the combined data for corpus building
all.sample <- sample(all, round(0.02*length(all)))
save(all.sample, file="sample-2p.RData")
# Create the corpus
corpus <- Corpus(VectorSource(all.sample))
corpus <- tm_map(corpus, content_transformer(removePunctuation), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(removeNumbers), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(tolower), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(stripWhitespace), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(PlainTextDocument), lazy = TRUE)
# Read the profanity list ("bad words" file) and strip the *, ( and ) characters from the entries
bad.word <- read.delim(file = "googlebadwords.txt", sep = ":", header = FALSE)
bad.word_new <- gsub("[*()]", "", bad.word[,1])
corpus <- tm_map(corpus, removeWords, bad.word_new)
# Save the corpus for next phase of capstone
save(corpus, file="WorkingCorpus.RData")
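The saved .RData file lets later sessions skip the cleaning step; reloading is a one-liner (sketch):

# Restore the cleaned corpus in a later session; load() recreates the object named `corpus`
load("WorkingCorpus.RData")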

NGram Analysis

# Create unigram, bigram and trigram tokenizer functions via RWeka
unigram_token <- function(x)  NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram_token <- function(x)   NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram_token <- function(x)  NGramTokenizer(x, Weka_control(min = 3, max = 3))
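Only the unigram tokenizer is exercised in this report; the bigram and trigram tokenizers plug into the same TermDocumentMatrix() workflow used in the next section. A minimal sketch (not run here, object names illustrative):

# Sketch: build bigram and trigram term-document matrices the same way as the unigram one
bigram  <- TermDocumentMatrix(corpus, control = list(tokenize = bigram_token))
trigram <- TermDocumentMatrix(corpus, control = list(tokenize = trigram_token))

# Collapse across documents and inspect the ten most frequent bigrams
bigram.good <- rollup(bigram, 2, na.rm = TRUE, FUN = sum)
head(sort(rowSums(as.matrix(bigram.good)), decreasing = TRUE), 10)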

UniGram Analysis, inc. Frequency Plot

# Build the unigram term-document matrix using the RWeka tokenizer
options(stringsAsFactors = FALSE)
options(mc.cores = 1)
unigram <- TermDocumentMatrix(corpus, control=list(tokenize=unigram_token))
unigram.good <- rollup(unigram, 2, na.rm=TRUE, FUN = sum)
# Keep terms appearing at least three times, then sort by decreasing frequency
unigram.tf <- findFreqTerms(unigram.good, lowfreq = 3)
unigram.tf <- sort(rowSums(as.matrix(unigram.good[unigram.tf, ])), decreasing = TRUE)
unigram.tf <- data.frame(unigram.good=names(unigram.tf), frequency=unigram.tf)
names(unigram.tf) <- c("word", "frequency")
head(unigram.tf, 10)
##      word frequency
## the   the     95210
## and   and     48111
## for   for     22235
## that that     20585
## you   you     18712
## with with     14113
## was   was     12374
## this this     10862
## have have     10500
## are   are      9856
# Plot the ten most frequent unigrams
g <- ggplot(data = head(unigram.tf, 10), aes(x = word, y = frequency))
g <- g + geom_bar(stat = "identity", fill = "red", colour = "black")
g <- g + geom_text(aes(label = frequency), vjust = -0.1)
g <- g + theme(axis.text.x = element_text(angle = 45, hjust = 1))
g

BiGram Analysis

# Count bigrams in the blogs sample with tau::textcnt and sort by frequency
bi.gram.dataBlogs <- textcnt(dataBlogs_sample, n = 2, method = "string") 
bi.gram.dataBlogs <- bi.gram.dataBlogs[order(bi.gram.dataBlogs, decreasing = TRUE)]
bi.gram.dataBlogs[1:3] # top three two-word combinations
## of the in the to the 
##   9420   7817   4386
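The same textcnt() call extends naturally to trigrams; a sketch on the blogs sample (output not shown):

# Trigram counts for the blogs sample, sorted by decreasing frequency
tri.gram.dataBlogs <- textcnt(dataBlogs_sample, n = 3, method = "string")
tri.gram.dataBlogs <- tri.gram.dataBlogs[order(tri.gram.dataBlogs, decreasing = TRUE)]
tri.gram.dataBlogs[1:3] # top three three-word combinations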

Word Clouds

# Build tm corpora from the raw samples (newer tm releases expect doc_id/text columns for DataframeSource)
blogs_corpus <- VCorpus(DataframeSource(data.frame(dataBlogs_sample)))
news_corpus <- VCorpus(DataframeSource(data.frame(dataNews_sample)))
twitter_corpus <- VCorpus(DataframeSource(data.frame(dataTwitter_sample)))

rm(dataBlogs_sample); rm(dataNews_sample); rm(dataTwitter_sample)

blogs_corpus <- CleanR(blogs_corpus)
news_corpus <- CleanR(news_corpus)
twitter_corpus <- CleanR(twitter_corpus)

pal <- brewer.pal(8,"Accent")

wordcloud(blogs_corpus, max.words = 90, random.order = FALSE, colors = pal)

wordcloud(news_corpus, max.words = 90, random.order = FALSE, colors = pal)

wordcloud(twitter_corpus, max.words = 90, random.order = FALSE, colors = pal)

Next Steps

  1. Create a Shiny app that takes a phrase (multiple words) as input in a text box and outputs a prediction of the next word (a minimal, illustrative sketch of such a next-word lookup follows below).
  2. Develop a report to showcase the findings and document how the Shiny app works.
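
As a rough illustration only (not the final algorithm), the bigram counts from the BiGram Analysis section can already drive a naive next-word lookup. The function below, predict_next_word, is hypothetical and assumes bi.gram.dataBlogs is still in memory:

# Hypothetical sketch: return the word that most often follows the last word of `phrase`,
# according to the bigram counts in bi.gram.dataBlogs (names are of the form "word1 word2")
predict_next_word <- function(phrase, bigrams = bi.gram.dataBlogs) {
        last_word <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 1)
        candidates <- bigrams[grepl(paste0("^", last_word, " "), names(bigrams))]
        if (length(candidates) == 0) return(NA_character_)
        strsplit(names(which.max(candidates)), " ")[[1]][2]
}

predict_next_word("See you in")  # with the counts above, "in" is most often followed by "the"

The eventual algorithm will be considerably more sophisticated than this single-table lookup, but the sketch shows the basic idea behind the Shiny app.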