Introduction

This milestone report covers the exploratory data analysis for the Capstone Project of the Data Science Specialization on Coursera.

The key partners for this project are SwiftKey and Coursera. The project explores the Natural Language Processing (NLP) facet of Data Science.

A large text corpus of documents is used to build a model that predicts the next word from the preceding input.

After the initial extraction and cleaning of the data, the results will be presented in a Shiny application.

This report outlines the plan for preparing, exploring, and presenting data from a large corpus of text.

library(ggplot2)
library(stringi)
library(NLP)
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(tm)
library(rJava)
library(RWekajars)
library(SnowballC)
library(RColorBrewer)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## Loading required package: qdapTools
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following objects are masked from 'package:base':
## 
##     Filter, proportions
library(RWeka)
library(openNLP)
# Open each file in binary mode, read it as UTF-8 text, then close the connection
con <- file("C:\\Users\\Dell\\Desktop\\Coursera-SwiftKey\\final\\en_US\\en_US.blogs.txt", open = "rb")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file("C:\\Users\\Dell\\Desktop\\Coursera-SwiftKey\\final\\en_US\\en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file("C:\\Users\\Dell\\Desktop\\Coursera-SwiftKey\\final\\en_US\\en_US.twitter.txt", open = "rb")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
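
Before sampling, it helps to get a sense of how large each file is. Below is a minimal sketch of the kind of summary that could be computed with the stringi package loaded above; it is not part of the original report, so no output is shown.

# Line and word counts for each of the three source files
data.frame(
  source = c("blogs", "news", "twitter"),
  lines  = c(length(blogs), length(news), length(twitter)),
  words  = c(sum(stri_count_words(blogs)),
             sum(stri_count_words(news)),
             sum(stri_count_words(twitter)))
)
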
set.seed(1234)

# Sample 5,000 lines from each source (without replacement, so no line appears twice)
subTwitter <- sample(twitter, size = 5000)
subBlogs <- sample(blogs, size = 5000)
subNews <- sample(news, size = 5000)
sampledText <- c(subTwitter, subBlogs, subNews)
corpus <- Corpus(VectorSource(sampledText))

# Normalise the encoding to UTF-8, substituting any invalid bytes
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to="UTF-8", sub="byte")))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(function(x)
## iconv(x, : transformation drops documents
# Convert all text to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation, preserving intra-word dashes
corpus <- tm_map(corpus, content_transformer(removePunctuation), preserve_intra_word_dashes=TRUE)
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removePunctuation), :
## transformation drops documents
# Remove profanity
profanityWords <- readLines('profane_words.txt')
## Warning in readLines("profane_words.txt"): incomplete final line found on
## 'profane_words.txt'
corpus <- tm_map(corpus, removeWords, profanityWords)
## Warning in tm_map.SimpleCorpus(corpus, removeWords, profanityWords):
## transformation drops documents
# Remove numbers
corpus <- tm_map(corpus, content_transformer(removeNumbers))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removeNumbers)):
## transformation drops documents
# Take out URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removeURL)):
## transformation drops documents
# Remove English stop words
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
# Collapse repeated whitespace
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
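
As a quick sanity check on the cleaning pipeline (not in the original report), the first few cleaned documents can be printed and eyeballed:

# Print the first three cleaned documents to verify the transformations
for (i in 1:3) print(as.character(corpus[[i]]))
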
# Tokenise the corpus into unigrams and tabulate word frequencies
unigram <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1, delimiters = " \\r\\n\\t.,;:\"()?!"))
unigram <- data.frame(table(unigram))
unigram <- unigram[order(unigram$Freq, decreasing = TRUE),]

names(unigram) <- c("word1", "freq")
head(unigram)
##       word1 freq
## 28103  said 1428
## 35674  will 1333
## 22880   one 1300
## 17583  just 1161
## 5369    can 1057
## 18921  like 1045
unigram$word1 <- as.character(unigram$word1)
# Plot the ten most frequent words, ordered by frequency
g1 <- ggplot(data=unigram[1:10,], aes(x = reorder(word1, freq), y = freq))
g2 <- g1 + geom_bar(stat="identity") + coord_flip() + xlab("word1") + ggtitle("Most Frequent Words")
g3 <- g2 + geom_text(aes(label = freq), hjust = -0.2)
g3
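
A related exploratory question is coverage: how many unique words account for most of the text. A minimal sketch using the unigram table built above (not part of the original report):

# Number of top-ranked words needed to cover 50% and 90% of all word instances
coverage <- cumsum(unigram$freq) / sum(unigram$freq)
c(words50 = which(coverage >= 0.5)[1], words90 = which(coverage >= 0.9)[1])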

# Tokenise into bigrams and tabulate frequencies
bigram <- NGramTokenizer(corpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
bigram <- data.frame(table(bigram))
bigram <- bigram[order(bigram$Freq,decreasing = TRUE),]
names(bigram) <- c("words","freq")
head(bigram)
##              words freq
## 100517   last year   83
## 156435   right now   82
## 124625    new york   81
## 100508   last week   77
## 213711   years ago   73
## 84089  high school   60
bigram$words <- as.character(bigram$words)
# Split each bigram into its two component words
str2 <- strsplit(bigram$words, split=" ")
bigram <- transform(bigram,
                    one = sapply(str2,"[[",1),
                    two = sapply(str2,"[[",2))
bigram <- data.frame(word1 = bigram$one, word2 = bigram$two, freq = bigram$freq, stringsAsFactors=FALSE)

# Rename the word columns to w1 and w2
names(bigram)[names(bigram) == 'word1'] <- 'w1'
names(bigram)[names(bigram) == 'word2'] <- 'w2'
# Tokenise into trigrams and tabulate frequencies
trigram <- NGramTokenizer(corpus, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
trigram <- data.frame(table(trigram))
trigram <- trigram[order(trigram$Freq,decreasing = TRUE),]
names(trigram) <- c("words","freq")
head(trigram)
##                         words freq
## 218727          two years ago   16
## 89763           hate job hate   11
## 104727           job hate job   10
## 160045 president barack obama   10
## 218703          two weeks ago    9
## 30670           cant wait see    8
trigram$words <- as.character(trigram$words)
# Split each trigram into its three component words
str3 <- strsplit(trigram$words, split=" ")
trigram <- transform(trigram,
                     one = sapply(str3,"[[",1),
                     two = sapply(str3,"[[",2),
                     three = sapply(str3,"[[",3))

trigram <- data.frame(word1 = trigram$one,word2 = trigram$two, 
                      word3 = trigram$three, freq = trigram$freq,stringsAsFactors=FALSE)

# Rename the word columns to w1, w2 and w3
names(trigram)[names(trigram) == 'word1'] <- 'w1'
names(trigram)[names(trigram) == 'word2'] <- 'w2'
names(trigram)[names(trigram) == 'word3'] <- 'w3'
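
These n-gram tables are the building blocks for the next-word predictor described in the introduction. As a minimal sketch of how they could be used (not part of the original report; predictNext is a hypothetical helper, and the backoff is a simple frequency lookup rather than a finished model):

# Hypothetical helper: predict the next word from the last one or two words typed,
# trying the trigram table first and backing off to the bigram table
predictNext <- function(input) {
  tokens <- tolower(unlist(strsplit(input, "\\s+")))
  n <- length(tokens)
  if (n >= 2) {
    hits <- trigram[trigram$w1 == tokens[n-1] & trigram$w2 == tokens[n], ]
    if (nrow(hits) > 0) return(hits$w3[which.max(hits$freq)])
  }
  hits <- bigram[bigram$w1 == tokens[n], ]
  if (nrow(hits) > 0) return(hits$w2[which.max(hits$freq)])
  unigram$word1[1]  # fall back to the most frequent unigram
}

predictNext("two years")  # with the sample above, this should suggest "ago"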