This Milestone Report covers the exploratory data analysis for the Capstone Project of the Data Science Specialization on Coursera.
The key partners for this project are SwiftKey and Coursera, and the project explores the natural language processing (NLP) facet of data science:
a large corpus of text documents is used to predict the next word from the preceding input.
After the initial extraction and cleaning of the data, the results will be presented in a Shiny application.
This report describes the plan for preparing and presenting data drawn from that large text corpus.
library(ggplot2)
library(stringi)
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tm)
library(rJava)
library(RWekajars)
library(SnowballC)
library(RColorBrewer)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following objects are masked from 'package:base':
##
## Filter, proportions
library(RWeka)
library(openNLP)
# Read the three English data sets; binary mode plus skipNul avoids problems with embedded nulls
con <- file("C:\\Users\\Dell\\Desktop\\Coursera-SwiftKey\\final\\en_US\\en_US.blogs.txt", open = "rb")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
con <- file("C:\\Users\\Dell\\Desktop\\Coursera-SwiftKey\\final\\en_US\\en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
con <- file("C:\\Users\\Dell\\Desktop\\Coursera-SwiftKey\\final\\en_US\\en_US.twitter.txt", open = "rb")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
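Before sampling, it helps to get a feel for the size of each file. The minimal sketch below uses stringi (loaded above) to count lines and words per source; the exact figures depend on the files as downloaded.
# Basic size statistics for the three raw data sets
data.frame(source = c("blogs", "news", "twitter"),
           lines  = c(length(blogs), length(news), length(twitter)),
           words  = c(sum(stri_count_words(blogs)),
                      sum(stri_count_words(news)),
                      sum(stri_count_words(twitter))))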
# Take a reproducible 5,000-line sample from each source and combine them
set.seed(1234)
subTwitter <- sample(twitter, size = 5000, replace = TRUE)
subBlogs <- sample(blogs, size = 5000, replace = TRUE)
subNews <- sample(news, size = 5000, replace = TRUE)
sample <- c(subTwitter, subBlogs, subNews)
# Build a tm corpus from the sample and force everything to valid UTF-8
corpus <- Corpus(VectorSource(sample))
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to="UTF-8", sub="byte")))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(function(x)
## iconv(x, : transformation drops documents
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, content_transformer(removePunctuation), preserve_intra_word_dashes=TRUE)
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removePunctuation), :
## transformation drops documents
## Removing Profanity
profanityWords <- readLines("profane_words.txt")
## Warning in readLines("profane_words.txt"): incomplete final line found on
## 'profane_words.txt'
corpus <- tm_map(corpus,removeWords, profanityWords)
## Warning in tm_map.SimpleCorpus(corpus, removeWords, profanityWords):
## transformation drops documents
corpus <- tm_map(corpus, content_transformer(removeNumbers))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removeNumbers)):
## transformation drops documents
## Taking out URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removeURL)):
## transformation drops documents
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
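As a quick sanity check, a couple of cleaned entries can be inspected before tokenization; a minimal sketch (the exact text depends on the sample drawn above):
# Peek at the first two cleaned documents
as.character(corpus[[1]])
as.character(corpus[[2]])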
# Tokenize into unigrams and build a frequency table sorted by count
unigram <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1, delimiters = " \\r\\n\\t.,;:\"()?!"))
unigram <- data.frame(table(unigram))
unigram <- unigram[order(unigram$Freq, decreasing = TRUE),]
names(unigram) <- c("word1", "freq")
head(unigram)
## word1 freq
## 28103 said 1428
## 35674 will 1333
## 22880 one 1300
## 17583 just 1161
## 5369 can 1057
## 18921 like 1045
unigram$word1 <- as.character(unigram$word1)
g1 <- ggplot(data = unigram[1:10,], aes(x = word1, y = freq))
g2 <- g1 + geom_bar(stat = "identity") + coord_flip() + ggtitle("Most Frequent Words")
g3 <- g2 + geom_text(data = unigram[1:10,], aes(x = word1, y = freq, label = freq), hjust = -1, position = "identity")
g3
# Tokenize into bigrams and build a frequency table sorted by count
bigram <- NGramTokenizer(corpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
bigram <- data.frame(table(bigram))
bigram <- bigram[order(bigram$Freq, decreasing = TRUE),]
names(bigram) <- c("words", "freq")
head(bigram)
## words freq
## 100517 last year 83
## 156435 right now 82
## 124625 new york 81
## 100508 last week 77
## 213711 years ago 73
## 84089 high school 60
# Split each bigram into its two component words
bigram$words <- as.character(bigram$words)
str2 <- strsplit(bigram$words, split = " ")
bigram <- transform(bigram,
                    one = sapply(str2, "[[", 1),
                    two = sapply(str2, "[[", 2))
bigram <- data.frame(word1 = bigram$one, word2 = bigram$two, freq = bigram$freq, stringsAsFactors = FALSE)
names(bigram)[names(bigram) == 'word1'] <- 'w1'
names(bigram)[names(bigram) == 'word2'] <- 'w2'
# Tokenize into trigrams and build a frequency table sorted by count
trigram <- NGramTokenizer(corpus, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
trigram <- data.frame(table(trigram))
trigram <- trigram[order(trigram$Freq, decreasing = TRUE),]
names(trigram) <- c("words", "freq")
head(trigram)
## words freq
## 218727 two years ago 16
## 89763 hate job hate 11
## 104727 job hate job 10
## 160045 president barack obama 10
## 218703 two weeks ago 9
## 30670 cant wait see 8
# Split each trigram into its three component words
trigram$words <- as.character(trigram$words)
str3 <- strsplit(trigram$words, split = " ")
trigram <- transform(trigram,
                     one = sapply(str3, "[[", 1),
                     two = sapply(str3, "[[", 2),
                     three = sapply(str3, "[[", 3))
trigram <- data.frame(word1 = trigram$one,word2 = trigram$two,
word3 = trigram$three, freq = trigram$freq,stringsAsFactors=FALSE)
names(trigram)[names(trigram) == 'word1'] <- 'w1'
names(trigram)[names(trigram) == 'word2'] <- 'w2'
names(trigram)[names(trigram) == 'word3'] <- 'w3'
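These frequency tables are the building blocks for the planned Shiny application. As an illustration only, the sketch below defines a hypothetical helper, predictNext (not part of the final app), showing one naive way the tables could drive next-word prediction: match the last two words against the trigram table, fall back to the bigram table, and finally to the most frequent unigrams.
# Naive longest-match-first lookup over the n-gram tables built above
# (predictNext is a hypothetical helper for illustration, not the final model)
predictNext <- function(phrase, n = 3) {
  tokens <- unlist(strsplit(tolower(phrase), "\\s+"))
  len <- length(tokens)
  if (len >= 2) {
    hits <- trigram[trigram$w1 == tokens[len - 1] & trigram$w2 == tokens[len], ]
    if (nrow(hits) > 0) return(head(hits[order(-hits$freq), "w3"], n))
  }
  hits <- bigram[bigram$w1 == tokens[len], ]
  if (nrow(hits) > 0) return(head(hits[order(-hits$freq), "w2"], n))
  head(unigram$word1, n)
}
predictNext("new")   # with the sample above, "york" would be a likely suggestion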