library(RColorBrewer)
library(wordcloud)
library(NLP)
library(tm)
library(stringi)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Fetch and unpack the Coursera/SwiftKey capstone corpus, but only when the
# extracted "final" directory is not already present in the working directory.
capstoneDatasetUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dataDir <- "final"
if (!dir.exists(dataDir)) {
  dataZipFile <- "Coursera-SwiftKey.zip"
  if (!file.exists(dataZipFile)) {
    # Pass the URL variable defined above: a bare `url` resolves to the base R
    # function of that name and makes download.file() fail.
    # mode = "wb" requests a binary transfer so the archive is not altered.
    download.file(capstoneDatasetUrl, dataZipFile, method = "auto", mode = "wb")
  }
  unzip(dataZipFile)
  # Delete the archive only once extraction has produced the data directory.
  if (dir.exists(dataDir)) {
    file.remove(dataZipFile)
  }
}
# Read every line of a corpus file as UTF-8 text.
#
# The file is opened in binary mode ("rb"). The recorded text-mode run warned
# about an "incomplete final line" in the news file and returned only 77259
# lines, which is consistent with a text-mode read stopping early at an
# embedded control character; reading through a binary connection avoids that.
#
# path: path to the text file.
# Returns a character vector with one element per line.
readCorpusLines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

dataBlogs <- readCorpusLines("./final/en_US/en_US.blogs.txt")
dataNews <- readCorpusLines("./final/en_US/en_US.news.txt")
dataTwitter <- readCorpusLines("./final/en_US/en_US.twitter.txt")
# Overall size of each corpus as reported by stringi: line count, non-empty
# line count, character count and non-whitespace character count.
# The `##` lines are the console output recorded for each call.
print(stringi::stri_stats_general(dataBlogs))
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
print(stringi::stri_stats_general(dataNews))
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15639408 13072698
print(stringi::stri_stats_general(dataTwitter))
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
# Histogram of per-line word counts.
#
# Replaces the deprecated qplot() shortcut with an explicit ggplot() call.
# `bins` is set explicitly (30, the value stat_bin() previously fell back to),
# so the binning is unchanged and the "Pick better value" message is not
# emitted.
#
# counts: numeric vector of word counts, one per line.
# label:  x-axis label for the plot.
# bins:   number of histogram bins.
# Returns a ggplot object; it is drawn when printed at top level.
plotWordCounts <- function(counts, label, bins = 30) {
  ggplot(data.frame(count = counts), aes(x = count)) +
    geom_histogram(bins = bins) +
    labs(x = label)
}

# Words per line in each corpus: numeric summary plus histogram.
# The `##` lines are the console output recorded for each summary() call.
wordsblogs <- stri_count_words(dataBlogs)
summary(wordsblogs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 28.00 41.75 60.00 6726.00
plotWordCounts(wordsblogs, "wordsblogs")

wordsnews <- stri_count_words(dataNews)
summary(wordsnews)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 32.00 34.62 46.00 1123.00
plotWordCounts(wordsnews, "wordsnews")

wordstwitter <- stri_count_words(dataTwitter)
plotWordCounts(wordstwitter, "wordstwitter")
summary(wordstwitter)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.75 18.00 47.00
# Word cloud built from a random sample of 200 tweets.
twitter <- sample(dataTwitter, 200)
# Drop characters that cannot be represented in ASCII (sub = ""). The earlier
# sub = "byte" rewrote them as "<xx>" hex escapes, which the removeNumbers()
# and removePunctuation() steps below reduced to stray letters glued into the
# neighbouring words.
twitter <- iconv(twitter, from = "UTF-8", to = "ASCII", sub = "")
twitterVec <- VectorSource(twitter)
twitterCorp <- Corpus(twitterVec)
# Cleaning steps: lower-case, strip digits, strip punctuation.
# content_transformer() is tm's wrapper for applying a plain character
# function such as tolower() to corpus content.
# Note: the recorded run warned "transformation drops documents" on every
# tm_map() call for this corpus.
twitterCorp <- tm_map(twitterCorp, content_transformer(tolower))
twitterCorp <- tm_map(twitterCorp, removeNumbers)
twitterCorp <- tm_map(twitterCorp, removePunctuation)
# NOTE(review): twitter2 is not used by the word cloud below; it is kept in
# case code outside this section references it.
twitter2 <- tm_map(twitterCorp, PlainTextDocument)
# "Set1" provides at most 9 colours; asking for 10 only produced a warning.
wordcloud(
  twitterCorp,
  max.words = 5000,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(9, "Set1")
)
#.
# Word cloud built from a random sample of 200 news lines.
n <- sample(dataNews, 200)
# Drop characters that cannot be represented in ASCII (sub = ""). The earlier
# sub = "byte" rewrote them as "<xx>" hex escapes, which the removeNumbers()
# and removePunctuation() steps below reduced to stray letters glued into the
# neighbouring words.
n <- iconv(n, from = "UTF-8", to = "ASCII", sub = "")
nVec <- VectorSource(n)
nCorp <- Corpus(nVec)
# Cleaning steps: lower-case, strip digits, strip punctuation.
# content_transformer() is tm's wrapper for applying a plain character
# function such as tolower() to corpus content.
# Note: the recorded run warned "transformation drops documents" on every
# tm_map() call for this corpus.
nCorp <- tm_map(nCorp, content_transformer(tolower))
nCorp <- tm_map(nCorp, removeNumbers)
nCorp <- tm_map(nCorp, removePunctuation)
# NOTE(review): n2 is not used by the word cloud below; it is kept in case
# code outside this section references it.
n2 <- tm_map(nCorp, PlainTextDocument)
# "Set1" provides at most 9 colours; asking for 10 only produced a warning.
wordcloud(
  nCorp,
  max.words = 5000,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(9, "Set1")
)
#.
# Word cloud built from a random sample of 200 blog lines.
b <- sample(dataBlogs, 200)
# Drop characters that cannot be represented in ASCII (sub = ""). The earlier
# sub = "byte" rewrote them as "<xx>" hex escapes, which the removeNumbers()
# and removePunctuation() steps below reduced to stray letters glued into the
# neighbouring words.
b <- iconv(b, from = "UTF-8", to = "ASCII", sub = "")
bVec <- VectorSource(b)
bCorp <- Corpus(bVec)
# Cleaning steps: lower-case, strip digits, strip punctuation.
# content_transformer() is tm's wrapper for applying a plain character
# function such as tolower() to corpus content.
# Note: the recorded run warned "transformation drops documents" on every
# tm_map() call for this corpus.
bCorp <- tm_map(bCorp, content_transformer(tolower))
bCorp <- tm_map(bCorp, removeNumbers)
bCorp <- tm_map(bCorp, removePunctuation)
# NOTE(review): b2 is not used by the word cloud below; it is kept in case
# code outside this section references it.
b2 <- tm_map(bCorp, PlainTextDocument)
# "Set1" provides at most 9 colours; asking for 10 only produced a warning.
wordcloud(
  bCorp,
  max.words = 5000,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(9, "Set1")
)
#.
- Histograms (via qplot) and word clouds have been used to gain a basic understanding of the corpus data from the required sources.
- N-gram and other statistical models will be incorporated to obtain a holistic understanding of the corpus data.
- I plan to familiarize myself with NLP frameworks and machine learning models, such as the Hidden Markov model, to develop a deeper understanding of NLP, which will help me incorporate the required frameworks into the development of the Shiny app.