# Work from the folder that holds the en_US corpus files; the relative paths
# used by the rest of this script are resolved against it.
# The path must be wrapped in plain ASCII double quotes: typographic quotes
# are not string delimiters in R and make this line a parse error.
setwd("./1Janus-Doc/DataScience/Capstone/en_US")
# Size of each raw corpus file in megabytes (bytes divided by 1024^2).
file.info("./en_US.blogs.txt")[["size"]] / 1024^2
## [1] 200.4242
file.info("./en_US.news.txt")[["size"]] / 1024^2
## [1] 196.2775
file.info("./en_US.twitter.txt")[["size"]] / 1024^2
## [1] 159.3641
# Read each corpus as a character vector with one element per line, with the
# strings marked as UTF-8.
twitter <- readLines("./en_US.twitter.txt", encoding = "UTF-8")
news <- readLines("./en_US.news.txt", encoding = "UTF-8")
blogs <- readLines("./en_US.blogs.txt", encoding = "UTF-8")
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.0 (2015-02-19) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.19.0 (2015-02-27) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
## R.utils v2.1.0 (2015-05-27) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
# Draw a seeded random sample of 10,000 lines from each corpus and write the
# combined sample to ./sample1/sampleData.txt.
set.seed(39)
# sample.int(n, k) draws the same indices as sample(1:n, k) without building
# the 1:n vector first.
sampleTwitter <- twitter[sample.int(length(twitter), 10000)]
sampleNews <- news[sample.int(length(news), 10000)]
sampleBlogs <- blogs[sample.int(length(blogs), 10000)]
sampleData <- c(sampleTwitter, sampleNews, sampleBlogs)
# writeLines() does not create missing directories, so make sure the target
# folder exists (no warning if it is already there).
dir.create("./sample1", showWarnings = FALSE)
writeLines(sampleData, "./sample1/sampleData.txt")
# Drop the full corpora and the intermediate samples from the workspace.
rm(twitter, news, blogs, sampleTwitter, sampleNews, sampleBlogs, sampleData)
library(tm)
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
# Build the corpus from the files found in ./sample1.
cname <- file.path(".", "sample1")
# VCorpus is requested explicitly: in tm >= 0.7 Corpus() can return a
# SimpleCorpus, for which custom tokenizers passed to DocumentTermMatrix()
# (such as the RWeka n-gram tokenizers) are not applied. In older tm releases
# Corpus() already produced a VCorpus, so this is backward-compatible.
docs <- VCorpus(DirSource(cname))
# Normalise the corpus text before tokenising.
# NOTE(review): the recorded trigram output further down shows stemmed terms
# (e.g. "happi", "citi"), but no stemDocument() step appears here; confirm
# whether one was dropped from this script.
docs <- tm_map(docs, content_transformer(tolower))
# Replace "/", "@" and "|" with a space so the words they join are split.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/|@|\\|")
# Remove stop words while apostrophes are still in the text: many entries of
# stopwords("english") are contractions such as "can't" or "don't", which can
# no longer match once punctuation has been stripped.
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
# Collapse whitespace last so the gaps left by every removal step are squeezed.
docs <- tm_map(docs, stripWhitespace)
library(RWeka)
# Returns a tokenizer that emits n-grams of exactly `n` words (min = max = n).
ngram_tokenizer <- function(n) {
  force(n)  # pin n now so each returned closure keeps its own value
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}

Tokenizer <- ngram_tokenizer(1)
BigramTokenizer <- ngram_tokenizer(2)
TrigramTokenizer <- ngram_tokenizer(3)

# One document-term matrix per n-gram order, each built from the same corpus.
unidtm <- DocumentTermMatrix(docs, control = list(tokenize = Tokenizer))
bidtm <- DocumentTermMatrix(docs, control = list(tokenize = BigramTokenizer))
tridtm <- DocumentTermMatrix(docs, control = list(tokenize = TrigramTokenizer))
# Column totals of a document-term matrix, sorted from most to least frequent.
term_totals <- function(dtm) {
  sort(colSums(as.matrix(dtm)), decreasing = TRUE)
}

# Data frame with one row per term: the term itself and its total count.
freq_table <- function(freqs) {
  data.frame(word = names(freqs), freq = freqs)
}

# Unigrams
tm_unifreq <- term_totals(unidtm)
tm_uniwordfreq <- freq_table(tm_unifreq)
paste("Unigrams - Top 5 highest frequencies")
## [1] "Unigrams - Top 5 highest frequencies"
head(tm_uniwordfreq, 5)
## word freq
## said said 3051
## will will 2897
## one one 2606
## like like 2354
## just just 2294

# Bigrams
tm_bifreq <- term_totals(bidtm)
tm_biwordfreq <- freq_table(tm_bifreq)
paste("Bigrams - Top 5 highest frequencies")
## [1] "Bigrams - Top 5 highest frequencies"
head(tm_biwordfreq, 5)
## word freq
## last year last year 216
## new york new york 177
## year ago year ago 166
## look like look like 153
## right now right now 151

# Trigrams
tm_trifreq <- term_totals(tridtm)
tm_triwordfreq <- freq_table(tm_trifreq)
paste("Trigrams - Top 5 highest frequencies")
## [1] "Trigrams - Top 5 highest frequencies"
head(tm_triwordfreq, 5)
## word freq
## happi mother day happi mother day 21
## cant wait see cant wait see 19
## new york citi new york citi 17
## presid barack obama presid barack obama 15
## world war ii world war ii 15
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Bar chart of the terms in `freq_df` whose count is strictly above `min_freq`.
#   freq_df  data frame with columns `word` and `freq`
#   min_freq exclusive lower bound on `freq`
#   title    plot title
#   x_label  label for the x axis
plot_frequent_terms <- function(freq_df, min_freq, title, x_label) {
  frequent <- filter(freq_df, freq > min_freq)
  ggplot(frequent, aes(word, freq)) +
    geom_bar(stat = "identity") +
    ggtitle(title) +
    xlab(x_label) +
    ylab("Frequency") +
    # Tilt the term labels so they do not overlap.
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

plot_frequent_terms(
  tm_uniwordfreq, 1000,
  "Unigrams with frequencies > 1000", "Unigrams"
)
plot_frequent_terms(
  tm_biwordfreq, 100,
  "Bigrams with frequencies > 100", "Bigrams"
)
plot_frequent_terms(
  tm_triwordfreq, 10,
  "Trigrams with frequencies > 10", "Trigrams"
)
library(wordcloud)
## Loading required package: RColorBrewer
# Seed the RNG before drawing (presumably so the cloud layout is repeatable).
set.seed(39)

# Word cloud of at most 200 terms from a named frequency vector, using six
# colours from the "Dark2" palette.
draw_ngram_cloud <- function(freqs) {
  wordcloud(
    names(freqs), freqs,
    max.words = 200,
    scale = c(5, .5),
    colors = brewer.pal(6, "Dark2")
  )
}

draw_ngram_cloud(tm_unifreq)
draw_ngram_cloud(tm_bifreq)
draw_ngram_cloud(tm_trifreq)