library(tm)
## Loading required package: NLP
library(RWeka)
library(openNLP)
library(tau)
library(Rstem)
library(SnowballC)
##
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:Rstem':
##
## getStemLanguages, wordStem
library(quanteda)
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tau':
##
## tokenize
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:stats':
##
## df
## The following object is masked from 'package:base':
##
## sample
library(stringr)
library(slam)
library(stylo)
## stylo version: 0.6.3
library(plyr)
library(data.table)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(parallel)
library(foreach)
library(doParallel)
## Loading required package: iterators
library(doSNOW)
## Loading required package: snow
##
## Attaching package: 'snow'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, clusterSplit, makeCluster,
## parApply, parCapply, parLapply, parRapply, parSapply,
## splitIndices, stopCluster
Setting the working directory and reading the data files:
setwd("D:/personal/data science/Capstone Project/final/en_US")

## Read every line of a text file through a text-mode connection, skipping
## embedded NUL characters and suppressing readLines() warnings.
## The connection is closed when the function exits, even if reading fails.
readTextLines <- function(path) {
  con <- file(path, open = "rt")
  on.exit(close(con), add = TRUE)
  readLines(con, skipNul = TRUE, warn = FALSE)
}

news <- readTextLines("en_US.news.txt")
##summary(news)
blogs <- readTextLines("en_US.blogs.txt")
##summary(blogs)
tweets <- readTextLines("en_US.twitter.txt")
Analyze the size of the files to check whether they are too big
## Analyze size of files
## Sizes are reported in megabytes (bytes / 1000^2).
news.size <- file.info("en_US.news.txt")$size / 1000^2
blog.size <- file.info("en_US.blogs.txt")$size / 1000^2
twitter.size <- file.info("en_US.twitter.txt")$size / 1000^2
size <- c(news = news.size, blog = blog.size, twitter = twitter.size)
## barplot() returns the x midpoint of every bar; the labels are anchored to
## those midpoints and to the bar heights instead of to fixed coordinates, so
## they stay aligned whatever the file sizes are.
bar.mids <- barplot(size, main = "Size of the files", col = "magenta")
## pos = 1 writes each rounded size just below the top edge of its bar.
text(bar.mids, size, labels = round(size, 0), pos = 1)
Analyze the number of lines in each file to check whether they are too big
## Number of lines read from each file
news.length <- length(news)
blogs.length <- length(blogs)
twitter.length <- length(tweets)
## One-row table of the three counts, printed below
lengths2 <- data.frame(
  news = news.length,
  blogs = blogs.length,
  twitter = twitter.length
)
lengths2
## news blogs twitter
## 1 77259 899288 2360148
## Named vector of the same counts, used as the bar heights
lengths <- c(news = news.length, blog = blogs.length, twitter = twitter.length)
barplot(height = lengths, col = "green", main = "Length of the files")
Analyze the number of words in each file to check whether they are too big
## Amount of words of files
## Count the whitespace-delimited words in a character vector.
## gregexpr() reports "no match" as a single -1, so taking length() of each
## result would count every empty or blank line as one word. Counting only the
## positive match positions gives 0 for such lines.
countWords <- function(lines) {
  matches <- gregexpr("\\S+", lines)
  sum(vapply(matches, function(m) sum(m > 0L, na.rm = TRUE), integer(1)))
}
news.words <- countWords(news)
blogs.words <- countWords(blogs)
twitter.words <- countWords(tweets)
words <- c(news = news.words, blog = blogs.words, twitter = twitter.words)
words2 <- data.frame(news = news.words, blogs = blogs.words, twitter = twitter.words)
words2
## news blogs twitter
## 1 2643972 37334441 30373832
barplot(words, main = "Amount of words of each file", col = "purple")
After checking the three files, the conclusion is that the tweets file has the most lines, while the blogs file has the most words and also the largest file size.
The files are too big to process in full, so we draw a random sample from each one and combine the samples. A small amount of data is needed to avoid losing speed.
## Draw 1000 random entries from each source. The seed is reset to the same
## value before every draw.
set.seed(0625)
SampleTweets <- sample(tweets, 1000)

set.seed(0625)
SampleBlogs <- sample(blogs, 1000)

set.seed(0625)
SampleNews <- sample(news, 1000)

## rbind() stacks the three samples as the rows of a single object.
total.samples <- rbind(SampleBlogs, SampleNews, SampleTweets)

path <- "D:/personal/data science/Capstone Project/final/en_US/total.samples.txt"
Cleaning the data and creating the corpus
## Profanity list, read without a header row so its first column is named V1.
profanity <- read.csv(
  paste0(getwd(), '/profanity.csv'),
  header = FALSE,
  stringsAsFactors = FALSE
)
## Extra tokens handed to removeWords() after the profanity list.
CharacterToRemove <- c("“", "’", "’", "-")

## Build the corpus from the combined samples.
src <- VectorSource(total.samples)
corpora <- VCorpus(src)

## Cleaning steps, applied in this order: numbers, lower-casing, profanity,
## the extra tokens, punctuation (intra-word dashes preserved), whitespace.
clc <- tm_map(corpora, removeNumbers)
clc <- tm_map(clc, content_transformer(tolower))
clc <- tm_map(clc, removeWords, profanity$V1)
clc <- tm_map(clc, removeWords, CharacterToRemove)
clc <- tm_map(clc, removePunctuation, preserve_intra_word_dashes = TRUE)
clc <- tm_map(clc, stripWhitespace)
Creation of N-grams
## Creation of Ngrams
## Keep only the n-grams that convert cleanly from UTF-8 to ASCII.
##
## corporaNGrams: vector of n-grams.
## Returns the input unchanged when it is empty; otherwise the elements for
## which iconv() did not have to insert the substitution marker.
removeNonAsciiNGrams <- function(corporaNGrams) {
  if (length(corporaNGrams) == 0) {
    return(corporaNGrams)
  }
  # iconv() writes this marker wherever a byte cannot be represented in ASCII.
  marker <- "It_is_not_ASCII"
  asciiAttempt <- iconv(corporaNGrams, "UTF-8", "ASCII", sub = marker)
  hasNonAscii <- grepl(marker, asciiAttempt)
  corporaNGrams[!hasNonAscii]
}
## Drop n-grams that contain a profane word, then drop bare symbol tokens.
##
## corporaNGrams: character vector of n-grams whose words are separated by
##   whitespace.
## profanityWords: the words to filter on. May be a character vector, or a
##   list / data frame of words, which is flattened before matching.
## Returns the n-grams in which no word appears in profanityWords and which
##   are not one of '$', '%', '&', '-', '--'. Empty input is returned as is.
removeProfaneNGrams <- function(corporaNGrams, profanityWords) {
  if (length(corporaNGrams) == 0) {
    return(corporaNGrams)
  }
  # Flatten so that a data frame such as read.csv() output matches by word
  # instead of being compared column by column.
  bannedWords <- as.character(unlist(profanityWords, use.names = FALSE))
  # Test every n-gram separately; a single any() over the first n-gram would
  # produce one logical that keeps or drops the whole vector.
  isProfane <- vapply(
    strsplit(corporaNGrams, "\\s+"),
    function(ngramWords) any(ngramWords %in% bannedWords),
    logical(1)
  )
  result <- corporaNGrams[!isProfane]
  result[!(result %in% c('$', '%', '&', '-', '--'))]
}
## Extract the n-grams of order n from every document of a corpus.
##
## corpora: corpus whose documents are read with corpora[[i]].
## n: n-gram order; used as both the minimum and the maximum size.
## profanities: when FALSE (the default) the result is also passed through
##   removeProfaneNGrams(); when TRUE that step is skipped.
## profanityWords: words handed to removeProfaneNGrams().
## Returns the n-grams of all documents after removeNonAsciiNGrams(), with the
##   last document's n-grams first (the order the original loop produced).
buildNGrams <- function(corpora, n, profanities = FALSE, profanityWords = c()) {
  # The control object does not depend on the document, so build it once.
  tokenizerControl <- Weka_control(min = n, max = n, delimiters = " \\r\\n\\t.,;:\"()?!")
  # seq_len() yields no iterations for an empty corpus, whereas 1:length()
  # would try to read documents 1 and 0.
  perDocument <- lapply(seq_len(length(corpora)), function(i) {
    NGramTokenizer(corpora[[i]], tokenizerControl)
  })
  # Combine once instead of growing a vector inside the loop.
  result <- unlist(rev(perDocument), use.names = FALSE)
  result <- removeNonAsciiNGrams(result)
  if (!profanities) {
    removeProfaneNGrams(result, profanityWords)
  } else {
    result
  }
}
## Tabulate how often each n-gram of order n occurs in a corpus.
##
## Arguments are forwarded unchanged to buildNGrams().
## Returns a data.table with columns 'ngrams' and 'freq', or NULL when
## buildNGrams() produces no n-grams.
summarizeNGrams <- function(corpora, n, profanities = F, profanityWords = c()) {
  tokens <- buildNGrams(corpora, n, profanities, profanityWords)
  if (length(tokens) == 0) {
    return(NULL)
  }
  # Naming the table dimension gives the first column its final name directly;
  # only the count column still needs renaming.
  counts <- as.data.table(table(ngrams = tokens))
  setnames(counts, 'N', 'freq')
  counts
}
## Build the frequency tables for n-gram orders 1 to n.
##
## corpora: corpus passed to summarizeNGrams().
## profanityWords: words to filter on, forwarded to summarizeNGrams().
## n: highest n-gram order to compute.
## Returns a list of length n; element i is the summary for order i sorted by
##   descending 'freq', or NULL when no n-grams of that order were found.
buildModel <- function(corpora, profanityWords = c(), n) {
  lapply(seq_len(n), function(i) {
    # Forward the argument; the previous code read the global `profanity`
    # here and silently ignored what the caller passed.
    ngramSummary <- summarizeNGrams(corpora, i, profanityWords = profanityWords)
    if (is.null(ngramSummary)) {
      # Nothing to sort, and arrange() does not accept NULL.
      return(NULL)
    }
    arrange(ngramSummary, desc(freq))
  })
}
## Frequency tables for n-gram orders 1 to 3, built from the cleaned corpus.
summaryOfNGrams <- buildModel(clc, profanity, 3)
pal2 <- brewer.pal(8, "Dark2")
## Word cloud of the first 50 rows of the unigram table.
onegrams1 <- summaryOfNGrams[[1]][seq_len(50)]
wordcloud(
  words = onegrams1$ngrams,
  freq = onegrams1$freq,
  random.color = FALSE,
  colors = pal2
)
When you analyse the unigrams, you find that the most frequent words are mainly prepositions. Bigrams may give more predictive value.
## Word cloud of the first 50 rows of the bigram table.
bigrams <- summaryOfNGrams[[2]][seq_len(50)]
wordcloud(
  words = bigrams$ngrams,
  freq = bigrams$freq,
  random.color = FALSE,
  colors = pal2
)
When you analyse the bigrams, you find that prepositions still have an important place, although verbs start to appear. Nouns are very scarce. Let's look at the trigrams to see what happens.
## Word cloud of the first 20 rows of the trigram table.
trigrams <- summaryOfNGrams[[3]][seq_len(20)]
wordcloud(
  words = trigrams$ngrams,
  freq = trigrams$freq,
  random.color = FALSE,
  colors = pal2
)
I see that nouns and adjectives are scarcer than prepositions and verbs. I do not know whether prepositions add predictive value or whether it would be better to remove them.
```