milestones Capstone Project

Creation of Libraries

library(tm)

## Loading required package: NLP

library(RWeka)
library(openNLP)
library(tau)
library(Rstem)
library(SnowballC)

## 
## Attaching package: 'SnowballC'

## The following objects are masked from 'package:Rstem':
## 
##     getStemLanguages, wordStem

library(quanteda)

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:tau':
## 
##     tokenize

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords

## The following object is masked from 'package:NLP':
## 
##     ngrams

## The following object is masked from 'package:stats':
## 
##     df

## The following object is masked from 'package:base':
## 
##     sample

library(stringr)
library(slam)
library(stylo)

## stylo version: 0.6.3

library(plyr)
library(data.table)
library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)
library(parallel)
library(foreach)
library(doParallel)

## Loading required package: iterators

library(doSNOW)

## Loading required package: snow

## 
## Attaching package: 'snow'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, clusterSplit, makeCluster,
##     parApply, parCapply, parLapply, parRapply, parSapply,
##     splitIndices, stopCluster

Setting the working directory and reading the files:

setwd("D:/personal/data science/Capstone Project/final/en_US")

## Read every line of a text file into a character vector.
## The connection is opened in binary mode ("rb"): in text mode on Windows an
## embedded SUB (0x1A) control character is treated as end-of-file, so the
## read can stop early without any warning. NOTE(review): this looks like the
## reason the news file reports so few lines for its size - confirm by
## re-running and comparing the line counts.
readTextFile <- function(fileName) {
  con <- file(fileName, open = "rb")
  ## Make sure the connection is released even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, skipNul = TRUE, warn = FALSE)
}

news <- readTextFile("en_US.news.txt")
##summary(news)
blogs <- readTextFile("en_US.blogs.txt")
##summary(blogs)

tweets <- readTextFile("en_US.twitter.txt")

Analyze the size of the files to check whether they are too big

## Analyze the size of the files (in megabytes, 1 MB = 1000^2 bytes)
news.size <- file.info("en_US.news.txt")$size / 1000^2
blog.size <- file.info("en_US.blogs.txt")$size / 1000^2
twitter.size <- file.info("en_US.twitter.txt")$size / 1000^2
size <- c(news = news.size, blog = blog.size, twitter = twitter.size)
## barplot() returns the x coordinate of the middle of every bar; use those
## to place the labels instead of coordinates hard-coded for one data set.
bar.mids <- barplot(size, main = "Size of the files", col = "magenta")
## Write each rounded size halfway up its own bar.
text(bar.mids, size / 2, labels = round(size, 0))

Analyze the number of lines in the files to check whether they are too big

## Number of lines read from each file
news.length <- length(news)
blogs.length <- length(blogs)
twitter.length <- length(tweets)

## One-row data frame, printed below as a small table
lengths2 <- data.frame(
  news = news.length,
  blogs = blogs.length,
  twitter = twitter.length
)
## Named vector with the same counts, used for the bar plot
lengths <- c(
  news = news.length,
  blog = blogs.length,
  twitter = twitter.length
)
lengths2

##    news  blogs twitter
## 1 77259 899288 2360148

## Bar plot of the number of lines per file
barplot(
  height = lengths,
  col = "green",
  main = "Length of the files"
)

Analyze the number of words in the files to check whether they are too big

## Amount of words of files
news.words<-sum(sapply(gregexpr("\\S+", news), length))
blogs.words<-sum(sapply(gregexpr("\\S+", blogs), length))
twitter.words<-sum(sapply(gregexpr("\\S+", tweets), length))
words<-c(news=news.words,blog=blogs.words,twitter=twitter.words)
words2<-data.frame(news=news.words,blogs=blogs.words,twitter=twitter.words)
words2

##      news    blogs  twitter
## 1 2643972 37334441 30373832

## Bar plot of the number of words per file
barplot(
  height = words,
  col = "purple",
  main = "Amount of words of each file"
)

After checking the three files, the conclusion is that the Twitter file has the most lines, while the blogs file has the most words and the largest file size.

Building n-grams

The files are too big to process in full. They need to be sampled and the samples mixed together; a small amount of data is used to avoid losing speed.

## Draw 1000 random lines from each source. The seed is reset before every
## draw so that each sample can be reproduced on its own.
set.seed(0625)
SampleTweets <- sample(tweets, 1000)
set.seed(0625)
SampleBlogs <- sample(blogs, 1000)
set.seed(0625)
SampleNews <- sample(news, 1000)
## Concatenate with c() so the result is a plain character vector of 3000
## lines; rbind() would build a 3 x 1000 character matrix instead.
total.samples <- c(SampleBlogs, SampleNews, SampleTweets)
path <- "D:/personal/data science/Capstone Project/final/en_US/total.samples.txt"

Cleaning of data and creation of Corpora

## Load the profanity list (read without a header row) from the working directory
profanity <- read.csv(
  paste0(getwd(), "/profanity.csv"),
  header = FALSE,
  stringsAsFactors = FALSE
)

## Wrap the sampled lines in a tm corpus
src <- VectorSource(total.samples)
corpora <- VCorpus(src)

CharacterToRemove<-c("Ã¢â¬Å","Ã¢â¬â¢","Ã¢â¬â¢", "-")
## Cleaning pipeline: every step works on the result of the previous one.
## Digits are removed and the text is lower-cased first.
clc <- tm_map(corpora, FUN = removeNumbers)
clc <- tm_map(clc, content_transformer(tolower))
## Remove the profanity words, then the sequences listed in CharacterToRemove.
clc <- tm_map(clc, removeWords, profanity$V1)
clc <- tm_map(clc, removeWords, CharacterToRemove)
## Strip punctuation (dashes inside words are kept) and collapse whitespace.
clc <- tm_map(clc, removePunctuation, preserve_intra_word_dashes = TRUE)
clc <- tm_map(clc, stripWhitespace)

Creation of N-grams

 ## Creation of Ngrams
  
  
## Keep only the n-grams that consist purely of ASCII characters.
##
## corporaNGrams: vector of n-grams (may be empty or NULL).
## Returns the input unchanged when it is empty; otherwise the elements whose
## UTF-8 -> ASCII conversion needed no substitution.
removeNonAsciiNGrams <- function(corporaNGrams) {
  if (length(corporaNGrams) == 0) {
    return(corporaNGrams)
  }
  ## iconv() writes the marker in place of every byte it cannot convert, so
  ## the marker showing up in the output flags a non-ASCII n-gram.
  marker <- "It_is_not_ASCII"
  converted <- iconv(corporaNGrams, "UTF-8", "ASCII", sub = marker)
  hasNonAscii <- grepl(marker, converted)
  corporaNGrams[!hasNonAscii]
}
  
## Drop the n-grams that contain a profane word, plus a few symbol-only tokens.
##
## corporaNGrams:  character vector of n-grams whose words are separated by
##                 whitespace (may be empty or NULL).
## profanityWords: words to filter out; a character vector, or a list / data
##                 frame of words, which is flattened before matching.
## Returns the n-grams that contain none of the profanity words and are not
## one of '$', '%', '&', '-', '--'.
removeProfaneNGrams <- function(corporaNGrams, profanityWords) {
  if (length(corporaNGrams) == 0) {
    return(corporaNGrams)
  }
  bannedWords <- as.character(unlist(profanityWords, use.names = FALSE))
  ## Every n-gram is tested on its own. Inspecting only the first n-gram
  ## yields a single TRUE/FALSE that is recycled over the whole vector, which
  ## keeps or drops everything at once.
  isProfane <- vapply(
    strsplit(as.character(corporaNGrams), "\\s+"),
    function(words) any(words %in% bannedWords),
    logical(1)
  )
  kept <- corporaNGrams[!isProfane]
  kept[!(kept %in% c('$', '%', '&', '-', '--'))]
}
  
## Extract all n-grams of size n from every document of a corpus.
##
## corpora:        corpus (or list) of documents, indexed with [[i]].
## n:              size of the n-grams to extract (used as both min and max).
## profanities:    if FALSE (default) n-grams containing profanity are removed.
## profanityWords: words handed to removeProfaneNGrams() when filtering.
## Returns a vector of ASCII-only n-grams, or NULL for an empty corpus.
buildNGrams <- function(corpora, n, profanities = FALSE, profanityWords = c()) {
  ## The tokenizer settings do not depend on the document, so build them once.
  control <- Weka_control(min = n, max = n, delimiters = " \\r\\n\\t.,;:\"()?!")
  ## seq_along() makes an empty corpus produce no iterations (1:length() would
  ## iterate over c(1, 0)); collecting per-document results in a list avoids
  ## re-copying the growing vector on every document.
  perDocument <- lapply(seq_along(corpora), function(i) {
    NGramTokenizer(corpora[[i]], control)
  })
  ## rev() keeps the previous ordering, where later documents came first.
  result <- unlist(rev(perDocument), use.names = FALSE)
  result <- removeNonAsciiNGrams(result)
  if (!profanities) {
    removeProfaneNGrams(result, profanityWords)
  } else {
    result
  }
}
 
## Build a frequency table of the n-grams of size n found in a corpus.
##
## corpora, n, profanities, profanityWords: passed on to buildNGrams().
## Returns a data.table with columns 'ngrams' and 'freq', or NULL when no
## n-gram was found.
summarizeNGrams <- function(corpora, n, profanities = FALSE, profanityWords = c()) {
  corporaNGrams <- buildNGrams(corpora, n, profanities, profanityWords)
  if (length(corporaNGrams) == 0) {
    return(NULL)
  }
  result <- as.data.table(table(corporaNGrams))
  ## Rename both columns at once, so the code does not depend on the column
  ## name that table() derives from the local variable name.
  setnames(result, c('ngrams', 'freq'))
  result
}
  
## Build the n-gram frequency tables for sizes 1..n.
##
## corpora:        corpus of cleaned documents.
## profanityWords: words to exclude; a character vector, or a list / data
##                 frame of words, which is flattened first.
## n:              largest n-gram size to build.
## Returns a list of length n; element i is the table of i-grams sorted by
## decreasing frequency, or NULL when no i-gram was found.
buildModel <- function(corpora, profanityWords = c(), n) {
  ## Use the words passed by the caller (not a global object) and accept a
  ## one-column data frame as read by read.csv().
  bannedWords <- unlist(profanityWords, use.names = FALSE)
  ## seq_len() yields no iterations for n = 0, where 1:n would give c(1, 0).
  lapply(seq_len(n), function(i) {
    ngramSummary <- summarizeNGrams(corpora, i, profanityWords = bannedWords)
    if (is.null(ngramSummary)) {
      ## Nothing to sort; arrange() would fail on NULL.
      return(NULL)
    }
    arrange(ngramSummary, desc(freq))
  })
}


## Build the unigram, bigram and trigram frequency tables.
## The profanity words themselves (column V1) are passed, not the whole data
## frame returned by read.csv().
summaryOfNGrams <- buildModel(clc, profanity$V1, n = 3)

pal2 <- brewer.pal(8, "Dark2")
## Word cloud of the 50 most frequent unigrams; head() returns fewer rows
## instead of NA-padded ones when the table is shorter than 50.
onegrams1 <- head(summaryOfNGrams[[1]], 50)
wordcloud(onegrams1$ngrams, onegrams1$freq, random.color = FALSE, colors = pal2)

When you analyse the unigrams you find that the most frequent words are mainly prepositions. Maybe bigrams can give more predictive value.

## Word cloud of the 50 most frequent bigrams; head() returns fewer rows
## instead of NA-padded ones when the table is shorter than 50.
bigrams <- head(summaryOfNGrams[[2]], 50)
wordcloud(bigrams$ngrams, bigrams$freq, random.color = FALSE, colors = pal2)

When you analyse the bigrams you find that prepositions also have an important place, although verbs start to appear. Nouns are very scarce. Let's look at the trigrams to check what happens.

## Word cloud of the 20 most frequent trigrams; head() returns fewer rows
## instead of NA-padded ones when the table is shorter than 20.
trigrams <- head(summaryOfNGrams[[3]], 20)
wordcloud(trigrams$ngrams, trigrams$freq, random.color = FALSE, colors = pal2)

I see that nouns and adjectives are scarcer than prepositions and verbs. I do not know whether prepositions add predictive value or whether it is better to remove them.

```

milestones Capstone Project

Gonzalo Andres Moreno

29 de marzo de 2016

Creation of Libraries

Building n-grams