librerias<-c("tm", "RWeka", "ggplot2", "dplyr")
sapply(librerias, require, character.only=TRUE)
## Loading required package: tm
## Warning: package 'tm' was built under R version 3.2.5
## Loading required package: NLP
## Loading required package: RWeka
## Warning: package 'RWeka' was built under R version 3.2.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##      tm   RWeka ggplot2   dplyr 
##    TRUE    TRUE    TRUE    TRUE

1. Summary

I’ve prepared a report that summarizes the text data in the US_news, US_twitter and US_blogs files. The document is divided into three parts: data processing, descriptive analysis, and modelling strategy.

2. Data Processing

2.1 Getting the files

The analysis uses the English-language files. First, I read the original files and drew a random sample from each one. These samples are combined into a single vector called “todost”.

newst<-readLines("F:/RTraining/TextMining/Text2/US.news.txt", encoding = "UTF-8")
## Warning in readLines("F:/RTraining/TextMining/Text2/US.news.txt", encoding
## = "UTF-8"): incomplete final line found on 'F:/RTraining/TextMining/Text2/
## US.news.txt'
twitterst<-readLines("F:/RTraining/TextMining/Text2/US.twitter.txt", encoding = "UTF-8")
## Warning in readLines("F:/RTraining/TextMining/Text2/US.twitter.txt",
## encoding = "UTF-8"): line 167155 appears to contain an embedded nul
## Warning in readLines("F:/RTraining/TextMining/Text2/US.twitter.txt",
## encoding = "UTF-8"): line 268547 appears to contain an embedded nul
## Warning in readLines("F:/RTraining/TextMining/Text2/US.twitter.txt",
## encoding = "UTF-8"): line 1274086 appears to contain an embedded nul
## Warning in readLines("F:/RTraining/TextMining/Text2/US.twitter.txt",
## encoding = "UTF-8"): line 1759032 appears to contain an embedded nul
blogst<-readLines("F:/RTraining/TextMining/Text2/US.blogs.txt", encoding = "UTF-8")
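
The embedded-nul warnings are harmless for this analysis, but they can be avoided with readLines’s skipNul argument. A minimal alternative, shown for the Twitter file:

# Alternative read that skips embedded nuls instead of warning about them
twitterst<-readLines("F:/RTraining/TextMining/Text2/US.twitter.txt",
                     encoding = "UTF-8", skipNul = TRUE)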


set.seed(50)
newssamplet<-sample(newst, length(newst)*0.005, replace=FALSE)
twittersamplet<-sample(twitterst, length(twitterst)*0.005, replace=FALSE)
blogssamplet<-sample(blogst, length(blogst)*0.005, replace=FALSE)
remove(newst)
remove(twitterst)
remove(blogst)


todost<-c(blogssamplet, newssamplet, twittersamplet)
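
As a quick sanity check on the sampling, the number of lines kept from each source can be tallied (a small sketch; output not shown):

# Lines retained from each source and in the combined vector
sapply(list(blogs = blogssamplet, news = newssamplet,
            twitter = twittersamplet, todos = todost), length)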

2.2 Auxiliary functions

I wrote a cleaning function built on the tm package, together with three helper regex functions.

limpia.decimals <- function(x) {gsub("([0-9]*)\\.([0-9]+)", "\\1 \\2", x)}
limpia.hashtags <- function(x) {gsub("#[a-zA-Z0-9]+", " ", x)}
limpia.noneng <- function(x) {gsub("\\W+", " ", x)}

limpieza<-function(cuerpo){
        # Each step must chain on cuerpo.cl (not cuerpo), and the custom
        # functions must be wrapped in content_transformer(). Decimals and
        # hashtags are handled before numbers and punctuation are stripped.
        cuerpo.cl<-tm_map(cuerpo, content_transformer(limpia.decimals))
        cuerpo.cl<-tm_map(cuerpo.cl, content_transformer(limpia.hashtags))
        cuerpo.cl<-tm_map(cuerpo.cl, removeNumbers)
        cuerpo.cl<-tm_map(cuerpo.cl, removePunctuation)
        cuerpo.cl<-tm_map(cuerpo.cl, content_transformer(limpia.noneng))
        cuerpo.cl<-tm_map(cuerpo.cl, removeWords, stopwords("english"))
        cuerpo.cl<-tm_map(cuerpo.cl, stripWhitespace)
        cuerpo.cl<-tm_map(cuerpo.cl, PlainTextDocument)
        cuerpo.cl
}
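
To make the helper regexes concrete, here are toy examples (the results in the comments are illustrative, not output from the report):

limpia.decimals("pi is 3.14")        # "pi is 3 14"  (splits a decimal into two tokens)
limpia.hashtags("nice win #GoTeam")  # "nice win  "  (hashtag replaced by a space)
limpia.noneng("don't stop!")         # "don t stop " (non-word characters collapsed)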

2.3 Creating a Corpus

Next, I created a corpus from the combined sample and applied the cleaning function to it.

textos<-VCorpus(VectorSource(todost))
textos
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 16682
textos.cl<-limpieza(textos)
textos.cl
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 16682
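
Individual documents can be compared before and after cleaning to spot-check the function (a quick sketch; output not shown):

# Spot-check: the first document before and after cleaning
as.character(textos[[1]])
as.character(textos.cl[[1]])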

3. Descriptive analysis

For the descriptive analysis, I worked with n-grams (unigrams, bigrams, and trigrams).

3.1 Tokenizing and getting frequencies

OnegramToken<- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramToken<- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
ThreegramToken<-function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))


getFreq <- function(tdm) {
        count <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
        return(data.frame(word = names(count), freq = count))
}
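
A toy call illustrates what the tokenizers return (the result in the comment is illustrative):

# Bigram tokenizer on a toy sentence
BigramToken("the quick brown fox")
# "the quick"  "quick brown"  "brown fox"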

3.2 N-grams

3.2.1 Unigrams

UniGramFreq <- getFreq(removeSparseTerms(
        TermDocumentMatrix(textos.cl, control = list(tokenize = OnegramToken))
        , 0.9999))

Mainone<-UniGramFreq[1:20,]

Mainonegraph <- ggplot(data=Mainone, aes(x=word, y=freq)) + 
              geom_bar(stat="identity") + 
              theme(axis.text.x = element_text(angle = 45, hjust = 1))
Mainonegraph
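
By default ggplot2 orders the bars alphabetically; if ordering by frequency is preferred, reorder() can be used instead (a variant of the plot above):

# Variant: bars ordered by descending frequency
ggplot(data=Mainone, aes(x=reorder(word, -freq), y=freq)) +
        geom_bar(stat="identity") +
        labs(x = "word") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))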

3.2.2 Bigrams

TwoGramFreq <- getFreq(removeSparseTerms(
        TermDocumentMatrix(textos.cl, control = list(tokenize = BigramToken))
        , 0.9999))

Maintwo<-TwoGramFreq[1:20,]

Maintwograph <- ggplot(data=Maintwo, aes(x=word, y=freq)) + 
              geom_bar(stat="identity") + 
              theme(axis.text.x = element_text(angle = 45, hjust = 1))
Maintwograph

3.2.3 Trigrams

ThreeGramFreq <- getFreq(removeSparseTerms(
        TermDocumentMatrix(textos.cl, control = list(tokenize = ThreegramToken))
        , 0.9999))


Mainthree<-ThreeGramFreq[1:20,]

Mainthreegraph <- ggplot(data=Mainthree, aes(x=word, y=freq)) + 
              geom_bar(stat="identity") + 
              theme(axis.text.x = element_text(angle = 45, hjust = 1))

Mainthreegraph

4. Development strategy

This first exploratory analysis can still be improved, particularly the cleaning function. The next step is to build a predictive model and deploy it as a Shiny application.
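
As a sketch of the planned model, the frequency tables computed above can already drive a naive next-word lookup with backoff from trigrams to bigrams. The predecir() function below is hypothetical (not part of this report); it assumes the word column holds space-separated n-grams, as produced by getFreq():

# Hypothetical sketch: next-word prediction by simple trigram-to-bigram backoff
predecir <- function(frase) {
        palabras <- strsplit(tolower(frase), "\\s+")[[1]]
        n <- length(palabras)
        if (n >= 2) {
                # Try trigrams whose first two words match the end of the phrase
                pref <- paste0("^", palabras[n - 1], " ", palabras[n], " ")
                cand <- ThreeGramFreq[grepl(pref, ThreeGramFreq$word), ]
                if (nrow(cand) > 0)
                        return(tail(strsplit(as.character(cand$word[1]), " ")[[1]], 1))
        }
        # Back off to bigrams whose first word matches the last word typed
        cand <- TwoGramFreq[grepl(paste0("^", palabras[n], " "), TwoGramFreq$word), ]
        if (nrow(cand) > 0)
                return(tail(strsplit(as.character(cand$word[1]), " ")[[1]], 1))
        NA_character_
}

Because getFreq() sorts by decreasing frequency, the first matching row is the most frequent candidate; a Shiny front end would simply call this function on the user’s input.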