librerias<-c("tm", "RWeka", "ggplot2", "dplyr")
sapply(librerias, require, character.only=TRUE)
## Loading required package: tm
## Warning: package 'tm' was built under R version 3.2.5
## Loading required package: NLP
## Loading required package: RWeka
## Warning: package 'RWeka' was built under R version 3.2.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## tm RWeka ggplot2 dplyr
## TRUE TRUE TRUE TRUE
This report summarises the text data in the US_news, US_twitter and US_blogs files. It is divided into three parts: data processing, descriptive analysis and modelling strategies.
The analysis uses the English-language files. First, I read the original files and drew a 0.5% random sample of the lines of each one. I then combined these samples into a single vector called “todost”.
# Read every line of a text file, marking the strings as UTF-8.
#
# The file is opened through a binary-mode connection and embedded nul bytes
# are skipped. The earlier text-mode reads of these files reported
# "embedded nul" warnings for the twitter file and an "incomplete final line"
# warning for the news file.
# NOTE(review): on Windows a text-mode read can stop early at a Ctrl-Z byte,
# which is presumably what happened to the news file -- compare
# length(newst) before and after this change to confirm.
#
# ruta: path of the file to read.
# Returns a character vector with one element per line.
leer.lineas <- function(ruta) {
  con <- file(ruta, open = "rb")
  # Release the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

newst <- leer.lineas("F:/RTraining/TextMining/Text2/US.news.txt")
twitterst <- leer.lineas("F:/RTraining/TextMining/Text2/US.twitter.txt")
blogst <- leer.lineas("F:/RTraining/TextMining/Text2/US.blogs.txt")

# Fixed seed so the samples are reproducible. The three sample() calls stay in
# this order (news, twitter, blogs) because each one advances the RNG state.
set.seed(50)
# Draw 0.5% of the lines of each source without replacement.
newssamplet <- sample(newst, length(newst) * 0.005, replace = FALSE)
twittersamplet <- sample(twitterst, length(twitterst) * 0.005, replace = FALSE)
blogssamplet <- sample(blogst, length(blogst) * 0.005, replace = FALSE)

# The full texts are no longer needed once the samples exist.
remove(newst)
remove(twitterst)
remove(blogst)

# Single character vector holding all samples: blogs, then news, then twitter.
todost <- c(blogssamplet, newssamplet, twittersamplet)
I then defined a cleaning function based on the tm package.
# Replace the decimal point of a number (optional digits, a dot, then one or
# more digits) with a single space, e.g. "3.14" becomes "3 14".
# x: character vector. Returns a character vector of the same length.
limpia.decimals <- function(x) {
  gsub(pattern = "([0-9]*)\\.([0-9]+)", replacement = "\\1 \\2", x = x)
}
# Replace every hashtag ("#" followed by letters, digits or underscores) with a
# single space.
# The character class is spelled out as A-Za-z0-9_ on purpose: a range written
# as "A-z" also covers the characters [ \ ] ^ and the backtick, which would
# make a hashtag swallow punctuation that merely follows it.
# x: character vector. Returns a character vector of the same length.
limpia.hashtags <- function(x) {
  gsub("#[A-Za-z0-9_]+", " ", x)
}
# Collapse every run of non-word characters (regex class \W) into one space.
# x: character vector. Returns a character vector of the same length.
limpia.noneng <- function(x) {
  gsub(pattern = "\\W+", replacement = " ", x = x)
}
# Clean a tm corpus for n-gram analysis.
#
# cuerpo: a tm corpus (built with VCorpus() in this script).
# Returns the cleaned corpus.
#
# Each step is applied to the result of the previous one (cuerpo.cl); feeding
# the untouched input to every step would discard all but the last
# transformation. The plain gsub-based helpers are wrapped in
# content_transformer() so that tm_map() keeps returning text documents, which
# makes a final conversion with PlainTextDocument unnecessary.
limpieza <- function(cuerpo) {
  # Hashtags and decimals go first: both rely on characters ("#", ".") and
  # digits that the number and punctuation removal below would strip.
  cuerpo.cl <- tm_map(cuerpo, content_transformer(limpia.hashtags))
  cuerpo.cl <- tm_map(cuerpo.cl, content_transformer(limpia.decimals))
  cuerpo.cl <- tm_map(cuerpo.cl, removeNumbers)
  cuerpo.cl <- tm_map(cuerpo.cl, removePunctuation)
  cuerpo.cl <- tm_map(cuerpo.cl, content_transformer(limpia.noneng))
  # NOTE(review): the text is not lower-cased here, so capitalised stop words
  # are presumably left in place -- confirm whether that is intended.
  cuerpo.cl <- tm_map(cuerpo.cl, removeWords, stopwords("english"))
  # Whitespace is collapsed last because the steps above leave gaps behind.
  cuerpo.cl <- tm_map(cuerpo.cl, stripWhitespace)
  cuerpo.cl
}
Next, I created a corpus and applied the cleaning function to it.
# Build a corpus from the combined sample vector and display its summary.
textos <- VCorpus(x = VectorSource(x = todost))
print(textos)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 16682
# Run the cleaning function over the corpus and display the result.
textos.cl <- limpieza(cuerpo = textos)
print(textos.cl)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 16682
For the descriptive analysis, I worked with n-grams (unigrams, bigrams and trigrams).
# Tokenizer that calls RWeka's NGramTokenizer with min = max = 1.
OnegramToken <- function(x) {
  opciones <- Weka_control(min = 1, max = 1)
  NGramTokenizer(x, opciones)
}
# Tokenizer that calls RWeka's NGramTokenizer with min = max = 2.
BigramToken <- function(x) {
  opciones <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, opciones)
}
# Tokenizer that calls RWeka's NGramTokenizer with min = max = 3.
ThreegramToken <- function(x) {
  opciones <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, opciones)
}
# Total the counts of each row of a term-document matrix and rank them.
#
# tdm: an object accepted by as.matrix(), with one named row per term.
# Returns a data frame with columns `word` (the row names) and `freq` (the row
# totals), ordered from the largest total to the smallest.
getFreq <- function(tdm) {
  totales <- rowSums(as.matrix(tdm))
  ordenados <- sort(totales, decreasing = TRUE)
  data.frame(word = names(ordenados), freq = ordenados)
}
# Unigram frequencies: term-document matrix built with the one-word tokenizer,
# pruned with removeSparseTerms at 0.9999, then totalled and ranked by getFreq.
UniGramFreq <- getFreq(
  removeSparseTerms(
    x = TermDocumentMatrix(textos.cl, control = list(tokenize = OnegramToken)),
    sparse = 0.9999
  )
)

# First 20 rows of the ranked table.
Mainone <- UniGramFreq[seq_len(20), ]

# Bar chart of those rows, with the x-axis labels tilted 45 degrees.
Mainonegraph <- ggplot(Mainone, aes(word, freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Mainonegraph
# Bigram frequencies: term-document matrix built with the two-word tokenizer,
# pruned with removeSparseTerms at 0.9999, then totalled and ranked by getFreq.
TwoGramFreq <- getFreq(
  removeSparseTerms(
    x = TermDocumentMatrix(textos.cl, control = list(tokenize = BigramToken)),
    sparse = 0.9999
  )
)

# First 20 rows of the ranked table.
Maintwo <- TwoGramFreq[seq_len(20), ]

# Bar chart of those rows, with the x-axis labels tilted 45 degrees.
Maintwograph <- ggplot(Maintwo, aes(word, freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Maintwograph
# Trigram frequencies: term-document matrix built with the three-word
# tokenizer, pruned with removeSparseTerms at 0.9999, then totalled and ranked
# by getFreq.
ThreeGramFreq <- getFreq(
  removeSparseTerms(
    x = TermDocumentMatrix(textos.cl, control = list(tokenize = ThreegramToken)),
    sparse = 0.9999
  )
)

# First 20 rows of the ranked table.
Mainthree <- ThreeGramFreq[seq_len(20), ]

# Bar chart of those rows, with the x-axis labels tilted 45 degrees.
Mainthreegraph <- ggplot(Mainthree, aes(word, freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Mainthreegraph
The cleaning function used in this first exploratory analysis still needs to be improved. As a next step, I would like to build a predictive model in a Shiny application.