LMarques
Thursday, October 1, 2015
library(SnowballC) # stemming
library(qdap) # text analysis
library(qdapDictionaries)
library(dplyr) # data preparation
library(RColorBrewer) # color palettes
library(ggplot2) # plots
library(scales) # comma & number formatting
library(Rgraphviz) # correlation plots
library(twitteR) # our data: ~200 tweets from @rdatamining
library(NLP)
library(tm) # text mining
Sources supported by the tm package:
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
Reader formats supported by the tm package:
getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"
# Retrieve about 200 tweets from @rdatamining (assumes Twitter API
# authentication has already been set up; a saved copy of the tweets
# can be loaded instead).
rdmTweets <- userTimeline("rdatamining", n=200)
for (i in 1:10) {
cat(paste("[[", i, "]] ", sep=""))
writeLines(strwrap(rdmTweets[[i]]$getText(), width=73)) }
## [[1]] Postdoc/Research Scientist Position on Big Data at MIT
## http://t.co/hZ1ojAW2
## [[2]] Research scientist position for privacy-preserving data publishing,
## Singapore http://t.co/GPA0TyG5
## [[3]] Easier Parallel Computing in R with snowfall and sfCluster
## http://t.co/BPcinvzK
## [[4]] Tutorial: Parallel computing using R package snowfall
## http://t.co/CHBCyr76
## [[5]] handling big data: Interacting with Data using the filehash Package for
## R http://t.co/7RB3sChx
## [[6]] Parallel Computing with R using snow and snowfall http://t.co/nxp8EZpv
## [[7]] State of the Art in Parallel Computing with R http://t.co/zmClglqi
## [[8]] Slides on Parallel Computing in R http://t.co/AdDVxbOY
## [[9]] R with High Performance Computing: Parallel processing and large memory
## http://t.co/XZ3ZZBRF
## [[10]] High Performance Computing with R http://t.co/4fnGArSc
Convert the tweets to a data frame.
df <- do.call("rbind", lapply(rdmTweets, as.data.frame))
dim(df)
## [1] 154 10
Create our text object: the CORPUS.
myCorpus <- Corpus(VectorSource(df$text))
myCorpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 154
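To peek at an individual document, its content can be printed directly (a quick sketch):
# Sketch: view the content of the first document in the corpus.
as.character(myCorpus[[1]])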
The tm_map() function applies transformations to all the documents in a corpus; we use it to obtain "clean" text.
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
for (i in 11:15) {
cat(paste("[[", i, "]] ", sep=""))
writeLines(strwrap(myCorpus[[i]], width=73))
}
## [[11]] Slides on massive data, shared and distributed memory,and concurrent
## programming: bigmemory and foreach http://t.co/a6bQzxj5
## [[12]] The R Reference Card for Data Mining is updated with functions &
## packages for handling big data & parallel computing.
## http://t.co/FHoVZCyk
## [[13]] Post-doc on Optimizing a Cloud for Data Mining primitives, INRIA, France
## http://t.co/cA28STPO
## [[14]] Chief Scientist - Data Intensive Analytics, Pacific Northwest National
## Laboratory (PNNL), US http://t.co/0Gdzq1Nt
## [[15]] Top 10 in Data Mining http://t.co/7kAuNvuf
leertexto <- function(corpus, from=11, to=15) {
  for (i in from:to) {
    cat(paste("[[", i, "]] ", sep=""))
    writeLines(strwrap(corpus[[i]], width=73))
  }
}
leertexto(myCorpus)
Convert upper case to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower)) # content_transformer keeps the corpus structure intact in tm 0.6+
Remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
Remove numbers, as long as they are irrelevant to our study
myCorpus <- tm_map(myCorpus, removeNumbers)
Remove URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
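The same content_transformer pattern works for any regex cleanup. For example, a sketch for stripping Twitter @usernames (apply it only if handles are noise for the analysis):
# Sketch (not applied here): remove @usernames with the same pattern.
removeUsers <- content_transformer(function(x) gsub("@[[:alnum:]_]*", "", x))
# myCorpus <- tm_map(myCorpus, removeUsers)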
length(stopwords("spanish"))
## [1] 308
length(stopwords("english"))
## [1] 174
stopwords("spanish")[1:10]
## [1] "de" "la" "que" "el" "en" "y" "a" "los" "del" "se"
stopwords("spanish")[200:210]
## [1] "hubiera" "hubieras" "hubiéramos" "hubierais" "hubieran"
## [6] "hubiese" "hubieses" "hubiésemos" "hubieseis" "hubiesen"
## [11] "habiendo"
stopwords("english")[1:10]
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
Add two extra stopwords: “available” and “via”
myStopwords <- c(stopwords('english'), "available", "via")
length(myStopwords )
## [1] 176
Remove words from the stopword set. (Note that “r” and “big” are not in the English stopword list, so the first setdiff() has no effect; only “and” and “until” are actually dropped, taking the length from 176 to 174.)
myStopwords <- setdiff(myStopwords, c("r", "big"))
myStopwords <- setdiff(myStopwords, c("and", "until"))
length(myStopwords )
## [1] 174
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
Remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)
inspect(myCorpus[11:15])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5
##
## [[1]]
## [1] slides massive data shared and distributed memoryand concurrent programming bigmemory and foreach
##
## [[2]]
## [1] r reference card data mining updated functions packages handling big data parallel computing
##
## [[3]]
## [1] postdoc optimizing cloud data mining primitives inria france
##
## [[4]]
## [1] chief scientist data intensive analytics pacific northwest national laboratory pnnl us
##
## [[5]]
## [1] top data mining
Apply some specific transformations, for example to company names, institutions, or other very predictable text.
toString <- content_transformer(function(x, from, to) gsub(from, to, x))
myCorpus <- tm_map(myCorpus, toString, " institute technology", " IT")
myCorpus <- tm_map(myCorpus, toString, "shenzhen institutes advanced technology", "SIAT")
myCorpus <- tm_map(myCorpus, toString, "chinese academy sciences", "CAS")
library(SnowballC)
We use an algorithm that strips common word endings and keeps only the root. The functionality is provided by wordStem() from SnowballC, called through tm's stemDocument().
myCorpusCopy <- myCorpus
Stem words
myCorpus <- tm_map(myCorpus, stemDocument)
for (i in 11:15) {
cat(paste("[[", i, "]] ", sep=""))
writeLines(strwrap(myCorpus[[i]], width=73))
}
## [[11]] slides massive data shared and distributed memoryand concurrent
## programming bigmemory and foreach
## [[12]] r reference card data mining updated functions packages handling big
## data parallel computing
## [[13]] postdoc optimizing cloud data mining primitives inria france
## [[14]] chief scientist data intensive analytics pacific northwest national
## laboratory pnnl us
## [[15]] top data mining
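Optionally, the stems can be completed back into full words, using the unstemmed copy as a dictionary. A sketch built on tm's stemCompletion(); completeStem is a helper defined here, not part of tm:
# Sketch: complete each stem to its most frequent full form in myCorpusCopy.
completeStem <- content_transformer(function(x, dict)
  paste(stemCompletion(strsplit(x, " ")[[1]], dictionary=dict,
                       type="prevalent"), collapse=" "))
# myCorpus <- tm_map(myCorpus, completeStem, dict=myCorpusCopy) # not applied here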
Count word frequencies. E.g. “mining”:
miningCases <- sapply(myCorpusCopy, function(doc) any(grepl("\\<mining", as.character(doc))))
sum(miningCases)
## [1] 82
E.g. “miners”:
minerCases <- sapply(myCorpusCopy, function(doc) any(grepl("\\<miners", as.character(doc))))
sum(minerCases)
## [1] 3
Replace “miners” with “mining”
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern="miners", replacement="mining")
TermDocumentMatrix builds a contingency table with the terms in rows and the documents in columns; DocumentTermMatrix builds the transpose.
myCorpus <- tm_map(myCorpus, PlainTextDocument) # ensure plain-text documents; note this resets document names, hence the "character(0)" labels below
myTdm <- TermDocumentMatrix(myCorpus, control=list(wordLengths=c(1,Inf)))
myTdm
## <<TermDocumentMatrix (terms: 584, documents: 154)>>
## Non-/sparse entries: 1238/88698
## Sparsity : 99%
## Maximal term length: 25
## Weighting : term frequency (tf)
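For comparison, the document-term view is simply the transpose (a sketch):
# Sketch: DocumentTermMatrix puts documents in rows and terms in columns.
myDtm <- DocumentTermMatrix(myCorpus, control=list(wordLengths=c(1,Inf)))
dim(myDtm) # equals rev(dim(myTdm))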
idx <- which(dimnames(myTdm)$Terms == "r")
inspect(myTdm[idx+(0:5),101:110])
## <<TermDocumentMatrix (terms: 6, documents: 10)>>
## Non-/sparse entries: 9/51
## Sparsity : 85%
## Maximal term length: 12
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0) character(0) character(0) character(0)
## r 1 1 0 0
## ramachandran 0 0 0 0
## random 0 0 0 0
## ranked 0 0 0 0
## rapidminer 1 0 0 0
## rdatamining 0 0 0 0
## Docs
## Terms character(0) character(0) character(0) character(0)
## r 2 0 0 1
## ramachandran 0 0 0 0
## random 0 0 0 0
## ranked 0 0 0 0
## rapidminer 0 0 0 0
## rdatamining 0 0 0 1
## Docs
## Terms character(0) character(0)
## r 1 1
## ramachandran 0 0
## random 0 0
## ranked 1 0
## rapidminer 0 0
## rdatamining 0 0
myTdm <- TermDocumentMatrix(myCorpus, control=list(minWordLength=1))
class(myTdm)
## [1] "TermDocumentMatrix" "simple_triplet_matrix"
dim(myTdm)
## [1] 571 154
Find the most relevant terms for our study
findFreqTerms(myTdm, lowfreq=10)
## [1] "analysis" "and" "data" "examples"
## [5] "introduction" "mining" "network" "package"
## [9] "research" "slides" "social" "tutorial"
## [13] "using"
findFreqTerms(myTdm, lowfreq=20)
## [1] "analysis" "and" "data" "mining"
freq <- colSums(as.matrix(myTdm)) # note: with terms in rows, colSums gives per-document word totals
length(freq)
## [1] 154
ord <- order(freq)
freq[head(ord)]
## character(0) character(0) character(0) character(0) character(0)
## 2 3 3 3 3
## character(0)
## 3
freq[tail(ord)]
## character(0) character(0) character(0) character(0) character(0)
## 13 13 14 14 15
## character(0)
## 15
termFrequency <- rowSums(as.matrix(myTdm))
length(termFrequency)
## [1] 571
termFrequency <- subset(termFrequency, termFrequency>=5)
length(termFrequency)
## [1] 42
freq[tail(termFrequency)]
## character(0) character(0) character(0) character(0) character(0)
## 4 3 8 8 8
## character(0)
## 3
head(table(freq), 5)
## freq
## 2 3 4 5 6
## 1 8 16 19 21
tail(table(freq), 5)
## freq
## 11 12 13 14 15
## 17 15 5 2 2
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
freq <- sort(rowSums(as.matrix(myTdm)), decreasing=TRUE)
head(freq, 14)
## data and mining analysis package
## 63 54 52 22 17
## slides examples network tutorial research
## 16 15 15 13 12
## social introduction using computing
## 12 10 10 9
wf <- data.frame(word=names(freq), freq=freq)
head(wf)
## word freq
## data data 63
## and and 54
## mining mining 52
## analysis analysis 22
## package package 17
## slides slides 16
subset(wf, freq>5 ) %>% ggplot(aes(word, freq)) + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45, hjust=1))
barplot(termFrequency, las=2)
If two words always appear together, the correlation would be 1.0, and if they never appear together, the correlation would be 0.0. Correlation is thus a measure of how closely associated words are in the corpus.
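This can be checked by hand: the findAssocs() scores are the Pearson correlations of the terms' per-document frequency vectors (a sketch):
# Sketch: correlation of the "data" and "mining" frequency vectors.
tdmMat <- as.matrix(myTdm)
cor(tdmMat["data", ], tdmMat["mining", ]) # ~0.55, matching findAssocs() below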
Which words are associated with “data”?
findAssocs(myTdm, 'data', 0.30)
## data
## mining 0.55
## incl 0.34
## practice 0.34
## realworld 0.34
## small 0.34
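These scores are easy to visualize (a sketch; in recent tm versions findAssocs() returns a named list):
# Sketch: bar chart of the terms associated with "data".
dataAssocs <- findAssocs(myTdm, 'data', 0.30)$data
barplot(dataAssocs, las=2, ylab="correlation with 'data'")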
Which words are associated with “mining”?
findAssocs(myTdm, 'mining', 0.25)
## mining
## data 0.55
## mahout 0.39
## recommendation 0.39
## sets 0.39
## supports 0.39
## frequent 0.35
## itemset 0.34
## card 0.29
## reference 0.29
## functions 0.27
## classification 0.26
## experience 0.26
## lecture 0.26
## text 0.26
library(graph)
library(Rgraphviz)
## Loading required package: grid
#plot(myTdm,terms=findFreqTerms(myTdm, lowfreq=10)[1:5],corThreshold=0.25)
plot(myTdm, terms=findFreqTerms(myTdm, lowfreq=5)[1:30], corThreshold=0.25)
Generate a word cloud to provide a quick visual summary of word frequencies in a corpus.
The wordcloud package provides the required function.
set.seed() is used so that we get the same layout each time; otherwise a random layout is chosen.
library(RColorBrewer)
library(wordcloud)
Compute word frequencies and sort by frequency
m <- as.matrix(myTdm)
wordFreq <- sort(rowSums(m), decreasing=TRUE)
set.seed(375) # to make it reproducible
grayLevels <- gray( (wordFreq+10) / (max(wordFreq)+10) )
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=3, random.order=F,colors=grayLevels)
set.seed(142)
wordcloud(words=names(wordFreq), freq=wordFreq,max.words=50)
set.seed(375)
wordcloud(words=names(wordFreq), freq=wordFreq,max.words=150, scale=c(6,.5), colors=brewer.pal(6,"Dark2"))
Remove sparse terms
myTdm2 <- removeSparseTerms(myTdm, sparse=0.95)
m2 <- as.matrix(myTdm2)
Cluster the terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method="ward.D") # "ward" was renamed to "ward.D" in R >= 3.1
plot(fit)
#cut tree into 10 clusters
rect.hclust(fit, k=10)
(groups <- cutree(fit, k=10))
## analysis and computing data examples
## 1 2 3 4 5
## introduction mining network package parallel
## 3 6 1 7 3
## research series slides social time
## 8 9 10 1 9
## tutorial using
## 7 3
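The cluster labels can be turned into readable groups (a quick sketch):
# Sketch: list the terms belonging to each cluster.
split(names(groups), groups)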
Transpose the matrix to cluster the documents (tweets)
m3 <- t(m2)
Set a random seed
set.seed(122)
K-means clustering of the tweets
k <- 8
kmeansResult <- kmeans(m3, k)
Cluster centers
round(kmeansResult$centers, digits=3)
## analysis and computing data examples introduction mining network
## 1 0.033 0.262 0.016 0.000 0.098 0.049 0.000 0.049
## 2 0.000 0.375 0.125 2.000 0.000 0.125 1.625 0.000
## 3 0.917 0.083 0.000 0.000 0.083 0.167 0.083 1.000
## 4 0.000 0.625 0.875 0.000 0.000 0.000 0.000 0.000
## 5 0.000 0.206 0.000 0.735 0.059 0.029 1.000 0.000
## 6 0.250 1.875 0.000 0.625 0.375 0.250 0.125 0.000
## 7 0.500 0.500 0.000 0.125 0.125 0.125 0.375 0.000
## 8 0.200 0.200 0.000 1.067 0.133 0.000 0.000 0.000
## package parallel research series slides social time tutorial using
## 1 0.098 0.000 0.049 0 0.115 0.033 0 0.066 0.016
## 2 0.125 0.125 0.125 0 0.125 0.000 0 0.000 0.125
## 3 0.083 0.000 0.083 0 0.083 0.833 0 0.167 0.083
## 4 0.375 0.875 0.000 0 0.125 0.000 0 0.125 0.250
## 5 0.029 0.000 0.000 0 0.059 0.000 0 0.118 0.029
## 6 0.500 0.000 0.125 0 0.125 0.000 0 0.000 0.375
## 7 0.000 0.000 0.000 1 0.250 0.000 1 0.125 0.000
## 8 0.067 0.000 0.400 0 0.067 0.000 0 0.067 0.067
for (i in 1:k) {
cat(paste("cluster ", i, ": ", sep=""))
s <- sort(kmeansResult$centers[i,], decreasing=T)
cat(names(s)[1:3], "\n")
}
## cluster 1: and slides examples
## cluster 2: data mining and
## cluster 3: network analysis social
## cluster 4: computing parallel and
## cluster 5: mining data and
## cluster 6: and data package
## cluster 7: series time analysis
## cluster 8: data research analysis
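The choice k <- 8 above was fixed by hand. A common check is to look for an "elbow" in the total within-cluster sum of squares (a sketch):
# Sketch: total within-cluster sum of squares over a range of k;
# the "elbow" where the curve flattens suggests a reasonable k.
set.seed(122)
wss <- sapply(2:12, function(k) kmeans(m3, k)$tot.withinss)
plot(2:12, wss, type="b", xlab="number of clusters k", ylab="total within-cluster SS")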
library(fpc)
Partitioning around medoids, with estimation of the number of clusters
pamResult <- pamk(m3, metric="manhattan")
#number of clusters identified
(k <- pamResult$nc)
## [1] 10
pamResult <- pamResult$pamobject
#print cluster medoids
for (i in 1:k) {
cat(paste("cluster", i, ": "))
cat(colnames(pamResult$medoids)[which(pamResult$medoids[i,]==1)], "\n")
# print tweets in cluster i
# print(rdmTweets[pamResult$clustering==i])
}
## cluster 1 : data
## cluster 2 : and computing parallel
## cluster 3 :
## cluster 4 : and data mining
## cluster 5 : data mining
## cluster 6 : analysis network social
## cluster 7 : and
## cluster 8 : and package
## cluster 9 : mining
## cluster 10 : analysis and mining series time
layout(matrix(c(1,2),2,1)) # set to two graphs per page
plot(pamResult, color=F, labels=4, lines=0, cex=.8, col.clus=1,col.p=pamResult$clustering)
layout(matrix(1)) # change back to one graph per page
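A rough quality check for the partition: the average silhouette width stored in the pam result (closer to 1 means better-separated clusters). A sketch:
# Sketch: average silhouette width of the pam clustering.
pamResult$silinfo$avg.width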
library(cluster)
d <- dist(distMatrix, method="euclidean")
kfit <- kmeans(d, 2)
clusplot(as.matrix(d), kfit$cluster, color=T, shade=T, labels=2, lines=0)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
##
## The following objects are masked from 'package:dplyr':
##
## escape, explain
##
## The following object is masked from 'package:ggplot2':
##
## %+%
##
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
##
## The following object is masked from 'package:dplyr':
##
## id
##
## Attaching package: 'qdap'
##
## The following object is masked from 'package:dplyr':
##
## %>%
##
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
##
## The following object is masked from 'package:NLP':
##
## ngrams
##
## The following object is masked from 'package:base':
##
## Filter
Summarize the word list.
words <- myTdm %>% as.matrix %>% rownames %>% (function(x) x[nchar(x) < 20])
head(words, 15)
## [1] "\034big" "access" "accessible" "acm"
## [5] "added" "addition" "administration" "advanced"
## [9] "afl" "aggregating" "ago" "algorithms"
## [13] "also" "america" "amounts"
summary(nchar(words))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 5.0 7.0 7.1 9.0 19.0
table(nchar(words))
##
## 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 19
## 44 74 68 68 77 76 56 45 30 10 5 9 4 1 2 1
dist_tab(nchar(words))
## interval freq cum.freq percent cum.percent
## 1 3 44 44 7.72 7.72
## 2 4 74 118 12.98 20.70
## 3 5 68 186 11.93 32.63
## 4 6 68 254 11.93 44.56
## 5 7 77 331 13.51 58.07
## 6 8 76 407 13.33 71.40
## 7 9 56 463 9.82 81.23
## 8 10 45 508 7.89 89.12
## 9 11 30 538 5.26 94.39
## 10 12 10 548 1.75 96.14
## 11 13 5 553 0.88 97.02
## 12 14 9 562 1.58 98.60
## 13 15 4 566 0.70 99.30
## 14 16 1 567 0.18 99.47
## 15 17 2 569 0.35 99.82
## 16 19 1 570 0.18 100.00
data.frame(nletters=nchar(words)) %>% ggplot(aes(x=nletters)) +
geom_histogram(binwidth=1) +
geom_vline(xintercept=mean(nchar(words)),colour="green", size=1, alpha=.5) +
labs(x="Number of Letters", y="Number of Words")
library(dplyr)
library(stringr)
##
## Attaching package: 'stringr'
##
## The following object is masked from 'package:qdap':
##
## %>%
##
## The following object is masked from 'package:graph':
##
## boundary
words %>% str_split("") %>% sapply(function(x) x[-1]) %>% unlist %>% dist_tab %>% mutate(Letter=factor(toupper(interval),levels=toupper(interval[order(freq)]))) %>%
ggplot(aes(Letter, weight=percent)) + geom_bar() + coord_flip() + ylab("Proportion") + scale_y_continuous(breaks=seq(0, 12, 2), label=function(x) paste0(x, "%"),expand=c(0,0), limits=c(0,12))
words %>% lapply(function(x) sapply(letters, gregexpr, x, fixed=TRUE)) %>% unlist %>%
(function(x) x[x!=-1]) %>%
(function(x) setNames(x, gsub("\\d", "", names(x)))) %>%
(function(x) apply(table(data.frame(letter=toupper(names(x)), position=unname(x))), 1, function(y) y/length(x))) %>%
qheat(high="green", low="yellow", by.column=NULL,
values=TRUE, digits=3, plot=FALSE) +
ylab("Letter") +
xlab("Position") +
theme(axis.text.x=element_text(angle=0)) +
guides(fill=guide_legend(title="Proportion"))