L. Marqués Padreny
Thursday, October 1, 2015
library(SnowballC) # stemming
library(qdap) # text analysis
library(qdapDictionaries)
library(dplyr) # data preparation
library(RColorBrewer) # color palettes
library(wordcloud)
library(ggplot2) # plots
library(scales) # comma & number formatting
library(Rgraphviz) # correlation plots
library(twitteR) # our data: ~200 tweets from @RDataMining
library(NLP)
library(tm) # text mining
Sources supported by the tm package
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
Document formats (readers) supported by the tm package.
getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"
Data:
## Option 1: retrieve the tweets from Twitter
#library(twitteR)
#tweets <- userTimeline("RDataMining", n = 3200)
## Option 2: load a saved copy
# Links to the data are at the end of this document; there are three different sets to practice with.
# rdmTweets is assumed to be loaded from one of those files, e.g.:
# load("rdmTweets.RData") # file name illustrative
for (i in 1:10) {
  cat(paste("[[", i, "]] ", sep=""))
  writeLines(strwrap(rdmTweets[[i]]$getText(), width=73))
}
## [[1]] Postdoc/Research Scientist Position on Big Data at MIT
## http://t.co/hZ1ojAW2
## [[2]] Research scientist position for privacy-preserving data publishing,
## Singapore http://t.co/GPA0TyG5
## [[3]] Easier Parallel Computing in R with snowfall and sfCluster
## http://t.co/BPcinvzK
## [[4]] Tutorial: Parallel computing using R package snowfall
## http://t.co/CHBCyr76
## [[5]] handling big data: Interacting with Data using the filehash Package for
## R http://t.co/7RB3sChx
## [[6]] Parallel Computing with R using snow and snowfall http://t.co/nxp8EZpv
## [[7]] State of the Art in Parallel Computing with R http://t.co/zmClglqi
## [[8]] Slides on Parallel Computing in R http://t.co/AdDVxbOY
## [[9]] R with High Performance Computing: Parallel processing and large memory
## http://t.co/XZ3ZZBRF
## [[10]] High Performance Computing with R http://t.co/4fnGArSc
Convert the tweets to a data frame (package twitteR)
df <- do.call("rbind", lapply(rdmTweets, as.data.frame))
dim(df)
## [1] 154 10
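twitteR also ships a helper that does the same conversion in one call:
# equivalent one-liner using twitteR's built-in helper
# df <- twListToDF(rdmTweets)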
Create our text object: a CORPUS (package tm)
myCorpus <- Corpus(VectorSource(df$text))
myCorpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 154
The tm_map() function applies transformations to all the documents in a corpus in order to obtain "clean" text.
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
for (i in 11:15) {
  cat(paste("[[", i, "]] ", sep=""))
  writeLines(strwrap(myCorpus[[i]], width=73))
}
## [[11]] Slides on massive data, shared and distributed memory,and concurrent
## programming: bigmemory and foreach http://t.co/a6bQzxj5
## [[12]] The R Reference Card for Data Mining is updated with functions &
## packages for handling big data & parallel computing.
## http://t.co/FHoVZCyk
## [[13]] Post-doc on Optimizing a Cloud for Data Mining primitives, INRIA, France
## http://t.co/cA28STPO
## [[14]] Chief Scientist - Data Intensive Analytics, Pacific Northwest National
## Laboratory (PNNL), US http://t.co/0Gdzq1Nt
## [[15]] Top 10 in Data Mining http://t.co/7kAuNvuf
# helper to print a range of documents from a corpus
leertexto <- function(corpus, from=11, to=15) {
  for (i in from:to) {
    cat(paste("[[", i, "]] ", sep=""))
    writeLines(strwrap(corpus[[i]], width=73))
  }
}
leertexto(myCorpus)
Convert upper case to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower)) # content_transformer() keeps the corpus structure in tm >= 0.6
Remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
Remove numbers, provided they are irrelevant to our study
myCorpus <- tm_map(myCorpus, removeNumbers)
Remove URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
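Note that the order matters here: punctuation and numbers have already been removed, so a URL such as http://t.co/nxp8EZpv has collapsed into a single alphanumeric token starting with "http", which the pattern then removes whole. A quick illustrative check:
removeURL("parallel computing with r httptconxpezpv")
## expected: "parallel computing with r "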
length(stopwords("spanish"))
## [1] 308
length(stopwords("english"))
## [1] 174
stopwords("spanish")[1:10]
## [1] "de" "la" "que" "el" "en" "y" "a" "los" "del" "se"
stopwords("spanish")[200:210]
## [1] "hubiera" "hubieras" "hubiéramos" "hubierais" "hubieran"
## [6] "hubiese" "hubieses" "hubiésemos" "hubieseis" "hubiesen"
## [11] "habiendo"
stopwords("english")[1:10]
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
Add two extra stopwords: "available" and "via"
myStopwords <- c(stopwords('english'), "available", "via")
length(myStopwords)
## [1] 176
Remove from the stopword set any words we want to keep for the analysis
myStopwords <- setdiff(myStopwords, c("r", "big"))
myStopwords <- setdiff(myStopwords, c("and", "until"))
length(myStopwords)
## [1] 174
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
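removeWords() drops whole-word matches only; a rough illustrative check (the leftover spacing is cleaned up by stripWhitespace below):
removeWords("the r package is available via cran", myStopwords)
## expected, roughly: " r package    cran"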
Remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)
inspect(myCorpus[11:15])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5
##
## [[1]]
## [1] slides massive data shared and distributed memoryand concurrent programming bigmemory and foreach
##
## [[2]]
## [1] r reference card data mining updated functions packages handling big data parallel computing
##
## [[3]]
## [1] postdoc optimizing cloud data mining primitives inria france
##
## [[4]]
## [1] chief scientist data intensive analytics pacific northwest national laboratory pnnl us
##
## [[5]]
## [1] top data mining
Perform some specific transformations, for example for company or institution names, or other very predictable text.
toString <- content_transformer(function(x, from, to) gsub(from, to, x))
# the corpus is already lower case, so the patterns are lower case too
myCorpus <- tm_map(myCorpus, toString, "institute technology", "IT")
myCorpus <- tm_map(myCorpus, toString, "universitat politecnica catalunya", "UPC")
myCorpus <- tm_map(myCorpus, toString, "universitat de barcelona", "UB")
Stemming uses an algorithm that removes common word endings so that only the root remains. The functionality is provided by wordStem() from SnowballC.
An example of this function:
library(SnowballC)
getStemLanguages() # the function supports several languages
## [1] "danish" "dutch" "english" "finnish" "french"
## [6] "german" "hungarian" "italian" "norwegian" "porter"
## [11] "portuguese" "romanian" "russian" "spanish" "swedish"
## [16] "turkish"
#wordStem(words, language = "en")
wordStem(c("win", "winning", "winner"))
## [1] "win" "win" "winner"
Stem words
myCorpusCopy <- myCorpus
myCorpus <- tm_map(myCorpus, stemDocument)
for (i in 11:15) {
  cat(paste("[[", i, "]] ", sep=""))
  writeLines(strwrap(myCorpus[[i]], width=73))
}
## [[11]] slides massive data shared and distributed memoryand concurrent
## programming bigmemory and foreach
## [[12]] r reference card data mining updated functions packages handling big
## data parallel computing
## [[13]] postdoc optimizing cloud data mining primitives inria france
## [[14]] chief scientist data intensive analytics pacific northwest national
## laboratory pnnl us
## [[15]] top data mining
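The copy saved in myCorpusCopy can be used to complete the stems back to their most frequent full form. A hedged sketch using tm's stemCompletion(); stemCompletion2 is a helper name of our own, and the result is a list you would rebuild into a corpus afterwards:
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))
  x <- x[x != ""] # drop empty tokens
  x <- stemCompletion(x, dictionary=dictionary)
  PlainTextDocument(paste(x, collapse=" "))
}
# myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpusCopy)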
TermDocumentMatrix builds a contingency table with the terms in rows and the documents in columns; DocumentTermMatrix builds its transpose.
# myCorpus <- tm_map(myCorpus, PlainTextDocument) # no longer needed once content_transformer() is used above
myTdm <- TermDocumentMatrix(myCorpus, control=list(wordLengths=c(1,Inf)))
myTdm
## <<TermDocumentMatrix (terms: 585, documents: 154)>>
## Non-/sparse entries: 1239/88851
## Sparsity : 99%
## Maximal term length: 25
## Weighting : term frequency (tf)
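As a quick sanity check of that transpose relationship:
myDtm <- DocumentTermMatrix(myCorpus, control=list(wordLengths=c(1,Inf)))
identical(dim(myDtm), rev(dim(myTdm))) # TRUE: documents x terms vs terms x documents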
idx <- which(dimnames(myTdm)$Terms == "r")
inspect(myTdm[idx+(0:5),101:110])
## <<TermDocumentMatrix (terms: 6, documents: 10)>>
## Non-/sparse entries: 9/51
## Sparsity : 85%
## Maximal term length: 12
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0) character(0) character(0) character(0)
## r 1 1 0 0
## ramachandran 0 0 0 0
## random 0 0 0 0
## ranked 0 0 0 0
## rapidminer 1 0 0 0
## rdatamining 0 0 0 0
## Docs
## Terms character(0) character(0) character(0) character(0)
## r 2 0 0 1
## ramachandran 0 0 0 0
## random 0 0 0 0
## ranked 0 0 0 0
## rapidminer 0 0 0 0
## rdatamining 0 0 0 1
## Docs
## Terms character(0) character(0)
## r 1 1
## ramachandran 0 0
## random 0 0
## ranked 1 0
## rapidminer 0 0
## rdatamining 0 0
myTdm <- TermDocumentMatrix(myCorpus, control=list(minWordLength=1)) # note: newer tm ignores minWordLength (use wordLengths as above), hence the smaller term count below
class(myTdm)
## [1] "TermDocumentMatrix" "simple_triplet_matrix"
dim(myTdm)
## [1] 572 154
Find the most frequent terms for our study
findFreqTerms(myTdm, lowfreq=10)
## [1] "analysis" "and" "data" "examples"
## [5] "introduction" "mining" "network" "package"
## [9] "research" "slides" "social" "tutorial"
## [13] "using"
findFreqTerms(myTdm, lowfreq=20)
## [1] "analysis" "and" "data" "mining"
Columns
mydtm <- DocumentTermMatrix(myCorpus)
freqC <- colSums(as.matrix(mydtm))
length(freqC)
## [1] 572
ord <- order(freqC)
freqC[head(ord)]
## \034big accessible
## 1 1
## acm addictedtorfreefrgraphiqu
## 1 1
## addition administration
## 1 1
freqC[tail(ord)]
## slides package analysis mining and data
## 16 17 22 50 54 63
Rows
termFrequency <- rowSums(as.matrix(myTdm))
length(termFrequency)
## [1] 572
termFrequency <- subset(termFrequency, termFrequency>=5)
length(termFrequency)
## [1] 42
head(termFrequency)
## analysis and australia card clustering code
## 22 54 5 5 8 7
head(table(termFrequency), 5)
## termFrequency
## 5 6 7 8 9
## 15 5 3 5 1
tail(table(termFrequency), 5)
## termFrequency
## 17 22 50 54 63
## 1 1 1 1 1
library(ggplot2)
library(dplyr)
freq <- sort(rowSums(as.matrix(myTdm)), decreasing=TRUE)
head(freq, 14)
## data and mining analysis package
## 63 54 50 22 17
## slides examples network tutorial research
## 16 15 15 13 12
## social introduction using computing
## 12 10 10 9
wf <- data.frame(word=names(freq), freq=freq)
head(wf)
## word freq
## data data 63
## and and 54
## mining mining 50
## analysis analysis 22
## package package 17
## slides slides 16
subset(wf, freq>5) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat="identity") +
  theme(axis.text.x=element_text(angle=45, hjust=1))
barplot(termFrequency, las=2)
If two words always appear together, the correlation is 1.0; if they never appear together, it is 0.0. Correlation is thus a measure of how closely associated words are in the corpus.
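Conceptually, findAssocs() is just the correlation between the corresponding term rows of the matrix; a quick hedged check:
m <- as.matrix(myTdm)
cor(m["data", ], m["mining", ]) # should be close to the 0.52 reported below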
Which words are associated with "data"?
findAssocs(myTdm, 'data', 0.30)
## data
## mining 0.52
## incl 0.34
## practice 0.34
## realworld 0.34
## small 0.34
Which words are associated with "mining"?
findAssocs(myTdm, 'mining', 0.25)
## mining
## data 0.52
## mahout 0.41
## recommendation 0.41
## sets 0.41
## supports 0.41
## frequent 0.36
## itemset 0.35
## card 0.30
## reference 0.30
## functions 0.28
## text 0.28
## classification 0.27
## lecture 0.27
## appl 0.25
## chapters 0.25
## lists 0.25
## useful 0.25
## want 0.25
library(graph)
library(Rgraphviz)
## Loading required package: grid
#plot(myTdm,terms=findFreqTerms(myTdm, lowfreq=10)[1:5],corThreshold=0.25)
plot(myTdm, terms=findFreqTerms(myTdm, lowfreq=5)[1:30], corThreshold=0.25)
Generate a word cloud to provide a quick visual summary of word frequency in a corpus.
The wordcloud package provides the required function, wordcloud().
set.seed() is used so that we get the same layout every time; otherwise a random layout is chosen.
library(RColorBrewer)
library(wordcloud)
Compute the word frequencies and sort by frequency
m <- as.matrix(myTdm)
wordFreq <- sort(rowSums(m), decreasing=TRUE)
set.seed(375) # to make it reproducible
grayLevels <- gray( (wordFreq+10) / (max(wordFreq)+10) )
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=3, random.order=FALSE, colors=grayLevels)
set.seed(142)
wordcloud(words=names(wordFreq), freq=wordFreq, max.words=50)
set.seed(375)
wordcloud(words=names(wordFreq), freq=wordFreq, max.words=150, scale=c(6,.5), colors=brewer.pal(6,"Dark2"))
Remove the sparse terms
myTdm2 <- removeSparseTerms(myTdm, sparse=0.95)
m2 <- as.matrix(myTdm2)
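With sparse=0.95, terms absent from more than 95% of the documents are dropped; with 154 tweets, that keeps only terms appearing in at least 8 of them. A quick check of the effect:
dim(myTdm)  # before
dim(myTdm2) # after removing sparse terms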
Cluster the terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method="ward.D") # "ward" was renamed to "ward.D" in recent versions of R
plot(fit)
# cut the tree into 10 clusters
rect.hclust(fit, k=10)
(groups <- cutree(fit, k=10))
## analysis and computing data examples
## 1 2 3 4 5
## introduction mining network package parallel
## 3 6 1 7 3
## research series slides social time
## 8 9 10 1 9
## tutorial using
## 7 3
Transpose the matrix in order to cluster the documents instead of the terms
m3 <- t(m2)
Set a random seed
set.seed(122)
k-means clustering of the tweets
k <- 8
kmeansResult <- kmeans(m3, k)
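Here k=8 is chosen by hand; a common heuristic (a sketch, not part of the original analysis) is to inspect the total within-cluster sum of squares over a range of k and look for an elbow:
wss <- sapply(2:15, function(k) kmeans(m3, k)$tot.withinss) # total within-cluster SS per k
plot(2:15, wss, type="b", xlab="k", ylab="Total within-cluster SS")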
Cluster centers
round(kmeansResult$centers, digits=3)
## analysis and computing data examples introduction mining network
## 1 0.033 0.262 0.016 0.000 0.098 0.049 0.000 0.049
## 2 0.000 0.375 0.125 2.125 0.000 0.125 1.125 0.000
## 3 0.917 0.083 0.000 0.000 0.083 0.167 0.083 1.000
## 4 0.000 0.625 0.875 0.000 0.000 0.000 0.000 0.000
## 5 0.000 0.176 0.000 0.735 0.059 0.029 1.059 0.000
## 6 0.250 1.875 0.000 0.625 0.375 0.250 0.125 0.000
## 7 0.500 0.500 0.000 0.125 0.125 0.125 0.375 0.000
## 8 0.200 0.267 0.000 1.000 0.133 0.000 0.000 0.000
## package parallel research series slides social time tutorial using
## 1 0.098 0.000 0.049 0 0.115 0.033 0 0.066 0.016
## 2 0.250 0.125 0.125 0 0.125 0.000 0 0.000 0.250
## 3 0.083 0.000 0.083 0 0.083 0.833 0 0.167 0.083
## 4 0.375 0.875 0.000 0 0.125 0.000 0 0.125 0.250
## 5 0.029 0.000 0.000 0 0.059 0.000 0 0.118 0.029
## 6 0.500 0.000 0.125 0 0.125 0.000 0 0.000 0.375
## 7 0.000 0.000 0.000 1 0.250 0.000 1 0.125 0.000
## 8 0.000 0.000 0.400 0 0.067 0.000 0 0.067 0.000
for (i in 1:k) {
  cat(paste("cluster ", i, ": ", sep=""))
  s <- sort(kmeansResult$centers[i,], decreasing=T)
  cat(names(s)[1:3], "\n")
}
## cluster 1: and slides examples
## cluster 2: data mining and
## cluster 3: network analysis social
## cluster 4: computing parallel and
## cluster 5: mining data and
## cluster 6: and data package
## cluster 7: series time analysis
## cluster 8: data research and
library(cluster)
d <- dist(distMatrix, method="euclidean")
kfit <- kmeans(d, 4)
clusplot(as.matrix(d), kfit$cluster, color=T, shade=T, labels=2, lines=0)
library(fpc)
Partition around k-medoids and estimate the number of clusters
pamResult <- pamk(m3, metric="manhattan")
pamResult <- pamk(m3, metric="euclidian")
#número de clusters identificados
(k <- pamResult$nc)
## [1] 10
pamResult <- pamResult$pamobject
#print cluster medoids
for (i in 1:k) {
  cat(paste("cluster", i, ": "))
  cat(colnames(pamResult$medoids)[which(pamResult$medoids[i,]==1)], "\n")
  # print the tweets in cluster i
  # print(rdmTweets[pamResult$clustering==i])
}
## cluster 1 : data
## cluster 2 : and computing parallel
## cluster 3 :
## cluster 4 : and data mining
## cluster 5 : data mining
## cluster 6 : analysis network social
## cluster 7 : and
## cluster 8 : and package
## cluster 9 : mining
## cluster 10 : analysis and mining series time
layout(matrix(c(1,2),2,1)) # set to two graphs per page
plot(pamResult, color=F, labels=4, lines=0, cex=.8, col.clus=1, col.p=pamResult$clustering)
layout(matrix(1)) # change back to one graph per page
library(qdap)
Summarize the word list.
words <- myTdm %>% as.matrix %>% rownames %>% (function(x) x[nchar(x) < 20])
head(words, 15)
## [1] "\034big" "access" "accessible" "acm"
## [5] "added" "addition" "administration" "advanced"
## [9] "afl" "aggregating" "ago" "algorithms"
## [13] "also" "america" "amounts"
summary(nchar(words))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 7.000 7.098 9.000 19.000
table(nchar(words))
##
## 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 19
## 44 74 68 69 77 76 56 45 30 10 5 9 4 1 2 1
dist_tab(nchar(words))
## interval freq cum.freq percent cum.percent
## 1 3 44 44 7.71 7.71
## 2 4 74 118 12.96 20.67
## 3 5 68 186 11.91 32.57
## 4 6 69 255 12.08 44.66
## 5 7 77 332 13.49 58.14
## 6 8 76 408 13.31 71.45
## 7 9 56 464 9.81 81.26
## 8 10 45 509 7.88 89.14
## 9 11 30 539 5.25 94.40
## 10 12 10 549 1.75 96.15
## 11 13 5 554 0.88 97.02
## 12 14 9 563 1.58 98.60
## 13 15 4 567 0.70 99.30
## 14 16 1 568 0.18 99.47
## 15 17 2 570 0.35 99.82
## 16 19 1 571 0.18 100.00
data.frame(nletters=nchar(words)) %>%
  ggplot(aes(x=nletters)) +
  geom_histogram(binwidth=1) +
  geom_vline(xintercept=mean(nchar(words)), colour="green", size=1, alpha=.5) +
  labs(x="Number of Letters", y="Number of Words")
library(dplyr)
library(stringr)
##
## Attaching package: 'stringr'
##
## The following object is masked from 'package:qdap':
##
## %>%
##
## The following object is masked from 'package:graph':
##
## boundary
words %>%
  str_split("") %>%
  sapply(function(x) x[-1]) %>% # drop the leading empty string that str_split("") produces
  unlist %>%
  dist_tab %>%
  mutate(Letter=factor(toupper(interval), levels=toupper(interval[order(freq)]))) %>%
  ggplot(aes(Letter, weight=percent)) +
  geom_bar() +
  coord_flip() +
  ylab("Proportion") +
  scale_y_continuous(breaks=seq(0, 12, 2), label=function(x) paste0(x, "%"), expand=c(0,0), limits=c(0,12))
words %>%
  lapply(function(x) sapply(letters, gregexpr, x, fixed=TRUE)) %>%
  unlist %>%
  (function(x) x[x!=-1]) %>%
  (function(x) setNames(x, gsub("\\d", "", names(x)))) %>% # strip the digits unlist() appends to duplicated names
  (function(x) apply(table(data.frame(letter=toupper(names(x)), position=unname(x))), 1, function(y) y/length(x))) %>%
  qheat(high="green", low="yellow", by.column=NULL, values=TRUE, digits=3, plot=FALSE) +
  ylab("Letter") +
  xlab("Position") +
  theme(axis.text.x=element_text(angle=0)) +
  guides(fill=guide_legend(title="Proportion"))
Data links:
https://drive.google.com/file/d/0B8gvT0gpUPyvV1NJeWFRUTVpSk0/view?usp=sharing
https://drive.google.com/file/d/0B8gvT0gpUPyvclE4REhvQnZRemc/view?usp=sharing
https://drive.google.com/file/d/0B8gvT0gpUPyvaFlRYmd4NEJ0Z1k/view?usp=sharing
#0.- data
install.packages(c("tm", "wordcloud", "twitteR"))
load("data") # replace "data" with the path to one of the downloaded files
# 1.- create a data frame
df <- do.call("rbind", lapply(rdmTweets, as.data.frame)) # rdmTweets: the object loaded above
dim(df)
#2.- create our text object: CORPUS
myCorpus <- Corpus(VectorSource(df$text))
myCorpus
#3.- apply some transformations:
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
myCorpus <- tm_map(myCorpus, stripWhitespace)
# inspect the text after each step
#4.- build the matrix
# myCorpus <- tm_map(myCorpus, PlainTextDocument) # not needed with content_transformer()
myTdm <- TermDocumentMatrix(myCorpus, control=list(wordLengths=c(1,Inf)))
m <- as.matrix(myTdm)
#5.- create a word cloud
wordFreq <- sort(rowSums(m), decreasing=TRUE)
set.seed(xxxxxx) # pick any fixed seed, e.g. 375 as above
wordcloud(xxxxx) # e.g. wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=3)