Text Mining

LMarques

Thursday, October 1, 2015

Required Packages

library(SnowballC)        # stemming
library(qdap)             # text analysis
library(qdapDictionaries)
library(dplyr)            # data preparation
library(RColorBrewer)     # color palettes
library(ggplot2)          # plots
library(scales)           # comma and number formatting
library(Rgraphviz)        # correlation plots
library(twitteR)          # our data: ~200 tweets from @RDataMining
library(NLP)
library(tm)               # text analysis

Sources and Formats

Sources supported by the tm package

getSources()
## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"

Formats supported by the tm package.

getReaders()
##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"

Load the “rdmTweets” data: tweets about RDataMining, originally retrieved with the twitteR package

# rdmTweets is assumed to be already loaded; it was originally retrieved
# with twitteR, e.g. rdmTweets <- userTimeline("rdatamining", n=200)
for (i in 1:10) {
     cat(paste("[[", i, "]] ", sep=""))
     writeLines(strwrap(rdmTweets[[i]]$getText(), width=73))  }
## [[1]] Postdoc/Research Scientist Position on Big Data at MIT
## http://t.co/hZ1ojAW2
## [[2]] Research scientist position for privacy-preserving data publishing,
## Singapore http://t.co/GPA0TyG5
## [[3]] Easier Parallel Computing in R with snowfall and sfCluster
## http://t.co/BPcinvzK
## [[4]] Tutorial: Parallel computing using R package snowfall
## http://t.co/CHBCyr76
## [[5]] handling big data: Interacting with Data using the filehash Package for
## R http://t.co/7RB3sChx
## [[6]] Parallel Computing with R using snow and snowfall http://t.co/nxp8EZpv
## [[7]] State of the Art in Parallel Computing with R http://t.co/zmClglqi
## [[8]] Slides on Parallel Computing in R http://t.co/AdDVxbOY
## [[9]] R with High Performance Computing: Parallel processing and large memory
## http://t.co/XZ3ZZBRF
## [[10]] High Performance Computing with R http://t.co/4fnGArSc

Data Frame & the “Corpus” Text Object

Convert the tweets to a data frame

df <- do.call("rbind", lapply(rdmTweets, as.data.frame))
dim(df)
## [1] 154  10
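
twitteR also ships a helper that performs the same conversion in one call; an equivalent alternative (a sketch, assuming rdmTweets is a list of status objects):

df <- twListToDF(rdmTweets)  # essentially the same data frame as the do.call/rbind approach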

Create our text object: the corpus

myCorpus <- Corpus(VectorSource(df$text))
myCorpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 154

Simple Text Transformations

The tm_map() function applies a given transformation to every document in a corpus; chaining several such transformations produces a “clean” text.

getTransformations()
## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

Transformations with tm_map()

Inspect the initial text of tweets 11 through 15

for (i in 11:15) {                                                      
    cat(paste("[[", i, "]] ", sep=""))                                      
    writeLines(strwrap(myCorpus[[i]], width=73))                            
  }
## [[11]] Slides on massive data, shared and distributed memory,and concurrent
## programming: bigmemory and foreach http://t.co/a6bQzxj5
## [[12]] The R Reference Card for Data Mining is updated with functions &
## packages for handling big data & parallel computing.
## http://t.co/FHoVZCyk
## [[13]] Post-doc on Optimizing a Cloud for Data Mining primitives, INRIA, France
## http://t.co/cA28STPO
## [[14]] Chief Scientist - Data Intensive Analytics, Pacific Northwest National
## Laboratory (PNNL), US http://t.co/0Gdzq1Nt
## [[15]] Top 10 in Data Mining http://t.co/7kAuNvuf
# Wrap the loop above in a reusable helper; the corpus and the range of
# documents to print are now proper arguments.
leertexto <- function(corpus, from=11, to=15) {
  for (i in from:to) {
    cat(paste("[[", i, "]] ", sep=""))
    writeLines(strwrap(corpus[[i]], width=73))
  }
}

leertexto(myCorpus)

The transformations

Convert uppercase to lowercase

# tolower is a plain character function, so wrap it in content_transformer()
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

Remove punctuation

myCorpus <- tm_map(myCorpus, removePunctuation)           

Remove numbers, as long as they are irrelevant to our study

myCorpus <- tm_map(myCorpus, removeNumbers)               

Remove URLs

removeURL <- content_transformer(function(x) gsub("http[[:alnum:]]*", "", x))
myCorpus <- tm_map(myCorpus, removeURL)

Stopwords

length(stopwords("spanish"))
## [1] 308
length(stopwords("english"))
## [1] 174
stopwords("spanish")[1:10]
##  [1] "de"  "la"  "que" "el"  "en"  "y"   "a"   "los" "del" "se"
stopwords("spanish")[200:210]
##  [1] "hubiera"    "hubieras"   "hubiéramos" "hubierais"  "hubieran"  
##  [6] "hubiese"    "hubieses"   "hubiésemos" "hubieseis"  "hubiesen"  
## [11] "habiendo"
stopwords("english")[1:10]
##  [1] "i"         "me"        "my"        "myself"    "we"       
##  [6] "our"       "ours"      "ourselves" "you"       "your"

Handling our own stop words

Add two extra stop words: “available” and “via”

myStopwords <- c(stopwords('english'), "available", "via")
length(myStopwords)
## [1] 176

Remove words from the stop word set

myStopwords <- setdiff(myStopwords, c("r", "big"))  
myStopwords <- setdiff(myStopwords, c("and", "until"))  
length(myStopwords)
## [1] 174
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)    

Remove extra whitespace

myCorpus <- tm_map(myCorpus, stripWhitespace)

Results

inspect(myCorpus[11:15])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 5
## 
## [[1]]
## [1] slides massive data shared and distributed memoryand concurrent programming bigmemory and foreach 
## 
## [[2]]
## [1]  r reference card data mining updated functions packages handling big data parallel computing 
## 
## [[3]]
## [1] postdoc optimizing cloud data mining primitives inria france 
## 
## [[4]]
## [1] chief scientist data intensive analytics pacific northwest national laboratory pnnl us 
## 
## [[5]]
## [1] top data mining

Example of Targeted Transformations

Apply some targeted transformations, for example to company names, institutions, or otherwise very predictable text.

toString <- content_transformer(function(x, from, to) gsub(from, to, x))

# docs stands for any corpus; here we apply the substitutions to a copy
# so that myCorpus, used in the rest of the analysis, stays unchanged
docs <- myCorpus
docs <- tm_map(docs, toString, "  institute technology", " IT")
docs <- tm_map(docs, toString, "shenzhen institutes advanced technology", "SIAT")
docs <- tm_map(docs, toString, "chinese academy sciences", "CAS")

Stemming Words

library(SnowballC)

Stemming applies an algorithm that strips common word endings so that only the root remains. The functionality is provided by wordStem() from SnowballC.
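
A quick look at wordStem() on its own (the three words are arbitrary examples):

wordStem(c("mining", "miners", "mined"), language="english")
# e.g. "mine" "miner" "mine" with the Porter ("english") stemmer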

myCorpusCopy <- myCorpus   # keep an unstemmed copy for the frequency counts below

Stem words

myCorpus <- tm_map(myCorpus, stemDocument) 

Inspect the final text of tweets 11 through 15

leertexto(myCorpus)   # reuse the helper defined above
## [[11]] slides massive data shared and distributed memoryand concurrent
## programming bigmemory and foreach
## [[12]] r reference card data mining updated functions packages handling big
## data parallel computing
## [[13]] postdoc optimizing cloud data mining primitives inria france
## [[14]] chief scientist data intensive analytics pacific northwest national
## laboratory pnnl us
## [[15]] top data mining

Count word frequencies, e.g. “mining”

# count the tweets whose (unstemmed) text contains "mining"
miningCases <- sapply(myCorpusCopy, function(d) grepl("\\<mining", as.character(d)))
sum(miningCases)
## [1] 82

e.g. “miners”

minerCases <- sapply(myCorpusCopy, function(d) grepl("\\<miners", as.character(d)))
sum(minerCases)
## [1] 3

Replace “miners” with “mining”

myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern="miners", replacement="mining")

Create a Term-Document Matrix

TermDocumentMatrix() builds a contingency table with terms in the rows and documents in the columns; DocumentTermMatrix() builds the transpose.
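
The shape is easiest to see on a toy corpus (a sketch; the two documents are made up):

toyCorpus <- Corpus(VectorSource(c("data mining", "mining text")))
inspect(TermDocumentMatrix(toyCorpus))
# terms "data", "mining", "text" in the rows; documents 1 and 2 in the columns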

# re-wrap the documents as PlainTextDocument objects; note this resets
# the document names, hence the character(0) column labels below
myCorpus <- tm_map(myCorpus, PlainTextDocument)
myTdm <- TermDocumentMatrix(myCorpus, control=list(wordLengths=c(1,Inf)))
myTdm
## <<TermDocumentMatrix (terms: 584, documents: 154)>>
## Non-/sparse entries: 1238/88698
## Sparsity           : 99%
## Maximal term length: 25
## Weighting          : term frequency (tf)
idx <- which(dimnames(myTdm)$Terms == "r")
inspect(myTdm[idx+(0:5),101:110])
## <<TermDocumentMatrix (terms: 6, documents: 10)>>
## Non-/sparse entries: 9/51
## Sparsity           : 85%
## Maximal term length: 12
## Weighting          : term frequency (tf)
## 
##               Docs
## Terms          character(0) character(0) character(0) character(0)
##   r                       1            1            0            0
##   ramachandran            0            0            0            0
##   random                  0            0            0            0
##   ranked                  0            0            0            0
##   rapidminer              1            0            0            0
##   rdatamining             0            0            0            0
##               Docs
## Terms          character(0) character(0) character(0) character(0)
##   r                       2            0            0            1
##   ramachandran            0            0            0            0
##   random                  0            0            0            0
##   ranked                  0            0            0            0
##   rapidminer              0            0            0            0
##   rdatamining             0            0            0            1
##               Docs
## Terms          character(0) character(0)
##   r                       1            1
##   ramachandran            0            0
##   random                  0            0
##   ranked                  1            0
##   rapidminer              0            0
##   rdatamining             0            0
myTdm <- TermDocumentMatrix(myCorpus, control=list(minWordLength=1))
class(myTdm)
## [1] "TermDocumentMatrix"    "simple_triplet_matrix"
dim(myTdm)
## [1] 571 154

Frequencies & Associations

Find the most relevant words for our study

findFreqTerms(myTdm, lowfreq=10)
##  [1] "analysis"     "and"          "data"         "examples"    
##  [5] "introduction" "mining"       "network"      "package"     
##  [9] "research"     "slides"       "social"       "tutorial"    
## [13] "using"
findFreqTerms(myTdm, lowfreq=20)
## [1] "analysis" "and"      "data"     "mining"

# colSums over a term-document matrix gives the total term count per
# document (one value per tweet)
freq <- colSums(as.matrix(myTdm))
length(freq)
## [1] 154
ord <- order(freq)
freq[head(ord)]
## character(0) character(0) character(0) character(0) character(0) 
##            2            3            3            3            3 
## character(0) 
##            3
freq[tail(ord)]
## character(0) character(0) character(0) character(0) character(0) 
##           13           13           14           14           15 
## character(0) 
##           15
termFrequency <- rowSums(as.matrix(myTdm))
length(termFrequency)
## [1] 571
termFrequency <- subset(termFrequency, termFrequency>=5)
length(termFrequency)
## [1] 42
tail(termFrequency)   # last six retained terms (rows are in alphabetical order)
head(table(freq), 5)
## freq
##  2  3  4  5  6 
##  1  8 16 19 21
tail(table(freq), 5)
## freq
## 11 12 13 14 15 
## 17 15  5  2  2

Plots

library(ggplot2) 
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

freq <- sort(rowSums(as.matrix(myTdm)), decreasing=TRUE)
head(freq, 14)
##         data          and       mining     analysis      package 
##           63           54           52           22           17 
##       slides     examples      network     tutorial     research 
##           16           15           15           13           12 
##       social introduction        using    computing 
##           12           10           10            9
wf <- data.frame(word=names(freq), freq=freq)
head(wf)
##              word freq
## data         data   63
## and           and   54
## mining     mining   52
## analysis analysis   22
## package   package   17
## slides     slides   16

subset(wf, freq>5) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat="identity") +
  theme(axis.text.x=element_text(angle=45, hjust=1))

Barplot

barplot(termFrequency, las=2)

Identify frequent items & associations

If two words always appear together, their correlation is 1.0; if they never appear together, it is 0.0. Correlation is therefore a measure of how closely associated words are within the corpus.
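
Since findAssocs() scores are Pearson correlations between the term rows of the matrix, they can be checked by hand (a sketch):

m <- as.matrix(myTdm)
cor(m["data", ], m["mining", ])  # should come out near the 0.55 reported below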

Which words are associated with “data”?

findAssocs(myTdm, 'data', 0.30)
##           data
## mining    0.55
## incl      0.34
## practice  0.34
## realworld 0.34
## small     0.34

Which words are associated with “mining”?

findAssocs(myTdm, 'mining', 0.25)
##                mining
## data             0.55
## mahout           0.39
## recommendation   0.39
## sets             0.39
## supports         0.39
## frequent         0.35
## itemset          0.34
## card             0.29
## reference        0.29
## functions        0.27
## classification   0.26
## experience       0.26
## lecture          0.26
## text             0.26

Correlation plots

library(graph)
library(Rgraphviz)
## Loading required package: grid
#plot(myTdm,terms=findFreqTerms(myTdm, lowfreq=10)[1:5],corThreshold=0.25)
plot(myTdm,terms=findFreqTerms(myTdm, lowfreq=5)[1:30 ] ,corThreshold=0.25)

Wordcloud

Generate a word cloud to provide a quick visual summary of word frequencies in the corpus.
The wordcloud package provides the required wordcloud() function.
set.seed() is used so that we get the same layout every time; otherwise a random layout is chosen.
library(RColorBrewer)
library(wordcloud)

Compute the word frequencies and sort by frequency

m <- as.matrix(myTdm)
wordFreq <- sort(rowSums(m), decreasing=TRUE)

plot Wordcloud 1

set.seed(375) # to make it reproducible
grayLevels <- gray( (wordFreq+10) / (max(wordFreq)+10) )
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=3, random.order=F,colors=grayLevels)

plot Wordcloud 2

set.seed(142)
wordcloud(words=names(wordFreq), freq=wordFreq,max.words=50)

plot Wordcloud 3

set.seed(375)
wordcloud(words=names(wordFreq), freq=wordFreq,max.words=150, scale=c(6,.5), colors=brewer.pal(6,"Dark2"))

Clustering Words

Remove sparse terms; sparse=0.95 keeps only terms that appear in at least roughly 5% of the documents.

myTdm2 <- removeSparseTerms(myTdm, sparse=0.95)
m2 <- as.matrix(myTdm2)

cluster terms

distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method="ward.D")   # "ward" was renamed to "ward.D" in recent R
plot(fit)
#cut tree into 10 clusters
rect.hclust(fit, k=10)

(groups <- cutree(fit, k=10))
##     analysis          and    computing         data     examples 
##            1            2            3            4            5 
## introduction       mining      network      package     parallel 
##            3            6            1            7            3 
##     research       series       slides       social         time 
##            8            9           10            1            9 
##     tutorial        using 
##            7            3

Clustering Tweets

Transpose the matrix so that documents (tweets) are clustered

m3 <- t(m2)

Set a random seed

set.seed(122)

k-means clustering of tweets

k <- 8
kmeansResult <- kmeans(m3, k)

cluster centers

round(kmeansResult$centers, digits=3)
##   analysis   and computing  data examples introduction mining network
## 1    0.033 0.262     0.016 0.000    0.098        0.049  0.000   0.049
## 2    0.000 0.375     0.125 2.000    0.000        0.125  1.625   0.000
## 3    0.917 0.083     0.000 0.000    0.083        0.167  0.083   1.000
## 4    0.000 0.625     0.875 0.000    0.000        0.000  0.000   0.000
## 5    0.000 0.206     0.000 0.735    0.059        0.029  1.000   0.000
## 6    0.250 1.875     0.000 0.625    0.375        0.250  0.125   0.000
## 7    0.500 0.500     0.000 0.125    0.125        0.125  0.375   0.000
## 8    0.200 0.200     0.000 1.067    0.133        0.000  0.000   0.000
##   package parallel research series slides social time tutorial using
## 1   0.098    0.000    0.049      0  0.115  0.033    0    0.066 0.016
## 2   0.125    0.125    0.125      0  0.125  0.000    0    0.000 0.125
## 3   0.083    0.000    0.083      0  0.083  0.833    0    0.167 0.083
## 4   0.375    0.875    0.000      0  0.125  0.000    0    0.125 0.250
## 5   0.029    0.000    0.000      0  0.059  0.000    0    0.118 0.029
## 6   0.500    0.000    0.125      0  0.125  0.000    0    0.000 0.375
## 7   0.000    0.000    0.000      1  0.250  0.000    1    0.125 0.000
## 8   0.067    0.000    0.400      0  0.067  0.000    0    0.067 0.067
# show the three most characteristic words of each cluster centre
for (i in 1:k) {
    cat(paste("cluster ", i, ": ", sep=""))
    s <- sort(kmeansResult$centers[i,], decreasing=T)
    cat(names(s)[1:3], "\n")
}
## cluster 1: and slides examples 
## cluster 2: data mining and 
## cluster 3: network analysis social 
## cluster 4: computing parallel and 
## cluster 5: mining data and 
## cluster 6: and data package 
## cluster 7: series time analysis 
## cluster 8: data research analysis

Clustering Tweets with the k-medoids Algorithm

library(fpc)
## Warning: package 'fpc' was built under R version 3.2.2
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "kfunction"; definition not
## updated

partitioning around medoids with estimation of number of clusters

# pamResult <- pamk(m3, metric="manhattan")   # alternative distance metric
pamResult <- pamk(m3, metric="euclidean")
#number of clusters identified
(k <- pamResult$nc)
## [1] 10
pamResult <- pamResult$pamobject
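
pamk() picks the number of clusters by optimum average silhouette width; the candidate range can also be set explicitly via its krange argument (a sketch; altResult is just an illustrative name):

altResult <- pamk(m3, krange=2:10, criterion="asw", metric="euclidean")
altResult$nc  # estimated number of clusters over the restricted range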

#print cluster medoids

for (i in 1:k) {
    cat(paste("cluster", i, ": "))
    cat(colnames(pamResult$medoids)[which(pamResult$medoids[i,]==1)], "\n")
    # print tweets in cluster i
    # print(rdmTweets[pamResult$clustering==i])
} 
## cluster 1 : data 
## cluster 2 : and computing parallel 
## cluster 3 :  
## cluster 4 : and data mining 
## cluster 5 : data mining 
## cluster 6 : analysis network social 
## cluster 7 : and 
## cluster 8 : and package 
## cluster 9 : mining 
## cluster 10 : analysis and mining series time

plot cluster

layout(matrix(c(1,2),2,1)) # set to two graphs per page
plot(pamResult, color=F, labels=4, lines=0, cex=.8, col.clus=1,col.p=pamResult$clustering)

layout(matrix(1)) # change back to one graph per page

Clusplot of a k-means fit on the term distance matrix

library(cluster)
d <- dist(distMatrix, method="euclidean")   # note: "euclidian" is not a valid method name
kfit <- kmeans(d, 2)
clusplot(as.matrix(d), kfit$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)

Quantifying the Words

library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## 
## Attaching package: 'qdapRegex'
## 
## The following objects are masked from 'package:dplyr':
## 
##     escape, explain
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## 
## Loading required package: qdapTools
## 
## Attaching package: 'qdapTools'
## 
## The following object is masked from 'package:dplyr':
## 
##     id
## 
## WARNING: Rtools is required to build R packages, but no version of Rtools compatible with R 3.2.0 was found. (Only the following incompatible version(s) of Rtools were found:3.0)
## 
## Please download and install Rtools 3.3 from http://cran.r-project.org/bin/windows/Rtools/ and then run find_rtools().
## 
## Attaching package: 'qdap'
## 
## The following object is masked from 'package:dplyr':
## 
##     %>%
## 
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## 
## The following object is masked from 'package:NLP':
## 
##     ngrams
## 
## The following object is masked from 'package:base':
## 
##     Filter

Summarize the word list.

# keep only the terms (row names) shorter than 20 characters
words <- myTdm %>% as.matrix %>% rownames %>% (function(x) x[nchar(x) < 20])
head(words, 15)                                                                                                       
##  [1] "\034big"        "access"         "accessible"     "acm"           
##  [5] "added"          "addition"       "administration" "advanced"      
##  [9] "afl"            "aggregating"    "ago"            "algorithms"    
## [13] "also"           "america"        "amounts"
summary(nchar(words))                                                                                                 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0     5.0     7.0     7.1     9.0    19.0
table(nchar(words))                                                                                                   
## 
##  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 
## 44 74 68 68 77 76 56 45 30 10  5  9  4  1  2  1
dist_tab(nchar(words))                                                                                                
##    interval freq cum.freq percent cum.percent
## 1         3   44       44    7.72        7.72
## 2         4   74      118   12.98       20.70
## 3         5   68      186   11.93       32.63
## 4         6   68      254   11.93       44.56
## 5         7   77      331   13.51       58.07
## 6         8   76      407   13.33       71.40
## 7         9   56      463    9.82       81.23
## 8        10   45      508    7.89       89.12
## 9        11   30      538    5.26       94.39
## 10       12   10      548    1.75       96.14
## 11       13    5      553    0.88       97.02
## 12       14    9      562    1.58       98.60
## 13       15    4      566    0.70       99.30
## 14       16    1      567    0.18       99.47
## 15       17    2      569    0.35       99.82
## 16       19    1      570    0.18      100.00

Frequency of word lengths (letters per word)

data.frame(nletters=nchar(words)) %>% ggplot(aes(x=nletters)) + 
geom_histogram(binwidth=1) + 
geom_vline(xintercept=mean(nchar(words)),colour="green", size=1, alpha=.5) + 
labs(x="Number of Letters", y="Number of Words")

library(dplyr)                                  
library(stringr)     
## 
## Attaching package: 'stringr'
## 
## The following object is masked from 'package:qdap':
## 
##     %>%
## 
## The following object is masked from 'package:graph':
## 
##     boundary

Frequency of the individual letters

words %>%
  str_split("") %>%
  sapply(function(x) x[-1]) %>%
  unlist %>%
  dist_tab %>%
  mutate(Letter=factor(toupper(interval), levels=toupper(interval[order(freq)]))) %>%
  ggplot(aes(Letter, weight=percent)) +
  geom_bar() +
  coord_flip() +
  ylab("Proportion") +
  scale_y_continuous(breaks=seq(0, 12, 2), label=function(x) paste0(x, "%"),
                     expand=c(0,0), limits=c(0,12))

Heat map of letters & positions

words %>%
  lapply(function(x) sapply(letters, gregexpr, x, fixed=TRUE)) %>%
  unlist %>%
  (function(x) x[x!=-1]) %>%
  (function(x) setNames(x, gsub("\\d", "", names(x)))) %>%   # strip the indices unlist() appends
  (function(x) apply(table(data.frame(letter=toupper(names(x)), position=unname(x))), 1, function(y) y/length(x))) %>%
qheat(high="green", low="yellow", by.column=NULL,  
      values=TRUE, digits=3, plot=FALSE) +
  ylab("Letter") +
  xlab("Position") +  
  theme(axis.text.x=element_text(angle=0)) + 
  guides(fill=guide_legend(title="Proportion"))