Text Mining에 대한 R Code 예제 소개

author: ivan@saltlux.com

Reference 참고:

http://cran.r-project.org/doc/contrib/Zhao_R_and_data_mining.pdf

http://cran.r-project.org/web/views/NaturalLanguageProcessing.html

Illustration of Clustering:

For more advanced visualization of the clustering result, refer to this article (from R hclust to an amazing d3.js) http://quantifyingmemory.blogspot.kr/2013/11/d3-without-javascript.html

To show trends over time, see these examples of streamgraphs built from Twitter data: https://www.google.co.kr/search?q=streamgraph+twitter&espv=2&biw=1136&bih=825&source=lnms&tbm=isch&sa=X&ei=751VVfjCL5aC8gXq5IOIDA&ved=0CAYQ_AUoAQ&dpr=1.1

A first attempt at this, starting from the streamgraph R package: http://www.r-bloggers.com/introducing-the-streamgraph-htmlwidget-r-package/ (still to be completed).

# Illustration of the tm package
library(tm)
# Load the extrafont package
library(extrafont)
par(family = "AppleMyungjo")  # use a font that can render Korean text
# ------------
# Search using DOR
# ------------
source(file = "/Users/ivan/R/DORClient.R")
# Search term and the document limit passed to dor_search()
query <- "아이폰"
maxdocs <- 100
df <- dor_search("BLOG", query, maxdocs)
# Usage of the tm package
# Build a corpus from the search-result data frame
corp <- Corpus(DataframeSource(df))
# Clean-up step: remove English stopwords (lowercasing is left disabled)
corp <- tm_map(corp, removeWords, stopwords("english"))
# corp <- tm_map(corp, tolower)
# Extend the English stopword list with custom tokens and apply it as well
myStopwords <- c(
  stopwords("english"),
  "처음/NNDate", "2014/SN", "2015/SN", "시간/NNDate", "quot", "6/SN", "0.8", "6.9"
)
corp <- tm_map(corp, removeWords, myStopwords)
# Both matrices are built with the same options: punctuation is kept and the
# global bounds are c(minFreq, Inf)
minFreq <- 20
matrix_control <- list(
  removePunctuation = FALSE,
  bounds = list(global = c(minFreq, Inf))
)
# Terms x Documents matrix
TermsDocsMat <- TermDocumentMatrix(corp, control = matrix_control)
# Documents x Terms matrix
DocsTermsMat <- DocumentTermMatrix(corp, control = matrix_control)
# Dense copies used by the plain-matrix computations further down
tdm <- as.matrix(TermsDocsMat)
dtm <- as.matrix(DocsTermsMat)
# Optional inspection helpers, left disabled:
#inspect(corp)
# inspect part of the matrix
#tdm[1:10,1:5]
# Inspect frequent words: store the result of findFreqTerms() for lowfreq = 100
# in freq.terms. The outer parentheses make the assigned value print as well.
(freq.terms <- findFreqTerms(TermsDocsMat, lowfreq= 100))
# The `##` lines below appear to be output captured from an earlier run.
##  [1] "갤럭시"         "디자인"         "배터리"         "스마트폰"      
##  [5] "아이폰"         "아이폰5"        "아이폰6"        "아이폰6_플러스"
##  [9] "카메라"         "케이블"         "케이스"         "포스팅"        
## [13] "플러스"         "핸드폰"         "화이트"
# Get the names of up to 20 terms associated with the query
# (correlation limit 0.005) and plot their pairwise correlations.
assocs <- findAssocs(DocsTermsMat, query, 0.005)
# findAssocs() returns a named list (one named vector per query term) in
# newer tm releases; older releases returned a matrix or named vector.
# Extract the term names in a way that works for each shape.
words <- if (is.list(assocs)) {
  names(assocs[[query]])
} else if (is.matrix(assocs)) {
  rownames(assocs)
} else {
  names(assocs)
}
# head() keeps at most 20 names without the NA padding `[1:20]` would add
words <- head(words, 20)
find <- colnames(dtm) %in% words
# A correlation heatmap needs at least two terms; fail with a clear message
# instead of an obscure error from cor()/corrplot().
if (sum(find) < 2) {
  stop("Need at least two associated terms to build a correlation matrix",
       call. = FALSE)
}
# drop = FALSE guarantees cor() always receives a matrix
corr <- cor(dtm[, find, drop = FALSE])
# Plot heatmap of correlations
library(corrplot)
library(extrafont)
par(family = "AppleMyungjo") # use a font that can render Korean text
corrplot(corr, type = "upper")

### Term frequencies (input for the bar chart of terms)
# Sorted counts make Zipf's law visible.
# Total count of each term across all documents
term.freq <- rowSums(tdm)
# Keep only terms whose total count reaches minFreq
term.freq <- term.freq[which(term.freq >= minFreq)]
# Ascending order, so the most frequent term comes last
word_freqs <- sort(term.freq, decreasing = FALSE)
vocab <- names(word_freqs)
# Data frame pairing each term with its frequency
df <- data.frame(terms = vocab, freq = word_freqs)

Show Term Frequency Distribution

library(ggplot2)
# Fix the factor levels to the current row order so the bars follow the
# frequency sort instead of alphabetical order
df$terms <- factor(df$terms, levels = unique(as.character(df$terms)))
# Horizontal bar chart of term frequencies, drawn with a Korean-capable font
ggplot(df, aes(x = terms, y = freq)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(name = "Terms", labels = df$terms) +
  labs(x = "Terms", y = "Freq") +
  coord_flip() +
  theme(text = element_text(family = "AppleMyungjo", size = 14))

######### TERMS * TERMS Matrix (Graph) #######
# Transform the term-document matrix into a term-term adjacency matrix.
# tcrossprod(x) computes x %*% t(x) without building the transpose first.
# Note: `tdm` is overwritten and holds the term-term matrix from here on.
tdm <- tcrossprod(tdm)
#### Create a graph from it
library(igraph)
# Build a weighted, undirected graph from the adjacency matrix.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
g <- graph.adjacency(tdm, weighted = TRUE, mode = "undirected")
# Remove loops (the diagonal of the matrix links each term to itself)
g <- simplify(g)
### Visualize it
plot.igraph(
  g,
  layout = layout.fruchterman.reingold(g, niter = 1000, area = 100 * vcount(g)^2),
  vertex.label.family = "AppleMyungjo"
)
mtext("Terms Co-occurrences", side = 1)

Illustration of Histogram in datatable (DT)

library(DT)
# Show the term/frequency data frame as a DT table
datatable(data = df)

This is an example of a streamgraph. We are still looking for a way to link it to our own data, similar to what Neoformix does: http://neoformix.com/

library(streamgraph)
library(dplyr)

# Sample data set read straight from the web
dat <- read.csv("http://asbcllc.com/blog/2015/february/cre_stream_graph_test/data/cre_transaction-data.csv")

# Build the widget step by step: base graph, x axis, palette, legend
sg <- streamgraph(dat, "asset_class", "volume_billions", "year")
sg <- sg_axis_x(sg, 1, "year", "%Y")
sg <- sg_colors(sg, "PuOr")
sg <- sg_legend(sg, show = TRUE, label = "DDSec names: ")
# Evaluating the widget at top level renders it
sg

Clustering Illustration Examples using Word2Vec

library(extrafont)
par(family = "AppleMyungjo") # use a font that can render Korean text
library(cluster)
Word2VecAPI <- "http://localhost:8181/service/w2v.csv?"
# Percent-encode the search term so non-ASCII (e.g. Korean) or reserved
# characters form a valid URL. The parameter is built in its own variable so
# that `query` is not overwritten; re-running this chunk would otherwise
# produce "q=q=...".
w2v_query <- paste0("q=", URLencode(enc2utf8(query), reserved = TRUE))
url <- paste0(Word2VecAPI, w2v_query)
# Load the CSV response into R: first row is the header, first column holds
# the row names
mydata <- read.table(url, header = TRUE, sep = ",", row.names = 1)

# Hierarchical Agglomerative Clustering
# Euclidean distance matrix between the rows of mydata
d <- dist(mydata, method = "euclidean") # distance matrix
# Agglomerate using the "ward.D" method
fit <- hclust(d, method="ward.D") 
plot(fit) # display dendrogram
# Draw red borders around the clusters obtained by cutting the tree at k = 10.
# NOTE(review): the previous comment said 7 clusters while the code uses
# k = 10 -- confirm which value is intended.
rect.hclust(fit, k=10, border="red")

# K-Means clustering into 7 clusters
fit <- kmeans(mydata, centers = 7)
# Cluster plot against the first two principal components
clusplot(
  mydata,
  fit[["cluster"]],
  color = TRUE,
  shade = FALSE,
  labels = 2,
  lines = 0,
  main = "K-Means on Word2Vec"
)

# Load the tsne package (dimension reduction)
library(tsne)
# Epoch callback: draw an empty plot of the current embedding, then label
# every point with the matching row name of mydata
ecb <- function(x, y) {
  plot(x, type = "n", main = "t-SNE on Word2Vec")
  text(x, labels = rownames(mydata))
}
# Project mydata to 2 dimensions, calling ecb every 1000 epochs
tsne_mydata <- tsne(
  mydata,
  k = 2,
  initial_dims = 50,
  max_iter = 1000,
  epoch = 1000,
  epoch_callback = ecb,
  perplexity = 20
)