TFIDF
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a, abb, abc)
#tfidf('a', a, D)
tf <- 1/1
idf <- log(3/3)
tf
[1] 1
idf
[1] 0
tf * idf
[1] 0
#tfidf('a', abb, D)
tf <- 1/3
idf <- log(3/3)
tf * idf
[1] 0
#tfidf('b', abb, D)
tf <- 2/3
idf <- log(3/2)
tf * idf
[1] 0.2703101
#tfidf('b', abc, D)
tf <- 1/3
idf <- log(3/2)
tf * idf
[1] 0.135155
#tfidf('c', abc, D)
tf <- 1/3
idf <- log(3/1)
tf * idf
[1] 0.3662041
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a, abb, abc)
#tfidf('c', abc, D)
tf <- sum(abc == 'c') /length(abc)
length(D)
[1] 3
#'c' %in% abc
idf <- log(length(D) / sum(sapply(D, function(e) 'c' %in% e )))
#' Compute the TF-IDF weight of a term within one document of a corpus.
#'
#' @param t Term to score.
#' @param d Vector of tokens making up the document.
#' @param D List of documents, each a vector of tokens.
#' @return tf * idf, where tf is the share of tokens in `d` equal to `t` and
#'   idf is log(number of documents / number of documents containing `t`).
#'   Returns 0 when `d` is empty or when `t` does not occur in `d`.
tfidf <- function(t, d, D){
  # An empty document would give 0 / 0 = NaN for the term frequency.
  if (length(d) == 0) {
    return(0)
  }
  tf <- sum(d == t) / length(d)
  # A term that never occurs in the document scores 0 whatever its idf is;
  # returning early avoids 0 * log(n / 0) = 0 * Inf = NaN for unseen terms.
  if (tf == 0) {
    return(0)
  }
  # vapply always yields a logical vector, even when D is empty.
  doc_freq <- sum(vapply(D, function(e) t %in% e, logical(1)))
  tf * log(length(D) / doc_freq)
}
tfidf('c', abc, D)
[1] 0.3662041
jiebaR
#install.packages('jiebaR')
library(jiebaR)
s <- '大巨蛋案對市府同仁下封口令?柯P否認'
mixseg <- worker()
segment(code = s, jiebar = mixseg)
edit_dict()
library(tm)
# English example: build a document-term matrix from one sentence.
e3 <- 'Hello, I am David, I have taken over 100 courses~~~'
# Split on single spaces; strsplit returns a list holding one character vector.
e3.list <- strsplit(e3, ' ')
e3.corpus <- Corpus(VectorSource(e3.list))
# First pass uses the default control settings.
e3.dtm <- DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
# Second pass sets wordLengths = c(1, 20), so terms as short as one character
# are kept (NOTE(review): tm's default lower bound is documented as 3 -- confirm
# for the installed version).
e3.dtm <- DocumentTermMatrix(e3.corpus, control = list(wordLengths = c(1,20)))
inspect(e3.dtm)
# List the transformations that tm_map can apply.
getTransformations()
# Clean the corpus step by step: numbers, then punctuation, then stemming.
doc <- tm_map(e3.corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
doc <- tm_map(doc, stemDocument)
# Rebuild the matrix from the cleaned corpus (default control settings again).
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
# Transformer for tm_map that deletes every '~' from a document's content.
removetilde <- content_transformer(function(txt) {
  gsub('~', '', txt)
})
doc <- tm_map(e3.corpus, removetilde)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
e1 <- 'this is a book'
e2 <- 'this is my car'
e.list <- strsplit(c(e1, e2), ' ')
e.corpus <- Corpus(VectorSource(e.list))
e.dtm <- DocumentTermMatrix(e.corpus, control = list(weighting = function(x)
weightTfIdf(x, normalize = FALSE)))
?DocumentTermMatrix
inspect(e.dtm)
Chinese DocumentTermMatrix
sapply(s.vec, function(t) {
if (t %in% names(synonym_dict)){
synonym_dict[t]
} else{
t
}
})
$`憭批楊<e8><9b>
[1] "憭批楊<e8><9b><8b>"
$獢<b0><8d>
[1] "獢<b0><8d>"
$撣<ba><9c>
[1] "撣<ba><9c>"
$<e5><90><bb><81>
[1] "<e5><90><bb><81>"
$銝<8b>
[1] "銝<8b>"
$撠隞<a4>
[1] "撠隞<a4>"
$<e6>P.<e6>P
[1] "<e6><e6><96>"
$<e5>隤<8d>
[1] "<e5>隤<8d>"
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
Applications of DocumentTermMatrix
download.file('https://github.com/ywchiu/rtibame/raw/master/data/applenews20160925.RData', 'applenews.RData')
load('applenews.RData')
library(jiebaR)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
mixseg <- worker()
apple.seg <- lapply(applenews$article, function(e) segment(e, jiebar = mixseg))
s.corpus <- CNCorpus(apple.seg)
doc <- tm_map(s.corpus, removeNumbers)
'jayson'[grepl('[\u4e00-\u9fa5]+', 'jayson')]
# Transformer for tm_map that keeps only the elements of the content containing
# at least one character in the range U+4E00..U+9FA5. The kept elements are
# printed before being returned. `pattern` is accepted but never used.
removeen <- content_transformer(
  function(x, pattern){
    kept <- x[grepl('[\u4e00-\u9fa5]+',x)]
    print(kept)
    kept
  }
)
# Keep only the elements of x that contain at least one character in the range
# U+4E00..U+9FA5. `pattern` is accepted but never used.
removeen2 <- function(x, pattern){
  has_cjk <- grepl('[\u4e00-\u9fa5]+', x)
  x[has_cjk]
}
doc <- tm_map(s.corpus, removeen2)
#doc <- tm_map(doc, removeNumbers)
doc
# Build the document-term matrix, keeping terms of at least two characters.
# The control option is spelled `tokenize` (as in the later control.list in
# this file); the previous spelling `tokenizer` is not a documented option name.
# NOTE(review): space_tokenizer is not defined in this file -- presumably it
# comes from the sourced CNCorpus.R; confirm.
s.dtm <- DocumentTermMatrix(doc, control = list(wordLengths = c(2,Inf), tokenize = space_tokenizer))
s.dtm
dim(s.dtm)
#s.dtm$dimnames$Terms[nchar(s.dtm$dimnames$Terms) == 21]
s.dtm$dimnames$Terms
findAssocs(s.dtm, '颱風', 0.5)
Clustering
dist(rbind(x,y), method = 'euclidean')
x
y 1.414214
iris clustering

Cosine distance
1- proxy::dist(rbind(a,b), method= "cosine")
a
b 0.9381942
News Clustering
# Load Data
# Download the saved dataset and load it into the session. The code below reads
# an `applenews` object with a `content` column from it.
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', destfile = 'appledaily.RData')
load('appledaily.RData')
head(applenews)
# Segmentation
# Segment every article with a jiebaR worker created with default settings;
# lapply returns one result per article.
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(article) segment(code= article, jiebar = mixseg))
class(apple.seg)
# Convert segments into corpus
# NOTE(review): CNCorpus is not defined in this file -- presumably provided by
# the remote script sourced here; confirm.
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
s.corpus <- CNCorpus(apple.seg)
s.corpus <- tm_map(s.corpus, removeNumbers)
s.corpus <- tm_map(s.corpus, removePunctuation)
# Build Document Term Matrix
# wordLengths = c(2, Inf) keeps only terms of at least two characters.
# NOTE(review): space_tokenizer is not defined in this file -- presumably it
# also comes from the sourced script; confirm.
control.list <- list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control = control.list)
dim(s.dtm)
# Drop the sparsest terms (sparsity threshold 0.99); the dim() calls before and
# after show how much the matrix shrinks.
dtm <- removeSparseTerms(s.dtm, 0.99)
dim(dtm)
#s.dtm
# Pairwise cosine distances between documents, then converted to a plain
# matrix so a single article's distances can be read as one row.
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')
dtm.mat <- as.matrix(dtm.dist)
# Print the title of the query article and the titles of the articles closest
# to it, returning the "related" strings that were printed.
# Reads the globals `applenews` (for titles) and `dtm.mat` (distance matrix).
#   query_idx: row index of the query article in dtm.mat / applenews.
#   top_n:     maximum number of related articles to report (default 10).
queryArticle <- function(query_idx, top_n = 10){
  print(paste('查詢文章:', applenews$title[query_idx]))
  query_idx_score <- dtm.mat[query_idx,]
  # Rank by ascending distance and drop the query explicitly: with tied
  # distances it is not guaranteed to sort first, so skipping position 1
  # could list the query itself as a related article.
  ranked <- order(query_idx_score)
  ranked <- ranked[ranked != query_idx]
  # head() caps the list at the available articles instead of indexing past
  # the end, which would produce NA titles for small corpora.
  print(paste('相關文章:', applenews$title[head(ranked, top_n)]))
}
queryArticle(70)
applenews$content[70]
# Hierarchical clustering of the articles from their pairwise distances,
# using the 'ward.D2' agglomeration method.
hc <- hclust(dtm.dist, 'ward.D2')
plot(hc, hang = -0.01)
# Outline 13 clusters on the dendrogram, then cut the tree into the same
# number of groups to get a cluster label per article.
rect.hclust(hc,13)
fit <- cutree(hc, 13 )
# Titles of the articles assigned to cluster 5.
applenews$title[fit == 5]
