TFIDF
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a, abb, abc)
#tfidf('a', a, D)
tf <- 1/1
idf <- log(3/3)
tf
[1] 1
idf
[1] 0
tf * idf
[1] 0
#tfidf('a', abb, D)
tf <- 1/3
idf <- log(3/3)
tf * idf
[1] 0
#tfidf('b', abb, D)
tf <- 2/3
idf <- log(3/2)
tf * idf
[1] 0.2703101
#tfidf('b', abc, D)
tf <- 1/3
idf <- log(3/2)
tf * idf
[1] 0.135155
#tfidf('c', abc, D)
tf <- 1/3
idf <- log(3/1)
tf * idf
[1] 0.3662041
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a, abb, abc)
#tfidf('c', abc, D)
tf <- sum(abc == 'c') /length(abc)
length(D)
[1] 3
#'c' %in% abc
idf <- log(length(D) / sum(sapply(D, function(e) 'c' %in% e )))
# Compute the TF-IDF weight of term `t` in document `d` relative to corpus `D`.
#
# Args:
#   t: A single term (length-one character vector).
#   d: A document, given as a character vector of terms.
#   D: The corpus, given as a list of documents (character vectors).
#
# Returns:
#   tf * idf, where tf is the share of `d`'s terms equal to `t` and idf is
#   log(number of documents / number of documents containing `t`).
#   Returns 0 when `d` is empty or does not contain `t`, instead of the NaN
#   that 0 / 0 or 0 * Inf would otherwise produce.
tfidf <- function(t, d, D) {
  n_terms <- length(d)
  if (n_terms == 0) {
    return(0)
  }
  tf <- sum(d == t) / n_terms
  # A term that is absent from the document carries no weight, whatever its
  # document frequency; this also avoids 0 * Inf when no document has it.
  if (isTRUE(tf == 0)) {
    return(0)
  }
  # vapply pins the result to one logical per document.
  doc_freq <- sum(vapply(D, function(e) t %in% e, logical(1)))
  idf <- log(length(D) / doc_freq)
  tf * idf
}
tfidf('c', abc, D)
[1] 0.3662041
jiebaR
#install.packages('jiebaR')
library(jiebaR)
s <- '大巨蛋案對市府同仁下封口令?柯P否認'
mixseg <- worker()
segment(code = s, jiebar = mixseg)
edit_dict()
library(tm)
e3 <- 'Hello, I am David, I have taken over 100 courses~~~'
e3.list <- strsplit(e3, ' ')
e3.corpus <- Corpus(VectorSource(e3.list))
e3.dtm <- DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
e3.dtm <- DocumentTermMatrix(e3.corpus, control = list(wordLengths = c(1,20)))
inspect(e3.dtm)
getTransformations()
doc <- tm_map(e3.corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
doc <- tm_map(doc, stemDocument)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
# Content transformer that deletes every '~' from a document's text.
removetilde <- content_transformer(function(txt) {
  gsub('~', '', txt)
})
doc <- tm_map(e3.corpus, removetilde)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
e1 <- 'this is a book'
e2 <- 'this is my car'
e.list <- strsplit(c(e1, e2), ' ')
e.corpus <- Corpus(VectorSource(e.list))
e.dtm <- DocumentTermMatrix(e.corpus, control = list(weighting = function(x)
weightTfIdf(x, normalize = FALSE)))
?DocumentTermMatrix
inspect(e.dtm)
Chinese DocumentTermMatrix
sapply(s.vec, function(t) {
if (t %in% names(synonym_dict)){
synonym_dict[t]
} else{
t
}
})
$大巨蛋
[1] "大巨蛋"
$案對
[1] "案對"
$市府
[1] "市府"
$同仁
[1] "同仁"
$下
[1] "下"
$封口令
[1] "封口令"
$柯P.柯P
[1] "柯文哲"
$否認
[1] "否認"
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
The Application of DocumentTermMatrix
download.file('https://github.com/ywchiu/rtibame/raw/master/data/applenews20160925.RData', 'applenews.RData')
load('applenews.RData')
library(jiebaR)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
mixseg <- worker()
apple.seg <- lapply(applenews$article, function(e) segment(e, jiebar = mixseg))
s.corpus <- CNCorpus(apple.seg)
doc <- tm_map(s.corpus, removeNumbers)
'jayson'[grepl('[\u4e00-\u9fa5]+', 'jayson')]
# Content transformer that keeps only the elements containing at least one
# character in the range U+4E00..U+9FA5, printing the kept elements as it
# goes. `pattern` is accepted but not used.
removeen <- content_transformer(
  function(x, pattern) {
    kept <- x[grepl('[\u4e00-\u9fa5]+', x)]
    print(kept)
    kept
  }
)
# Keep only the elements of `x` that contain at least one character in the
# range U+4E00..U+9FA5. `pattern` is accepted but not used.
removeen2 <- function(x, pattern) {
  has_cjk <- grepl('[\u4e00-\u9fa5]+', x)
  x[has_cjk]
}
doc <- tm_map(s.corpus, removeen2)
#doc <- tm_map(doc, removeNumbers)
doc
s.dtm <- DocumentTermMatrix(doc, control = list(wordLengths = c(2,Inf), tokenizer = space_tokenizer))
s.dtm
dim(s.dtm)
#s.dtm$dimnames$Terms[nchar(s.dtm$dimnames$Terms) == 21]
s.dtm$dimnames$Terms
findAssocs(s.dtm, '颱風', 0.5)
Clustering
dist(rbind(x,y), method = 'euclidean')
x
y 1.414214
iris clustering

Cosine distance
1- proxy::dist(rbind(a,b), method= "cosine")
a
b 0.9381942
News Clustering
# Load Data
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', destfile = 'appledaily.RData')
load('appledaily.RData')
head(applenews)
# Segmentation
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(article) segment(code= article, jiebar = mixseg))
class(apple.seg)
# Convert segments into corpus
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
s.corpus <- CNCorpus(apple.seg)
s.corpus <- tm_map(s.corpus, removeNumbers)
s.corpus <- tm_map(s.corpus, removePunctuation)
# Build Document Term Matrix
control.list <- list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control = control.list)
dim(s.dtm)
dtm <- removeSparseTerms(s.dtm, 0.99)
dim(dtm)
#s.dtm
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')
dtm.mat <- as.matrix(dtm.dist)
# Print the title of article `query_idx` followed by the titles of the
# articles closest to it.
#
# Reads two objects from the calling environment: `applenews` (its `title`
# element is indexed by article) and `dtm.mat` (row `query_idx` holds the
# distance from that article to every article).
#
# Args:
#   query_idx: Index of the article to look up.
#   n: Maximum number of related articles to list (default 10).
#
# Returns:
#   Invisibly, the character vector of printed "related article" lines.
queryArticle <- function(query_idx, n = 10) {
  #query_idx <- 9
  print(paste('查詢文章:', applenews$title[query_idx]))
  query_idx_score <- dtm.mat[query_idx, ]
  ranked <- order(query_idx_score)
  # Drop the query article explicitly instead of assuming it sorts first;
  # an equally distant article with a lower index would otherwise push the
  # query into its own result list.
  ranked <- ranked[ranked != query_idx]
  # Never index past the available articles (that would yield NA titles).
  related <- ranked[seq_len(min(n, length(ranked)))]
  print(paste('相關文章:', applenews$title[related]))
}
queryArticle(70)
applenews$content[70]
hc <- hclust(dtm.dist, 'ward.D2')
plot(hc, hang = -0.01)
rect.hclust(hc,13)
fit <- cutree(hc, 13 )
applenews$title[fit == 5]
---
title: "Demo20170124"
output: html_notebook
---

## TFIDF
```{r}
# Toy corpus: three documents, each a character vector of terms.
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a, abb, abc)

# tfidf('a', a, D): 1 of 1 terms; 'a' appears in 3 of 3 documents.
tf <- 1 / 1
idf <- log(3 / 3)
tf
idf
tf * idf

# tfidf('a', abb, D): 1 of 3 terms; 'a' appears in 3 of 3 documents.
tf <- 1 / 3
idf <- log(3 / 3)
tf * idf

# tfidf('b', abb, D): 2 of 3 terms; 'b' appears in 2 of 3 documents.
tf <- 2 / 3
idf <- log(3 / 2)
tf * idf

# tfidf('b', abc, D): 1 of 3 terms; 'b' appears in 2 of 3 documents.
tf <- 1 / 3
idf <- log(3 / 2)
tf * idf

# tfidf('c', abc, D): 1 of 3 terms; 'c' appears in 1 of 3 documents.
tf <- 1 / 3
idf <- log(3 / 1)
tf * idf

# Rebuild the corpus, then derive the same numbers from the data.
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a, abb, abc)

# tfidf('c', abc, D)
tf <- sum(abc == 'c') / length(abc)
length(D)
# 'c' %in% abc
idf <- log(length(D) / sum(sapply(D, function(doc) 'c' %in% doc)))


# Compute the TF-IDF weight of term `t` in document `d` relative to corpus `D`.
#
# Args:
#   t: A single term (length-one character vector).
#   d: A document, given as a character vector of terms.
#   D: The corpus, given as a list of documents (character vectors).
#
# Returns:
#   tf * idf, where tf is the share of `d`'s terms equal to `t` and idf is
#   log(number of documents / number of documents containing `t`).
#   Returns 0 when `d` is empty or does not contain `t`, instead of the NaN
#   that 0 / 0 or 0 * Inf would otherwise produce.
tfidf <- function(t, d, D) {
  n_terms <- length(d)
  if (n_terms == 0) {
    return(0)
  }
  tf <- sum(d == t) / n_terms
  # A term that is absent from the document carries no weight, whatever its
  # document frequency; this also avoids 0 * Inf when no document has it.
  if (isTRUE(tf == 0)) {
    return(0)
  }
  # vapply pins the result to one logical per document.
  doc_freq <- sum(vapply(D, function(e) t %in% e, logical(1)))
  idf <- log(length(D) / doc_freq)
  tf * idf
}

tfidf('c', abc, D)

```
## jiebaR
```
#install.packages('jiebaR')
library(jiebaR)

# Segment one Chinese headline with a default jiebaR worker.
s <- '大巨蛋案對市府同仁下封口令?柯P否認'
mixseg <- worker()
segment(code = s, jiebar = mixseg)

# jiebaR's dictionary editing helper (interactive).
edit_dict()

library(tm)

# One English sentence, split on spaces, wrapped into a corpus.
e3 <- 'Hello, I am David, I have taken over 100 courses~~~'
e3.list <- strsplit(e3, split = ' ')
e3.corpus <- Corpus(VectorSource(e3.list))

# Document-term matrix with tm's default settings.
e3.dtm <- DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)

# Same matrix, but accept terms from 1 to 20 characters long.
e3.dtm <- DocumentTermMatrix(
  e3.corpus,
  control = list(wordLengths = c(1, 20))
)
inspect(e3.dtm)

# List the built-in transformations, then apply three of them in sequence.
getTransformations()
doc <- tm_map(e3.corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
doc <- tm_map(doc, stemDocument)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)

# Content transformer that deletes every '~' from a document's text.
removetilde <- content_transformer(function(txt) {
  gsub('~', '', txt)
})

# Apply the custom tilde-stripping transformer and rebuild the matrix.
doc <- tm_map(e3.corpus, removetilde)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)

# Two short sentences, weighted by non-normalised TF-IDF.
e1 <- 'this is a book'
e2 <- 'this is my car'
e.list <- strsplit(c(e1, e2), split = ' ')
e.corpus <- Corpus(VectorSource(e.list))
e.dtm <- DocumentTermMatrix(
  e.corpus,
  control = list(
    weighting = function(x) weightTfIdf(x, normalize = FALSE)
  )
)
?DocumentTermMatrix
inspect(e.dtm)
```

## Chinese DocumentTermMatrix
```{r}
library(jiebaR)
library(tm)

# Two headlines that share several terms.
s <- '大巨蛋案對市府同仁下封口令?柯P否認'
s1 <- '柯P市府近來飽受大巨蛋爭議'

# Segment both with a default jiebaR worker.
mixseg <- worker()
s.vec <- segment(code = s, jiebar = mixseg)
s1.vec <- segment(code = s1, jiebar = mixseg)

# First attempt: tm's stock corpus and default tokenisation.
s.corpus <- Corpus(VectorSource(list(s.vec, s1.vec)))
s.dtm <- DocumentTermMatrix(s.corpus)
inspect(s.dtm)

# Second attempt: the CNCorpus helper sourced from the course repository.
# NOTE(review): `CNCorpus` and `space_tokenizer` are presumably defined by
# this sourced script - confirm.
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')

s.corpus <- CNCorpus(list(s.vec, s1.vec))

# `tokenize` is the control option documented by tm; the former spelling
# `tokenizer` only took effect through `$` partial matching on the list.
s.dtm <- DocumentTermMatrix(
  s.corpus,
  control = list(wordLengths = c(1, Inf), tokenize = space_tokenizer)
)
inspect(s.dtm)


s.vec

# Synonym table: names are the terms to replace, values their canonical form.
# It has to exist before it is queried; looking it up first stops the chunk
# with "object 'synonym_dict' not found" in a fresh session.
synonym_dict <- list('柯P' = '柯文哲')
'柯P' %in% names(synonym_dict)

# Replace each segmented term by its synonym when one is registered.
# `[[` extracts the replacement string itself (`[` would return a
# one-element list), so the result is a plain named character vector;
# vapply enforces exactly one string per term.
vapply(s.vec, function(t) {
  if (t %in% names(synonym_dict)) {
    synonym_dict[[t]]
  } else {
    t
  }
}, character(1))
```

## The Application of DocumentTermMatrix
```
# Fetch the saved news data and load it into the session.
# NOTE(review): the .RData file presumably provides `applenews` - confirm.
download.file(
  'https://github.com/ywchiu/rtibame/raw/master/data/applenews20160925.RData',
  'applenews.RData'
)
load('applenews.RData')

library(jiebaR)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')

# Segment every article, then wrap the segments into a corpus.
mixseg <- worker()
apple.seg <- lapply(applenews$article, function(article) {
  segment(article, jiebar = mixseg)
})

s.corpus <- CNCorpus(apple.seg)

doc <- tm_map(s.corpus, removeNumbers)

# Sanity check of the CJK filter on a pure-ASCII string.
'jayson'[grepl('[\u4e00-\u9fa5]+', 'jayson')]
# Content transformer that keeps only the elements containing at least one
# character in the range U+4E00..U+9FA5, printing the kept elements as it
# goes. `pattern` is accepted but not used.
removeen <- content_transformer(
  function(x, pattern) {
    kept <- x[grepl('[\u4e00-\u9fa5]+', x)]
    print(kept)
    kept
  }
)

# Keep only the elements of `x` that contain at least one character in the
# range U+4E00..U+9FA5. `pattern` is accepted but not used.
removeen2 <- function(x, pattern) {
  has_cjk <- grepl('[\u4e00-\u9fa5]+', x)
  x[has_cjk]
}


# Filter every document down to its CJK-containing terms.
doc <- tm_map(s.corpus, removeen2)
#doc <- tm_map(doc, removeNumbers)

doc

# Build the matrix from terms of at least two characters. `tokenize` is the
# control option documented by tm; the former spelling `tokenizer` only took
# effect through `$` partial matching on the control list.
s.dtm <- DocumentTermMatrix(
  doc,
  control = list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
)
s.dtm
dim(s.dtm)
#s.dtm$dimnames$Terms[nchar(s.dtm$dimnames$Terms) == 21]

s.dtm$dimnames$Terms

# Terms whose correlation with '颱風' is at least 0.5.
findAssocs(s.dtm, '颱風', 0.5)
```
## Clustering
```{r}
# Two binary feature vectors to compare.
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)

# Euclidean distance computed by hand ...
sqrt(sum((x - y)^2))

# ... and through dist() on the two vectors stacked as rows.
dist(rbind(x, y), method = 'euclidean')

```


## iris clustering
```{r}
data(iris)
#iris

# Drop column 5 so that only the four measurements are clustered.
iris_data <- iris[, -5]
hc <- hclust(dist(iris_data, method = 'euclidean'), method = 'ward.D2')
plot(hc)

# Cut the tree into three clusters and show the assignment.
fit <- cutree(hc, k = 3)
fit

# Redraw the dendrogram and box the three clusters.
plot(hc)
rect.hclust(hc, k = 3)

# Side by side: points coloured by species vs. by cluster assignment.
par(mfrow = c(1, 2))
plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species,
     main = "with species information")

plot(Petal.Length ~ Petal.Width, data = iris, col = fit,
     main = "clustering")


```



## Cosine distance
```{r}
# Two count vectors to compare.
a <- c(1, 2, 2, 1, 1, 1, 0)
b <- c(1, 2, 2, 1, 1, 2, 1)

# Cosine similarity by hand: dot product over the product of the norms.
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))

# One minus proxy's "cosine" measure for the same pair.
1 - proxy::dist(rbind(a, b), method = "cosine")


```
## News Clustering

```
# Load Data
download.file(
  'https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData',
  destfile = 'appledaily.RData'
)
load('appledaily.RData')
head(applenews)

# Segmentation: one character vector of terms per article.
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(article) {
  segment(code = article, jiebar = mixseg)
})
class(apple.seg)

# Convert segments into corpus, then strip digits and punctuation.
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
s.corpus <- CNCorpus(apple.seg)
s.corpus <- tm_map(s.corpus, removeNumbers)
s.corpus <- tm_map(s.corpus, removePunctuation)

# Build Document Term Matrix from terms of at least two characters.
control.list <- list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control = control.list)
dim(s.dtm)

# Drop the sparsest terms (threshold 0.99) and compare dimensions.
dtm <- removeSparseTerms(s.dtm, 0.99)
dim(dtm)
#s.dtm

# Pairwise cosine measure between articles, also as a full square matrix.
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')

dtm.mat <- as.matrix(dtm.dist)

# Print the title of article `query_idx` followed by the titles of the
# articles closest to it.
#
# Reads two objects from the calling environment: `applenews` (its `title`
# element is indexed by article) and `dtm.mat` (row `query_idx` holds the
# distance from that article to every article).
#
# Args:
#   query_idx: Index of the article to look up.
#   n: Maximum number of related articles to list (default 10).
#
# Returns:
#   Invisibly, the character vector of printed "related article" lines.
queryArticle <- function(query_idx, n = 10) {
  #query_idx <- 9
  print(paste('查詢文章:', applenews$title[query_idx]))
  query_idx_score <- dtm.mat[query_idx, ]
  ranked <- order(query_idx_score)
  # Drop the query article explicitly instead of assuming it sorts first;
  # an equally distant article with a lower index would otherwise push the
  # query into its own result list.
  ranked <- ranked[ranked != query_idx]
  # Never index past the available articles (that would yield NA titles).
  related <- ranked[seq_len(min(n, length(ranked)))]
  print(paste('相關文章:', applenews$title[related]))
}

# Look up article 70 and show its full text.
queryArticle(70)
applenews$content[70]

# Hierarchical clustering on the article distances, cut into 13 groups.
hc <- hclust(dtm.dist, method = 'ward.D2')
plot(hc, hang = -0.01)
rect.hclust(hc, k = 13)
fit <- cutree(hc, k = 13)

# Titles of the articles assigned to cluster 5.
applenews$title[fit == 5]
```





