Euclidean Distance

x <- c(1,2)
y <- c(3,4)

sqrt(sum((x - y) ^2))
## [1] 2.828427
dist(rbind(x,y))
##          x
## y 2.828427
x <- c(1,2,3,4,5,6,7)
y <- c(3,4,6,3,5,1,2)

rbind(x,y)
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## x    1    2    3    4    5    6    7
## y    3    4    6    3    5    1    2
dist(rbind(x,y))
##          x
## y 8.246211
x <- c(1,2,3,4,5,6,7)
y <- c(3,4,6,3,5,1,2)
z <- c(2,3,2,2,4,2,3)

d <- dist(rbind(x,y,z))
m <- as.matrix(d)
m
##          x        y        z
## x 0.000000 8.246211 6.324555
## y 8.246211 0.000000 4.690416
## z 6.324555 4.690416 0.000000
dist(rbind(x,y,z), method='minkowski', p= 2 )
##          x        y
## y 8.246211         
## z 6.324555 4.690416

Manhattan Distance

x <- c(1,2)
y <- c(3,4)

sum(abs(x - y))
## [1] 4
dist(rbind(x,y), method = 'manhattan')
##   x
## y 4
dist(rbind(x,y,z), method='minkowski', p= 1 )
## Warning in rbind(x, y, z): number of columns of result is not a multiple of
## vector length (arg 1)
##    x  y
## y 14   
## z  8  8

階層式分群

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
head(iris[, 5])
## [1] setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
head(iris[, 1:4])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
head(iris[, -5])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
d  <- dist(iris[,-5], method = 'euclidean')
hc <- hclust(d, method = 'ward.D2')
par(mfrow=c(1,1))
plot(hc, hang = -0.1, cex=0.7)
rect.hclust(hc, 3)

fit <- cutree(hc, k = 3)
#fit

par(mfrow=c(1,2))
plot(Petal.Width ~ Petal.Length, data = iris, col=fit, main = 'Clustered Result')

plot(Petal.Width ~ Petal.Length, data = iris, col=iris$Species, main = 'Iris Data')

## KMeans (iris)

data(iris)

set.seed(22)
fit <- kmeans(iris[,-5], centers = 3)
fit
## K-means clustering with 3 clusters of sizes 50, 62, 38
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
barplot(fit$centers, beside = TRUE, xlab = 'cluster', ylab = 'value')

fit$cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
cluster_centers <- fit$centers[,3:4]
plot(Petal.Length ~ Petal.Width, data = iris, col=fit$cluster)
points(cluster_centers[,2], cluster_centers[,1], col="orange", pch=19, cex=3)

KMeans (Customer)

download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/customers.csv', 'customers.csv')

customer <- read.csv('customers.csv')
customer <- customer[,c('Annual_Income', 'Spending_Score')]

plot(Spending_Score ~ Annual_Income, data = customer)

set.seed(70)
fit <- kmeans(customer, centers = 5)
fit
## K-means clustering with 5 clusters of sizes 23, 39, 81, 22, 35
## 
## Cluster means:
##   Annual_Income Spending_Score
## 1      26.30435       20.91304
## 2      86.53846       82.12821
## 3      55.29630       49.51852
## 4      25.72727       79.36364
## 5      88.20000       17.11429
## 
## Clustering vector:
##   [1] 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1
##  [36] 4 1 4 1 4 1 4 1 3 1 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 5 2 3 2 5 2 5 2 3 2 5 2 5 2 5 2
## [141] 5 2 3 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5
## [176] 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2
## 
## Within cluster sum of squares by cluster:
## [1]  5098.696 13444.051  9875.111  3519.455 12511.143
##  (between_SS / total_SS =  83.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
fit$centers
##   Annual_Income Spending_Score
## 1      26.30435       20.91304
## 2      86.53846       82.12821
## 3      55.29630       49.51852
## 4      25.72727       79.36364
## 5      88.20000       17.11429
cluster_centers <- fit$centers
plot(Spending_Score ~ Annual_Income, data = customer, col = fit$cluster)
points(cluster_centers[,1], cluster_centers[,2], col="orange", pch=19, cex=3)

barplot(fit$centers, beside = TRUE, xlab = 'cluster', ylab = 'value')

set.seed(70)
fit <- kmeans(customer, centers = 3)
fit
## K-means clustering with 3 clusters of sizes 38, 39, 123
## 
## Cluster means:
##   Annual_Income Spending_Score
## 1      87.00000       18.63158
## 2      86.53846       82.12821
## 3      44.15447       49.82927
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [141] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
## [176] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 14204.84 13444.05 78699.48
##  (between_SS / total_SS =  60.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
cluster_centers <- fit$centers
plot(Spending_Score ~ Annual_Income, data = customer, col = fit$cluster)
points(cluster_centers[,1], cluster_centers[,2], col="orange", pch=19, cex=3)

## Silhouette

set.seed(70)
fit <- kmeans(customer, centers = 5)
fit
## K-means clustering with 5 clusters of sizes 23, 39, 81, 22, 35
## 
## Cluster means:
##   Annual_Income Spending_Score
## 1      26.30435       20.91304
## 2      86.53846       82.12821
## 3      55.29630       49.51852
## 4      25.72727       79.36364
## 5      88.20000       17.11429
## 
## Clustering vector:
##   [1] 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1
##  [36] 4 1 4 1 4 1 4 1 3 1 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 5 2 3 2 5 2 5 2 3 2 5 2 5 2 5 2
## [141] 5 2 3 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5
## [176] 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2
## 
## Within cluster sum of squares by cluster:
## [1]  5098.696 13444.051  9875.111  3519.455 12511.143
##  (between_SS / total_SS =  83.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
library(cluster)
kms <- silhouette(fit$cluster, dist(customer))
summary(kms)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = fit$cluster, dist = dist(customer)) :
##  Cluster sizes and average silhouette widths:
##        23        39        81        22        35 
## 0.5122676 0.5091706 0.5966512 0.5990129 0.5039873 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.02338  0.49425  0.59614  0.55393  0.66011  0.75807
plot(kms)

## 如何決定 k 值

a <- c(2,6,8,9,10)
b <- c()
for(ele in a){
  b <- c(b, (ele ^ 2))
}
b
## [1]   4  36  64  81 100
sapply(a, function(ele){ele ^ 2} )
## [1]   4  36  64  81 100
nk <- 2:10
WSS <- sapply(nk,
  function(ele){
    kmeans(customer, center= ele)$tot.withinss
  }
)
plot(nk,WSS, type= 'b')
WSS
## [1] 186234.17 106348.37  73880.64  44448.46  37233.81  35596.82  27444.44
## [8]  22824.40  21031.67
#install.packages('fpc')
library(fpc)
## Warning: package 'fpc' was built under R version 3.4.2

nk <- 2:10
SW <- sapply(nk, function(k){
  cluster.stats(dist(customer), kmeans(customer, k)$cluster)$avg.silwidth
})
plot(nk, SW, type = 'b', xlab= 'Number of Cluster', ylab = 'Silouette Width', main = 'Determine K Value With Silhouette Width')

## Sapply

#method 1
a <- c(2,3,4,5,6)
b <- c()
for(ele in a){
  b <- c(b, ele ^ 2)
}
b
## [1]  4  9 16 25 36
# method 2
a <- c(2,3,4,5,6)
b <- rep(0, length(a))
for(ele in seq_along(a) ){
  b[ele] <- a[ele] ^ 2
  #print(ele)
}
b
## [1]  4  9 16 25 36
#method 3
lapply(a, function(ele) ele ^ 2)
## [[1]]
## [1] 4
## 
## [[2]]
## [1] 9
## 
## [[3]]
## [1] 16
## 
## [[4]]
## [1] 25
## 
## [[5]]
## [1] 36
unlist(lapply(a, function(ele) ele ^ 2))
## [1]  4  9 16 25 36
#method 4
sapply(a, function(ele) ele ^ 2)
## [1]  4  9 16 25 36
#Method 4 > Method 3 > Method 2 > Method 1

比較不同分群演算法

single_c <- hclust(dist(customer), method = 'single')
hc_single <- cutree(single_c, 5)

complete_c <- hclust(dist(customer), method = 'complete')
hc_complete <- cutree(complete_c, 5)

ward_c <- hclust(dist(customer), method = 'ward.D2')
hc_ward <- cutree(ward_c, 5)

set.seed(70)
km <- kmeans(customer, 5)

cs <- cluster.stats(dist(customer), km$cluster)
cs[c('within.cluster.ss', 'avg.silwidth')]
## $within.cluster.ss
## [1] 44448.46
## 
## $avg.silwidth
## [1] 0.553932
plot(ward_c)

sapply(list(kmeans = km$cluster, single = hc_single, complete = hc_complete, ward = hc_ward), function(c) cluster.stats(dist(customer), c)[c('within.cluster.ss', 'avg.silwidth')])
##                   kmeans   single    complete  ward     
## within.cluster.ss 44448.46 232196.4  45101.51  45101.51 
## avg.silwidth      0.553932 0.2694896 0.5529946 0.5529946

Iris Clustering

data(iris)
data   <- iris[, -5]
target <- iris[,5]

result <- kmeans(data, 3)
result$size
## [1] 38 50 62
result$cluster
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1
## [106] 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1
## [141] 1 1 3 1 1 1 3 1 1 3
res <- factor(result$cluster, levels=c(2,3,1), labels = c(1,2,3))
table(target, res)
##             res
## target        1  2  3
##   setosa     50  0  0
##   versicolor  0 48  2
##   virginica   0 14 36
par(mfrow=c(2,2))
plot(Petal.Width ~ Petal.Length, data = iris, col=target)
plot(Petal.Width ~ Petal.Length, data = iris, col=res)
plot(Sepal.Width ~ Sepal.Length, data = iris, col=target)
plot(Sepal.Width ~ Sepal.Length, data = iris, col=res)

文章相似度比對

a <- c(  1    , 1    , 1)
b <- c(  1    , 1    , 1)
dist(rbind(a,b))

a <- c(  1    , 1    , 1, 0, 0, 0, 0, 0,  0, 0, 0)
b <- c(  1    , 1    , 1, 1, 1, 1, 1, 1 , 1, 1, 1)
dist(rbind(a,b))

a <- c(1,1,0,0,1)
b <- c(1,0,1,1,0)
sum(a * b)

a <- c(1,2,2,1,1,1,0)
b <- c(1,2,2,1,1,2,1)
cs <- sum(a*b) / (sqrt(sum(a ^ 2)) * sqrt(sum(b ^ 2)))
cd <- 1 - cs
cd

#install.packages('proxy')
library(proxy)
?proxy::dist
proxy::dist(rbind(a,b), method = 'cosine')
proxy::simil(rbind(a,b), method = 'cosine')

# https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
load('applenews.RData')

library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(news){
  segment(news, jiebar = mixseg)
})

library(tm)
corpus <- Corpus(VectorSource(apple.seg))
dtm    <- DocumentTermMatrix(corpus)
dtm.remove <- removeSparseTerms(dtm, 0.99)
cs <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
csm <- as.matrix(cs)

applenews$title[order(csm[20,])][1:10]

article.query = function(idx){
  applenews$title[as.integer(names(sort(csm[idx, which(csm[idx,] < 0.8)])))]
}
article.query(18)[1:10]

自動問答機器人

library(rvest)
disease_links <- read_html('http://www.cdc.gov.tw/disease.aspx?treeid=8d54c504e820735b&nowtreeid=dec84a2f0c6fac5b') %>%
  html_nodes('.list_content td a') %>% html_attr('href') %>% 
  paste0('http://www.cdc.gov.tw/', .)


getDisease <- function(url){
  content <- read_html(url) %>%
    html_nodes('.in_p')
  
  qa <- list()
  for (p in content){
    header  <- p %>% html_nodes('h2') %>% 
      html_text() %>% trimws()
    article <- p %>% html_nodes('p') %>% 
      html_text() %>% trimws()
    qa[header] = article
    #qa <- cbind(qa, )
  }
  df <- data.frame(matrix(unlist(qa), nrow=1, byrow=T), stringsAsFactors = FALSE)
  
  colnames(df) <- names(qa)
  return(df)
}

url <- 'http://www.cdc.gov.tw/diseaseinfo.aspx?treeid=8d54c504e820735b&nowtreeid=dec84a2f0c6fac5b&tid=77BFF3D4F9CB7982'
getDisease(url)

res <- lapply(disease_links[1:10], getDisease)
library(plyr)
cdc_qa <- do.call(rbind.fill, res)
write.csv(x = cdc_qa, file ='cdc_qa.csv')


library(jiebaR)
mixseg <- worker()

qa_set <- c('我今天被不知名物種咬了, 感覺頭痛, 有點發燒, 淋巴結腫大, 請問醫生該怎麼辦', cdc_qa$`發病症狀:`)

qa_seg <- lapply(qa_set, function(e) segment(e, jiebar = mixseg))

doc <- Corpus(VectorSource(qa_seg))
dtm <- DocumentTermMatrix(doc)
m <- as.matrix(dtm)
cs <- proxy::dist(m, method = 'cosine')
csm <- as.matrix(cs)

rank <- order(csm[1,])[2:5] -1
cdc_qa$`治療方法與就醫資訊:`[rank]

文章分群

# https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
load('applenews.RData')

library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(news){
  segment(news, jiebar = mixseg)
})

library(tm)
corpus <- Corpus(VectorSource(apple.seg))
dtm    <- DocumentTermMatrix(corpus)
dtm.remove <- removeSparseTerms(dtm, 0.99)
dim(dtm)
dim(dtm.remove)

cs <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')

hc <- hclust(cs, method ='ward.D2')
plot(hc)
rect.hclust(hc, 13)

fit <- cutree(hc, 13)


applenews$title[fit == 10]


kc <- kmeans(as.matrix(dtm.remove), 13)
applenews$title[kc$cluster == 13]


m <- as.matrix(dtm.remove)
word_rank <- order(colSums(m[kc$cluster == 13,]), decreasing = TRUE)
dtm.remove$dimnames$Terms[word_rank[1:10]]

igraph

#install.packages('igraph')
library(igraph)

mg <- make_graph(m)
wc <- cluster_walktrap(karate)
modularity(wc)

cs <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')

csm <- as.matrix(cs)

filter_score <- function(x) {
  x <- ifelse(x < 0.5, 1, 0)
  return(x)
}
score.csm <- apply(csm, MARGIN = 2, filter_score)

score.csm[1:10,1:10]

G <- graph_from_adjacency_matrix(score.csm, mode = c( "undirected"))

wc <- cluster_walktrap(G)
applenews$title[wc$membership == 7]