x <- c(1,2)
y <- c(3,4)
sqrt(sum((x - y) ^2))
## [1] 2.828427
dist(rbind(x,y))
## x
## y 2.828427
x <- c(1,2,3,4,5,6,7)
y <- c(3,4,6,3,5,1,2)
rbind(x,y)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## x 1 2 3 4 5 6 7
## y 3 4 6 3 5 1 2
dist(rbind(x,y))
## x
## y 8.246211
x <- c(1,2,3,4,5,6,7)
y <- c(3,4,6,3,5,1,2)
z <- c(2,3,2,2,4,2,3)
d <- dist(rbind(x,y,z))
m <- as.matrix(d)
m
## x y z
## x 0.000000 8.246211 6.324555
## y 8.246211 0.000000 4.690416
## z 6.324555 4.690416 0.000000
dist(rbind(x,y,z), method='minkowski', p= 2 )
## x y
## y 8.246211
## z 6.324555 4.690416
x <- c(1,2)
y <- c(3,4)
sum(abs(x - y))
## [1] 4
dist(rbind(x,y), method = 'manhattan')
## x
## y 4
dist(rbind(x,y,z), method='minkowski', p= 1 )
## Warning in rbind(x, y, z): number of columns of result is not a multiple of
## vector length (arg 1)
## x y
## y 14
## z 8 8
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(iris[, 5])
## [1] setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
head(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
head(iris[, -5])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
d <- dist(iris[,-5], method = 'euclidean')
hc <- hclust(d, method = 'ward.D2')
par(mfrow=c(1,1))
plot(hc, hang = -0.1, cex=0.7)
rect.hclust(hc, 3)
fit <- cutree(hc, k = 3)
#fit
par(mfrow=c(1,2))
plot(Petal.Width ~ Petal.Length, data = iris, col=fit, main = 'Clustered Result')
plot(Petal.Width ~ Petal.Length, data = iris, col=iris$Species, main = 'Iris Data')
## KMeans (iris)
data(iris)
set.seed(22)
fit <- kmeans(iris[,-5], centers = 3)
fit
## K-means clustering with 3 clusters of sizes 50, 62, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
barplot(fit$centers, beside = TRUE, xlab = 'cluster', ylab = 'value')
fit$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
cluster_centers <- fit$centers[,3:4]
plot(Petal.Length ~ Petal.Width, data = iris, col=fit$cluster)
points(cluster_centers[,2], cluster_centers[,1], col="orange", pch=19, cex=3)
download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/customers.csv', 'customers.csv')
customer <- read.csv('customers.csv')
customer <- customer[,c('Annual_Income', 'Spending_Score')]
plot(Spending_Score ~ Annual_Income, data = customer)
set.seed(70)
fit <- kmeans(customer, centers = 5)
fit
## K-means clustering with 5 clusters of sizes 23, 39, 81, 22, 35
##
## Cluster means:
## Annual_Income Spending_Score
## 1 26.30435 20.91304
## 2 86.53846 82.12821
## 3 55.29630 49.51852
## 4 25.72727 79.36364
## 5 88.20000 17.11429
##
## Clustering vector:
## [1] 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1
## [36] 4 1 4 1 4 1 4 1 3 1 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 5 2 3 2 5 2 5 2 3 2 5 2 5 2 5 2
## [141] 5 2 3 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5
## [176] 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2
##
## Within cluster sum of squares by cluster:
## [1] 5098.696 13444.051 9875.111 3519.455 12511.143
## (between_SS / total_SS = 83.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
fit$centers
## Annual_Income Spending_Score
## 1 26.30435 20.91304
## 2 86.53846 82.12821
## 3 55.29630 49.51852
## 4 25.72727 79.36364
## 5 88.20000 17.11429
cluster_centers <- fit$centers
plot(Spending_Score ~ Annual_Income, data = customer, col = fit$cluster)
points(cluster_centers[,1], cluster_centers[,2], col="orange", pch=19, cex=3)
barplot(fit$centers, beside = TRUE, xlab = 'cluster', ylab = 'value')
set.seed(70)
fit <- kmeans(customer, centers = 3)
fit
## K-means clustering with 3 clusters of sizes 38, 39, 123
##
## Cluster means:
## Annual_Income Spending_Score
## 1 87.00000 18.63158
## 2 86.53846 82.12821
## 3 44.15447 49.82927
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [141] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
## [176] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 14204.84 13444.05 78699.48
## (between_SS / total_SS = 60.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
cluster_centers <- fit$centers
plot(Spending_Score ~ Annual_Income, data = customer, col = fit$cluster)
points(cluster_centers[,1], cluster_centers[,2], col="orange", pch=19, cex=3)
## Silhouette
set.seed(70)
fit <- kmeans(customer, centers = 5)
fit
## K-means clustering with 5 clusters of sizes 23, 39, 81, 22, 35
##
## Cluster means:
## Annual_Income Spending_Score
## 1 26.30435 20.91304
## 2 86.53846 82.12821
## 3 55.29630 49.51852
## 4 25.72727 79.36364
## 5 88.20000 17.11429
##
## Clustering vector:
## [1] 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1
## [36] 4 1 4 1 4 1 4 1 3 1 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 5 2 3 2 5 2 5 2 3 2 5 2 5 2 5 2
## [141] 5 2 3 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5
## [176] 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2
##
## Within cluster sum of squares by cluster:
## [1] 5098.696 13444.051 9875.111 3519.455 12511.143
## (between_SS / total_SS = 83.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
library(cluster)
kms <- silhouette(fit$cluster, dist(customer))
summary(kms)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = fit$cluster, dist = dist(customer)) :
## Cluster sizes and average silhouette widths:
## 23 39 81 22 35
## 0.5122676 0.5091706 0.5966512 0.5990129 0.5039873
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.02338 0.49425 0.59614 0.55393 0.66011 0.75807
plot(kms)
## 如何決定 k 值
a <- c(2,6,8,9,10)
b <- c()
for(ele in a){
b <- c(b, (ele ^ 2))
}
b
## [1] 4 36 64 81 100
sapply(a, function(ele){ele ^ 2} )
## [1] 4 36 64 81 100
nk <- 2:10
WSS <- sapply(nk,
function(ele){
kmeans(customer, center= ele)$tot.withinss
}
)
plot(nk,WSS, type= 'b')
WSS
## [1] 186234.17 106348.37 73880.64 44448.46 37233.81 35596.82 27444.44
## [8] 22824.40 21031.67
#install.packages('fpc')
library(fpc)
## Warning: package 'fpc' was built under R version 3.4.2
nk <- 2:10
SW <- sapply(nk, function(k){
cluster.stats(dist(customer), kmeans(customer, k)$cluster)$avg.silwidth
})
plot(nk, SW, type = 'b', xlab= 'Number of Cluster', ylab = 'Silouette Width', main = 'Determine K Value With Silhouette Width')
## Sapply
#method 1
a <- c(2,3,4,5,6)
b <- c()
for(ele in a){
b <- c(b, ele ^ 2)
}
b
## [1] 4 9 16 25 36
# method 2
a <- c(2,3,4,5,6)
b <- rep(0, length(a))
for(ele in seq_along(a) ){
b[ele] <- a[ele] ^ 2
#print(ele)
}
b
## [1] 4 9 16 25 36
#method 3
lapply(a, function(ele) ele ^ 2)
## [[1]]
## [1] 4
##
## [[2]]
## [1] 9
##
## [[3]]
## [1] 16
##
## [[4]]
## [1] 25
##
## [[5]]
## [1] 36
unlist(lapply(a, function(ele) ele ^ 2))
## [1] 4 9 16 25 36
#method 4
sapply(a, function(ele) ele ^ 2)
## [1] 4 9 16 25 36
#Method 4 > Method 3 > Method 2 > Method 1
single_c <- hclust(dist(customer), method = 'single')
hc_single <- cutree(single_c, 5)
complete_c <- hclust(dist(customer), method = 'complete')
hc_complete <- cutree(complete_c, 5)
ward_c <- hclust(dist(customer), method = 'ward.D2')
hc_ward <- cutree(ward_c, 5)
set.seed(70)
km <- kmeans(customer, 5)
cs <- cluster.stats(dist(customer), km$cluster)
cs[c('within.cluster.ss', 'avg.silwidth')]
## $within.cluster.ss
## [1] 44448.46
##
## $avg.silwidth
## [1] 0.553932
plot(ward_c)
sapply(list(kmeans = km$cluster, single = hc_single, complete = hc_complete, ward = hc_ward), function(c) cluster.stats(dist(customer), c)[c('within.cluster.ss', 'avg.silwidth')])
## kmeans single complete ward
## within.cluster.ss 44448.46 232196.4 45101.51 45101.51
## avg.silwidth 0.553932 0.2694896 0.5529946 0.5529946
data(iris)
data <- iris[, -5]
target <- iris[,5]
result <- kmeans(data, 3)
result$size
## [1] 38 50 62
result$cluster
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1
## [106] 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1
## [141] 1 1 3 1 1 1 3 1 1 3
res <- factor(result$cluster, levels=c(2,3,1), labels = c(1,2,3))
table(target, res)
## res
## target 1 2 3
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 14 36
par(mfrow=c(2,2))
plot(Petal.Width ~ Petal.Length, data = iris, col=target)
plot(Petal.Width ~ Petal.Length, data = iris, col=res)
plot(Sepal.Width ~ Sepal.Length, data = iris, col=target)
plot(Sepal.Width ~ Sepal.Length, data = iris, col=res)
a <- c( 1 , 1 , 1)
b <- c( 1 , 1 , 1)
dist(rbind(a,b))
a <- c( 1 , 1 , 1, 0, 0, 0, 0, 0, 0, 0, 0)
b <- c( 1 , 1 , 1, 1, 1, 1, 1, 1 , 1, 1, 1)
dist(rbind(a,b))
a <- c(1,1,0,0,1)
b <- c(1,0,1,1,0)
sum(a * b)
a <- c(1,2,2,1,1,1,0)
b <- c(1,2,2,1,1,2,1)
cs <- sum(a*b) / (sqrt(sum(a ^ 2)) * sqrt(sum(b ^ 2)))
cd <- 1 - cs
cd
#install.packages('proxy')
library(proxy)
?proxy::dist
proxy::dist(rbind(a,b), method = 'cosine')
proxy::simil(rbind(a,b), method = 'cosine')
# https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
load('applenews.RData')
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(news){
segment(news, jiebar = mixseg)
})
library(tm)
corpus <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(corpus)
dtm.remove <- removeSparseTerms(dtm, 0.99)
cs <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
csm <- as.matrix(cs)
applenews$title[order(csm[20,])][1:10]
article.query = function(idx){
applenews$title[as.integer(names(sort(csm[idx, which(csm[idx,] < 0.8)])))]
}
article.query(18)[1:10]
library(rvest)
disease_links <- read_html('http://www.cdc.gov.tw/disease.aspx?treeid=8d54c504e820735b&nowtreeid=dec84a2f0c6fac5b') %>%
html_nodes('.list_content td a') %>% html_attr('href') %>%
paste0('http://www.cdc.gov.tw/', .)
getDisease <- function(url){
content <- read_html(url) %>%
html_nodes('.in_p')
qa <- list()
for (p in content){
header <- p %>% html_nodes('h2') %>%
html_text() %>% trimws()
article <- p %>% html_nodes('p') %>%
html_text() %>% trimws()
qa[header] = article
#qa <- cbind(qa, )
}
df <- data.frame(matrix(unlist(qa), nrow=1, byrow=T), stringsAsFactors = FALSE)
colnames(df) <- names(qa)
return(df)
}
url <- 'http://www.cdc.gov.tw/diseaseinfo.aspx?treeid=8d54c504e820735b&nowtreeid=dec84a2f0c6fac5b&tid=77BFF3D4F9CB7982'
getDisease(url)
res <- lapply(disease_links[1:10], getDisease)
library(plyr)
cdc_qa <- do.call(rbind.fill, res)
write.csv(x = cdc_qa, file ='cdc_qa.csv')
library(jiebaR)
mixseg <- worker()
qa_set <- c('我今天被不知名物種咬了, 感覺頭痛, 有點發燒, 淋巴結腫大, 請問醫生該怎麼辦', cdc_qa$`發病症狀:`)
qa_seg <- lapply(qa_set, function(e) segment(e, jiebar = mixseg))
doc <- Corpus(VectorSource(qa_seg))
dtm <- DocumentTermMatrix(doc)
m <- as.matrix(dtm)
cs <- proxy::dist(m, method = 'cosine')
csm <- as.matrix(cs)
rank <- order(csm[1,])[2:5] -1
cdc_qa$`治療方法與就醫資訊:`[rank]
# https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
load('applenews.RData')
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(news){
segment(news, jiebar = mixseg)
})
library(tm)
corpus <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(corpus)
dtm.remove <- removeSparseTerms(dtm, 0.99)
dim(dtm)
dim(dtm.remove)
cs <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
hc <- hclust(cs, method ='ward.D2')
plot(hc)
rect.hclust(hc, 13)
fit <- cutree(hc, 13)
applenews$title[fit == 10]
kc <- kmeans(as.matrix(dtm.remove), 13)
applenews$title[kc$cluster == 13]
m <- as.matrix(dtm.remove)
word_rank <- order(colSums(m[kc$cluster == 13,]), decreasing = TRUE)
dtm.remove$dimnames$Terms[word_rank[1:10]]
#install.packages('igraph')
library(igraph)
mg <- make_graph(m)
wc <- cluster_walktrap(karate)
modularity(wc)
cs <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
csm <- as.matrix(cs)
filter_score <- function(x) {
x <- ifelse(x < 0.5, 1, 0)
return(x)
}
score.csm <- apply(csm, MARGIN = 2, filter_score)
score.csm[1:10,1:10]
G <- graph_from_adjacency_matrix(score.csm, mode = c( "undirected"))
wc <- cluster_walktrap(G)
applenews$title[wc$membership == 7]