KMeans
data(iris)
set.seed(22)
fit <- kmeans(iris[,-5], 3)
fit
## K-means clustering with 3 clusters of sizes 50, 62, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
barplot(t(fit$centers), beside=TRUE, xlab = 'cluster', ylab = 'value')

par(mfrow=c(1,2))
plot(Petal.Width ~ Petal.Length, data = iris, col = Species, main = 'Original Answer' )
plot(Petal.Width ~ Petal.Length, data = iris, col = fit$cluster , main = 'Clustered Result' )

par(mfrow=c(1,1))
plot(Petal.Width ~ Petal.Length, data = iris, col = fit$cluster , main = 'Clustered Result' )
points(Petal.Width ~ Petal.Length, data = fit$centers, col = 'orange', pch = 19, cex =3)

Silhouette
#library(fpc)
library(cluster)
set.seed(22)
km <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
## Cluster sizes and average silhouette widths:
## 50 62 38
## 0.7981405 0.4173199 0.4511051
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391
plot(kms)

Determine K-Value
# method 1
set.seed(42)
WSS <- c()
for (k in 2:10){
km <- kmeans(iris[,-5], k)
WSS <- c(WSS,km$tot.withinss)
}
plot(x = 2:10, y = WSS, type = 'l')
# method 2
set.seed(42)
WSS <- rep(0, 9) # pre-allocate vector
for (k in 2:10){
km <- kmeans(iris[,-5], k)
WSS[k-1] <- km$tot.withinss
}
plot(x = 2:10, y = WSS, type = 'l')

# method 3
set.seed(42)
nk <- 2:10
WSS <- sapply(nk, function(k) kmeans(iris[,-5], k)$tot.withinss)
plot(x = nk, y = WSS, type = 'l')
## Using Silhouette
library(fpc)
## Warning: package 'fpc' was built under R version 3.4.4

set.seed(88)
nk <- 2:10
SW <- sapply(nk, function(k) {
cluster.stats(dist(iris[,-5]), kmeans(iris[,-5],
centers=k)$cluster)$avg.silwidth
})
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")

比較不同分群方法
single_c <- hclust(dist(iris[,-5]), method="single")
plot(single_c, hang= -0.1)

hc_single <- cutree(single_c, k = 3)
complete_c <- hclust(dist(iris[,-5]), method="complete")
plot(complete_c, hang= -0.1)

hc_complete <- cutree(complete_c, k = 3)
ward_c <- hclust(dist(iris[,-5]), method="ward.D2")
plot(ward_c, hang= -0.1)

hc_ward <- cutree(ward_c, k = 3)
set.seed(42)
kmeans_c <- kmeans(iris[,-5], 3)
cs <- cluster.stats(dist(iris[,-5]), kmeans_c$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 78.85144
##
## $avg.silwidth
## [1] 0.552819
sapply(
list(
kmeans = kmeans_c$cluster,
hc_single = hc_single,
hc_complete = hc_complete,
hc_ward = hc_ward),
function(c) {
cluster.stats(dist(iris[,-5]),
c)[c("within.cluster.ss","avg.silwidth")]
})
## kmeans hc_single hc_complete hc_ward
## within.cluster.ss 78.85144 142.4794 89.52501 79.29713
## avg.silwidth 0.552819 0.5121108 0.5135953 0.5543237
發病區域分析
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
url <- 'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/Dengue_2015_09_Tainan.csv'
Dengue <- read_csv(url)
## `curl` package not installed, falling back to using `url()`
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## 最小統計區中心點X = col_double(),
## 最小統計區中心點Y = col_double(),
## 發病日 = col_date(format = "")
## )
head(Dengue)
## # A tibble: 6 x 4
## X1 最小統計區中心點X 最小統計區中心點Y 發病日
## <int> <dbl> <dbl> <date>
## 1 1 120. 23.0 2015-09-01
## 2 2 120. 23.0 2015-09-01
## 3 3 120. 23.0 2015-09-01
## 4 4 120. 23.0 2015-09-01
## 5 5 120. 23.0 2015-09-01
## 6 6 120. 23.0 2015-09-01
Dengue <- Dengue[,c('最小統計區中心點X',
'最小統計區中心點Y')]
sum(is.na(Dengue))
## [1] 22
Dengue <- na.omit(Dengue)
plot(最小統計區中心點Y~最小統計區中心點X, data = Dengue)

set.seed(123)
kc <- kmeans(Dengue, centers = 3)
plot(最小統計區中心點Y~最小統計區中心點X, data = Dengue, col=kc$cluster)
points(最小統計區中心點Y~最小統計區中心點X, data = kc$centers, col='orange', pch = 19, cex = 3)

nrow(Dengue)
## [1] 12951
set.seed(123)
kc <- kmeans(Dengue, centers = 12)
plot(最小統計區中心點Y~最小統計區中心點X, data = Dengue, col=kc$cluster)
points(最小統計區中心點Y~最小統計區中心點X, data = kc$centers, col='orange', pch = 19, cex = 3)

DBSCAN
set.seed(42)
idx <- sample.int(2, nrow(Dengue), replace=TRUE)
dataset <- Dengue[idx==1,]
plot(最小統計區中心點Y~最小統計區中心點X, data = dataset)

rm(Dengue)
library(fpc)
res <- dbscan(dataset,eps=0.1,MinPts=3)
table(res$cluster)
##
## 0 1 2 3 4 5 6
## 16 6353 14 32 6 6 4
plot(最小統計區中心點Y~最小統計區中心點X, data = dataset, col= res$cluster)

plotcluster(dataset, res$cluster)

Homework 6
## Answer1
dataset <- c(0.70,0.45,0.72,0.30,1.16,0.69,0.83,0.74,1.24,0.77, 0.65,0.76,0.42,0.94,0.36,0.98,0.64,0.90,0.63,0.55, 0.78,0.10,0.52,0.42,0.58,0.62,1.12,0.86,0.74,1.04, 0.65,0.66,0.81,0.48,0.85,0.75,0.73,0.50,0.34,0.88)
# Get Median
median(dataset)
## [1] 0.71
length(dataset) / 2
## [1] 20
sort(dataset)[(length(dataset) / 2) ]
## [1] 0.7
sort(dataset)[(length(dataset) / 2) + 1 ]
## [1] 0.72
# Get Mean
mean(dataset)
## [1] 0.6965
sum(dataset) / length(dataset)
## [1] 0.6965
# Get Quantile
quantile(dataset, 0.25)
## 25%
## 0.5425
quantile(dataset, 0.75)
## 75%
## 0.835
# Get Summary
summary(dataset)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1000 0.5425 0.7100 0.6965 0.8350 1.2400
## Answer2
avg_dataset <- (0 * 12 + 1 * 24 + 2 * 42 + 3 * 38 + 4 * 30 + 5 * 4) / (12 +24 + 42 + 38 + 30 + 4)
dataset <- c(rep(0,12), rep(1,24), rep(2,42), rep(3, 38), rep(4, 30), rep(5, 4) )
mean(dataset)
## [1] 2.413333
avg_dataset
## [1] 2.413333
inject_times <- c(0,1,2,3,4,5)
people_numbers <- c(12,24,42,38,30,4)
avg_dataset <- sum(inject_times * people_numbers) / sum(people_numbers)
sqrt(sum(((inject_times - avg_dataset) ^ 2) * people_numbers) / (sum(people_numbers) - 1))
## [1] 1.270135
sd(dataset)
## [1] 1.270135