Demo20180703

KMeans

data(iris)
set.seed(22)
fit <- kmeans(iris[,-5], 3)
fit

## K-means clustering with 3 clusters of sizes 50, 62, 38
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

barplot(t(fit$centers), beside=TRUE, xlab = 'cluster', ylab = 'value')

par(mfrow=c(1,2))
plot(Petal.Width ~ Petal.Length, data = iris, col = Species, main = 'Original Answer' )
plot(Petal.Width ~ Petal.Length, data = iris, col = fit$cluster , main = 'Clustered Result' )

par(mfrow=c(1,1))
plot(Petal.Width ~ Petal.Length, data = iris, col = fit$cluster , main = 'Clustered Result' )

points(Petal.Width ~ Petal.Length, data = fit$centers, col = 'orange', pch = 19, cex =3)

Silhouette

#library(fpc)
library(cluster)
set.seed(22)
km  <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
summary(kms)

## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
##  Cluster sizes and average silhouette widths:
##        50        62        38 
## 0.7981405 0.4173199 0.4511051 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391

plot(kms)

Determine K-Value

# method 1
set.seed(42)
WSS <- c()
for (k in 2:10){
  km  <- kmeans(iris[,-5], k)
  WSS <- c(WSS,km$tot.withinss)
}
plot(x = 2:10, y = WSS, type = 'l')


# method 2 
set.seed(42)
WSS <- rep(0, 9) # pre-allocate vector
for (k in 2:10){
  km  <- kmeans(iris[,-5], k)
  WSS[k-1] <- km$tot.withinss
}
plot(x = 2:10, y = WSS, type = 'l')

# method 3
set.seed(42)
nk <- 2:10
WSS <- sapply(nk, function(k) kmeans(iris[,-5], k)$tot.withinss)
plot(x = nk, y = WSS, type = 'l')


## Using Silhouette
library(fpc)

## Warning: package 'fpc' was built under R version 3.4.4

set.seed(88)
nk <- 2:10
SW <- sapply(nk, function(k) {
    cluster.stats(dist(iris[,-5]), kmeans(iris[,-5],
    centers=k)$cluster)$avg.silwidth
})
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")

比較不同分群方法

single_c <- hclust(dist(iris[,-5]), method="single")
plot(single_c, hang= -0.1)

hc_single <- cutree(single_c, k = 3)


complete_c <- hclust(dist(iris[,-5]), method="complete")
plot(complete_c, hang= -0.1)

hc_complete <- cutree(complete_c, k = 3)

ward_c <- hclust(dist(iris[,-5]), method="ward.D2")
plot(ward_c, hang= -0.1)

hc_ward <- cutree(ward_c, k = 3)

set.seed(42)
kmeans_c <- kmeans(iris[,-5], 3)

cs <- cluster.stats(dist(iris[,-5]), kmeans_c$cluster)
cs[c("within.cluster.ss","avg.silwidth")]

## $within.cluster.ss
## [1] 78.85144
## 
## $avg.silwidth
## [1] 0.552819

sapply(
  list(
    kmeans      = kmeans_c$cluster, 
    hc_single   = hc_single, 
    hc_complete = hc_complete,
    hc_ward     = hc_ward), 
  function(c) {
    cluster.stats(dist(iris[,-5]),
c)[c("within.cluster.ss","avg.silwidth")]
    })

##                   kmeans   hc_single hc_complete hc_ward  
## within.cluster.ss 78.85144 142.4794  89.52501    79.29713 
## avg.silwidth      0.552819 0.5121108 0.5135953   0.5543237

發病區域分析

library(readr)

## Warning: package 'readr' was built under R version 3.4.4

url <- 'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/Dengue_2015_09_Tainan.csv'
Dengue <- read_csv(url)

## `curl` package not installed, falling back to using `url()`

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   最小統計區中心點X = col_double(),
##   最小統計區中心點Y = col_double(),
##   發病日 = col_date(format = "")
## )

head(Dengue)

## # A tibble: 6 x 4
##      X1 最小統計區中心點X 最小統計區中心點Y 發病日    
##   <int>             <dbl>             <dbl> <date>    
## 1     1              120.              23.0 2015-09-01
## 2     2              120.              23.0 2015-09-01
## 3     3              120.              23.0 2015-09-01
## 4     4              120.              23.0 2015-09-01
## 5     5              120.              23.0 2015-09-01
## 6     6              120.              23.0 2015-09-01

Dengue <- Dengue[,c('最小統計區中心點X',
'最小統計區中心點Y')]


sum(is.na(Dengue))

## [1] 22

Dengue <- na.omit(Dengue)


plot(最小統計區中心點Y~最小統計區中心點X, data = Dengue)

set.seed(123)
kc <- kmeans(Dengue, centers = 3)
plot(最小統計區中心點Y~最小統計區中心點X, data = Dengue, col=kc$cluster)
points(最小統計區中心點Y~最小統計區中心點X, data = kc$centers, col='orange', pch = 19, cex = 3)

nrow(Dengue)

## [1] 12951

set.seed(123)
kc <- kmeans(Dengue, centers = 12)
plot(最小統計區中心點Y~最小統計區中心點X, data = Dengue, col=kc$cluster)
points(最小統計區中心點Y~最小統計區中心點X, data = kc$centers, col='orange', pch = 19, cex = 3)

DBSCAN

set.seed(42)
idx <- sample.int(2, nrow(Dengue), replace=TRUE)
dataset <- Dengue[idx==1,]
plot(最小統計區中心點Y~最小統計區中心點X, data = dataset)

rm(Dengue)

library(fpc)
res <- dbscan(dataset,eps=0.1,MinPts=3)
table(res$cluster)

## 
##    0    1    2    3    4    5    6 
##   16 6353   14   32    6    6    4

plot(最小統計區中心點Y~最小統計區中心點X, data = dataset, col= res$cluster)

plotcluster(dataset, res$cluster)

Homework 6

## Answer1
dataset <- c(0.70,0.45,0.72,0.30,1.16,0.69,0.83,0.74,1.24,0.77, 0.65,0.76,0.42,0.94,0.36,0.98,0.64,0.90,0.63,0.55, 0.78,0.10,0.52,0.42,0.58,0.62,1.12,0.86,0.74,1.04, 0.65,0.66,0.81,0.48,0.85,0.75,0.73,0.50,0.34,0.88)

# Get Median
median(dataset)

## [1] 0.71

length(dataset) / 2

## [1] 20

sort(dataset)[(length(dataset) / 2) ]

## [1] 0.7

sort(dataset)[(length(dataset) / 2) + 1 ]

## [1] 0.72

# Get Mean
mean(dataset)

## [1] 0.6965

sum(dataset) / length(dataset)

## [1] 0.6965

# Get Quantile
quantile(dataset, 0.25)

##    25% 
## 0.5425

quantile(dataset, 0.75)

##   75% 
## 0.835

# Get Summary
summary(dataset)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1000  0.5425  0.7100  0.6965  0.8350  1.2400

## Answer2

avg_dataset <- (0 * 12 + 1 * 24  + 2 * 42 + 3 * 38 + 4 * 30  + 5 * 4) / (12 +24 + 42 + 38 + 30 + 4)

dataset <- c(rep(0,12), rep(1,24), rep(2,42), rep(3, 38), rep(4, 30), rep(5, 4) )
mean(dataset)

## [1] 2.413333

avg_dataset

## [1] 2.413333

inject_times   <- c(0,1,2,3,4,5)
people_numbers <- c(12,24,42,38,30,4)

avg_dataset <- sum(inject_times * people_numbers) / sum(people_numbers)

sqrt(sum(((inject_times - avg_dataset) ^ 2) * people_numbers) / (sum(people_numbers)  - 1))

## [1] 1.270135

sd(dataset)

## [1] 1.270135

Demo20180703

David Chiu

2018年7月3日

KMeans

Silhouette

Determine K-Value

比較不同分群方法

發病區域分析

DBSCAN

Homework 6