Distance

x <- c(0,0,1,1,1,1)
y <- c(1,0,1,1,0,1)

# Euclidean Distance
sqrt(sum((x - y ) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
##          x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
##          x
## y 1.414214
# Manhattan Distance
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
##   x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
##   x
## y 2
?dist
## starting httpd help server ... done
# install.packages('proxy')
library(proxy)
## Warning: package 'proxy' was built under R version 3.4.2
## 
## Attaching package: 'proxy'
## 
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## 
## The following object is masked from 'package:base':
## 
##     as.matrix
?proxy::dist

Hierarchical Clustering

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
head(iris[,1:4])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
head(iris[,-5])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
fit <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2')

fit
## 
## Call:
## hclust(d = dist(iris[, -5], method = "euclidean"), method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : Euclidean 
## Number of objects: 150
plot(fit)

plot(fit, hang=-0.01, cex = 0.7)

fit2 <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'single')
plot(fit2, hang =-0.01, cex = 0.7)

k_data <- cutree(fit, k =3 )
table(k_data)
## k_data
##  1  2  3 
## 50 64 36
plot(fit)
rect.hclust(fit, k =3 , border ="red")

k_data
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
par(mfrow=c(1,2))
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species, main= 'Original Result')
plot(iris$Petal.Length, iris$Petal.Width, col=k_data, main= 'Result By Clustering Method')

k_data2 <- cutree(fit, k =4 )
plot(fit)
rect.hclust(fit, k = 4 , border ="red")

plot(iris$Petal.Length, iris$Petal.Width, col=k_data2, main= 'Result By Clustering Method')

KMeans

set.seed(22)
head(.Random.seed)
## [1]         403         624  -555887406 -1874586069   298905456  -806851407
sample.int(42, 6)
## [1] 13 20 40 21 33 27
sample.int(42, 6)
## [1] 26 31 17 15 37 23
sample.int(42, 6)
## [1] 20 36 18  4 16  7
set.seed(22)
fit <- kmeans(iris[,-5], 3)
fit$cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
plot(iris$Petal.Length, iris$Petal.Width, col = fit$cluster)

barplot(fit$centers, beside=TRUE)

plot(iris$Petal.Length, iris$Petal.Width, col = fit$cluster)
points(fit$centers[,3], fit$centers[,4], col = "orange", pch=19,cex=3 )

Silhouette

library(cluster)

set.seed(22)

km  <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))

summary(kms)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
##  Cluster sizes and average silhouette widths:
##        50        62        38 
## 0.7981405 0.4173199 0.4511051 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391
plot(kms)

如何判斷K 值 (WSS)

nk <- 2:10
wss <- sapply(nk,  function(k){
  kmeans(iris[,-5], centers = k)$tot.withinss
})
plot(nk,wss, type= 'l')
abline(v= 3, col="red")

如何判斷K 值 (silhouette)

library(fpc)
## Warning: package 'fpc' was built under R version 3.4.2
nk <- 2:10
set.seed(42)
SW <- sapply(nk, function(k){
  fit_centers <- kmeans(iris[,-5], centers = k)$cluster
  cluster.stats(dist(iris[,-5]), fit_centers )$avg.silwidth 
})
plot(nk,SW, type = 'l')

set.seed(22)
km  <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
##  Cluster sizes and average silhouette widths:
##        50        62        38 
## 0.7981405 0.4173199 0.4511051 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391
set.seed(22)
km  <- kmeans(iris[,-5], 2)
kms <- silhouette(km$cluster, dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 2 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
##  Cluster sizes and average silhouette widths:
##        53        97 
## 0.7695262 0.6327014 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.04069 0.63938 0.71217 0.68105 0.79587 0.85503

比較不同分群方法

data(iris)
single_c  <- hclust(dist(iris[,-5]), method = 'single')
hc_single <- cutree(single_c, k = 3)

complete_c  <- hclust(dist(iris[,-5]), method = 'complete')
hc_complete <- cutree(complete_c, k = 3)

set.seed(42)
km <- kmeans(iris[,-5], 3)

cs <- cluster.stats(dist(iris[,-5]), km$cluster)
cs[c('within.cluster.ss', 'avg.silwidth')]
## $within.cluster.ss
## [1] 78.85144
## 
## $avg.silwidth
## [1] 0.552819
sapply(
  list(kmeans = km$cluster, 
       hc_single=hc_single, 
       hc_complete = hc_complete), 
  function(c) cluster.stats(dist(iris[,-5]), c)[c('within.cluster.ss', 'avg.silwidth')]
  )
##                   kmeans   hc_single hc_complete
## within.cluster.ss 78.85144 142.4794  89.52501   
## avg.silwidth      0.552819 0.5121108 0.5135953

客戶資料分群

customers <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/customers.csv')

head(customers)
##   CustomerID  Genre Age Annual_Income Spending_Score
## 1          1   Male  19            15             39
## 2          2   Male  21            15             81
## 3          3 Female  20            16              6
## 4          4 Female  23            16             77
## 5          5 Female  31            17             40
## 6          6 Female  22            17             76
customers <- customers[, c('Annual_Income', 'Spending_Score')]
plot(customers$Annual_Income, customers$Spending_Score)

set.seed(54)
fit <- kmeans(customers, centers = 5)
plot(customers$Annual_Income, customers$Spending_Score, col=fit$cluster)
points(fit$centers[,1], fit$centers[,2], col='orange', pch = 19, cex = 3)

library(cluster)
kcs <- silhouette(fit$cluster, dist(customers))
summary(kcs)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = fit$cluster, dist = dist(customers)) :
##  Cluster sizes and average silhouette widths:
##        39        81        35        23        22 
## 0.5091706 0.5966512 0.5039873 0.5122676 0.5990129 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.02338  0.49425  0.59614  0.55393  0.66011  0.75807
plot(kcs)

library(fpc)
nk <- 2:10
set.seed(123)
SW <- sapply(nk, function(k){
  fit <- kmeans(customers, centers = k)
  cluster.stats(dist(customers),fit$cluster)$avg.silwidth
})
plot(nk, SW, type= 'l')
abline(v = 5, col="red")

Digit Clustering

#install.packages('png')
library(png)
img1 <- readPNG("17.png", TRUE)
img3 <- img1[,nrow(img1):1]
b <- cbind(as.integer(which(img3 < -1) %% 28), which(img3 < -1) / 28)
plot(b, xlim=c(1,28), ylim=c(1,28))

fit <- kmeans(b, 2)
plot(b, col=fit$cluster, pch=fit$cluster)
points(fit$centers, col="orange", cex = 3, pch=19)

## DBSCAN

library(fpc)
ds <- dbscan(b, 2)
# run with showplot=1 to see how dbscan works.
ds
## dbscan Pts=202 MinPts=5 eps=2
##        1   2
## seed  72 130
## total 72 130
plot(ds, b)

Community Detection

library(readxl)
appledaily20171214 <- read_excel("appledaily20171214.xlsx")
#View(appledaily20171214)

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.2
## Loading required package: jiebaRD
mixseg <- worker()
apple.seg <- lapply(appledaily20171214$content, function(e){
  segment(e, jiebar = mixseg)
})

library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
corpus <- Corpus(VectorSource(apple.seg))
doc    <- tm_map(corpus, removeNumbers)
dtm    <- DocumentTermMatrix(doc)
dtm.remove <- removeSparseTerms(dtm, 0.995)
#dtm
#dtm.remove
library(proxy)
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')


fit <- kmeans(dtm.dist, 20 )
table(fit$cluster)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##  34  15  38  44  28  67  31  58  13  66  43 173   6  17   7  20  52 100 
##  19  20 
##  57  30
appledaily20171214[fit$cluster == 7,'title']
## # A tibble: 31 x 1
##                                            title
##                                            <chr>
##  1                  天使換來金斯勒 解決二壘漏洞
##  2        遊騎兵與史博威簽小聯盟約 補強先發深度
##  3 伍茲1對1指導高爾夫有多貴? 先準備630萬再來 
##  4           馬恰多將成自由球員 現已有5隊想搶人
##  5                     努涅茲搶手 美東3強都想要
##  6        大谷手傷總教練不在乎 明年全力投入春訓
##  7      梅威瑟為錢復出 自爆打3場格鬥賽可撈300億
##  8              沃神開示 雷霆喬治兩個月內走定了
##  9      又交易掉歐蘇納 波拉斯:馬林魚隊變成當鋪
## 10   費德勒續約巧克力品牌 爽賺6億台幣難怪不言退
## # ... with 21 more rows
m <- as.matrix(dtm.dist)
m2 <- m < 0.5 


m        <- as.matrix(dtm.dist)

m2 <- ifelse(m < 0.4, 1, 0)
m2[1:3,1:3]
##   1 2 3
## 1 1 0 0
## 2 0 1 0
## 3 0 0 1
library(igraph)
## Warning: package 'igraph' was built under R version 3.4.3
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
G <- graph_from_adjacency_matrix(m2)

wc <- cluster_walktrap(G)
modularity(wc)
## [1] 0.3448493
table(membership(wc))
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   7   4   4   3   3   3   3   8   2   2   2   2   2   2   2   2   2   2 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   3   3 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##  30   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 793 794 795 
##   1   1   1
group <- membership(wc)
table(group)
## group
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   7   4   4   3   3   3   3   8   2   2   2   2   2   2   2   2   2   2 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   3   3 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##  30   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 793 794 795 
##   1   1   1
appledaily20171214$title[group ==3]
## [1] "【不斷更新】桃園工廠惡火撲滅 6人仍失聯宿舍內發現一堆白骨"
## [2] "桃園工廠火勢熄滅 員工宿舍內發現一堆白骨"                
## [3] "桃園工廠大火6員工失聯 家屬焦急等待"                     
## [4] "汽車用品大廠「矽卡」燒毀 資本額達2億"
#plot(wc, G)

Feature Selection

#install.packages('FSelector')
library(FSelector)
## Warning: package 'FSelector' was built under R version 3.4.3
library(C50)
## Warning: package 'C50' was built under R version 3.4.2
data(churn)
trainset <- churnTrain[,! colnames(churnTrain) %in% c('state', 'acciunt_length', 'area_code') ]


weights <-  random.forest.importance(churn~., trainset, importance.type = 1)

#weights[order(weights$attr_importance), ]
#View(weights)

subset <- cutoff.k(weights, 5)
f <- as.simple.formula(subset, "Class")
print(f)
## Class ~ number_customer_service_calls + international_plan + 
##     total_intl_calls + total_day_minutes + total_day_charge
## <environment: 0x000000005fbb75c0>

Caret

library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
control = trainControl(method="repeatedcv", number=10, repeats=3)

model <-  train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
## Warning: package 'rpart' was built under R version 3.4.2
importance = varImp(model, scale=FALSE)
importance
## rpart variable importance
## 
##                               Overall
## total_day_minutes             101.086
## total_day_charge               94.083
## number_customer_service_calls  80.306
## international_planyes          55.775
## voice_mail_planyes             33.806
## number_vmail_messages          33.806
## total_eve_charge               14.734
## total_eve_minutes              14.734
## total_intl_minutes              8.656
## total_intl_calls                0.000
## total_night_minutes             0.000
## total_night_calls               0.000
## account_length                  0.000
## total_intl_charge               0.000
## total_eve_calls                 0.000
## total_night_charge              0.000
## total_day_calls                 0.000
plot(importance)

使用rminer

#install.packages("rminer")
library(rminer)
## Warning: package 'rminer' was built under R version 3.4.2
model <- fit(churn~.,trainset,model="rpart")
VariableImportance=Importance(model,trainset,method="sensv")
L=list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)

mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

library(rminer)
model <- fit(churn~.,trainset,model="svm")
VariableImportance=Importance(model,trainset,method="sensv")
L=list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)

mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

head(trainset)
##   account_length international_plan voice_mail_plan number_vmail_messages
## 1            128                 no             yes                    25
## 2            107                 no             yes                    26
## 3            137                 no              no                     0
## 4             84                yes              no                     0
## 5             75                yes              no                     0
## 6            118                yes              no                     0
##   total_day_minutes total_day_calls total_day_charge total_eve_minutes
## 1             265.1             110            45.07             197.4
## 2             161.6             123            27.47             195.5
## 3             243.4             114            41.38             121.2
## 4             299.4              71            50.90              61.9
## 5             166.7             113            28.34             148.3
## 6             223.4              98            37.98             220.6
##   total_eve_calls total_eve_charge total_night_minutes total_night_calls
## 1              99            16.78               244.7                91
## 2             103            16.62               254.4               103
## 3             110            10.30               162.6               104
## 4              88             5.26               196.9                89
## 5             122            12.61               186.9               121
## 6             101            18.75               203.9               118
##   total_night_charge total_intl_minutes total_intl_calls total_intl_charge
## 1              11.01               10.0                3              2.70
## 2              11.45               13.7                3              3.70
## 3               7.32               12.2                5              3.29
## 4               8.86                6.6                7              1.78
## 5               8.41               10.1                3              2.73
## 6               9.18                6.3                6              1.70
##   number_customer_service_calls churn
## 1                             1    no
## 2                             1    no
## 3                             0    no
## 4                             2    no
## 5                             3    no
## 6                             0    no
model <- glm(churn ~., data = trainset, family = binomial())

#model
#model.step <- step(model)
#model.sep

建立評估器 (Evaluator)

 evaluator <- function(subset) {
   k <- 5  
   set.seed(2)
   ind = sample(5, nrow(trainset), replace = TRUE)
   results <- sapply(1:k, function(i) {
     train <- trainset[ind !=i,]
     test  <- trainset[ind  ==i,]
     tree  <- rpart(as.simple.formula(subset, "churn"), trainset)
     error.rate = sum(test$churn != predict(tree, test, type="class")) / nrow(test)
     return(1 - error.rate)
   })
   return(mean(results))
 }

attr.subset <- hill.climbing.search(names(trainset)[!names(trainset) %in% "churn"], evaluator)
f <-  as.simple.formula(attr.subset, "churn")
print(f)
## churn ~ account_length + international_plan + voice_mail_plan + 
##     total_day_minutes + total_eve_minutes + total_night_calls + 
##     total_intl_minutes + total_intl_calls + number_customer_service_calls
## <environment: 0x00000000677ffb40>

Feature Extraction

data(swiss)
head(swiss)
##              Fertility Agriculture Examination Education Catholic
## Courtelary        80.2        17.0          15        12     9.96
## Delemont          83.1        45.1           6         9    84.84
## Franches-Mnt      92.5        39.7           5         5    93.40
## Moutier           85.8        36.5          12         7    33.77
## Neuveville        76.9        43.5          17        15     5.16
## Porrentruy        76.1        35.3           9         7    90.57
##              Infant.Mortality
## Courtelary               22.2
## Delemont                 22.2
## Franches-Mnt             20.2
## Moutier                  20.3
## Neuveville               20.6
## Porrentruy               26.6
swiss <- swiss[,-1]
colnames(swiss)
## [1] "Agriculture"      "Examination"      "Education"       
## [4] "Catholic"         "Infant.Mortality"
swiss.pca <- prcomp(swiss, center=TRUE, scale = TRUE)
swiss.pca
## Standard deviations (1, .., p=5):
## [1] 1.6228065 1.0354873 0.9033447 0.5592765 0.4067472
## 
## Rotation (n x k) = (5 x 5):
##                          PC1         PC2          PC3        PC4
## Agriculture       0.52396452 -0.25834215  0.003003672 -0.8090741
## Examination      -0.57185792 -0.01145981 -0.039840522 -0.4224580
## Education        -0.49150243  0.19028476  0.539337412 -0.3321615
## Catholic          0.38530580  0.36956307  0.725888143  0.1007965
## Infant.Mortality  0.09167606  0.87197641 -0.424976789 -0.2154928
##                          PC5
## Agriculture       0.06411415
## Examination      -0.70198942
## Education         0.56656945
## Catholic         -0.42176895
## Infant.Mortality  0.06488642
summary(swiss.pca)
## Importance of components%s:
##                           PC1    PC2    PC3     PC4     PC5
## Standard deviation     1.6228 1.0355 0.9033 0.55928 0.40675
## Proportion of Variance 0.5267 0.2145 0.1632 0.06256 0.03309
## Cumulative Proportion  0.5267 0.7411 0.9043 0.96691 1.00000
predicted <- predict(swiss.pca, newsdata = head(swiss,1))
## Warning: In predict.prcomp(swiss.pca, newsdata = head(swiss, 1)) :
##  extra argument 'newsdata' will be disregarded
screeplot(swiss.pca,type= 'barplot')  
abline(h = 1, col="red")

screeplot(swiss.pca,type= 'line')  
abline(h = 1, col="red")

swiss.pca$sdev
## [1] 1.6228065 1.0354873 0.9033447 0.5592765 0.4067472
swiss.pca$sdev ^ 2 > 1
## [1]  TRUE  TRUE FALSE FALSE FALSE
plot(swiss.pca$x[,1], swiss.pca$x[,2],xlim =c(-4,4))
text(swiss.pca$x[,1], swiss.pca$x[,2], rownames(swiss.pca$x), col="red")

biplot(swiss.pca)

## 產生台灣幸福指標

dataset <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/eco_index.csv', header = TRUE, row.names = 1)

head(dataset)
##                  sales expense income.average
## Taipei          122838   14.83         704024
## Tainan           24689   14.56         430680
## New Taipei City  41719   13.10         506143
## Kaohsiung        41982   11.56         487309
## Taichung         35856   10.56         511603
## Hsinchu County    9665   22.57         569021
pc.cr <- princomp(dataset, cor = TRUE)
pc.cr
## Call:
## princomp(x = dataset, cor = TRUE)
## 
## Standard deviations:
##    Comp.1    Comp.2    Comp.3 
## 1.4480423 0.8234342 0.4744782 
## 
##  3  variables and  20 observations.
screeplot(pc.cr, type= 'bar')
abline(h = 1, col="red")

screeplot(pc.cr, type= 'line')
abline(h = 1, col="red")

biplot(pc.cr)

pc.cr$scores[order(pc.cr$scores[,1])]
##  [1] -4.44166677 -1.47378234 -1.36963836 -1.24859975 -1.22361917
##  [6] -1.21393904 -0.09492659 -0.04573421  0.05588516  0.48721191
## [11]  0.56259111  0.57010732  0.57174521  0.63677543  0.88013703
## [16]  1.21208605  1.22879883  1.29191378  1.65058209  1.96407231
sort(-pc.cr$scores[,1], decreasing = TRUE)
##          Taipei         Hsinchu        Taichung  Taoyuan County 
##      4.44166677      1.47378234      1.36963836      1.24859975 
##       Kaohsiung New Taipei City          Tainan  Hsinchu County 
##      1.22361917      1.21393904      0.09492659      0.04573421 
## Changhua County   Yunlin County   Miaoli County     Chiayi City 
##     -0.05588516     -0.48721191     -0.56259111     -0.57010732 
##  Hualien County   Nantou County         Keelung    Yilan County 
##     -0.57174521     -0.63677543     -0.88013703     -1.21208605 
##         Taitung Pingtung County   Chiayi County   Penghu County 
##     -1.22879883     -1.29191378     -1.65058209     -1.96407231
barplot(sort(-pc.cr$scores[,1], decreasing = TRUE), cex.axis= 0.6, cex.names = 0.6)

SVD

lenna <- readPNG('Lenna.png')
lenna <- t(lenna[,,1])[,nrow(lenna):1]
image(lenna)

lenna.svd <- svd(scale(lenna))
plot(lenna.svd$d^2/sum(lenna.svd$d^2), type="l", xlab=" Singular vector", ylab = "Variance explained")

length(lenna.svd$d)
## [1] 200
min(which(cumsum(lenna.svd$d^2/sum(lenna.svd$d^2))> 0.9))
## [1] 24
lenna_compression = function(dim){
    u=as.matrix(lenna.svd$u[, 1:dim])
    v=as.matrix(lenna.svd$v[, 1:dim])
    d=as.matrix(diag(lenna.svd$d)[1:dim, 1:dim])
    image(u%*%d%*%t(v))
}
lenna_compression(24)

min(which(cumsum(lenna.svd$d^2/sum(lenna.svd$d^2))> 0.99))
## [1] 84
lenna_compression(84)

min(which(cumsum(lenna.svd$d^2/sum(lenna.svd$d^2))> 0.999))
## [1] 128
lenna_compression(128)