x <- c(0,0,1,1,1,1)
y <- c(1,0,1,1,0,1)
# Euclidean Distance
sqrt(sum((x - y ) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
## x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
## x
## y 1.414214
# Manhattan Distance
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
## x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
## x
## y 2
?dist
## starting httpd help server ... done
# install.packages('proxy')
library(proxy)
## Warning: package 'proxy' was built under R version 3.4.2
##
## Attaching package: 'proxy'
##
## The following objects are masked from 'package:stats':
##
## as.dist, dist
##
## The following object is masked from 'package:base':
##
## as.matrix
?proxy::dist
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
head(iris[,-5])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
fit <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2')
fit
##
## Call:
## hclust(d = dist(iris[, -5], method = "euclidean"), method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : Euclidean
## Number of objects: 150
plot(fit)
plot(fit, hang=-0.01, cex = 0.7)
fit2 <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'single')
plot(fit2, hang =-0.01, cex = 0.7)
k_data <- cutree(fit, k =3 )
table(k_data)
## k_data
## 1 2 3
## 50 64 36
plot(fit)
rect.hclust(fit, k =3 , border ="red")
k_data
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
par(mfrow=c(1,2))
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species, main= 'Original Result')
plot(iris$Petal.Length, iris$Petal.Width, col=k_data, main= 'Result By Clustering Method')
k_data2 <- cutree(fit, k =4 )
plot(fit)
rect.hclust(fit, k = 4 , border ="red")
plot(iris$Petal.Length, iris$Petal.Width, col=k_data2, main= 'Result By Clustering Method')
set.seed(22)
head(.Random.seed)
## [1] 403 624 -555887406 -1874586069 298905456 -806851407
sample.int(42, 6)
## [1] 13 20 40 21 33 27
sample.int(42, 6)
## [1] 26 31 17 15 37 23
sample.int(42, 6)
## [1] 20 36 18 4 16 7
set.seed(22)
fit <- kmeans(iris[,-5], 3)
fit$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
plot(iris$Petal.Length, iris$Petal.Width, col = fit$cluster)
barplot(fit$centers, beside=TRUE)
plot(iris$Petal.Length, iris$Petal.Width, col = fit$cluster)
points(fit$centers[,3], fit$centers[,4], col = "orange", pch=19,cex=3 )
library(cluster)
set.seed(22)
km <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
## Cluster sizes and average silhouette widths:
## 50 62 38
## 0.7981405 0.4173199 0.4511051
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391
plot(kms)
nk <- 2:10
wss <- sapply(nk, function(k){
kmeans(iris[,-5], centers = k)$tot.withinss
})
plot(nk,wss, type= 'l')
abline(v= 3, col="red")
library(fpc)
## Warning: package 'fpc' was built under R version 3.4.2
nk <- 2:10
set.seed(42)
SW <- sapply(nk, function(k){
fit_centers <- kmeans(iris[,-5], centers = k)$cluster
cluster.stats(dist(iris[,-5]), fit_centers )$avg.silwidth
})
plot(nk,SW, type = 'l')
set.seed(22)
km <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
## Cluster sizes and average silhouette widths:
## 50 62 38
## 0.7981405 0.4173199 0.4511051
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391
set.seed(22)
km <- kmeans(iris[,-5], 2)
kms <- silhouette(km$cluster, dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 2 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
## Cluster sizes and average silhouette widths:
## 53 97
## 0.7695262 0.6327014
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04069 0.63938 0.71217 0.68105 0.79587 0.85503
data(iris)
single_c <- hclust(dist(iris[,-5]), method = 'single')
hc_single <- cutree(single_c, k = 3)
complete_c <- hclust(dist(iris[,-5]), method = 'complete')
hc_complete <- cutree(complete_c, k = 3)
set.seed(42)
km <- kmeans(iris[,-5], 3)
cs <- cluster.stats(dist(iris[,-5]), km$cluster)
cs[c('within.cluster.ss', 'avg.silwidth')]
## $within.cluster.ss
## [1] 78.85144
##
## $avg.silwidth
## [1] 0.552819
sapply(
list(kmeans = km$cluster,
hc_single=hc_single,
hc_complete = hc_complete),
function(c) cluster.stats(dist(iris[,-5]), c)[c('within.cluster.ss', 'avg.silwidth')]
)
## kmeans hc_single hc_complete
## within.cluster.ss 78.85144 142.4794 89.52501
## avg.silwidth 0.552819 0.5121108 0.5135953
customers <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/customers.csv')
head(customers)
## CustomerID Genre Age Annual_Income Spending_Score
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
customers <- customers[, c('Annual_Income', 'Spending_Score')]
plot(customers$Annual_Income, customers$Spending_Score)
set.seed(54)
fit <- kmeans(customers, centers = 5)
plot(customers$Annual_Income, customers$Spending_Score, col=fit$cluster)
points(fit$centers[,1], fit$centers[,2], col='orange', pch = 19, cex = 3)
library(cluster)
kcs <- silhouette(fit$cluster, dist(customers))
summary(kcs)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = fit$cluster, dist = dist(customers)) :
## Cluster sizes and average silhouette widths:
## 39 81 35 23 22
## 0.5091706 0.5966512 0.5039873 0.5122676 0.5990129
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.02338 0.49425 0.59614 0.55393 0.66011 0.75807
plot(kcs)
library(fpc)
nk <- 2:10
set.seed(123)
SW <- sapply(nk, function(k){
fit <- kmeans(customers, centers = k)
cluster.stats(dist(customers),fit$cluster)$avg.silwidth
})
plot(nk, SW, type= 'l')
abline(v = 5, col="red")
#install.packages('png')
library(png)
img1 <- readPNG("17.png", TRUE)
img3 <- img1[,nrow(img1):1]
b <- cbind(as.integer(which(img3 < -1) %% 28), which(img3 < -1) / 28)
plot(b, xlim=c(1,28), ylim=c(1,28))
fit <- kmeans(b, 2)
plot(b, col=fit$cluster, pch=fit$cluster)
points(fit$centers, col="orange", cex = 3, pch=19)
## DBSCAN
library(fpc)
ds <- dbscan(b, 2)
# run with showplot=1 to see how dbscan works.
ds
## dbscan Pts=202 MinPts=5 eps=2
## 1 2
## seed 72 130
## total 72 130
plot(ds, b)
library(readxl)
appledaily20171214 <- read_excel("appledaily20171214.xlsx")
#View(appledaily20171214)
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.2
## Loading required package: jiebaRD
mixseg <- worker()
apple.seg <- lapply(appledaily20171214$content, function(e){
segment(e, jiebar = mixseg)
})
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
corpus <- Corpus(VectorSource(apple.seg))
doc <- tm_map(corpus, removeNumbers)
dtm <- DocumentTermMatrix(doc)
dtm.remove <- removeSparseTerms(dtm, 0.995)
#dtm
#dtm.remove
library(proxy)
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
fit <- kmeans(dtm.dist, 20 )
table(fit$cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 34 15 38 44 28 67 31 58 13 66 43 173 6 17 7 20 52 100
## 19 20
## 57 30
appledaily20171214[fit$cluster == 7,'title']
## # A tibble: 31 x 1
## title
## <chr>
## 1 天使換來金斯勒 解決二壘漏洞
## 2 遊騎兵與史博威簽小聯盟約 補強先發深度
## 3 伍茲1對1指導高爾夫有多貴? 先準備630萬再來
## 4 馬恰多將成自由球員 現已有5隊想搶人
## 5 努涅茲搶手 美東3強都想要
## 6 大谷手傷總教練不在乎 明年全力投入春訓
## 7 梅威瑟為錢復出 自爆打3場格鬥賽可撈300億
## 8 沃神開示 雷霆喬治兩個月內走定了
## 9 又交易掉歐蘇納 波拉斯:馬林魚隊變成當鋪
## 10 費德勒續約巧克力品牌 爽賺6億台幣難怪不言退
## # ... with 21 more rows
m <- as.matrix(dtm.dist)
m2 <- m < 0.5
m <- as.matrix(dtm.dist)
m2 <- ifelse(m < 0.4, 1, 0)
m2[1:3,1:3]
## 1 2 3
## 1 1 0 0
## 2 0 1 0
## 3 0 0 1
library(igraph)
## Warning: package 'igraph' was built under R version 3.4.3
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
G <- graph_from_adjacency_matrix(m2)
wc <- cluster_walktrap(G)
modularity(wc)
## [1] 0.3448493
table(membership(wc))
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 7 4 4 3 3 3 3 8 2 2 2 2 2 2 2 2 2 2
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 793 794 795
## 1 1 1
group <- membership(wc)
table(group)
## group
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 7 4 4 3 3 3 3 8 2 2 2 2 2 2 2 2 2 2
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 793 794 795
## 1 1 1
appledaily20171214$title[group ==3]
## [1] "【不斷更新】桃園工廠惡火撲滅 6人仍失聯宿舍內發現一堆白骨"
## [2] "桃園工廠火勢熄滅 員工宿舍內發現一堆白骨"
## [3] "桃園工廠大火6員工失聯 家屬焦急等待"
## [4] "汽車用品大廠「矽卡」燒毀 資本額達2億"
#plot(wc, G)
#install.packages('FSelector')
library(FSelector)
## Warning: package 'FSelector' was built under R version 3.4.3
library(C50)
## Warning: package 'C50' was built under R version 3.4.2
data(churn)
trainset <- churnTrain[,! colnames(churnTrain) %in% c('state', 'acciunt_length', 'area_code') ]
weights <- random.forest.importance(churn~., trainset, importance.type = 1)
#weights[order(weights$attr_importance), ]
#View(weights)
subset <- cutoff.k(weights, 5)
f <- as.simple.formula(subset, "Class")
print(f)
## Class ~ number_customer_service_calls + international_plan +
## total_intl_calls + total_day_minutes + total_day_charge
## <environment: 0x000000005fbb75c0>
library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
control = trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
## Warning: package 'rpart' was built under R version 3.4.2
importance = varImp(model, scale=FALSE)
importance
## rpart variable importance
##
## Overall
## total_day_minutes 101.086
## total_day_charge 94.083
## number_customer_service_calls 80.306
## international_planyes 55.775
## voice_mail_planyes 33.806
## number_vmail_messages 33.806
## total_eve_charge 14.734
## total_eve_minutes 14.734
## total_intl_minutes 8.656
## total_intl_calls 0.000
## total_night_minutes 0.000
## total_night_calls 0.000
## account_length 0.000
## total_intl_charge 0.000
## total_eve_calls 0.000
## total_night_charge 0.000
## total_day_calls 0.000
plot(importance)
#install.packages("rminer")
library(rminer)
## Warning: package 'rminer' was built under R version 3.4.2
model <- fit(churn~.,trainset,model="rpart")
VariableImportance=Importance(model,trainset,method="sensv")
L=list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)
library(rminer)
model <- fit(churn~.,trainset,model="svm")
VariableImportance=Importance(model,trainset,method="sensv")
L=list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)
head(trainset)
## account_length international_plan voice_mail_plan number_vmail_messages
## 1 128 no yes 25
## 2 107 no yes 26
## 3 137 no no 0
## 4 84 yes no 0
## 5 75 yes no 0
## 6 118 yes no 0
## total_day_minutes total_day_calls total_day_charge total_eve_minutes
## 1 265.1 110 45.07 197.4
## 2 161.6 123 27.47 195.5
## 3 243.4 114 41.38 121.2
## 4 299.4 71 50.90 61.9
## 5 166.7 113 28.34 148.3
## 6 223.4 98 37.98 220.6
## total_eve_calls total_eve_charge total_night_minutes total_night_calls
## 1 99 16.78 244.7 91
## 2 103 16.62 254.4 103
## 3 110 10.30 162.6 104
## 4 88 5.26 196.9 89
## 5 122 12.61 186.9 121
## 6 101 18.75 203.9 118
## total_night_charge total_intl_minutes total_intl_calls total_intl_charge
## 1 11.01 10.0 3 2.70
## 2 11.45 13.7 3 3.70
## 3 7.32 12.2 5 3.29
## 4 8.86 6.6 7 1.78
## 5 8.41 10.1 3 2.73
## 6 9.18 6.3 6 1.70
## number_customer_service_calls churn
## 1 1 no
## 2 1 no
## 3 0 no
## 4 2 no
## 5 3 no
## 6 0 no
model <- glm(churn ~., data = trainset, family = binomial())
#model
#model.step <- step(model)
#model.sep
evaluator <- function(subset) {
k <- 5
set.seed(2)
ind = sample(5, nrow(trainset), replace = TRUE)
results <- sapply(1:k, function(i) {
train <- trainset[ind !=i,]
test <- trainset[ind ==i,]
tree <- rpart(as.simple.formula(subset, "churn"), trainset)
error.rate = sum(test$churn != predict(tree, test, type="class")) / nrow(test)
return(1 - error.rate)
})
return(mean(results))
}
attr.subset <- hill.climbing.search(names(trainset)[!names(trainset) %in% "churn"], evaluator)
f <- as.simple.formula(attr.subset, "churn")
print(f)
## churn ~ account_length + international_plan + voice_mail_plan +
## total_day_minutes + total_eve_minutes + total_night_calls +
## total_intl_minutes + total_intl_calls + number_customer_service_calls
## <environment: 0x00000000677ffb40>
data(swiss)
head(swiss)
## Fertility Agriculture Examination Education Catholic
## Courtelary 80.2 17.0 15 12 9.96
## Delemont 83.1 45.1 6 9 84.84
## Franches-Mnt 92.5 39.7 5 5 93.40
## Moutier 85.8 36.5 12 7 33.77
## Neuveville 76.9 43.5 17 15 5.16
## Porrentruy 76.1 35.3 9 7 90.57
## Infant.Mortality
## Courtelary 22.2
## Delemont 22.2
## Franches-Mnt 20.2
## Moutier 20.3
## Neuveville 20.6
## Porrentruy 26.6
swiss <- swiss[,-1]
colnames(swiss)
## [1] "Agriculture" "Examination" "Education"
## [4] "Catholic" "Infant.Mortality"
swiss.pca <- prcomp(swiss, center=TRUE, scale = TRUE)
swiss.pca
## Standard deviations (1, .., p=5):
## [1] 1.6228065 1.0354873 0.9033447 0.5592765 0.4067472
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Agriculture 0.52396452 -0.25834215 0.003003672 -0.8090741
## Examination -0.57185792 -0.01145981 -0.039840522 -0.4224580
## Education -0.49150243 0.19028476 0.539337412 -0.3321615
## Catholic 0.38530580 0.36956307 0.725888143 0.1007965
## Infant.Mortality 0.09167606 0.87197641 -0.424976789 -0.2154928
## PC5
## Agriculture 0.06411415
## Examination -0.70198942
## Education 0.56656945
## Catholic -0.42176895
## Infant.Mortality 0.06488642
summary(swiss.pca)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.6228 1.0355 0.9033 0.55928 0.40675
## Proportion of Variance 0.5267 0.2145 0.1632 0.06256 0.03309
## Cumulative Proportion 0.5267 0.7411 0.9043 0.96691 1.00000
predicted <- predict(swiss.pca, newsdata = head(swiss,1))
## Warning: In predict.prcomp(swiss.pca, newsdata = head(swiss, 1)) :
## extra argument 'newsdata' will be disregarded
screeplot(swiss.pca,type= 'barplot')
abline(h = 1, col="red")
screeplot(swiss.pca,type= 'line')
abline(h = 1, col="red")
swiss.pca$sdev
## [1] 1.6228065 1.0354873 0.9033447 0.5592765 0.4067472
swiss.pca$sdev ^ 2 > 1
## [1] TRUE TRUE FALSE FALSE FALSE
plot(swiss.pca$x[,1], swiss.pca$x[,2],xlim =c(-4,4))
text(swiss.pca$x[,1], swiss.pca$x[,2], rownames(swiss.pca$x), col="red")
biplot(swiss.pca)
## 產生台灣幸福指標
dataset <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/eco_index.csv', header = TRUE, row.names = 1)
head(dataset)
## sales expense income.average
## Taipei 122838 14.83 704024
## Tainan 24689 14.56 430680
## New Taipei City 41719 13.10 506143
## Kaohsiung 41982 11.56 487309
## Taichung 35856 10.56 511603
## Hsinchu County 9665 22.57 569021
pc.cr <- princomp(dataset, cor = TRUE)
pc.cr
## Call:
## princomp(x = dataset, cor = TRUE)
##
## Standard deviations:
## Comp.1 Comp.2 Comp.3
## 1.4480423 0.8234342 0.4744782
##
## 3 variables and 20 observations.
screeplot(pc.cr, type= 'bar')
abline(h = 1, col="red")
screeplot(pc.cr, type= 'line')
abline(h = 1, col="red")
biplot(pc.cr)
pc.cr$scores[order(pc.cr$scores[,1])]
## [1] -4.44166677 -1.47378234 -1.36963836 -1.24859975 -1.22361917
## [6] -1.21393904 -0.09492659 -0.04573421 0.05588516 0.48721191
## [11] 0.56259111 0.57010732 0.57174521 0.63677543 0.88013703
## [16] 1.21208605 1.22879883 1.29191378 1.65058209 1.96407231
sort(-pc.cr$scores[,1], decreasing = TRUE)
## Taipei Hsinchu Taichung Taoyuan County
## 4.44166677 1.47378234 1.36963836 1.24859975
## Kaohsiung New Taipei City Tainan Hsinchu County
## 1.22361917 1.21393904 0.09492659 0.04573421
## Changhua County Yunlin County Miaoli County Chiayi City
## -0.05588516 -0.48721191 -0.56259111 -0.57010732
## Hualien County Nantou County Keelung Yilan County
## -0.57174521 -0.63677543 -0.88013703 -1.21208605
## Taitung Pingtung County Chiayi County Penghu County
## -1.22879883 -1.29191378 -1.65058209 -1.96407231
barplot(sort(-pc.cr$scores[,1], decreasing = TRUE), cex.axis= 0.6, cex.names = 0.6)
lenna <- readPNG('Lenna.png')
lenna <- t(lenna[,,1])[,nrow(lenna):1]
image(lenna)
lenna.svd <- svd(scale(lenna))
plot(lenna.svd$d^2/sum(lenna.svd$d^2), type="l", xlab=" Singular vector", ylab = "Variance explained")
length(lenna.svd$d)
## [1] 200
min(which(cumsum(lenna.svd$d^2/sum(lenna.svd$d^2))> 0.9))
## [1] 24
lenna_compression = function(dim){
u=as.matrix(lenna.svd$u[, 1:dim])
v=as.matrix(lenna.svd$v[, 1:dim])
d=as.matrix(diag(lenna.svd$d)[1:dim, 1:dim])
image(u%*%d%*%t(v))
}
lenna_compression(24)
min(which(cumsum(lenna.svd$d^2/sum(lenna.svd$d^2))> 0.99))
## [1] 84
lenna_compression(84)
min(which(cumsum(lenna.svd$d^2/sum(lenna.svd$d^2))> 0.999))
## [1] 128
lenna_compression(128)