# Switch every locale category to the portable "C" locale.
Sys.setlocale(category = "LC_ALL", locale = "C")
[1] "C"
# Packages this notebook expects to find in the local library.
packages = c(
  "dplyr", "ggplot2", "d3heatmap", "googleVis", "devtools", "plotly",
  "xgboost", "magrittr", "caTools", "ROCR", "corrplot", "rpart",
  "rpart.plot", "doParallel", "caret", "glmnet", "Matrix", "e1071",
  "randomForest", "flexclust", "FactoMineR", "factoextra"
)
# Install, one at a time, each listed package that is not installed yet.
existing = as.character(installed.packages()[, "Package"])
for (pkg in packages) {
  if (!(pkg %in% existing)) {
    install.packages(pkg)
  }
}
# Start from an empty workspace (hidden objects included), then set how
# numbers are printed for the rest of the session.
rm(list = ls(all.names = TRUE))
options(digits = 4, scipen = 12)
library(dplyr)
library(ggplot2)
library(flexclust)
library(FactoMineR)
library(factoextra)
A. 集群分析與尺度縮減
A1. 批發交易資料
# Load the wholesale customer data.
W = read.csv("data/wholesales.csv")
# Recode the Channel and Region codes as labelled factors ("Ch1", "Reg1", ...).
W$Channel = factor(paste0("Ch", W$Channel))
W$Region = factor(paste0("Reg", W$Region))
# Log10-transform all six spending columns (3:8). The right-hand side must
# cover the same six columns: with only W[3:6] the four transformed columns
# are recycled, overwriting columns 7 and 8 with copies of columns 3 and 4.
W[3:8] = lapply(W[3:8], log, base = 10)
summary(W)
Channel Region Fresh Milk Grocery Frozen Detergents_Paper
Ch1:298 Reg1: 77 Min. :0.477 Min. :1.74 Min. :0.477 Min. :1.40 Min. :0.477
Ch2:142 Reg2: 47 1st Qu.:3.495 1st Qu.:3.19 1st Qu.:3.333 1st Qu.:2.87 1st Qu.:3.495
Reg3:316 Median :3.930 Median :3.56 Median :3.677 Median :3.18 Median :3.930
Mean :3.792 Mean :3.53 Mean :3.666 Mean :3.17 Mean :3.792
3rd Qu.:4.229 3rd Qu.:3.86 3rd Qu.:4.028 3rd Qu.:3.55 3rd Qu.:4.229
Max. :5.050 Max. :4.87 Max. :4.967 Max. :4.78 Max. :5.050
Delicassen
Min. :1.74
1st Qu.:3.19
Median :3.56
Mean :3.53
3rd Qu.:3.86
Max. :4.87
A2. 兩個區隔變數
# Hierarchical clustering on columns 3:4 of W. scale() standardises each
# column first (mean 0, standard deviation 1) before dist() is computed.
hc = W[, 3:4] %>%
  scale %>%
  dist %>%
  hclust
plot(hc)
rect.hclust(hc, k = 5, border = "red")

# Cut the tree into 5 groups and colour the Fresh-vs-Milk scatter by group.
W$group = factor(cutree(hc, k = 5))
ggplot(W, aes(x = Fresh, y = Milk, col = group)) +
  geom_point(size = 3, alpha = 0.5) +
  theme_light()

A3. 六個區隔變數
# Hierarchical clustering on all six spending columns of W (3:8), each
# standardised by scale() first. The range 3:7 would leave out the sixth
# variable that this section and the later PCA step work with.
hc = W[, 3:8] %>%
  scale %>%
  dist %>%
  hclust
plot(hc)
# Store the 8-group cut on W and outline the same cut on the dendrogram.
W$group = factor(cutree(hc, k = 8))
rect.hclust(hc, k = 8, border = "red")

library(FactoMineR)
library(factoextra)
# The same 8-group cut drawn as a coloured dendrogram without leaf labels.
fviz_dend(
  hc, k = 8, show_labels = FALSE, rect = TRUE, rect_fill = TRUE,
  labels_track_height = 0,
  palette = "ucscgb", rect_border = "ucscgb")

A4. 尺度縮減
Dimension Reduction with PCA (Principal Component Analysis, 主成分分析)
# PCA on columns 3:8 of W, drawn as a biplot: only the variables are
# labelled, individuals are coloured by W$group, and each group is wrapped
# in a convex hull.
W[, 3:8] %>%
  PCA(graph = FALSE) %>%
  fviz_pca_biplot(
    label = "var",
    col.ind = W$group,
    pointshape = 19,
    mean.point = FALSE,
    addEllipses = TRUE,
    ellipse.level = 0.7,
    ellipse.type = "convex",
    palette = "ucscgb",
    repel = TRUE
  )

1. Cluster Analysis for Movies
主要議題:依類型(Genre)對電影分類
學習重點:
- 集群分析的基本觀念
- 距離矩陣:Distance Matrix
- 層級式集群分析:Hierarchical Cluster Analysis
- 樹狀圖(Dendrogram)的判讀
- 依據樹狀圖決定要分多少群
- 以群組平均值檢視各族群的屬性
1.1 整理資料
# Read the "|"-separated movie file; it has no header row.
M = read.table("data/movieLens.txt", header = FALSE, sep = "|", quote = "\"")

# Label the 24 columns: id, title, dates, IMDB field, then the genre columns.
names(M) = c(
  "ID", "Title", "ReleaseDate", "VideoReleaseDate", "IMDB",
  "Unknown", "Action", "Adventure", "Animation", "Childrens",
  "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir",
  "Horror", "Musical", "Mystery", "Romance", "SciFi", "Thriller",
  "War", "Western"
)

# Drop the columns that are not used below, leaving Title plus the genres.
M[c("ID", "ReleaseDate", "VideoReleaseDate", "IMDB")] = NULL

# Keep one copy of each duplicated row.
M = unique(M)
1.2 檢視資料
# First five rows, then the total of the Comedy column.
head(M, n = 5)
sum(M[["Comedy"]]) # comedies
[1] 502
sum(M[["Western"]]) # westerns
[1] 27
sum(M[["Romance"]] | M[["Drama"]]) # rows flagged Romance or Drama (or both)
[1] 863
1.3 距離矩陣
# Euclidean distance between every pair of rows, using columns 2:20 of M.
dmx = dist(M[2:20], method = "euclidean")
# Size of the equivalent full square matrix.
dim(as.matrix(dmx))
[1] 1664 1664
1.4 層級式集群分析
# Hierarchical clustering of the movie distance matrix with the "ward.D"
# agglomeration method.
hclust1 = hclust(dmx, method = "ward.D")
1.5 檢視樹狀圖
# Draw the dendrogram and outline the 5-cluster cut in red.
plot(hclust1)
rect.hclust(hclust1, k=5, border="red")

1.6 切割群組
# Cut the tree into 5 clusters (one label per row of M) and count the
# number of rows in each cluster.
grp = cutree(hclust1, k = 5)
table(grp)
grp
1 2 3 4 5
824 370 209 196 65
1.7 檢查群組屬性
# Mean of the Action column within each cluster.
tapply(M[["Action"]], INDEX = grp, FUN = mean)
1 2 3 4 5
0.28641 0.00000 0.00000 0.06633 0.00000
# Mean of the Romance column within each cluster.
tapply(M[["Romance"]], INDEX = grp, FUN = mean)
1 2 3 4 5
0.05825 0.00000 0.00000 1.00000 0.00000
1.8 The sapply-split-... Combo:
# Split columns 2:20 of M by cluster and take the column means of each
# piece, rounded to 3 decimals. grp is the clustering result (an integer
# vector of cluster labels).
round(sapply(split(M[, 2:20], grp), colMeans), digits = 3)
1 2 3 4 5
Unknown 0.002 0 0 0.000 0
Action 0.286 0 0 0.066 0
Adventure 0.161 0 0 0.000 0
Animation 0.051 0 0 0.000 0
Childrens 0.146 0 0 0.000 0
Comedy 0.177 0 1 0.418 1
Crime 0.123 0 0 0.031 0
Documentary 0.061 0 0 0.000 0
Drama 0.238 1 0 0.434 1
Fantasy 0.027 0 0 0.000 0
FilmNoir 0.028 0 0 0.005 0
Horror 0.107 0 0 0.010 0
Musical 0.068 0 0 0.000 0
Mystery 0.073 0 0 0.000 0
Romance 0.058 0 0 1.000 0
SciFi 0.121 0 0 0.000 0
Thriller 0.279 0 0 0.092 0
War 0.086 0 0 0.000 0
Western 0.033 0 0 0.000 0
# round(): the second argument is the number of decimal places to keep
1.9 資料視覺化(看每一群的大小)
# Two stacked panels: the first takes the top third of the device, the
# second the lower two thirds.
layout(matrix(c(1, 2, 2), nrow = 3, ncol = 1))
# Panel 1: number of rows in each of the five clusters.
par(mar = c(2, 3, 1, 1), cex = 0.8)
barplot(table(grp), col = 3:7, names.arg = paste0("Group-", 1:5))
# Panel 2: per-cluster column means, one group of bars per column of
# M[, 2:20]; the wider bottom margin leaves room for the rotated labels.
par(mar = c(6, 3, 2, 1))
barplot(
  t(sapply(split(M[, 2:20], grp), colMeans)),
  beside = TRUE, col = 3:7, las = 2
)

【問題討論】
從管理的角度來看,我們為甚麼要分群?
- 透過分群,方便我們針對各種不同的狀況,做最好的決策。
- 例如在行銷上,透過分群可以讓我們更了解顧客的消費、喜好,進而投其所好,做出對顧客最有利的策略。
我們為甚麼要做尺度縮減?
- 在機器學習的分類問題中,最終分類所依據的因素往往太多。 這些因素基本上就是稱為特徵的變量。 特徵數量越多,就越難以將訓練集可視化並進一步處理。 有時,這些特徵大多彼此相關,因此是多餘的。 這就是降維算法發揮作用的地方。 降維是通過獲得一組主要變量來減少所考慮的隨機變量數量的過程。 它可以分為特徵選擇和特徵提取。
- In machine learning classification problems, there are often too many factors on the basis of which the final classification is done. These factors are basically variables called features. The higher the number of features, the harder it gets to visualize the training set and then work on it. Sometimes, most of these features are correlated, and hence redundant. This is where dimensionality reduction algorithms come into play. Dimensionality reduction is the process of reducing the number of random variables under consideration, by obtaining a set of principal variables. It can be divided into feature selection and feature extraction.
我們要如何把集群分析的結果轉化為策略呢?
- 以產品面來說,將每一個集群的特徵做分析,進而知道目前趨勢與狀況為何,針對弱項進行改進,對於強項則持續加強,以降低失誤率、增加顧客對產品的滿意度、保持產品競爭力。
- 以顧客面來說,將每一個集群的特徵作分析,進而得知每一個群集顧客的消費型態,並針對其消費型態做出適合且有效的行銷方式,增加行銷成功率,也可以提高公司整體的競爭力與市場佔有率。
2. Flower Image
2.1 整理資料
# Load the flower file; it has no header row.
flower = read.csv(file = "data/flower.csv", header = FALSE)
# Convert the data frame to a matrix and report its dimensions.
flowerMatrix = as.matrix(flower)
dim(flowerMatrix)
[1] 50 50
# Flatten the matrix into a plain vector, so that every matrix cell becomes
# one observation, and report how many there are.
flowerVector = as.vector(flowerMatrix)
length(flowerVector)
[1] 2500
2.2 距離矩陣
# Pairwise Euclidean distances between all elements of flowerVector. Each
# observation is a single number, so this is the absolute difference.
distance = dist(flowerVector, method = "euclidean")
2.3 層級式集群分析
# Hierarchical clustering of the flower distances with the "ward.D"
# agglomeration method.
clusterIntensity = hclust(distance, method="ward.D")
2.4 樹狀圖
# Plot the dendrogram of the flower clustering
plot(clusterIntensity)
# Outline the 3-cluster cut in red
rect.hclust(clusterIntensity, k = 3, border = "red")

切割群組
# Cut the tree into 3 clusters (one label per element of flowerVector) and
# count the elements in each cluster.
flowerClusters = cutree(clusterIntensity, k = 3)
table(flowerClusters)
flowerClusters
1 2 3
1634 272 594
# flowerClusters
族群平均(畫素顏色深淺度)
# Average value of flowerVector within each of the three clusters.
tapply(flowerVector, INDEX = flowerClusters, FUN = mean)
1 2 3
0.08574 0.50826 0.93148
圖像比較
# Plot the original image next to its cluster labels.
# Give the label vector the same shape as the image matrix; the shape is
# taken from flowerMatrix rather than hard-coded as 50 x 50.
dim(flowerClusters) = dim(flowerMatrix)
par(mfrow = c(1, 2), mar = c(2, 2, 2, 2))
# Original image, drawn with 256 grey levels from black (0) to white (1).
image(
  flowerMatrix, axes = FALSE,
  col = grey(seq(0, 1, length.out = 256)), main = "Original"
)
# Cluster labels, drawn with image()'s default palette.
image(flowerClusters, axes = FALSE, main = "3 Cluster")

3. MRI Image
3.1 整理資料
# Load the healthy-scan file (no header row), convert it to a matrix and
# report its dimensions.
healthy = read.csv(file = "data/healthy.csv", header = FALSE)
healthyMatrix = as.matrix(healthy)
dim(healthyMatrix)
[1] 566 646
3.2 畫出圖形
# Draw the matrix as an image in 256 grey levels, with narrow margins.
par(mar = c(1, 1, 1, 1))
image(healthyMatrix, axes = FALSE, col = grey(seq(0, 1, length.out = 256)))

3.3 距離矩陣
# Flatten the image, then attempt the full pairwise distance computation.
# dist() stores one double per pair of observations. With
# n = 566 * 646 = 365,636 elements that is n * (n - 1) / 2, about 6.7e10
# values, so this call stops with a memory allocation error on this image.
healthyVector = as.vector(healthyMatrix)
distance = dist(healthyVector, method = "euclidean")
Error: cannot allocate vector of size 498.0 Gb
【Q】 What is the problem?
3.4 KMeans集群分析
# Run k-means on the flattened image with k = 5 clusters.
k = 5
# Fix the random seed before kmeans() so the run is reproducible.
set.seed(1)
# Allow up to 1000 iterations.
KMC = kmeans(healthyVector, centers = k, iter.max = 1000)
3.5 檢查分群結果
# View(KMC)
# Number of elements assigned to each cluster.
table(KMC[["cluster"]])
1 2 3 4 5
20556 101085 133162 31555 79278
# Centre (mean value) of each cluster.
KMC[["centers"]]
[,1]
1 0.48177
2 0.10619
3 0.01962
4 0.30943
5 0.18421
3.6 畫出分群結果
# Cluster label of every element of healthyVector.
X = KMC[["cluster"]]
# Give the labels the same shape as the source matrix so they can be drawn
# as an image.
dim(X) = dim(healthyMatrix)
# Draw the label matrix with one rainbow colour per cluster.
par(mar = c(1, 1, 1, 1))
image(X, axes = FALSE, col = rainbow(k))

3.7 讀進、轉換測試圖形
# Load the test image (no header row), convert it to a matrix and report
# its dimensions.
tumor = read.csv(file = "data/tumor.csv", header = FALSE)
tumorMatrix = as.matrix(tumor)
dim(tumorMatrix)
[1] 571 512
# Flatten the tumor matrix into a plain vector and report its length.
tumorVector = as.vector(tumorMatrix)
length(tumorVector)
[1] 292352
3.8 將原圖形之分群規則套用到測試圖形
# Apply clusters from before to new image, using the flexclust package
library(flexclust)
# Record the start time; the elapsed time is reported after prediction.
t0 = Sys.time()
KMC.kcca = flexclust::as.kcca(KMC, healthyVector) # convert the k-means fit into a kcca model
Found more than one class "kcca" in cache; using the first, from namespace 'kernlab'
Also defined by 'flexclust'
Found more than one class "kcca" in cache; using the first, from namespace 'kernlab'
Also defined by 'flexclust'
tumorClusters = predict(KMC.kcca, newdata = tumorVector) # predict: assign a cluster to each element of the new image
Found more than one class "kcca" in cache; using the first, from namespace 'kernlab'
Also defined by 'flexclust'
# Time elapsed since t0 was recorded.
Sys.time() - t0
Time difference of 33.71 secs
3.9 圖像比較
# Visualize the clusters of both images side by side.
# Give the predicted labels the same shape as the tumor matrix.
dim(tumorClusters) = dim(tumorMatrix)
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1))
image(X, axes = FALSE, col = rainbow(k), main = "Healthy")
# The tumor labels are transposed and their columns reversed before drawing.
# The column count comes from tumorMatrix instead of the literal 571, so
# the code follows the size of the test image.
image(
  t(tumorClusters)[, rev(seq_len(nrow(tumorMatrix)))],
  axes = FALSE, col = rainbow(k), main = "Tumor"
)

【學習重點】
- 集群分析在圖像處理的應用
- 單區隔變數的集群分析
- 集群分析模型
【問題討論】
層級式和K-Means集群分析有什麼差異? 它們分別用在什麼狀況?
- 階層式集群是以階層架構的方式反覆進行分裂或聚合,以產生最後的樹狀架構,可從樹狀圖取得任何想要的集群數,缺點是只適合小量資料。
- k-means集群,分割式分群法,分群方式是先將原始事物分為k個群體,計算某一資料點到集群中心之距離(或相適度),將其分配到最接近的群體,重新計算增加及減少資料點的集群中心,重複計算直至各資料點不必重新分配至其他集群為止。
- 層級式集群常用在群數未知且適用在小樣本上,目的為想知道資料能夠被分成幾群。
- k-means集群常用於群數已知且適用於大數量樣本上,主要是想知道觀察個體會分到哪一群。
集群分析模型和普通的集群分析有什麼差異?
- 集群分析的目的則是將觀察個體分類,個體間的相似性或相異性,主要是用個體間的距離來判斷,若個體間的距離越大,表示相異性越大,換句話說相似性越小。
- 集群分析模型則是將集群分析形式化,讓使用者更好觀察集群變化。
- 集群分析與集群分析模型的差異就在於是否能做後續的預測。
什麼時候需要建集群分析模型? 集群分析模型的用法?
- 其主要用法就是將複雜的資料區分為較小部分,讓每部分更容易解釋與簡化。在行銷上時常用於市場區隔!
圖像處理和圖像辨識有什麼差異?
