# Set every locale category (LC_ALL) to the "C" locale.
Sys.setlocale("LC_ALL","C")
[1] "C"
# Packages this notebook expects to be installed.
packages = c(
  "dplyr", "ggplot2", "d3heatmap", "googleVis", "devtools", "plotly",
  "xgboost", "magrittr", "caTools", "ROCR", "corrplot", "rpart",
  "rpart.plot", "doParallel", "caret", "glmnet", "Matrix", "e1071",
  "randomForest", "flexclust", "FactoMineR", "factoextra"
)

# Install only what is missing, in one call; skip the call entirely when
# nothing is missing.
missing_pkgs = setdiff(packages, installed.packages()[, "Package"])
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)

# Start from an empty workspace. `all.names` is spelled out in full (no
# partial argument matching) and TRUE is used instead of the reassignable T.
rm(list = ls(all.names = TRUE))
options(digits = 4, scipen = 12)

# Libraries attached up front for the analyses below.
library(dplyr)
library(ggplot2)
library(flexclust)
library(FactoMineR)
library(factoextra)

A. 集群分析與尺度縮減

A1. 批發交易資料
# Load the wholesale customer data and prepare it for clustering.
W = read.csv('data/wholesales.csv')

# Recode the Channel / Region codes as labelled factors ("Ch1", "Reg1", ...).
W$Channel = factor(paste0("Ch", W$Channel))
W$Region = factor(paste0("Reg", W$Region))

# Log10-transform all six spending columns (3 to 8). The source range must
# match the target range: supplying only four transformed columns for six
# targets makes R recycle them, overwriting columns 7-8 with copies of
# columns 3-4.
# NOTE(review): any summary output recorded before this fix shows columns
# 7-8 identical to columns 3-4; re-run to refresh it.
W[3:8] = lapply(W[3:8], log, base = 10)
summary(W)
 Channel    Region        Fresh            Milk         Grocery          Frozen     Detergents_Paper   Delicassen  
 Ch1:298   Reg1: 77   Min.   :0.477   Min.   :1.74   Min.   :0.477   Min.   :1.40   Min.   :0.477    Min.   :1.74  
 Ch2:142   Reg2: 47   1st Qu.:3.495   1st Qu.:3.19   1st Qu.:3.333   1st Qu.:2.87   1st Qu.:3.495    1st Qu.:3.19  
           Reg3:316   Median :3.930   Median :3.56   Median :3.677   Median :3.18   Median :3.930    Median :3.56  
                      Mean   :3.792   Mean   :3.53   Mean   :3.666   Mean   :3.17   Mean   :3.792    Mean   :3.53  
                      3rd Qu.:4.229   3rd Qu.:3.86   3rd Qu.:4.028   3rd Qu.:3.55   3rd Qu.:4.229    3rd Qu.:3.86  
                      Max.   :5.050   Max.   :4.87   Max.   :4.967   Max.   :4.78   Max.   :5.050    Max.   :4.87  
A2. 兩個區隔變數
# Hierarchical clustering on two variables (columns 3-4 of W):
# scale() standardises, dist() computes pairwise distances, and hclust()
# builds the tree from that distance matrix.
hc = W[, 3:4] %>%
  scale() %>%
  dist() %>%
  hclust()
plot(hc)
# k is a judgement call; there is no standard "best" number of clusters.
rect.hclust(hc, border = "red", k = 5)

# Label each row with its cluster (5 groups) and plot the two variables.
W$group = factor(cutree(hc, k = 5))
ggplot(data = W, mapping = aes(x = Fresh, y = Milk, col = group)) +
  geom_point(alpha = 0.5, size = 3) +
  theme_light()

A3. 六個區隔變數
# Hierarchical clustering on all six spending variables. Columns 3:8 are the
# six numeric columns of W; selecting 3:7 would silently drop the sixth one.
hc = W[, 3:8] %>%
  scale() %>%
  dist() %>%
  hclust()
plot(hc)

# Store the 8-cluster assignment on W, then outline the same 8 clusters on
# the dendrogram drawn above.
W$group = factor(cutree(hc, k = 8))
rect.hclust(hc, k = 8, border = "red")

library(FactoMineR)
library(factoextra)

# Same dendrogram as the base-graphics version, drawn with factoextra.
# It looks nicer but takes noticeably longer to render.
fviz_dend(
  hc,
  k = 8,
  show_labels = FALSE,
  labels_track_height = 0,
  palette = "ucscgb",
  rect = TRUE,
  rect_fill = TRUE,
  rect_border = "ucscgb"
)

A4. 尺度縮減

Dimension Reduction with PCA (Principal Component Analysis, 主成分分析)

# PCA on the six spending variables, shown as a biplot coloured by cluster.
# Cluster analysis is mainly a data-exploration tool; dimension reduction is
# mainly a visualisation tool. The biplot shows how the original variables
# relate to each other and lets them be read off the low-dimensional view.
W[, 3:8] %>%
  PCA(graph = FALSE) %>%
  fviz_pca_biplot(
    label = "var",
    col.ind = W$group,
    palette = "ucscgb",
    pointshape = 19,
    mean.point = FALSE,
    addEllipses = TRUE,
    ellipse.type = "convex",
    ellipse.level = 0.7,
    repel = TRUE
  )



1. Cluster Analysis for Movies

主要議題:依類型(Genre)對電影分類

學習重點:


1.1 整理資料
# Read the pipe-delimited MovieLens file; it has no header row.
M = read.table("data/movieLens.txt", header = FALSE, sep = "|", quote = "\"")

# Name the columns: four metadata fields around the title, then one column
# per genre.
names(M) = c(
  "ID", "Title", "ReleaseDate", "VideoReleaseDate", "IMDB",
  "Unknown", "Action", "Adventure", "Animation", "Childrens",
  "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir",
  "Horror", "Musical", "Mystery", "Romance", "SciFi", "Thriller",
  "War", "Western"
)

# Keep only the title and genre columns, then drop duplicated rows.
unused_cols = c("ID", "ReleaseDate", "VideoReleaseDate", "IMDB")
M = unique(M[setdiff(names(M), unused_cols)])
rm(unused_cols)
1.2 檢視資料
# Preview the first five rows of the cleaned movie table.
head(M, 5)
sum(M$Comedy)             # movies flagged as Comedy
[1] 502
sum(M$Western)            # movies flagged as Western
[1] 27
sum(M$Romance | M$Drama)  # movies flagged as Romance or Drama (or both)
[1] 863
1.3 距離矩陣
# Pairwise Euclidean distances between movies over the 19 genre columns
# (column 1 is the title). A full distance matrix stops being feasible at
# roughly ten to twenty thousand points; use k-means for data that large.
dmx = dist(M[2:20], method = "euclidean")

# Dimensions of the full square distance matrix.
dim(as.matrix(dmx))
[1] 1664 1664
1.4 層級式集群分析
# Hierarchical clustering of the movies from the genre distance matrix.
# Different linkage methods give different groupings, but the choice is not
# very important here. Ward-type linkage considers both the distance between
# clusters and the variance within each cluster.
# NOTE(review): per the stats::hclust documentation, "ward.D" applied to
# unsquared distances does not implement Ward's (1963) criterion, while
# "ward.D2" does — confirm which one is intended.
hclust1 = hclust(dmx, method = "ward.D")
1.5 檢視樹狀圖
# Draw the dendrogram and outline the 5-cluster cut in red.
plot(hclust1)
rect.hclust(hclust1, k=5, border="red")

1.6 切割群組
# Cut the tree into 5 groups; grp holds one group id per movie.
grp = cutree(hclust1, k = 5)
# Number of movies in each group.
table(grp)
grp
  1   2   3   4   5 
824 370 209 196  65 
1.7 檢查群組屬性
# We usually want each variable's mean per group; here it is the share of
# Action movies in each of the groups.
tapply(M$Action, grp, mean)
      1       2       3       4       5 
0.28641 0.00000 0.00000 0.06633 0.00000 
# Split the movies by group (grp comes from a 5-cluster cut) and compute the
# mean of the Romance flag within each group.
tapply(M$Romance, grp, mean)
      1       2       3       4       5 
0.05825 0.00000 0.00000 1.00000 0.00000 
1.8 The sapply-split-... Combo:
# Per-group genre profile: split() cuts the genre columns (2:20; column 1 is
# the title) into one data frame per group, colMeans() averages each column,
# and the resulting genre-by-group matrix is rounded to 3 decimals.
round(sapply(split(M[, 2:20], grp), colMeans), 3)
                1 2 3     4 5
Unknown     0.002 0 0 0.000 0
Action      0.286 0 0 0.066 0
Adventure   0.161 0 0 0.000 0
Animation   0.051 0 0 0.000 0
Childrens   0.146 0 0 0.000 0
Comedy      0.177 0 1 0.418 1
Crime       0.123 0 0 0.031 0
Documentary 0.061 0 0 0.000 0
Drama       0.238 1 0 0.434 1
Fantasy     0.027 0 0 0.000 0
FilmNoir    0.028 0 0 0.005 0
Horror      0.107 0 0 0.010 0
Musical     0.068 0 0 0.000 0
Mystery     0.073 0 0 0.000 0
Romance     0.058 0 0 1.000 0
SciFi       0.121 0 0 0.000 0
Thriller    0.279 0 0 0.092 0
War         0.086 0 0 0.000 0
Western     0.033 0 0 0.000 0
                                                     #想看每群的區別變數的平均,並視覺化
                                                     #round是設定小數點#colMeans把每個資料框的平均值算出來
1.9 資料視覺化
# Two stacked panels: group sizes on top (one third of the height) and the
# per-group genre means below (two thirds).
layout(matrix(c(1, 2, 2), nrow = 3, ncol = 1))

# Top panel: number of movies per group.
par(mar = c(2, 3, 1, 1), cex = 0.8)
barplot(table(grp), col = 3:7, names.arg = paste0("Group-", 1:5))

# Bottom panel: mean of every genre flag, one bar per group, side by side.
par(mar = c(6, 3, 2, 1))
barplot(
  t(sapply(split(M[, 2:20], grp), colMeans)),
  beside = TRUE, col = 3:7, las = 2
)

【問題討論】

從管理的角度來看,我們為甚麼要分群?

  • 目的是為了獲得新客戶、留存老客戶、提高客戶忠誠度
  • 企業若是不知道自己的會員分群,沒有區分過會員彼此之間的關係,以成本來說,有1000萬又要如何去分配,在管理上也就無所適從

我們為甚麼要做尺度縮減?

  • 尺度縮減主要做視覺化,呈現方便
  • 尺度縮減可看出尺度與尺度之間的關係、也可從低尺度看出原本的尺度,讓人比較好理解

我們要如何把集群分析的結果轉化為策略呢?

  • 使用集群分析,根據不同目標的特性將觀察值分成同質的組別,觀察其結果,可以用來區隔市場,並檢驗不同的市場策略。



2. Flower Image

2.1 整理資料
# Read the flower image. The CSV has no header row because it holds only a
# matrix of intensity values.
flower = read.csv("data/flower.csv", header=FALSE)
# Convert the data frame to a numeric matrix and check its dimensions.
flowerMatrix = as.matrix(flower)
dim(flowerMatrix)
[1] 50 50
# Flatten the matrix into a plain vector: one intensity value per pixel.
flowerVector = as.vector(flowerMatrix)
# The length equals rows times columns of the matrix.
length(flowerVector)
[1] 2500
2.2 距離矩陣
# Pairwise Euclidean distances between all pixel intensities.
distance = dist(flowerVector, method = "euclidean")
2.3 層級式集群分析
# Hierarchical clustering of the pixel intensities. Ward's method is a
# minimum-variance method that tries to find compact, spherical clusters.
# NOTE(review): per the stats::hclust documentation, "ward.D" applied to
# unsquared distances does not implement Ward's (1963) criterion, while
# "ward.D2" does — confirm which one is intended.
clusterIntensity = hclust(distance, method="ward.D")
2.4 樹狀圖
# Plot the dendrogram of the pixel clustering.
plot(clusterIntensity)
# Outline the 3-cluster cut in red.
rect.hclust(clusterIntensity, k = 3, border = "red")

切割群組
# Assign every pixel to one of 3 clusters.
flowerClusters = cutree(clusterIntensity, k = 3)
# Number of pixels in each cluster.
table(flowerClusters)
flowerClusters
   1    2    3 
1634  272  594 
# flowerClusters
族群平均(畫素顏色深淺度)
# Mean intensity of the pixels in each cluster.
tapply(flowerVector, flowerClusters, mean)
      1       2       3 
0.08574 0.50826 0.93148 
圖像比較
# Give the cluster vector a 50 x 50 shape so it can be drawn as an image.
dim(flowerClusters) = c(50, 50)

# Two panels side by side with equal margins.
par(mfrow = c(1, 2), mar = rep(2, 4))

# Left: the original image on a 256-level grey scale.
grey_levels = grey(seq(0, 1, length.out = 256))
image(flowerMatrix, axes = FALSE, col = grey_levels, main = "Original")
rm(grey_levels)

# Right: the cluster id of every pixel.
image(flowerClusters, axes = FALSE, main = "3 Cluster")



3. MRI Image

3.1 整理資料
# Read the healthy MRI image; the CSV has no header row.
healthy = read.csv("data/healthy.csv", header=FALSE)
# Convert to a matrix so that R treats the data as a grid of values.
healthyMatrix = as.matrix(healthy)
dim(healthyMatrix)
[1] 566 646
3.2 畫出圖形
# Draw the healthy image with thin, equal margins.
par(mar = rep(1, 4))
# 8-bit images have 256 levels, mapped here to greys running from 0 to 1.
image(healthyMatrix, axes = FALSE, col = grey(seq(0, 1, length.out = 256)))

3.3 距離矩陣
# Flatten the image into a vector of pixel intensities.
healthyVector = as.vector(healthyMatrix)
# Attempt the full pairwise distance computation. This step is expected to
# fail: in the recorded run it aborted with
# "cannot allocate vector of size 498.0 Gb", which is why k-means is used
# for this image instead of hierarchical clustering.
distance = dist(healthyVector, method = "euclidean")
Error: cannot allocate vector of size 498.0 Gb

【Q】 What is the problem?

  • 資料量太大,無法使用層級式集群分析
  • It was impossible for us to use hierarchical clustering because of the high resolution of our image.
3.4 KMeans集群分析
# Run k-means on the pixel intensities.
# The number of clusters depends on what you want to extract from the image.
k = 5
# Fix the random seed so the cluster assignment is reproducible.
set.seed(1)
KMC = kmeans(healthyVector, centers = k, iter.max = 1000)
3.5 檢查分群結果
# View(KMC)
# Number of pixels assigned to each cluster.
table(KMC$cluster)

     1      2      3      4      5 
 20556 101085 133162  31555  79278 
# Cluster centers (mean intensity per cluster). In the recorded run they are
# all below 0.5, i.e. fairly close to 0, which means the image is quite dark.
KMC$centers
     [,1]
1 0.48177
2 0.10619
3 0.01962
4 0.30943
5 0.18421
3.6 畫出分群結果
# Take the per-pixel cluster ids and give them the shape of the source image.
X = KMC$cluster
dim(X) = dim(healthyMatrix)

# Draw the segmentation, one rainbow colour per cluster.
par(mar = rep(1, 4))
image(X, axes = FALSE, col = rainbow(k))

3.7 讀進、轉換測試圖形
# Read the test (tumor) image the same way as the healthy one: no header
# row, converted to a matrix.
tumor = read.csv("data/tumor.csv", header=FALSE)
tumorMatrix = as.matrix(tumor)
dim(tumorMatrix)
[1] 571 512
# Flatten the tumor image into a vector of pixel intensities.
tumorVector = as.vector(tumorMatrix)
length(tumorVector)
[1] 292352
3.8 將原圖形之分群規則套用到測試圖形
# Apply the clusters learned on the healthy image to the new image, using
# the flexclust package. The elapsed time of the two steps is printed.
library(flexclust)
t0 = Sys.time()
KMC.kcca = flexclust::as.kcca(KMC, healthyVector)        # build the model
tumorClusters = predict(KMC.kcca, newdata = tumorVector) # assign clusters to the new pixels
Sys.time() - t0
Time difference of 26.31 secs
3.9 圖像比較
# Reshape the predicted cluster ids to the shape of the tumor image.
dim(tumorClusters) = dim(tumorMatrix)

# Healthy and tumor segmentations side by side, same colours per cluster.
par(mfrow = c(1, 2), mar = c(1, 1, 2, 1))
image(X, axes = FALSE, col = rainbow(k), main = "Healthy")

# The tumor clusters are transposed and their columns reversed before
# plotting. The column count after transposing equals the image's row count,
# so it is read from tumorMatrix instead of being hard-coded.
image(
  t(tumorClusters)[, nrow(tumorMatrix):1],
  axes = FALSE, col = rainbow(k), main = "Tumor"
)

【學習重點】
  • 集群分析在圖像處理的應用
  • 單區隔變數的集群分析
  • 集群分析模型
【問題討論】

層級式和K-Means集群分析有什麼差異? 它們分別用在什麼狀況?

  • K-Means集群分析必須事先決定集群數目(K個群組),而層級式不用。
  • 如果觀察值的個數較多或資料檔非常龐大(通常觀察值在200個以上),採用K-Means集群分析法較為適宜。如果觀察值樣本不大,則採用階層式集群分析法較為適宜。

集群分析模型和普通的集群分析有什麼差異?

  • 普通集群分析著重在分幾群上面(例如分三群)
  • 集群分析模型著重在建模,選擇什麼樣的參數會影響什麼結果

什麼時候需要建集群分析模型? 集群分析模型的用法?

  • 在有堆積如山資料的時候,希望利用自動或半自動的方式,發掘出隱藏在資料中的有用資訊。
  • 用法在於觀察參數調整後,模型的變化,分群的結果追求的不是模型的準度而是它的應用性,哪一個群集結果的產出最能讓人員解讀與進一步應用,才是適切的分群結果。因此如何快速的調整或找到適合的分群結果,才是重點。

圖像處理和圖像辨識有什麼差異?

  • 圖像處理是將圖像做一些強化或簡化的運算,凸顯出一些我們想要知道的特性,但是它的輸出資料還是一張圖像,譬如一張輪廓圖。
  • 圖像辨識就是必須有明確的文字或數字輸出,要明確到可以變成資料庫的資料,能提供搜尋、分析與比對之用的資訊。








