其他分類方法

data(iris)

#rpart 
library(rpart)

## Warning: package 'rpart' was built under R version 3.2.5

fit <- rpart(Species ~., data = iris)
table(predict(fit, iris, type= 'class'), iris$Species)

##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45

#glm 
df <- iris[iris$Species != 'versicolor', ]
df$Species <- as.factor(as.character(df$Species))
levels(df$Species) <- c(0,1)
df$Species <- as.numeric(df$Species) - 1
fit <- glm(Species ~., data = df, binomial(link = "logit"))

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

#svm
library(e1071)

## Warning: package 'e1071' was built under R version 3.2.5

fit <- svm(Species ~., data = iris)
table(predict(fit, iris, type= 'class'), iris$Species)

##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48

#naive bayes
library(e1071)
fit <- naiveBayes(Species ~., data = iris)
table(predict(fit, iris, type= 'class'), iris$Species)

##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          3        47

使用R 計算距離

?dist

## starting httpd help server ...

##  done

x = c(0, 0, 1, 1, 1, 1)
y = c(1, 0, 1, 1, 0, 1)


#歐氏距離

sqrt(sum((x - y) ^2))

## [1] 1.414214

dist(rbind(x,y), method = "euclidean")

##          x
## y 1.414214

dist(rbind(x,y), method = "minkowski", p=2)

##          x
## y 1.414214

#曼哈頓距離
sum(abs(x - y))

## [1] 2

dist(rbind(x,y), method ="manhattan")

##   x
## y 2

dist(rbind(x,y), method ="minkowski", p=1)

##   x
## y 2

階層式分群

data(iris)
iris.dist <- dist(iris[,-5], method = "euclidean")

iris.cluster <- hclust(iris.dist, method="ward.D2")

plot(iris.cluster)

fit <- cutree(iris.cluster, k = 3)
table(fit)

## fit
##  1  2  3 
## 50 64 36

par(mfrow=c(1,2))
plot(iris$Petal.Width, iris$Petal.Length, col=fit, main ="clustered iris")

plot(iris$Petal.Width, iris$Petal.Length, col=iris$Species, main ="original iris")

分群顧客資料

download.file('https://github.com/ywchiu/rtibame/raw/master/data/customer.csv', 'customer.csv')
customer <- read.csv('customer.csv', header=TRUE)
head(customer)

##   ID Visit.Time Average.Expense Sex Age
## 1  1          3             5.7   0  10
## 2  2          5            14.5   0  27
## 3  3         16            33.5   0  32
## 4  4          5            15.9   0  30
## 5  5         16            24.9   0  23
## 6  6          3            12.0   0  15

customer <- scale(customer[,-1])
hc <- hclust(dist(customer, 'euclidean'), method="ward.D2")

par(mfrow=c(1,1))
#plot(hc)
plot(hc, hang = -0.01, cex=0.7)

hc2 = hclust(dist(customer), method="single")
plot(hc2, hang = -0.01, cex = 0.7)

分裂式分群

library(cluster)
dv<-diana(customer, metric='euclidean')
plot(dv)

使用cutree樹做分群

fit <- cutree(hc, k = 4)
fit

##  [1] 1 1 2 1 2 1 2 2 1 1 1 2 2 1 1 1 2 1 2 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3

table(fit)

## fit
##  1  2  3  4 
## 11  8 16 25

plot(hc, hang = -0.01)
rect.hclust(hc, k = 4, border ="red")

使用kmeans 對iris 分群

data(iris)
fit <- kmeans(iris[,-5], 3)
fit

## K-means clustering with 3 clusters of sizes 50, 62, 38
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

par(mfrow=c(1,2))
plot(iris$Petal.Width, iris$Petal.Length, col=fit$cluster, main ="clustered iris")

plot(iris$Petal.Width, iris$Petal.Length, col=iris$Species, main ="original iris")

par(mfrow=c(1,1))
barplot(t(fit$centers), beside = TRUE,xlab="cluster", ylab="value")

plot(iris$Petal.Width, iris$Petal.Length, col=fit$cluster, main ="clustered iris")
points(fit$centers[,4], fit$centers[,3], col="orange", pch=19)

使用kmeans 對customer 分群

fit <- kmeans(customer, 4)
fit

## K-means clustering with 4 clusters of sizes 11, 25, 16, 8
## 
## Cluster means:
##   Visit.Time Average.Expense        Sex        Age
## 1 -0.7771737      -0.5178412 -1.4566845 -0.4774599
## 2 -0.6322632      -0.7299063  0.6750489 -0.6411604
## 3  0.8571173       0.9887331  0.6750489  1.0505015
## 4  1.3302016       1.0155226 -1.4566845  0.5591307
## 
## Clustering vector:
##  [1] 1 1 4 1 4 1 4 4 1 1 1 4 4 1 1 1 4 1 4 3 2 3 2 3 3 2 2 3 2 2 2 3 3 3 2
## [36] 2 3 2 2 2 2 2 2 2 3 3 2 2 2 3 2 3 3 2 2 2 3 2 2 3
## 
## Within cluster sum of squares by cluster:
## [1] 11.97454 20.89159 22.58236  5.90040
##  (between_SS / total_SS =  74.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

par(mfrow=c(1,1))
barplot(t(fit$centers), beside = TRUE,xlab="cluster", ylab="value")

plot(customer[,3], customer[,1], col=fit$cluster)

使用silhouette 評估分群效果

set.seed(22)
km <- kmeans(customer, 4)
kms <- silhouette(km$cluster, dist(customer))
summary(kms)

## Silhouette of 60 units in 4 clusters from silhouette.default(x = km$cluster, dist = dist(customer)) :
##  Cluster sizes and average silhouette widths:
##         8        11        16        25 
## 0.5464597 0.4080823 0.3794910 0.5164434 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1931  0.4030  0.4890  0.4641  0.5422  0.6333

plot(kms)

評估各個數量的分群效果(WSS)

nk <- 2:10
set.seed(22)
WSS <- sapply(nk,  function(k){
  kmeans(customer, k)$tot.withinss
})
plot(x= nk,y= WSS, type='l', xlab = 'number of k', ylab = 'WSS')
abline(v = 4, col="red")

評估各個數量的分群效果(Silhoutte)

library(fpc)

## Warning: package 'fpc' was built under R version 3.2.5

SW = sapply(nk, function(k) {
  cluster.stats(dist(customer), kmeans(customer, centers=k)$cluster)$avg.silwidth
})
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")
abline(v = 4, col="red")

比較不同分群演算法

# single linkage
single_c =  hclust(dist(customer), method="single")
hc_single = cutree(single_c, k = 4)

# complete linkage
complete_c =  hclust(dist(customer), method="complete")
hc_complete =  cutree(complete_c, k = 4)

# kmeans
set.seed(22)
km = kmeans(customer, 4)

cs = cluster.stats(dist(customer), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]

## $within.cluster.ss
## [1] 61.3489
## 
## $avg.silwidth
## [1] 0.4640587

sapply(list(kmeans = km$cluster, 
            hc_single = hc_single, 
            hc_complete = hc_complete), 
       function(c) cluster.stats(dist(customer), c)[c("within.cluster.ss","avg.silwidth")])

##                   kmeans    hc_single hc_complete
## within.cluster.ss 61.3489   136.0092  65.94076   
## avg.silwidth      0.4640587 0.2481926 0.4255961

使用kmeans 在 iris 分析上

data(iris)
data  <- iris[,-5]
class <- iris[, 5]

results <- kmeans(data,3)
results

## K-means clustering with 3 clusters of sizes 62, 38, 50
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.901613    2.748387     4.393548    1.433871
## 2     6.850000    3.073684     5.742105    2.071053
## 3     5.006000    3.428000     1.462000    0.246000
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 39.82097 23.87947 15.15100
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

results$size

## [1] 62 38 50

results$cluster

##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1

table(class,results$cluster)

##             
## class         1  2  3
##   setosa      0  0 50
##   versicolor 48  2  0
##   virginica  14 36  0

par(mfrow = c(2, 2))

plot(data$Petal.Length,data$Petal.Width,col=results$cluster, main = 'clustered(Petal)')
plot(data$Petal.Length,data$Petal.Width,col=class, main = 'real(Petal)')
plot(data$Sepal.Length, data$Sepal.Width,col=results$cluster, main = 'clustered(Sepal)')
plot(data$Sepal.Length, data$Sepal.Width,col=class, main = 'real(Sepal)')

文字分析

#載入 tm 套件
#install.packages('tm')
library(tm)

## Warning: package 'tm' was built under R version 3.2.5

## Loading required package: NLP

#載入 SnowballC 套件
#install.packages('SnowballC')
library(SnowballC)

#讀取資料
download.file('https://github.com/ywchiu/rtibame/raw/master/data/news.RData', destfile = 'news.RData')
load('news.RData')

#將資料轉換成Corpus
doc.vec <- VectorSource(news)
doc.corpus  <- Corpus(doc.vec)

# 移除標點符號
doc.corpus  <- tm_map(doc.corpus , removePunctuation)
#移除英文停用字
doc.corpus  <- tm_map(doc.corpus , removeWords, stopwords("english"))
# 還原字根
doc.corpus  <- tm_map(doc.corpus , stemDocument)

dtm <- DocumentTermMatrix(doc.corpus) 
inspect(dtm[1:3,1:10] )

## <<DocumentTermMatrix (documents: 3, terms: 10)>>
## Non-/sparse entries: 5/25
## Sparsity           : 83%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 2010 adviser agent anoth antisemit birther brexit britain brought cav
##    1    1       0     1     0         0       0      0       0       0   0
##    2    0       0     0     0         0       0      1       0       0   0
##    3    0       0     0     0         0       0      1       1       0   0

# Use kmeans to cluster document
set.seed(123)
fit = kmeans(dist(dtm), 3)
fit

## K-means clustering with 3 clusters of sizes 8, 8, 8
## 
## Cluster means:
##          1        2        3        4        5        6        7        8
## 1 2.855136 3.551086 3.822600 3.689352 3.256895 3.789171 3.078929 3.886756
## 2 3.736639 2.324264 2.469012 2.376606 2.031617 2.552871 3.736639 3.457711
## 3 3.916230 3.252011 3.547263 3.402887 2.926557 3.547263 3.948523 2.856986
##          9       10       11       12       13       14       15       16
## 1 3.654658 3.951341 3.654658 4.314515 2.741408 3.951341 2.671647 3.137671
## 2 3.153742 3.457711 2.462619 3.868477 3.599916 2.639420 3.457711 3.736639
## 3 2.620523 2.856986 3.402887 3.009535 3.819501 3.685923 3.685923 3.881535
##         17       18       19       20       21       22       23       24
## 1 3.148816 3.178114 3.689352 3.135821 3.689352 3.822600 3.822600 3.951341
## 2 3.599916 3.868477 3.153742 3.668516 2.462619 3.309292 3.309292 3.457711
## 3 3.819501 4.073427 2.703986 3.948523 3.402887 2.784304 2.822032 2.936445
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
##  1  2  2  2  2  2  1  3  3  3  2  3  1  2  1  1  1  1  3  1  2  3  3  3 
## 
## Within cluster sum of squares by cluster:
## [1] 104.00933  62.40691  80.22489
##  (between_SS / total_SS =  33.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

news[fit$cluster == 3]

## [1] "Donald Trump's full CNN interview with Jake Tapper "         
## [2] "Donald Trump 'would love to' talk about birtherism"          
## [3] "Donald Trump corrects Zionist Israel, but lets hibby  "      
## [4] "Former Trump Adviser Corey Lewandowski Just Defended Trump's"
## [5] "Poll: Clinton extends lead over Trump"                       
## [6] "Trump defends criticism of judge with Mexican heritage"      
## [7] "Trump looks to unlikely speakers for convention help"        
## [8] "Trump tweet evoking anti-Semitic imagery first posted on"

findAssocs(dtm, 'trump', 0.3)

## $trump
##     adviser       corey      defend      former        just lewandowski 
##        0.60        0.60        0.60        0.60        0.60        0.60 
##      donald 
##        0.42

Random Forest

library(randomForest)

## Warning: package 'randomForest' was built under R version 3.2.5

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

library(C50)

## Warning: package 'C50' was built under R version 3.2.5

data(churn)
churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]

set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset  <- churnTrain[ind == 2,]

dim(trainset)

## [1] 2315   17

dim(testset)

## [1] 1018   17

churn.rf = randomForest(churn ~ ., data = trainset, importance = T)

cm <-table(predict(churn.rf, testset, type= 'class'), testset$churn)

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## The following object is masked from 'package:NLP':
## 
##     annotate

confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##      
##       yes  no
##   yes 111   7
##   no   30 870
##                                           
##                Accuracy : 0.9637          
##                  95% CI : (0.9502, 0.9743)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8365          
##  Mcnemar's Test P-Value : 0.0002983       
##                                           
##             Sensitivity : 0.7872          
##             Specificity : 0.9920          
##          Pos Pred Value : 0.9407          
##          Neg Pred Value : 0.9667          
##              Prevalence : 0.1385          
##          Detection Rate : 0.1090          
##    Detection Prevalence : 0.1159          
##       Balanced Accuracy : 0.8896          
##                                           
##        'Positive' Class : yes             
##

par(mfrow=c(1,1))
plot(churn.rf)

importance(churn.rf)

##                                      yes        no MeanDecreaseAccuracy
## international_plan            68.9592890 54.118994            72.190204
## voice_mail_plan               18.8899994 15.832400            19.607844
## number_vmail_messages         21.3080062 16.262770            22.068514
## total_day_minutes             28.3237379 30.323756            39.961077
## total_day_calls                0.6325725 -1.131930            -0.802642
## total_day_charge              28.4798708 28.146414            35.858906
## total_eve_minutes             18.5242988 20.572464            24.484322
## total_eve_calls               -3.3431379 -2.301767            -3.495801
## total_eve_charge              20.4379809 20.619705            24.489771
## total_night_minutes            0.9451961 16.105720            16.694651
## total_night_calls             -0.3497164  2.202619             1.869193
## total_night_charge             0.1110824 15.977083            16.593633
## total_intl_minutes            17.3951655 20.063485            24.967698
## total_intl_calls              37.3613313 23.415764            35.497785
## total_intl_charge             16.7925666 19.636891            24.498369
## number_customer_service_calls 79.7530696 59.731615            85.221845
##                               MeanDecreaseGini
## international_plan                    50.35584
## voice_mail_plan                       10.44601
## number_vmail_messages                 19.05619
## total_day_minutes                     79.91474
## total_day_calls                       20.80946
## total_day_charge                      77.84837
## total_eve_minutes                     42.99373
## total_eve_calls                       17.45619
## total_eve_charge                      44.02855
## total_night_minutes                   22.93663
## total_night_calls                     19.94091
## total_night_charge                    22.22769
## total_intl_minutes                    26.05059
## total_intl_calls                      33.03289
## total_intl_charge                     26.60077
## number_customer_service_calls         67.29635

varImpPlot(churn.rf)

Demo20161023

David Chiu

2016年10月23日

其他分類方法

使用R 計算距離

階層式分群

分群顧客資料

分裂式分群

使用cutree樹做分群

使用kmeans 對iris 分群

使用kmeans 對customer 分群

使用silhouette 評估分群效果

評估各個數量的分群效果(WSS)

評估各個數量的分群效果(Silhoutte)

比較不同分群演算法

使用kmeans 在 iris 分析上

文字分析

Random Forest