其他分類方法
data(iris)
#rpart
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
fit <- rpart(Species ~., data = iris)
table(predict(fit, iris, type= 'class'), iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
#glm
df <- iris[iris$Species != 'versicolor', ]
df$Species <- as.factor(as.character(df$Species))
levels(df$Species) <- c(0,1)
df$Species <- as.numeric(df$Species) - 1
fit <- glm(Species ~., data = df, binomial(link = "logit"))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#svm
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
fit <- svm(Species ~., data = iris)
table(predict(fit, iris, type= 'class'), iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
#naive bayes
library(e1071)
fit <- naiveBayes(Species ~., data = iris)
table(predict(fit, iris, type= 'class'), iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 3
## virginica 0 3 47
使用R 計算距離
?dist
## starting httpd help server ...
## done
x = c(0, 0, 1, 1, 1, 1)
y = c(1, 0, 1, 1, 0, 1)
#歐氏距離
sqrt(sum((x - y) ^2))
## [1] 1.414214
dist(rbind(x,y), method = "euclidean")
## x
## y 1.414214
dist(rbind(x,y), method = "minkowski", p=2)
## x
## y 1.414214
#曼哈頓距離
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method ="manhattan")
## x
## y 2
dist(rbind(x,y), method ="minkowski", p=1)
## x
## y 2
階層式分群
data(iris)
iris.dist <- dist(iris[,-5], method = "euclidean")
iris.cluster <- hclust(iris.dist, method="ward.D2")
plot(iris.cluster)

fit <- cutree(iris.cluster, k = 3)
table(fit)
## fit
## 1 2 3
## 50 64 36
par(mfrow=c(1,2))
plot(iris$Petal.Width, iris$Petal.Length, col=fit, main ="clustered iris")
plot(iris$Petal.Width, iris$Petal.Length, col=iris$Species, main ="original iris")

分群顧客資料
download.file('https://github.com/ywchiu/rtibame/raw/master/data/customer.csv', 'customer.csv')
customer <- read.csv('customer.csv', header=TRUE)
head(customer)
## ID Visit.Time Average.Expense Sex Age
## 1 1 3 5.7 0 10
## 2 2 5 14.5 0 27
## 3 3 16 33.5 0 32
## 4 4 5 15.9 0 30
## 5 5 16 24.9 0 23
## 6 6 3 12.0 0 15
customer <- scale(customer[,-1])
hc <- hclust(dist(customer, 'euclidean'), method="ward.D2")
par(mfrow=c(1,1))
#plot(hc)
plot(hc, hang = -0.01, cex=0.7)

hc2 = hclust(dist(customer), method="single")
plot(hc2, hang = -0.01, cex = 0.7)

分裂式分群
library(cluster)
dv<-diana(customer, metric='euclidean')
plot(dv)


使用cutree樹做分群
fit <- cutree(hc, k = 4)
fit
## [1] 1 1 2 1 2 1 2 2 1 1 1 2 2 1 1 1 2 1 2 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3
table(fit)
## fit
## 1 2 3 4
## 11 8 16 25
plot(hc, hang = -0.01)
rect.hclust(hc, k = 4, border ="red")

使用kmeans 對iris 分群
data(iris)
fit <- kmeans(iris[,-5], 3)
fit
## K-means clustering with 3 clusters of sizes 50, 62, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
par(mfrow=c(1,2))
plot(iris$Petal.Width, iris$Petal.Length, col=fit$cluster, main ="clustered iris")
plot(iris$Petal.Width, iris$Petal.Length, col=iris$Species, main ="original iris")

par(mfrow=c(1,1))
barplot(t(fit$centers), beside = TRUE,xlab="cluster", ylab="value")

plot(iris$Petal.Width, iris$Petal.Length, col=fit$cluster, main ="clustered iris")
points(fit$centers[,4], fit$centers[,3], col="orange", pch=19)

使用kmeans 對customer 分群
fit <- kmeans(customer, 4)
fit
## K-means clustering with 4 clusters of sizes 11, 25, 16, 8
##
## Cluster means:
## Visit.Time Average.Expense Sex Age
## 1 -0.7771737 -0.5178412 -1.4566845 -0.4774599
## 2 -0.6322632 -0.7299063 0.6750489 -0.6411604
## 3 0.8571173 0.9887331 0.6750489 1.0505015
## 4 1.3302016 1.0155226 -1.4566845 0.5591307
##
## Clustering vector:
## [1] 1 1 4 1 4 1 4 4 1 1 1 4 4 1 1 1 4 1 4 3 2 3 2 3 3 2 2 3 2 2 2 3 3 3 2
## [36] 2 3 2 2 2 2 2 2 2 3 3 2 2 2 3 2 3 3 2 2 2 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 11.97454 20.89159 22.58236 5.90040
## (between_SS / total_SS = 74.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
par(mfrow=c(1,1))
barplot(t(fit$centers), beside = TRUE,xlab="cluster", ylab="value")

plot(customer[,3], customer[,1], col=fit$cluster)

使用silhouette 評估分群效果
set.seed(22)
km <- kmeans(customer, 4)
kms <- silhouette(km$cluster, dist(customer))
summary(kms)
## Silhouette of 60 units in 4 clusters from silhouette.default(x = km$cluster, dist = dist(customer)) :
## Cluster sizes and average silhouette widths:
## 8 11 16 25
## 0.5464597 0.4080823 0.3794910 0.5164434
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1931 0.4030 0.4890 0.4641 0.5422 0.6333
plot(kms)

評估各個數量的分群效果(WSS)
nk <- 2:10
set.seed(22)
WSS <- sapply(nk, function(k){
kmeans(customer, k)$tot.withinss
})
plot(x= nk,y= WSS, type='l', xlab = 'number of k', ylab = 'WSS')
abline(v = 4, col="red")

評估各個數量的分群效果(Silhoutte)
library(fpc)
## Warning: package 'fpc' was built under R version 3.2.5
SW = sapply(nk, function(k) {
cluster.stats(dist(customer), kmeans(customer, centers=k)$cluster)$avg.silwidth
})
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")
abline(v = 4, col="red")

比較不同分群演算法
# single linkage
single_c = hclust(dist(customer), method="single")
hc_single = cutree(single_c, k = 4)
# complete linkage
complete_c = hclust(dist(customer), method="complete")
hc_complete = cutree(complete_c, k = 4)
# kmeans
set.seed(22)
km = kmeans(customer, 4)
cs = cluster.stats(dist(customer), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 61.3489
##
## $avg.silwidth
## [1] 0.4640587
sapply(list(kmeans = km$cluster,
hc_single = hc_single,
hc_complete = hc_complete),
function(c) cluster.stats(dist(customer), c)[c("within.cluster.ss","avg.silwidth")])
## kmeans hc_single hc_complete
## within.cluster.ss 61.3489 136.0092 65.94076
## avg.silwidth 0.4640587 0.2481926 0.4255961
使用kmeans 在 iris 分析上
data(iris)
data <- iris[,-5]
class <- iris[, 5]
results <- kmeans(data,3)
results
## K-means clustering with 3 clusters of sizes 62, 38, 50
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.901613 2.748387 4.393548 1.433871
## 2 6.850000 3.073684 5.742105 2.071053
## 3 5.006000 3.428000 1.462000 0.246000
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 39.82097 23.87947 15.15100
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
results$size
## [1] 62 38 50
results$cluster
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
table(class,results$cluster)
##
## class 1 2 3
## setosa 0 0 50
## versicolor 48 2 0
## virginica 14 36 0
par(mfrow = c(2, 2))
plot(data$Petal.Length,data$Petal.Width,col=results$cluster, main = 'clustered(Petal)')
plot(data$Petal.Length,data$Petal.Width,col=class, main = 'real(Petal)')
plot(data$Sepal.Length, data$Sepal.Width,col=results$cluster, main = 'clustered(Sepal)')
plot(data$Sepal.Length, data$Sepal.Width,col=class, main = 'real(Sepal)')

文字分析
#載入 tm 套件
#install.packages('tm')
library(tm)
## Warning: package 'tm' was built under R version 3.2.5
## Loading required package: NLP
#載入 SnowballC 套件
#install.packages('SnowballC')
library(SnowballC)
#讀取資料
download.file('https://github.com/ywchiu/rtibame/raw/master/data/news.RData', destfile = 'news.RData')
load('news.RData')
#將資料轉換成Corpus
doc.vec <- VectorSource(news)
doc.corpus <- Corpus(doc.vec)
# 移除標點符號
doc.corpus <- tm_map(doc.corpus , removePunctuation)
#移除英文停用字
doc.corpus <- tm_map(doc.corpus , removeWords, stopwords("english"))
# 還原字根
doc.corpus <- tm_map(doc.corpus , stemDocument)
dtm <- DocumentTermMatrix(doc.corpus)
inspect(dtm[1:3,1:10] )
## <<DocumentTermMatrix (documents: 3, terms: 10)>>
## Non-/sparse entries: 5/25
## Sparsity : 83%
## Maximal term length: 9
## Weighting : term frequency (tf)
##
## Terms
## Docs 2010 adviser agent anoth antisemit birther brexit britain brought cav
## 1 1 0 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 1 1 0 0
# Use kmeans to cluster document
set.seed(123)
fit = kmeans(dist(dtm), 3)
fit
## K-means clustering with 3 clusters of sizes 8, 8, 8
##
## Cluster means:
## 1 2 3 4 5 6 7 8
## 1 2.855136 3.551086 3.822600 3.689352 3.256895 3.789171 3.078929 3.886756
## 2 3.736639 2.324264 2.469012 2.376606 2.031617 2.552871 3.736639 3.457711
## 3 3.916230 3.252011 3.547263 3.402887 2.926557 3.547263 3.948523 2.856986
## 9 10 11 12 13 14 15 16
## 1 3.654658 3.951341 3.654658 4.314515 2.741408 3.951341 2.671647 3.137671
## 2 3.153742 3.457711 2.462619 3.868477 3.599916 2.639420 3.457711 3.736639
## 3 2.620523 2.856986 3.402887 3.009535 3.819501 3.685923 3.685923 3.881535
## 17 18 19 20 21 22 23 24
## 1 3.148816 3.178114 3.689352 3.135821 3.689352 3.822600 3.822600 3.951341
## 2 3.599916 3.868477 3.153742 3.668516 2.462619 3.309292 3.309292 3.457711
## 3 3.819501 4.073427 2.703986 3.948523 3.402887 2.784304 2.822032 2.936445
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 1 2 2 2 2 2 1 3 3 3 2 3 1 2 1 1 1 1 3 1 2 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 104.00933 62.40691 80.22489
## (between_SS / total_SS = 33.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
news[fit$cluster == 3]
## [1] "Donald Trump's full CNN interview with Jake Tapper "
## [2] "Donald Trump 'would love to' talk about birtherism"
## [3] "Donald Trump corrects Zionist Israel, but lets hibby "
## [4] "Former Trump Adviser Corey Lewandowski Just Defended Trump's"
## [5] "Poll: Clinton extends lead over Trump"
## [6] "Trump defends criticism of judge with Mexican heritage"
## [7] "Trump looks to unlikely speakers for convention help"
## [8] "Trump tweet evoking anti-Semitic imagery first posted on"
findAssocs(dtm, 'trump', 0.3)
## $trump
## adviser corey defend former just lewandowski
## 0.60 0.60 0.60 0.60 0.60 0.60
## donald
## 0.42
Random Forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(C50)
## Warning: package 'C50' was built under R version 3.2.5
data(churn)
churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset <- churnTrain[ind == 2,]
dim(trainset)
## [1] 2315 17
dim(testset)
## [1] 1018 17
churn.rf = randomForest(churn ~ ., data = trainset, importance = T)
cm <-table(predict(churn.rf, testset, type= 'class'), testset$churn)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## The following object is masked from 'package:NLP':
##
## annotate
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## yes no
## yes 111 7
## no 30 870
##
## Accuracy : 0.9637
## 95% CI : (0.9502, 0.9743)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8365
## Mcnemar's Test P-Value : 0.0002983
##
## Sensitivity : 0.7872
## Specificity : 0.9920
## Pos Pred Value : 0.9407
## Neg Pred Value : 0.9667
## Prevalence : 0.1385
## Detection Rate : 0.1090
## Detection Prevalence : 0.1159
## Balanced Accuracy : 0.8896
##
## 'Positive' Class : yes
##
par(mfrow=c(1,1))
plot(churn.rf)

importance(churn.rf)
## yes no MeanDecreaseAccuracy
## international_plan 68.9592890 54.118994 72.190204
## voice_mail_plan 18.8899994 15.832400 19.607844
## number_vmail_messages 21.3080062 16.262770 22.068514
## total_day_minutes 28.3237379 30.323756 39.961077
## total_day_calls 0.6325725 -1.131930 -0.802642
## total_day_charge 28.4798708 28.146414 35.858906
## total_eve_minutes 18.5242988 20.572464 24.484322
## total_eve_calls -3.3431379 -2.301767 -3.495801
## total_eve_charge 20.4379809 20.619705 24.489771
## total_night_minutes 0.9451961 16.105720 16.694651
## total_night_calls -0.3497164 2.202619 1.869193
## total_night_charge 0.1110824 15.977083 16.593633
## total_intl_minutes 17.3951655 20.063485 24.967698
## total_intl_calls 37.3613313 23.415764 35.497785
## total_intl_charge 16.7925666 19.636891 24.498369
## number_customer_service_calls 79.7530696 59.731615 85.221845
## MeanDecreaseGini
## international_plan 50.35584
## voice_mail_plan 10.44601
## number_vmail_messages 19.05619
## total_day_minutes 79.91474
## total_day_calls 20.80946
## total_day_charge 77.84837
## total_eve_minutes 42.99373
## total_eve_calls 17.45619
## total_eve_charge 44.02855
## total_night_minutes 22.93663
## total_night_calls 19.94091
## total_night_charge 22.22769
## total_intl_minutes 26.05059
## total_intl_calls 33.03289
## total_intl_charge 26.60077
## number_customer_service_calls 67.29635
varImpPlot(churn.rf)
