Supplement: Random Forest

library(C50)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library('caret')
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library('e1071')
data(churn)

names(churnTrain) %in% c("state", "area_code", "account_length")
##  [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
!names(churnTrain) %in% c("state", "area_code", "account_length")
##  [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
#select the modeling variables
variable.list = !names(churnTrain) %in% c('state','area_code','account_length')
churnTrain=churnTrain[,variable.list]
churnTest=churnTest[,variable.list]


rf_model = randomForest(formula=churn ~ .,data=churnTrain)
#find best ntree
plot(rf_model)
legend("topright",colnames(rf_model$err.rate),col=1:3,cex=0.8,fill=1:3)

#find best mtry
tuneRF(churnTrain[,-17],churnTrain[,17])
## mtry = 4  OOB error = 4.41% 
## Searching left ...
## mtry = 2     OOB error = 5.67% 
## -0.2857143 0.05 
## Searching right ...
## mtry = 8     OOB error = 4.59% 
## -0.04081633 0.05

##       mtry   OOBError
## 2.OOB    2 0.05670567
## 4.OOB    4 0.04410441
## 8.OOB    8 0.04590459
# rf_model <- randomForest(churn ~., data = churnTrain, ntree=50,mtry=4)
# confusionMatrix(table(predict(rf_model,churnTest),churnTest$churn))
# 
# rf.predict.prob <- predict(rf_model, churnTest, type="prob")
# rf.prediction <- prediction(rf.predict.prob[,1], as.factor(churnTest$churn))
# rf.auc <- performance(rf.prediction, measure = "auc", x.measure = "cutoff")
# rf.performance <- performance(rf.prediction, "tpr","fpr")
# plot(rf.performance)
# 
# #compare CART and Random Forest
# rf_model = train(churn~.,data=churnTrain,method='rf',trControl=trainControl(method="repeatedcv", number=10, repeats=3,classProbs = TRUE,summaryFunction = prSummary))
# rf_prob_yes = predict(rf_model,churnTest,type='prob')[,1]
# rf_pred.rocr = prediction(rf_prob_yes,churnTest$churn)
# rf_perf.rocr = performance(rf_pred.rocr,measure = 'tpr',x.measure = 'fpr')
# 
# control=trainControl(method="repeatedcv", number=10, repeats=3,classProbs = TRUE,summaryFunction = prSummary)
# tune_funs = expand.grid(cp=seq(0.01,0.1,0.01))
# rpart_model =train(churn~., data=churnTrain, method="rpart", trControl=control,tuneGrid=tune_funs)
# 
# rpart_prob_yes = predict(rpart_model,churnTest,type='prob')[,1]
# rpart_pred.rocr = prediction(rpart_prob_yes,churnTest$churn)
# rpart_perf.rocr = performance(rpart_pred.rocr,measure = 'tpr',x.measure = 'fpr')
# 
# plot(rpart_perf.rocr,col='red')
# plot(rf_perf.rocr,col='black',add=T)
# legend(0.7, 0.2, c('randomforest','rpart'), 1:2)
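
As a further supplement (not run in the original notes), the fitted forest also reports variable importance, which helps show which predictors drive churn:

#mean decrease in Gini impurity for each predictor of the rf_model fitted above
importance(rf_model)
varImpPlot(rf_model)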

Clustering

Distance computation

x =c(0, 0, 1, 1, 1, 1)
y =c(1, 0, 1, 1, 0, 1)

#euclidean
?dist
rbind(x,y)
##   [,1] [,2] [,3] [,4] [,5] [,6]
## x    0    0    1    1    1    1
## y    1    0    1    1    0    1
dist(rbind(x,y), method ="euclidean")
##          x
## y 1.414214
sqrt(sum((x-y)^2))
## [1] 1.414214
dist(rbind(x,y), method ="minkowski", p=2)
##          x
## y 1.414214
#city block
dist(rbind(x,y), method ="manhattan")
##   x
## y 2
sum(abs(x-y))
## [1] 2
dist(rbind(x,y), method ="minkowski", p=1)
##   x
## y 2
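
Since x and y are 0/1 vectors, dist() also provides an asymmetric binary (Jaccard-style) distance; a small supplementary sketch:

#binary distance: among positions where at least one vector is 1,
#the proportion where the two vectors disagree
dist(rbind(x,y), method ="binary")
sum(xor(x,y))/sum(x|y)   #manual check: 2/5 = 0.4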

Hierarchical Clustering

Agglomerative (bottom-up)

setwd('~/lecture/riii')
customer=read.csv('data/customer.csv',header=TRUE)
head(customer)
##   ID Visit.Time Average.Expense Sex Age
## 1  1          3             5.7   0  10
## 2  2          5            14.5   0  27
## 3  3         16            33.5   0  32
## 4  4          5            15.9   0  30
## 5  5         16            24.9   0  23
## 6  6          3            12.0   0  15
str(customer)
## 'data.frame':    60 obs. of  5 variables:
##  $ ID             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Visit.Time     : int  3 5 16 5 16 3 12 14 6 3 ...
##  $ Average.Expense: num  5.7 14.5 33.5 15.9 24.9 12 28.5 18.8 23.8 5.3 ...
##  $ Sex            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age            : int  10 27 32 30 23 15 33 27 16 11 ...
#standardize the numeric variables
customer_s =scale(customer[,-1])
?scale

#after standardization each variable has mean 0 and standard deviation 1
round(mean(customer_s[,2]),3)
## [1] 0
round(sd(customer_s[,2]),3)
## [1] 1
?hclust
hc=hclust(dist(customer_s, method="euclidean"), method="ward.D2")
plot(hc,hang =-0.01, cex=0.7)

hc3 =hclust(dist(customer, method="euclidean"), method="single")
plot(hc3, hang =-0.01, cex=0.8)

cutree

fit =cutree(hc, k =4)
fit
##  [1] 1 1 2 1 2 1 2 2 1 1 1 2 2 1 1 1 2 1 2 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3
table(fit)
## fit
##  1  2  3  4 
## 11  8 16 25
plot(hc, hang =-0.01, cex=0.7)
rect.hclust(hc, k =4, border="red")
rect.hclust(hc, k =3, border="blue")

c_1 = customer[fit == 1,]
summary(c_1)
##        ID           Visit.Time    Average.Expense      Sex         Age    
##  Min.   : 1.000   Min.   :3.000   Min.   : 4.60   Min.   :0   Min.   : 9  
##  1st Qu.: 5.000   1st Qu.:3.500   1st Qu.: 7.15   1st Qu.:0   1st Qu.:12  
##  Median :10.000   Median :5.000   Median :14.50   Median :0   Median :16  
##  Mean   : 9.636   Mean   :4.909   Mean   :12.71   Mean   :0   Mean   :17  
##  3rd Qu.:14.500   3rd Qu.:6.000   3rd Qu.:16.00   3rd Qu.:0   3rd Qu.:20  
##  Max.   :18.000   Max.   :8.000   Max.   :23.80   Max.   :0   Max.   :30
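
Instead of summarising one cluster at a time, all clusters can be profiled together; a supplementary sketch using the cutree result fit from above:

#mean of each numeric variable per hierarchical cluster (ID column dropped)
aggregate(customer[,-1], by=list(cluster=fit), FUN=mean)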

Divisive hierarchical (top-down)

#install.packages('cluster')
library(cluster)
?diana
dv =diana(customer_s, metric ="euclidean")
summary(dv)
## Merge:
##       [,1] [,2]
##  [1,]  -24  -50
##  [2,]  -28  -46
##  [3,]   -7  -13
##  [4,]  -30  -35
##  [5,]  -21  -40
##  [6,]  -54  -58
##  [7,]  -23  -26
##  [8,]   -1  -10
##  [9,]    7  -51
## [10,]  -27  -59
## [11,]    5  -39
## [12,]  -32  -45
## [13,]   -8  -12
## [14,]   -2   -4
## [15,]  -14  -18
## [16,]   11  -43
## [17,]  -44  -49
## [18,]    9  -56
## [19,]  -37  -60
## [20,]   -6  -11
## [21,]  -29  -48
## [22,]   -5  -19
## [23,]   10  -36
## [24,]  -42   17
## [25,]  -25   12
## [26,]   18  -41
## [27,]   21  -38
## [28,]   13  -17
## [29,]  -34  -52
## [30,]   16    6
## [31,]    8   20
## [32,]   26    4
## [33,]   19  -57
## [34,]  -47  -55
## [35,]   25  -53
## [36,]   24  -31
## [37,]   30   36
## [38,]   -3    3
## [39,]   -9   15
## [40,]  -33   33
## [41,]   32   23
## [42,]   22   28
## [43,]   31  -15
## [44,]   37   27
## [45,]  -20   40
## [46,]  -22   35
## [47,]   44   34
## [48,]   14   39
## [49,]    1   29
## [50,]   45    2
## [51,]   38   42
## [52,]   43  -16
## [53,]   46   49
## [54,]   52   48
## [55,]   47   41
## [56,]   50   53
## [57,]   54   55
## [58,]   51   56
## [59,]   57   58
## Order of objects:
##  [1]  1 10  6 11 15 16  2  4  9 14 18 21 40 39 43 54 58 42 44 49 31 29 48
## [24] 38 47 55 23 26 51 56 41 30 35 27 59 36  3  7 13  5 19  8 12 17 20 33
## [47] 37 60 57 28 46 22 25 32 45 53 24 50 34 52
## Height:
##  [1] 0.11775833 0.92338041 0.50974266 1.47360965 2.04722777 2.51250579
##  [7] 0.36355872 1.79099892 1.08967479 0.39308959 3.57679780 0.00000000
## [13] 0.21833707 0.44391855 0.80354844 0.08334529 0.98499722 0.70126085
## [19] 0.44921797 0.98499722 1.48962560 0.55960408 0.76573069 1.77868059
## [25] 0.97891452 2.79693737 0.09525176 0.12305649 0.48657744 0.76517620
## [31] 0.93270565 0.00000000 1.28196769 0.16054657 0.60321756 5.85655734
## [37] 1.07657773 0.00000000 1.98611220 0.59473487 1.44920797 0.33912975
## [43] 0.78523518 3.88572195 1.51921913 1.18521332 0.50902071 0.97225583
## [49] 1.91123321 0.00000000 3.39304108 1.52798723 0.72296652 0.31544012
## [55] 0.98335831 2.45910026 0.00000000 1.85224545 0.79085454
## Divisive coefficient:
## [1] 0.9117911
## 
## 1770 dissimilarities, summarized :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.845   2.572   2.595   3.354   5.857 
## Metric :  euclidean 
## Number of objects : 60
## 
## Available components:
## [1] "order"  "height" "dc"     "merge"  "diss"   "call"   "data"
plot(dv)

fit2 =cutree(dv,k=4)
c_1 = customer[fit2 ==1,]
summary(c_1)
##        ID           Visit.Time    Average.Expense      Sex         Age    
##  Min.   : 1.000   Min.   :3.000   Min.   : 4.60   Min.   :0   Min.   : 9  
##  1st Qu.: 5.000   1st Qu.:3.500   1st Qu.: 7.15   1st Qu.:0   1st Qu.:12  
##  Median :10.000   Median :5.000   Median :14.50   Median :0   Median :16  
##  Mean   : 9.636   Mean   :4.909   Mean   :12.71   Mean   :0   Mean   :17  
##  3rd Qu.:14.500   3rd Qu.:6.000   3rd Qu.:16.00   3rd Qu.:0   3rd Qu.:20  
##  Max.   :18.000   Max.   :8.000   Max.   :23.80   Max.   :0   Max.   :30
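
To see how closely the agglomerative and divisive solutions agree, the two label vectors can be cross-tabulated; a supplementary sketch:

#rows: agglomerative clusters (fit), columns: divisive clusters (fit2)
table(hclust=fit, diana=fit2)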

k-means

str(customer_s)
##  num [1:60, 1:4] -1.202 -0.757 1.692 -0.757 1.692 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
##  - attr(*, "scaled:center")= Named num [1:4] 8.4 17.058 0.683 21.433
##   ..- attr(*, "names")= chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
##  - attr(*, "scaled:scale")= Named num [1:4] 4.492 8.399 0.469 9.285
##   ..- attr(*, "names")= chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
set.seed(22)
fit =kmeans(customer_s, centers=4)
?kmeans

barplot(t(fit$centers), beside =TRUE,xlab="cluster", ylab="value")

?barplot
fit$centers
##   Visit.Time Average.Expense        Sex        Age
## 1  1.3302016       1.0155226 -1.4566845  0.5591307
## 2 -0.7771737      -0.5178412 -1.4566845 -0.4774599
## 3  0.8571173       0.9887331  0.6750489  1.0505015
## 4 -0.6322632      -0.7299063  0.6750489 -0.6411604
customer[fit$cluster == 1,]
##    ID Visit.Time Average.Expense Sex Age
## 3   3         16            33.5   0  32
## 5   5         16            24.9   0  23
## 7   7         12            28.5   0  33
## 8   8         14            18.8   0  27
## 12 12         14            21.0   0  25
## 13 13         12            28.5   0  33
## 17 17         14            23.6   0  22
## 19 19         17            25.9   0  18
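
The kmeans object also stores the cluster sizes and within-cluster sums of squares directly; a short supplementary check:

#observations per cluster and the total within-cluster sum of squares
fit$size
fit$tot.withinss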

Projection onto two dimensions

#install.packages("cluster")
library(cluster)
clusplot(customer_s, fit$cluster, color=TRUE, shade=TRUE)

#inspect what each principal component represents
pca =princomp(customer_s)
summary(pca)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     1.5339215 0.9953978 0.62428436 0.44706853
## Proportion of Variance 0.5981988 0.2519026 0.09908414 0.05081448
## Cumulative Proportion  0.5981988 0.8501014 0.94918552 1.00000000
pca$loadings
## 
## Loadings:
##                 Comp.1 Comp.2 Comp.3 Comp.4
## Visit.Time       0.576         0.601  0.554
## Average.Expense  0.602         0.146 -0.785
## Sex                    -0.989  0.133       
## Age              0.550 -0.148 -0.775  0.274
## 
##                Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings      1.00   1.00   1.00   1.00
## Proportion Var   0.25   0.25   0.25   0.25
## Cumulative Var   0.25   0.50   0.75   1.00
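
clusplot() projects the data onto the first two principal components, so roughly the same picture can be drawn by hand; a supplementary sketch using the princomp scores and the k-means clusters from above:

#plot the first two principal component scores, coloured by k-means cluster
plot(pca$scores[,1:2], col=fit$cluster, pch=19, xlab="Comp.1", ylab="Comp.2")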

Evaluating the model

#silhouette
library('cluster')
par(mfrow= c(1,1))
set.seed(22)
library(cluster)
km =kmeans(customer_s, 4)
kms=silhouette(km$cluster,dist(customer_s))
summary(kms)
## Silhouette of 60 units in 4 clusters from silhouette.default(x = km$cluster, dist = dist(customer_s)) :
##  Cluster sizes and average silhouette widths:
##         8        11        16        25 
## 0.5464597 0.4080823 0.3794910 0.5164434 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1931  0.4030  0.4890  0.4641  0.5422  0.6333
plot(kms)

Choosing the best k for k-means

#within sum of squares
nk=2:10
set.seed(22)
WSS =sapply(nk, function(k){set.seed(22);kmeans(customer_s, centers=k)$tot.withinss})
WSS
## [1] 123.49224  93.08341  61.34890  48.76431  43.08965  40.25820  29.58014
## [8]  26.97709  24.99510
plot(x=nk, y=WSS, type="l", xlab="number of k", ylab="within sum of squares")

SW = sapply(nk,function(k){
  set.seed(22); summary(silhouette(kmeans(customer_s,centers=k)$cluster,dist(customer_s)))$avg.width
})

plot(x=nk,y=SW,type='l')
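
The elbow of the WSS curve and the peak of the silhouette curve both suggest a value of k; a one-line supplementary check for the silhouette criterion:

#k with the largest average silhouette width
nk[which.max(SW)]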

#install.packages("fpc")
#install.packages("robustbase", repos="http://R-Forge.R-project.org")
library(fpc)
?cluster.stats
cluster.stats(dist(customer_s), kmeans(customer_s, centers=2)$cluster)
## $n
## [1] 60
## 
## $cluster.number
## [1] 2
## 
## $cluster.size
## [1] 24 36
## 
## $min.cluster.size
## [1] 24
## 
## $noisen
## [1] 0
## 
## $diameter
## [1] 3.885722 3.576798
## 
## $average.distance
## [1] 2.045880 1.798055
## 
## $median.distance
## [1] 2.243328 1.996810
## 
## $separation
## [1] 0.9276315 0.9276315
## 
## $average.toother
## [1] 3.351368 3.351368
## 
## $separation.matrix
##           [,1]      [,2]
## [1,] 0.0000000 0.9276315
## [2,] 0.9276315 0.0000000
## 
## $ave.between.matrix
##          [,1]     [,2]
## [1,] 0.000000 3.351368
## [2,] 3.351368 0.000000
## 
## $average.between
## [1] 3.351368
## 
## $average.within
## [1] 1.873552
## 
## $n.between
## [1] 864
## 
## $n.within
## [1] 906
## 
## $max.diameter
## [1] 3.885722
## 
## $min.separation
## [1] 0.9276315
## 
## $within.cluster.ss
## [1] 123.4922
## 
## $clus.avg.silwidths
##         1         2 
## 0.3827935 0.4454536 
## 
## $avg.silwidth
## [1] 0.4203896
## 
## $g2
## NULL
## 
## $g3
## NULL
## 
## $pearsongamma
## [1] 0.6564321
## 
## $dunn
## [1] 0.2387282
## 
## $dunn2
## [1] 1.638105
## 
## $entropy
## [1] 0.6730117
## 
## $wb.ratio
## [1] 0.559041
## 
## $ch
## [1] 52.84097
## 
## $cwidegap
## [1] 2.148705 2.131733
## 
## $widestgap
## [1] 2.148705
## 
## $sindex
## [1] 1.010004
## 
## $corrected.rand
## NULL
## 
## $vi
## NULL
WSS =sapply(nk, function(k){set.seed(22);cluster.stats(dist(customer_s), kmeans(customer_s, centers=k)$cluster)$within.cluster.ss})

plot(x=nk, y=WSS, type="l", xlab="number of k", ylab="within sum of squares")

SW =sapply(2:10,function(k){set.seed(22);cluster.stats(dist(customer_s),kmeans(customer_s, centers=k)$cluster)$avg.silwidth})

plot(x=nk,y=SW,type='l')

Model comparison

single_c=hclust(dist(customer_s), method="single")
hc_single=cutree(single_c, k =4)

complete_c=hclust(dist(customer_s), method="complete")
hc_complete=cutree(complete_c, k =4)

set.seed(22)
km =kmeans(customer_s, 4)

cs=cluster.stats(dist(customer_s),km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 61.3489
## 
## $avg.silwidth
## [1] 0.4640587
q =sapply(
  list(kmeans=km$cluster, 
       hc_single=hc_single, 
       hc_complete=hc_complete), function(c)cluster.stats(dist(customer_s),c)[c("within.cluster.ss","avg.silwidth")])
q
##                   kmeans    hc_single hc_complete
## within.cluster.ss 61.3489   136.0092  65.94076   
## avg.silwidth      0.4640587 0.2481926 0.4255961
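
The same comparison can be extended with other linkage methods; for example, a supplementary sketch adding average linkage:

average_c=hclust(dist(customer_s), method="average")
hc_average=cutree(average_c, k =4)
cluster.stats(dist(customer_s), hc_average)[c("within.cluster.ss","avg.silwidth")]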

Density-based method: DBSCAN

#install.packages("mlbench")
# mlbench package provides many methods to generate simulated data with different shapes and sizes.
#In this example, we generate a Cassini problem graph
library(mlbench)
#install.packages("fpc")
library(fpc)
set.seed(2)
p = mlbench.cassini(500)
plot(p$x)

?mlbench.cassini

ds = dbscan(data = dist(p$x),eps= 0.2, MinPts = 2, method="dist")
ds
## dbscan Pts=500 MinPts=2 eps=0.2
##         1   2   3
## seed  200 200 100
## total 200 200 100
plot(ds, p$x)
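
One common heuristic for choosing eps (0.2 above) is to look for a knee in the sorted nearest-neighbour distances; a supplementary sketch in base R (not part of the original notes):

#distance from each point to its 2nd-nearest neighbour
#(sort(row)[1] is the distance of the point to itself, i.e. 0)
d = as.matrix(dist(p$x))
knn_dist = apply(d, 1, function(row) sort(row)[3])
plot(sort(knn_dist), type='l', ylab='distance to 2nd-nearest neighbour')
abline(h=0.2, col='red', lty=2)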

y = matrix(0,nrow=3,ncol=2)
y[1,] = c(0,0)
y[2,] = c(0,-1.5)
y[3,] = c(1,1)
y
##      [,1] [,2]
## [1,]    0  0.0
## [2,]    0 -1.5
## [3,]    1  1.0
predict(ds, p$x, y)
## [1] 3 1 2
#filter the raw data of cluster 1
cluster_1_raw = p$x[ds$cluster == 1,]
cluster_1_raw
##                [,1]       [,2]
##   [1,] -0.878020041 -0.9762015
##   [2,]  0.204310908 -1.8311169
##   [3,] -1.033283148 -0.7664819
##   [4,] -0.089110770 -1.2200260
##   [5,]  0.146767003 -1.7177684
##   [6,]  0.725874430 -1.8106878
##   [7,]  0.451102355 -1.4799207
##   [8,] -0.425548959 -1.3179628
##   [9,] -0.977311794 -1.5286999
##  [10,]  0.864295737 -0.7098223
##  [11,]  0.039793615 -1.0964859
##  [12,]  0.959690709 -1.6442071
##  [13,]  0.465944793 -1.8592484
##  [14,]  1.237244233 -0.8282179
##  [15,] -0.625804973 -0.8010245
##  [16,]  0.317509698 -0.9637028
##  [17,]  0.752385028 -0.6808253
##  [18,]  0.348629646 -1.6835199
##  [19,]  1.000493954 -1.4000192
##  [20,] -0.311664918 -1.3615982
##  [21,] -0.624969555 -1.6033902
##  [22,] -0.882224692 -0.8912601
##  [23,] -0.590534200 -0.7114485
##  [24,] -0.271268856 -1.1837040
##  [25,] -0.416158140 -1.0248017
##  [26,] -0.835962518 -0.7295600
##  [27,]  0.649746827 -1.5558908
##  [28,] -0.334361972 -1.2033798
##  [29,] -0.100842130 -1.7851571
##  [30,] -1.071873313 -1.3959494
##  [31,] -0.833292664 -1.4157775
##  [32,] -0.593559806 -1.1465330
##  [33,]  1.111665006 -1.3745968
##  [34,] -1.324461332 -0.9219018
##  [35,] -0.352864039 -1.1809969
##  [36,]  0.906469168 -0.7980869
##  [37,]  1.261369418 -1.1895967
##  [38,] -0.721705502 -0.8884615
##  [39,]  0.175082884 -1.6118350
##  [40,]  0.539582980 -1.5624870
##  [41,] -0.818874233 -0.6291580
##  [42,] -0.513059044 -1.0350265
##  [43,] -0.876957755 -1.4607719
##  [44,] -0.895933315 -1.6433053
##  [45,]  0.360744853 -1.6198395
##  [46,] -0.156942578 -0.9316785
##  [47,]  0.468558055 -1.6013488
##  [48,] -0.059688396 -1.6337438
##  [49,] -0.885565535 -1.5227799
##  [50,]  1.125117614 -1.4695238
##  [51,]  0.779837759 -1.6453457
##  [52,] -1.267874267 -0.8865873
##  [53,] -0.565935794 -1.0583129
##  [54,] -1.355198401 -1.2378089
##  [55,] -0.677769509 -0.6677644
##  [56,] -0.312223762 -0.8291433
##  [57,] -0.419629384 -1.8780091
##  [58,]  0.419366754 -1.2295931
##  [59,]  1.240594060 -1.1852068
##  [60,]  1.263970040 -0.8515952
##  [61,] -1.063583088 -0.7606211
##  [62,] -1.150307795 -0.9022401
##  [63,] -1.100364472 -1.4493018
##  [64,]  0.008498944 -1.7775133
##  [65,] -0.836672143 -1.8149838
##  [66,]  0.999430884 -1.0677434
##  [67,] -0.343672879 -0.7894168
##  [68,] -0.761394282 -1.7695864
##  [69,] -0.851809277 -1.7150909
##  [70,]  1.336660721 -1.2611998
##  [71,]  0.909374488 -1.0265832
##  [72,] -0.382126787 -1.7644891
##  [73,] -0.969304533 -0.6027001
##  [74,] -0.275587040 -1.3226617
##  [75,]  0.505679994 -0.7335432
##  [76,]  1.150520920 -1.3386278
##  [77,]  0.595382492 -0.7723944
##  [78,]  0.489829736 -1.5564774
##  [79,]  1.088978634 -1.2538732
##  [80,]  1.043025382 -1.6861489
##  [81,]  0.118268481 -1.7543981
##  [82,] -0.789308146 -1.3415347
##  [83,]  1.094284148 -1.6172529
##  [84,]  0.153857954 -0.8916541
##  [85,] -0.464299827 -1.2733054
##  [86,] -0.980257695 -1.5290886
##  [87,] -1.279712214 -1.0883512
##  [88,]  0.361285474 -1.5286257
##  [89,]  0.858581717 -1.0253372
##  [90,] -1.332177181 -0.9967377
##  [91,]  1.079295343 -1.2404652
##  [92,]  1.072808084 -0.6373208
##  [93,]  0.972371113 -1.6303852
##  [94,]  0.791835346 -1.3100768
##  [95,]  0.443013721 -1.0278206
##  [96,]  0.094980484 -1.1810036
##  [97,]  0.758548042 -1.5286569
##  [98,] -0.678020587 -1.5936866
##  [99,]  0.681099042 -1.3005344
## [100,]  0.581809991 -1.4361657
## [101,] -0.949328033 -1.2934566
## [102,]  0.934646309 -1.3200800
## [103,] -0.838076695 -1.0010805
## [104,] -1.143456853 -1.3945971
## [105,]  0.294368850 -1.3797689
## [106,]  0.690397641 -1.8165608
## [107,]  0.431482058 -1.8374265
## [108,] -0.294068155 -1.7588508
## [109,]  0.391565978 -1.6878085
## [110,] -0.868340232 -0.7029784
## [111,] -1.092895774 -1.1875185
## [112,]  0.944616018 -0.7959523
## [113,]  1.008227022 -0.7349331
## [114,]  0.063169014 -1.5183811
## [115,] -0.883764856 -1.3822555
## [116,] -0.676683002 -1.1494103
## [117,] -0.905608044 -0.8047834
## [118,]  0.819799909 -1.1321852
## [119,] -1.247826523 -0.9938843
## [120,]  0.425023325 -1.7127364
## [121,]  0.028040646 -1.0690623
## [122,]  0.548264973 -1.0473068
## [123,] -1.109889201 -1.3317341
## [124,] -0.085253708 -1.6740866
## [125,] -1.144451660 -1.1216628
## [126,] -0.292571453 -1.2366095
## [127,]  1.007551267 -0.8961479
## [128,]  0.719466695 -0.6217663
## [129,] -0.476545375 -1.8497985
## [130,] -0.847643081 -0.7277424
## [131,] -0.758695996 -0.8727665
## [132,]  0.326395991 -1.1894974
## [133,]  0.241189831 -1.5574133
## [134,] -0.928170585 -0.8081438
## [135,]  0.416418319 -0.9710204
## [136,] -0.381826950 -1.0647319
## [137,]  0.628505803 -1.4240533
## [138,]  0.254806683 -0.8233607
## [139,] -0.557311573 -0.9897810
## [140,] -1.182075255 -1.1919261
## [141,]  1.080463546 -1.5803151
## [142,] -1.134948642 -0.7702150
## [143,] -1.038683810 -0.7100887
## [144,]  1.262473473 -1.3013711
## [145,]  0.431152384 -1.7899316
## [146,]  0.732401457 -1.2241335
## [147,] -0.797356507 -0.9931729
## [148,]  1.317411229 -1.2101679
## [149,]  0.078235150 -1.7534496
## [150,]  0.503113262 -0.9063731
## [151,] -0.099743764 -1.8801311
## [152,]  0.875963378 -0.7685420
## [153,]  1.359823570 -1.0845456
## [154,] -1.322579861 -1.3218902
## [155,] -0.646839537 -0.8774735
## [156,] -0.033284404 -0.9049642
## [157,] -1.274560036 -1.3949715
## [158,] -0.986175143 -1.3928907
## [159,]  0.516447124 -0.8282905
## [160,] -0.617672489 -0.7544855
## [161,] -0.090804023 -0.8836617
## [162,]  1.057596764 -1.3103589
## [163,]  0.581555766 -1.3198854
## [164,]  0.785857302 -0.6411018
## [165,] -0.596294891 -1.1266062
## [166,]  0.308849454 -1.7633732
## [167,] -0.508235955 -0.6506234
## [168,] -0.491856613 -1.2025502
## [169,]  0.534989613 -1.2040689
## [170,] -0.438313225 -1.7248040
## [171,] -0.196149509 -0.9212789
## [172,] -0.349246728 -1.3181133
## [173,]  0.837322887 -1.0071313
## [174,] -0.332437398 -1.4488438
## [175,] -0.312666601 -1.1529168
## [176,]  0.937778199 -1.2671769
## [177,] -0.109157470 -0.9522752
## [178,] -1.315804660 -0.9816730
## [179,] -0.672054106 -0.6749106
## [180,]  0.495424973 -0.8340842
## [181,]  0.560205803 -1.6648770
## [182,] -1.086024289 -0.9038942
## [183,]  0.276024465 -0.8513125
## [184,]  1.242073722 -1.5273660
## [185,] -0.039474960 -1.2577110
## [186,]  0.970639213 -1.2981420
## [187,] -0.433099332 -1.1673614
## [188,]  0.674286036 -1.3360663
## [189,]  1.113452181 -1.6006035
## [190,] -0.654962503 -1.2169928
## [191,]  1.261636182 -1.1705721
## [192,]  1.057543215 -0.8091239
## [193,] -0.103213762 -1.7898732
## [194,]  0.617210972 -1.1585440
## [195,]  0.098981815 -0.8887357
## [196,]  1.202563385 -0.9429087
## [197,] -0.385531083 -1.3979427
## [198,] -1.343070698 -0.9174087
## [199,] -0.344961580 -1.7403067
## [200,]  0.746652400 -1.0796416

Other classification methods

k-nearest neighbor classifier

library(C50)
data(churn)
#select the modeling variables
variable.list = !names(churnTrain) %in% c('state','area_code','account_length')
churnTrain=churnTrain[,variable.list]
churnTest=churnTest[,variable.list]

#install.packages("class")
#library(class)

head(churnTrain)
##   international_plan voice_mail_plan number_vmail_messages
## 1                 no             yes                    25
## 2                 no             yes                    26
## 3                 no              no                     0
## 4                yes              no                     0
## 5                yes              no                     0
## 6                yes              no                     0
##   total_day_minutes total_day_calls total_day_charge total_eve_minutes
## 1             265.1             110            45.07             197.4
## 2             161.6             123            27.47             195.5
## 3             243.4             114            41.38             121.2
## 4             299.4              71            50.90              61.9
## 5             166.7             113            28.34             148.3
## 6             223.4              98            37.98             220.6
##   total_eve_calls total_eve_charge total_night_minutes total_night_calls
## 1              99            16.78               244.7                91
## 2             103            16.62               254.4               103
## 3             110            10.30               162.6               104
## 4              88             5.26               196.9                89
## 5             122            12.61               186.9               121
## 6             101            18.75               203.9               118
##   total_night_charge total_intl_minutes total_intl_calls total_intl_charge
## 1              11.01               10.0                3              2.70
## 2              11.45               13.7                3              3.70
## 3               7.32               12.2                5              3.29
## 4               8.86                6.6                7              1.78
## 5               8.41               10.1                3              2.73
## 6               9.18                6.3                6              1.70
##   number_customer_service_calls churn
## 1                             1    no
## 2                             1    no
## 3                             0    no
## 4                             2    no
## 5                             3    no
## 6                             0    no
levels(churnTrain$international_plan) = list("0"="no", "1"="yes")
levels(churnTrain$voice_mail_plan) = list("0"="no", "1"="yes")
churnTrain$international_plan = as.numeric(churnTrain$international_plan)
churnTrain$voice_mail_plan = as.numeric(churnTrain$voice_mail_plan)

levels(churnTest$international_plan) = list("0"="no", "1"="yes")
levels(churnTest$voice_mail_plan) = list("0"="no", "1"="yes")
churnTest$international_plan = as.numeric(churnTest$international_plan)
churnTest$voice_mail_plan = as.numeric(churnTest$voice_mail_plan)

head(churnTrain)
##   international_plan voice_mail_plan number_vmail_messages
## 1                  1               2                    25
## 2                  1               2                    26
## 3                  1               1                     0
## 4                  2               1                     0
## 5                  2               1                     0
## 6                  2               1                     0
##   total_day_minutes total_day_calls total_day_charge total_eve_minutes
## 1             265.1             110            45.07             197.4
## 2             161.6             123            27.47             195.5
## 3             243.4             114            41.38             121.2
## 4             299.4              71            50.90              61.9
## 5             166.7             113            28.34             148.3
## 6             223.4              98            37.98             220.6
##   total_eve_calls total_eve_charge total_night_minutes total_night_calls
## 1              99            16.78               244.7                91
## 2             103            16.62               254.4               103
## 3             110            10.30               162.6               104
## 4              88             5.26               196.9                89
## 5             122            12.61               186.9               121
## 6             101            18.75               203.9               118
##   total_night_charge total_intl_minutes total_intl_calls total_intl_charge
## 1              11.01               10.0                3              2.70
## 2              11.45               13.7                3              3.70
## 3               7.32               12.2                5              3.29
## 4               8.86                6.6                7              1.78
## 5               8.41               10.1                3              2.73
## 6               9.18                6.3                6              1.70
##   number_customer_service_calls churn
## 1                             1    no
## 2                             1    no
## 3                             0    no
## 4                             2    no
## 5                             3    no
## 6                             0    no
#use caret package
library('caret')
control=trainControl(method="cv",number=10,classProbs = TRUE,summaryFunction = twoClassSummary)
tune_funs=expand.grid(k=seq(3,15,2))
knn.model = train(churn~., data=churnTrain, method="knn", trControl=control,metric="ROC",tuneGrid=tune_funs)

knn.predict = predict(knn.model,churnTest)
confusionMatrix(table(knn.predict,churnTest$churn))
## Confusion Matrix and Statistics
## 
##            
## knn.predict  yes   no
##         yes   66    6
##         no   158 1437
##                                           
##                Accuracy : 0.9016          
##                  95% CI : (0.8863, 0.9155)
##     No Information Rate : 0.8656          
##     P-Value [Acc > NIR] : 4.383e-06       
##                                           
##                   Kappa : 0.4072          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.29464         
##             Specificity : 0.99584         
##          Pos Pred Value : 0.91667         
##          Neg Pred Value : 0.90094         
##              Prevalence : 0.13437         
##          Detection Rate : 0.03959         
##    Detection Prevalence : 0.04319         
##       Balanced Accuracy : 0.64524         
##                                           
##        'Positive' Class : yes             
##
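
Because kNN is distance-based, centring and scaling the numeric predictors inside resampling usually helps; a supplementary sketch using caret's preProcess argument (not run in the original notes):

#same tuning setup as above, but with predictors centred and scaled per resample
knn.model.scaled = train(churn~., data=churnTrain, method="knn", trControl=control, metric="ROC", tuneGrid=tune_funs, preProcess=c("center","scale"))
confusionMatrix(table(predict(knn.model.scaled,churnTest),churnTest$churn))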