補充:隨機森林(Random Forest)
library(C50)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library('caret')
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library('e1071')
data(churn)
names(churnTrain) %in% c("state", "area_code", "account_length")
## [1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
!names(churnTrain) %in% c("state", "area_code", "account_length")
## [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#選擇建模變數
variable.list = !names(churnTrain) %in% c('state','area_code','account_length')
churnTrain=churnTrain[,variable.list]
churnTest=churnTest[,variable.list]
rf_model = randomForest(formula=churn ~ .,data=churnTrain)
#find best ntree
plot(rf_model)
legend("topright",colnames(rf_model$err.rate),col=1:3,cex=0.8,fill=1:3)

#find nest mtry
tuneRF(churnTrain[,-17],churnTrain[,17])
## mtry = 4 OOB error = 4.41%
## Searching left ...
## mtry = 2 OOB error = 5.67%
## -0.2857143 0.05
## Searching right ...
## mtry = 8 OOB error = 4.59%
## -0.04081633 0.05

## mtry OOBError
## 2.OOB 2 0.05670567
## 4.OOB 4 0.04410441
## 8.OOB 8 0.04590459
# rf_model <- randomForest(churn ~., data = churnTrain, ntree=50,mtry=4)
# confusionMatrix(table(predict(rf_model,churnTest),churnTest$churn))
#
# rf.predict.prob <- predict(rf_model, churnTest, type="prob")
# rf.prediction <- prediction(rf.predict.prob[,1], as.factor(churnTest$churn))
# rf.auc <- performance(rf.prediction, measure = "auc", x.measure = "cutoff")
# rf.performance <- performance(rf.prediction, "tpr","fpr")
# plot(rf.performance)
#
# #比較CART和RandomForest
# rf_model = train(churn~.,data=churnTrain,method='rf',trControl=trainControl(method="repeatedcv", number=10, repeats=3,classProbs = TRUE,summaryFunction = prSummary))
# rf_prob_yes = predict(rf_model,churnTest,type='prob')[,1]
# rf_pred.rocr = prediction(rf_prob_yes,churnTest$churn)
# rf_perf.rocr = performance(rf_pred.rocr,measure = 'tpr',x.measure = 'fpr')
#
# control=trainControl(method="repeatedcv", number=10, repeats=3,classProbs = TRUE,summaryFunction = prSummary)
# tune_funs = expand.grid(cp=seq(0.01,0.1,0.01))
# rpart_model =train(churn~., data=churnTrain, method="rpart", trControl=control,tuneGrid=tune_funs)
#
# rpart_prob_yes = predict(rpart_model,churnTest,type='prob')[,1]
# rpart_pred.rocr = prediction(rpart_prob_yes,churnTest$churn)
# rpart_perf.rocr = performance(rpart_pred.rocr,measure = 'tpr',x.measure = 'fpr')
#
# plot(rpart_perf.rocr,col='red')
# plot(rf_perf.rocr,col='black',add=T)
# legend(0.7, 0.2, c('randomforest','rpart'), 1:2)
分群問題
距離計算
x =c(0, 0, 1, 1, 1, 1)
y =c(1, 0, 1, 1, 0, 1)
#euclidean
?dist
rbind(x,y)
## [,1] [,2] [,3] [,4] [,5] [,6]
## x 0 0 1 1 1 1
## y 1 0 1 1 0 1
dist(rbind(x,y), method ="euclidean")
## x
## y 1.414214
sqrt(sum((x-y)^2))
## [1] 1.414214
dist(rbind(x,y), method ="minkowski", p=2)
## x
## y 1.414214
#city block
dist(rbind(x,y), method ="manhattan")
## x
## y 2
sum(abs(x-y))
## [1] 2
dist(rbind(x,y), method ="minkowski", p=1)
## x
## y 2
Hierarchical Clustering
聚合式(bottom-up)
setwd('~/lecture/riii')
customer=read.csv('data/customer.csv',header=TRUE)
head(customer)
## ID Visit.Time Average.Expense Sex Age
## 1 1 3 5.7 0 10
## 2 2 5 14.5 0 27
## 3 3 16 33.5 0 32
## 4 4 5 15.9 0 30
## 5 5 16 24.9 0 23
## 6 6 3 12.0 0 15
str(customer)
## 'data.frame': 60 obs. of 5 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Visit.Time : int 3 5 16 5 16 3 12 14 6 3 ...
## $ Average.Expense: num 5.7 14.5 33.5 15.9 24.9 12 28.5 18.8 23.8 5.3 ...
## $ Sex : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Age : int 10 27 32 30 23 15 33 27 16 11 ...
#數值變數作正規化
customer_s =scale(customer[,-1])
?scale
#正規化後的變數平均數為0, 標準差為1
round(mean(customer_s[,2]),3)
## [1] 0
round(sd(customer_s[,2]),3)
## [1] 1
?hclust
hc=hclust(dist(customer_s, method="euclidean"), method="ward.D2")
plot(hc,hang =-0.01, cex=0.7)

hc3 =hclust(dist(customer, method="euclidean"), method="single")
plot(hc3, hang =-0.01, cex=0.8)

cutree
fit =cutree(hc, k =4)
fit
## [1] 1 1 2 1 2 1 2 2 1 1 1 2 2 1 1 1 2 1 2 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3
table(fit)
## fit
## 1 2 3 4
## 11 8 16 25
plot(hc, hang =-0.01, cex=0.7)
rect.hclust(hc, k =4, border="red")
rect.hclust(hc, k =3, border="blue")

c_1 = customer[fit == 1,]
summary(c_1)
## ID Visit.Time Average.Expense Sex Age
## Min. : 1.000 Min. :3.000 Min. : 4.60 Min. :0 Min. : 9
## 1st Qu.: 5.000 1st Qu.:3.500 1st Qu.: 7.15 1st Qu.:0 1st Qu.:12
## Median :10.000 Median :5.000 Median :14.50 Median :0 Median :16
## Mean : 9.636 Mean :4.909 Mean :12.71 Mean :0 Mean :17
## 3rd Qu.:14.500 3rd Qu.:6.000 3rd Qu.:16.00 3rd Qu.:0 3rd Qu.:20
## Max. :18.000 Max. :8.000 Max. :23.80 Max. :0 Max. :30
分裂式階層式(top-down)
#install.packages('cluster')
library(cluster)
?diana
dv =diana(customer_s, metric ="euclidean")
summary(dv)
## Merge:
## [,1] [,2]
## [1,] -24 -50
## [2,] -28 -46
## [3,] -7 -13
## [4,] -30 -35
## [5,] -21 -40
## [6,] -54 -58
## [7,] -23 -26
## [8,] -1 -10
## [9,] 7 -51
## [10,] -27 -59
## [11,] 5 -39
## [12,] -32 -45
## [13,] -8 -12
## [14,] -2 -4
## [15,] -14 -18
## [16,] 11 -43
## [17,] -44 -49
## [18,] 9 -56
## [19,] -37 -60
## [20,] -6 -11
## [21,] -29 -48
## [22,] -5 -19
## [23,] 10 -36
## [24,] -42 17
## [25,] -25 12
## [26,] 18 -41
## [27,] 21 -38
## [28,] 13 -17
## [29,] -34 -52
## [30,] 16 6
## [31,] 8 20
## [32,] 26 4
## [33,] 19 -57
## [34,] -47 -55
## [35,] 25 -53
## [36,] 24 -31
## [37,] 30 36
## [38,] -3 3
## [39,] -9 15
## [40,] -33 33
## [41,] 32 23
## [42,] 22 28
## [43,] 31 -15
## [44,] 37 27
## [45,] -20 40
## [46,] -22 35
## [47,] 44 34
## [48,] 14 39
## [49,] 1 29
## [50,] 45 2
## [51,] 38 42
## [52,] 43 -16
## [53,] 46 49
## [54,] 52 48
## [55,] 47 41
## [56,] 50 53
## [57,] 54 55
## [58,] 51 56
## [59,] 57 58
## Order of objects:
## [1] 1 10 6 11 15 16 2 4 9 14 18 21 40 39 43 54 58 42 44 49 31 29 48
## [24] 38 47 55 23 26 51 56 41 30 35 27 59 36 3 7 13 5 19 8 12 17 20 33
## [47] 37 60 57 28 46 22 25 32 45 53 24 50 34 52
## Height:
## [1] 0.11775833 0.92338041 0.50974266 1.47360965 2.04722777 2.51250579
## [7] 0.36355872 1.79099892 1.08967479 0.39308959 3.57679780 0.00000000
## [13] 0.21833707 0.44391855 0.80354844 0.08334529 0.98499722 0.70126085
## [19] 0.44921797 0.98499722 1.48962560 0.55960408 0.76573069 1.77868059
## [25] 0.97891452 2.79693737 0.09525176 0.12305649 0.48657744 0.76517620
## [31] 0.93270565 0.00000000 1.28196769 0.16054657 0.60321756 5.85655734
## [37] 1.07657773 0.00000000 1.98611220 0.59473487 1.44920797 0.33912975
## [43] 0.78523518 3.88572195 1.51921913 1.18521332 0.50902071 0.97225583
## [49] 1.91123321 0.00000000 3.39304108 1.52798723 0.72296652 0.31544012
## [55] 0.98335831 2.45910026 0.00000000 1.85224545 0.79085454
## Divisive coefficient:
## [1] 0.9117911
##
## 1770 dissimilarities, summarized :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.845 2.572 2.595 3.354 5.857
## Metric : euclidean
## Number of objects : 60
##
## Available components:
## [1] "order" "height" "dc" "merge" "diss" "call" "data"
plot(dv)


fit2 =cutree(dv,k=4)
c_1 = customer[fit2 ==1,]
summary(c_1)
## ID Visit.Time Average.Expense Sex Age
## Min. : 1.000 Min. :3.000 Min. : 4.60 Min. :0 Min. : 9
## 1st Qu.: 5.000 1st Qu.:3.500 1st Qu.: 7.15 1st Qu.:0 1st Qu.:12
## Median :10.000 Median :5.000 Median :14.50 Median :0 Median :16
## Mean : 9.636 Mean :4.909 Mean :12.71 Mean :0 Mean :17
## 3rd Qu.:14.500 3rd Qu.:6.000 3rd Qu.:16.00 3rd Qu.:0 3rd Qu.:20
## Max. :18.000 Max. :8.000 Max. :23.80 Max. :0 Max. :30
k-means
str(customer_s)
## num [1:60, 1:4] -1.202 -0.757 1.692 -0.757 1.692 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
## - attr(*, "scaled:center")= Named num [1:4] 8.4 17.058 0.683 21.433
## ..- attr(*, "names")= chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
## - attr(*, "scaled:scale")= Named num [1:4] 4.492 8.399 0.469 9.285
## ..- attr(*, "names")= chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
set.seed(22)
fit =kmeans(customer_s, centers=4)
?kmeans
barplot(t(fit$centers), beside =TRUE,xlab="cluster", ylab="value")

?barplot
fit$centers
## Visit.Time Average.Expense Sex Age
## 1 1.3302016 1.0155226 -1.4566845 0.5591307
## 2 -0.7771737 -0.5178412 -1.4566845 -0.4774599
## 3 0.8571173 0.9887331 0.6750489 1.0505015
## 4 -0.6322632 -0.7299063 0.6750489 -0.6411604
customer[fit$cluster == 1,]
## ID Visit.Time Average.Expense Sex Age
## 3 3 16 33.5 0 32
## 5 5 16 24.9 0 23
## 7 7 12 28.5 0 33
## 8 8 14 18.8 0 27
## 12 12 14 21.0 0 25
## 13 13 12 28.5 0 33
## 17 17 14 23.6 0 22
## 19 19 17 25.9 0 18
投影至二維空間
#install.packages("cluster")
library(cluster)
clusplot(customer_s, fit$cluster, color=TRUE, shade=TRUE)

#了解component 成分為何
pca =princomp(customer_s)
summary(pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.5339215 0.9953978 0.62428436 0.44706853
## Proportion of Variance 0.5981988 0.2519026 0.09908414 0.05081448
## Cumulative Proportion 0.5981988 0.8501014 0.94918552 1.00000000
pca$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## Visit.Time 0.576 0.601 0.554
## Average.Expense 0.602 0.146 -0.785
## Sex -0.989 0.133
## Age 0.550 -0.148 -0.775 0.274
##
## Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings 1.00 1.00 1.00 1.00
## Proportion Var 0.25 0.25 0.25 0.25
## Cumulative Var 0.25 0.50 0.75 1.00
Evaluating model
#silhouette
library('cluster')
par(mfrow= c(1,1))
set.seed(22)
library(cluster)
km =kmeans(customer_s, 4)
kms=silhouette(km$cluster,dist(customer_s))
summary(kms)
## Silhouette of 60 units in 4 clusters from silhouette.default(x = km$cluster, dist = dist(customer_s)) :
## Cluster sizes and average silhouette widths:
## 8 11 16 25
## 0.5464597 0.4080823 0.3794910 0.5164434
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1931 0.4030 0.4890 0.4641 0.5422 0.6333
plot(kms)

選擇k-means最佳k值
#within sum of squares
nk=2:10
set.seed(22)
WSS =sapply(nk, function(k){set.seed(22);kmeans(customer_s, centers=k)$tot.withinss})
WSS
## [1] 123.49224 93.08341 61.34890 48.76431 43.08965 40.25820 29.58014
## [8] 26.97709 24.99510
plot(x=nk, y=WSS, type="l", xlab="number of k", ylab="within sum of squares")

SW = sapply(nk,function(k){
set.seed(22); summary(silhouette(kmeans(customer_s,centers=k)$cluster,dist(customer_s)))$avg.width
})
plot(x=nk,y=SW,type='l')

#install.packages("fpc")
#install.packages("robustbase", repos="http://R-Forge.R-project.org")
library(fpc)
?cluster.stats
cluster.stats(dist(customer_s), kmeans(customer_s, centers=2)$cluster)
## $n
## [1] 60
##
## $cluster.number
## [1] 2
##
## $cluster.size
## [1] 24 36
##
## $min.cluster.size
## [1] 24
##
## $noisen
## [1] 0
##
## $diameter
## [1] 3.885722 3.576798
##
## $average.distance
## [1] 2.045880 1.798055
##
## $median.distance
## [1] 2.243328 1.996810
##
## $separation
## [1] 0.9276315 0.9276315
##
## $average.toother
## [1] 3.351368 3.351368
##
## $separation.matrix
## [,1] [,2]
## [1,] 0.0000000 0.9276315
## [2,] 0.9276315 0.0000000
##
## $ave.between.matrix
## [,1] [,2]
## [1,] 0.000000 3.351368
## [2,] 3.351368 0.000000
##
## $average.between
## [1] 3.351368
##
## $average.within
## [1] 1.873552
##
## $n.between
## [1] 864
##
## $n.within
## [1] 906
##
## $max.diameter
## [1] 3.885722
##
## $min.separation
## [1] 0.9276315
##
## $within.cluster.ss
## [1] 123.4922
##
## $clus.avg.silwidths
## 1 2
## 0.3827935 0.4454536
##
## $avg.silwidth
## [1] 0.4203896
##
## $g2
## NULL
##
## $g3
## NULL
##
## $pearsongamma
## [1] 0.6564321
##
## $dunn
## [1] 0.2387282
##
## $dunn2
## [1] 1.638105
##
## $entropy
## [1] 0.6730117
##
## $wb.ratio
## [1] 0.559041
##
## $ch
## [1] 52.84097
##
## $cwidegap
## [1] 2.148705 2.131733
##
## $widestgap
## [1] 2.148705
##
## $sindex
## [1] 1.010004
##
## $corrected.rand
## NULL
##
## $vi
## NULL
WSS =sapply(nk, function(k){set.seed(22);cluster.stats(dist(customer_s), kmeans(customer_s, centers=k)$cluster)$within.cluster.ss})
plot(x=nk, y=WSS, type="l", xlab="number of k", ylab="within sum of squares")

SW =sapply(2:10,function(k){set.seed(22);cluster.stats(dist(customer_s),kmeans(customer_s, centers=k)$cluster)$avg.silwidth})
plot(x=nk,y=SW,type='l')

model comparison
single_c=hclust(dist(customer_s), method="single")
hc_single=cutree(single_c, k =4)
complete_c=hclust(dist(customer_s), method="complete")
hc_complete=cutree(complete_c, k =4)
set.seed(22)
km =kmeans(customer_s, 4)
cs=cluster.stats(dist(customer_s),km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 61.3489
##
## $avg.silwidth
## [1] 0.4640587
q =sapply(
list(kmeans=km$cluster,
hc_single=hc_single,
hc_complete=hc_complete), function(c)cluster.stats(dist(customer_s),c)[c("within.cluster.ss","avg.silwidth")])
q
## kmeans hc_single hc_complete
## within.cluster.ss 61.3489 136.0092 65.94076
## avg.silwidth 0.4640587 0.2481926 0.4255961
density-based method-DBSCAN
#install.packages("mlbench")
# mlbench package provides many methods to generate simulated data with different shapes and sizes.
#In this example, we generate a Cassini problem graph
library(mlbench)
#install.packages("fpc")
library(fpc)
set.seed(2)
p = mlbench.cassini(500)
plot(p$x)

?mlbench.cassini
ds = dbscan(data = dist(p$x),eps= 0.2, MinPts = 2, method="dist")
ds
## dbscan Pts=500 MinPts=2 eps=0.2
## 1 2 3
## seed 200 200 100
## total 200 200 100
plot(ds, p$x)

y = matrix(0,nrow=3,ncol=2)
y[1,] = c(0,0)
y[2,] = c(0,-1.5)
y[3,] = c(1,1)
y
## [,1] [,2]
## [1,] 0 0.0
## [2,] 0 -1.5
## [3,] 1 1.0
predict(ds, p$x, y)
## [1] 3 1 2
#filter群集的raw data
cluster_1_raw = p$x[ds$cluster == 1,]
cluster_1_raw
## [,1] [,2]
## [1,] -0.878020041 -0.9762015
## [2,] 0.204310908 -1.8311169
## [3,] -1.033283148 -0.7664819
## [4,] -0.089110770 -1.2200260
## [5,] 0.146767003 -1.7177684
## [6,] 0.725874430 -1.8106878
## [7,] 0.451102355 -1.4799207
## [8,] -0.425548959 -1.3179628
## [9,] -0.977311794 -1.5286999
## [10,] 0.864295737 -0.7098223
## [11,] 0.039793615 -1.0964859
## [12,] 0.959690709 -1.6442071
## [13,] 0.465944793 -1.8592484
## [14,] 1.237244233 -0.8282179
## [15,] -0.625804973 -0.8010245
## [16,] 0.317509698 -0.9637028
## [17,] 0.752385028 -0.6808253
## [18,] 0.348629646 -1.6835199
## [19,] 1.000493954 -1.4000192
## [20,] -0.311664918 -1.3615982
## [21,] -0.624969555 -1.6033902
## [22,] -0.882224692 -0.8912601
## [23,] -0.590534200 -0.7114485
## [24,] -0.271268856 -1.1837040
## [25,] -0.416158140 -1.0248017
## [26,] -0.835962518 -0.7295600
## [27,] 0.649746827 -1.5558908
## [28,] -0.334361972 -1.2033798
## [29,] -0.100842130 -1.7851571
## [30,] -1.071873313 -1.3959494
## [31,] -0.833292664 -1.4157775
## [32,] -0.593559806 -1.1465330
## [33,] 1.111665006 -1.3745968
## [34,] -1.324461332 -0.9219018
## [35,] -0.352864039 -1.1809969
## [36,] 0.906469168 -0.7980869
## [37,] 1.261369418 -1.1895967
## [38,] -0.721705502 -0.8884615
## [39,] 0.175082884 -1.6118350
## [40,] 0.539582980 -1.5624870
## [41,] -0.818874233 -0.6291580
## [42,] -0.513059044 -1.0350265
## [43,] -0.876957755 -1.4607719
## [44,] -0.895933315 -1.6433053
## [45,] 0.360744853 -1.6198395
## [46,] -0.156942578 -0.9316785
## [47,] 0.468558055 -1.6013488
## [48,] -0.059688396 -1.6337438
## [49,] -0.885565535 -1.5227799
## [50,] 1.125117614 -1.4695238
## [51,] 0.779837759 -1.6453457
## [52,] -1.267874267 -0.8865873
## [53,] -0.565935794 -1.0583129
## [54,] -1.355198401 -1.2378089
## [55,] -0.677769509 -0.6677644
## [56,] -0.312223762 -0.8291433
## [57,] -0.419629384 -1.8780091
## [58,] 0.419366754 -1.2295931
## [59,] 1.240594060 -1.1852068
## [60,] 1.263970040 -0.8515952
## [61,] -1.063583088 -0.7606211
## [62,] -1.150307795 -0.9022401
## [63,] -1.100364472 -1.4493018
## [64,] 0.008498944 -1.7775133
## [65,] -0.836672143 -1.8149838
## [66,] 0.999430884 -1.0677434
## [67,] -0.343672879 -0.7894168
## [68,] -0.761394282 -1.7695864
## [69,] -0.851809277 -1.7150909
## [70,] 1.336660721 -1.2611998
## [71,] 0.909374488 -1.0265832
## [72,] -0.382126787 -1.7644891
## [73,] -0.969304533 -0.6027001
## [74,] -0.275587040 -1.3226617
## [75,] 0.505679994 -0.7335432
## [76,] 1.150520920 -1.3386278
## [77,] 0.595382492 -0.7723944
## [78,] 0.489829736 -1.5564774
## [79,] 1.088978634 -1.2538732
## [80,] 1.043025382 -1.6861489
## [81,] 0.118268481 -1.7543981
## [82,] -0.789308146 -1.3415347
## [83,] 1.094284148 -1.6172529
## [84,] 0.153857954 -0.8916541
## [85,] -0.464299827 -1.2733054
## [86,] -0.980257695 -1.5290886
## [87,] -1.279712214 -1.0883512
## [88,] 0.361285474 -1.5286257
## [89,] 0.858581717 -1.0253372
## [90,] -1.332177181 -0.9967377
## [91,] 1.079295343 -1.2404652
## [92,] 1.072808084 -0.6373208
## [93,] 0.972371113 -1.6303852
## [94,] 0.791835346 -1.3100768
## [95,] 0.443013721 -1.0278206
## [96,] 0.094980484 -1.1810036
## [97,] 0.758548042 -1.5286569
## [98,] -0.678020587 -1.5936866
## [99,] 0.681099042 -1.3005344
## [100,] 0.581809991 -1.4361657
## [101,] -0.949328033 -1.2934566
## [102,] 0.934646309 -1.3200800
## [103,] -0.838076695 -1.0010805
## [104,] -1.143456853 -1.3945971
## [105,] 0.294368850 -1.3797689
## [106,] 0.690397641 -1.8165608
## [107,] 0.431482058 -1.8374265
## [108,] -0.294068155 -1.7588508
## [109,] 0.391565978 -1.6878085
## [110,] -0.868340232 -0.7029784
## [111,] -1.092895774 -1.1875185
## [112,] 0.944616018 -0.7959523
## [113,] 1.008227022 -0.7349331
## [114,] 0.063169014 -1.5183811
## [115,] -0.883764856 -1.3822555
## [116,] -0.676683002 -1.1494103
## [117,] -0.905608044 -0.8047834
## [118,] 0.819799909 -1.1321852
## [119,] -1.247826523 -0.9938843
## [120,] 0.425023325 -1.7127364
## [121,] 0.028040646 -1.0690623
## [122,] 0.548264973 -1.0473068
## [123,] -1.109889201 -1.3317341
## [124,] -0.085253708 -1.6740866
## [125,] -1.144451660 -1.1216628
## [126,] -0.292571453 -1.2366095
## [127,] 1.007551267 -0.8961479
## [128,] 0.719466695 -0.6217663
## [129,] -0.476545375 -1.8497985
## [130,] -0.847643081 -0.7277424
## [131,] -0.758695996 -0.8727665
## [132,] 0.326395991 -1.1894974
## [133,] 0.241189831 -1.5574133
## [134,] -0.928170585 -0.8081438
## [135,] 0.416418319 -0.9710204
## [136,] -0.381826950 -1.0647319
## [137,] 0.628505803 -1.4240533
## [138,] 0.254806683 -0.8233607
## [139,] -0.557311573 -0.9897810
## [140,] -1.182075255 -1.1919261
## [141,] 1.080463546 -1.5803151
## [142,] -1.134948642 -0.7702150
## [143,] -1.038683810 -0.7100887
## [144,] 1.262473473 -1.3013711
## [145,] 0.431152384 -1.7899316
## [146,] 0.732401457 -1.2241335
## [147,] -0.797356507 -0.9931729
## [148,] 1.317411229 -1.2101679
## [149,] 0.078235150 -1.7534496
## [150,] 0.503113262 -0.9063731
## [151,] -0.099743764 -1.8801311
## [152,] 0.875963378 -0.7685420
## [153,] 1.359823570 -1.0845456
## [154,] -1.322579861 -1.3218902
## [155,] -0.646839537 -0.8774735
## [156,] -0.033284404 -0.9049642
## [157,] -1.274560036 -1.3949715
## [158,] -0.986175143 -1.3928907
## [159,] 0.516447124 -0.8282905
## [160,] -0.617672489 -0.7544855
## [161,] -0.090804023 -0.8836617
## [162,] 1.057596764 -1.3103589
## [163,] 0.581555766 -1.3198854
## [164,] 0.785857302 -0.6411018
## [165,] -0.596294891 -1.1266062
## [166,] 0.308849454 -1.7633732
## [167,] -0.508235955 -0.6506234
## [168,] -0.491856613 -1.2025502
## [169,] 0.534989613 -1.2040689
## [170,] -0.438313225 -1.7248040
## [171,] -0.196149509 -0.9212789
## [172,] -0.349246728 -1.3181133
## [173,] 0.837322887 -1.0071313
## [174,] -0.332437398 -1.4488438
## [175,] -0.312666601 -1.1529168
## [176,] 0.937778199 -1.2671769
## [177,] -0.109157470 -0.9522752
## [178,] -1.315804660 -0.9816730
## [179,] -0.672054106 -0.6749106
## [180,] 0.495424973 -0.8340842
## [181,] 0.560205803 -1.6648770
## [182,] -1.086024289 -0.9038942
## [183,] 0.276024465 -0.8513125
## [184,] 1.242073722 -1.5273660
## [185,] -0.039474960 -1.2577110
## [186,] 0.970639213 -1.2981420
## [187,] -0.433099332 -1.1673614
## [188,] 0.674286036 -1.3360663
## [189,] 1.113452181 -1.6006035
## [190,] -0.654962503 -1.2169928
## [191,] 1.261636182 -1.1705721
## [192,] 1.057543215 -0.8091239
## [193,] -0.103213762 -1.7898732
## [194,] 0.617210972 -1.1585440
## [195,] 0.098981815 -0.8887357
## [196,] 1.202563385 -0.9429087
## [197,] -0.385531083 -1.3979427
## [198,] -1.343070698 -0.9174087
## [199,] -0.344961580 -1.7403067
## [200,] 0.746652400 -1.0796416
其他分類方法
k-nearest neighbor classifer
library(C50)
data(churn)
#選擇建模變數
variable.list = !names(churnTrain) %in% c('state','area_code','account_length')
churnTrain=churnTrain[,variable.list]
churnTest=churnTest[,variable.list]
#install.packages("class")
#library(class)
head(churnTrain)
## international_plan voice_mail_plan number_vmail_messages
## 1 no yes 25
## 2 no yes 26
## 3 no no 0
## 4 yes no 0
## 5 yes no 0
## 6 yes no 0
## total_day_minutes total_day_calls total_day_charge total_eve_minutes
## 1 265.1 110 45.07 197.4
## 2 161.6 123 27.47 195.5
## 3 243.4 114 41.38 121.2
## 4 299.4 71 50.90 61.9
## 5 166.7 113 28.34 148.3
## 6 223.4 98 37.98 220.6
## total_eve_calls total_eve_charge total_night_minutes total_night_calls
## 1 99 16.78 244.7 91
## 2 103 16.62 254.4 103
## 3 110 10.30 162.6 104
## 4 88 5.26 196.9 89
## 5 122 12.61 186.9 121
## 6 101 18.75 203.9 118
## total_night_charge total_intl_minutes total_intl_calls total_intl_charge
## 1 11.01 10.0 3 2.70
## 2 11.45 13.7 3 3.70
## 3 7.32 12.2 5 3.29
## 4 8.86 6.6 7 1.78
## 5 8.41 10.1 3 2.73
## 6 9.18 6.3 6 1.70
## number_customer_service_calls churn
## 1 1 no
## 2 1 no
## 3 0 no
## 4 2 no
## 5 3 no
## 6 0 no
levels(churnTrain$international_plan) = list("0"="no", "1"="yes")
levels(churnTrain$voice_mail_plan) = list("0"="no", "1"="yes")
churnTrain$international_plan = as.numeric(churnTrain$international_plan)
churnTrain$voice_mail_plan = as.numeric(churnTrain$voice_mail_plan)
levels(churnTest$international_plan) = list("0"="no", "1"="yes")
levels(churnTest$voice_mail_plan) = list("0"="no", "1"="yes")
churnTest$international_plan = as.numeric(churnTest$international_plan)
churnTest$voice_mail_plan = as.numeric(churnTest$voice_mail_plan)
head(churnTrain)
## international_plan voice_mail_plan number_vmail_messages
## 1 1 2 25
## 2 1 2 26
## 3 1 1 0
## 4 2 1 0
## 5 2 1 0
## 6 2 1 0
## total_day_minutes total_day_calls total_day_charge total_eve_minutes
## 1 265.1 110 45.07 197.4
## 2 161.6 123 27.47 195.5
## 3 243.4 114 41.38 121.2
## 4 299.4 71 50.90 61.9
## 5 166.7 113 28.34 148.3
## 6 223.4 98 37.98 220.6
## total_eve_calls total_eve_charge total_night_minutes total_night_calls
## 1 99 16.78 244.7 91
## 2 103 16.62 254.4 103
## 3 110 10.30 162.6 104
## 4 88 5.26 196.9 89
## 5 122 12.61 186.9 121
## 6 101 18.75 203.9 118
## total_night_charge total_intl_minutes total_intl_calls total_intl_charge
## 1 11.01 10.0 3 2.70
## 2 11.45 13.7 3 3.70
## 3 7.32 12.2 5 3.29
## 4 8.86 6.6 7 1.78
## 5 8.41 10.1 3 2.73
## 6 9.18 6.3 6 1.70
## number_customer_service_calls churn
## 1 1 no
## 2 1 no
## 3 0 no
## 4 2 no
## 5 3 no
## 6 0 no
#use caret package
library('caret')
control=trainControl(method="cv",number=10,,classProbs = TRUE,summaryFunction = twoClassSummary)
tune_funs=expand.grid(k=seq(3,15,2))
knn.model = train(churn~., data=churnTrain, method="knn", trControl=control,metric="ROC",tuneGrid=tune_funs)
knn.predict = predict(knn.model,churnTest)
confusionMatrix(table(knn.predict,churnTest$churn))
## Confusion Matrix and Statistics
##
##
## knn.predict yes no
## yes 66 6
## no 158 1437
##
## Accuracy : 0.9016
## 95% CI : (0.8863, 0.9155)
## No Information Rate : 0.8656
## P-Value [Acc > NIR] : 4.383e-06
##
## Kappa : 0.4072
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.29464
## Specificity : 0.99584
## Pos Pred Value : 0.91667
## Neg Pred Value : 0.90094
## Prevalence : 0.13437
## Detection Rate : 0.03959
## Detection Prevalence : 0.04319
## Balanced Accuracy : 0.64524
##
## 'Positive' Class : yes
##