library(C50)

data(churn) # loads the churnTrain and churnTest data frames from the C50 package

names(churnTrain) %in% c("state", "area_code", "account_length")
##  [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
!names(churnTrain) %in% c("state", "area_code", "account_length")
##  [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# select the modeling variables (drop state, area_code and account_length)
variable.list = !names(churnTrain) %in% c('state','area_code','account_length')
churnTrain=churnTrain[,variable.list]
churnTest=churnTest[,variable.list]

set.seed(2)
# split the data into a training set and a validation set
ind<-sample(1:2, size=nrow(churnTrain), replace=T, prob=c(0.7, 0.3))
trainset=churnTrain[ind==1,]
testset=churnTrain[ind==2,]
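
As a quick sanity check (not in the original notes), the split proportions can be verified, and caret::createDataPartition offers a class-stratified alternative; part_idx, trainset2 and testset2 below are illustrative names.

# proportion of rows assigned to each partition (roughly 0.7 / 0.3)
prop.table(table(ind))
# alternative: stratified 70/30 split on the churn label (caret is loaded later in these notes)
library(caret)
part_idx <- createDataPartition(churnTrain$churn, p=0.7, list=FALSE)
trainset2 <- churnTrain[part_idx,]
testset2 <- churnTrain[-part_idx,]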

Supplement: Random Forest

#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library('caret')
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library('e1071')
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
rf_model = randomForest(formula=churn ~ .,data=churnTrain)
#find best ntree
plot(rf_model)
legend("topright",colnames(rf_model$err.rate),col=1:3,cex=0.8,fill=1:3)

# find best mtry (column 17 of churnTrain is the churn response)
tuneRF(churnTrain[,-17],churnTrain[,17])
## mtry = 4  OOB error = 4.71% 
## Searching left ...
## mtry = 2     OOB error = 6.21% 
## -0.3184713 0.05 
## Searching right ...
## mtry = 8     OOB error = 4.68% 
## 0.006369427 0.05

##       mtry   OOBError
## 2.OOB    2 0.06210621
## 4.OOB    4 0.04710471
## 8.OOB    8 0.04680468
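
tuneRF also returns the table above as a matrix, so the best mtry can be picked programmatically instead of read off the printout (a hedged sketch; tune_res and best_mtry are illustrative names).

tune_res <- tuneRF(churnTrain[,-17], churnTrain[,17], trace=FALSE)
best_mtry <- tune_res[which.min(tune_res[,"OOBError"]), "mtry"]
best_mtry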
rf_model <- randomForest(churn ~., data = churnTrain, ntree=50,mtry=4)
# rf_model = train(churn~.,data=churnTrain,method='rf')
confusionMatrix(table(predict(rf_model,churnTest),churnTest$churn))
## Confusion Matrix and Statistics
## 
##      
##        yes   no
##   yes  162    4
##   no    62 1439
##                                           
##                Accuracy : 0.9604          
##                  95% CI : (0.9499, 0.9692)
##     No Information Rate : 0.8656          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8089          
##  Mcnemar's Test P-Value : 2.28e-12        
##                                           
##             Sensitivity : 0.72321         
##             Specificity : 0.99723         
##          Pos Pred Value : 0.97590         
##          Neg Pred Value : 0.95869         
##              Prevalence : 0.13437         
##          Detection Rate : 0.09718         
##    Detection Prevalence : 0.09958         
##       Balanced Accuracy : 0.86022         
##                                           
##        'Positive' Class : yes             
## 
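
As a sanity check, the headline statistics can be reproduced by hand from the 2x2 table above, with yes as the positive class.

# accuracy    = (162 + 1439) / 1667  = 0.9604
(162 + 1439) / (162 + 4 + 62 + 1439)
# sensitivity = 162 / (162 + 62)     = 0.7232
162 / (162 + 62)
# specificity = 1439 / (1439 + 4)    = 0.9972
1439 / (1439 + 4)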
rf.predict.prob <- predict(rf_model, churnTest, type="prob")
rf.prediction <- prediction(rf.predict.prob[,"yes"], as.factor(churnTest$churn))
rf.auc <- performance(rf.prediction, measure = "auc", x.measure = "cutoff")
rf.performance <- performance(rf.prediction, measure="tpr",x.measure="fpr")
plot(rf.performance)
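
The rf.auc object created above stores the numeric area under the curve in its y.values slot; extracting it gives a single summary number to go with the ROC plot.

# numeric AUC of the random forest model
rf.auc@y.values[[1]]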

# compare CART and Random Forest
tune_funs = expand.grid(cp=seq(0,0.1,0.01)) # grid of complexity-parameter (cp) values to tune
rpart_model =train(churn~., data=churnTrain, method="rpart",tuneGrid=tune_funs)

rpart_prob_yes = predict(rpart_model,churnTest,type='prob')[,1] # column 1 holds P(churn = "yes")
rpart_pred.rocr = prediction(rpart_prob_yes,churnTest$churn)
rpart_perf.rocr = performance(rpart_pred.rocr,measure = 'tpr',x.measure = 'fpr')

plot(rpart_perf.rocr,col='red')
plot(rf.performance,col='black',add=T)
legend(x=0.7, y=0.2, legend =c('randomforest','rpart'), fill= c("black","red"))
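
For a numeric comparison to complement the plot, the CART AUC can be computed the same way as for the random forest (a small addition to the original code).

rpart.auc = performance(rpart_pred.rocr, measure='auc')
rpart.auc@y.values[[1]]  # CART AUC
rf.auc@y.values[[1]]     # random forest AUC, for comparison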

Clustering

Distance Calculation

x =c(0, 0, 1, 1, 1, 1)
y =c(1, 0, 1, 1, 0, 1)

#euclidean
?dist
rbind(x,y)
##   [,1] [,2] [,3] [,4] [,5] [,6]
## x    0    0    1    1    1    1
## y    1    0    1    1    0    1
dist(rbind(x,y), method ="euclidean")
##          x
## y 1.414214
sqrt(sum((x-y)^2))
## [1] 1.414214
dist(rbind(x,y), method ="minkowski", p=2)
##          x
## y 1.414214
#city block
dist(rbind(x,y), method ="manhattan")
##   x
## y 2
sum(abs(x-y))
## [1] 2
dist(rbind(x,y), method ="minkowski", p=1)
##   x
## y 2
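
Because x and y are 0/1 vectors, dist() also supports the binary (Jaccard) distance: the share of mismatches among positions where at least one vector is 1 (an extra example, not in the original notes).

dist(rbind(x,y), method ="binary")
# 2 of the 5 positions with at least one 1 differ, so the distance is 0.4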

Hierarchical Clustering

Agglomerative (bottom-up)

setwd('~/lecture/riii')
customer=read.csv('data/customer.csv',header=TRUE)
head(customer)
##   ID Visit.Time Average.Expense Sex Age
## 1  1          3             5.7   0  10
## 2  2          5            14.5   0  27
## 3  3         16            33.5   0  32
## 4  4          5            15.9   0  30
## 5  5         16            24.9   0  23
## 6  6          3            12.0   0  15
str(customer)
## 'data.frame':    60 obs. of  5 variables:
##  $ ID             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Visit.Time     : int  3 5 16 5 16 3 12 14 6 3 ...
##  $ Average.Expense: num  5.7 14.5 33.5 15.9 24.9 12 28.5 18.8 23.8 5.3 ...
##  $ Sex            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age            : int  10 27 32 30 23 15 33 27 16 11 ...
# standardize the numeric variables
customer_s =scale(customer[,-1])
?scale

# after scaling, each variable has mean 0 and standard deviation 1
round(mean(customer_s[,2]),3)
## [1] 0
round(sd(customer_s[,2]),3)
## [1] 1
?hclust
hc=hclust(dist(customer_s, method="euclidean"), method="ward.D2")
plot(hc,hang =-0.01, cex=0.7)

# single-linkage clustering (note: this uses the raw, unscaled customer data, including ID, unlike hc above)
hc3 =hclust(dist(customer, method="euclidean"), method="single")
plot(hc3, hang =-0.01, cex=0.8)

cutree

fit =cutree(hc, k =4)
fit
##  [1] 1 1 2 1 2 1 2 2 1 1 1 2 2 1 1 1 2 1 2 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3
table(fit)
## fit
##  1  2  3  4 
## 11  8 16 25
plot(hc, hang =-0.01, cex=0.7)
rect.hclust(hc, k =4, border="red")
rect.hclust(hc, k =3, border="blue")
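
An optional way to judge whether k = 4 is a reasonable cut is the average silhouette width; silhouette() comes from the cluster package, which is also loaded below for diana (a hedged sketch, not in the original notes).

library(cluster)
sil = silhouette(fit, dist(customer_s, method="euclidean"))
summary(sil) # average silhouette width per cluster and overall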

c_1 = customer[fit == 1,]
summary(c_1)
##        ID           Visit.Time    Average.Expense      Sex         Age    
##  Min.   : 1.000   Min.   :3.000   Min.   : 4.60   Min.   :0   Min.   : 9  
##  1st Qu.: 5.000   1st Qu.:3.500   1st Qu.: 7.15   1st Qu.:0   1st Qu.:12  
##  Median :10.000   Median :5.000   Median :14.50   Median :0   Median :16  
##  Mean   : 9.636   Mean   :4.909   Mean   :12.71   Mean   :0   Mean   :17  
##  3rd Qu.:14.500   3rd Qu.:6.000   3rd Qu.:16.00   3rd Qu.:0   3rd Qu.:20  
##  Max.   :18.000   Max.   :8.000   Max.   :23.80   Max.   :0   Max.   :30
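
Instead of subsetting one cluster at a time, all four groups can be summarized in a single pass with base R (an optional convenience sketch).

# per-cluster summaries for every cluster found by cutree
lapply(split(customer, fit), summary)
# compact table of per-cluster means
aggregate(customer[,-1], by=list(cluster=fit), FUN=mean)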

Divisive Hierarchical (top-down)

#install.packages('cluster')
library(cluster)
?diana
dv =diana(customer_s, metric ="euclidean")
summary(dv)
## Merge:
##       [,1] [,2]
##  [1,]  -24  -50
##  [2,]  -28  -46
##  [3,]   -7  -13
##  [4,]  -30  -35
##  [5,]  -21  -40
##  [6,]  -54  -58
##  [7,]  -23  -26
##  [8,]   -1  -10
##  [9,]    7  -51
## [10,]  -27  -59
## [11,]    5  -39
## [12,]  -32  -45
## [13,]   -8  -12
## [14,]   -2   -4
## [15,]  -14  -18
## [16,]   11  -43
## [17,]  -44  -49
## [18,]    9  -56
## [19,]  -37  -60
## [20,]   -6  -11
## [21,]  -29  -48
## [22,]   -5  -19
## [23,]   10  -36
## [24,]  -42   17
## [25,]  -25   12
## [26,]   18  -41
## [27,]   21  -38
## [28,]   13  -17
## [29,]  -34  -52
## [30,]   16    6
## [31,]    8   20
## [32,]   26    4
## [33,]   19  -57
## [34,]  -47  -55
## [35,]   25  -53
## [36,]   24  -31
## [37,]   30   36
## [38,]   -3    3
## [39,]   -9   15
## [40,]  -33   33
## [41,]   32   23
## [42,]   22   28
## [43,]   31  -15
## [44,]   37   27
## [45,]  -20   40
## [46,]  -22   35
## [47,]   44   34
## [48,]   14   39
## [49,]    1   29
## [50,]   45    2
## [51,]   38   42
## [52,]   43  -16
## [53,]   46   49
## [54,]   52   48
## [55,]   47   41
## [56,]   50   53
## [57,]   54   55
## [58,]   51   56
## [59,]   57   58
## Order of objects:
##  [1]  1 10  6 11 15 16  2  4  9 14 18 21 40 39 43 54 58 42 44 49 31 29 48
## [24] 38 47 55 23 26 51 56 41 30 35 27 59 36  3  7 13  5 19  8 12 17 20 33
## [47] 37 60 57 28 46 22 25 32 45 53 24 50 34 52
## Height:
##  [1] 0.11775833 0.92338041 0.50974266 1.47360965 2.04722777 2.51250579
##  [7] 0.36355872 1.79099892 1.08967479 0.39308959 3.57679780 0.00000000
## [13] 0.21833707 0.44391855 0.80354844 0.08334529 0.98499722 0.70126085
## [19] 0.44921797 0.98499722 1.48962560 0.55960408 0.76573069 1.77868059
## [25] 0.97891452 2.79693737 0.09525176 0.12305649 0.48657744 0.76517620
## [31] 0.93270565 0.00000000 1.28196769 0.16054657 0.60321756 5.85655734
## [37] 1.07657773 0.00000000 1.98611220 0.59473487 1.44920797 0.33912975
## [43] 0.78523518 3.88572195 1.51921913 1.18521332 0.50902071 0.97225583
## [49] 1.91123321 0.00000000 3.39304108 1.52798723 0.72296652 0.31544012
## [55] 0.98335831 2.45910026 0.00000000 1.85224545 0.79085454
## Divisive coefficient:
## [1] 0.9117911
## 
## 1770 dissimilarities, summarized :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.845   2.572   2.595   3.354   5.857 
## Metric :  euclidean 
## Number of objects : 60
## 
## Available components:
## [1] "order"  "height" "dc"     "merge"  "diss"   "call"   "data"
plot(dv)
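
The divisive coefficient (0.91 above) can be set against the agglomerative coefficient of a bottom-up fit on the same scaled data via cluster::agnes (a hedged comparison, not in the original notes; av is an illustrative name).

av = agnes(customer_s, metric="euclidean", method="ward")
av$ac # agglomerative coefficient
dv$dc # divisive coefficient reported by diana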

fit2 =cutree(dv,k=4)
c_1 = customer[fit2 ==2,]
summary(c_1)
##        ID         Visit.Time    Average.Expense      Sex   
##  Min.   : 3.0   Min.   :12.00   Min.   :18.80   Min.   :0  
##  1st Qu.: 6.5   1st Qu.:13.50   1st Qu.:22.95   1st Qu.:0  
##  Median :10.0   Median :14.00   Median :25.40   Median :0  
##  Mean   :10.5   Mean   :14.38   Mean   :25.59   Mean   :0  
##  3rd Qu.:14.0   3rd Qu.:16.00   3rd Qu.:28.50   3rd Qu.:0  
##  Max.   :19.0   Max.   :17.00   Max.   :33.50   Max.   :0  
##       Age       
##  Min.   :18.00  
##  1st Qu.:22.75  
##  Median :26.00  
##  Mean   :26.62  
##  3rd Qu.:32.25  
##  Max.   :33.00
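
To see how the top-down (diana) partition lines up with the bottom-up (hclust) partition obtained earlier, the two label vectors can be cross-tabulated (an optional check).

# rows: hclust/ward.D2 clusters, columns: diana clusters
table(hclust_cluster=fit, diana_cluster=fit2)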