library(C50)
data(churn)
names(churnTrain) %in% c("state", "area_code", "account_length")
## [1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
!names(churnTrain) %in% c("state", "area_code", "account_length")
## [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#選擇建模變數
variable.list = !names(churnTrain) %in% c('state','area_code','account_length')
churnTrain=churnTrain[,variable.list]
churnTest=churnTest[,variable.list]
set.seed(2)
#把資料分成training data 和 validation data
ind<-sample(1:2, size=nrow(churnTrain), replace=T, prob=c(0.7, 0.3))
trainset=churnTrain[ind==1,]
testset=churnTrain[ind==2,]
補充:隨機森林(Random Forest)
#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library('caret')
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library('e1071')
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
rf_model = randomForest(formula=churn ~ .,data=churnTrain)
#find best ntree
plot(rf_model)
legend("topright",colnames(rf_model$err.rate),col=1:3,cex=0.8,fill=1:3)

#find nest mtry
tuneRF(churnTrain[,-17],churnTrain[,17])
## mtry = 4 OOB error = 4.71%
## Searching left ...
## mtry = 2 OOB error = 6.21%
## -0.3184713 0.05
## Searching right ...
## mtry = 8 OOB error = 4.68%
## 0.006369427 0.05

## mtry OOBError
## 2.OOB 2 0.06210621
## 4.OOB 4 0.04710471
## 8.OOB 8 0.04680468
rf_model <- randomForest(churn ~., data = churnTrain, ntree=50,mtry=4)
# rf_model = train(churn~.,data=churnTrain,method='rf')
confusionMatrix(table(predict(rf_model,churnTest),churnTest$churn))
## Confusion Matrix and Statistics
##
##
## yes no
## yes 162 4
## no 62 1439
##
## Accuracy : 0.9604
## 95% CI : (0.9499, 0.9692)
## No Information Rate : 0.8656
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8089
## Mcnemar's Test P-Value : 2.28e-12
##
## Sensitivity : 0.72321
## Specificity : 0.99723
## Pos Pred Value : 0.97590
## Neg Pred Value : 0.95869
## Prevalence : 0.13437
## Detection Rate : 0.09718
## Detection Prevalence : 0.09958
## Balanced Accuracy : 0.86022
##
## 'Positive' Class : yes
##
rf.predict.prob <- predict(rf_model, churnTest, type="prob")
rf.prediction <- prediction(rf.predict.prob[,"yes"], as.factor(churnTest$churn))
rf.auc <- performance(rf.prediction, measure = "auc", x.measure = "cutoff")
rf.performance <- performance(rf.prediction, measure="tpr",x.measure="fpr")
plot(rf.performance)

#比較CART和RandomForest
tune_funs = expand.grid(cp=seq(0,0.1,0.01))
rpart_model =train(churn~., data=churnTrain, method="rpart",tuneGrid=tune_funs)
rpart_prob_yes = predict(rpart_model,churnTest,type='prob')[,1]
rpart_pred.rocr = prediction(rpart_prob_yes,churnTest$churn)
rpart_perf.rocr = performance(rpart_pred.rocr,measure = 'tpr',x.measure = 'fpr')
plot(rpart_perf.rocr,col='red')
plot(rf.performance,col='black',add=T)
legend(x=0.7, y=0.2, legend =c('randomforest','rpart'), fill= c("black","red"))

分群問題
距離計算
x =c(0, 0, 1, 1, 1, 1)
y =c(1, 0, 1, 1, 0, 1)
#euclidean
?dist
rbind(x,y)
## [,1] [,2] [,3] [,4] [,5] [,6]
## x 0 0 1 1 1 1
## y 1 0 1 1 0 1
dist(rbind(x,y), method ="euclidean")
## x
## y 1.414214
sqrt(sum((x-y)^2))
## [1] 1.414214
dist(rbind(x,y), method ="minkowski", p=2)
## x
## y 1.414214
#city block
dist(rbind(x,y), method ="manhattan")
## x
## y 2
sum(abs(x-y))
## [1] 2
dist(rbind(x,y), method ="minkowski", p=1)
## x
## y 2
Hierarchical Clustering
聚合式(bottom-up)
setwd('~/lecture/riii')
customer=read.csv('data/customer.csv',header=TRUE)
head(customer)
## ID Visit.Time Average.Expense Sex Age
## 1 1 3 5.7 0 10
## 2 2 5 14.5 0 27
## 3 3 16 33.5 0 32
## 4 4 5 15.9 0 30
## 5 5 16 24.9 0 23
## 6 6 3 12.0 0 15
str(customer)
## 'data.frame': 60 obs. of 5 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Visit.Time : int 3 5 16 5 16 3 12 14 6 3 ...
## $ Average.Expense: num 5.7 14.5 33.5 15.9 24.9 12 28.5 18.8 23.8 5.3 ...
## $ Sex : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Age : int 10 27 32 30 23 15 33 27 16 11 ...
#數值變數作正規化
customer_s =scale(customer[,-1])
?scale
#正規化後的變數平均數為0, 標準差為1
round(mean(customer_s[,2]),3)
## [1] 0
round(sd(customer_s[,2]),3)
## [1] 1
?hclust
hc=hclust(dist(customer_s, method="euclidean"), method="ward.D2")
plot(hc,hang =-0.01, cex=0.7)

hc3 =hclust(dist(customer, method="euclidean"), method="single")
plot(hc3, hang =-0.01, cex=0.8)

cutree
fit =cutree(hc, k =4)
fit
## [1] 1 1 2 1 2 1 2 2 1 1 1 2 2 1 1 1 2 1 2 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3
table(fit)
## fit
## 1 2 3 4
## 11 8 16 25
plot(hc, hang =-0.01, cex=0.7)
rect.hclust(hc, k =4, border="red")
rect.hclust(hc, k =3, border="blue")

c_1 = customer[fit == 1,]
summary(c_1)
## ID Visit.Time Average.Expense Sex Age
## Min. : 1.000 Min. :3.000 Min. : 4.60 Min. :0 Min. : 9
## 1st Qu.: 5.000 1st Qu.:3.500 1st Qu.: 7.15 1st Qu.:0 1st Qu.:12
## Median :10.000 Median :5.000 Median :14.50 Median :0 Median :16
## Mean : 9.636 Mean :4.909 Mean :12.71 Mean :0 Mean :17
## 3rd Qu.:14.500 3rd Qu.:6.000 3rd Qu.:16.00 3rd Qu.:0 3rd Qu.:20
## Max. :18.000 Max. :8.000 Max. :23.80 Max. :0 Max. :30
分裂式階層式(top-down)
#install.packages('cluster')
library(cluster)
?diana
dv =diana(customer_s, metric ="euclidean")
summary(dv)
## Merge:
## [,1] [,2]
## [1,] -24 -50
## [2,] -28 -46
## [3,] -7 -13
## [4,] -30 -35
## [5,] -21 -40
## [6,] -54 -58
## [7,] -23 -26
## [8,] -1 -10
## [9,] 7 -51
## [10,] -27 -59
## [11,] 5 -39
## [12,] -32 -45
## [13,] -8 -12
## [14,] -2 -4
## [15,] -14 -18
## [16,] 11 -43
## [17,] -44 -49
## [18,] 9 -56
## [19,] -37 -60
## [20,] -6 -11
## [21,] -29 -48
## [22,] -5 -19
## [23,] 10 -36
## [24,] -42 17
## [25,] -25 12
## [26,] 18 -41
## [27,] 21 -38
## [28,] 13 -17
## [29,] -34 -52
## [30,] 16 6
## [31,] 8 20
## [32,] 26 4
## [33,] 19 -57
## [34,] -47 -55
## [35,] 25 -53
## [36,] 24 -31
## [37,] 30 36
## [38,] -3 3
## [39,] -9 15
## [40,] -33 33
## [41,] 32 23
## [42,] 22 28
## [43,] 31 -15
## [44,] 37 27
## [45,] -20 40
## [46,] -22 35
## [47,] 44 34
## [48,] 14 39
## [49,] 1 29
## [50,] 45 2
## [51,] 38 42
## [52,] 43 -16
## [53,] 46 49
## [54,] 52 48
## [55,] 47 41
## [56,] 50 53
## [57,] 54 55
## [58,] 51 56
## [59,] 57 58
## Order of objects:
## [1] 1 10 6 11 15 16 2 4 9 14 18 21 40 39 43 54 58 42 44 49 31 29 48
## [24] 38 47 55 23 26 51 56 41 30 35 27 59 36 3 7 13 5 19 8 12 17 20 33
## [47] 37 60 57 28 46 22 25 32 45 53 24 50 34 52
## Height:
## [1] 0.11775833 0.92338041 0.50974266 1.47360965 2.04722777 2.51250579
## [7] 0.36355872 1.79099892 1.08967479 0.39308959 3.57679780 0.00000000
## [13] 0.21833707 0.44391855 0.80354844 0.08334529 0.98499722 0.70126085
## [19] 0.44921797 0.98499722 1.48962560 0.55960408 0.76573069 1.77868059
## [25] 0.97891452 2.79693737 0.09525176 0.12305649 0.48657744 0.76517620
## [31] 0.93270565 0.00000000 1.28196769 0.16054657 0.60321756 5.85655734
## [37] 1.07657773 0.00000000 1.98611220 0.59473487 1.44920797 0.33912975
## [43] 0.78523518 3.88572195 1.51921913 1.18521332 0.50902071 0.97225583
## [49] 1.91123321 0.00000000 3.39304108 1.52798723 0.72296652 0.31544012
## [55] 0.98335831 2.45910026 0.00000000 1.85224545 0.79085454
## Divisive coefficient:
## [1] 0.9117911
##
## 1770 dissimilarities, summarized :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.845 2.572 2.595 3.354 5.857
## Metric : euclidean
## Number of objects : 60
##
## Available components:
## [1] "order" "height" "dc" "merge" "diss" "call" "data"
plot(dv)


fit2 =cutree(dv,k=4)
c_1 = customer[fit2 ==2,]
summary(c_1)
## ID Visit.Time Average.Expense Sex
## Min. : 3.0 Min. :12.00 Min. :18.80 Min. :0
## 1st Qu.: 6.5 1st Qu.:13.50 1st Qu.:22.95 1st Qu.:0
## Median :10.0 Median :14.00 Median :25.40 Median :0
## Mean :10.5 Mean :14.38 Mean :25.59 Mean :0
## 3rd Qu.:14.0 3rd Qu.:16.00 3rd Qu.:28.50 3rd Qu.:0
## Max. :19.0 Max. :17.00 Max. :33.50 Max. :0
## Age
## Min. :18.00
## 1st Qu.:22.75
## Median :26.00
## Mean :26.62
## 3rd Qu.:32.25
## Max. :33.00