Churn Analysis

#install.packages('C50')
library(C50)
## Warning: package 'C50' was built under R version 3.3.3
data(churn)
dim(churnTrain)
## [1] 3333   20
View(churnTrain)
churnTrain <- churnTrain[ , ! names(churnTrain) %in% c('state', 'account_length', 'area_code')]


set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob= c(0.7, 0.3))

table(idx)
## idx
##    1    2 
## 2315 1018
trainset <- churnTrain[idx == 1, ]
testset  <- churnTrain[idx == 2, ]


library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset)
plot(churn.rp, margin = 0.1)
text(churn.rp)

prediction <- predict(churn.rp, testset, type = 'class')
table(testset$churn, prediction)
##      prediction
##       yes  no
##   yes 100  41
##   no   18 859
tb <- table(prediction, testset$churn)

library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
## 
##           
## prediction yes  no
##        yes 100  18
##        no   41 859
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7393          
##  Mcnemar's Test P-Value : 0.004181        
##                                           
##             Sensitivity : 0.70922         
##             Specificity : 0.97948         
##          Pos Pred Value : 0.84746         
##          Neg Pred Value : 0.95444         
##              Prevalence : 0.13851         
##          Detection Rate : 0.09823         
##    Detection Prevalence : 0.11591         
##       Balanced Accuracy : 0.84435         
##                                           
##        'Positive' Class : yes             
## 

K-fold Cross Validation

set.seed(2)
idx <- sample.int(10, nrow(churnTrain), replace=TRUE)

for (i in 1:10){
  print(i)
  trainset <- churnTrain[idx != i, ]
  testset  <- churnTrain[idx == i, ]
  churn.rp <- rpart(churn ~ ., data = trainset)  
  
  prediction <- predict(churn.rp, testset, type = 'class')
  
  tb <- table(prediction, testset$churn)
  
  cm <- confusionMatrix(tb)
  print(cm$overall[1])
}
## [1] 1
##  Accuracy 
## 0.9425982 
## [1] 2
##  Accuracy 
## 0.9354839 
## [1] 3
## Accuracy 
##     0.95 
## [1] 4
##  Accuracy 
## 0.9333333 
## [1] 5
##  Accuracy 
## 0.9382716 
## [1] 6
##  Accuracy 
## 0.9241983 
## [1] 7
##  Accuracy 
## 0.9345794 
## [1] 8
##  Accuracy 
## 0.9446254 
## [1] 9
##  Accuracy 
## 0.9498607 
## [1] 10
##  Accuracy 
## 0.9517045
library(caret)

control = trainControl(method="repeatedcv", number=10, repeats=3)

#?train
#names(getModelInfo())

model = train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)

model
## CART 
## 
## 2981 samples
##   16 predictor
##    2 classes: 'yes', 'no' 
## 
## Pre-processing: scaled (16) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2682, 2684, 2683, 2684, 2683, 2682, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05069124  0.9143537  0.5865270
##   0.07949309  0.8690623  0.2109233
##   0.08064516  0.8664873  0.1830921
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.05069124.
#predict(model, testset)

邏輯迴歸分析實做

set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob= c(0.7, 0.3))

trainset <- churnTrain[idx == 1, ]
testset  <- churnTrain[idx == 2, ]


fit <- glm(churn ~ ., data = trainset, family=binomial)

summary(fit)
## 
## Call:
## glm(formula = churn ~ ., family = binomial, data = trainset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1519   0.1983   0.3460   0.5186   2.1284  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    8.3462866  0.8364914   9.978  < 2e-16 ***
## international_planyes         -2.0534243  0.1726694 -11.892  < 2e-16 ***
## voice_mail_planyes             1.3445887  0.6618905   2.031 0.042211 *  
## number_vmail_messages         -0.0155101  0.0209220  -0.741 0.458496    
## total_day_minutes              0.2398946  3.9168466   0.061 0.951163    
## total_day_calls               -0.0014003  0.0032769  -0.427 0.669141    
## total_day_charge              -1.4855284 23.0402950  -0.064 0.948592    
## total_eve_minutes              0.3600678  1.9349825   0.186 0.852379    
## total_eve_calls               -0.0028484  0.0033061  -0.862 0.388928    
## total_eve_charge              -4.3204432 22.7644698  -0.190 0.849475    
## total_night_minutes            0.4431210  1.0478105   0.423 0.672367    
## total_night_calls              0.0003978  0.0033188   0.120 0.904588    
## total_night_charge            -9.9162795 23.2836376  -0.426 0.670188    
## total_intl_minutes             0.4587114  6.3524560   0.072 0.942435    
## total_intl_calls               0.1065264  0.0304318   3.500 0.000464 ***
## total_intl_charge             -2.0803428 23.5262100  -0.088 0.929538    
## number_customer_service_calls -0.5109077  0.0476289 -10.727  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1938.8  on 2314  degrees of freedom
## Residual deviance: 1515.3  on 2298  degrees of freedom
## AIC: 1549.3
## 
## Number of Fisher Scoring iterations: 6
pred <- predict(fit,testset, type="response")
Class = pred >.5
summary(Class)
##    Mode   FALSE    TRUE    NA's 
## logical      58     960       0
tb = table(testset$churn,Class)
tb
##      Class
##       FALSE TRUE
##   yes    29  112
##   no     29  848
churn.mod = ifelse(testset$churn == "yes", 1, 0)
pred_class = churn.mod
pred_class[pred<=.5] = 1- pred_class[pred<=.5]
ctb = table(churn.mod, pred_class)
ctb
##          pred_class
## churn.mod   0   1
##         0 848  29
##         1  29 112
confusionMatrix(ctb)
## Confusion Matrix and Statistics
## 
##          pred_class
## churn.mod   0   1
##         0 848  29
##         1  29 112
##                                          
##                Accuracy : 0.943          
##                  95% CI : (0.927, 0.9565)
##     No Information Rate : 0.8615         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.7613         
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9669         
##             Specificity : 0.7943         
##          Pos Pred Value : 0.9669         
##          Neg Pred Value : 0.7943         
##              Prevalence : 0.8615         
##          Detection Rate : 0.8330         
##    Detection Prevalence : 0.8615         
##       Balanced Accuracy : 0.8806         
##                                          
##        'Positive' Class : 0              
## 

Use SVM

library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
model.tuned <- svm(churn~., data = trainset, gamma = 10^-2, cost = 100)
 summary(model.tuned)
## 
## Call:
## svm(formula = churn ~ ., data = trainset, gamma = 10^-2, cost = 100)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  100 
##       gamma:  0.01 
## 
## Number of Support Vectors:  547
## 
##  ( 304 243 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  yes no
svm.tuned.pred <- predict(model.tuned, testset[, !names(testset) %in% c("churn")])
 svm.tuned.table=table(svm.tuned.pred, testset$churn)
 svm.tuned.table
##               
## svm.tuned.pred yes  no
##            yes  95  24
##            no   46 853
 classAgreement(svm.tuned.table)
## $diag
## [1] 0.9312377
## 
## $kappa
## [1] 0.691678
## 
## $rand
## [1] 0.871806
## 
## $crand
## [1] 0.6303615
 confusionMatrix(svm.tuned.table)
## Confusion Matrix and Statistics
## 
##               
## svm.tuned.pred yes  no
##            yes  95  24
##            no   46 853
##                                          
##                Accuracy : 0.9312         
##                  95% CI : (0.9139, 0.946)
##     No Information Rate : 0.8615         
##     P-Value [Acc > NIR] : 1.56e-12       
##                                          
##                   Kappa : 0.6917         
##  Mcnemar's Test P-Value : 0.01207        
##                                          
##             Sensitivity : 0.67376        
##             Specificity : 0.97263        
##          Pos Pred Value : 0.79832        
##          Neg Pred Value : 0.94883        
##              Prevalence : 0.13851        
##          Detection Rate : 0.09332        
##    Detection Prevalence : 0.11690        
##       Balanced Accuracy : 0.82320        
##                                          
##        'Positive' Class : yes            
## 

繪製ROC 曲線

library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset)


prediction <- predict(churn.rp, testset)


roc_x <- c(0)
roc_y <- c(0)
for(i in seq(0.1,1,0.01)){
  res <- as.factor(ifelse(prediction[,1] >= i, 'yes', 'no'))
  res <- factor(res,levels(res)[c(2,1)])

  tb <- table(testset$churn, res)
  cm <- confusionMatrix(tb)
  x <- 1 - cm$byClass[2]
  y <- cm$byClass[1]
  roc_x <- c(roc_x, x)
  roc_y <- c(roc_y, y)
}
roc_x <- c(roc_x, 1)
roc_y <- c(roc_y, 1)
plot(roc_x, roc_y, type='b')

## Using ROCR
#install.packages('ROCR')
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.3.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.3.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])])) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

## 找出最重要的變數

#install.packages("rminer")
library(rminer)
## Warning: package 'rminer' was built under R version 3.3.3
model <- fit(churn~.,trainset,model="svm")

VariableImportance <- Importance(model,trainset,method="sensv")

L=list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)

mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

使用R 計算距離

?dist
## starting httpd help server ...
##  done
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)

sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
##          x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
##          x
## y 1.414214
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
##   x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
##   x
## y 2

階層式分群

data(iris)
iris.dist <- dist(iris[, -5], 'euclidean')

hc <- hclust(iris.dist,method = 'ward.D2')
plot(hc)

plot(hc, hang = -0.01, cex = 0.7)

plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3, border = 'red')

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species )

fit <- cutree(hc, k= 3)
fit
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
plot(iris$Petal.Length, iris$Petal.Width, col=fit )

par(mfrow=c(1,2))
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species )
plot(iris$Petal.Length, iris$Petal.Width, col=fit )

## Applenews 分群

# https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
load('applenews.RData')

dim(applenews)
head(applenews)

library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, jiebar = mixseg ))

library(tm)
s.corpus   <- Corpus(VectorSource(apple.seg))
doc        <- tm_map(s.corpus, removePunctuation)
doc        <- tm_map(doc, removeNumbers)
dtm        <- DocumentTermMatrix(doc)
dtm.remove <- removeSparseTerms(dtm, 0.99)

library(proxy)
dist.mat <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
mat <- as.matrix(dist.mat)

# Find nearest article
applenews$title[order(mat[20,])][1:10]

# Articel Clustering

hc <- hclust(dist.mat, method = 'ward.D2')
plot(hc)
rect.hclust(hc, 10)

fit <- cutree(hc, 20)
table(fit)


applenews$title[fit == 6]

KMeans 分群

download.file('https://github.com/ywchiu/rtibame/raw/master/data/customer.csv', 'customer.csv')

customer <- read.csv('customer.csv')
customer <- scale(customer[,-1])

set.seed(22)
fit <- kmeans(customer, 4 )
fit
## K-means clustering with 4 clusters of sizes 8, 11, 16, 25
## 
## Cluster means:
##   Visit.Time Average.Expense        Sex        Age
## 1  1.3302016       1.0155226 -1.4566845  0.5591307
## 2 -0.7771737      -0.5178412 -1.4566845 -0.4774599
## 3  0.8571173       0.9887331  0.6750489  1.0505015
## 4 -0.6322632      -0.7299063  0.6750489 -0.6411604
## 
## Clustering vector:
##  [1] 2 2 1 2 1 2 1 1 2 2 2 1 1 2 2 2 1 2 1 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3
## 
## Within cluster sum of squares by cluster:
## [1]  5.90040 11.97454 22.58236 20.89159
##  (between_SS / total_SS =  74.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
#plot(customer$Visit.Time, customer$Average.Expense, col = fit$cluster)

library(cluster)
set.seed(22)
km  <- kmeans(customer, 4)
kms <- silhouette(km$cluster,dist(customer))
summary(kms)
## Silhouette of 60 units in 4 clusters from silhouette.default(x = km$cluster, dist = dist(customer)) :
##  Cluster sizes and average silhouette widths:
##         8        11        16        25 
## 0.5464597 0.4080823 0.3794910 0.5164434 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1931  0.4030  0.4890  0.4641  0.5422  0.6333
plot(kms)

# install.packages("fpc")
library(fpc)
## Warning: package 'fpc' was built under R version 3.3.3
nk <- 2:10
set.seed(22)
sw <- sapply(nk , function(k){
      cluster.stats(dist(customer[,-1]), kmeans(customer[,-1], centers=k)$cluster)$avg.silwidth
})

plot(nk, sw, type="l", xlab="number of clusers", ylab="average silhouette width")

nk = 2:10
set.seed(22)
WSS = sapply(nk, function(k) {
    kmeans(customer, centers=k)$tot.withinss
})
WSS
## [1] 123.49224  88.07028  61.34890  48.76431  47.20813  45.48114  29.58014
## [8]  28.87519  23.21331
plot(nk, WSS, type="l", xlab= "number of k", ylab="within sum of squares")

比較分群演算法

single_c  <-  hclust(dist(customer), method="single")
hc_single <-  cutree(single_c, k = 4)

complete_c <-  hclust(dist(customer), method="complete")
hc_complete <-  cutree(complete_c, k = 4)

set.seed(22)
km <- kmeans(customer, 4)

cs <- cluster.stats(dist(customer), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 61.3489
## 
## $avg.silwidth
## [1] 0.4640587
sapply(list(kmeans = km$cluster, hc_single = hc_single, hc_complete = hc_complete), function(c) cluster.stats(dist(customer), c)[c("within.cluster.ss","avg.silwidth")])
##                   kmeans    hc_single hc_complete
## within.cluster.ss 61.3489   136.0092  65.94076   
## avg.silwidth      0.4640587 0.2481926 0.4255961

Using Neural Network

 data(iris)
 ind <- sample(2, nrow(iris), replace = TRUE, prob=c(0.7, 0.3))
 trainset <- iris[ind == 1,]
 testset  <- iris[ind == 2,]

 #install.packages("neuralnet")
 library(neuralnet)
## Warning: package 'neuralnet' was built under R version 3.3.3
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
## 
##     prediction
 #trainset
 trainset$setosa     <- trainset$Species == "setosa"
 trainset$virginica  <- trainset$Species == "virginica"
 trainset$versicolor <- trainset$Species == "versicolor"
 network <-  neuralnet(versicolor + virginica + setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, trainset, hidden=3)
#network
#network$result.matrix
#head(network$generalized.weights[[1]])
  plot(network)

 
  res <- compute(network, testset[-5])$net.result
  res
##                   [,1]             [,2]             [,3]
## 1   -0.000112341503913 -0.0011361437077  1.0011210699510
## 2    0.000171924625309 -0.0008509294624  1.0005516770555
## 4    0.000198773395306 -0.0008239911434  1.0004978982288
## 7    0.000179335424060 -0.0008434939464  1.0005368330212
## 10  -0.000064210732419 -0.0010878524051  1.0010246626885
## 20  -0.000071233625953 -0.0010948987222  1.0010387297379
## 21  -0.000008221233853 -0.0010316761641  1.0009125141789
## 22   0.000230977162540 -0.0007916799666  1.0004333931950
## 24   0.003906276182324  0.0028958773195  0.9930716679109
## 29  -0.000088440803922 -0.0011121632914  1.0010731961899
## 31   0.000245145141199 -0.0007774647333  1.0004050143431
## 33  -0.000206623645893 -0.0012307403100  1.0013099196813
## 47  -0.000144232500131 -0.0011681410703  1.0011849484958
## 53   0.500037999419108  0.5006823542983 -0.0006938547066
## 57   0.500045009008528  0.5006893872669 -0.0007078951075
## 61   0.499017716292572  0.4996586682100  0.0013498005304
## 62   0.500009004169413  0.5006532623405 -0.0006357764220
## 68   0.497459823928507  0.4980955797899  0.0044703018950
## 71   0.500066805776966  0.5007112567344 -0.0007515546358
## 79   0.500041171387181  0.5006855368459 -0.0007002082462
## 81   0.499252465332441  0.4998942002110  0.0008795917303
## 82   0.497719480946298  0.4983561028447  0.0039502017282
## 83   0.499403135404458  0.5000453728146  0.0005777954213
## 85   0.500041231118495  0.5006855967765 -0.0007003278896
## 87   0.500027401100188  0.5006717206307 -0.0006726259816
## 92   0.499995391557168  0.5006396043259 -0.0006085099843
## 95   0.499922893434478  0.5005668643994 -0.0004632942454
## 99   0.496127036434409  0.4967583470346  0.0071399119976
## 105  0.500069119464522  0.5007135781388 -0.0007561890158
## 109  0.500069038524807  0.5007134969291 -0.0007560268913
## 110  0.500069125946325  0.5007135846422 -0.0007562019990
## 112  0.500068990358979  0.5007134486026 -0.0007559304138
## 113  0.500069087483520  0.5007135460511 -0.0007561249569
## 119  0.500069127713582  0.5007135864154 -0.0007562055388
## 122  0.500069016010607  0.5007134743398 -0.0007559817947
## 124  0.500068458765984  0.5007129152366 -0.0007548656184
## 127  0.500068138888507  0.5007125942923 -0.0007542248950
## 128  0.500067845585739  0.5007123000112 -0.0007536374014
## 130  0.500066996657084  0.5007114482511 -0.0007519369739
## 131  0.500069068924737  0.5007135274304 -0.0007560877832
## 136  0.500069124669376  0.5007135833610 -0.0007561994412
## 146  0.500069116068220  0.5007135747312 -0.0007561822129
## 150  0.500068316190427  0.5007127721855 -0.0007545800356
 prediction <- c("versicolor", "virginica", "setosa")[apply(res, 1, which.max)]
 
 predict.table <- table(testset$Species, prediction)
 predict.table
##             prediction
##              setosa virginica
##   setosa         13         0
##   versicolor      0        15
##   virginica       0        15
 prediction
##  [1] "setosa"    "setosa"    "setosa"    "setosa"    "setosa"   
##  [6] "setosa"    "setosa"    "setosa"    "setosa"    "setosa"   
## [11] "setosa"    "setosa"    "setosa"    "virginica" "virginica"
## [16] "virginica" "virginica" "virginica" "virginica" "virginica"
## [21] "virginica" "virginica" "virginica" "virginica" "virginica"
## [26] "virginica" "virginica" "virginica" "virginica" "virginica"
## [31] "virginica" "virginica" "virginica" "virginica" "virginica"
## [36] "virginica" "virginica" "virginica" "virginica" "virginica"
## [41] "virginica" "virginica" "virginica"