Churn Analysis

#install.packages('C50')
library(C50)
data(churn)
str(churnTrain)
## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
dim(churnTrain)
## [1] 3333   20
churnTrain <- churnTrain[, ! names(churnTrain) %in% c('state', 'account_length', 'area_code')]

set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob = c(0.7,0.3))
trainset <- churnTrain[idx == 1,]
testset  <- churnTrain[idx == 2,]

dim(trainset)
## [1] 2315   17
dim(testset)
## [1] 1018   17
library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset)
plot(churn.rp, margin = 0.1)
text(churn.rp)

predictions <- predict(churn.rp, testset, type= 'class')
sum(predictions == testset$churn) / length(predictions)
## [1] 0.9420432
tb <- table(testset$churn, predictions)


#install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# note: confusionMatrix() treats the rows of a supplied table as predictions
# and the columns as the reference, so with tb = table(truth, predictions)
# the reported Sensitivity and Pos Pred Value are effectively swapped;
# confusionMatrix(predictions, testset$churn) avoids this
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
## 
##      predictions
##       yes  no
##   yes 100  41
##   no   18 859
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8841          
##     P-Value [Acc > NIR] : 2.052e-10       
##                                           
##                   Kappa : 0.7393          
##  Mcnemar's Test P-Value : 0.004181        
##                                           
##             Sensitivity : 0.84746         
##             Specificity : 0.95444         
##          Pos Pred Value : 0.70922         
##          Neg Pred Value : 0.97948         
##              Prevalence : 0.11591         
##          Detection Rate : 0.09823         
##    Detection Prevalence : 0.13851         
##       Balanced Accuracy : 0.90095         
##                                           
##        'Positive' Class : yes             
## 
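The headline numbers can be cross-checked by hand from the counts in tb, treating 'yes' as the positive class and the rows of tb as the true labels; a minimal sketch:

TP <- tb['yes', 'yes']; FN <- tb['yes', 'no']
FP <- tb['no', 'yes'];  TN <- tb['no', 'no']
c(accuracy  = (TP + TN) / sum(tb),
  recall    = TP / (TP + FN),   # sensitivity with respect to the true labels
  precision = TP / (TP + FP))
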
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model   <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model
## CART 
## 
## 2315 samples
##   16 predictor
##    2 classes: 'yes', 'no' 
## 
## Pre-processing: scaled (16) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2084, 2083, 2083, 2083, 2083, 2084, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05555556  0.9036559  0.5315844
##   0.07456140  0.8671038  0.2796749
##   0.07602339  0.8613423  0.2142758
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05555556.
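For a quick check that the cross-validated cp choice generalises, the tuned caret model can also be scored on the held-out test set; a minimal sketch:

caret.pred <- predict(model, testset)
confusionMatrix(caret.pred, testset$churn)
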
predictions <- predict(churn.rp, testset, type = 'prob')

fpr_ary <- c(0)
tpr_ary <- c(0)

# sweep the threshold from high to low so the (FPR, TPR) points run from
# (0, 0) towards (1, 1)
for (i in seq(1, 0, -0.01)){
  res        <- ifelse(predictions[, 1] > i, 'yes', 'no')
  res_factor <- factor(res, levels = c('yes', 'no'))

  tb <- table(testset$churn, res_factor)

  # rows are the true classes, columns the thresholded predictions
  TP <- tb[1, 1]
  FN <- tb[1, 2]
  FP <- tb[2, 1]
  TN <- tb[2, 2]
  if ((TP > 0) & (FP > 0) & (FN > 0) & (TN > 0)){
    FPR <- FP / (FP + TN)
    TPR <- TP / (TP + FN)
    fpr_ary <- c(fpr_ary, FPR)
    tpr_ary <- c(tpr_ary, TPR)
  }
}

fpr_ary <- c(fpr_ary, 1)
tpr_ary <- c(tpr_ary, 1)


plot(fpr_ary, tpr_ary, type = 'o', main = 'ROC Curve', xlab = 'FPR', ylab = 'TPR', col= 'red', ylim = c(0,1), xlim = c(0,1))
abline(0, 1)
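With the points ordered by increasing FPR, the area under this hand-rolled curve can be approximated with the trapezoidal rule; it should land close to the AUC that ROCR reports below. A minimal sketch:

# trapezoidal approximation of the area under the manual ROC curve
sum(diff(fpr_ary) * (head(tpr_ary, -1) + tail(tpr_ary, -1)) / 2)
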

#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(churn.rp, testset, type = "prob")
pred.to.roc <- predictions[, 1]

pred.rocr <- prediction(pred.to.roc, as.factor(testset$churn))

perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")

perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr")

plot(perf.tpr.rocr, colorize = TRUE, main = paste("AUC:", perf.rocr@y.values[[1]]))

RNN

library(rnn)

# create training numbers
X1 = sample(0:127, 7000, replace=TRUE)
X2 = sample(0:127, 7000, replace=TRUE)

# create training response numbers
Y <- X1 + X2

# convert to binary
X1 <- int2bin(X1, length=8)
X2 <- int2bin(X2, length=8)
Y  <- int2bin(Y,  length=8)

# create 3d array: dim 1: samples; dim 2: time; dim 3: variables
X <- array( c(X1,X2), dim=c(dim(X1),2) )

# train the model
model <- trainr(Y=Y,
                X=X,
                learningrate   =  0.1,
                hidden_dim     = 10   )

A1 = int2bin( sample(0:127, 7000, replace=TRUE) )
A2 = int2bin( sample(0:127, 7000, replace=TRUE) )

# create 3d array: dim 1: samples; dim 2: time; dim 3: variables
A <- array( c(A1,A2), dim=c(dim(A1),2) )
    
# predict
B <- predictr(model, A)   # keep the same time ordering as in training
B
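B holds one output per bit, so rounding it and converting back to integers shows whether the network really learned 8-bit addition; a minimal sketch, assuming the rnn package's bin2int() helper:

pred_int <- bin2int(round(B))          # predicted sums as integers
true_int <- bin2int(A1) + bin2int(A2)  # true sums of the test inputs
mean(pred_int == true_int)             # share of sums predicted exactly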

Random Forest

#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
forest <-randomForest(churn ~., data=trainset, ntree=200, importance=T, proximity=T)

forest.predicted<-predict(forest, testset, type ="class")
table(testset$churn, forest.predicted)
##      forest.predicted
##       yes  no
##   yes 110  31
##   no    7 870
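Because the forest was grown with importance=T, the per-variable importance measures can be inspected directly; a minimal sketch:

importance(forest)   # MeanDecreaseAccuracy / MeanDecreaseGini per variable
varImpPlot(forest)
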
# Decision Tree
predictions1 <-predict(churn.rp, testset, type="prob")
pred.to.roc1 <-predictions1[, 1]
pred.rocr1 <-prediction(pred.to.roc1, as.factor(testset$churn))
perf.rocr1 <-performance(pred.rocr1, measure ="auc", x.measure="cutoff")
perf.tpr.rocr1 <-performance(pred.rocr1, "tpr","fpr")

# Random Forest
predictions2 <-predict(forest, testset, type="prob")
pred.to.roc2 <-predictions2[, 1]
pred.rocr2 <-prediction(pred.to.roc2, as.factor(testset$churn))
perf.rocr2 <-performance(pred.rocr2, measure ="auc", x.measure="cutoff")
perf.tpr.rocr2 <-performance(pred.rocr2, "tpr","fpr")

plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), col = 1:2, lty = 1)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Find the Most Informative Features

#install.packages("rminer")
library(rminer)
model=fit(churn~.,trainset,model="svm")

VariableImportance<-Importance(model,trainset,method="sensv")
L<-list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

Compare Models

library(caret)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
#names(getModelInfo())
model1   <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model2   <- train(churn~., data=trainset, method="knn", preProcess="scale", trControl=control)
model3   <- train(churn~., data=trainset, method="rf", preProcess="scale", trControl=control)

getROC <- function(model, dataset){
  predictions2 <-predict(model, dataset, type="prob")
  pred.to.roc2 <-predictions2[, 1]
  pred.rocr2 <-prediction(pred.to.roc2, as.factor(dataset$churn))
  perf.rocr2 <-performance(pred.rocr2, measure ="auc", x.measure="cutoff")
  perf.tpr.rocr2 <-performance(pred.rocr2, "tpr","fpr")
  return(perf.tpr.rocr2)
}

plot(getROC(model1,testset),main='ROC Curve', col=1)
plot(getROC(model2,testset), col=2, add=TRUE)
plot(getROC(model3,testset), col=3, add=TRUE)
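Besides overlaying ROC curves, caret can compare the three models on their resampled accuracy and kappa; a minimal sketch using resamples():

results <- resamples(list(rpart = model1, knn = model2, rf = model3))
summary(results)
bwplot(results)
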

Clustering

x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)


# euclidean distance
sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
##          x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
##          x
## y 1.414214
# Manhattan distance
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
##   x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
##   x
## y 2
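dist() also accepts a whole data frame and returns every pairwise distance, which is the form the clustering functions below consume; a minimal sketch on the first four rows of the built-in iris data (loaded explicitly below):

as.matrix(dist(iris[1:4, -5], method = 'euclidean'))
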

Hierarchical Clustering

data(iris)
hc <- hclust(dist(iris[,-5], method = 'euclidean'), method='ward.D2'   )
plot(hc,hang = -0.01, cex= 0.7)


fit <- cutree(hc, k = 3)
table(fit)
## fit
##  1  2  3 
## 50 64 36
plot(hc,hang = -0.01, cex= 0.7)
rect.hclust(hc, k = 3 , border = 'red')

par(mfrow=c(1,2))
plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species,main = 'Real DataSet')
plot(Petal.Length ~ Petal.Width, data = iris, col = fit,main = 'Clustering')
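Since iris keeps the true species labels, the cut tree can be cross-tabulated against them to see how well the three clusters recover the species; a minimal sketch:

table(iris$Species, fit)
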

KMeans

fit <-kmeans(iris[,-5], 3)
barplot(t(fit$centers), beside =TRUE,xlab="cluster", ylab="value")

par(mfrow=c(1,2))
plot(Petal.Width ~ Petal.Length, data = iris, col = iris$Species,main = 'Real DataSet')
plot(Petal.Width  ~ Petal.Length, data = iris, col = fit$cluster,main = 'KMeans Clustering')

Evaluate Clusters

library(cluster)
set.seed(22)
km <-kmeans(iris[,-5], 3)
kms<-silhouette(km$cluster,dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
##  Cluster sizes and average silhouette widths:
##        50        62        38 
## 0.7981405 0.4173199 0.4511051 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391
plot(kms)

nk <-2:10

set.seed(22)
WSS <- sapply(nk, function(k){
  kmeans(iris[,-5], centers=k)$tot.withinss
})
WSS
## [1] 152.34795  78.85144  57.26562  46.44618  42.42965  37.39514  34.75335
## [8]  41.96594  33.71865
plot(nk, WSS, type= 'l')
abline(v = 3, col= 'red')

# install.packages('fpc')
library(fpc)


nk<-2:10
set.seed(123)
SW <- sapply(nk, function(k){
  cluster.stats(dist(iris[,-5]), kmeans(iris[,-5], centers=k)$cluster)$avg.silwidth
  })

plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")

Compare Clustering Methods

single_c<-hclust(dist(iris[,-5]), method="single")
hc_single<-cutree(single_c, k =3)

complete_c<-hclust(dist(iris[,-5]), method="complete")
hc_complete<-cutree(complete_c, k =3)

set.seed(42)
km <-kmeans(iris[,-5], 3)
cs<-cluster.stats(dist(iris[,-5]), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 78.85144
## 
## $avg.silwidth
## [1] 0.552819
sapply(
  list(kmeans=km$cluster, 
       hc_single=hc_single, 
       hc_complete=hc_complete), function(c)cluster.stats(dist(iris[,-5]), c)[c("within.cluster.ss","avg.silwidth")])
##                   kmeans   hc_single hc_complete
## within.cluster.ss 78.85144 142.4794  89.52501   
## avg.silwidth      0.552819 0.5121108 0.5135953

Customer Segmentation

customers <- read.csv('https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/customers.csv')
head(customers)
##   CustomerID  Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1          1   Male  19                 15                     39
## 2          2   Male  21                 15                     81
## 3          3 Female  20                 16                      6
## 4          4 Female  23                 16                     77
## 5          5 Female  31                 17                     40
## 6          6 Female  22                 17                     76
customers <- customers[,c(4,5)]
names(customers) <- c('annual_income', 'spending_score')


plot(spending_score ~ annual_income, data = customers)

set.seed(54)
fit <- kmeans(customers, 5)
plot(spending_score ~ annual_income, data = customers, col=fit$cluster)
points(fit$centers[,1], fit$centers[,2], col='orange')
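The fitted centers and cluster sizes summarise the five customer segments in the income/spending plane; a minimal sketch:

fit$centers   # mean annual_income and spending_score per segment
fit$size      # number of customers per segment
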

kcs<-silhouette(fit$cluster,dist(customers))
summary(kcs)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = fit$cluster, dist = dist(customers)) :
##  Cluster sizes and average silhouette widths:
##        39        81        35        23        22 
## 0.5091706 0.5966512 0.5039873 0.5122676 0.5990129 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.02338  0.49425  0.59614  0.55393  0.66011  0.75807
plot(kcs)

nk<-2:10
set.seed(123)

SW <-sapply(nk, function(k){cluster.stats(dist(customers), kmeans(customers, centers=k)$cluster)$avg.silwidth})


plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")

Text Mining

s <- '一位網友近日抱怨說,買了1包100的蝦餅,結果打開只有8片!她問:可以多一點嗎?'

strsplit(s,',|!|:|?')
## [[1]]
## [1] "一位網友近日抱怨說" "買了1包100的蝦餅"   "結果打開只有8片"   
## [4] "她問"               "可以多一點嗎"
#install.packages('jiebaR')
library(jiebaR)
## Loading required package: jiebaRD
mixseg <- worker()

s="那我們酸民婉君也可以報名嗎"
segment(code = s, jiebar = mixseg)
## [1] "那"   "我們" "酸民" "婉君" "也"   "可以" "報名" "嗎"
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
USERPATH
## [1] "C:/Program Files/R/R-3.4.4/library/jiebaRD/dict/user.dict.utf8"
tagseg <- worker('tag')
segment(code = s, jiebar = tagseg)
##      r      r      n      x      d      c      v      y 
##   "那" "我們" "酸民" "婉君"   "也" "可以" "報名"   "嗎"

N-gram

#install.packages("NLP")
library(NLP)
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
s <-strsplit(x="那我們酸民婉君也可以報名嗎", split='')
s
## [[1]]
##  [1] "那" "我" "們" "酸" "民" "婉" "君" "也" "可" "以" "報" "名" "嗎"
bigram <-ngrams(unlist(s), 2)
bigram
## [[1]]
## [1] "那" "我"
## 
## [[2]]
## [1] "我" "們"
## 
## [[3]]
## [1] "們" "酸"
## 
## [[4]]
## [1] "酸" "民"
## 
## [[5]]
## [1] "民" "婉"
## 
## [[6]]
## [1] "婉" "君"
## 
## [[7]]
## [1] "君" "也"
## 
## [[8]]
## [1] "也" "可"
## 
## [[9]]
## [1] "可" "以"
## 
## [[10]]
## [1] "以" "報"
## 
## [[11]]
## [1] "報" "名"
## 
## [[12]]
## [1] "名" "嗎"
vapply(bigram, paste , '' , collapse = '')
##  [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
s <-strsplit(x="那我們酸民婉君也可以報名嗎", split='')
trigram <-ngrams(unlist(s), 3)
vapply(trigram, paste, "", collapse ="")
##  [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
##  [8] "也可以" "可以報" "以報名" "報名嗎"
news <- '研究業者IDC的數據顯示,隨著消費者逐漸察覺智慧手錶與醫療保健和行動支付有關的功能,2022年全球智慧手錶規模預估將比今年大1倍。

根據IDC的數據,全球智慧手錶今年總出貨量預估將達到4360萬支,2022年出貨量將進一步提高至8410萬支,2018至2022年出貨量年複合成長率(CAGR)為17.9%。2018年全球穿戴式裝置出貨量估為1.329億部,2022年出貨量估為2.194億部。

IDC行動裝置追蹤系統資深研究分析師烏布拉尼(Jitesh Ubrani)說:「消費者終於開始了解和需要智慧手錶的功能,目前體適能用途處於領先,但行動支付和通訊已展開追勢。」

Strategy Analytics指出,蘋果智慧手錶Apple Watch去年在全球智慧手錶市場佔有率達到60.4%,是主宰市場的產品,三星電子以10.6%的市佔率居次。(林文彬/綜合外電報導)'

s <-strsplit(x=news, split='')

trigram <- ngrams(unlist(s), 3)
#sort(table(vapply(trigram, paste, '', collapse = '')), decreasing = TRUE)

s <-strsplit(x=news, split=',|:|。')
s
## [[1]]
##  [1] "研究業者IDC的數據顯示"                                           
##  [2] "隨著消費者逐漸察覺智慧手錶與醫療保健和行動支付有關的功能"        
##  [3] "2022年全球智慧手錶規模預估將比今年大1倍"                         
##  [4] "\n\n根據IDC的數據"                                               
##  [5] "全球智慧手錶今年總出貨量預估將達到4360萬支"                      
##  [6] "2022年出貨量將進一步提高至8410萬支"                              
##  [7] "2018至2022年出貨量年複合成長率(CAGR)為17.9%"                   
##  [8] "2018年全球穿戴式裝置出貨量估為1.329億部"                         
##  [9] "2022年出貨量估為2.194億部"                                       
## [10] "\n\nIDC行動裝置追蹤系統資深研究分析師烏布拉尼(Jitesh Ubrani)說"
## [11] "「消費者終於開始了解和需要智慧手錶的功能"                        
## [12] "目前體適能用途處於領先"                                          
## [13] "但行動支付和通訊已展開追勢"                                      
## [14] "」\n\nStrategy Analytics指出"                                    
## [15] "蘋果智慧手錶Apple Watch去年在全球智慧手錶市場佔有率達到60.4%"    
## [16] "是主宰市場的產品"                                                
## [17] "三星電子以10.6%的市佔率居次"                                     
## [18] "(林文彬/綜合外電報導)"
a.split<-strsplit(news, "、|,|。")
w.split<-strsplit(x=unlist(a.split), split='')

w.split
## [[1]]
##  [1] "研" "究" "業" "者" "I"  "D"  "C"  "的" "數" "據" "顯" "示"
## 
## [[2]]
##  [1] "隨" "著" "消" "費" "者" "逐" "漸" "察" "覺" "智" "慧" "手" "錶" "與"
## [15] "醫" "療" "保" "健" "和" "行" "動" "支" "付" "有" "關" "的" "功" "能"
## 
## [[3]]
##  [1] "2"  "0"  "2"  "2"  "年" "全" "球" "智" "慧" "手" "錶" "規" "模" "預"
## [15] "估" "將" "比" "今" "年" "大" "1"  "倍"
## 
## [[4]]
##  [1] "\n" "\n" "根" "據" "I"  "D"  "C"  "的" "數" "據"
## 
## [[5]]
##  [1] "全" "球" "智" "慧" "手" "錶" "今" "年" "總" "出" "貨" "量" "預" "估"
## [15] "將" "達" "到" "4"  "3"  "6"  "0"  "萬" "支"
## 
## [[6]]
##  [1] "2"  "0"  "2"  "2"  "年" "出" "貨" "量" "將" "進" "一" "步" "提" "高"
## [15] "至" "8"  "4"  "1"  "0"  "萬" "支"
## 
## [[7]]
##  [1] "2"  "0"  "1"  "8"  "至" "2"  "0"  "2"  "2"  "年" "出" "貨" "量" "年"
## [15] "複" "合" "成" "長" "率" "(" "C"  "A"  "G"  "R"  ")" "為" "1"  "7" 
## [29] "."  "9"  "%" 
## 
## [[8]]
##  [1] "2"  "0"  "1"  "8"  "年" "全" "球" "穿" "戴" "式" "裝" "置" "出" "貨"
## [15] "量" "估" "為" "1"  "."  "3"  "2"  "9"  "億" "部"
## 
## [[9]]
##  [1] "2"  "0"  "2"  "2"  "年" "出" "貨" "量" "估" "為" "2"  "."  "1"  "9" 
## [15] "4"  "億" "部"
## 
## [[10]]
##  [1] "\n" "\n" "I"  "D"  "C"  "行" "動" "裝" "置" "追" "蹤" "系" "統" "資"
## [15] "深" "研" "究" "分" "析" "師" "烏" "布" "拉" "尼" "(" "J"  "i"  "t" 
## [29] "e"  "s"  "h"  " "  "U"  "b"  "r"  "a"  "n"  "i"  ")" "說" ":" "「"
## [43] "消" "費" "者" "終" "於" "開" "始" "了" "解" "和" "需" "要" "智" "慧"
## [57] "手" "錶" "的" "功" "能"
## 
## [[11]]
##  [1] "目" "前" "體" "適" "能" "用" "途" "處" "於" "領" "先"
## 
## [[12]]
##  [1] "但" "行" "動" "支" "付" "和" "通" "訊" "已" "展" "開" "追" "勢"
## 
## [[13]]
##  [1] "」" "\n" "\n" "S"  "t"  "r"  "a"  "t"  "e"  "g"  "y"  " "  "A"  "n" 
## [15] "a"  "l"  "y"  "t"  "i"  "c"  "s"  "指" "出"
## 
## [[14]]
##  [1] "蘋" "果" "智" "慧" "手" "錶" "A"  "p"  "p"  "l"  "e"  " "  "W"  "a" 
## [15] "t"  "c"  "h"  "去" "年" "在" "全" "球" "智" "慧" "手" "錶" "市" "場"
## [29] "佔" "有" "率" "達" "到" "6"  "0"  "."  "4"  "%" 
## 
## [[15]]
## [1] "是" "主" "宰" "市" "場" "的" "產" "品"
## 
## [[16]]
##  [1] "三" "星" "電" "子" "以" "1"  "0"  "."  "6"  "%"  "的" "市" "佔" "率"
## [15] "居" "次"
## 
## [[17]]
##  [1] "(" "林" "文" "彬" "/" "綜" "合" "外" "電" "報" "導" ")"
bigram <-function(w){
  bigram <-ngrams(unlist(w), 2)
  bigram.str<-vapply(bigram, paste, "", collapse =" ")
  bigram.str
}
bigram.all<-sapply(w.split, bigram)
bigram.all
## [[1]]
##  [1] "研 究" "究 業" "業 者" "者 I"  "I D"   "D C"   "C 的"  "的 數"
##  [9] "數 據" "據 顯" "顯 示"
## 
## [[2]]
##  [1] "隨 著" "著 消" "消 費" "費 者" "者 逐" "逐 漸" "漸 察" "察 覺"
##  [9] "覺 智" "智 慧" "慧 手" "手 錶" "錶 與" "與 醫" "醫 療" "療 保"
## [17] "保 健" "健 和" "和 行" "行 動" "動 支" "支 付" "付 有" "有 關"
## [25] "關 的" "的 功" "功 能"
## 
## [[3]]
##  [1] "2 0"   "0 2"   "2 2"   "2 年"  "年 全" "全 球" "球 智" "智 慧"
##  [9] "慧 手" "手 錶" "錶 規" "規 模" "模 預" "預 估" "估 將" "將 比"
## [17] "比 今" "今 年" "年 大" "大 1"  "1 倍" 
## 
## [[4]]
## [1] "\n \n" "\n 根" "根 據" "據 I"  "I D"   "D C"   "C 的"  "的 數" "數 據"
## 
## [[5]]
##  [1] "全 球" "球 智" "智 慧" "慧 手" "手 錶" "錶 今" "今 年" "年 總"
##  [9] "總 出" "出 貨" "貨 量" "量 預" "預 估" "估 將" "將 達" "達 到"
## [17] "到 4"  "4 3"   "3 6"   "6 0"   "0 萬"  "萬 支"
## 
## [[6]]
##  [1] "2 0"   "0 2"   "2 2"   "2 年"  "年 出" "出 貨" "貨 量" "量 將"
##  [9] "將 進" "進 一" "一 步" "步 提" "提 高" "高 至" "至 8"  "8 4"  
## [17] "4 1"   "1 0"   "0 萬"  "萬 支"
## 
## [[7]]
##  [1] "2 0"   "0 1"   "1 8"   "8 至"  "至 2"  "2 0"   "0 2"   "2 2"  
##  [9] "2 年"  "年 出" "出 貨" "貨 量" "量 年" "年 複" "複 合" "合 成"
## [17] "成 長" "長 率" "率 (" "( C"  "C A"   "A G"   "G R"   "R )" 
## [25] ") 為" "為 1"  "1 7"   "7 ."   ". 9"   "9 %"  
## 
## [[8]]
##  [1] "2 0"   "0 1"   "1 8"   "8 年"  "年 全" "全 球" "球 穿" "穿 戴"
##  [9] "戴 式" "式 裝" "裝 置" "置 出" "出 貨" "貨 量" "量 估" "估 為"
## [17] "為 1"  "1 ."   ". 3"   "3 2"   "2 9"   "9 億"  "億 部"
## 
## [[9]]
##  [1] "2 0"   "0 2"   "2 2"   "2 年"  "年 出" "出 貨" "貨 量" "量 估"
##  [9] "估 為" "為 2"  "2 ."   ". 1"   "1 9"   "9 4"   "4 億"  "億 部"
## 
## [[10]]
##  [1] "\n \n" "\n I"  "I D"   "D C"   "C 行"  "行 動" "動 裝" "裝 置"
##  [9] "置 追" "追 蹤" "蹤 系" "系 統" "統 資" "資 深" "深 研" "研 究"
## [17] "究 分" "分 析" "析 師" "師 烏" "烏 布" "布 拉" "拉 尼" "尼 ("
## [25] "( J"  "J i"   "i t"   "t e"   "e s"   "s h"   "h  "   "  U"  
## [33] "U b"   "b r"   "r a"   "a n"   "n i"   "i )"  ") 說" "說 :"
## [41] ": 「" "「 消" "消 費" "費 者" "者 終" "終 於" "於 開" "開 始"
## [49] "始 了" "了 解" "解 和" "和 需" "需 要" "要 智" "智 慧" "慧 手"
## [57] "手 錶" "錶 的" "的 功" "功 能"
## 
## [[11]]
##  [1] "目 前" "前 體" "體 適" "適 能" "能 用" "用 途" "途 處" "處 於"
##  [9] "於 領" "領 先"
## 
## [[12]]
##  [1] "但 行" "行 動" "動 支" "支 付" "付 和" "和 通" "通 訊" "訊 已"
##  [9] "已 展" "展 開" "開 追" "追 勢"
## 
## [[13]]
##  [1] "」 \n" "\n \n" "\n S"  "S t"   "t r"   "r a"   "a t"   "t e"  
##  [9] "e g"   "g y"   "y  "   "  A"   "A n"   "n a"   "a l"   "l y"  
## [17] "y t"   "t i"   "i c"   "c s"   "s 指"  "指 出"
## 
## [[14]]
##  [1] "蘋 果" "果 智" "智 慧" "慧 手" "手 錶" "錶 A"  "A p"   "p p"  
##  [9] "p l"   "l e"   "e  "   "  W"   "W a"   "a t"   "t c"   "c h"  
## [17] "h 去"  "去 年" "年 在" "在 全" "全 球" "球 智" "智 慧" "慧 手"
## [25] "手 錶" "錶 市" "市 場" "場 佔" "佔 有" "有 率" "率 達" "達 到"
## [33] "到 6"  "6 0"   "0 ."   ". 4"   "4 %"  
## 
## [[15]]
## [1] "是 主" "主 宰" "宰 市" "市 場" "場 的" "的 產" "產 品"
## 
## [[16]]
##  [1] "三 星" "星 電" "電 子" "子 以" "以 1"  "1 0"   "0 ."   ". 6"  
##  [9] "6 %"   "% 的"  "的 市" "市 佔" "佔 率" "率 居" "居 次"
## 
## [[17]]
##  [1] "( 林" "林 文" "文 彬" "彬 /" "/ 綜" "綜 合" "合 外" "外 電"
##  [9] "電 報" "報 導" "導 )"
tb<-table(unlist(bigram.all))
#tb[tb>=2]
#sort(tb, decreasing = TRUE)



s <- "當初中央政府拿台北市的精華地跟北市府交換"
gsub(pattern = '台北市',replacement = '', s)
## [1] "當初中央政府拿的精華地跟北市府交換"
removeKey <- function(s, keys){
  for(k in keys){
    s <- gsub(pattern = k ,replacement = '', s)
  }
  return(s)
}

removeKey(s, c('台北市', '北市府'))
## [1] "當初中央政府拿的精華地跟交換"
ngram.func<-function(s, n){
  w <- strsplit(s, '')
  n.gram<-ngrams(unlist(w), n)
  n.gram.str<-vapply(n.gram, paste, "", collapse ="")
  return(n.gram.str)
}

ngram.func(s, 3)
##  [1] "當初中" "初中央" "中央政" "央政府" "政府拿" "府拿台" "拿台北"
##  [8] "台北市" "北市的" "市的精" "的精華" "精華地" "華地跟" "地跟北"
## [15] "跟北市" "北市府" "市府交" "府交換"
keywords <- c()
sentence.vec <- unlist(strsplit(news, ',|。|(|)|:|「|」|\n|/'))

# longest-term-first extraction: scan n-grams from length 4 down to 2,
# strip keywords that were already found so their substrings are not
# counted again, and keep every n-gram that occurs more than 4 times
for (n in seq(4, 2, -1)) {
  words <- c()
  for (sentence in sentence.vec) {
    s <- removeKey(sentence, keywords)
    w <- ngram.func(s, n)
    words <- c(words, w)
  }

  tb <- table(words)
  keywords <- c(keywords, names(tb[tb > 4]))
}

keywords
## [1] "智慧手錶" "出貨量"   "20"
longTermFirst <- function(article, keywords, threshold){
  sentence.vec <- unlist(strsplit(article,',|。|(|)|:|「|」|\n|/'))
  
  for(n in seq(4,2,-1)){
      words <- c()
      for( sentence in sentence.vec){
        s <- removeKey(sentence, keywords)
        w <- ngram.func(s, n)
        words <- c(words, w)
      }
      
    tb <- table(words)
    keywords <- c(keywords, names(tb[tb > threshold]))
  }
  
  keywords
}

news <- '台大教授李錫錕今宣布以無黨籍身分參選台北市長,由於他和現任台北市長柯文哲在網路上的人氣都相當高,加上同樣用無黨籍身分選,外界也好奇「錕P」是否會瓜分「柯P」的選票。對此,中山大學政治系教授廖達琪坦言,「錕P」要造成像當年「柯P」在政壇上的衝擊,或者有全面性的影響力,恐怕不容易。

廖達琪表示,當年柯文哲在選台北市長時,除提出許多吸引年輕人的政見之外,在選前被監察院、調查局約談,加上本身是亞斯伯格症患者,以及在醫學上裝葉克膜的成就,讓柯文哲有比一般政治人物更吸引人的特殊故事,此外,柯文哲在市長辯論時雖自稱墨綠,但仍救治連勝文,這讓外界對柯主打的超越藍綠、白色力量有更多想像性和期待。

廖達琪說,雖然李錫錕此次以無黨籍身分參選,但畢竟李過去是代表國民黨選縣長,且缺少特殊故事,因此影響力恐不及柯,此外,李錫錕目前只有臉書上的73萬粉絲,要像柯文哲一樣在年輕人網路圈裡達成全面性的影響力,並不容易,廖認為李錫錕比較有可能排擠的,還是泛藍選票。(張博亭/台北報導)'

longTermFirst(news,c(), 3)
## [1] "李錫錕" "柯文哲" "台北"   "市長"