Churn Analysis

#install.packages('C50')
library(C50)
data(churn)
str(churnTrain)
## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
dim(churnTrain)
## [1] 3333   20
churnTrain <- churnTrain[, ! names(churnTrain) %in% c('state', 'account_length', 'area_code')]

set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob = c(0.7,0.3))
trainset <- churnTrain[idx == 1,]
testset  <- churnTrain[idx == 2,]

dim(trainset)
## [1] 2315   17
dim(testset)
## [1] 1018   17
library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset)
plot(churn.rp, margin = 0.1)
text(churn.rp)

predictions <- predict(churn.rp, testset, type= 'class')
sum(predictions == testset$churn) / length(predictions)
## [1] 0.9420432
tb <- table(testset$churn, predictions)


#install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# note: confusionMatrix() treats the rows of a supplied table as predictions
# and the columns as the reference, so with tb = table(truth, predictions)
# the reported Sensitivity and Pos Pred Value are effectively swapped;
# confusionMatrix(predictions, testset$churn) avoids this
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
## 
##      predictions
##       yes  no
##   yes 100  41
##   no   18 859
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8841          
##     P-Value [Acc > NIR] : 2.052e-10       
##                                           
##                   Kappa : 0.7393          
##  Mcnemar's Test P-Value : 0.004181        
##                                           
##             Sensitivity : 0.84746         
##             Specificity : 0.95444         
##          Pos Pred Value : 0.70922         
##          Neg Pred Value : 0.97948         
##              Prevalence : 0.11591         
##          Detection Rate : 0.09823         
##    Detection Prevalence : 0.13851         
##       Balanced Accuracy : 0.90095         
##                                           
##        'Positive' Class : yes             
## 
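The headline numbers can be cross-checked by hand from the counts in tb, treating 'yes' as the positive class and the rows of tb as the true labels; a minimal sketch:

TP <- tb['yes', 'yes']; FN <- tb['yes', 'no']
FP <- tb['no', 'yes'];  TN <- tb['no', 'no']
c(accuracy  = (TP + TN) / sum(tb),
  recall    = TP / (TP + FN),   # sensitivity with respect to the true labels
  precision = TP / (TP + FP))
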
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model   <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model
## CART 
## 
## 2315 samples
##   16 predictor
##    2 classes: 'yes', 'no' 
## 
## Pre-processing: scaled (16) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2084, 2083, 2083, 2083, 2083, 2084, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05555556  0.9036559  0.5315844
##   0.07456140  0.8671038  0.2796749
##   0.07602339  0.8613423  0.2142758
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05555556.
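For a quick check that the cross-validated cp choice generalises, the tuned caret model can also be scored on the held-out test set; a minimal sketch:

caret.pred <- predict(model, testset)
confusionMatrix(caret.pred, testset$churn)
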
predictions <- predict(churn.rp, testset, type = 'prob')

fpr_ary <- c(0)
tpr_ary <- c(0)

# sweep the threshold from high to low so the (FPR, TPR) points run from
# (0, 0) towards (1, 1)
for (i in seq(1, 0, -0.01)){
  res        <- ifelse(predictions[, 1] > i, 'yes', 'no')
  res_factor <- factor(res, levels = c('yes', 'no'))

  tb <- table(testset$churn, res_factor)

  # rows are the true classes, columns the thresholded predictions
  TP <- tb[1, 1]
  FN <- tb[1, 2]
  FP <- tb[2, 1]
  TN <- tb[2, 2]
  if ((TP > 0) & (FP > 0) & (FN > 0) & (TN > 0)){
    FPR <- FP / (FP + TN)
    TPR <- TP / (TP + FN)
    fpr_ary <- c(fpr_ary, FPR)
    tpr_ary <- c(tpr_ary, TPR)
  }
}

fpr_ary <- c(fpr_ary, 1)
tpr_ary <- c(tpr_ary, 1)


plot(fpr_ary, tpr_ary, type = 'o', main = 'ROC Curve', xlab = 'FPR', ylab = 'TPR', col= 'red', ylim = c(0,1), xlim = c(0,1))
abline(0, 1)
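With the points ordered by increasing FPR, the area under this hand-rolled curve can be approximated with the trapezoidal rule; it should land close to the AUC that ROCR reports below. A minimal sketch:

# trapezoidal approximation of the area under the manual ROC curve
sum(diff(fpr_ary) * (head(tpr_ary, -1) + tail(tpr_ary, -1)) / 2)
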

#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(churn.rp, testset, type = "prob")
pred.to.roc <- predictions[, 1]

pred.rocr <- prediction(pred.to.roc, as.factor(testset$churn))

perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")

perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr")

plot(perf.tpr.rocr, colorize = TRUE, main = paste("AUC:", perf.rocr@y.values[[1]]))

RNN

library(rnn)

# create training numbers
X1 = sample(0:127, 7000, replace=TRUE)
X2 = sample(0:127, 7000, replace=TRUE)

# create training response numbers
Y <- X1 + X2

# convert to binary
X1 <- int2bin(X1, length=8)
X2 <- int2bin(X2, length=8)
Y  <- int2bin(Y,  length=8)

# create 3d array: dim 1: samples; dim 2: time; dim 3: variables
X <- array( c(X1,X2), dim=c(dim(X1),2) )

# train the model
model <- trainr(Y=Y,
                X=X,
                learningrate   =  0.1,
                hidden_dim     = 10   )

A1 = int2bin( sample(0:127, 7000, replace=TRUE) )
A2 = int2bin( sample(0:127, 7000, replace=TRUE) )

# create 3d array: dim 1: samples; dim 2: time; dim 3: variables
A <- array( c(A1,A2), dim=c(dim(A1),2) )
    
# predict
B <- predictr(model, A)   # keep the same time ordering as in training
B
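B holds one output per bit, so rounding it and converting back to integers shows whether the network really learned 8-bit addition; a minimal sketch, assuming the rnn package's bin2int() helper:

pred_int <- bin2int(round(B))          # predicted sums as integers
true_int <- bin2int(A1) + bin2int(A2)  # true sums of the test inputs
mean(pred_int == true_int)             # share of sums predicted exactly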

Random Forest

#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
forest <-randomForest(churn ~., data=trainset, ntree=200, importance=T, proximity=T)

forest.predicted<-predict(forest, testset, type ="class")
table(testset$churn, forest.predicted)
##      forest.predicted
##       yes  no
##   yes 110  31
##   no    7 870
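Because the forest was grown with importance=T, the per-variable importance measures can be inspected directly; a minimal sketch:

importance(forest)   # MeanDecreaseAccuracy / MeanDecreaseGini per variable
varImpPlot(forest)
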
# Decision Tree
predictions1 <-predict(churn.rp, testset, type="prob")
pred.to.roc1 <-predictions1[, 1]
pred.rocr1 <-prediction(pred.to.roc1, as.factor(testset$churn))
perf.rocr1 <-performance(pred.rocr1, measure ="auc", x.measure="cutoff")
perf.tpr.rocr1 <-performance(pred.rocr1, "tpr","fpr")

# Random Forest
predictions2 <-predict(forest, testset, type="prob")
pred.to.roc2 <-predictions2[, 1]
pred.rocr2 <-prediction(pred.to.roc2, as.factor(testset$churn))
perf.rocr2 <-performance(pred.rocr2, measure ="auc", x.measure="cutoff")
perf.tpr.rocr2 <-performance(pred.rocr2, "tpr","fpr")

plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), col = 1:2, lty = 1)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Find the Most Informative Features

#install.packages("rminer")
library(rminer)
model=fit(churn~.,trainset,model="svm")

VariableImportance<-Importance(model,trainset,method="sensv")
L<-list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

Compare Models

library(caret)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
#names(getModelInfo())
model1   <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model2   <- train(churn~., data=trainset, method="knn", preProcess="scale", trControl=control)
model3   <- train(churn~., data=trainset, method="rf", preProcess="scale", trControl=control)

getROC <- function(model, dataset){
  predictions2 <-predict(model, dataset, type="prob")
  pred.to.roc2 <-predictions2[, 1]
  pred.rocr2 <-prediction(pred.to.roc2, as.factor(dataset$churn))
  perf.rocr2 <-performance(pred.rocr2, measure ="auc", x.measure="cutoff")
  perf.tpr.rocr2 <-performance(pred.rocr2, "tpr","fpr")
  return(perf.tpr.rocr2)
}

plot(getROC(model1,testset),main='ROC Curve', col=1)
plot(getROC(model2,testset), col=2, add=TRUE)
plot(getROC(model3,testset), col=3, add=TRUE)
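Besides overlaying ROC curves, caret can compare the three models on their resampled accuracy and kappa; a minimal sketch using resamples():

results <- resamples(list(rpart = model1, knn = model2, rf = model3))
summary(results)
bwplot(results)
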

Clustering

x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)


# euclidean distance
sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
##          x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
##          x
## y 1.414214
# Manhattan distance
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
##   x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
##   x
## y 2
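dist() also accepts a whole data frame and returns every pairwise distance, which is the form the clustering functions below consume; a minimal sketch on the first four rows of the built-in iris data (loaded explicitly below):

as.matrix(dist(iris[1:4, -5], method = 'euclidean'))
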

Hierarchical Clustering

data(iris)
hc <- hclust(dist(iris[,-5], method = 'euclidean'), method='ward.D2'   )
plot(hc,hang = -0.01, cex= 0.7)


fit <- cutree(hc, k = 3)
table(fit)
## fit
##  1  2  3 
## 50 64 36
plot(hc,hang = -0.01, cex= 0.7)
rect.hclust(hc, k = 3 , border = 'red')

par(mfrow=c(1,2))
plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species,main = 'Real DataSet')
plot(Petal.Length ~ Petal.Width, data = iris, col = fit,main = 'Clustering')
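Since iris keeps the true species labels, the cut tree can be cross-tabulated against them to see how well the three clusters recover the species; a minimal sketch:

table(iris$Species, fit)
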

KMeans

fit <-kmeans(iris[,-5], 3)
barplot(t(fit$centers), beside =TRUE,xlab="cluster", ylab="value")

par(mfrow=c(1,2))
plot(Petal.Width ~ Petal.Length, data = iris, col = iris$Species,main = 'Real DataSet')
plot(Petal.Width  ~ Petal.Length, data = iris, col = fit$cluster,main = 'KMeans Clustering')

Evaluate Clusters

library(cluster)
set.seed(22)
km <-kmeans(iris[,-5], 3)
kms<-silhouette(km$cluster,dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
##  Cluster sizes and average silhouette widths:
##        50        62        38 
## 0.7981405 0.4173199 0.4511051 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391
plot(kms)

nk <-2:10

set.seed(22)
WSS <- sapply(nk, function(k){
  kmeans(iris[,-5], centers=k)$tot.withinss
})
WSS
## [1] 152.34795  78.85144  57.26562  46.44618  42.42965  37.39514  34.75335
## [8]  41.96594  33.71865
plot(nk, WSS, type= 'l')
abline(v = 3, col= 'red')

# install.packages('fpc')
library(fpc)


nk<-2:10
set.seed(123)
SW <- sapply(nk, function(k){
  cluster.stats(dist(iris[,-5]), kmeans(iris[,-5], centers=k)$cluster)$avg.silwidth
  })

plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")

Compare Clustering Methods

single_c<-hclust(dist(iris[,-5]), method="single")
hc_single<-cutree(single_c, k =3)

complete_c<-hclust(dist(iris[,-5]), method="complete")
hc_complete<-cutree(complete_c, k =3)

set.seed(42)
km <-kmeans(iris[,-5], 3)
cs<-cluster.stats(dist(iris[,-5]), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 78.85144
## 
## $avg.silwidth
## [1] 0.552819
sapply(
  list(kmeans=km$cluster, 
       hc_single=hc_single, 
       hc_complete=hc_complete), function(c)cluster.stats(dist(iris[,-5]), c)[c("within.cluster.ss","avg.silwidth")])
##                   kmeans   hc_single hc_complete
## within.cluster.ss 78.85144 142.4794  89.52501   
## avg.silwidth      0.552819 0.5121108 0.5135953

Customer Segmentation

customers <- read.csv('https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/customers.csv')
head(customers)
##   CustomerID  Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1          1   Male  19                 15                     39
## 2          2   Male  21                 15                     81
## 3          3 Female  20                 16                      6
## 4          4 Female  23                 16                     77
## 5          5 Female  31                 17                     40
## 6          6 Female  22                 17                     76
customers <- customers[,c(4,5)]
names(customers) <- c('annual_income', 'spending_score')


plot(spending_score ~ annual_income, data = customers)

set.seed(54)
fit <- kmeans(customers, 5)
plot(spending_score ~ annual_income, data = customers, col=fit$cluster)
points(fit$centers[,1], fit$centers[,2], col='orange')
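The fitted centers and cluster sizes summarise the five customer segments in the income/spending plane; a minimal sketch:

fit$centers   # mean annual_income and spending_score per segment
fit$size      # number of customers per segment
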

kcs<-silhouette(fit$cluster,dist(customers))
summary(kcs)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = fit$cluster, dist = dist(customers)) :
##  Cluster sizes and average silhouette widths:
##        39        81        35        23        22 
## 0.5091706 0.5966512 0.5039873 0.5122676 0.5990129 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.02338  0.49425  0.59614  0.55393  0.66011  0.75807
plot(kcs)

nk<-2:10
set.seed(123)

SW <-sapply(nk, function(k){cluster.stats(dist(customers), kmeans(customers, centers=k)$cluster)$avg.silwidth})


plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")

Text Mining

s <- '一位網友近日抱怨說,買了1包100的蝦餅,結果打開只有8片!她問:可以多一點嗎?'

strsplit(s,',|!|:|?')
## [[1]]
## [1] "一位網友近日抱怨說" "買了1包100的蝦餅"   "結果打開只有8片"   
## [4] "她問"               "可以多一點嗎"
#install.packages('jiebaR')
library(jiebaR)
## Loading required package: jiebaRD
mixseg <- worker()

s="那我們酸民婉君也可以報名嗎"
segment(code = s, jiebar = mixseg)
## [1] "那"   "我們" "酸民" "婉君" "也"   "可以" "報名" "嗎"
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
USERPATH
## [1] "C:/Program Files/R/R-3.4.4/library/jiebaRD/dict/user.dict.utf8"
tagseg <- worker('tag')
segment(code = s, jiebar = tagseg)
##      r      r      n      x      d      c      v      y 
##   "那" "我們" "酸民" "婉君"   "也" "可以" "報名"   "嗎"

N-gram

#install.packages("NLP")
library(NLP)
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
s <-strsplit(x="那我們酸民婉君也可以報名嗎", split='')
s
## [[1]]
##  [1] "那" "我" "們" "酸" "民" "婉" "君" "也" "可" "以" "報" "名" "嗎"
bigram <-ngrams(unlist(s), 2)
bigram
## [[1]]
## [1] "那" "我"
## 
## [[2]]
## [1] "我" "們"
## 
## [[3]]
## [1] "們" "酸"
## 
## [[4]]
## [1] "酸" "民"
## 
## [[5]]
## [1] "民" "婉"
## 
## [[6]]
## [1] "婉" "君"
## 
## [[7]]
## [1] "君" "也"
## 
## [[8]]
## [1] "也" "可"
## 
## [[9]]
## [1] "可" "以"
## 
## [[10]]
## [1] "以" "報"
## 
## [[11]]
## [1] "報" "名"
## 
## [[12]]
## [1] "名" "嗎"
vapply(bigram, paste , '' , collapse = '')
##  [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
s <-strsplit(x="那我們酸民婉君也可以報名嗎", split='')
trigram <-ngrams(unlist(s), 3)
vapply(trigram, paste, "", collapse ="")
##  [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
##  [8] "也可以" "可以報" "以報名" "報名嗎"
news <- '研究業者IDC的數據顯示,隨著消費者逐漸察覺智慧手錶與醫療保健和行動支付有關的功能,2022年全球智慧手錶規模預估將比今年大1倍。

根據IDC的數據,全球智慧手錶今年總出貨量預估將達到4360萬支,2022年出貨量將進一步提高至8410萬支,2018至2022年出貨量年複合成長率(CAGR)為17.9%。2018年全球穿戴式裝置出貨量估為1.329億部,2022年出貨量估為2.194億部。

IDC行動裝置追蹤系統資深研究分析師烏布拉尼(Jitesh Ubrani)說:「消費者終於開始了解和需要智慧手錶的功能,目前體適能用途處於領先,但行動支付和通訊已展開追勢。」

Strategy Analytics指出,蘋果智慧手錶Apple Watch去年在全球智慧手錶市場佔有率達到60.4%,是主宰市場的產品,三星電子以10.6%的市佔率居次。(林文彬/綜合外電報導)'

s <-strsplit(x=news, split='')

trigram <- ngrams(unlist(s), 3)
#sort(table(vapply(trigram, paste, '', collapse = '')), decreasing = TRUE)

s <-strsplit(x=news, split=',|:|。')
s
## [[1]]
##  [1] "研究業者IDC的數據顯示"                                           
##  [2] "隨著消費者逐漸察覺智慧手錶與醫療保健和行動支付有關的功能"        
##  [3] "2022年全球智慧手錶規模預估將比今年大1倍"                         
##  [4] "\n\n根據IDC的數據"                                               
##  [5] "全球智慧手錶今年總出貨量預估將達到4360萬支"                      
##  [6] "2022年出貨量將進一步提高至8410萬支"                              
##  [7] "2018至2022年出貨量年複合成長率(CAGR)為17.9%"                   
##  [8] "2018年全球穿戴式裝置出貨量估為1.329億部"                         
##  [9] "2022年出貨量估為2.194億部"                                       
## [10] "\n\nIDC行動裝置追蹤系統資深研究分析師烏布拉尼(Jitesh Ubrani)說"
## [11] "「消費者終於開始了解和需要智慧手錶的功能"                        
## [12] "目前體適能用途處於領先"                                          
## [13] "但行動支付和通訊已展開追勢"                                      
## [14] "」\n\nStrategy Analytics指出"                                    
## [15] "蘋果智慧手錶Apple Watch去年在全球智慧手錶市場佔有率達到60.4%"    
## [16] "是主宰市場的產品"                                                
## [17] "三星電子以10.6%的市佔率居次"                                     
## [18] "(林文彬/綜合外電報導)"
a.split<-strsplit(news, "、|,|。")
w.split<-strsplit(x=unlist(a.split), split='')

w.split
## [[1]]
##  [1] "研" "究" "業" "者" "I"  "D"  "C"  "的" "數" "據" "顯" "示"
## 
## [[2]]
##  [1] "隨" "著" "消" "費" "者" "逐" "漸" "察" "覺" "智" "慧" "手" "錶" "與"
## [15] "醫" "療" "保" "健" "和" "行" "動" "支" "付" "有" "關" "的" "功" "能"
## 
## [[3]]
##  [1] "2"  "0"  "2"  "2"  "年" "全" "球" "智" "慧" "手" "錶" "規" "模" "預"
## [15] "估" "將" "比" "今" "年" "大" "1"  "倍"
## 
## [[4]]
##  [1] "\n" "\n" "根" "據" "I"  "D"  "C"  "的" "數" "據"
## 
## [[5]]
##  [1] "全" "球" "智" "慧" "手" "錶" "今" "年" "總" "出" "貨" "量" "預" "估"
## [15] "將" "達" "到" "4"  "3"  "6"  "0"  "萬" "支"
## 
## [[6]]
##  [1] "2"  "0"  "2"  "2"  "年" "出" "貨" "量" "將" "進" "一" "步" "提" "高"
## [15] "至" "8"  "4"  "1"  "0"  "萬" "支"
## 
## [[7]]
##  [1] "2"  "0"  "1"  "8"  "至" "2"  "0"  "2"  "2"  "年" "出" "貨" "量" "年"
## [15] "複" "合" "成" "長" "率" "(" "C"  "A"  "G"  "R"  ")" "為" "1"  "7" 
## [29] "."  "9"  "%" 
## 
## [[8]]
##  [1] "2"  "0"  "1"  "8"  "年" "全" "球" "穿" "戴" "式" "裝" "置" "出" "貨"
## [15] "量" "估" "為" "1"  "."  "3"  "2"  "9"  "億" "部"
## 
## [[9]]
##  [1] "2"  "0"  "2"  "2"  "年" "出" "貨" "量" "估" "為" "2"  "."  "1"  "9" 
## [15] "4"  "億" "部"
## 
## [[10]]
##  [1] "\n" "\n" "I"  "D"  "C"  "行" "動" "裝" "置" "追" "蹤" "系" "統" "資"
## [15] "深" "研" "究" "分" "析" "師" "烏" "布" "拉" "尼" "(" "J"  "i"  "t" 
## [29] "e"  "s"  "h"  " "  "U"  "b"  "r"  "a"  "n"  "i"  ")" "說" ":" "「"
## [43] "消" "費" "者" "終" "於" "開" "始" "了" "解" "和" "需" "要" "智" "慧"
## [57] "手" "錶" "的" "功" "能"
## 
## [[11]]
##  [1] "目" "前" "體" "適" "能" "用" "途" "處" "於" "領" "先"
## 
## [[12]]
##  [1] "但" "行" "動" "支" "付" "和" "通" "訊" "已" "展" "開" "追" "勢"
## 
## [[13]]
##  [1] "」" "\n" "\n" "S"  "t"  "r"  "a"  "t"  "e"  "g"  "y"  " "  "A"  "n" 
## [15] "a"  "l"  "y"  "t"  "i"  "c"  "s"  "指" "出"
## 
## [[14]]
##  [1] "蘋" "果" "智" "慧" "手" "錶" "A"  "p"  "p"  "l"  "e"  " "  "W"  "a" 
## [15] "t"  "c"  "h"  "去" "年" "在" "全" "球" "智" "慧" "手" "錶" "市" "場"
## [29] "佔" "有" "率" "達" "到" "6"  "0"  "."  "4"  "%" 
## 
## [[15]]
## [1] "是" "主" "宰" "市" "場" "的" "產" "品"
## 
## [[16]]
##  [1] "三" "星" "電" "子" "以" "1"  "0"  "."  "6"  "%"  "的" "市" "佔" "率"
## [15] "居" "次"
## 
## [[17]]
##  [1] "(" "林" "文" "彬" "/" "綜" "合" "外" "電" "報" "導" ")"
bigram <-function(w){
  bigram <-ngrams(unlist(w), 2)
  bigram.str<-vapply(bigram, paste, "", collapse =" ")
  bigram.str
}
bigram.all<-sapply(w.split, bigram)
bigram.all
## [[1]]
##  [1] "研 究" "究 業" "業 者" "者 I"  "I D"   "D C"   "C 的"  "的 數"
##  [9] "數 據" "據 顯" "顯 示"
## 
## [[2]]
##  [1] "隨 著" "著 消" "消 費" "費 者" "者 逐" "逐 漸" "漸 察" "察 覺"
##  [9] "覺 智" "智 慧" "慧 手" "手 錶" "錶 與" "與 醫" "醫 療" "療 保"
## [17] "保 健" "健 和" "和 行" "行 動" "動 支" "支 付" "付 有" "有 關"
## [25] "關 的" "的 功" "功 能"
## 
## [[3]]
##  [1] "2 0"   "0 2"   "2 2"   "2 年"  "年 全" "全 球" "球 智" "智 慧"
##  [9] "慧 手" "手 錶" "錶 規" "規 模" "模 預" "預 估" "估 將" "將 比"
## [17] "比 今" "今 年" "年 大" "大 1"  "1 倍" 
## 
## [[4]]
## [1] "\n \n" "\n 根" "根 據" "據 I"  "I D"   "D C"   "C 的"  "的 數" "數 據"
## 
## [[5]]
##  [1] "全 球" "球 智" "智 慧" "慧 手" "手 錶" "錶 今" "今 年" "年 總"
##  [9] "總 出" "出 貨" "貨 量" "量 預" "預 估" "估 將" "將 達" "達 到"
## [17] "到 4"  "4 3"   "3 6"   "6 0"   "0 萬"  "萬 支"
## 
## [[6]]
##  [1] "2 0"   "0 2"   "2 2"   "2 年"  "年 出" "出 貨" "貨 量" "量 將"
##  [9] "將 進" "進 一" "一 步" "步 提" "提 高" "高 至" "至 8"  "8 4"  
## [17] "4 1"   "1 0"   "0 萬"  "萬 支"
## 
## [[7]]
##  [1] "2 0"   "0 1"   "1 8"   "8 至"  "至 2"  "2 0"   "0 2"   "2 2"  
##  [9] "2 年"  "年 出" "出 貨" "貨 量" "量 年" "年 複" "複 合" "合 成"
## [17] "成 長" "長 率" "率 (" "( C"  "C A"   "A G"   "G R"   "R )" 
## [25] ") 為" "為 1"  "1 7"   "7 ."   ". 9"   "9 %"  
## 
## [[8]]
##  [1] "2 0"   "0 1"   "1 8"   "8 年"  "年 全" "全 球" "球 穿" "穿 戴"
##  [9] "戴 式" "式 裝" "裝 置" "置 出" "出 貨" "貨 量" "量 估" "估 為"
## [17] "為 1"  "1 ."   ". 3"   "3 2"   "2 9"   "9 億"  "億 部"
## 
## [[9]]
##  [1] "2 0"   "0 2"   "2 2"   "2 年"  "年 出" "出 貨" "貨 量" "量 估"
##  [9] "估 為" "為 2"  "2 ."   ". 1"   "1 9"   "9 4"   "4 億"  "億 部"
## 
## [[10]]
##  [1] "\n \n" "\n I"  "I D"   "D C"   "C 行"  "行 動" "動 裝" "裝 置"
##  [9] "置 追" "追 蹤" "蹤 系" "系 統" "統 資" "資 深" "深 研" "研 究"
## [17] "究 分" "分 析" "析 師" "師 烏" "烏 布" "布 拉" "拉 尼" "尼 ("
## [25] "( J"  "J i"   "i t"   "t e"   "e s"   "s h"   "h  "   "  U"  
## [33] "U b"   "b r"   "r a"   "a n"   "n i"   "i )"  ") 說" "說 :"
## [41] ": 「" "「 消" "消 費" "費 者" "者 終" "終 於" "於 開" "開 始"
## [49] "始 了" "了 解" "解 和" "和 需" "需 要" "要 智" "智 慧" "慧 手"
## [57] "手 錶" "錶 的" "的 功" "功 能"
## 
## [[11]]
##  [1] "目 前" "前 體" "體 適" "適 能" "能 用" "用 途" "途 處" "處 於"
##  [9] "於 領" "領 先"
## 
## [[12]]
##  [1] "但 行" "行 動" "動 支" "支 付" "付 和" "和 通" "通 訊" "訊 已"
##  [9] "已 展" "展 開" "開 追" "追 勢"
## 
## [[13]]
##  [1] "」 \n" "\n \n" "\n S"  "S t"   "t r"   "r a"   "a t"   "t e"  
##  [9] "e g"   "g y"   "y  "   "  A"   "A n"   "n a"   "a l"   "l y"  
## [17] "y t"   "t i"   "i c"   "c s"   "s 指"  "指 出"
## 
## [[14]]
##  [1] "蘋 果" "果 智" "智 慧" "慧 手" "手 錶" "錶 A"  "A p"   "p p"  
##  [9] "p l"   "l e"   "e  "   "  W"   "W a"   "a t"   "t c"   "c h"  
## [17] "h 去"  "去 年" "年 在" "在 全" "全 球" "球 智" "智 慧" "慧 手"
## [25] "手 錶" "錶 市" "市 場" "場 佔" "佔 有" "有 率" "率 達" "達 到"
## [33] "到 6"  "6 0"   "0 ."   ". 4"   "4 %"  
## 
## [[15]]
## [1] "是 主" "主 宰" "宰 市" "市 場" "場 的" "的 產" "產 品"
## 
## [[16]]
##  [1] "三 星" "星 電" "電 子" "子 以" "以 1"  "1 0"   "0 ."   ". 6"  
##  [9] "6 %"   "% 的"  "的 市" "市 佔" "佔 率" "率 居" "居 次"
## 
## [[17]]
##  [1] "( 林" "林 文" "文 彬" "彬 /" "/ 綜" "綜 合" "合 外" "外 電"
##  [9] "電 報" "報 導" "導 )"
tb<-table(unlist(bigram.all))
#tb[tb>=2]
#sort(tb, decreasing = TRUE)



s <- "當初中央政府拿台北市的精華地跟北市府交換"
gsub(pattern = '台北市',replacement = '', s)
## [1] "當初中央政府拿的精華地跟北市府交換"
removeKey <- function(s, keys){
  for(k in keys){
    s <- gsub(pattern = k ,replacement = '', s)
  }
  return(s)
}

removeKey(s, c('台北市', '北市府'))
## [1] "當初中央政府拿的精華地跟交換"
ngram.func<-function(s, n){
  w <- strsplit(s, '')
  n.gram<-ngrams(unlist(w), n)
  n.gram.str<-vapply(n.gram, paste, "", collapse ="")
  return(n.gram.str)
}

ngram.func(s, 3)
##  [1] "當初中" "初中央" "中央政" "央政府" "政府拿" "府拿台" "拿台北"
##  [8] "台北市" "北市的" "市的精" "的精華" "精華地" "華地跟" "地跟北"
## [15] "跟北市" "北市府" "市府交" "府交換"
keywords <- c()
sentence.vec <- unlist(strsplit(news, ',|。|(|)|:|「|」|\n|/'))

# longest-term-first extraction: scan n-grams from length 4 down to 2,
# strip keywords that were already found so their substrings are not
# counted again, and keep every n-gram that occurs more than 4 times
for (n in seq(4, 2, -1)) {
  words <- c()
  for (sentence in sentence.vec) {
    s <- removeKey(sentence, keywords)
    w <- ngram.func(s, n)
    words <- c(words, w)
  }

  tb <- table(words)
  keywords <- c(keywords, names(tb[tb > 4]))
}

keywords
## [1] "智慧手錶" "出貨量"   "20"
longTermFirst <- function(article, keywords, threshold){
  sentence.vec <- unlist(strsplit(article,',|。|(|)|:|「|」|\n|/'))
  
  for(n in seq(4,2,-1)){
      words <- c()
      for( sentence in sentence.vec){
        s <- removeKey(sentence, keywords)
        w <- ngram.func(s, n)
        words <- c(words, w)
      }
      
    tb <- table(words)
    keywords <- c(keywords, names(tb[tb > threshold]))
  }
  
  keywords
}

news <- '台大教授李錫錕今宣布以無黨籍身分參選台北市長,由於他和現任台北市長柯文哲在網路上的人氣都相當高,加上同樣用無黨籍身分選,外界也好奇「錕P」是否會瓜分「柯P」的選票。對此,中山大學政治系教授廖達琪坦言,「錕P」要造成像當年「柯P」在政壇上的衝擊,或者有全面性的影響力,恐怕不容易。

廖達琪表示,當年柯文哲在選台北市長時,除提出許多吸引年輕人的政見之外,在選前被監察院、調查局約談,加上本身是亞斯伯格症患者,以及在醫學上裝葉克膜的成就,讓柯文哲有比一般政治人物更吸引人的特殊故事,此外,柯文哲在市長辯論時雖自稱墨綠,但仍救治連勝文,這讓外界對柯主打的超越藍綠、白色力量有更多想像性和期待。

廖達琪說,雖然李錫錕此次以無黨籍身分參選,但畢竟李過去是代表國民黨選縣長,且缺少特殊故事,因此影響力恐不及柯,此外,李錫錕目前只有臉書上的73萬粉絲,要像柯文哲一樣在年輕人網路圈裡達成全面性的影響力,並不容易,廖認為李錫錕比較有可能排擠的,還是泛藍選票。(張博亭/台北報導)'

longTermFirst(news,c(), 3)
## [1] "李錫錕" "柯文哲" "台北"   "市長"