Churn Analysis
#install.packages('C50')
library(C50)
data(churn)
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
dim(churnTrain)
## [1] 3333 20
churnTrain <- churnTrain[, ! names(churnTrain) %in% c('state', 'account_length', 'area_code')]
#sample.int(42,6)
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob = c(0.7,0.3))
trainset <- churnTrain[idx == 1,]
testset <- churnTrain[idx == 2,]
dim(trainset)
## [1] 2315 17
dim(testset)
## [1] 1018 17
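The sample.int() split above draws rows without regard to the class label; caret's createDataPartition() gives a stratified split that preserves the churn ratio. A minimal alternative sketch (trainset2/testset2 are illustrative names and are not used below):
library(caret)
set.seed(2)
in.train <- createDataPartition(churnTrain$churn, p = 0.7, list = FALSE)
trainset2 <- churnTrain[in.train, ]    # ~70% of rows, churn ratio preserved
testset2  <- churnTrain[-in.train, ]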
library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset )
plot(churn.rp, margin = 0.1)
text(churn.rp)

predictions <- predict(churn.rp, testset, type= 'class')
sum(predictions == testset$churn) / length(predictions)
## [1] 0.9420432
tb <- table(testset$churn, predictions)
#install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
##
## predictions
## yes no
## yes 100 41
## no 18 859
##
## Accuracy : 0.942
## 95% CI : (0.9259, 0.9556)
## No Information Rate : 0.8841
## P-Value [Acc > NIR] : 2.052e-10
##
## Kappa : 0.7393
## Mcnemar's Test P-Value : 0.004181
##
## Sensitivity : 0.84746
## Specificity : 0.95444
## Pos Pred Value : 0.70922
## Neg Pred Value : 0.97948
## Prevalence : 0.11591
## Detection Rate : 0.09823
## Detection Prevalence : 0.13851
## Balanced Accuracy : 0.90095
##
## 'Positive' Class : yes
##
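Note that the table above was built as table(actual, predicted), while confusionMatrix() on a pre-built table treats the rows as the predictions. Passing the predicted and reference factors directly makes the orientation and the positive class explicit; a minimal sketch using the objects above:
# 'yes' (churn) marked explicitly as the positive class
confusionMatrix(data = predictions, reference = testset$churn, positive = 'yes')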
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model
## CART
##
## 2315 samples
## 16 predictor
## 2 classes: 'yes', 'no'
##
## Pre-processing: scaled (16)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2084, 2083, 2083, 2083, 2083, 2084, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05555556 0.9036559 0.5315844
## 0.07456140 0.8671038 0.2796749
## 0.07602339 0.8613423 0.2142758
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05555556.
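The tuned caret object can itself score the hold-out data; a minimal sketch using the model fitted above:
caret.pred <- predict(model, newdata = testset)   # uses the selected cp automatically
confusionMatrix(caret.pred, testset$churn)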
# class probabilities from the rpart tree, used below to trace an ROC curve by hand
predictions <- predict(churn.rp, testset, type = 'prob')
fpr_ary <- c(0)
tpr_ary <- c(0)
for (i in seq(0, 1, 0.01)){
  # classify as 'yes' when the predicted churn probability exceeds the cutoff i
  res <- ifelse(predictions[, 1] > i, 'yes', 'no')
  res_factor <- factor(res, levels = c('yes', 'no'))
  tb <- table(testset$churn, res_factor)   # rows = actual, columns = predicted
  TP <- tb[1, 1]
  FN <- tb[1, 2]   # actual yes, predicted no
  FP <- tb[2, 1]   # actual no, predicted yes
  TN <- tb[2, 2]
  if ((TP > 0) & (FP > 0) & (FN > 0) & (TN > 0)){
    FPR <- FP / (FP + TN)
    TPR <- TP / (TP + FN)
    fpr_ary <- c(fpr_ary, FPR)
    tpr_ary <- c(tpr_ary, TPR)
  }
}
fpr_ary <- c(fpr_ary, 1)
tpr_ary <- c(tpr_ary, 1)
plot(fpr_ary, tpr_ary, type = 'o', main = 'ROC Curve', xlab = 'FPR', ylab = 'TPR', col= 'red', ylim = c(0,1), xlim = c(0,1))
abline(c(0,0), c(1,1))
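The area under this hand-rolled curve can be approximated with the trapezoidal rule; a small sketch over the fpr_ary/tpr_ary vectors built above:
ord <- order(fpr_ary)
fpr.sorted <- fpr_ary[ord]
tpr.sorted <- tpr_ary[ord]
# sum of average segment heights times the FPR step widths
auc.approx <- sum(diff(fpr.sorted) * (head(tpr.sorted, -1) + tail(tpr.sorted, -1)) / 2)
auc.approx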

#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <-predict(churn.rp, testset, type="prob")
pred.to.roc<-predictions[, 1]
pred.rocr<-prediction(pred.to.roc,as.factor(testset[,(dim(testset)[[2]])]))
perf.rocr<-performance(pred.rocr, measure ="auc", x.measure="cutoff")
perf.tpr.rocr<-performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

RNN
library(rnn)
# create training numbers
X1 = sample(0:127, 7000, replace=TRUE)
X2 = sample(0:127, 7000, replace=TRUE)
# create training response numbers
Y <- X1 + X2
# convert to binary
X1 <- int2bin(X1, length=8)
X2 <- int2bin(X2, length=8)
Y <- int2bin(Y, length=8)
# create 3d array: dim 1: samples; dim 2: time; dim 3: variables
X <- array( c(X1,X2), dim=c(dim(X1),2) )
# train the model
model <- trainr(Y = Y,
                X = X,
                learningrate = 0.1,
                hidden_dim = 10)
# create test numbers and convert them straight to binary
A1 = int2bin( sample(0:127, 7000, replace=TRUE) )
A2 = int2bin( sample(0:127, 7000, replace=TRUE) )
# create 3d array: dim 1: samples; dim 2: time; dim 3: variables
A <- array( c(A1,A2), dim=c(dim(A1),2) )
# predict
B <- predictr(model,
              A[, dim(A)[2]:1, ])
B
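A rough sanity check is to round the sigmoid outputs to bits, convert them back to integers with bin2int() (also from the rnn package), and compare against the true sums; depending on the bit ordering expected by the trained model, the predictions may need the same column reversal applied as above:
B.int <- bin2int(round(B))             # 0/1 bits back to integers
truth <- bin2int(A1) + bin2int(A2)     # the sums the network should reproduce
mean(B.int == truth)                   # fraction of test pairs added correctly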
Random Forest
#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
forest <-randomForest(churn ~., data=trainset, ntree=200, importance=T, proximity=T)
forest.predicted<-predict(forest, testset, type ="class")
table(testset$churn, forest.predicted)
## forest.predicted
## yes no
## yes 110 31
## no 7 870
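Because importance = T was set when the forest was fitted, the contribution of each predictor can be inspected directly:
importance(forest)      # mean decrease in accuracy and in Gini per predictor
varImpPlot(forest)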
# Decision Tree
predictions1 <-predict(churn.rp, testset, type="prob")
pred.to.roc1 <-predictions1[, 1]
pred.rocr1 <-prediction(pred.to.roc1, as.factor(testset$churn))
perf.rocr1 <-performance(pred.rocr1, measure ="auc", x.measure="cutoff")
perf.tpr.rocr1 <-performance(pred.rocr1, "tpr","fpr")
# Random Forest
predictions2 <-predict(forest, testset, type="prob")
pred.to.roc2 <-predictions2[, 1]
pred.rocr2 <-prediction(pred.to.roc2, as.factor(testset$churn))
perf.rocr2 <-performance(pred.rocr2, measure ="auc", x.measure="cutoff")
perf.tpr.rocr2 <-performance(pred.rocr2, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Compare Models
library(caret)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
#names(getModelInfo())
model1 <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model2 <- train(churn~., data=trainset, method="knn", preProcess="scale", trControl=control)
model3 <- train(churn~., data=trainset, method="rf", preProcess="scale", trControl=control)
getROC <- function(model, dataset){
  predictions2 <- predict(model, dataset, type = "prob")
  pred.to.roc2 <- predictions2[, 1]
  pred.rocr2 <- prediction(pred.to.roc2, as.factor(dataset$churn))
  perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
  perf.tpr.rocr2 <- performance(pred.rocr2, "tpr", "fpr")
  return(perf.tpr.rocr2)
}
plot(getROC(model1,testset),main='ROC Curve', col=1)
plot(getROC(model2,testset), col=2, add=TRUE)
plot(getROC(model3,testset), col=3, add=TRUE)
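Beyond overlaying ROC curves, caret can compare the three models on their cross-validation resamples; a minimal sketch using the model objects above:
results <- resamples(list(rpart = model1, knn = model2, rf = model3))
summary(results)        # accuracy and kappa across the repeated folds
bwplot(results)         # side-by-side box plots of the resampled metrics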
Clustering
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
# euclidean distance
sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
## x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
## x
## y 1.414214
# Manhattan distance
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
## x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
## x
## y 2
Hierarchical Clustering
data(iris)
hc <- hclust(dist(iris[,-5], method = 'euclidean'), method='ward.D2' )
plot(hc,hang = -0.01, cex= 0.7)
fit <- cutree(hc, k = 3)
table(fit)
## fit
## 1 2 3
## 50 64 36
plot(hc,hang = -0.01, cex= 0.7)
rect.hclust(hc, k = 3 , border = 'red')
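To see how the three cut clusters line up with the actual species, cross-tabulate them:
table(cluster = fit, species = iris$Species)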

par(mfrow=c(1,2))
plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species,main = 'Real DataSet')
plot(Petal.Length ~ Petal.Width, data = iris, col = fit,main = 'Clustering')

KMeans
fit <-kmeans(iris[,-5], 3)
barplot(t(fit$centers), beside =TRUE,xlab="cluster", ylab="value")

par(mfrow=c(1,2))
plot(Petal.Width ~ Petal.Length, data = iris, col = iris$Species,main = 'Real DataSet')
plot(Petal.Width ~ Petal.Length, data = iris, col = fit$cluster,main = 'KMeans Clustering')
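kmeans() depends on its random starting centers, so results can vary between runs; a common safeguard is to request several starts and keep the best solution. A small sketch (fit2 is just an illustrative name):
set.seed(22)
fit2 <- kmeans(iris[,-5], centers = 3, nstart = 25)   # keep the best of 25 random starts
fit2$tot.withinss                                      # total within-cluster sum of squares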

Evaluate Clusters
library(cluster)
set.seed(22)
km <-kmeans(iris[,-5], 3)
kms<-silhouette(km$cluster,dist(iris[,-5]))
summary(kms)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
## Cluster sizes and average silhouette widths:
## 50 62 38
## 0.7981405 0.4173199 0.4511051
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391
plot(kms)

nk <-2:10
set.seed(22)
WSS <- sapply(nk, function(k){
  kmeans(iris[,-5], centers = k)$tot.withinss
})
WSS
## [1] 152.34795 78.85144 57.26562 46.44618 42.42965 37.39514 34.75335
## [8] 41.96594 33.71865
plot(nk, WSS, type= 'l')
abline(v = 3, col= 'red')

# install.packages('fpc')
library(fpc)
nk<-2:10
set.seed(123)
SW <- sapply(nk, function(k){
  cluster.stats(dist(iris[,-5]), kmeans(iris[,-5], centers = k)$cluster)$avg.silwidth
})
plot(nk, SW, type = "l", xlab = "number of clusters", ylab = "average silhouette width")

Compare Clustering Methods
single_c<-hclust(dist(iris[,-5]), method="single")
hc_single<-cutree(single_c, k =3)
complete_c<-hclust(dist(iris[,-5]), method="complete")
hc_complete<-cutree(complete_c, k =3)
set.seed(42)
km <-kmeans(iris[,-5], 3)
cs<-cluster.stats(dist(iris[,-5]), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 78.85144
##
## $avg.silwidth
## [1] 0.552819
sapply(
  list(kmeans = km$cluster,
       hc_single = hc_single,
       hc_complete = hc_complete),
  function(c) cluster.stats(dist(iris[,-5]), c)[c("within.cluster.ss", "avg.silwidth")])
## kmeans hc_single hc_complete
## within.cluster.ss 78.85144 142.4794 89.52501
## avg.silwidth 0.552819 0.5121108 0.5135953
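Since the true species labels are known for iris, cluster.stats() can also report agreement with them through the corrected Rand index (alt.clustering); a sketch assuming fpc is still loaded:
species <- as.integer(iris$Species)
sapply(list(kmeans = km$cluster, hc_single = hc_single, hc_complete = hc_complete),
       function(c) cluster.stats(dist(iris[,-5]), c, alt.clustering = species)$corrected.rand)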
Customer Segmentation
customers <- read.csv('https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/customers.csv')
head(customers)
## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
customers <- customers[,c(4,5)]
names(customers) <- c('annual_income', 'spending_score')
plot(spending_score ~ annual_income, data = customers)

set.seed(54)
fit <- kmeans(customers, 5)
plot(spending_score ~ annual_income, data = customers, col=fit$cluster)
points(fit$centers[,1], fit$centers[,2], col='orange')

kcs<-silhouette(fit$cluster,dist(customers))
summary(kcs)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = fit$cluster, dist = dist(customers)) :
## Cluster sizes and average silhouette widths:
## 39 81 35 23 22
## 0.5091706 0.5966512 0.5039873 0.5122676 0.5990129
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.02338 0.49425 0.59614 0.55393 0.66011 0.75807
plot(kcs)

nk<-2:10
set.seed(123)
SW <- sapply(nk, function(k){
  cluster.stats(dist(customers), kmeans(customers, centers = k)$cluster)$avg.silwidth
})
plot(nk, SW, type = "l", xlab = "number of clusters", ylab = "average silhouette width")

Text Mining
s <- '一位網友近日抱怨說,買了1包100的蝦餅,結果打開只有8片!她問:可以多一點嗎?'
strsplit(s,',|!|:|?')
## [[1]]
## [1] "一位網友近日抱怨說" "買了1包100的蝦餅" "結果打開只有8片"
## [4] "她問" "可以多一點嗎"
#install.packages('jiebaR')
library(jiebaR)
## Loading required package: jiebaRD
mixseg <- worker()
s="那我們酸民婉君也可以報名嗎"
segment(code = s, jiebar = mixseg)
## [1] "那" "我們" "酸民" "婉君" "也" "可以" "報名" "嗎"
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
USERPATH
## [1] "C:/Program Files/R/R-3.4.4/library/jiebaRD/dict/user.dict.utf8"
tagseg <- worker('tag')
segment(code = s, jiebar = tagseg)
## r r n x d c v y
## "那" "我們" "酸民" "婉君" "也" "可以" "報名" "嗎"
ngram
#install.packages("NLP")
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
s <-strsplit(x="那我們酸民婉君也可以報名嗎", split='')
s
## [[1]]
## [1] "那" "我" "們" "酸" "民" "婉" "君" "也" "可" "以" "報" "名" "嗎"
bigram <-ngrams(unlist(s), 2)
bigram
## [[1]]
## [1] "那" "我"
##
## [[2]]
## [1] "我" "們"
##
## [[3]]
## [1] "們" "酸"
##
## [[4]]
## [1] "酸" "民"
##
## [[5]]
## [1] "民" "婉"
##
## [[6]]
## [1] "婉" "君"
##
## [[7]]
## [1] "君" "也"
##
## [[8]]
## [1] "也" "可"
##
## [[9]]
## [1] "可" "以"
##
## [[10]]
## [1] "以" "報"
##
## [[11]]
## [1] "報" "名"
##
## [[12]]
## [1] "名" "嗎"
vapply(bigram, paste , '' , collapse = '')
## [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
s <-strsplit(x="那我們酸民婉君也可以報名嗎", split='')
trigram <-ngrams(unlist(s), 3)
vapply(trigram, paste, "", collapse ="")
## [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
## [8] "也可以" "可以報" "以報名" "報名嗎"
news <- '研究業者IDC的數據顯示,隨著消費者逐漸察覺智慧手錶與醫療保健和行動支付有關的功能,2022年全球智慧手錶規模預估將比今年大1倍。
根據IDC的數據,全球智慧手錶今年總出貨量預估將達到4360萬支,2022年出貨量將進一步提高至8410萬支,2018至2022年出貨量年複合成長率(CAGR)為17.9%。2018年全球穿戴式裝置出貨量估為1.329億部,2022年出貨量估為2.194億部。
IDC行動裝置追蹤系統資深研究分析師烏布拉尼(Jitesh Ubrani)說:「消費者終於開始了解和需要智慧手錶的功能,目前體適能用途處於領先,但行動支付和通訊已展開追勢。」
Strategy Analytics指出,蘋果智慧手錶Apple Watch去年在全球智慧手錶市場佔有率達到60.4%,是主宰市場的產品,三星電子以10.6%的市佔率居次。(林文彬/綜合外電報導)'
s <-strsplit(x=news, split='')
trigram <- ngrams(unlist(s), 3)   # character trigrams of the full article
#sort(table(vapply(trigram, paste, '', collapse = '')), decreasing = TRUE)
s <-strsplit(x=news, split=',|:|。')
s
## [[1]]
## [1] "研究業者IDC的數據顯示"
## [2] "隨著消費者逐漸察覺智慧手錶與醫療保健和行動支付有關的功能"
## [3] "2022年全球智慧手錶規模預估將比今年大1倍"
## [4] "\n\n根據IDC的數據"
## [5] "全球智慧手錶今年總出貨量預估將達到4360萬支"
## [6] "2022年出貨量將進一步提高至8410萬支"
## [7] "2018至2022年出貨量年複合成長率(CAGR)為17.9%"
## [8] "2018年全球穿戴式裝置出貨量估為1.329億部"
## [9] "2022年出貨量估為2.194億部"
## [10] "\n\nIDC行動裝置追蹤系統資深研究分析師烏布拉尼(Jitesh Ubrani)說"
## [11] "「消費者終於開始了解和需要智慧手錶的功能"
## [12] "目前體適能用途處於領先"
## [13] "但行動支付和通訊已展開追勢"
## [14] "」\n\nStrategy Analytics指出"
## [15] "蘋果智慧手錶Apple Watch去年在全球智慧手錶市場佔有率達到60.4%"
## [16] "是主宰市場的產品"
## [17] "三星電子以10.6%的市佔率居次"
## [18] "(林文彬/綜合外電報導)"
a.split<-strsplit(news, "、|,|。")
w.split<-strsplit(x=unlist(a.split), split='')
w.split
## [[1]]
## [1] "研" "究" "業" "者" "I" "D" "C" "的" "數" "據" "顯" "示"
##
## [[2]]
## [1] "隨" "著" "消" "費" "者" "逐" "漸" "察" "覺" "智" "慧" "手" "錶" "與"
## [15] "醫" "療" "保" "健" "和" "行" "動" "支" "付" "有" "關" "的" "功" "能"
##
## [[3]]
## [1] "2" "0" "2" "2" "年" "全" "球" "智" "慧" "手" "錶" "規" "模" "預"
## [15] "估" "將" "比" "今" "年" "大" "1" "倍"
##
## [[4]]
## [1] "\n" "\n" "根" "據" "I" "D" "C" "的" "數" "據"
##
## [[5]]
## [1] "全" "球" "智" "慧" "手" "錶" "今" "年" "總" "出" "貨" "量" "預" "估"
## [15] "將" "達" "到" "4" "3" "6" "0" "萬" "支"
##
## [[6]]
## [1] "2" "0" "2" "2" "年" "出" "貨" "量" "將" "進" "一" "步" "提" "高"
## [15] "至" "8" "4" "1" "0" "萬" "支"
##
## [[7]]
## [1] "2" "0" "1" "8" "至" "2" "0" "2" "2" "年" "出" "貨" "量" "年"
## [15] "複" "合" "成" "長" "率" "(" "C" "A" "G" "R" ")" "為" "1" "7"
## [29] "." "9" "%"
##
## [[8]]
## [1] "2" "0" "1" "8" "年" "全" "球" "穿" "戴" "式" "裝" "置" "出" "貨"
## [15] "量" "估" "為" "1" "." "3" "2" "9" "億" "部"
##
## [[9]]
## [1] "2" "0" "2" "2" "年" "出" "貨" "量" "估" "為" "2" "." "1" "9"
## [15] "4" "億" "部"
##
## [[10]]
## [1] "\n" "\n" "I" "D" "C" "行" "動" "裝" "置" "追" "蹤" "系" "統" "資"
## [15] "深" "研" "究" "分" "析" "師" "烏" "布" "拉" "尼" "(" "J" "i" "t"
## [29] "e" "s" "h" " " "U" "b" "r" "a" "n" "i" ")" "說" ":" "「"
## [43] "消" "費" "者" "終" "於" "開" "始" "了" "解" "和" "需" "要" "智" "慧"
## [57] "手" "錶" "的" "功" "能"
##
## [[11]]
## [1] "目" "前" "體" "適" "能" "用" "途" "處" "於" "領" "先"
##
## [[12]]
## [1] "但" "行" "動" "支" "付" "和" "通" "訊" "已" "展" "開" "追" "勢"
##
## [[13]]
## [1] "」" "\n" "\n" "S" "t" "r" "a" "t" "e" "g" "y" " " "A" "n"
## [15] "a" "l" "y" "t" "i" "c" "s" "指" "出"
##
## [[14]]
## [1] "蘋" "果" "智" "慧" "手" "錶" "A" "p" "p" "l" "e" " " "W" "a"
## [15] "t" "c" "h" "去" "年" "在" "全" "球" "智" "慧" "手" "錶" "市" "場"
## [29] "佔" "有" "率" "達" "到" "6" "0" "." "4" "%"
##
## [[15]]
## [1] "是" "主" "宰" "市" "場" "的" "產" "品"
##
## [[16]]
## [1] "三" "星" "電" "子" "以" "1" "0" "." "6" "%" "的" "市" "佔" "率"
## [15] "居" "次"
##
## [[17]]
## [1] "(" "林" "文" "彬" "/" "綜" "合" "外" "電" "報" "導" ")"
bigram <- function(w){
  bigram <- ngrams(unlist(w), 2)
  bigram.str <- vapply(bigram, paste, "", collapse = " ")
  bigram.str
}
bigram.all<-sapply(w.split, bigram)
bigram.all
## [[1]]
## [1] "研 究" "究 業" "業 者" "者 I" "I D" "D C" "C 的" "的 數"
## [9] "數 據" "據 顯" "顯 示"
##
## [[2]]
## [1] "隨 著" "著 消" "消 費" "費 者" "者 逐" "逐 漸" "漸 察" "察 覺"
## [9] "覺 智" "智 慧" "慧 手" "手 錶" "錶 與" "與 醫" "醫 療" "療 保"
## [17] "保 健" "健 和" "和 行" "行 動" "動 支" "支 付" "付 有" "有 關"
## [25] "關 的" "的 功" "功 能"
##
## [[3]]
## [1] "2 0" "0 2" "2 2" "2 年" "年 全" "全 球" "球 智" "智 慧"
## [9] "慧 手" "手 錶" "錶 規" "規 模" "模 預" "預 估" "估 將" "將 比"
## [17] "比 今" "今 年" "年 大" "大 1" "1 倍"
##
## [[4]]
## [1] "\n \n" "\n 根" "根 據" "據 I" "I D" "D C" "C 的" "的 數" "數 據"
##
## [[5]]
## [1] "全 球" "球 智" "智 慧" "慧 手" "手 錶" "錶 今" "今 年" "年 總"
## [9] "總 出" "出 貨" "貨 量" "量 預" "預 估" "估 將" "將 達" "達 到"
## [17] "到 4" "4 3" "3 6" "6 0" "0 萬" "萬 支"
##
## [[6]]
## [1] "2 0" "0 2" "2 2" "2 年" "年 出" "出 貨" "貨 量" "量 將"
## [9] "將 進" "進 一" "一 步" "步 提" "提 高" "高 至" "至 8" "8 4"
## [17] "4 1" "1 0" "0 萬" "萬 支"
##
## [[7]]
## [1] "2 0" "0 1" "1 8" "8 至" "至 2" "2 0" "0 2" "2 2"
## [9] "2 年" "年 出" "出 貨" "貨 量" "量 年" "年 複" "複 合" "合 成"
## [17] "成 長" "長 率" "率 (" "( C" "C A" "A G" "G R" "R )"
## [25] ") 為" "為 1" "1 7" "7 ." ". 9" "9 %"
##
## [[8]]
## [1] "2 0" "0 1" "1 8" "8 年" "年 全" "全 球" "球 穿" "穿 戴"
## [9] "戴 式" "式 裝" "裝 置" "置 出" "出 貨" "貨 量" "量 估" "估 為"
## [17] "為 1" "1 ." ". 3" "3 2" "2 9" "9 億" "億 部"
##
## [[9]]
## [1] "2 0" "0 2" "2 2" "2 年" "年 出" "出 貨" "貨 量" "量 估"
## [9] "估 為" "為 2" "2 ." ". 1" "1 9" "9 4" "4 億" "億 部"
##
## [[10]]
## [1] "\n \n" "\n I" "I D" "D C" "C 行" "行 動" "動 裝" "裝 置"
## [9] "置 追" "追 蹤" "蹤 系" "系 統" "統 資" "資 深" "深 研" "研 究"
## [17] "究 分" "分 析" "析 師" "師 烏" "烏 布" "布 拉" "拉 尼" "尼 ("
## [25] "( J" "J i" "i t" "t e" "e s" "s h" "h " " U"
## [33] "U b" "b r" "r a" "a n" "n i" "i )" ") 說" "說 :"
## [41] ": 「" "「 消" "消 費" "費 者" "者 終" "終 於" "於 開" "開 始"
## [49] "始 了" "了 解" "解 和" "和 需" "需 要" "要 智" "智 慧" "慧 手"
## [57] "手 錶" "錶 的" "的 功" "功 能"
##
## [[11]]
## [1] "目 前" "前 體" "體 適" "適 能" "能 用" "用 途" "途 處" "處 於"
## [9] "於 領" "領 先"
##
## [[12]]
## [1] "但 行" "行 動" "動 支" "支 付" "付 和" "和 通" "通 訊" "訊 已"
## [9] "已 展" "展 開" "開 追" "追 勢"
##
## [[13]]
## [1] "」 \n" "\n \n" "\n S" "S t" "t r" "r a" "a t" "t e"
## [9] "e g" "g y" "y " " A" "A n" "n a" "a l" "l y"
## [17] "y t" "t i" "i c" "c s" "s 指" "指 出"
##
## [[14]]
## [1] "蘋 果" "果 智" "智 慧" "慧 手" "手 錶" "錶 A" "A p" "p p"
## [9] "p l" "l e" "e " " W" "W a" "a t" "t c" "c h"
## [17] "h 去" "去 年" "年 在" "在 全" "全 球" "球 智" "智 慧" "慧 手"
## [25] "手 錶" "錶 市" "市 場" "場 佔" "佔 有" "有 率" "率 達" "達 到"
## [33] "到 6" "6 0" "0 ." ". 4" "4 %"
##
## [[15]]
## [1] "是 主" "主 宰" "宰 市" "市 場" "場 的" "的 產" "產 品"
##
## [[16]]
## [1] "三 星" "星 電" "電 子" "子 以" "以 1" "1 0" "0 ." ". 6"
## [9] "6 %" "% 的" "的 市" "市 佔" "佔 率" "率 居" "居 次"
##
## [[17]]
## [1] "( 林" "林 文" "文 彬" "彬 /" "/ 綜" "綜 合" "合 外" "外 電"
## [9] "電 報" "報 導" "導 )"
tb<-table(unlist(bigram.all))
#tb[tb>=2]
#sort(tb, decreasing = TRUE)
s <- "當初中央政府拿台北市的精華地跟北市府交換"
gsub(pattern = '台北市',replacement = '', s)
## [1] "當初中央政府拿的精華地跟北市府交換"
removeKey <- function(s, keys){
  for(k in keys){
    s <- gsub(pattern = k, replacement = '', s)
  }
  return(s)
}
removeKey(s, c('台北市', '北市府'))
## [1] "當初中央政府拿的精華地跟交換"
ngram.func <- function(s, n){
  w <- strsplit(s, '')
  n.gram <- ngrams(unlist(w), n)
  n.gram.str <- vapply(n.gram, paste, "", collapse = "")
  return(n.gram.str)
}
ngram.func(s, 3)
## [1] "當初中" "初中央" "中央政" "央政府" "政府拿" "府拿台" "拿台北"
## [8] "台北市" "北市的" "市的精" "的精華" "精華地" "華地跟" "地跟北"
## [15] "跟北市" "北市府" "市府交" "府交換"
keywords <- c()
sentence.vec <- unlist(strsplit(news, ',|。|(|)|:|「|」|\n|/'))
for(n in seq(4, 2, -1)){
  words <- c()
  for(sentence in sentence.vec){
    s <- removeKey(sentence, keywords)   # drop keywords already found at longer n
    w <- ngram.func(s, n)
    words <- c(words, w)
  }
  tb <- table(words)
  keywords <- c(keywords, names(tb[tb > 4]))   # keep n-grams occurring more than 4 times
}
keywords
keywords
## [1] "智慧手錶" "出貨量" "20"
longTermFirst <- function(article, keywords, threshold){
  sentence.vec <- unlist(strsplit(article, ',|。|(|)|:|「|」|\n|/'))
  for(n in seq(4, 2, -1)){
    words <- c()
    for(sentence in sentence.vec){
      s <- removeKey(sentence, keywords)
      w <- ngram.func(s, n)
      words <- c(words, w)
    }
    tb <- table(words)
    keywords <- c(keywords, names(tb[tb > threshold]))
  }
  keywords
}
news <- '台大教授李錫錕今宣布以無黨籍身分參選台北市長,由於他和現任台北市長柯文哲在網路上的人氣都相當高,加上同樣用無黨籍身分選,外界也好奇「錕P」是否會瓜分「柯P」的選票。對此,中山大學政治系教授廖達琪坦言,「錕P」要造成像當年「柯P」在政壇上的衝擊,或者有全面性的影響力,恐怕不容易。
廖達琪表示,當年柯文哲在選台北市長時,除提出許多吸引年輕人的政見之外,在選前被監察院、調查局約談,加上本身是亞斯伯格症患者,以及在醫學上裝葉克膜的成就,讓柯文哲有比一般政治人物更吸引人的特殊故事,此外,柯文哲在市長辯論時雖自稱墨綠,但仍救治連勝文,這讓外界對柯主打的超越藍綠、白色力量有更多想像性和期待。
廖達琪說,雖然李錫錕此次以無黨籍身分參選,但畢竟李過去是代表國民黨選縣長,且缺少特殊故事,因此影響力恐不及柯,此外,李錫錕目前只有臉書上的73萬粉絲,要像柯文哲一樣在年輕人網路圈裡達成全面性的影響力,並不容易,廖認為李錫錕比較有可能排擠的,還是泛藍選票。(張博亭/台北報導)'
longTermFirst(news,c(), 3)
## [1] "李錫錕" "柯文哲" "台北" "市長"