download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/oneday.csv', 'oneday.csv')

library(readr)
oneday <- read_csv("~/oneday.csv")
## Parsed with column specification:
## cols(
##   url = col_character(),
##   title = col_character(),
##   content = col_character(),
##   source = col_character(),
##   date = col_integer(),
##   author = col_character()
## )
library(jiebaR)
## Loading required package: jiebaRD
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
oneday.seg <- sapply(oneday$content, function(e)segment(e, jiebar = mixseg))
tb <- table(unlist(oneday.seg))

chinese_filter <- sapply(names(tb), function(e)regexpr(text = e, pattern =      '[\u4e00-\u9fa5]{1,}' )
)

res <- tb[(tb >= 50)  & (nchar(names(tb) ) >=2) & (chinese_filter == 1) ]



library(wordcloud2)
?wordcloud2
## starting httpd help server ...
##  done
wordcloud2(res, shape = 'star')

中文篩選

regexpr(text = 'apple', pattern = 'b' )
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
regexpr(text = 'apple', pattern = '[a-z]' )
## [1] 1
## attr(,"match.length")
## [1] 1
## attr(,"useBytes")
## [1] TRUE
regexpr(text = 'apple', pattern = '[a-z]{3,6}' )
## [1] 1
## attr(,"match.length")
## [1] 5
## attr(,"useBytes")
## [1] TRUE
regexpr(text= 'apple', pattern= '[\u4e00-\u9fa5]{1,}')
## [1] -1
## attr(,"match.length")
## [1] -1
regexpr(text= '蘋果', pattern= '[\u4e00-\u9fa5]{1,}')
## [1] 1
## attr(,"match.length")
## [1] 2

Using rpart

data(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(rpart)
fit <- rpart(Species  ~ Sepal.Length + Sepal.Width + Petal.Length+ Petal.Width, data = iris )

fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin=0.1)
text(fit)

plot(iris$Petal.Length , iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col="orange")
abline(h = 1.75, col="blue")


#install.packages('party')
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich

fit <- ctree(Species  ~ Sepal.Length + Sepal.Width + Petal.Length+ Petal.Width, data = iris )
plot(fit)

驗證模型

label     <- iris[,5]
iris_data <- iris[,1:4]
tb <- table(predict(fit, iris_data, type="response"), label)

#install.packages('caret')
#install.packages('e1071')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(tb)
## Confusion Matrix and Statistics
## 
##             label
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450
#sample.int(42,6)
set.seed(123)
idx <- sample.int(2, nrow(iris), replace = TRUE, prob = c(0.7,0.3))
table(idx)
## idx
##   1   2 
## 106  44
trainset <- iris[idx == 1, ]
testset  <- iris[idx == 2, ]
dim(trainset)
## [1] 106   5
dim(testset)
## [1] 44  5
fit2 <- rpart(Species ~ ., data = trainset)
plot(fit2, margin = 0.1)
text(fit2)

plot(iris$Petal.Length , iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col="orange")
abline(v = 4.75, col="blue")

#testset[, 1:4]
cm <- table(predict(fit2, testset[, -5], type='class'), testset[,5])
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##             
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 8.552e-14       
##                                           
##                   Kappa : 0.8291          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7143           0.9333
## Specificity                 1.0000            0.9667           0.8621
## Pos Pred Value              1.0000            0.9091           0.7778
## Neg Pred Value              1.0000            0.8788           0.9615
## Prevalence                  0.3409            0.3182           0.3409
## Detection Rate              0.3409            0.2273           0.3182
## Detection Prevalence        0.3409            0.2500           0.4091
## Balanced Accuracy           1.0000            0.8405           0.8977
(15 + 10 + 14) / 44  
## [1] 0.8863636

客戶流失分析

#install.packages('C50')
library(C50)
data(churn)

str(churnTrain)
## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
churnTrain <-  churnTrain[ ,! names(churnTrain) %in% c("state", "area_code", "account_length") ]

names(churnTrain)
##  [1] "international_plan"            "voice_mail_plan"              
##  [3] "number_vmail_messages"         "total_day_minutes"            
##  [5] "total_day_calls"               "total_day_charge"             
##  [7] "total_eve_minutes"             "total_eve_calls"              
##  [9] "total_eve_charge"              "total_night_minutes"          
## [11] "total_night_calls"             "total_night_charge"           
## [13] "total_intl_minutes"            "total_intl_calls"             
## [15] "total_intl_charge"             "number_customer_service_calls"
## [17] "churn"
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace = TRUE, prob=c(0.7,0.3))
trainset <- churnTrain[idx == 1, ]
testset  <- churnTrain[idx == 2, ]

churn.rp <- rpart(churn ~., data = churnTrain)
plot(churn.rp, margin= 0.1)
text(churn.rp)

predictions <- predict(churn.rp, testset, type='class')
tb <- table(testset$churn, predictions)
confusionMatrix(tb)
## Confusion Matrix and Statistics
## 
##      predictions
##       yes  no
##   yes 110  31
##   no   11 866
##                                           
##                Accuracy : 0.9587          
##                  95% CI : (0.9446, 0.9701)
##     No Information Rate : 0.8811          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8162          
##  Mcnemar's Test P-Value : 0.00337         
##                                           
##             Sensitivity : 0.9091          
##             Specificity : 0.9654          
##          Pos Pred Value : 0.7801          
##          Neg Pred Value : 0.9875          
##              Prevalence : 0.1189          
##          Detection Rate : 0.1081          
##    Detection Prevalence : 0.1385          
##       Balanced Accuracy : 0.9373          
##                                           
##        'Positive' Class : yes             
## 
library(caret)

control <-trainControl(method="repeatedcv", number=10, repeats=3)

model <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)

model
## CART 
## 
## 2315 samples
##   16 predictor
##    2 classes: 'yes', 'no' 
## 
## Pre-processing: scaled (16) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2083, 2083, 2083, 2084, 2084, 2084, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05555556  0.8994954  0.5143502
##   0.07456140  0.8652302  0.2498564
##   0.07602339  0.8584636  0.1772128
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.05555556.
table(predict(model, testset), testset$churn)
##      
##       yes  no
##   yes  64  10
##   no   77 867
 head(predict(churn.rp, testset, type='class'))
##   2   5   6   8  13  16 
##  no  no  no  no  no yes 
## Levels: yes no

ROC Curve

library(rpart)
library(caret)
churn.rp <- rpart(churn ~., data = churnTrain)
predictions <- predict(churn.rp, testset, type='prob')

roc_x <- c(0)
roc_y <- c(0)

for(i in seq(0.01,1,0.01)){
  res <- as.factor(ifelse(predictions[,1] >= i, 'yes', 'no'))
  res <- factor(res,levels(res)[c(2,1)])

  tb <- table(testset$churn, res)
  cm <- confusionMatrix(tb)

  x <- 1 - cm$byClass[2]
  y <- cm$byClass[1]
  roc_x <- c(roc_x, x)
  roc_y <- c(roc_y, y)
}
 roc_x <- c(roc_x, 1)
 roc_y <- c(roc_y, 1)
plot(roc_x,roc_y, type='b', xlim=c(0,1), ylim=c(0,1))

使用ROCR

#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])])) 

perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

RMiner

#install.packages("rminer")
library(rminer)
## 
## Attaching package: 'rminer'
## The following object is masked from 'package:party':
## 
##     fit
## The following object is masked from 'package:modeltools':
## 
##     fit
model <- fit(churn~.,trainset,model="svm")

VariableImportance <- Importance(model,trainset,method="sensv")
L <- list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

使用R 計算距離

x = c(0, 0, 1, 1, 1, 1)
y = c(1, 0, 1, 1, 0, 1)

dist(rbind(x,y), method =  "euclidean")
##          x
## y 1.414214
sqrt(sum((x - y) ^2))
## [1] 1.414214
dist(rbind(x,y), method =  "manhattan")
##   x
## y 2
sum(abs(x - y))
## [1] 2

使用hclust 做iris 分群

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)

plot(iris$Petal.Length, iris$Petal.Width)

str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
iris_data <- iris[, -5]

iris_dist <- dist(iris_data, method = 'euclidean')

hc <- hclust(iris_dist, method = 'ward.D2')

plot(hc, hang=-1, cex = 0.7)

fit <- cutree(hc, k = 3)
fit
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
table(fit)
## fit
##  1  2  3 
## 50 64 36
plot(iris$Petal.Length, iris$Petal.Width)

par(mfrow=c(1,2))
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species, main= "real answer")
plot(iris$Petal.Length, iris$Petal.Width, col=fit, main = "clustering result")

plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3 , border="red")

文章分群

a <- c(1, 1, 1, 1, 1, 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1)
b <- c(1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
dist(rbind(a,b), method = 'euclidean')
##          a
## b 3.872983
a1 <- c(1, 1, 1, 1, 1, 1 ,1 )
b1 <- c(1, 1, 0, 0, 0, 0, 0)
dist(rbind(a1,b1), method = 'euclidean')
##          a1
## b1 2.236068
library(proxy)
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
proxy::dist(rbind(a,b), method = 'cosine')
##           a
## b 0.6570028
proxy::dist(rbind(a1,b1), method = 'cosine')
##           a1
## b1 0.4654775
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')


library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, mixseg))


library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
apple.corpus <- Corpus(VectorSource(apple.seg))
apple.dtm    <- DocumentTermMatrix(apple.corpus)
dim(apple.dtm)
## [1]  1500 39206
dtm.remove <- removeSparseTerms(apple.dtm, 0.99)
apple.dist <- proxy::dist(as.matrix(dtm.remove), method='cosine')
dtm.mat <- as.matrix(apple.dist)

applenews$title[9]
## [1] "【熊本強震】取消去九州 華航5月8日前退改票免手續費"
applenews$title[order(dtm.mat[9,])[1:10]]
##  [1] "【熊本強震】取消去九州 華航5月8日前退改票免手續費"
##  [2] "熊本強震!旅遊交通資訊懶人包"                      
##  [3] "【熊本強震】九州團恐掉1到3成 旅行團改行程避走熊本"
##  [4] "日熊本強震 燦星:本月20日前的旅客全額退費"        
##  [5] "【熊本強震】取消去九州 KKday4月底前可全額退費"    
##  [6] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
##  [7] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
##  [8] "九州地震 封測廠日月光、矽品無影響"                
##  [9] "易遊網宣布 今明九州旅遊團全額退費"                
## [10] "九州為半導體重鎮 強震後供應成市場焦點"
hc <- hclust(apple.dist, method = 'ward.D2')
plot(hc, hang = -0.01)
rect.hclust(hc, k = 20)

fit <- cutree(hc, k=20)
table(fit)
## fit
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##  93  45 345  50  65 301  64  20  78  83  74  24  21  50  48  40  61  12 
##  19  20 
##  14  12
applenews$title[fit == 10]
##  [1] "陸委會跨部會議確認 下周登陸展開肯亞案協商"                  
##  [2] "【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA"                  
##  [3] "【民報】馬8年政績 外媒:台灣人被北京納入治外法權"            
##  [4] "馬國50台人將遣中 我代表處動員馬國高層友人協助"              
##  [5] "【肯亞案】為何強擄台嫌? 陸警:台灣判太輕"                  
##  [6] "【肯亞案】電信詐騙台嫌分贓手法 陸媒大揭密"                  
##  [7] "中國人為何容易被騙"                                          
##  [8] "肯亞案裡極藍極綠的民粹氛圍"                                  
##  [9] "520後將凸顯「一個中華人民共和國」"                           
## [10] "520後將凸顯「一個中華人民共和國」"                           
## [11] "自由與專制對撞的時刻"                                        
## [12] "【肯亞案】立院朝野發表共同聲明 批中國侵害我國主權"          
## [13] "【肯亞案】詐騙台人央視認罪:傷天害理的報應"                  
## [14] "【財訊】根留台灣好公司 惦惦賺很大"                           
## [15] "【更新】大馬50台人再被遣中? 陸委會:協商中"                
## [16] "肯亞案確定組團赴中 羅瑩雪:最快下周一出發"                  
## [17] "52台人涉電信詐騙被逮 我與中國搶人回台"                      
## [18] "【台灣英文新聞】詐騙案讓台灣無光"                            
## [19] "大馬50台人將被遣中 網友這樣說"                              
## [20] "【更新】黃國昌爆料:馬來西亞50台人 將被遣返到中國"          
## [21] "馬國50台人將被遣返到中國 刑事局證實:正與中國溝通"          
## [22] "趙少康:台灣騙子滿天下"                                      
## [23] "我遇到了詐欺"                                                
## [24] "【法廣RFI】陳一新:點評520前夕的兩岸動態"                    
## [25] "【法廣RFI】肯亞案45台灣人均被拘北京海淀"                     
## [26] "司法院最新統計 9成詐欺犯判刑不到1年"                        
## [27] "【公庫】台灣社:譴責馬政府軟弱面對中國非法綁架台灣人事件!"  
## [28] "【法廣RFI】李顯龍:民進黨執政下九二共識更模<U+7173>"          
## [29] "羅瑩雪再創金句 「泰國人只會講泰國話」"                      
## [30] "那些年跨國逮回的詐騙集團 後來怎麼了"                        
## [31] "【肯亞案】跨部會專案會議明召開 討論赴中交涉事宜"            
## [32] "管轄權爭議 陳其邁酸:台獨去肯亞會被抓嗎"                    
## [33] "【民報】外媒:殷鑑迫近!中國今日綁架台灣人,明日綁架異議人士"
## [34] "一再騙錢抓不怕 詐欺慣犯又吸金4千萬"                         
## [35] "他說肯亞案無關政治 「當然要遣返大陸」"                      
## [36] "連警大教授都被騙!台灣詐集團真的好神"                        
## [37] "【有片】為肯亞案互槓 段宜康、羅瑩雪火爆對嗆"                
## [38] "台灣四月雪─台灣的主權、法治在下雪!"                         
## [39] "【更新】詐騙產業公會感謝聲明 羅瑩雪收到了"                  
## [40] "《環時》再談肯亞案:蔡英文勿要挾大陸"                        
## [41] "【央廣RTI】肯亞案  學者:兩岸互信才有助互動"                 
## [42] "「肯亞案」為何爆發搶人大戰 他說出真正原因"                  
## [43] "金立群:會有辦法處理台灣加入亞投行"                          
## [44] "遠見民調 五成民眾認為520後和平協議會讓兩岸更好"             
## [45] "肯亞案:蔡英文與習大大可以做的事"                            
## [46] "蔡正元轟肯亞案 「史上最丟臉案件」"                          
## [47] "台灣大戰略芻議"                                              
## [48] "詐欺刑責太輕? 張善政指示研議修法"                          
## [49] "未來不管兩岸經貿? 國發會:本就不是職掌"                    
## [50] "【更新】李顯龍稱台灣與中國衝突將被孤立 外長:無此問題"      
## [51] "<U+200B>陸官媒談肯亞案 要台灣「給自己留點臉」"              
## [52] "<U+200B>亞投行長:有辦法處理台灣加入問題"                    
## [53] "蔡英文就職演說 近6成民眾認為不該提同屬一中"                 
## [54] "星國總理:台灣人明白 與中國衝突會被孤立"                    
## [55] "【更新】中國公布受害者數字 夏立言:拿出證據證明不是說說"    
## [56] "【肯亞案】邱太三自爆:中方曾發簡訊 不要在台上吵"            
## [57] "【更新】詐欺犯回台罪太輕? 羅瑩雪打算這麼做"                
## [58] "看完就知多嚴重 台灣人你為何不生氣?"                        
## [59] "就因為這理由!他告訴你「為何人渣都愛台灣」"                  
## [60] "肯亞律師:肯亞違法遣送台灣人到中國"                          
## [61] "肯亞將台灣人遣送中國 美國最新回應"                          
## [62] "【法廣RFI】肯亞案:北京高調遣送 學者稱短期難返台"             
## [63] "四月十四日各報頭條搶先報"                                    
## [64] "【國際為什麼】跨國電信詐騙以台灣嫌犯居多?"                  
## [65] "民進黨要看懂肯亞案"                                          
## [66] "江春男:遠離非洲"                                            
## [67] "法務部錯亂的管轄概念"                                        
## [68] "【法廣RFI】國台辦:堅決法辦肯亞詐騙台嫌犯"                   
## [69] "【肯亞案】45台人遣中 陸委會:尚無掌握詐欺相關犯罪資訊"      
## [70] "台嫌遭押送中國案 陸官媒:有人藉此煽動民粹"                  
## [71] "肯亞舉債6億美元 都靠中國相借"                               
## [72] "【法廣RFI】肯亞強遣台嫌回陸 目的何在?"                       
## [73] "【法廣RFI】印尼步肯亞後塵 台緊急行動防遣送陸"                
## [74] "肯亞案 法務部:台灣有管轄權"                                
## [75] "馬政府還要鴕鳥到何時?"                                      
## [76] "中國外交部談肯亞案台人:被開釋並非無罪"                      
## [77] "【肯亞案】被中國帶走的台人 有2人是通緝犯"                   
## [78] "【肯亞案】組團赴中 羅瑩雪:本來今天要出發"                  
## [79] "【肯亞案】組團赴中 羅瑩雪:本來今天要出發"                  
## [80] "葉毓蘭再嗆肯亞詐騙疑犯 要媒體將心比心"                      
## [81] "【民報】類肯亞案 黃國昌爆:法務部通報中國到海外抓台灣人"     
## [82] "BBC:蔡英文蜜月期結束 北京拿肯亞案施壓"                     
## [83] "台嫌被強押送中國 CNN頭版關注"

文章分類

str(applenews)
## 'data.frame':    1500 obs. of  5 variables:
##  $ content : chr  "(更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台中市一名黃姓男子覬覦頭獎高達2600萬的「開門見喜」刮刮樂彩券,上月佯"| __truncated__ "澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片,還大肆批評留澳學生是一群「留學豬」。消息傳出後,這名博士生立"| __truncated__ "【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明的租屋族嗎? 由蘋果地產與FBS TV合作的全新節目-房市大追擊,本集節"| __truncated__ "本內容由中央廣播電臺提供<U+00A0><U+00A0> <U+00A0> <U+00A0> <U+00A0>美國國防部長卡特(Ash Carter)今天(15日)表示,他今天將前往在菲"| __truncated__ ...
##  $ title   : chr  "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄" "拿到澳洲護照後 他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演  美防長南海登艦" ...
##  $ dt      : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
##  $ category: chr  "社會" "國際" "地產" "國際" ...
##  $ view_cnt: chr  "1754" "0" "0" "0" ...
apple.subset <- applenews[applenews$category %in% c('社會', '財經', '娛樂'), ]

library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(apple.subset$content, function(e) segment(e, jiebar = mixseg))

library(tm)
doc <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(doc, control = list(removePunctuation=TRUE, removeNumbers=TRUE ))
dtm.remove <- removeSparseTerms(dtm, 0.99)

dim(dtm.remove)
## [1]  428 1949
dtm.remove$dimnames$Terms
##    [1] "口罩"     "已經"     "不已"     "之際"     "分局"     "友人"    
##    [7] "方式"     "方法"     "只是"     "台中市"   "犯罪"     "全案"    
##   [13] "更新"     "男子"     "車主"     "佯稱"     "使用"     "供稱"    
##   [19] "到案"     "坦承"     "店家"     "表示"     "昨天"     "昨日"    
##   [25] "派出所"   "紅色"     "要求"     "計畫"     "員警"     "埋伏"    
##   [31] "拿出"     "逃逸"     "追查"     "高達"     "售價"     "移送"    
##   [37] "通知"     "透過"     "部分"     "報導"     "提供"     "無\xa9"  
##   [43] "無法"     "畫面"     "發\xb2"   "發稿"     "詐欺罪"   "逮捕"    
##   [49] "進入"     "嫌犯"     "想要"     "損失"     "新增"     "當下"    
##   [55] "經過"     "路口"     "監視"     "監視器"   "說明"     "影片"    
##   [61] "數字"     "調查"     "調\xbe"   "機車"     "獲報"     "聯絡"    
##   [67] "購買"     "轄區"     "擴大"     "翻攝"     "鎖定"     "鏡頭"    
##   [73] "警方"     "顧客"     "大量"     "不用"     "不得"     "中午"    
##   [79] "今天"     "今\xa6"   "公司"     "公\xb6"   "主動"     "主管"    
##   [85] "司法"     "台\xa5"   "未來"     "正\xb1"   "由於"     "交付"    
##   [91] "任何"     "再度"     "合法"     "自信"     "告知"     "技術"    
##   [97] "投資人"   "求償"     "依據"     "來自"     "協助"     "呼\xc6"  
##  [103] "法務部"   "秉持"     "非\xb1"   "保障"     "前往"     "指出"    
##  [109] "政府"     "查證"     "相關"     "研究"     "研發"     "美國"    
##  [115] "原則"     "員工"     "時間"     "案情"     "浩鼎"     "特定"    
##  [121] "真相"     "秘密"     "訊息"     "配合"     "偵查"     "國內"    
##  [127] "基於"     "從未"     "情事"     "產\xb7"   "第\xa4"   "這些"    
##  [133] "連續"     "單位"     "報告"     "媒體"     "揭露"     "發布"    
##  [139] "發表"     "發展"     "搜索"     "資料"     "對\xa5"   "影響"    
##  [145] "數據"     "導致"     "獨家"     "積極"     "隨意"     "檢調"    
##  [151] "營\xb7"   "營運"     "聲明"     "釐清"     "嚴重"     "權益"    
##  [157] "驚訝"     "上訴"     "女子"     "女兒"     "女童"     "不\xc4"  
##  [163] "今依"     "另名"     "犯行"     "再次"     "同\xa9"   "地\xb0"  
##  [169] "考量"     "住處"     "判刑"     "身為"     "兒童"     "和解"    
##  [175] "始終"     "保\xc5"   "前科"     "故意"     "個人"     "時\xb3"  
##  [181] "粉絲團"   "陪審團"   "猥褻"     "給予"     "新\xa5"   "新聞"    
##  [187] "達成"     "滿足"     "審酌"     "寫下"     "熟睡"     "請看"    
##  [193] "蘋果"     "中間"     "內容"     "支持"     "支援"     "方案"    
##  [199] "市佔率"   "市場"     "平台"     "打造"     "民眾"     "同時"    
##  [205] "合\xa7"   "地位"     "安全"     "成本"     "成長"     "收視"    
##  [211] "有效"     "而言"     "系統"     "事\xb7"   "亞洲"     "具有"    
##  [217] "服務"     "肯定"     "近期"     "長期"     "宣布"     "客戶"    
##  [223] "持續"     "挑戰"     "相當"     "重新"     "面對"     "根據"    
##  [229] "特別"     "能力"     "能\xa7"   "推出"     "採用"     "產\xab"  
##  [235] "規劃"     "規模"     "軟體"     "這項"     "最大"     "最新"    
##  [241] "創新"     "提升"     "晶片"     "超過"     "進\xa4"   "達到"    
##  [247] "電視"     "預估"     "預期"     "團隊"     "夥伴"     "對於"    
##  [253] "維持"     "億美元"   "廠商"     "影響力"   "數位"     "整體"    
##  [259] "機\xb7"   "選擇"     "營收"     "攀升"     "證明"     "關鍵"    
##  [265] "競爭力"   "體驗"     "台灣"     "看好"     "國際"     "授權"    
##  [271] "透露"     "進行"     "媽媽"     "準備"     "演出"     "bmw"     
##  [277] "com"      "人員"     "中心"     "今早"     "方向"     "以及"    
##  [283] "出版"     "台大"     "市府"     "正在"     "休息"     "全都"    
##  [289] "多名"     "行經"     "完全"     "投訴"     "沒有"     "車上"    
##  [295] "車道"     "車禍"     "車輛"     "其他"     "取得"     "受傷"    
##  [301] "幸運"     "所幸"     "法律"     "直接"     "直擊"     "社\xb7"  
##  [307] "後方"     "活動"     "突發"     "乘客"     "記得"     "追撞"    
##  [313] "基隆"     "救人"     "責任"     "造成"     "最高"     "發生"    
##  [319] "毀損"     "照片"     "當時"     "路旁"     "疑似"     "緊急"    
##  [325] "肇事"     "價值"     "撞擊"     "駕駛"     "駕駛座"   "隨即"    
##  [331] "幫忙"     "騎士"     "蘋果日報" "歡迎"     "聽到"     "讀者"    
##  [337] "中國"     "台籍"     "肯亞"     "冠軍"     "娛樂中心" "詐騙"    
##  [343] "集團"     "綜合"     "遣送"     "人家"     "下午"     "已有"    
##  [349] "不\xb7"   "司機"     "只好"     "必須"     "打\xb6"   "打電話"  
##  [355] "多次"     "老翁"     "而且"     "坐在"     "求助"     "汽車"    
##  [361] "車內"     "車門"     "到場"     "返\xa6"   "返家"     "前天"    
##  [367] "突然"     "紀錄"     "原來"     "堅持"     "帶\xa6"   "這名"    
##  [373] "最後"     "詢問"     "需要"     "銷售"     "離去"     "警員"    
##  [379] "九\xa6"   "大約"     "工廠"     "日本"     "主要"     "半導體"  
##  [385] "生產"     "目前"     "全面"     "地\xbe"   "投資"     "每日"    
##  [391] "明顯"     "昨晚"     "面板"     "停車場"   "產值"     "疏散"    
##  [397] "統計"     "當地"     "運\xa7"   "電報導"   "證實"     "人生"    
##  [403] "大家"     "不能"     "心情"     "日子"     "令人"     "平安"    
##  [409] "全台"     "各自"     "地說"     "安慰"     "早上"     "我\xb7"  
##  [415] "那個"     "所有"     "朋友"     "東西"     "知名"     "勇敢"    
##  [421] "凌晨"     "祝福"     "婚姻"     "崩潰"     "這段"     "曾經"    
##  [427] "然而"     "痛苦"     "感動"     "感嘆"     "感謝"     "愛女"    
##  [433] "說話"     "遭到"     "整個"     "整理"     "臉書"     "還有"    
##  [439] "藝人"     "難過"     "不是"     "不滿"     "心生"     "手槍"    
##  [445] "主嫌"     "犯案"     "安非他命" "改造"     "李姓"     "林姓"    
##  [451] "金錢"     "查獲"     "毒\xab"   "家中"     "偵辦"     "晚上"    
##  [457] "條例"     "被\xae"   "報警"     "債務"     "違反"     "電話"    
##  [463] "彰化縣"   "綽號"     "雙方"     "攜帶"     "大發"     "女友"    
##  [469] "公社"     "手機"     "月初"     "月間"     "出去"     "去\xa6"  
##  [475] "台中"     "先生"     "地檢署"   "自己"     "妨\xae"   "事後"    
##  [481] "所以"     "恐\xc0"   "桃園"     "將他"     "得知"     "散布"    
##  [487] "罪嫌"     "對話"     "網站"     "親密"     "爆料"     "出\xb2"  
##  [493] "孕\xa8"   "交往"     "多\xa6"   "成為"     "老公"     "拍攝"    
##  [499] "洛杉磯"   "結婚"     "傳出"     "遲遲"     "懷孕"     "下半\xa6"
##  [505] "下周"     "上漲"     "大立光"   "不大"     "不如"     "不佳"    
##  [511] "不過"     "之後"     "反映"     "反彈"     "台股"     "台積電"  
##  [517] "布局"     "企\xb7"   "同步"     "多少"     "收斂"     "早盤"    
##  [523] "至於"     "使得"     "法說"     "物料"     "股價"     "股權"    
##  [529] "表\xb2"   "指數"     "展望"     "族群"     "提前"     "電子"    
##  [535] "零組件"   "億元"     "盤中"     "翻紅"     "雙雙"     "顯示"    
##  [541] "上市"     "大漲"     "公告"     "交易"     "收購"     "低於"    
##  [547] "每股"     "周二"     "股東"     "恢\xb4"   "香港"     "截至"    
##  [553] "網路"     "歷史"     "十分"     "上車"     "出席"     "台南"    
##  [559] "母親"     "好好"     "兒子"     "具體"     "果然"     "很多"    
##  [565] "婆婆"     "竟然"     "麻煩"     "就是"     "想法"     "路上"    
##  [571] "睡覺"     "談到"     "覺得"     "女方"     "工\xa7"   "分手"    
##  [577] "主持"     "出門"     "本月"     "生活"     "男性"     "迅速"    
##  [583] "周刊"     "知道"     "計師"     "展\xb6"   "新歡"     "擔任"    
##  [589] "頻頻"     "曝光"     "大礙"     "不少"     "不適"     "今晚"    
##  [595] "公益"     "比賽"     "先前"     "有人"     "完美"     "更好"    
##  [601] "其實"     "幸好"     "直呼"     "表演"     "孩子"     "音樂"    
##  [607] "留言"     "能夠"     "條件"     "期待"     "登場"     "結束"    
##  [613] "順利"     "微博"     "意\xa5"   "慈善"     "歌迷"     "網友"    
##  [619] "播出"     "學習"     "擔心"     "錄製"     "聲音"     "醫\xb0"  
##  [625] "丈夫"     "不但"     "公訴"     "文件"     "申請"     "依法"    
##  [631] "負責人"   "送到"     "健康"     "產生"     "處理"     "販售"    
##  [637] "貨車"     "提\xb0"   "然後"     "經營"     "過\xb5"   "製\xa7"  
##  [643] "檢署"     "檢察官"   "環境"     "大批"     "火勢"     "火警"    
##  [649] "立即"     "但是"     "事發"     "受困"     "物\xab"   "初步"    
##  [655] "是\xa7"   "派出"     "原\xa6"   "消防局"   "救援"     "搶救"    
##  [661] "詳細"     "實驗室"   "說法"     "撲滅"     "警消"     "人物"    
##  [667] "立刻"     "同\xa6"   "有關"     "老師"     "見面"     "並非"    
##  [673] "妹妹"     "拒絕"     "抱怨"     "法官"     "怎麼"     "問題"    
##  [679] "張姓"     "從事"     "情侶"     "報案"     "提到"     "無關"    
##  [685] "感情"     "萬元"     "對方"     "賴姓"     "聯繫"     "避免"    
##  [691] "辯稱"     "小孩"     "不到"     "夫妻"     "台幣"     "打算"    
##  [697] "全球"     "成\xa5"   "成員"     "老婆"     "扮演"     "男星"    
##  [703] "來說"     "家庭"     "真是"     "這個"     "這對"     "逐漸"    
##  [709] "最近"     "超級"     "電影"     "籌備"     "受到"     "基地"    
##  [715] "確\xbb"   "了解"     "人士"     "大陸"     "引發"     "本案"    
##  [721] "刑事"     "地點"     "行為"     "努力"     "希望"     "事證"    
##  [727] "兩岸"     "法務"     "爭議"     "後續"     "政治"     "涉及"    
##  [733] "涉嫌"     "留在"     "針對"     "高層"     "動員"     "國人"    
##  [739] "移民"     "這次"     "詐騙案"   "溝通"     "電信"     "遣返"    
##  [745] "暫時"     "請求"     "機制"     "辦理"     "檢方"     "大規模"  
##  [751] "小時"     "不僅"     "分享"     "充滿"     "平\xb1"   "同期"    
##  [757] "多位"     "如何"     "此\xa5"   "此次"     "至少"     "吸引"    
##  [763] "來台"     "其中"     "協商"     "狀況"     "重要"     "首度"    
##  [769] "消費者"   "海\xa5"   "除了"     "商\xab"   "唯\xa4"   "專\xb7"  
##  [775] "帶動"     "細\xb8"   "搭配"     "運用"     "需求"     "大雨"    
##  [781] "工\xb5"   "今天上午" "分鐘"     "出面"     "用戶"     "住戶"    
##  [787] "利用"     "困難"     "延遲"     "清楚"     "這麼"     "這樣"    
##  [793] "路面"     "預計"     "簡直"     "上前"     "不好"     "不\xc2"  
##  [799] "不顧"     "公\xa6"   "未料"     "危險"     "多處"     "自行"    
##  [805] "你們"     "沒想到"   "私人"     "身分"     "身體"     "車頭"    
##  [811] "事件"     "長達"     "門口"     "後來"     "派員"     "負責"    
##  [817] "記者"     "訊後依"   "送辦"     "送醫"     "酒測"     "強制"    
##  [823] "情緒"     "接受"     "接著"     "救出"     "殺人"     "清晨"    
##  [829] "等待"     "趕緊"     "範圍"     "衝突"     "錄影"     "轉身"    
##  [835] "議題"     "屬於"     "五\xa6"   "可能"     "並不"     "消防隊"  
##  [841] "送往"     "高市"     "參\xa5"   "國小"     "教\xa8"   "通報"    
##  [847] "陸續"     "當天"     "學生"     "檢查"     "懷疑"     "上午"    
##  [853] "上網"     "不同"     "公務"     "他人"     "可能性"   "企圖"    
##  [859] "判決"     "告訴"     "決定"     "依照"     "制伏"     "彼此"    
##  [865] "性愛"     "查出"     "查緝"     "甚至"     "看到"     "缺乏"    
##  [871] "堅稱"     "密集"     "專家"     "控制"     "提告"     "暗示"    
##  [877] "對象"     "與其"     "嘿咻"     "審判"     "確定"     "檢警"    
##  [883] "隱瞞"     "藉此"     "證人"     "大\xad"   "太陽"     "主演"    
##  [889] "平均"     "成績"     "收視率"   "完結篇"   "宋慧喬"   "姜暮煙"  
##  [895] "後裔"     "柳時鎮"   "要看"     "首都"     "時段"     "根本"    
##  [901] "浪漫"     "國\xb7"   "甜蜜"     "最佳"     "最終"     "創下"    
##  [907] "該\xbc"   "厲\xae"   "選\xc1"   "瞬間"     "雖然"     "韓國"    
##  [913] "夯\xbc"   "抵達"     "抱著"     "急救"     "接獲"     "救\xc5"  
##  [919] "醫生"     "可望"     "供應"     "供應鏈"   "法人"     "萬人"    
##  [925] "獲得"     "小心"     "不治"     "不慎"     "主委"     "生命"    
##  [931] "地上"     "位置"     "巡邏"     "並未"     "呼救"     "昏迷"    
##  [937] "指稱"     "研判"     "消防人員" "勘驗"     "婦人"     "提及"    
##  [943] "舒服"     "意識"     "試圖"     "跡象"     "模糊"     "頭部"    
##  [949] "以上"     "可以"     "再將"     "百萬"     "百萬元"   "近來"    
##  [955] "借貸"     "案件"     "海巡"     "通過"     "價格"     "今日"    
##  [961] "公布"     "升值"     "反而"     "立德"     "成長率"   "為止"    
##  [967] "美元"     "美元匯率" "風險"     "符合"     "最低"     "經紀"    
##  [973] "經濟"     "增\xa5"   "口袋"     "內部"     "日前"     "另\xa4"  
##  [979] "交給"     "全部"     "事實"     "明確"     "法辦"     "直到"    
##  [985] "偵結"     "貪污"     "期間"     "發放"     "資金"     "察覺"    
##  [991] "職務"     "大\xb6"   "以來"     "利率"     "退出"     "歐盟"    
##  [997] "士林"     "行動"     "住家"     "所屬"     "法庭"     "指揮"    
## [1003] "查扣"     "浩鼎案"   "執行"     "處於"     "董事長"   "預定"    
## [1009] "什麼"     "以前"     "本來"     "法\xb0"   "為何"     "相信"    
## [1015] "桃園市"   "處\xbb"   "就要"     "裁定"     "傷\xae"   "下來"    
## [1021] "千萬"     "不要"     "引用"     "他們"     "代言"     "宋仲基"  
## [1027] "形象"     "扭曲"     "近\xa6"   "軍人"     "時候"     "真的"    
## [1033] "國家"     "組織"     "廣大"     "請問"     "應該"     "賺錢"    
## [1039] "韓\xbc"   "不知"     "付出"     "代價"     "先是"     "測值"    
## [1045] "輕鬆"     "警察"     "權利"     "不\xa8"   "公尺"     "可見"    
## [1051] "失控"     "左右"     "事故"     "宣告"     "骨折"     "情況"    
## [1057] "排除"     "陷入"     "疑\xa6"   "不敢"     "可愛"     "全新"    
## [1063] "好\xa9"   "自\xbb"   "我們"     "看了"     "歌姬"     "大學"    
## [1069] "不錯"     "之前"     "父母"     "未滿"     "多歲"     "扶養"    
## [1075] "沒關係"   "來源"     "委託"     "性侵"     "做出"     "畢\xb7"  
## [1081] "被告"     "就讀"     "嘉義"     "課\xb5"   "還是"     "關係"    
## [1087] "三重"     "民宅"     "生命危險" "交流"     "全身"     "自由"    
## [1093] "攻擊"     "參與"     "還不"     "關心"     "分析師"   "成立"    
## [1099] "似乎"     "近日"     "宣傳"     "看\xb0"   "動\xa7"   "萬美元"  
## [1105] "女星"     "不\xa4"   "手上"     "就連"     "幾乎"     "工人"    
## [1111] "中山路"   "太輕"     "少\xa6"   "另\xa5"   "未經"     "死\xa4"  
## [1117] "死者"     "至今"     "花蓮"     "怎麼辦"   "殺\xae"   "理由"    
## [1123] "終身"     "提出"     "筆錄"     "當中"     "審理"     "確實"    
## [1129] "歷經"     "隨後"     "證據"     "分別"     "空間"     "強調"    
## [1135] "app"      "下手"     "上次"     "口氣"     "尤其"     "月底"    
## [1141] "比較"     "出口"     "央行"     "全\xa5"   "如果"     "存款"    
## [1147] "收到"     "有利"     "利息"     "快速"     "改善"     "角色"    
## [1153] "並且"     "例如"     "來看"     "官網"     "所謂"     "於是"    
## [1159] "股市"     "金融"     "建議"     "很大"     "是從"     "借款"    
## [1165] "效果"     "效應"     "核心"     "紛紛"     "衰退"     "帶來"    
## [1171] "得到"     "措施"     "累積"     "終於"     "貨幣政策" "這\xa4"  
## [1177] "這是"     "焦點"     "資產"     "過去"     "實在"     "銀行"    
## [1183] "寬鬆"     "歐洲"     "衝擊"     "操\xa7"   "鋼鐵"     "變成"    
## [1189] "反應"     "以下"     "多家"     "持平"     "修訂"     "逆轉"    
## [1195] "單季"     "跳空"     "代表"     "全文"     "同樣"     "而是"    
## [1201] "身上"     "或是"     "面\xc1"   "時期"     "純益"     "從小"    
## [1207] "接近"     "尋找"     "製造"     "獲利"     "人氣"     "公分"    
## [1213] "以往"     "出身"     "本季"     "受訪"     "長大"     "接下"    
## [1219] "掀\xb0"   "等於"     "新\xbc"   "話題"     "飾演"     "演藝圈"  
## [1225] "廣告"     "學校"     "二版"     "口角"     "少女"     "火爆"    
## [1231] "男友"     "走出"     "事情"     "妻子"     "糾紛"     "表情"    
## [1237] "亮點"     "酒店"     "這種"     "部隊"     "場面"     "趁機"    
## [1243] "遇到"     "臉部"     "爆發"     "大型"     "不可"     "段長"    
## [1249] "道路"     "中文"     "主持人"   "出來"     "表達"     "則是"    
## [1255] "送給"     "訪問"     "最愛"     "照顧"     "資訊"     "團體"    
## [1261] "演唱\xb7" "禮物"     "互動"     "只能"     "氣\xaa"   "記者\xb7"
## [1267] "追問"     "經\xb1"   "態度"     "演員"     "重點"     "週刊"    
## [1273] "溫暖"     "主任"     "竹崎"     "完成"     "延續"     "承諾"    
## [1279] "殺警案"   "鄉長"     "感到"     "嘉義縣"   "羈押"     "同意"    
## [1285] "此事"     "受審"     "國民"     "強行"     "組團"     "部長"    
## [1291] "對岸"     "瑩雪"     "自然"     "男人"     "身材"     "明星"    
## [1297] "姿勢"     "訓練"     "做到"     "集中"     "經紀人"   "簡單"    
## [1303] "上季"     "工\xb7"   "之間"     "本周"     "目標"     "全\xa6"  
## [1309] "在於"     "但\xb7"   "依然"     "其次"     "協調"     "政策"    
## [1315] "計劃"     "動能"     "提醒"     "零售"     "學\xb7"   "優於"    
## [1321] "壓力"     "變化"     "下降"     "不想"     "之\xa5"   "比重"    
## [1327] "父親"     "片中"     "可是"     "只有"     "只要"     "地方"    
## [1333] "如此"     "安排"     "有沒有"   "找到"     "改變"     "每個"    
## [1339] "私下"     "究竟"     "那麼"     "兩人"     "到底"     "幸福"    
## [1345] "爸爸"     "的事"     "很快"     "流傳"     "原本"     "家人"    
## [1351] "容易"     "消息"     "特殊"     "粉絲"     "討論"     "高潮"    
## [1357] "偶像"     "組成"     "喜歡"     "就算"     "幅度"     "智慧"    
## [1363] "發覺"     "意思"     "愛情"     "當然"     "精神"     "歐美"    
## [1369] "擁有"     "興趣"     "還要"     "獻上"     "大概"     "不禁"    
## [1375] "以為"     "各種"     "多個"     "存在"     "即將"     "完整"    
## [1381] "快點"     "思考"     "是不是"   "看見"     "科技"     "差不多"  
## [1387] "差異"     "消失"     "真正"     "專頁"     "控訴"     "殺手"    
## [1393] "眼\xb2"   "當\xa6"   "質疑"     "遭受"     "導演"     "下\xb6"  
## [1399] "台塑"     "出刊"     "目的"     "兇殺"     "行兇"     "林進忠"  
## [1405] "前妻"     "時報"     "對此"     "離婚"     "文章"     "印象"    
## [1411] "將在"     "水準"     "立法\xb0" "我國"     "將於"     "通\xb1"  
## [1417] "發行"     "下班"     "大哥"     "小女兒"   "不料"     "全家"    
## [1423] "刑度"     "合併"     "在家"     "有無"     "有罪"     "老闆"    
## [1429] "受\xae"   "姊妹"     "拍照"     "長女"     "附近"     "強制性"  
## [1435] "採信"     "意見"     "搜出"     "電腦"     "裸照"     "親吻"    
## [1441] "還在"     "來到"     "新台幣"   "執行長"   "腰斬"     "iphone"  
## [1447] "之\xa4"   "只\xb7"   "於今"     "相機"     "高階"     "啟動"    
## [1453] "搭載"     "龍頭"     "關\xaa"   "釋出"     "消費"     "干擾"    
## [1459] "要將"     "規定"     "衣服"     "抗拒"     "還曾"     "kobe"    
## [1465] "平靜"     "合照"     "尖叫"     "昨在"     "差點"     "退休"    
## [1471] "最好"     "緊張"     "謝謝"     "還\xb7"   "女神"     "失去"    
## [1477] "即時新聞" "見光"     "協議"     "坦言"     "拍戲"     "採取"    
## [1483] "發文"     "結局"     "賈靜雯"   "並將"     "前方"     "前\xa6"  
## [1489] "故障"     "致死"     "案發"     "過失"     "調整"     "燃燒"    
## [1495] "出貨"     "召\xb6"   "季\xb0"   "保守"     "看法"     "訂單"    
## [1501] "首季"     "高點"     "樂\xc6"   "放在"     "指控"     "班機"    
## [1507] "國民黨"   "晚間"     "生日"     "並無"     "喜愛"     "結果"    
## [1513] "電視\xbc" "澄清"     "螢幕"     "採訪"     "這位"     "幕後"    
## [1519] "上映"     "不然"     "笑說"     "編\xbc"   "千元"     "事實上"  
## [1525] "金額"     "律師"     "處分"     "違規"     "應是"     "每\xa6"  
## [1531] "明\xa6"   "南韓"     "採購"     "機構"     "簽約"     "大幅"    
## [1537] "大戰"     "引述"     "正式"     "的話"     "夏普"     "強化"    
## [1543] "實際"     "方面"     "正確"     "自我"     "兩個"     "委員"    
## [1549] "侵\xae"   "偵訊"     "陪同"     "陪伴"     "就\xb7"   "親友"    
## [1555] "robohon"  "主人"     "半\xa6"   "同\xb7"   "取代"     "亮相"    
## [1561] "管理"     "機器人"   "隨著"     "不堪"     "打擊"     "收押"    
## [1567] "每月"     "並在"     "空難"     "重大"     "帶著"     "澎湖"    
## [1573] "遭遇"     "不夠"     "日將"     "每天"     "值得"     "個性"    
## [1579] "造型"     "他說"     "危機"     "官員"     "公斤"     "系列"    
## [1585] "miss"     "身邊"     "鼓勵"     "離\xb6"   "人涉"     "大小"    
## [1591] "之下"     "立場"     "全國"     "更換"     "刻意"     "重傷"    
## [1597] "飛機"     "能見度"   "航空"     "荊元武"   "馬公"     "高雄"    
## [1603] "高勤官"   "接到"     "塔台"     "督導"     "違法"     "管制"    
## [1609] "輕傷"     "機場"     "辦法"     "錄音"     "下車"     "大結局"  
## [1615] "不再"     "白色"     "任務"     "迎接"     "金秀賢"   "星星"    
## [1621] "看著"     "時鎮"     "障礙"     "擁抱"     "短短"     "人力"    
## [1627] "女性"     "背景"     "連結"     "策略"     "旗下"     "數\xa6"  
## [1633] "出道"     "首次"     "舞台"     "明慧"     "套房"     "留下"    
## [1639] "隊長"     "比例"     "好友"     "求救"     "車子"     "泊霖"    
## [1645] "這\xb0"   "彰化"     "鄰\xa9"   "鐵\xac"   "打擾"     "家屬"    
## [1651] "買賣"     "當場"     "劉姓"     "雙手"     "警局"     "今天下午"
## [1657] "兄弟"     "交保"     "其餘"     "屏東"     "搶走"     "優勢"    
## [1663] "縱火"     "出庭"     "協\xb7"   "委員\xb7" "股票"     "馬上"    
## [1669] "基金"     "基金\xb7" "帳戶"     "傍晚"     "調查局"   "調處"    
## [1675] "剛好"     "解釋"     "下滑"     "主席"     "創辦人"   "減少"    
## [1681] "經理人"   "如今"     "轉而"     "未遂"     "見到"     "屋內"    
## [1687] "眼睛"     "聊天"     "痛\xbd"   "路邊"     "質問"     "鑑識"    
## [1693] "穿著"     "文書"     "出爐"     "史上"     "多\xb0"   "有期\xae"
## [1699] "完畢"     "最重"     "詐欺"     "幫助"     "有待"     "車身"    
## [1705] "工具"     "不猶豫"   "預告"     "每次"     "相同"     "獨自"    
## [1711] "不等"     "自已"     "即可"     "創立"     "獎金"     "以後"    
## [1717] "色狼"     "伸出"     "判處"     "衍生"     "旁邊"     "賠償"    
## [1723] "貼心"     "失敗"     "的確"     "原諒"     "痛哭"     "感覺"    
## [1729] "新片"     "放棄"     "創\xb7"   "跨國"     "婚後"     "公庫"    
## [1735] "落實"     "聯盟"     "大隊"     "引擎"     "手法"     "涉案"    
## [1741] "第二"     "分行"     "指示"     "停止"     "進口"     "聲請"    
## [1747] "吸毒"     "黑色"     "辛苦"     "速度"     "預測"     "引\xb0"  
## [1753] "心臟病"   "台人"     "死掉"     "官方"     "情形"     "推動"    
## [1759] "通緝"     "測謊"     "分隊"     "消防"     "轎車"     "大眾"    
## [1765] "女嘿"     "太閒"     "可怕"     "交代"     "偽造"     "偷拍"    
## [1771] "副\xc1"   "這句話"   "這件"     "揭媽"     "貴婦人"   "摩鐵約"  
## [1777] "邀請"     "雙碩士"   "議員"     "主張"     "以此"     "出發"    
## [1783] "立委"     "立\xb0"   "法制"     "泰國"     "送\xa6"   "無罪"    
## [1789] "新聞稿"   "管轄權"   "熱議"     "質詢"     "行車"     "狀態"    
## [1795] "停車"     "球棒"     "賓士"     "赫然"     "印尼"     "承辦"    
## [1801] "相較"     "掃蕩"     "大巨蛋"   "文康"     "經理"     "不足"    
## [1807] "制度"     "限制"     "庫存"     "終止"     "義務"     "不幸"    
## [1813] "民報"     "安裝"     "自首"     "見狀"     "前後"     "整天"    
## [1819] "臉上"     "鐵路"     "人以"     "出生"     "有些"     "真人秀"  
## [1825] "慢慢"     "模樣"     "露臉"     "費用"     "看出"     "告別"    
## [1831] "傳\xa9"   "網路上"   "人選"     "中信"     "接手"     "辜仲諒"  
## [1837] "精彩"     "遠雄"     "行政\xb0" "宜康"     "予以"     "平日"    
## [1843] "平時"     "壓制"     "男女"     "公務員"   "萬餘元"   "電氣化"  
## [1849] "電務"     "標案"     "謝姓"     "龐大"     "驗收"     "趕到"    
## [1855] "支付"     "永遠"     "錯\xbb"   "天王"     "金管\xb7" "第三"    
## [1861] "專案"     "驚呼"     "局長"     "複雜"     "董事"     "分析"    
## [1867] "cef"      "流\xb5"   "高度"     "經驗"     "主角"     "轉折"    
## [1873] "控告"     "合成"     "等候"     "應用"     "大廠"     "整合"    
## [1879] "人民"     "半\xa9"   "馬路"     "地區"     "規格"     "智慧型"  
## [1885] "穩定"     "使用者"   "王姓"     "承\xbb"   "藍色"     "接觸"    
## [1891] "此案"     "人口"     "競爭"     "上周"     "上市公司" "國\xa5"  
## [1897] "打擊犯罪" "累計"     "別人"     "戲\xbc"   "痛批"     "為主"    
## [1903] "特地"     "裝置"     "趨勢"     "雲端"     "走路"     "宏達"    
## [1909] "重視"     "何時"     "運動"     "相對"     "bra"      "正面"    
## [1915] "珍妮佛"   "該片"     "驚喜"     "游姓"     "賴男"     "帶走"    
## [1921] "李女"     "此時"     "監控"     "向上"     "收賄"     "聯手"    
## [1927] "發動"     "傳統"     "輕人"     "成熟"     "評論"     "身\xa4"  
## [1933] "房內"     "租客"     "想到"     "違建"     "htc"      "慶祝"    
## [1939] "受惠"     "陳志朋"   "整容"     "客人"     "巴基斯坦" "妻小"    
## [1945] "由法廣"   "法廣"     "數量"     "嫌疑人"   "政部"
dtm.mat <- as.matrix(dtm.remove)


convert_count <- function(e){
  x <- ifelse(e > 0, 1, 0)
  x <- factor(x, levels = c(0,1), labels = c('no', 'yes'))
  return(x)
}

dtm.count  <- apply(dtm.mat, MARGIN=2, convert_count)
dim(dtm.count)
## [1]  428 1949
class(dtm.count)
## [1] "matrix"
m <- as.data.frame(dtm.count)
#View(m)

idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx == 1, ]
testset  <- m[idx == 2, ]
traintag <- apple.subset[idx==1, 'category']
testtag  <- apple.subset[idx==2, 'category']

traintitle <- apple.subset[idx==1, 'title']
testtitle <- apple.subset[idx==2, 'title']


library(e1071)
model <- naiveBayes(trainset, as.factor(traintag))
pred  <- predict(model, testset)
tb    <- table(testtag, pred)

library(caret)
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
## 
##        pred
## testtag 社會 娛樂 財經
##    社會   56    1    1
##    娛樂    1   23    0
##    財經    1    1   29
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9558          
##                  95% CI : (0.8998, 0.9855)
##     No Information Rate : 0.5133          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9283          
##  Mcnemar's Test P-Value : 0.8013          
## 
## Statistics by Class:
## 
##                      Class: 社會 Class: 娛樂 Class: 財經
## Sensitivity               0.9655      0.9200      0.9667
## Specificity               0.9636      0.9886      0.9759
## Pos Pred Value            0.9655      0.9583      0.9355
## Neg Pred Value            0.9636      0.9775      0.9878
## Prevalence                0.5133      0.2212      0.2655
## Detection Rate            0.4956      0.2035      0.2566
## Detection Prevalence      0.5133      0.2124      0.2743
## Balanced Accuracy         0.9646      0.9543      0.9713
# Verify Predictions
testtag2 <- factor(testtag, levels = c('社會', '娛樂','財經'))
idx2 <- testtag2 != pred
df <- as.data.frame(cbind(testtitle[idx2], as.character(testtag2[idx2]), as.character(pred[idx2])))
View(df)
m1 <- matrix(1:9, nrow = 3)
m1
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
apply(m1, MARGIN=2, FUN = sum)
## [1]  6 15 24
m2 <- dtm.mat[1:3, 1:3]

convert_count <- function(e){
  x <- ifelse(e > 0, 1, 0)
  x <- factor(x, levels = c(0,1), labels = c('no', 'yes'))
  return(x)
}

m3 <- apply(m2, MARGIN=2, convert_count)
as.table(m3)
##     Terms
## Docs 口罩 已經 不已
##    1 yes  yes  yes 
##    2 no   no   no  
##    3 no   no   no

Time Series Analysis

download.file('https://raw.githubusercontent.com/ywchiu/rcookbook/master/chapter10/tw2330_finance.csv', 'tw2330_finance.csv')
tw2330 <- read.csv('tw2330_finance.csv', header=TRUE)
View(tw2330)
m <- ts(tw2330$Total.Income, frequency = 4, start = c(2008,1))
class(m)
## [1] "ts"
m.sub <- window(m, start=c(2008,1), end = c(2014,4))
components <- decompose(m.sub)
names(components)
## [1] "x"        "seasonal" "trend"    "random"   "figure"   "type"
plot(components)

#install.packages('fpp')
library(fpp)
## Loading required package: forecast
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: tseries
fit <- hw(m.sub, seasonal = 'additive' )
plot(fit)