download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/oneday.csv', 'oneday.csv')
library(readr)
oneday <- read_csv("~/oneday.csv")
## Parsed with column specification:
## cols(
## url = col_character(),
## title = col_character(),
## content = col_character(),
## source = col_character(),
## date = col_integer(),
## author = col_character()
## )
library(jiebaR)
## Loading required package: jiebaRD
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
oneday.seg <- sapply(oneday$content, function(e)segment(e, jiebar = mixseg))
tb <- table(unlist(oneday.seg))
chinese_filter <- sapply(names(tb), function(e)regexpr(text = e, pattern = '[\u4e00-\u9fa5]{1,}' )
)
res <- tb[(tb >= 50) & (nchar(names(tb) ) >=2) & (chinese_filter == 1) ]
library(wordcloud2)
?wordcloud2
## starting httpd help server ...
## done
wordcloud2(res, shape = 'star')
中文篩選
regexpr(text = 'apple', pattern = 'b' )
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
regexpr(text = 'apple', pattern = '[a-z]' )
## [1] 1
## attr(,"match.length")
## [1] 1
## attr(,"useBytes")
## [1] TRUE
regexpr(text = 'apple', pattern = '[a-z]{3,6}' )
## [1] 1
## attr(,"match.length")
## [1] 5
## attr(,"useBytes")
## [1] TRUE
regexpr(text= 'apple', pattern= '[\u4e00-\u9fa5]{1,}')
## [1] -1
## attr(,"match.length")
## [1] -1
regexpr(text= '蘋果', pattern= '[\u4e00-\u9fa5]{1,}')
## [1] 1
## attr(,"match.length")
## [1] 2
Using rpart
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length+ Petal.Width, data = iris )
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin=0.1)
text(fit)

plot(iris$Petal.Length , iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col="orange")
abline(h = 1.75, col="blue")
#install.packages('party')
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich

fit <- ctree(Species ~ Sepal.Length + Sepal.Width + Petal.Length+ Petal.Width, data = iris )
plot(fit)

驗證模型
label <- iris[,5]
iris_data <- iris[,1:4]
tb <- table(predict(fit, iris_data, type="response"), label)
#install.packages('caret')
#install.packages('e1071')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## label
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9800 0.9000
## Specificity 1.0000 0.9500 0.9900
## Pos Pred Value 1.0000 0.9074 0.9783
## Neg Pred Value 1.0000 0.9896 0.9519
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3600 0.3067
## Balanced Accuracy 1.0000 0.9650 0.9450
#sample.int(42,6)
set.seed(123)
idx <- sample.int(2, nrow(iris), replace = TRUE, prob = c(0.7,0.3))
table(idx)
## idx
## 1 2
## 106 44
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
dim(trainset)
## [1] 106 5
dim(testset)
## [1] 44 5
fit2 <- rpart(Species ~ ., data = trainset)
plot(fit2, margin = 0.1)
text(fit2)

plot(iris$Petal.Length , iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col="orange")
abline(v = 4.75, col="blue")

#testset[, 1:4]
cm <- table(predict(fit2, testset[, -5], type='class'), testset[,5])
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 1
## virginica 0 4 14
##
## Overall Statistics
##
## Accuracy : 0.8864
## 95% CI : (0.7544, 0.9621)
## No Information Rate : 0.3409
## P-Value [Acc > NIR] : 8.552e-14
##
## Kappa : 0.8291
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.7143 0.9333
## Specificity 1.0000 0.9667 0.8621
## Pos Pred Value 1.0000 0.9091 0.7778
## Neg Pred Value 1.0000 0.8788 0.9615
## Prevalence 0.3409 0.3182 0.3409
## Detection Rate 0.3409 0.2273 0.3182
## Detection Prevalence 0.3409 0.2500 0.4091
## Balanced Accuracy 1.0000 0.8405 0.8977
(15 + 10 + 14) / 44
## [1] 0.8863636
客戶流失分析
#install.packages('C50')
library(C50)
data(churn)
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
churnTrain <- churnTrain[ ,! names(churnTrain) %in% c("state", "area_code", "account_length") ]
names(churnTrain)
## [1] "international_plan" "voice_mail_plan"
## [3] "number_vmail_messages" "total_day_minutes"
## [5] "total_day_calls" "total_day_charge"
## [7] "total_eve_minutes" "total_eve_calls"
## [9] "total_eve_charge" "total_night_minutes"
## [11] "total_night_calls" "total_night_charge"
## [13] "total_intl_minutes" "total_intl_calls"
## [15] "total_intl_charge" "number_customer_service_calls"
## [17] "churn"
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace = TRUE, prob=c(0.7,0.3))
trainset <- churnTrain[idx == 1, ]
testset <- churnTrain[idx == 2, ]
churn.rp <- rpart(churn ~., data = churnTrain)
plot(churn.rp, margin= 0.1)
text(churn.rp)

predictions <- predict(churn.rp, testset, type='class')
tb <- table(testset$churn, predictions)
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## predictions
## yes no
## yes 110 31
## no 11 866
##
## Accuracy : 0.9587
## 95% CI : (0.9446, 0.9701)
## No Information Rate : 0.8811
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8162
## Mcnemar's Test P-Value : 0.00337
##
## Sensitivity : 0.9091
## Specificity : 0.9654
## Pos Pred Value : 0.7801
## Neg Pred Value : 0.9875
## Prevalence : 0.1189
## Detection Rate : 0.1081
## Detection Prevalence : 0.1385
## Balanced Accuracy : 0.9373
##
## 'Positive' Class : yes
##
library(caret)
control <-trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model
## CART
##
## 2315 samples
## 16 predictor
## 2 classes: 'yes', 'no'
##
## Pre-processing: scaled (16)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2083, 2083, 2083, 2084, 2084, 2084, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05555556 0.8994954 0.5143502
## 0.07456140 0.8652302 0.2498564
## 0.07602339 0.8584636 0.1772128
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05555556.
table(predict(model, testset), testset$churn)
##
## yes no
## yes 64 10
## no 77 867
head(predict(churn.rp, testset, type='class'))
## 2 5 6 8 13 16
## no no no no no yes
## Levels: yes no
ROC Curve
library(rpart)
library(caret)
churn.rp <- rpart(churn ~., data = churnTrain)
predictions <- predict(churn.rp, testset, type='prob')
roc_x <- c(0)
roc_y <- c(0)
for(i in seq(0.01,1,0.01)){
res <- as.factor(ifelse(predictions[,1] >= i, 'yes', 'no'))
res <- factor(res,levels(res)[c(2,1)])
tb <- table(testset$churn, res)
cm <- confusionMatrix(tb)
x <- 1 - cm$byClass[2]
y <- cm$byClass[1]
roc_x <- c(roc_x, x)
roc_y <- c(roc_y, y)
}
roc_x <- c(roc_x, 1)
roc_y <- c(roc_y, 1)
plot(roc_x,roc_y, type='b', xlim=c(0,1), ylim=c(0,1))

使用ROCR
#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1]
pred.rocr <- prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])]))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

RMiner
#install.packages("rminer")
library(rminer)
##
## Attaching package: 'rminer'
## The following object is masked from 'package:party':
##
## fit
## The following object is masked from 'package:modeltools':
##
## fit
model <- fit(churn~.,trainset,model="svm")
VariableImportance <- Importance(model,trainset,method="sensv")
L <- list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

使用R 計算距離
x = c(0, 0, 1, 1, 1, 1)
y = c(1, 0, 1, 1, 0, 1)
dist(rbind(x,y), method = "euclidean")
## x
## y 1.414214
sqrt(sum((x - y) ^2))
## [1] 1.414214
dist(rbind(x,y), method = "manhattan")
## x
## y 2
sum(abs(x - y))
## [1] 2
使用hclust 做iris 分群
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)

plot(iris$Petal.Length, iris$Petal.Width)

str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
iris_data <- iris[, -5]
iris_dist <- dist(iris_data, method = 'euclidean')
hc <- hclust(iris_dist, method = 'ward.D2')
plot(hc, hang=-1, cex = 0.7)

fit <- cutree(hc, k = 3)
fit
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
table(fit)
## fit
## 1 2 3
## 50 64 36
plot(iris$Petal.Length, iris$Petal.Width)

par(mfrow=c(1,2))
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species, main= "real answer")
plot(iris$Petal.Length, iris$Petal.Width, col=fit, main = "clustering result")

plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3 , border="red")

文章分群
a <- c(1, 1, 1, 1, 1, 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1)
b <- c(1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
dist(rbind(a,b), method = 'euclidean')
## a
## b 3.872983
a1 <- c(1, 1, 1, 1, 1, 1 ,1 )
b1 <- c(1, 1, 0, 0, 0, 0, 0)
dist(rbind(a1,b1), method = 'euclidean')
## a1
## b1 2.236068
library(proxy)
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
proxy::dist(rbind(a,b), method = 'cosine')
## a
## b 0.6570028
proxy::dist(rbind(a1,b1), method = 'cosine')
## a1
## b1 0.4654775
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, mixseg))
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
apple.corpus <- Corpus(VectorSource(apple.seg))
apple.dtm <- DocumentTermMatrix(apple.corpus)
dim(apple.dtm)
## [1] 1500 39206
dtm.remove <- removeSparseTerms(apple.dtm, 0.99)
apple.dist <- proxy::dist(as.matrix(dtm.remove), method='cosine')
dtm.mat <- as.matrix(apple.dist)
applenews$title[9]
## [1] "【熊本強震】取消去九州 華航5月8日前退改票免手續費"
applenews$title[order(dtm.mat[9,])[1:10]]
## [1] "【熊本強震】取消去九州 華航5月8日前退改票免手續費"
## [2] "熊本強震!旅遊交通資訊懶人包"
## [3] "【熊本強震】九州團恐掉1到3成 旅行團改行程避走熊本"
## [4] "日熊本強震 燦星:本月20日前的旅客全額退費"
## [5] "【熊本強震】取消去九州 KKday4月底前可全額退費"
## [6] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
## [7] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
## [8] "九州地震 封測廠日月光、矽品無影響"
## [9] "易遊網宣布 今明九州旅遊團全額退費"
## [10] "九州為半導體重鎮 強震後供應成市場焦點"
hc <- hclust(apple.dist, method = 'ward.D2')
plot(hc, hang = -0.01)
rect.hclust(hc, k = 20)

fit <- cutree(hc, k=20)
table(fit)
## fit
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 93 45 345 50 65 301 64 20 78 83 74 24 21 50 48 40 61 12
## 19 20
## 14 12
applenews$title[fit == 10]
## [1] "陸委會跨部會議確認 下周登陸展開肯亞案協商"
## [2] "【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA"
## [3] "【民報】馬8年政績 外媒:台灣人被北京納入治外法權"
## [4] "馬國50台人將遣中 我代表處動員馬國高層友人協助"
## [5] "【肯亞案】為何強擄台嫌? 陸警:台灣判太輕"
## [6] "【肯亞案】電信詐騙台嫌分贓手法 陸媒大揭密"
## [7] "中國人為何容易被騙"
## [8] "肯亞案裡極藍極綠的民粹氛圍"
## [9] "520後將凸顯「一個中華人民共和國」"
## [10] "520後將凸顯「一個中華人民共和國」"
## [11] "自由與專制對撞的時刻"
## [12] "【肯亞案】立院朝野發表共同聲明 批中國侵害我國主權"
## [13] "【肯亞案】詐騙台人央視認罪:傷天害理的報應"
## [14] "【財訊】根留台灣好公司 惦惦賺很大"
## [15] "【更新】大馬50台人再被遣中? 陸委會:協商中"
## [16] "肯亞案確定組團赴中 羅瑩雪:最快下周一出發"
## [17] "52台人涉電信詐騙被逮 我與中國搶人回台"
## [18] "【台灣英文新聞】詐騙案讓台灣無光"
## [19] "大馬50台人將被遣中 網友這樣說"
## [20] "【更新】黃國昌爆料:馬來西亞50台人 將被遣返到中國"
## [21] "馬國50台人將被遣返到中國 刑事局證實:正與中國溝通"
## [22] "趙少康:台灣騙子滿天下"
## [23] "我遇到了詐欺"
## [24] "【法廣RFI】陳一新:點評520前夕的兩岸動態"
## [25] "【法廣RFI】肯亞案45台灣人均被拘北京海淀"
## [26] "司法院最新統計 9成詐欺犯判刑不到1年"
## [27] "【公庫】台灣社:譴責馬政府軟弱面對中國非法綁架台灣人事件!"
## [28] "【法廣RFI】李顯龍:民進黨執政下九二共識更模<U+7173>"
## [29] "羅瑩雪再創金句 「泰國人只會講泰國話」"
## [30] "那些年跨國逮回的詐騙集團 後來怎麼了"
## [31] "【肯亞案】跨部會專案會議明召開 討論赴中交涉事宜"
## [32] "管轄權爭議 陳其邁酸:台獨去肯亞會被抓嗎"
## [33] "【民報】外媒:殷鑑迫近!中國今日綁架台灣人,明日綁架異議人士"
## [34] "一再騙錢抓不怕 詐欺慣犯又吸金4千萬"
## [35] "他說肯亞案無關政治 「當然要遣返大陸」"
## [36] "連警大教授都被騙!台灣詐集團真的好神"
## [37] "【有片】為肯亞案互槓 段宜康、羅瑩雪火爆對嗆"
## [38] "台灣四月雪─台灣的主權、法治在下雪!"
## [39] "【更新】詐騙產業公會感謝聲明 羅瑩雪收到了"
## [40] "《環時》再談肯亞案:蔡英文勿要挾大陸"
## [41] "【央廣RTI】肯亞案 學者:兩岸互信才有助互動"
## [42] "「肯亞案」為何爆發搶人大戰 他說出真正原因"
## [43] "金立群:會有辦法處理台灣加入亞投行"
## [44] "遠見民調 五成民眾認為520後和平協議會讓兩岸更好"
## [45] "肯亞案:蔡英文與習大大可以做的事"
## [46] "蔡正元轟肯亞案 「史上最丟臉案件」"
## [47] "台灣大戰略芻議"
## [48] "詐欺刑責太輕? 張善政指示研議修法"
## [49] "未來不管兩岸經貿? 國發會:本就不是職掌"
## [50] "【更新】李顯龍稱台灣與中國衝突將被孤立 外長:無此問題"
## [51] "<U+200B>陸官媒談肯亞案 要台灣「給自己留點臉」"
## [52] "<U+200B>亞投行長:有辦法處理台灣加入問題"
## [53] "蔡英文就職演說 近6成民眾認為不該提同屬一中"
## [54] "星國總理:台灣人明白 與中國衝突會被孤立"
## [55] "【更新】中國公布受害者數字 夏立言:拿出證據證明不是說說"
## [56] "【肯亞案】邱太三自爆:中方曾發簡訊 不要在台上吵"
## [57] "【更新】詐欺犯回台罪太輕? 羅瑩雪打算這麼做"
## [58] "看完就知多嚴重 台灣人你為何不生氣?"
## [59] "就因為這理由!他告訴你「為何人渣都愛台灣」"
## [60] "肯亞律師:肯亞違法遣送台灣人到中國"
## [61] "肯亞將台灣人遣送中國 美國最新回應"
## [62] "【法廣RFI】肯亞案:北京高調遣送 學者稱短期難返台"
## [63] "四月十四日各報頭條搶先報"
## [64] "【國際為什麼】跨國電信詐騙以台灣嫌犯居多?"
## [65] "民進黨要看懂肯亞案"
## [66] "江春男:遠離非洲"
## [67] "法務部錯亂的管轄概念"
## [68] "【法廣RFI】國台辦:堅決法辦肯亞詐騙台嫌犯"
## [69] "【肯亞案】45台人遣中 陸委會:尚無掌握詐欺相關犯罪資訊"
## [70] "台嫌遭押送中國案 陸官媒:有人藉此煽動民粹"
## [71] "肯亞舉債6億美元 都靠中國相借"
## [72] "【法廣RFI】肯亞強遣台嫌回陸 目的何在?"
## [73] "【法廣RFI】印尼步肯亞後塵 台緊急行動防遣送陸"
## [74] "肯亞案 法務部:台灣有管轄權"
## [75] "馬政府還要鴕鳥到何時?"
## [76] "中國外交部談肯亞案台人:被開釋並非無罪"
## [77] "【肯亞案】被中國帶走的台人 有2人是通緝犯"
## [78] "【肯亞案】組團赴中 羅瑩雪:本來今天要出發"
## [79] "【肯亞案】組團赴中 羅瑩雪:本來今天要出發"
## [80] "葉毓蘭再嗆肯亞詐騙疑犯 要媒體將心比心"
## [81] "【民報】類肯亞案 黃國昌爆:法務部通報中國到海外抓台灣人"
## [82] "BBC:蔡英文蜜月期結束 北京拿肯亞案施壓"
## [83] "台嫌被強押送中國 CNN頭版關注"
文章分類
str(applenews)
## 'data.frame': 1500 obs. of 5 variables:
## $ content : chr "(更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台中市一名黃姓男子覬覦頭獎高達2600萬的「開門見喜」刮刮樂彩券,上月佯"| __truncated__ "澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片,還大肆批評留澳學生是一群「留學豬」。消息傳出後,這名博士生立"| __truncated__ "【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明的租屋族嗎? 由蘋果地產與FBS TV合作的全新節目-房市大追擊,本集節"| __truncated__ "本內容由中央廣播電臺提供<U+00A0><U+00A0> <U+00A0> <U+00A0> <U+00A0>美國國防部長卡特(Ash Carter)今天(15日)表示,他今天將前往在菲"| __truncated__ ...
## $ title : chr "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄" "拿到澳洲護照後 他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演 美防長南海登艦" ...
## $ dt : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
## $ category: chr "社會" "國際" "地產" "國際" ...
## $ view_cnt: chr "1754" "0" "0" "0" ...
apple.subset <- applenews[applenews$category %in% c('社會', '財經', '娛樂'), ]
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(apple.subset$content, function(e) segment(e, jiebar = mixseg))
library(tm)
doc <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(doc, control = list(removePunctuation=TRUE, removeNumbers=TRUE ))
dtm.remove <- removeSparseTerms(dtm, 0.99)
dim(dtm.remove)
## [1] 428 1949
dtm.remove$dimnames$Terms
## [1] "口罩" "已經" "不已" "之際" "分局" "友人"
## [7] "方式" "方法" "只是" "台中市" "犯罪" "全案"
## [13] "更新" "男子" "車主" "佯稱" "使用" "供稱"
## [19] "到案" "坦承" "店家" "表示" "昨天" "昨日"
## [25] "派出所" "紅色" "要求" "計畫" "員警" "埋伏"
## [31] "拿出" "逃逸" "追查" "高達" "售價" "移送"
## [37] "通知" "透過" "部分" "報導" "提供" "無\xa9"
## [43] "無法" "畫面" "發\xb2" "發稿" "詐欺罪" "逮捕"
## [49] "進入" "嫌犯" "想要" "損失" "新增" "當下"
## [55] "經過" "路口" "監視" "監視器" "說明" "影片"
## [61] "數字" "調查" "調\xbe" "機車" "獲報" "聯絡"
## [67] "購買" "轄區" "擴大" "翻攝" "鎖定" "鏡頭"
## [73] "警方" "顧客" "大量" "不用" "不得" "中午"
## [79] "今天" "今\xa6" "公司" "公\xb6" "主動" "主管"
## [85] "司法" "台\xa5" "未來" "正\xb1" "由於" "交付"
## [91] "任何" "再度" "合法" "自信" "告知" "技術"
## [97] "投資人" "求償" "依據" "來自" "協助" "呼\xc6"
## [103] "法務部" "秉持" "非\xb1" "保障" "前往" "指出"
## [109] "政府" "查證" "相關" "研究" "研發" "美國"
## [115] "原則" "員工" "時間" "案情" "浩鼎" "特定"
## [121] "真相" "秘密" "訊息" "配合" "偵查" "國內"
## [127] "基於" "從未" "情事" "產\xb7" "第\xa4" "這些"
## [133] "連續" "單位" "報告" "媒體" "揭露" "發布"
## [139] "發表" "發展" "搜索" "資料" "對\xa5" "影響"
## [145] "數據" "導致" "獨家" "積極" "隨意" "檢調"
## [151] "營\xb7" "營運" "聲明" "釐清" "嚴重" "權益"
## [157] "驚訝" "上訴" "女子" "女兒" "女童" "不\xc4"
## [163] "今依" "另名" "犯行" "再次" "同\xa9" "地\xb0"
## [169] "考量" "住處" "判刑" "身為" "兒童" "和解"
## [175] "始終" "保\xc5" "前科" "故意" "個人" "時\xb3"
## [181] "粉絲團" "陪審團" "猥褻" "給予" "新\xa5" "新聞"
## [187] "達成" "滿足" "審酌" "寫下" "熟睡" "請看"
## [193] "蘋果" "中間" "內容" "支持" "支援" "方案"
## [199] "市佔率" "市場" "平台" "打造" "民眾" "同時"
## [205] "合\xa7" "地位" "安全" "成本" "成長" "收視"
## [211] "有效" "而言" "系統" "事\xb7" "亞洲" "具有"
## [217] "服務" "肯定" "近期" "長期" "宣布" "客戶"
## [223] "持續" "挑戰" "相當" "重新" "面對" "根據"
## [229] "特別" "能力" "能\xa7" "推出" "採用" "產\xab"
## [235] "規劃" "規模" "軟體" "這項" "最大" "最新"
## [241] "創新" "提升" "晶片" "超過" "進\xa4" "達到"
## [247] "電視" "預估" "預期" "團隊" "夥伴" "對於"
## [253] "維持" "億美元" "廠商" "影響力" "數位" "整體"
## [259] "機\xb7" "選擇" "營收" "攀升" "證明" "關鍵"
## [265] "競爭力" "體驗" "台灣" "看好" "國際" "授權"
## [271] "透露" "進行" "媽媽" "準備" "演出" "bmw"
## [277] "com" "人員" "中心" "今早" "方向" "以及"
## [283] "出版" "台大" "市府" "正在" "休息" "全都"
## [289] "多名" "行經" "完全" "投訴" "沒有" "車上"
## [295] "車道" "車禍" "車輛" "其他" "取得" "受傷"
## [301] "幸運" "所幸" "法律" "直接" "直擊" "社\xb7"
## [307] "後方" "活動" "突發" "乘客" "記得" "追撞"
## [313] "基隆" "救人" "責任" "造成" "最高" "發生"
## [319] "毀損" "照片" "當時" "路旁" "疑似" "緊急"
## [325] "肇事" "價值" "撞擊" "駕駛" "駕駛座" "隨即"
## [331] "幫忙" "騎士" "蘋果日報" "歡迎" "聽到" "讀者"
## [337] "中國" "台籍" "肯亞" "冠軍" "娛樂中心" "詐騙"
## [343] "集團" "綜合" "遣送" "人家" "下午" "已有"
## [349] "不\xb7" "司機" "只好" "必須" "打\xb6" "打電話"
## [355] "多次" "老翁" "而且" "坐在" "求助" "汽車"
## [361] "車內" "車門" "到場" "返\xa6" "返家" "前天"
## [367] "突然" "紀錄" "原來" "堅持" "帶\xa6" "這名"
## [373] "最後" "詢問" "需要" "銷售" "離去" "警員"
## [379] "九\xa6" "大約" "工廠" "日本" "主要" "半導體"
## [385] "生產" "目前" "全面" "地\xbe" "投資" "每日"
## [391] "明顯" "昨晚" "面板" "停車場" "產值" "疏散"
## [397] "統計" "當地" "運\xa7" "電報導" "證實" "人生"
## [403] "大家" "不能" "心情" "日子" "令人" "平安"
## [409] "全台" "各自" "地說" "安慰" "早上" "我\xb7"
## [415] "那個" "所有" "朋友" "東西" "知名" "勇敢"
## [421] "凌晨" "祝福" "婚姻" "崩潰" "這段" "曾經"
## [427] "然而" "痛苦" "感動" "感嘆" "感謝" "愛女"
## [433] "說話" "遭到" "整個" "整理" "臉書" "還有"
## [439] "藝人" "難過" "不是" "不滿" "心生" "手槍"
## [445] "主嫌" "犯案" "安非他命" "改造" "李姓" "林姓"
## [451] "金錢" "查獲" "毒\xab" "家中" "偵辦" "晚上"
## [457] "條例" "被\xae" "報警" "債務" "違反" "電話"
## [463] "彰化縣" "綽號" "雙方" "攜帶" "大發" "女友"
## [469] "公社" "手機" "月初" "月間" "出去" "去\xa6"
## [475] "台中" "先生" "地檢署" "自己" "妨\xae" "事後"
## [481] "所以" "恐\xc0" "桃園" "將他" "得知" "散布"
## [487] "罪嫌" "對話" "網站" "親密" "爆料" "出\xb2"
## [493] "孕\xa8" "交往" "多\xa6" "成為" "老公" "拍攝"
## [499] "洛杉磯" "結婚" "傳出" "遲遲" "懷孕" "下半\xa6"
## [505] "下周" "上漲" "大立光" "不大" "不如" "不佳"
## [511] "不過" "之後" "反映" "反彈" "台股" "台積電"
## [517] "布局" "企\xb7" "同步" "多少" "收斂" "早盤"
## [523] "至於" "使得" "法說" "物料" "股價" "股權"
## [529] "表\xb2" "指數" "展望" "族群" "提前" "電子"
## [535] "零組件" "億元" "盤中" "翻紅" "雙雙" "顯示"
## [541] "上市" "大漲" "公告" "交易" "收購" "低於"
## [547] "每股" "周二" "股東" "恢\xb4" "香港" "截至"
## [553] "網路" "歷史" "十分" "上車" "出席" "台南"
## [559] "母親" "好好" "兒子" "具體" "果然" "很多"
## [565] "婆婆" "竟然" "麻煩" "就是" "想法" "路上"
## [571] "睡覺" "談到" "覺得" "女方" "工\xa7" "分手"
## [577] "主持" "出門" "本月" "生活" "男性" "迅速"
## [583] "周刊" "知道" "計師" "展\xb6" "新歡" "擔任"
## [589] "頻頻" "曝光" "大礙" "不少" "不適" "今晚"
## [595] "公益" "比賽" "先前" "有人" "完美" "更好"
## [601] "其實" "幸好" "直呼" "表演" "孩子" "音樂"
## [607] "留言" "能夠" "條件" "期待" "登場" "結束"
## [613] "順利" "微博" "意\xa5" "慈善" "歌迷" "網友"
## [619] "播出" "學習" "擔心" "錄製" "聲音" "醫\xb0"
## [625] "丈夫" "不但" "公訴" "文件" "申請" "依法"
## [631] "負責人" "送到" "健康" "產生" "處理" "販售"
## [637] "貨車" "提\xb0" "然後" "經營" "過\xb5" "製\xa7"
## [643] "檢署" "檢察官" "環境" "大批" "火勢" "火警"
## [649] "立即" "但是" "事發" "受困" "物\xab" "初步"
## [655] "是\xa7" "派出" "原\xa6" "消防局" "救援" "搶救"
## [661] "詳細" "實驗室" "說法" "撲滅" "警消" "人物"
## [667] "立刻" "同\xa6" "有關" "老師" "見面" "並非"
## [673] "妹妹" "拒絕" "抱怨" "法官" "怎麼" "問題"
## [679] "張姓" "從事" "情侶" "報案" "提到" "無關"
## [685] "感情" "萬元" "對方" "賴姓" "聯繫" "避免"
## [691] "辯稱" "小孩" "不到" "夫妻" "台幣" "打算"
## [697] "全球" "成\xa5" "成員" "老婆" "扮演" "男星"
## [703] "來說" "家庭" "真是" "這個" "這對" "逐漸"
## [709] "最近" "超級" "電影" "籌備" "受到" "基地"
## [715] "確\xbb" "了解" "人士" "大陸" "引發" "本案"
## [721] "刑事" "地點" "行為" "努力" "希望" "事證"
## [727] "兩岸" "法務" "爭議" "後續" "政治" "涉及"
## [733] "涉嫌" "留在" "針對" "高層" "動員" "國人"
## [739] "移民" "這次" "詐騙案" "溝通" "電信" "遣返"
## [745] "暫時" "請求" "機制" "辦理" "檢方" "大規模"
## [751] "小時" "不僅" "分享" "充滿" "平\xb1" "同期"
## [757] "多位" "如何" "此\xa5" "此次" "至少" "吸引"
## [763] "來台" "其中" "協商" "狀況" "重要" "首度"
## [769] "消費者" "海\xa5" "除了" "商\xab" "唯\xa4" "專\xb7"
## [775] "帶動" "細\xb8" "搭配" "運用" "需求" "大雨"
## [781] "工\xb5" "今天上午" "分鐘" "出面" "用戶" "住戶"
## [787] "利用" "困難" "延遲" "清楚" "這麼" "這樣"
## [793] "路面" "預計" "簡直" "上前" "不好" "不\xc2"
## [799] "不顧" "公\xa6" "未料" "危險" "多處" "自行"
## [805] "你們" "沒想到" "私人" "身分" "身體" "車頭"
## [811] "事件" "長達" "門口" "後來" "派員" "負責"
## [817] "記者" "訊後依" "送辦" "送醫" "酒測" "強制"
## [823] "情緒" "接受" "接著" "救出" "殺人" "清晨"
## [829] "等待" "趕緊" "範圍" "衝突" "錄影" "轉身"
## [835] "議題" "屬於" "五\xa6" "可能" "並不" "消防隊"
## [841] "送往" "高市" "參\xa5" "國小" "教\xa8" "通報"
## [847] "陸續" "當天" "學生" "檢查" "懷疑" "上午"
## [853] "上網" "不同" "公務" "他人" "可能性" "企圖"
## [859] "判決" "告訴" "決定" "依照" "制伏" "彼此"
## [865] "性愛" "查出" "查緝" "甚至" "看到" "缺乏"
## [871] "堅稱" "密集" "專家" "控制" "提告" "暗示"
## [877] "對象" "與其" "嘿咻" "審判" "確定" "檢警"
## [883] "隱瞞" "藉此" "證人" "大\xad" "太陽" "主演"
## [889] "平均" "成績" "收視率" "完結篇" "宋慧喬" "姜暮煙"
## [895] "後裔" "柳時鎮" "要看" "首都" "時段" "根本"
## [901] "浪漫" "國\xb7" "甜蜜" "最佳" "最終" "創下"
## [907] "該\xbc" "厲\xae" "選\xc1" "瞬間" "雖然" "韓國"
## [913] "夯\xbc" "抵達" "抱著" "急救" "接獲" "救\xc5"
## [919] "醫生" "可望" "供應" "供應鏈" "法人" "萬人"
## [925] "獲得" "小心" "不治" "不慎" "主委" "生命"
## [931] "地上" "位置" "巡邏" "並未" "呼救" "昏迷"
## [937] "指稱" "研判" "消防人員" "勘驗" "婦人" "提及"
## [943] "舒服" "意識" "試圖" "跡象" "模糊" "頭部"
## [949] "以上" "可以" "再將" "百萬" "百萬元" "近來"
## [955] "借貸" "案件" "海巡" "通過" "價格" "今日"
## [961] "公布" "升值" "反而" "立德" "成長率" "為止"
## [967] "美元" "美元匯率" "風險" "符合" "最低" "經紀"
## [973] "經濟" "增\xa5" "口袋" "內部" "日前" "另\xa4"
## [979] "交給" "全部" "事實" "明確" "法辦" "直到"
## [985] "偵結" "貪污" "期間" "發放" "資金" "察覺"
## [991] "職務" "大\xb6" "以來" "利率" "退出" "歐盟"
## [997] "士林" "行動" "住家" "所屬" "法庭" "指揮"
## [1003] "查扣" "浩鼎案" "執行" "處於" "董事長" "預定"
## [1009] "什麼" "以前" "本來" "法\xb0" "為何" "相信"
## [1015] "桃園市" "處\xbb" "就要" "裁定" "傷\xae" "下來"
## [1021] "千萬" "不要" "引用" "他們" "代言" "宋仲基"
## [1027] "形象" "扭曲" "近\xa6" "軍人" "時候" "真的"
## [1033] "國家" "組織" "廣大" "請問" "應該" "賺錢"
## [1039] "韓\xbc" "不知" "付出" "代價" "先是" "測值"
## [1045] "輕鬆" "警察" "權利" "不\xa8" "公尺" "可見"
## [1051] "失控" "左右" "事故" "宣告" "骨折" "情況"
## [1057] "排除" "陷入" "疑\xa6" "不敢" "可愛" "全新"
## [1063] "好\xa9" "自\xbb" "我們" "看了" "歌姬" "大學"
## [1069] "不錯" "之前" "父母" "未滿" "多歲" "扶養"
## [1075] "沒關係" "來源" "委託" "性侵" "做出" "畢\xb7"
## [1081] "被告" "就讀" "嘉義" "課\xb5" "還是" "關係"
## [1087] "三重" "民宅" "生命危險" "交流" "全身" "自由"
## [1093] "攻擊" "參與" "還不" "關心" "分析師" "成立"
## [1099] "似乎" "近日" "宣傳" "看\xb0" "動\xa7" "萬美元"
## [1105] "女星" "不\xa4" "手上" "就連" "幾乎" "工人"
## [1111] "中山路" "太輕" "少\xa6" "另\xa5" "未經" "死\xa4"
## [1117] "死者" "至今" "花蓮" "怎麼辦" "殺\xae" "理由"
## [1123] "終身" "提出" "筆錄" "當中" "審理" "確實"
## [1129] "歷經" "隨後" "證據" "分別" "空間" "強調"
## [1135] "app" "下手" "上次" "口氣" "尤其" "月底"
## [1141] "比較" "出口" "央行" "全\xa5" "如果" "存款"
## [1147] "收到" "有利" "利息" "快速" "改善" "角色"
## [1153] "並且" "例如" "來看" "官網" "所謂" "於是"
## [1159] "股市" "金融" "建議" "很大" "是從" "借款"
## [1165] "效果" "效應" "核心" "紛紛" "衰退" "帶來"
## [1171] "得到" "措施" "累積" "終於" "貨幣政策" "這\xa4"
## [1177] "這是" "焦點" "資產" "過去" "實在" "銀行"
## [1183] "寬鬆" "歐洲" "衝擊" "操\xa7" "鋼鐵" "變成"
## [1189] "反應" "以下" "多家" "持平" "修訂" "逆轉"
## [1195] "單季" "跳空" "代表" "全文" "同樣" "而是"
## [1201] "身上" "或是" "面\xc1" "時期" "純益" "從小"
## [1207] "接近" "尋找" "製造" "獲利" "人氣" "公分"
## [1213] "以往" "出身" "本季" "受訪" "長大" "接下"
## [1219] "掀\xb0" "等於" "新\xbc" "話題" "飾演" "演藝圈"
## [1225] "廣告" "學校" "二版" "口角" "少女" "火爆"
## [1231] "男友" "走出" "事情" "妻子" "糾紛" "表情"
## [1237] "亮點" "酒店" "這種" "部隊" "場面" "趁機"
## [1243] "遇到" "臉部" "爆發" "大型" "不可" "段長"
## [1249] "道路" "中文" "主持人" "出來" "表達" "則是"
## [1255] "送給" "訪問" "最愛" "照顧" "資訊" "團體"
## [1261] "演唱\xb7" "禮物" "互動" "只能" "氣\xaa" "記者\xb7"
## [1267] "追問" "經\xb1" "態度" "演員" "重點" "週刊"
## [1273] "溫暖" "主任" "竹崎" "完成" "延續" "承諾"
## [1279] "殺警案" "鄉長" "感到" "嘉義縣" "羈押" "同意"
## [1285] "此事" "受審" "國民" "強行" "組團" "部長"
## [1291] "對岸" "瑩雪" "自然" "男人" "身材" "明星"
## [1297] "姿勢" "訓練" "做到" "集中" "經紀人" "簡單"
## [1303] "上季" "工\xb7" "之間" "本周" "目標" "全\xa6"
## [1309] "在於" "但\xb7" "依然" "其次" "協調" "政策"
## [1315] "計劃" "動能" "提醒" "零售" "學\xb7" "優於"
## [1321] "壓力" "變化" "下降" "不想" "之\xa5" "比重"
## [1327] "父親" "片中" "可是" "只有" "只要" "地方"
## [1333] "如此" "安排" "有沒有" "找到" "改變" "每個"
## [1339] "私下" "究竟" "那麼" "兩人" "到底" "幸福"
## [1345] "爸爸" "的事" "很快" "流傳" "原本" "家人"
## [1351] "容易" "消息" "特殊" "粉絲" "討論" "高潮"
## [1357] "偶像" "組成" "喜歡" "就算" "幅度" "智慧"
## [1363] "發覺" "意思" "愛情" "當然" "精神" "歐美"
## [1369] "擁有" "興趣" "還要" "獻上" "大概" "不禁"
## [1375] "以為" "各種" "多個" "存在" "即將" "完整"
## [1381] "快點" "思考" "是不是" "看見" "科技" "差不多"
## [1387] "差異" "消失" "真正" "專頁" "控訴" "殺手"
## [1393] "眼\xb2" "當\xa6" "質疑" "遭受" "導演" "下\xb6"
## [1399] "台塑" "出刊" "目的" "兇殺" "行兇" "林進忠"
## [1405] "前妻" "時報" "對此" "離婚" "文章" "印象"
## [1411] "將在" "水準" "立法\xb0" "我國" "將於" "通\xb1"
## [1417] "發行" "下班" "大哥" "小女兒" "不料" "全家"
## [1423] "刑度" "合併" "在家" "有無" "有罪" "老闆"
## [1429] "受\xae" "姊妹" "拍照" "長女" "附近" "強制性"
## [1435] "採信" "意見" "搜出" "電腦" "裸照" "親吻"
## [1441] "還在" "來到" "新台幣" "執行長" "腰斬" "iphone"
## [1447] "之\xa4" "只\xb7" "於今" "相機" "高階" "啟動"
## [1453] "搭載" "龍頭" "關\xaa" "釋出" "消費" "干擾"
## [1459] "要將" "規定" "衣服" "抗拒" "還曾" "kobe"
## [1465] "平靜" "合照" "尖叫" "昨在" "差點" "退休"
## [1471] "最好" "緊張" "謝謝" "還\xb7" "女神" "失去"
## [1477] "即時新聞" "見光" "協議" "坦言" "拍戲" "採取"
## [1483] "發文" "結局" "賈靜雯" "並將" "前方" "前\xa6"
## [1489] "故障" "致死" "案發" "過失" "調整" "燃燒"
## [1495] "出貨" "召\xb6" "季\xb0" "保守" "看法" "訂單"
## [1501] "首季" "高點" "樂\xc6" "放在" "指控" "班機"
## [1507] "國民黨" "晚間" "生日" "並無" "喜愛" "結果"
## [1513] "電視\xbc" "澄清" "螢幕" "採訪" "這位" "幕後"
## [1519] "上映" "不然" "笑說" "編\xbc" "千元" "事實上"
## [1525] "金額" "律師" "處分" "違規" "應是" "每\xa6"
## [1531] "明\xa6" "南韓" "採購" "機構" "簽約" "大幅"
## [1537] "大戰" "引述" "正式" "的話" "夏普" "強化"
## [1543] "實際" "方面" "正確" "自我" "兩個" "委員"
## [1549] "侵\xae" "偵訊" "陪同" "陪伴" "就\xb7" "親友"
## [1555] "robohon" "主人" "半\xa6" "同\xb7" "取代" "亮相"
## [1561] "管理" "機器人" "隨著" "不堪" "打擊" "收押"
## [1567] "每月" "並在" "空難" "重大" "帶著" "澎湖"
## [1573] "遭遇" "不夠" "日將" "每天" "值得" "個性"
## [1579] "造型" "他說" "危機" "官員" "公斤" "系列"
## [1585] "miss" "身邊" "鼓勵" "離\xb6" "人涉" "大小"
## [1591] "之下" "立場" "全國" "更換" "刻意" "重傷"
## [1597] "飛機" "能見度" "航空" "荊元武" "馬公" "高雄"
## [1603] "高勤官" "接到" "塔台" "督導" "違法" "管制"
## [1609] "輕傷" "機場" "辦法" "錄音" "下車" "大結局"
## [1615] "不再" "白色" "任務" "迎接" "金秀賢" "星星"
## [1621] "看著" "時鎮" "障礙" "擁抱" "短短" "人力"
## [1627] "女性" "背景" "連結" "策略" "旗下" "數\xa6"
## [1633] "出道" "首次" "舞台" "明慧" "套房" "留下"
## [1639] "隊長" "比例" "好友" "求救" "車子" "泊霖"
## [1645] "這\xb0" "彰化" "鄰\xa9" "鐵\xac" "打擾" "家屬"
## [1651] "買賣" "當場" "劉姓" "雙手" "警局" "今天下午"
## [1657] "兄弟" "交保" "其餘" "屏東" "搶走" "優勢"
## [1663] "縱火" "出庭" "協\xb7" "委員\xb7" "股票" "馬上"
## [1669] "基金" "基金\xb7" "帳戶" "傍晚" "調查局" "調處"
## [1675] "剛好" "解釋" "下滑" "主席" "創辦人" "減少"
## [1681] "經理人" "如今" "轉而" "未遂" "見到" "屋內"
## [1687] "眼睛" "聊天" "痛\xbd" "路邊" "質問" "鑑識"
## [1693] "穿著" "文書" "出爐" "史上" "多\xb0" "有期\xae"
## [1699] "完畢" "最重" "詐欺" "幫助" "有待" "車身"
## [1705] "工具" "不猶豫" "預告" "每次" "相同" "獨自"
## [1711] "不等" "自已" "即可" "創立" "獎金" "以後"
## [1717] "色狼" "伸出" "判處" "衍生" "旁邊" "賠償"
## [1723] "貼心" "失敗" "的確" "原諒" "痛哭" "感覺"
## [1729] "新片" "放棄" "創\xb7" "跨國" "婚後" "公庫"
## [1735] "落實" "聯盟" "大隊" "引擎" "手法" "涉案"
## [1741] "第二" "分行" "指示" "停止" "進口" "聲請"
## [1747] "吸毒" "黑色" "辛苦" "速度" "預測" "引\xb0"
## [1753] "心臟病" "台人" "死掉" "官方" "情形" "推動"
## [1759] "通緝" "測謊" "分隊" "消防" "轎車" "大眾"
## [1765] "女嘿" "太閒" "可怕" "交代" "偽造" "偷拍"
## [1771] "副\xc1" "這句話" "這件" "揭媽" "貴婦人" "摩鐵約"
## [1777] "邀請" "雙碩士" "議員" "主張" "以此" "出發"
## [1783] "立委" "立\xb0" "法制" "泰國" "送\xa6" "無罪"
## [1789] "新聞稿" "管轄權" "熱議" "質詢" "行車" "狀態"
## [1795] "停車" "球棒" "賓士" "赫然" "印尼" "承辦"
## [1801] "相較" "掃蕩" "大巨蛋" "文康" "經理" "不足"
## [1807] "制度" "限制" "庫存" "終止" "義務" "不幸"
## [1813] "民報" "安裝" "自首" "見狀" "前後" "整天"
## [1819] "臉上" "鐵路" "人以" "出生" "有些" "真人秀"
## [1825] "慢慢" "模樣" "露臉" "費用" "看出" "告別"
## [1831] "傳\xa9" "網路上" "人選" "中信" "接手" "辜仲諒"
## [1837] "精彩" "遠雄" "行政\xb0" "宜康" "予以" "平日"
## [1843] "平時" "壓制" "男女" "公務員" "萬餘元" "電氣化"
## [1849] "電務" "標案" "謝姓" "龐大" "驗收" "趕到"
## [1855] "支付" "永遠" "錯\xbb" "天王" "金管\xb7" "第三"
## [1861] "專案" "驚呼" "局長" "複雜" "董事" "分析"
## [1867] "cef" "流\xb5" "高度" "經驗" "主角" "轉折"
## [1873] "控告" "合成" "等候" "應用" "大廠" "整合"
## [1879] "人民" "半\xa9" "馬路" "地區" "規格" "智慧型"
## [1885] "穩定" "使用者" "王姓" "承\xbb" "藍色" "接觸"
## [1891] "此案" "人口" "競爭" "上周" "上市公司" "國\xa5"
## [1897] "打擊犯罪" "累計" "別人" "戲\xbc" "痛批" "為主"
## [1903] "特地" "裝置" "趨勢" "雲端" "走路" "宏達"
## [1909] "重視" "何時" "運動" "相對" "bra" "正面"
## [1915] "珍妮佛" "該片" "驚喜" "游姓" "賴男" "帶走"
## [1921] "李女" "此時" "監控" "向上" "收賄" "聯手"
## [1927] "發動" "傳統" "輕人" "成熟" "評論" "身\xa4"
## [1933] "房內" "租客" "想到" "違建" "htc" "慶祝"
## [1939] "受惠" "陳志朋" "整容" "客人" "巴基斯坦" "妻小"
## [1945] "由法廣" "法廣" "數量" "嫌疑人" "政部"
dtm.mat <- as.matrix(dtm.remove)
convert_count <- function(e){
x <- ifelse(e > 0, 1, 0)
x <- factor(x, levels = c(0,1), labels = c('no', 'yes'))
return(x)
}
dtm.count <- apply(dtm.mat, MARGIN=2, convert_count)
dim(dtm.count)
## [1] 428 1949
class(dtm.count)
## [1] "matrix"
m <- as.data.frame(dtm.count)
#View(m)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx == 1, ]
testset <- m[idx == 2, ]
traintag <- apple.subset[idx==1, 'category']
testtag <- apple.subset[idx==2, 'category']
traintitle <- apple.subset[idx==1, 'title']
testtitle <- apple.subset[idx==2, 'title']
library(e1071)
model <- naiveBayes(trainset, as.factor(traintag))
pred <- predict(model, testset)
tb <- table(testtag, pred)
library(caret)
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
##
## pred
## testtag 社會 娛樂 財經
## 社會 56 1 1
## 娛樂 1 23 0
## 財經 1 1 29
##
## Overall Statistics
##
## Accuracy : 0.9558
## 95% CI : (0.8998, 0.9855)
## No Information Rate : 0.5133
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9283
## Mcnemar's Test P-Value : 0.8013
##
## Statistics by Class:
##
## Class: 社會 Class: 娛樂 Class: 財經
## Sensitivity 0.9655 0.9200 0.9667
## Specificity 0.9636 0.9886 0.9759
## Pos Pred Value 0.9655 0.9583 0.9355
## Neg Pred Value 0.9636 0.9775 0.9878
## Prevalence 0.5133 0.2212 0.2655
## Detection Rate 0.4956 0.2035 0.2566
## Detection Prevalence 0.5133 0.2124 0.2743
## Balanced Accuracy 0.9646 0.9543 0.9713
# Verify Predictions
testtag2 <- factor(testtag, levels = c('社會', '娛樂','財經'))
idx2 <- testtag2 != pred
df <- as.data.frame(cbind(testtitle[idx2], as.character(testtag2[idx2]), as.character(pred[idx2])))
View(df)
m1 <- matrix(1:9, nrow = 3)
m1
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
apply(m1, MARGIN=2, FUN = sum)
## [1] 6 15 24
m2 <- dtm.mat[1:3, 1:3]
convert_count <- function(e){
x <- ifelse(e > 0, 1, 0)
x <- factor(x, levels = c(0,1), labels = c('no', 'yes'))
return(x)
}
m3 <- apply(m2, MARGIN=2, convert_count)
as.table(m3)
## Terms
## Docs 口罩 已經 不已
## 1 yes yes yes
## 2 no no no
## 3 no no no
Time Series Analysis
download.file('https://raw.githubusercontent.com/ywchiu/rcookbook/master/chapter10/tw2330_finance.csv', 'tw2330_finance.csv')
tw2330 <- read.csv('tw2330_finance.csv', header=TRUE)
View(tw2330)
m <- ts(tw2330$Total.Income, frequency = 4, start = c(2008,1))
class(m)
## [1] "ts"
m.sub <- window(m, start=c(2008,1), end = c(2014,4))
components <- decompose(m.sub)
names(components)
## [1] "x" "seasonal" "trend" "random" "figure" "type"
plot(components)

#install.packages('fpp')
library(fpp)
## Loading required package: forecast
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: tseries
fit <- hw(m.sub, seasonal = 'additive' )
plot(fit)
