20160731Demo

產生正確中文詞頻矩陣

source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')

## Warning: package 'jiebaR' was built under R version 3.2.5

## Loading required package: jiebaRD

## Warning: package 'jiebaRD' was built under R version 3.2.5

## Warning: package 'tm' was built under R version 3.2.5

## Loading required package: NLP

library(jiebaR)
mixseg = worker()

s = "大巨蛋案對市府同仁下封口令？柯P否認"
s1 = "柯P市府近來飽受大巨蛋爭議"
s.vec <- segment(code= s , jiebar = mixseg)
s1.vec <- segment(code= s1 , jiebar = mixseg)
s.corpus = CNCorpus(list(s.vec, s1.vec))

control.list=list(wordLengths=c(1,Inf),tokenize=space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control=control.list)
inspect(s.dtm)

## <<DocumentTermMatrix (documents: 2, terms: 12)>>
## Non-/sparse entries: 15/9
## Sparsity           : 38%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 下 大巨蛋 市府 同仁 否認 爭議 近來 封口令 柯p 案 飽受 對
##    1  1      1    1    1    1    0    0      1   1  1    0  1
##    2  0      1    1    0    0    1    1      0   1  0    1  0

產生正確中文詞頻矩陣 (2)

source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
library(jiebaR)
mixseg = worker()

s = "大巨蛋案對市府同仁下封口令？柯P否認"
s1 = "柯P市府近來飽受大巨蛋爭議"
s2 = "非核家園不是空談柯P要打造台北能源之丘"
s.vec <- segment(code= s , jiebar = mixseg)
s1.vec <- segment(code= s1 , jiebar = mixseg)
s2.vec <- segment(code= s2 , jiebar = mixseg)
s.corpus = CNCorpus(list(s.vec, s1.vec, s2.vec))

control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control=control.list)
inspect(s.dtm)

## <<DocumentTermMatrix (documents: 3, terms: 17)>>
## Non-/sparse entries: 21/30
## Sparsity           : 59%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 大巨蛋 不是 之丘 台北 市府 打造 同仁 否認 爭議 空談 近來 非核 封口令
##    1      1    0    0    0    1    0    1    1    0    0    0    0      1
##    2      1    0    0    0    1    0    0    0    1    0    1    0      0
##    3      0    1    1    1    0    1    0    0    0    1    0    1      0
##     Terms
## Docs 柯p 家園 能源 飽受
##    1   1    0    0    0
##    2   1    0    0    1
##    3   1    1    1    0

向量計算

v1 = c(1,0,1,0)
v2 = c(1,0,0,1)
# Euclidean Distance Calculation
sqrt(sum((v1 - v2) ^2 ))

## [1] 1.414214

dist(rbind(v1,v2))

##          v1
## v2 1.414214

1500篇中文文章處理

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')

str(applenews)

## 'data.frame':    1500 obs. of  5 variables:
##  $ content : chr  "(更新:新增影片)想要透過刮刮樂彩券一夕致富，但他卻用錯方法！台中市一名黃姓男子覬覦頭獎高達2600萬的「開門見喜」刮刮樂彩券，上月佯"| __truncated__ "澳洲一名就讀雪梨大學的華裔博士生，日前公開一段燒毀中國護照的影片，還大肆批評留澳學生是一群「留學豬」。消息傳出後，這名博士生立"| __truncated__ "【行銷專題企劃】房價高高在上，沒錢買房沒關係，但你認為自己是聰明的租屋族嗎? 由蘋果地產與FBS TV合作的全新節目-房市大追擊，本集節"| __truncated__ "本內容由中央廣播電臺提供<U+00A0><U+00A0> <U+00A0> <U+00A0> <U+00A0>美國國防部長卡特(Ash Carter)今天(15日)表示，他今天將前往在菲"| __truncated__ ...
##  $ title   : chr  "【更新】搶2.2萬彩券刮中1.4萬　沒發財還得入獄" "拿到澳洲護照後　他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演  美防長南海登艦" ...
##  $ dt      : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
##  $ category: chr  "社會" "國際" "地產" "國際" ...
##  $ view_cnt: chr  "1754" "0" "0" "0" ...

library(jiebaR)
mixseg = worker()
apple.seg =lapply(applenews$content, function(e)segment(code=e, jiebar=mixseg))

s.corpus <- CNCorpus(apple.seg)
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control=control.list)
dim(s.dtm )

## [1]  1500 41840

freq.term = findFreqTerms(s.dtm, 150,Inf)
#inspect(s.dtm[1:5, freq.term])

findAssocs(s.dtm, "大巨蛋", 0.5)

## $大巨蛋
##       遠雄       解約       市府       展延       工期 2015.04.16 
##       0.88       0.78       0.74       0.72       0.70       0.69 
## 2015.04.17 2015.04.21 2016.03.16 2016.03.18 2016.03.21 2016.03.22 
##       0.69       0.69       0.69       0.69       0.69       0.69 
## 2016.04.13   collapse       口稱       已朝   已無太多     天遠雄 
##       0.69       0.69       0.69       0.69       0.69       0.69 
##     王貞治       他怕       回事     見遠雄   居心叵測     若不願 
##       0.69       0.69       0.69       0.69       0.69       0.69 
##     核子彈     起源於   問及此事     理還亂       連通       期議 
##       0.69       0.69       0.69       0.69       0.69       0.69 
##       菸廠     進度條       聘書       運棄       頒贈       孵出 
##       0.69       0.69       0.69       0.69       0.69       0.69 
##     蓋遠雄     蔡宗易       談盤       應北       歸責       難解 
##       0.69       0.69       0.69       0.69       0.69       0.69 
##       觸礁       議約     攝遠雄       無解       停工       五大 
##       0.69       0.69       0.69       0.68       0.64       0.62 
##       方向       拋出       溫室       南線       貿然     願和北 
##       0.62       0.61       0.61       0.60       0.60       0.60 
##       量體     若遠雄       逾期       逃生       違約       實地 
##       0.59       0.56       0.56       0.55       0.54       0.53 
##     趙藤雄       懶人        370       容納     柯文哲 
##       0.53       0.52       0.51       0.51       0.50

dtm.remove = removeSparseTerms(s.dtm, 0.9)
dim(dtm.remove)

## [1] 1500   66

dtm.remove$dimnames$Terms

##  [1] "10"       "12"       "20"       "一定"     "一個"     "已經"    
##  [7] "不是"     "不會"     "不過"     "中心"     "中國"     "今天"    
## [13] "今年"     "內容"     "日本"     "他們"     "以及"     "包括"    
## [19] "去年"     "可以"     "可能"     "台北"     "台灣"     "外電報導"
## [25] "未來"     "目前"     "因此"     "因為"     "如果"     "自己"    
## [31] "希望"     "我們"     "更新"     "沒有"     "其中"     "知道"    
## [37] "表示"     "指出"     "是否"     "甚至"     "相關"     "美國"    
## [43] "要求"     "時間"     "粉絲團"   "問題"     "國際"     "現在"    
## [49] "這個"     "報導"     "就是"     "提供"     "最後"     "發生"    
## [55] "發現"     "進行"     "開始"     "新聞"     "綜合"     "認為"    
## [61] "影片"     "影響"     "還是"     "翻攝"     "繼續"     "蘋果"

建立文字雲

class(s.dtm)

## [1] "DocumentTermMatrix"    "simple_triplet_matrix"

m = as.matrix(s.dtm)
class(m)

## [1] "matrix"

frequency <- colSums(as.matrix(s.dtm))
frequency <- sort(frequency, decreasing=TRUE)[1:100]
library(wordcloud2)

## Warning: package 'wordcloud2' was built under R version 3.2.5

wordcloud2(as.table(frequency),shape = 'pentagon')

抓取FB 資訊

library(rvest)
token = ''
fburl = paste0('https://graph.facebook.com/v2.7/me/friends?access_token=', token)

# install.packages("rjson")
library(httr)
library(rjson)

fbcontent<-GET(fburl)
fbjson<-fromJSON(content(fbcontent, 'text'))
name = sapply(fbjson$data, function(e) e$name)
id = sapply(fbjson$data, function(e) e$id)
df = data.frame(name=name, id=id)
View(df)

計算距離

x = c(0, 0, 1, 1, 1, 1)
y = c(1, 0, 1, 1, 0, 1)
# Euclidean Distance
dist(rbind(x,y), 'euclidean')
sqrt(sum((x - y ) ^2))
dist(rbind(x,y), method ="minkowski", p=2)

# Manhattan Distance
dist(rbind(x,y), 'manhattan')
sum(abs(x - y))
dist(rbind(x,y), method ="minkowski", p=1)

　使用hclust 做iris 分群

data(iris)
View(iris)

iris.dist = dist(iris[,-5], method='euclidean')
hc = hclust(iris.dist, method="ward.D2")
plot(hc)

fit = cutree(hc, k = 3)
 table(fit, iris[,5])

##    
## fit setosa versicolor virginica
##   1     50          0         0
##   2      0         49        15
##   3      0          1        35

table(fit)

## fit
##  1  2  3 
## 50 64 36

plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3 , border="red")

 par(mfrow=c(1,2))
# Original iris scatter plot 
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species, main = 'original')

# fitted iris scatter plot
plot(iris$Petal.Length, iris$Petal.Width, col=fit, main = 'clustered')

Diana

library(cluster)

dv = diana(iris[,-5], metric = "euclidean")
summary(dv)

## Merge:
##        [,1] [,2]
##   [1,] -102 -143
##   [2,]   -8  -40
##   [3,]   -1  -18
##   [4,]  -10  -35
##   [5,] -129 -133
##   [6,]  -11  -49
##   [7,]   -5  -38
##   [8,]  -20  -22
##   [9,] -117 -138
##  [10,]  -81  -82
##  [11,]  -58  -94
##  [12,]   -9  -39
##  [13,]  -96  -97
##  [14,]  -83  -93
##  [15,] -128 -139
##  [16,]  -28  -29
##  [17,]   -3  -48
##  [18,]   -2  -46
##  [19,]  -64  -92
##  [20,]  -66  -76
##  [21,] -124 -127
##  [22,] -113 -140
##  [23,]    3   16
##  [24,]    4  -31
##  [25,]   -4  -30
##  [26,]  -95 -100
##  [27,]  -89   13
##  [28,]  -67  -85
##  [29,]  -24  -27
##  [30,]   18  -13
##  [31,]  -54  -90
##  [32,]  -75  -98
##  [33,]  -36  -50
##  [34,]   23    2
##  [35,] -121 -144
##  [36,] -111 -148
##  [37,] -137 -149
##  [38,]  -55  -59
##  [39,]    8  -47
##  [40,] -141 -145
##  [41,]   19  -79
##  [42,] -104    9
##  [43,] -142 -146
##  [44,]  -70   10
##  [45,]    7  -41
##  [46,]    1 -114
##  [47,]   29  -44
##  [48,]  -51  -53
##  [49,]  -52  -57
##  [50,]   30   24
##  [51,] -108 -131
##  [52,] -106 -123
##  [53,]  -68   14
##  [54,]  -21  -32
##  [55,]  -71   15
##  [56,]  -12  -25
##  [57,]   17   25
##  [58,]   50  -26
##  [59,]  -56  -91
##  [60,]   12  -43
##  [61,]  -84 -134
##  [62,]   46 -122
##  [63,]   57   -7
##  [64,]   -6  -19
##  [65,]   48  -87
##  [66,]   27   26
##  [67,]   60  -14
##  [68,]   35   40
##  [69,]   55 -150
##  [70,]  -16  -34
##  [71,]    6   39
##  [72,]   34   45
##  [73,]   21 -147
##  [74,] -116   37
##  [75,]   11  -99
##  [76,]   41  -74
##  [77,]  -62  -72
##  [78,]   68 -125
##  [79,] -118 -132
##  [80,]  -73   61
##  [81,]   54  -37
##  [82,]   38   20
##  [83,]  -78   36
##  [84,]   31   44
##  [85,]  -65  -80
##  [86,]  -17  -33
##  [87,]   49  -86
##  [88,]   58   33
##  [89,]   65  -77
##  [90,]   53   66
##  [91,]   51 -126
##  [92,]   22   43
##  [93,] -105   78
##  [94,]   80   73
##  [95,]    5 -112
##  [96,]   42   95
##  [97,]   59   28
##  [98,]   76   32
##  [99,]  -69 -120
## [100,]   52 -119
## [101,]  -15   70
## [102,]   71  -45
## [103,]   77   90
## [104,]   88   63
## [105,]   84  -60
## [106,]   72   47
## [107,]   64   86
## [108,]   89   82
## [109,] -103   93
## [110,]  107  101
## [111,]  106   81
## [112,]   91 -136
## [113,] -101   74
## [114,]  104   56
## [115,]   87   98
## [116,]  105   85
## [117,]   69   62
## [118,]  117 -115
## [119,]  111  102
## [120,]   99   94
## [121,]  109   92
## [122,]   83   96
## [123,]   97  103
## [124,] -110   79
## [125,]  116  -63
## [126,]   67  -23
## [127,]  114  126
## [128,]  122 -109
## [129,]  128 -135
## [130,]  108  115
## [131,]  113  121
## [132,]  125  -61
## [133,]  130  -88
## [134,]  129 -130
## [135,]  119  110
## [136,]  100  112
## [137,]  120  118
## [138,]  127  -42
## [139,]  136  124
## [140,]  123 -107
## [141,]  134  131
## [142,]  132  140
## [143,]  133  137
## [144,]  141  139
## [145,]  135  138
## [146,]  143  142
## [147,]  145   75
## [148,]  146  144
## [149,]  147  148
## Order of objects:
##   [1]   1  18  28  29   8  40   5  38  41  24  27  44  21  32  37  11  49
##  [18]  20  22  47  45   6  19  17  33  15  16  34   2  46  13  10  35  31
##  [35]  26  36  50   3  48   4  30   7  12  25   9  39  43  14  23  42  58
##  [52]  94  99  51  53  87  77  55  59  66  76  52  57  86  64  92  79  74
##  [69]  75  98  88  69 120  73  84 134 124 127 147  71 128 139 150 102 143
##  [86] 114 122 115  54  90  70  81  82  60  65  80  63  61  56  91  67  85
## [103]  62  72  68  83  93  89  96  97  95 100 107  78 111 148 104 117 138
## [120] 129 133 112 109 135 130 101 116 137 149 103 105 121 144 141 145 125
## [137] 113 140 142 146 106 123 119 108 131 126 136 110 118 132
## Height:
##   [1] 0.1000000 0.1732051 0.1414214 0.2236068 0.1000000 0.3741657 0.1414214
##   [8] 0.2645751 0.6164414 0.2000000 0.2645751 0.7071068 0.2828427 0.4242641
##  [15] 0.8062258 0.1000000 0.3605551 0.1414214 0.2449490 0.5477226 1.2727922
##  [22] 0.3316625 0.6480741 0.4582576 0.6855655 0.5477226 0.3605551 2.4289916
##  [29] 0.1414214 0.2000000 0.2645751 0.1000000 0.1732051 0.3000000 0.4582576
##  [36] 0.2236068 0.6082763 0.1414214 0.3000000 0.1732051 0.3316625 0.7549834
##  [43] 0.3000000 1.0295630 0.1414214 0.3162278 0.3464102 1.0295630 1.3416408
##  [50] 2.9291637 0.1414214 0.3872983 7.0851958 0.2645751 0.3316625 0.4582576
##  [57] 0.6557439 0.2449490 0.4242641 0.1414214 1.0677078 0.2645751 0.4582576
##  [64] 0.7549834 0.1414214 0.2449490 0.3872983 0.5196152 0.2000000 1.1832160
##  [71] 1.6613248 0.5385165 0.8544004 0.4242641 0.3316625 0.5099020 0.1732051
##  [78] 0.3872983 1.3000000 0.3000000 0.1414214 0.3605551 0.7810250 0.0000000
##  [85] 0.2645751 0.3316625 0.7874008 2.6532998 0.2000000 0.4358899 0.2645751
##  [92] 0.1414214 0.6164414 0.7615773 0.4472136 1.0295630 1.1357817 1.6155494
##  [99] 0.3162278 0.5196152 0.2000000 0.9055385 0.4000000 0.6000000 0.2828427
## [106] 0.1414214 0.4690416 0.1732051 0.1414214 0.3316625 0.1732051 1.3928388
## [113] 4.7127487 0.4242641 0.2236068 0.8602325 0.2449490 0.1414214 0.5099020
## [120] 0.1000000 0.5099020 1.0295630 1.0630146 1.2041595 1.4387495 0.7416198
## [127] 0.3872983 0.2449490 1.1224972 0.6708204 0.4795832 0.2236068 0.3464102
## [134] 0.2449490 0.4000000 0.8544004 0.1732051 0.4690416 0.2449490 2.4186773
## [141] 0.2645751 0.5477226 1.2922848 0.2645751 0.4690416 0.7416198 1.3892444
## [148] 0.9327379 0.4123106
## Divisive coefficient:
## [1] 0.953798
## 
## Available components:
## [1] "order"  "height" "dc"     "merge"  "diss"   "call"   "data"

plot(dv)

## Cosine Distance

library(proxy)

## Warning: package 'proxy' was built under R version 3.2.5

## 
## Attaching package: 'proxy'

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

library(jiebaR)
library(tm)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
mixseg = worker()
s1 = '我喜歡看電視不喜歡看電影'
s2 = '我不喜歡看電視也不喜歡看電影'
s1.seg = segment(s1, mixseg)
s2.seg = segment(s2, mixseg)
s.corpus <- CNCorpus(list(s1.seg, s2.seg))
control.list=list(wordLengths=c(1,Inf),tokenize=space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control=control.list)
inspect(s.dtm)

## <<DocumentTermMatrix (documents: 2, terms: 7)>>
## Non-/sparse entries: 13/1
## Sparsity           : 7%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 也 不 我 看 看電視 喜歡 電影
##    1  0  1  1  1      1    2    1
##    2  1  2  1  1      1    2    1

proxy::dist(as.matrix(s.dtm), method = "cosine")

##            1
## 2 0.07549967

建立詞頻矩陣

library(jiebaR)
library(tm)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
mixseg = worker()
apple.seg =lapply(applenews$content, 
       function(e)segment(code=e, jiebar=mixseg))

s.corpus <- CNCorpus(apple.seg)
control.list=list(wordLengths=c(2,Inf),
        tokenize=space_tokenizer)
s.corpus = tm_map(s.corpus, removeNumbers)
s.corpus = tm_map(s.corpus, removePunctuation)
dtm <- DocumentTermMatrix(s.corpus, 
        control=control.list)
dim(dtm )

## [1]  1500 40571

計算文章相似度

dtm.remove = removeSparseTerms(dtm, 0.99)
dim(dtm.remove )
dtm.dist = proxy::dist(as.matrix(dtm.remove ), method = "cosine")
dtm.mat = as.matrix(dtm.dist)
applenews$title[51]
applenews$title[order(dtm.mat[51,])[2:10]]

查詢近似新聞

alike.article.idx =  which(dtm.mat[51,] < 0.5)
applenews$title[alike.article.idx]

article.query = function(idx){
  alike.article.idx =  which(dtm.mat[idx,] < 0.8)
  applenews$title[alike.article.idx]
}
article.query(18)[1:10]

文章分群

dtm.cluster = hclust(dtm.dist, method="ward.D2")
rect.hclust(dtm.cluster, k = 20 , border="red")

fit = cutree(dtm.cluster, k = 20)
applenews$title[fit == 16]

20150627 新聞分群

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/History/Class1/news_big5.RData', 'news.RData')
load('news.RData')
colnames(news) = c('title', 'content', 'id')


library(jiebaR)
library(tm)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
mixseg = worker()
news.seg =lapply(as.character(news$content), 
       function(e)segment(code=e, jiebar=mixseg))

s.corpus <- CNCorpus(news.seg)
control.list=list(wordLengths=c(2,Inf),
        tokenize=space_tokenizer)
s.corpus = tm_map(s.corpus, removeNumbers)
s.corpus = tm_map(s.corpus, removePunctuation)
dtm <- DocumentTermMatrix(s.corpus, 
        control=control.list)
dim(dtm )

## [1]   147 11485

dtm.dist = proxy::dist(as.matrix(dtm), method = "cosine")

dtm.cluster = hclust(dtm.dist, method="ward.D2")
plot(dtm.cluster)
rect.hclust(dtm.cluster, k = 5 , border="red")

fit = cutree(dtm.cluster, k = 5)
news$title[fit == 1]

##  [1] 八仙塵爆  五相關人依公共危險重傷害法辦        
##  [2] 八仙樂園意外 病患持續增加中                   
##  [3] 6月28日各報頭版要聞                           
##  [4] 八仙樂園舞台大火 逾400人輕重傷                
##  [5] 八仙樂園意外 毛揆取消視察臺東行程             
##  [6] 八仙樂園貼出暫停營業海報                      
##  [7] 八仙樂園爆炸案 專家認玉米粉危險性高應管制使用 
##  [8] 八仙樂園粉塵燃爆 400人傷 負責人被移送         
##  [9] 八仙案  新店慈濟4傷者危急                     
## [10] 八仙樂園爆炸案 負責人等5人移送                
## [11] 八里爆炸意外衛生局凌晨公布314名傷患名單 分送27
## [12] 八仙樂園爆炸 劣賊趁亂打劫                     
## [13] 侯友宜探視傷者：追究責任                      
## [14] 八仙樂園火警  國軍加入救援                    
## [15] 八仙樂園爆炸案災害應變中心　1時30分一級開設   
## [16] 八仙派對彩粉釀大禍  衛福部：研議加強管理      
## [17] 彩色派對主辦人：風勢太大，引燃粉塵            
## [18] 八仙樂園大火  傷者查詢電話                    
## [19] 派對彩粉首次釀禍  蔣丙煌：研議管理            
## [20] 八仙樂園火警受傷名單一覽表                    
## [21] 00：35統計　八仙大火重傷97人、輕傷132人       
## [22] 衛福部長：北北基醫護人員全力動員              
## [23] 八仙樂園粉塵瞬燃　還原失事現場影片曝光        
## [24] 八仙意外毛揆取消台東熱氣球活動                
## [25] 八仙樂園爆炸案 士檢派3檢察官現場指揮          
## [26] 八仙樂園爆炸案 3檢察官展開調查                
## [27] 八仙爆炸案   陳國恩：全力支援                 
## [28] 八仙大火傷患爆量  衛福調度醫院                
## [29] 八仙樂園意外 洪秀柱：為傷者祈福               
## [30] 陳國恩指示警方全力動員 協助救護交通順暢       
## [31] 八仙樂園爆炸228人輕重傷 朱立倫：無限期停業    
## [32] 八仙樂園爆炸意外 朱立倫：立即停園接受調查     
## [33] 北榮33名八里傷患  2/3傷重                     
## [34] 八仙樂園塵爆215傷　朱立倫下令即刻停業         
## [35] 八仙樂園爆炸 朱立倫4點指示                    
## [36] 【八仙意外】馬英九、毛治國第一時間得知　指示全
## [37] 【八仙意外】讓專業的來！柯文哲指示啟動EOC     
## [38] 八仙水上樂園爆炸逾百人傷　三軍總醫院收治傷患  
## 147 Levels: 《金曲26》2015金曲獎得獎名單 線上直播懶人包 ...

20160731Demo

David Chiu

2016年7月31日

產生正確中文詞頻矩陣

產生正確中文詞頻矩陣 (2)

向量計算

1500篇中文文章處理

建立文字雲

抓取FB 資訊

計算距離

使用hclust 做iris 分群

Diana

建立詞頻矩陣

計算文章相似度

查詢近似新聞

文章分群

20150627 新聞分群