# 蘋果網路爬蟲 (Apple Daily web crawler) ----


# Download a single article page and extract its headline and body text.
#
# Args:
#   url: Address of the article page; handed unchanged to read_html().
#
# Returns:
#   A one-row data.frame with character columns `title`, `content` and `url`.
#   A field is NA when the page has no matching element.
parseArticle <- function(url){
    article <- read_html(url)

    # html_node() (singular) yields exactly one result per page -- the first
    # match, or a missing node that html_text() turns into NA. The plural
    # html_nodes() can return zero or several headlines, which either makes
    # data.frame() fail on mismatched lengths or duplicates the article row.
    title <- article %>%
      html_node('h1') %>%
      html_text()

    # First paragraph inside the article container.
    content <- article %>%
      html_node('.ndArticle_margin p') %>%
      html_text()

    data.frame(title, content, url, stringsAsFactors = FALSE)
}


library(rvest)
# Crawl the first two index pages of the real-time news listing, parse every
# linked article, and save the combined result to applenews.csv.
appleurl <- 'https://tw.appledaily.com/new/realtime/'

# One single-row data.frame per article is collected here and bound together
# once after the loop; calling rbind() per article would re-copy the whole
# accumulated table on every iteration.
article_rows <- list()
for (page in seq(1, 2)) {
  # Article links listed on this index page.
  rtddt <- read_html(paste0(appleurl, page)) %>%
    html_nodes('.rtddt a') %>%
    html_attr('href')

  article_rows <- c(article_rows, lapply(rtddt, parseArticle))
  print(page)  # progress indicator
}

# Fall back to an empty data.frame when no article link was found, because
# do.call(rbind, list()) would return NULL.
applenews <- if (length(article_rows) > 0) {
  do.call(rbind, article_rows)
} else {
  data.frame()
}
applenews

write.csv(x = applenews, file = 'applenews.csv')
str(applenews)

# 建立詞頻矩陣 (Building the term-frequency matrix) ----

# Load the hosted copy of applenews.csv; text columns stay character vectors
# instead of being converted to factors.
applenews <- read.csv(
  file = 'https://raw.githubusercontent.com/ywchiu/fubonr/master/data/applenews.csv',
  stringsAsFactors = FALSE
)
# Preview the first rows of the loaded table.
head(applenews)
##   X                                        title
## 1 1     資深電影人揭張艾嘉三角情愛 曾戀過劉家昌
## 2 2       搭大眾運輸抗空污首日 高捷運量成長1成3
## 3 3           限量聖誕心意 送禮也可暖心愛護地球
## 4 4           美股飆風再起 台股開高反攻漲逾60點
## 5 5 【有片】西蒙斯才17歲時 詹皇就曾約他清晨密訓
## 6 6           Google Play最佳App出爐啦!你必須載
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              content
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                張艾嘉執導的《相愛相親》獲本屆金馬獎7項提名全摃龜,她個人入圍導演和影后2項皆以1票之差落榜,「全軍覆沒」抱憾心情,又被爆料過往情事。今日出版的《時報周刊》報導,由資深媒體人倪有純執筆資深電影人江文雄的自傳書《江文雄熱愛電影50年》,當中揭露張艾嘉曾與已故總統蔣經國的兒子傳出戀情,也曾跟詞曲創作人劉家昌有段情愛,3人的情感有一小段重疊,衍生出三角戀,最後因蔣有家室而終結。張艾嘉與劉家昌相戀2年,因男方追求當時已婚的甄珍,才知劉家昌移情別戀。江文雄感嘆:「小妹(張艾嘉小名)的感情路一路顛簸」。(即時新聞中心/綜合報導)
## 2 為對抗冬季空污,高雄市今起展開三個月搭乘大眾運輸免費措施,《蘋果》一早走訪捷運、公車總站,八成民眾都知此措施,但全數稱不是刻意來搭乘,有人大讚環保又省錢,但也有學生指要把省下車票錢,存下來買機車。據高捷統計,今早6點半至8點半運輸量是2萬6970人次,較上周五同時段成長約13%。<U+00A0>據環保署空氣品質監測資料,高雄市空氣品質一掃過去幾天,呈對所有族群不康紅色警戒狀態,全面呈現黃色普通等級。高市環保局指,今天大氣條件擴散條件變佳,高雄空氣品質全面改善。<U+00A0>今同時也是高市府推行「搭大眾運輸抗空污」首日,今天起至明年二月底,持電子票證搭高市公車、客運、輕軌免費,捷運部分則是上午六點半至八點半,下午四點半到六點半,持電子票證免費。<U+00A0>《蘋果》到高捷車站詢問10位搭乘者,有8人知此抗空污搭車免費措施,但都稱不是因免費搭乘措施刻意搭乘,另2人因不知有此措施,花錢買票卡入站。<U+00A0>民眾楊小姐從美麗島站要搭到大<U+7AC2>站,她指,本來就要來搭車,不知有此新措施,本來可省60元票價,因沒帶電子票證沒省到。獅甲國中姓男同學說,自己天天坐捷運上下學,開心三個月免費搭乘。<U+00A0>在高雄客運總站方面,《蘋果》也隨機問十名等待搭車者。也有八位民眾知此抗空污免費搭車措施,另兩人不知道。雄工李姓男同學說,他不知道今起可免費搭公車,他3個月可省1千多元車資,會存下來,等18歲買機車,記者問他這樣不是不環保了嗎?他回「我本來就不是為環保來搭公車。」<U+00A0>另名雄女蔡姓學生也說,她也是不是因免費來搭公車,覺得這時候辦免費搭大眾運輸,有令人感覺是選舉快到了。來自台南陸小姐與二位朋友則指,肯定高雄免費搭大眾運輸抗空污做法,「台南空氣也很糟,應該比照辦理。」<U+00A0>高雄捷運指為配合高雄市政府空品不良時期搭乘大眾運輸尖峰時段免費方案,今日上午尖峰時段(06:30~08:30)全線進站2萬6970人次,較上周五進站2萬3898人次,共增加3072人次,增幅約13%。(吳慧芬、侯昌騰/高雄報導)<U+00A0><U+00A0>
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  美妝保養品牌品木宣言,秉持愛護自然理念與堅守保護環境承諾,現已在全球種植超過50萬顆樹,而隨著聖誕節到來,今年推出的聖誕禮盒,當然也跟守護環境有關。每年兵家必爭的聖誕新品,除美妝品大玩花招外,其實味道舒心的香氛品在冷冬的浪漫季節內,彷彿為冬日注入一股暖流,也是人氣夯品。品木宣言今年聖誕新品就特別引進三款限量FEEL GOOD聖誕蠟燭,預計於12/2開始販售,味道分別有松木柑橘蠟(綠)、佛手柑薑味(紅)、橙花檸檬草(藍)等,除造型討喜外,玻璃更是以再利用的回收酒瓶製成,此外,品木宣言也與Global Releaf合作Plant A Tree計畫,若消費者購買此系列綠色款的松木柑橘蠟燭任一枚,品木宣言將捐款給Global Releaf種一顆樹,為地球盡份心力,讓人買的更添暖心。(吳佳蓁/綜合報導)
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        (新增道瓊期指表現)投資人對美國改革稅制前景的看法轉趨樂觀,帶動美國股市周四收高,道瓊工業指數上漲逾330點,衝破24000點大關,是史上首見,S&P 500指數也創新62.95點或0.6%,高紀錄,帶動台股跳空開高,展開反攻,加權指數盤初上漲62.95點或0.6%,一度來到10623.39點。美股道瓊指數連2日創新高主要受稅改預期激勵,但台灣時間今早約8時傳出美國參議院暫停就稅改法案表決,直到周五上午11點(台灣周六凌晨12點),衝擊道瓊期指急跌,亞洲盤初一度下挫59點或0.24%至24215點。(財經中心/綜合報導)發稿時間:09:15修改時間:09:45<U+00A0>
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 (新增影片)本季76人新秀西蒙斯(Ben Simmons)爆紅,並且被比為「詹姆士(LeBron James)接班人」,根據《運動畫刊》報導,因為詹皇和西蒙斯同個經紀人,他對這位小老弟一直很照顧,早在西蒙斯17歲那年,詹皇就曾約西蒙斯一起訓練。<U+00A0>西蒙斯17歲時,曾參加詹姆士訓練營,當時詹皇看西蒙斯就像照鏡子一樣,愈看愈中意,他告訴西蒙斯:「明天早上6點半在球館見!」結果西蒙斯這小伙子也很長進,隔天凌晨4點半就出現在球場開始熱身了。<U+00A0>詹姆士曾勉勵西蒙斯說:「你有機會超越我,但你不能走捷徑,你要夠努力才行。」讓西蒙斯相當感動,他說:「詹姆士之所以偉大的一部份原因,就是他希望別人和他一樣偉大。」<U+00A0><U+00A0> (廖柏璋/綜合報導)出版時間:0721影片更新:0943<U+00A0>
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        Google公布台灣地區Google Play 2017年度最佳App精選名單,台灣開發者有夠威,以《記帳城市》奪下年度最佳應用程式,也是第一次有國產App同時強勢攻入香港、韓國、印度等地區榜單,向國際大展台灣軟體實力。<U+00A0>台灣人自製的《記帳城市》,將記帳結合遊戲功能,透過簡單快速的記錄開銷,就能打造專屬Q版城市,記越多、蓋越多,建築物還可升級,促使玩家養成記帳習慣。<U+00A0>台灣之光不僅如此,這次Google公布最佳榜單共36款應用程式、36款遊戲,其中國產App就包辦14個席次。以下4款國產App還雙雙入選台灣和香港的榜單,包含隨拍隨剪的《威力酷剪》、擁有超過15萬道食譜的《iCook愛料理》、素顏也能自拍的《玩美彩妝》,以及可邊玩遊戲邊回訊、邊看影片邊聊天的《神回覆》。(黃韻文/台北報導)
##                                                                        url
## 1       https://tw.entertainment.appledaily.com/realtime/20171201/1251316/
## 2           https://tw.news.appledaily.com/life/realtime/20171201/1251310/
## 3 https://tw.lifestyle.appledaily.com/lifestyle/realtime/20171201/1251222/
## 4             https://tw.finance.appledaily.com/realtime/20171201/1251315/
## 5              https://tw.sports.appledaily.com/realtime/20171201/1251285/
## 6    https://tw.lifestyle.appledaily.com/gadget/realtime/20171201/1251009/
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.2
## Loading required package: jiebaRD
# Open the jiebaR user dictionary for editing.
# NOTE(review): this looks like an interactive step -- confirm before running
# the script unattended.
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
# Segmentation worker created with jiebaR's default settings.
mixseg <- worker()
# Tokenise every article body; the result is a list with one token vector per
# row of applenews.
apple.seg <- lapply(applenews$content, segment, jiebar = mixseg)
# Show the tokens of the first article.
apple.seg[1]
## [[1]]
##   [1] "張艾嘉"   "執導"     "的"       "相愛"     "相親"     "獲"      
##   [7] "本屆"     "金馬獎"   "7"        "項"       "提名"     "全"      
##  [13] "摃龜"     "她"       "個人"     "入圍"     "導演"     "和"      
##  [19] "影后"     "2"        "項皆以"   "1"        "票之差"   "落榜"    
##  [25] "全軍覆沒" "抱憾"     "心情"     "又"       "被"       "爆料"    
##  [31] "過往"     "情事"     "今日"     "出版"     "的"       "時報"    
##  [37] "周刊"     "報導"     "由"       "資深"     "媒體"     "人倪"    
##  [43] "有"       "純"       "執筆"     "資深"     "電影"     "人"      
##  [49] "江文雄"   "的"       "自"       "傳書"     "江文雄"   "熱愛"    
##  [55] "電影"     "50"       "年"       "當中"     "揭露"     "張艾嘉"  
##  [61] "曾"       "與"       "已故"     "總統"     "蔣經國"   "的"      
##  [67] "兒子"     "傳出"     "戀情"     "也"       "曾"       "跟"      
##  [73] "詞曲創作" "人"       "劉家"     "昌有"     "段"       "情愛"    
##  [79] "3"        "人"       "的"       "情感"     "有"       "一小"    
##  [85] "段"       "重疊"     "衍生"     "出"       "三角戀"   "最後"    
##  [91] "因"       "蔣有"     "家室"     "而"       "終結"     "張艾嘉"  
##  [97] "與"       "劉家"     "昌"       "相戀"     "2"        "年"      
## [103] "因"       "男方"     "追求"     "當時"     "已婚"     "的"      
## [109] "甄珍"     "才"       "知"       "劉家"     "昌"       "移情別戀"
## [115] "江文雄"   "感嘆"     "小妹"     "張艾嘉"   "小名"     "的"      
## [121] "感情"     "路"       "一路"     "顛簸"     "即時新聞" "中心"    
## [127] "綜合"     "報導"
# Keep only the elements of `word` that contain at least one character in the
# range U+4E00..U+9FA5; elements without such a character (for example pure
# Latin words or digits) and NA elements are dropped.
#
# Args:
#   word: Character vector of tokens.
#
# Returns:
#   The matching elements of `word`, in their original order.
removeEnglish <- function(word){
  has_cjk <- grepl('[\u4e00-\u9fa5]+', word)
  word[has_cjk]
}


library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
# Wrap the segmented articles in a tm corpus.
# NOTE(review): apple.seg is a list of token vectors rather than one string
# per document, and the printed term lists contain entries such as "今<a6>";
# this may point to an encoding/coercion problem in this step -- confirm.
s.corpus <- Corpus(VectorSource(apple.seg))
# Strip digits from every document.
doc <- tm_map(s.corpus, FUN = removeNumbers)

# Build the document-term matrix and print its summary.
dtm <- DocumentTermMatrix(doc)
dtm
## <<DocumentTermMatrix (documents: 900, terms: 29525)>>
## Non-/sparse entries: 107418/26465082
## Sparsity           : 100%
## Maximal term length: 19
## Weighting          : term frequency (tf)
# Terms whose total count across all documents is at least 300.
findFreqTerms(dtm, lowfreq = 300)
##  [1] "中心"   "報導"   "綜合"   "自己"   "蘋果"   "今<a6>" "台灣"  
##  [8] "時間"   "台<a5>" "國際"   "可以"   "沒有"   "表示"   "新聞"  
## [15] "公司"   "警方"
# Terms whose total count lies between 100 and 300.
findFreqTerms(dtm, lowfreq = 100, highfreq = 300)
##   [1] "出版"   "最後"   "媒體"   "當時"   "不是"   "今天"   "民眾"  
##   [8] "同時"   "所有"   "明<a6>" "朋友"   "知道"   "空氣"   "政府"  
##  [15] "這樣"   "等級"   "學生"   "應該"   "全球"   "合<a7>" "特別"  
##  [22] "推出"   "超過"   "美國"   "新增"   "出<b2>" "希望"   "更新"  
##  [29] "根據"   "就是"   "結果"   "影片"   "以及"   "地區"   "其中"  
##  [36] "第<a4>" "這次"   "透過"   "大家"   "小時"   "工<a7>" "已經"  
##  [43] "日前"   "可能"   "企<b7>" "我們"   "所以"   "指出"   "真的"  
##  [50] "勞工"   "無法"   "萬元"   "目前"   "是<a7>" "甚至"   "接受"  
##  [57] "教練"   "除了"   "中國"   "方式"   "他們"   "去<a6>" "其他"  
##  [64] "服務"   "看到"   "這個"   "發<b2>" "影響"   "雖然"   "日本"  
##  [71] "不要"   "不<b7>" "只是"   "如果"   "問題"   "這些"   "部分"  
##  [78] "進行"   "對於"   "需要"   "市場"   "活動"   "國家"   "女子"  
##  [85] "不同"   "未來"   "安全"   "針對"   "什麼"   "過去"   "不過"  
##  [92] "健康"   "持續"   "教<a8>" "調查"   "還有"   "還是"   "人員"  
##  [99] "至於"   "使用"   "社<b7>" "員工"   "提供"   "發展"   "內容"  
## [106] "相關"   "男子"   "發生"   "億元"   "造成"   "電報導" "新<a5>"
## [113] "照片"   "要求"   "網友"   "要看"   "李婉鈺" "法官"   "手機"
# Terms whose correlation with "勞工" across documents is at least 0.7.
findAssocs(dtm, terms = "勞工", corlimit = 0.7)
## $勞工
## 工時 長短 
## 0.71 0.71
# Same query for the name "李婉鈺".
findAssocs(dtm, terms = "李婉鈺", corlimit = 0.7)
## $李婉鈺
## 二樓住戶     上邊 分不<b6>     文家   代<b6>     多說 有民眾向     而入 
##     0.76     0.76     0.76     0.76     0.76     0.76     0.76     0.76 
##     每戶     其門     後進     洽當     哪戶     除狂 備受矚目     發稱 
##     0.76     0.76     0.76     0.76     0.76     0.76     0.76     0.76 
##     進張     催化     搶密     輕輕     還嚼     警送   警<b6>     國宅 
##     0.76     0.76     0.76     0.76     0.76     0.76     0.76     0.75 
##     電鈴 李婉鈺到     鄰長 
##     0.75     0.71     0.71
# Matrix size before pruning (documents x terms).
dim(dtm)
## [1]   900 29525
# Prune with a maximal allowed sparsity of 0.99, then compare the new size.
dtm.remove <- removeSparseTerms(dtm, sparse = 0.99)
dim(dtm.remove)
## [1]  900 2023
# Peek at the first few terms kept after sparsity pruning.
head(dtm.remove$dimnames$Terms)
## [1] "已婚"     "中心"     "今日"     "心情"     "出版"     "即時新聞"
# Print the complete list of retained terms.
dtm.remove$dimnames$Terms
##    [1] "已婚"       "中心"       "今日"       "心情"       "出版"      
##    [6] "即時新聞"   "兒子"       "個人"       "追求"       "最後"      
##   [11] "報導"       "媒體"       "提名"       "揭露"       "傳出"      
##   [16] "感情"       "當中"       "當時"       "資<b2>"     "過往"      
##   [21] "電影"       "綜合"       "導演"       "爆料"       "三個"      
##   [26] "下午"       "下來"       "上午"       "上周五"     "大眾"      
##   [31] "不<a8>"     "不知"       "不是"       "今天"       "今早"      
##   [36] "方面"       "方案"       "月底"       "令人"       "冬季"      
##   [41] "另名"       "台南"       "市府"       "本來"       "民眾"      
##   [46] "全面"       "全數"       "同時"       "成長"       "自己"      
##   [51] "免費"       "呈<b2>"     "改善"       "李姓"       "來自"      
##   [56] "兩人"       "刻意"       "所有"       "明<a6>"     "朋友"      
##   [61] "狀態"       "知道"       "空污"       "空<ab>"     "空氣"      
##   [66] "肯定"       "政府"       "是不是"     "紅色警戒"   "美麗"      
##   [71] "展<b6>"     "時期"       "記者"       "配合"       "高市"      
##   [76] "高雄"       "高雄市"     "做法"       "措施"       "族群"      
##   [81] "條件"       "統計"       "這樣"       "幾天"       "普通"      
##   [86] "等待"       "等級"       "黃色"       "感覺"       "搭乘"      
##   [91] "詢問"       "資料"       "電子"       "增<a5>"     "學生"      
##   [96] "機車"       "選<c1>"     "應該"       "環保"       "環保局"    
##  [101] "環保署"     "蘋果"       "覺得"       "人氣"       "今<a6>"    
##  [106] "分別"       "引進"       "全球"       "合<a7>"     "有關"      
##  [111] "此<a5>"     "自然"       "利用"       "更是"       "每<a6>"    
##  [116] "系列"       "其實"       "味道"       "保<c5>"     "玻璃"      
##  [121] "計畫"       "限量"       "捐款"       "消費者"     "特別"      
##  [126] "推出"       "理念"       "造型"       "超過"       "當然"      
##  [131] "聖誕"       "聖誕<b8>"   "預計"       "綠色"       "隨著"      
##  [136] "環境"       "購買"       "上漲"       "工<b7>"     "主要"      
##  [141] "台灣"       "投資人"     "改革"       "亞洲"       "來到"      
##  [146] "周六"       "周<a5>"     "直到"       "表<b2>"     "前景"      
##  [151] "指數"       "看法"       "紀錄"       "美國"       "修改"      
##  [156] "凌晨"       "時間"       "帶動"       "創新"       "發稿"      
##  [161] "新增"       "預期"       "樂<c6>"     "衝擊"       "不能"      
##  [166] "他說"       "出<b2>"     "本季"       "早上"       "別人"      
##  [171] "努力"       "告訴"       "希望"       "更新"       "並且"      
##  [176] "明天"       "相當"       "原<a6>"     "根據"       "訓練"      
##  [181] "參<a5>"     "就是"       "結果"       "超越"       "感動"      
##  [186] "照顧"       "經紀人"     "運動"       "隔天"       "影片"      
##  [191] "機<b7>"     "app"        "google"     "公布"       "升級"      
##  [196] "以下"       "以及"       "台<a5>"     "打造"       "印度"      
##  [201] "名單"       "地區"       "快速"       "其中"       "城市"      
##  [206] "威力"       "香港"       "記錄"       "國際"       "專屬"      
##  [211] "第<a4>"     "聊天"       "軟體"       "這次"       "透過"      
##  [216] "最佳"       "結合"       "遊戲"       "實力"       "擁有"      
##  [221] "還可"       "韓國"       "簡單"       "人力"       "千元"      
##  [226] "大家"       "大概"       "小時"       "工<a7>"     "已經"      
##  [231] "之前"       "之後"       "公告"       "分析"       "日前"      
##  [236] "主管"       "以上"       "以前"       "以後"       "出來"      
##  [241] "可以"       "可能"       "市長"       "必須"       "正在"      
##  [246] "企<b7>"     "再次"       "收入"       "老闆"       "而且"      
##  [251] "至少"       "行政<b0>"   "但是"       "我們"       "更<a5>"    
##  [256] "每月"       "沒有"       "所以"       "狀況"       "表示"      
##  [261] "很多"       "指出"       "為何"       "要是"       "個月"      
##  [266] "拿出"       "拿到"       "旁邊"       "真的"       "強調"      
##  [271] "這種"       "勞工"       "幾乎"       "減少"       "無法"      
##  [276] "給予"       "萬元"       "經營"       "達到"       "違法"      
##  [281] "說法"       "銀行"       "需求"       "論壇"       "擔任"      
##  [286] "賴清德"     "聽到"       "女友"       "月初"       "主動"      
##  [291] "目前"       "任何人"     "全<a5>"     "好像"       "成<a5>"    
##  [296] "每個"       "男友"       "受到"       "於是"       "明確"      
##  [301] "爭取"       "建議"       "是<a7>"     "甚至"       "時候"      
##  [306] "紐約"       "帶領"       "從未"       "接受"       "教練"      
##  [311] "球隊"       "畢竟"       "曾經"       "發文"       "想過"      
##  [316] "還在"       "還說"       "職務"       "職<b7>"     "關<aa>"    
##  [321] "矚目"       "之<a5>"     "日<b0>"     "只要"       "用戶"      
##  [326] "立刻"       "同樣"       "即可"       "宣布"       "指定"      
##  [331] "除了"       "專案"       "帶<a6>"     "授權"       "最高"      
##  [336] "最新"       "搭配"       "輕鬆"       "價格"       "銷售"      
##  [341] "獲得"       "十分"       "女兒"       "小心"       "已有"      
##  [346] "不足"       "中國"       "太太"       "方式"       "父母"      
##  [351] "他們"       "出生"       "去<a6>"     "先前"       "再度"      
##  [356] "各項"       "各種"       "自行"       "完成"       "每次"      
##  [361] "沒想到"     "其他"       "協商"       "承<bb>"     "服務"      
##  [366] "法律"       "治療"       "非<b1>"     "孩子"       "指控"      
##  [371] "看到"       "家屬"       "高興"       "情況"       "眼睛"      
##  [376] "這是"       "這個"       "最近"       "報告"       "就要"      
##  [381] "發<b2>"     "超級"       "媽媽"       "想要"       "解決"      
##  [386] "影響"       "據悉"       "積極"       "檢查"       "檢視"      
##  [391] "雖然"       "醫<b0>"     "醫療"       "顯示"       "不錯"      
##  [396] "文化"       "日本"       "生日"       "生活"       "交流"      
##  [401] "全<b5>"     "官方"       "東京"       "後來"       "穿著"      
##  [406] "高中"       "假日"       "喜歡"       "棒球"       "維持"      
##  [411] "學校"       "釋出"       "appledaily" "com"        "了嗎"      
##  [416] "大學"       "不用"       "不到"       "不要"       "不僅"      
##  [421] "不<b7>"     "不<c2>"     "反應"       "尤其"       "引發"      
##  [426] "方法"       "只是"       "正面"       "立委"       "如何"      
##  [431] "如果"       "而言"       "至今"       "助理"       "即使"      
##  [436] "即時"       "投稿"       "明顯"       "面對"       "候選人"    
##  [441] "恐怕"       "效果"       "時代"       "根本"       "真正"      
##  [446] "真實"       "討論"       "追問"       "問題"       "教師"      
##  [451] "產生"       "第二"       "第三"       "處理"       "被<ae>"    
##  [456] "這些"       "通知"       "部分"       "最大"       "無<a9>"    
##  [461] "絕對"       "越來越"     "進行"       "想法"       "新聞"      
##  [466] "當天"       "當事人"     "電話"       "對手"       "對於"      
##  [471] "網路"       "需要"       "播出"       "確實"       "課<b5>"    
##  [476] "興趣"       "選擇"       "獲利"       "聯絡"       "避免"      
##  [481] "難以"       "蘋果日報"   "歡迎"       "變成"       "facebook"  
##  [486] "https"      "www"        "比較"       "以來"       "台幣"      
##  [491] "市場"       "平衡"       "考<bc>"     "來看"       "來說"      
##  [496] "或是"       "所謂"       "近來"       "長期"       "活動"      
##  [501] "研究"       "美元"       "記者<b7>"   "國家"       "提升"      
##  [506] "集團"       "新台幣"     "經濟"       "董事長"     "裡面"      
##  [511] "運<a7>"     "預測"       "談到"       "機構"       "趨勢"      
##  [516] "競爭"       "女子"       "不同"       "公斤"       "生涯"      
##  [521] "由於"       "我國"       "直播"       "金牌"       "則是"      
##  [526] "唯<a4>"     "尋求"       "傳統"       "奧運"       "臺灣"      
##  [531] "機率"       "選手"       "體<a8>"     "不夠"       "不得"      
##  [536] "不讓"       "反映"       "方向"       "以<a5>"     "左右"      
##  [541] "未來"       "目標"       "立法<b0>"   "地點"       "安全"      
##  [546] "行經"       "行駛"       "即便"       "妨<ae>"     "車道"      
##  [551] "車輛"       "修正"       "修法"       "針對"       "停車"      
##  [556] "執行"       "救<c5>"     "條例"       "處<bb>"     "規定"      
##  [561] "通過"       "最快"       "提高"       "進入"       "僅有"      
##  [566] "準備"       "路口"       "道路"       "遇到"       "管理"      
##  [571] "適當"       "導致"       "隨時"       "上網"       "什麼"      
##  [576] "公司"       "公<b6>"     "方便"       "申請"       "全民"      
##  [581] "全部"       "她說"       "如今"       "政治"       "查詢"      
##  [586] "看待"       "值得"       "草案"       "基於"       "推動"      
##  [591] "教授"       "理由"       "進步"       "經過"       "落實"      
##  [596] "過去"       "鼓勵"       "團體"       "監督"       "質疑"      
##  [601] "不過"       "不適"       "中南部"     "天氣"       "戶<a5>"    
##  [606] "只有"       "全台"       "全都"       "在內"       "如有"      
##  [611] "完全"       "依據"       "前天"       "南部"       "咳嗽"      
##  [616] "室內"       "屏東"       "昨天"       "紅色"       "紅<ae>"    
##  [621] "留在"       "健康"       "敏感"       "傍晚"       "提醒"      
##  [626] "測站"       "嘉南"       "監測網"     "增強"       "橘色"      
##  [631] "嚴重"       "警示"       "不佳"       "中央氣象局" "中部"      
##  [636] "各地"       "衣物"       "季風"       "宜蘭"       "東<a5>"    
##  [641] "近日"       "持續"       "為主"       "氣溫"       "基隆"      
##  [646] "較大"       "整天"       "變化"       "已成"       "公益"      
##  [651] "支持"       "比例"       "好手"       "成立"       "位置"      
##  [656] "改變"       "兒童"       "相當於"     "科學"       "原本"      
##  [661] "退休"       "教<a8>"     "畢<b7>"     "最愛"       "發表"      
##  [666] "資訊"       "對象"       "調查"       "還有"       "曝光"      
##  [671] "大幅"       "以內"       "另<a5>"     "正式"       "交易"      
##  [676] "同意"       "在於"       "安排"       "告知"       "投手"      
##  [681] "究竟"       "信心"       "負擔"       "消息"       "球員"      
##  [686] "話題"       "億美元"     "儘管"       "聯盟"       "還是"      
##  [691] "出席"       "私人"       "店內"       "法國"       "空間"      
##  [696] "表演"       "音樂"       "留下"       "將在"       "晚間"      
##  [701] "期間"       "結束"       "雙方"       "土地"       "支付"      
##  [706] "出租"       "地產"       "享受"       "所得"       "限制"      
##  [711] "留意"       "粉絲團"     "國民"       "符合"       "登記"      
##  [716] "適用"       "優惠"       "營<b7>"     "聯合"       "人們"      
##  [721] "人員"       "中華民國"   "只能"       "母親"       "立法"      
##  [726] "全身"       "有人"       "有點"       "至於"       "衣服"      
##  [731] "完整"       "協助"       "協<b7>"     "幸好"       "後續"      
##  [736] "指示"       "故事"       "毒<ab>"     "相信"       "致死"      
##  [741] "晚<c0>"     "理事長"     "責任"       "嫌犯"       "對待"      
##  [746] "慘遭"       "辦理"       "幫忙"       "還被"       "醫<c5>"    
##  [751] "露出"       "權益"       "了解"       "人口"       "人數"      
##  [756] "上任"       "工<b5>"     "不見"       "反而"       "可能性"    
##  [761] "未能"       "交通"       "地方"       "似乎"       "困難"      
##  [766] "技術"       "每天"       "系統"       "角度"       "事實上"    
##  [771] "使用"       "使用者"     "例如"       "到底"       "或<b3>"    
##  [776] "的話"       "直接"       "社<b7>"     "律師"       "政策"      
##  [781] "看<b0>"     "面<c1>"     "首先"       "員工"       "租屋"      
##  [786] "做出"       "強化"       "產<b7>"     "規劃"       "就<b7>"    
##  [791] "就算"       "提出"       "提供"       "然而"       "發言人"    
##  [796] "發展"       "評估"       "概念"       "路上"       "路線"      
##  [801] "嘗試"       "實際"       "說出"       "適合"       "類似"      
##  [806] "下去"       "下班"       "大部分"     "內容"       "內部"      
##  [811] "分鐘"       "多數"       "事情"       "初步"       "很少"      
##  [816] "是從"       "相同"       "相關"       "從小"       "這<a4>"    
##  [821] "這麼"       "部門"       "單純"       "幾個"       "測試"      
##  [826] "然後"       "當<a6>"     "經理"       "團隊"       "鎖定"      
##  [831] "變得"       "引<b0>"     "估計"       "金門"       "家長"      
##  [836] "捐贈"       "疾病"       "基金<b7>"   "費用"       "管制"      
##  [841] "女人"       "不<a4>"     "不敢"       "分享"       "以為"      
##  [846] "地說"       "老婆"       "床上"       "男子"       "身上"      
##  [851] "那些"       "那麼"       "兩個"       "房間"       "段時間"    
##  [856] "突然"       "家中"       "馬上"       "做好"       "晚上"      
##  [861] "逐漸"       "發生"       "經歷"       "經驗"       "補助"      
##  [866] "確<bb>"     "罹患"       "還沒"       "還<b7>"     "千萬"      
##  [871] "大獎"       "日至"       "台中市"     "台南市"     "依照"      
##  [876] "官員"       "重新"       "食<ab>"     "桃園市"     "統<a4>"    
##  [881] "累計"       "順利"       "福利"       "億元"       "廣告"      
##  [886] "獎金"       "優先"       "上市"       "代理"       "正<b1>"    
##  [891] "生產"       "同步"       "享有"       "店家"       "版本"      
##  [896] "查看"       "售價"       "終於"       "提前"       "辦公室"    
##  [901] "決定"       "取消"       "官網"       "國道"       "造成"      
##  [906] "無論"       "發出"       "人家"       "下車"       "上周"      
##  [911] "不滿"       "本案"       "身<a4>"     "周二"       "將他"      
##  [916] "排除"       "這場"       "當地"       "當場"       "電報導"    
##  [921] "隨後"       "檢方"       "警方"       "全新"       "行車"      
##  [926] "專<b7>"     "得到"       "最低"       "智慧"       "網站"      
##  [931] "操<a7>"     "獨家"       "體驗"       "合約"       "成為"      
##  [936] "自由"       "金額"       "拿下"       "效力"       "救援"      
##  [941] "球季"       "最多"       "萬美元"     "可見"       "快來"      
##  [946] "拍攝"       "昨日"       "看見"       "脈動"       "區域"      
##  [951] "清楚"       "這段"       "畫面"       "跟上"       "任何"      
##  [956] "位於"       "為止"       "紛紛"       "規模"       "最終"      
##  [961] "感受"       "當局"       "數據"       "調整"       "小孩"      
##  [966] "中華"       "出發"       "同期"       "即日<b0>"   "每日"      
##  [971] "每周"       "並將"       "來台"       "來<a6>"     "協議"      
##  [976] "抵達"       "前往"       "飛行"       "旅客"       "時<b3>"    
##  [981] "桃園"       "航空"       "副<c1>"     "採用"       "理想"      
##  [986] "距離"       "搭載"       "新<a5>"     "照片"       "義大利"    
##  [991] "達成"       "德國"       "歐洲"       "整體"       "機場"      
##  [996] "澳洲"       "興奮"       "優質"       "營運"       "擴大"      
## [1001] "職棒"       "人權"       "女性"       "吸引"       "身為"      
## [1006] "事件"       "的確"       "宣稱"       "思考"       "能夠"      
## [1011] "最好"       "勞動"       "感到"       "選民"       "人妻"      
## [1016] "不好"       "吃飯"       "批評"       "直呼"       "要求"      
## [1021] "乾淨"       "違反"       "網友"       "趕緊"       "主持人"    
## [1026] "多次"       "依然"       "採訪"       "證明"       "大樓"      
## [1031] "工廠"       "公<a6>"     "可望"       "全<a6>"     "有效"      
## [1036] "汽車"       "客戶"       "挑戰"       "看好"       "計算"      
## [1041] "訂單"       "訊息"       "接獲"       "產<ab>"     "通訊"      
## [1046] "硬體"       "照<c5>"     "預估"       "領域"       "數字"      
## [1051] "數位"       "確保"       "應用"       "土城"       "中央"      
## [1056] "分局"       "找到"       "到案"       "知名"       "看了"      
## [1061] "突發"       "要看"       "案情"       "偵訊"       "這則"      
## [1066] "落網"       "疑似"       "監視器"     "說明"       "調<be>"    
## [1071] "遭人"       "縱火"       "釐清"       "鵝肉"       "關係"      
## [1076] "上映"       "之間"       "正義"       "此時"       "即將"      
## [1081] "周<a6>"     "計劃"       "粉絲"       "執行長"     "細<b8>"    
## [1086] "透露"       "創意"       "創辦人"     "經典"       "飾演"      
## [1091] "演員"       "出賽"       "打擊"       "名字"       "更好"      
## [1096] "身份"       "具有"       "制度"       "首次"       "首度"      
## [1101] "剛好"       "得獎"       "隊友"       "意義"       "資格"      
## [1106] "寫下"       "人民幣"     "不料"       "元台幣"     "公分"      
## [1111] "只好"       "平均"       "任務"       "在家"       "多<a4>"    
## [1116] "好友"       "好看"       "老公"       "老師"       "男女"      
## [1121] "身材"       "房子"       "所幸"       "爸爸"       "很難"      
## [1126] "怎樣"       "科技"       "無罪"       "痛苦"       "感謝"      
## [1131] "過<b5>"     "製<a7>"     "難過"       "人生"       "大約"      
## [1136] "平日"       "多<a6>"     "收到"       "呼<c6>"     "花蓮"      
## [1141] "南下"       "特色"       "堅持"       "規範"       "喝酒"      
## [1146] "幾<a6>"     "意<a5>"     "撞擊"       "鄰<a9>"     "頭部"      
## [1151] "騎士"       "警察"       "上前"       "公務"       "市議員"    
## [1156] "本周"       "民進黨"     "交保"       "合理"       "地檢署"    
## [1161] "行為"       "住戶"       "李婉鈺"     "事後"       "坦承"      
## [1166] "板橋"       "建築"       "研究所"     "捍衛"       "酒後"      
## [1171] "移送"       "連任"       "報警"       "無效"       "逮捕"      
## [1176] "當選"       "解釋"       "電鈴"       "對方"       "態度"      
## [1181] "藝人"       "競選"       "議員"       "議<b7>"     "警局"      
## [1186] "警員"       "iphone"     "考量"       "低於"       "形成"      
## [1191] "持有"       "相較"       "相對"       "風險"       "進<a4>"    
## [1196] "黑色"       "賣出"       "歷史"       "鴻海"       "人<a4>"    
## [1201] "上傳"       "不少"       "介紹"       "父親"       "主任"      
## [1206] "付出"       "他<b7>"     "好好"       "有趣"       "自拍"      
## [1211] "身邊"       "到場"       "放在"       "卻是"       "重要"      
## [1216] "核心"       "記得"       "動<a7>"     "問到"       "這項"      
## [1221] "創造"       "電視"       "精神"       "維<c5>"     "價值"      
## [1226] "機關"       "螢幕"       "還能"       "藝術"       "迅速"      
## [1231] "周三"       "附近"       "套房"       "像是"       "在<a5>"    
## [1236] "得以"       "連續"       "代表"       "抗議"       "得知"      
## [1241] "組織"       "痛批"       "口罩"       "不然"       "少女"      
## [1246] "犯案"       "多少"       "走向"       "兩地"       "店裡"      
## [1251] "性侵"       "威脅"       "昨晚"       "美女"       "家庭"      
## [1256] "時說"       "追查"       "被控"       "就醫"       "筆錄"      
## [1261] "進來"       "愛情"       "賓士"       "整個"       "應是"      
## [1266] "獲報"       "離去"       "離<b6>"     "驚訝"       "滿足"      
## [1271] "影音"       "公寓"       "多名"       "住家"       "家人"      
## [1276] "陸續"       "整理"       "瞬間"       "不想"       "分手"      
## [1281] "他人"       "交往"       "在場"       "抱歉"       "個性"      
## [1286] "對<a5>"     "臉書"       "讀者"       "不管"       "天空"      
## [1291] "手段"       "台中"       "台電"       "危機"       "如此"      
## [1296] "存在"       "兩天"       "兩<a6>"     "恢<b4>"     "昨在"      
## [1301] "等於"       "隨即"       "壓力"       "轉型"       "人士"      
## [1306] "白色"       "身分"       "南韓"       "背景"       "負責"      
## [1311] "高層"       "駕駛"       "學習"       "醫師"       "證實"      
## [1316] "屬於"       "主持"       "決心"       "典禮"       "能力"      
## [1321] "採購"       "階段"       "親自"       "大量"       "火星"      
## [1326] "以往"       "另有"       "列入"       "此次"       "行動"      
## [1331] "投入"       "受訪"       "委員"       "很大"       "飛彈"      
## [1336] "容易"       "恐怖"       "高級"       "停止"       "專家"      
## [1341] "將以"       "強烈"       "接下來"     "這時"       "部長"      
## [1346] "發射"       "經<b1>"     "領導人"     "模式"       "藉此"      
## [1351] "大型"       "之<a4>"     "另<a4>"     "本身"       "多元"      
## [1356] "房屋"       "東西"       "社區"       "表達"       "長達"      
## [1361] "保留"       "參考"       "這件"       "創<a7>"     "象徵"      
## [1366] "裝置"       "慢慢"       "邀請"       "顏色"       "藝術家"    
## [1371] "不符"       "出庭"       "台大"       "刑事"       "有所"      
## [1376] "判決"       "拒絕"       "法官"       "法<b0>"     "重大"      
## [1381] "原來"       "原則"       "案件"       "特定"       "參與"      
## [1386] "將由"       "被告"       "期待"       "審判"       "審理"      
## [1391] "證據"       "川普"       "收盤"       "終場"       "創下"      
## [1396] "標準"       "怎麼"       "高達"       "商<b7>"     "辦法"      
## [1401] "充滿"       "可愛"       "她們"       "成員"       "形象"      
## [1406] "保持"       "演出"       "股價"       "三人"       "友人"      
## [1411] "打人"       "地上"       "並不"       "並無"       "受傷"      
## [1416] "消費"       "送醫"       "逃逸"       "清晨"       "陪同"      
## [1421] "傷勢"       "意識"       "衝突"       "不慎"       "交由"      
## [1426] "動物"       "影像"       "人體"       "有助於"     "來源"      
## [1431] "食物"       "國人"       "盡量"       "幫助"       "失去"      
## [1436] "搜索"       "the"        "不當"       "仍然"       "其餘"      
## [1441] "受<ae>"     "信任"       "接觸"       "提到"       "傷<ae>"    
## [1446] "道歉"       "聲明"       "力量"       "少<a6>"     "主委"      
## [1451] "主席"       "付費"       "出面"       "失<b7>"     "立場"      
## [1456] "各國"       "地下"       "多處"       "早就"       "死者"      
## [1461] "坐在"       "辛苦"       "取得"       "委員<b7>"   "抱怨"      
## [1466] "放棄"       "爭議"       "金管<b7>"   "保障"       "查獲"      
## [1471] "洲際飛彈"   "研判"       "突<af>"     "重點"       "弱勢"      
## [1476] "特殊"       "秘書長"     "酒駕"       "高度"       "參選"      
## [1481] "國內"       "基金"       "專訪"       "將近"       "帳戶"      
## [1486] "情形"       "陷入"       "勞基法"     "單位"       "提案"      
## [1491] "意見"       "搶救"       "碰到"       "萬人"       "經費"      
## [1496] "運用"       "電力"       "實在"       "實施"       "彰化"      
## [1501] "截至"       "緊急"       "緊張"       "衛福部"     "駕車"      
## [1506] "整合"       "點名"       "鬆綁"       "穩定"       "關鍵"      
## [1511] "權利"       "合法"       "自家"       "言論"       "看似"      
## [1516] "涉及"       "將<b7>"     "焦點"       "登場"       "資源"      
## [1521] "境內"       "錯<bb>"     "不但"       "固定"       "球團"      
## [1526] "比賽"       "助攻"       "攻下"       "垃圾"       "想不到"    
## [1531] "有些"       "死<a4>"     "我<b7>"     "近<a6>"     "看看"      
## [1536] "國家隊"     "最強"       "隊長"       "熱門"       "確定"      
## [1541] "擔心"       "人民"       "不可"       "不如"       "互相"      
## [1546] "互動"       "夫妻"       "支出"       "主打"       "失敗"      
## [1551] "未料"       "生意"       "立<b0>"     "休假"       "百萬"      
## [1556] "而已"       "自爆"       "低調"       "投資"       "改為"      
## [1561] "身體"       "事實"       "依舊"       "刺激"       "妻子"      
## [1566] "幸福"       "明星"       "金融"       "保證"       "宣傳"      
## [1571] "若有"       "重視"       "風波"       "書上"       "涉嫌"      
## [1576] "留言"       "酒店"       "偵辦"       "國民黨"     "接著"      
## [1581] "訪問"       "尊重"       "短短"       "策略"       "園區"      
## [1586] "溝通"       "資金"       "違規"       "預算"       "對此"      
## [1591] "對話"       "舞台"       "錄影"       "檢討"       "爆發"      
## [1596] "關於"       "今晨"       "發布"       "鄰近"       "基本"      
## [1601] "清淨機"     "愛心"       "毛愛"       "理解"       "圖片"      
## [1606] "編輯組"     "instagram"  "各界"       "成績"       "跡象"      
## [1611] "瘋狂"       "還要"       "大小"       "平台"       "兩次"      
## [1616] "狗狗"       "帶來"       "手機"       "將於"       "購物"      
## [1621] "工具"       "近期"       "機器"       "而是"       "成交"      
## [1626] "行情"       "個案"       "發言"       "公園"       "如同"      
## [1631] "行<b5>"     "並在"       "風格"       "推薦"       "景點"      
## [1636] "漸漸"       "廣場"       "預定"       "哪裡"       "警告"      
## [1641] "本人"       "局長"       "這裡"       "家裡"       "笑說"      
## [1646] "旗下"       "地方法<b0>" "法庭"       "送往"       "這名"      
## [1651] "正確"       "有民眾"     "帳號"       "女生"       "旅遊"      
## [1656] "泰國"       "當下"       "支援"       "領先"       "廠商"      
## [1661] "很快"       "控制"       "上海"       "大陸"       "公尺"      
## [1666] "主題"       "建<b3>"     "遊客"       "出遊"       "追撞"      
## [1671] "動畫"       "被撞"       "聯結"       "大多"       "可是"      
## [1676] "角色"       "最為"       "發<b0>"     "雜誌"       "接到"      
## [1681] "燃燒"       "大批"       "女孩"       "用心"       "自身"      
## [1686] "明白"       "建立"       "真相"       "專頁"       "議題"      
## [1691] "全案"       "指稱"       "案例"       "偵查"       "從事"      
## [1696] "檢察官"     "檢<c1>"     "獲准"       "聲請"       "羈押"      
## [1701] "投訴"       "冠軍"       "找來"       "美食"       "提早"      
## [1706] "飯店"       "危險"       "攻擊"       "路邊"       "line"      
## [1711] "上班"       "月間"       "早已"       "並未"       "依法"      
## [1716] "國內<a5>"   "陪審團"     "報案"       "提告"       "結婚"      
## [1721] "進而"       "請看"       "賺錢"       "懷疑"       "灌水"      
## [1726] "辯稱"       "不治"       "公里"       "車內"       "事故"      
## [1731] "供稱"       "返<a6>"     "前方"       "後方"       "說話"      
## [1736] "身心"       "協調"       "好<a9>"     "司機"       "成本"      
## [1741] "住處"       "基層"       "基礎"       "帶著"       "清理"      
## [1746] "蛋<bf>"     "速度"       "忍不住"     "看來"       "試射"      
## [1751] "關閉"       "友善"       "前來"       "校園"       "嘉義"      
## [1756] "熟悉"       "聲音"       "商<ab>"     "線上"       "有沒有"    
## [1761] "患者"       "不論"       "休息"       "挑選"       "情緒"      
## [1766] "娛樂"       "累積"       "成果"       "從此"       "顧客"      
## [1771] "中午"       "上訴"       "上課"       "出去"       "打算"      
## [1776] "印象"       "同<a4>"     "你們"       "判處"       "國中"      
## [1781] "接近"       "舒服"       "電腦"       "輔導"       "駁<a6>"    
## [1786] "關心"       "求助"       "聯繫"       "國<a5>"     "地<a7>"    
## [1791] "迎接"       "客人"       "攝影"       "人潮"       "不再"      
## [1796] "予以"       "打卡"       "目的"       "自主"       "利益"      
## [1801] "拍照"       "便宜"       "展示"       "展<b2>"     "執政"      
## [1806] "推廣"       "誇張"       "資產"       "彈性"       "縣長"      
## [1811] "竟然"       "使得"       "工時"       "平時"       "正是"      
## [1816] "危<ae>"     "判<c2>"     "指標"       "缺乏"       "就是說"    
## [1821] "澄清"       "衛生"       "嚴格"       "行政"       "機制"      
## [1826] "有意"       "憤而"       "發揮"       "hen"        "兄弟"      
## [1831] "好吃"       "此事"       "偷吃"       "演唱<b7>"   "鏡頭"      
## [1836] "彼此"       "提及"       "熱情"       "藉由"       "水準"      
## [1841] "扮演"       "其次"       "花費"       "降低"       "校長"      
## [1846] "評價"       "遴選"       "當初"       "自我"       "低端"      
## [1851] "難道"       "或者"       "損失"       "日子"       "出門"      
## [1856] "拍到"       "路人"       "中職"       "太多"       "見狀"      
## [1861] "貨車"       "用<c0>"     "小朋友"     "太大"       "必要"      
## [1866] "民調"       "生命"       "再來"       "呼吸"       "怎麼辦"    
## [1871] "採取"       "數量"       "燃煤"       "將是"       "監視"      
## [1876] "製造"       "好玩"       "社團"       "縣市"       "營造"      
## [1881] "本月"       "行銷"       "店員"       "單價"       "不<c4>"    
## [1886] "及其"       "立即"       "地<b0>"     "判刑"       "男性"      
## [1891] "返家"       "手術"       "召<b6>"     "有期<ae>"   "決議"      
## [1896] "屆時"       "通報"       "禁止"       "反對"       "見面"      
## [1901] "主張"       "司法"       "先生"       "訴<b3>"     "遭到"      
## [1906] "融入"       "挑<c6>"     "動手"       "罪嫌"       "賣場"      
## [1911] "picshow"    "心中"       "本片"       "未經<b3>"   "放馬過來"  
## [1916] "流量"       "唱歌跳舞"   "惡意"       "感人"       "搞笑"      
## [1921] "新知"       "徵片"       "請寄"       "轉載"       "大安"      
## [1926] "通<b1>"     "車禍"       "肇事"       "今晚"       "犯罪"      
## [1931] "曾佳俊"     "強制"       "今天下午"   "派出所"     "進度"      
## [1936] "祝福"       "分<b6>"     "事<b7>"     "公頃"       "每坪"      
## [1941] "房市"       "分<b3>"     "宣告"       "中<a6>"     "市民"      
## [1946] "今天上午"   "車上"       "並非"       "追蹤"       "意圖"      
## [1951] "監控"       "公社"       "股東"       "意<c4>"     "獨立"      
## [1956] "直言"       "想到"       "農場"       "多出"       "表格"      
## [1961] "買房"       "進場"       "新竹"       "管理費"     "下降"      
## [1966] "同仁"       "果然"       "前科"       "員警"       "打電話"    
## [1971] "謝謝"       "關懷"       "範圍"       "當晚"       "視為"      
## [1976] "體系"       "臉書按"     "審查"       "撒野"       "還躺"      
## [1981] "自動"       "打到"       "安心"       "公平"       "公正"      
## [1986] "合照"       "私下"       "評論"       "賠償"       "檢警"      
## [1991] "新莊"       "污染物"     "民間"       "全國"       "長輩"      
## [1996] "當面"       "號召"       "鎮瀾宮"     "差異"       "出爐"      
## [2001] "報名"       "犀利"       "士林"       "處分"       "見到"      
## [2006] "於今"       "籌備"       "aqi"        "西部"       "處長"      
## [2011] "二版"       "掌握"       "權力"       "來<c1>"     "轄區"      
## [2016] "民主"       "同事"       "情侶"       "本次"       "character" 
## [2021] "明哲"       "黨團"       "平<b1>"

如何判斷距離

# Two binary vectors that differ in exactly one position.
a <- c(1,1,0,0,1)
b <- c(1,0,0,0,1)
# dist() on the row-bound vectors; the recorded result below is 1.
dist(rbind(a,b))
##   a
## b 1
# Now the vectors differ in two positions.
a <- c(1,1,0,0,1)
b <- c(1,0,1,0,1)
dist(rbind(a,b))
##          a
## b 1.414214
# Same value computed by hand: square root of the summed squared differences.
sqrt(sum((a - b) ^ 2))
## [1] 1.414214
# Two texts to compare: a short headline and a longer quoted statement.
a <- '柯文哲今天去大巨蛋痛批遠雄'
b <- '柯文哲表示,遠雄已將書圖、文件送進都審會,「最後就是攤牌階段,要蓋?不蓋?能不能符合我們(指台北市政府)標準?」在在都必須討論,若遠雄無人可就大巨蛋案做決定,全案後續將有問題。'


# Segment both texts into words with a jiebaR worker.
library(jiebaR)
mixseg <- worker()
word.seg <- lapply(list(a,b), function(e) segment(e, jiebar = mixseg))
# Build a corpus and a document-term matrix from the segmented words.
# NOTE(review): Corpus() and DocumentTermMatrix() are not loaded in this
# section; presumably library(tm) was attached earlier in the file - confirm.
w.corpus <- Corpus(VectorSource(word.seg))
dtm <- DocumentTermMatrix(w.corpus)
dtm
## <<DocumentTermMatrix (documents: 2, terms: 30)>>
## Non-/sparse entries: 32/28
## Sparsity           : 47%
## Maximal term length: 5
## Weighting          : term frequency (tf)
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 30)>>
## Non-/sparse entries: 32/28
## Sparsity           : 47%
## Maximal term length: 5
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 大巨蛋 已將 不能 今天 文件 台<a5> 必須 全案 痛批 遠雄
##    1      1    0    0    1    0      0    0    0    1    1
##    2      1    1    1    0    1      1    1    1    0    1
# Distance between the two documents, computed on their raw term counts.
m <- as.matrix(dtm)
dist(m)
##          1
## 2 5.291503

計算文章距離

# Cosine distance between two small count vectors, using proxy::dist.
library(proxy)
a <- c(1,2,2,1,1,1,0)
b <- c(1,2,2,1,1,2,1)
proxy::dist(rbind(a,b), method = 'cosine')


# Load the pre-scraped Apple Daily articles from the course repository.
applenews <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/applenews.csv', stringsAsFactors = FALSE)
head(applenews)

# Segment every article body into words with a jiebaR worker.
library(jiebaR)
library(tm)
mixseg    <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, jiebar = mixseg))

# Build a corpus from the segmented articles and strip numbers.
s.corpus <- Corpus(VectorSource(apple.seg))
doc      <- tm_map(s.corpus, removeNumbers)

dtm      <- DocumentTermMatrix(doc)

# Drop very sparse terms (sparsity threshold 0.99) to shrink the matrix.
dtm.remove <- removeSparseTerms(dtm, 0.99)
m        <- as.matrix(dtm.remove)

# Pairwise cosine distance between articles, expanded to a full matrix so a
# single article's row can be indexed directly.
dtm.dist <- proxy::dist(m,method = 'cosine')
dtm.mat  <- as.matrix(dtm.dist)

# Titles of the ten entries with the smallest distance in row 5.
applenews$title[order(dtm.mat[5,])[1:10]]

# Print the articles most similar to article `idx`.
#
# Reads two globals built earlier in the script: `dtm.mat`, the full pairwise
# cosine-distance matrix, and `applenews`, whose `title` column supplies the
# headlines.
#
# Args:
#   idx: row index of the query article in `applenews` / `dtm.mat`.
#   n:   maximum number of neighbours to consider (default 9, the number the
#        original fixed `[2:10]` slice looked at).
#
# Prints the query title, then one line per neighbour whose distance is below
# 1 (title, distance, row index). Returns NULL invisibly.
getSimiliarArticle <- function(idx, n = 9){
  print(paste('查詢文章:', applenews$title[idx]) )

  distances <- dtm.mat[idx,]

  # Remove the query article by index instead of assuming it sorts first:
  # a duplicate article with a lower index (tie at distance 0) or a tiny
  # negative rounding error would otherwise push it out of position 1.
  rank <- order(distances)
  rank <- rank[rank != idx]

  # Never index past the available articles (avoids NA indices on small data).
  rank <- rank[seq_len(min(n, length(rank)))]

  for (ele in rank){
    # Despite the name, this is a distance: smaller means more similar.
    similarity <- distances[ele]
    # Undefined distances (e.g. NaN for an all-zero term vector) are skipped
    # rather than crashing the `if`.
    if (!is.na(similarity) && similarity < 1){
      print(paste('相關新聞:',applenews$title[ele], similarity,ele))
    }
  }
}
# Example: list the articles related to the article in row 894.
getSimiliarArticle(894)

抓取富邦銀行所有問題集

# Scrape the question/answer pairs from the Fubon Bank FAQ feed into `qa`.
library(rvest)

faq <- read_html('https://www.fubon.com/banking/FAQ_Data//faq/index_data/faqData1.xml')

# Work item by item. html_node() returns exactly one result per <item>, with
# NA text when the child is absent, so `questions` and `descriptions` always
# have the same length and stay row-aligned. Collecting every match with
# html_nodes() would shift the pairing (or make data.frame() fail) as soon as
# one item lacks a title or a description.
items <- faq %>% html_nodes('item')

questions <- items %>% html_node('title') %>% html_text()

# The selector is written on one line; the original literal contained a
# stray line break inside the quotes.
descriptions <- items %>% html_node('description') %>% html_text()

qa <- data.frame(questions, descriptions, stringsAsFactors = FALSE)

head(qa, 30)


# Find the FAQ entries whose questions are closest to a free-text query.
# Expects `qa` (columns `questions`, `descriptions`) from the scraping step.
q <- '我如何申請無卡提款'
library(jiebaR)
mixseg <- worker()

# Segment the query together with every FAQ question; the query is document 1.
q.seg  <- lapply(  c(q, qa$questions), function(e) segment(e, jiebar = mixseg))

library(tm)
q.corpus <- Corpus(VectorSource(q.seg))
dtm      <- DocumentTermMatrix(q.corpus)

dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine' )
dtm.mat  <- as.matrix(dtm.dist)

# Row 1 holds the cosine distance from the query to every document
# (smaller means more similar, despite the variable name).
similarity <- dtm.mat[1,]
# Number of documents compared: the query plus all FAQ questions.
dim(dtm.mat)[1]

# Drop the query's own entry by position rather than assuming it sorts first,
# so the remaining values line up one-to-one with the rows of `qa`.
faq.dist <- unname(similarity[-1])
qa$similarity <- faq.dist

# Indices of the (up to) two closest FAQ rows; never index past the data.
rank <- order(faq.dist)[seq_len(min(2, length(faq.dist)))]
qa[rank,]

使用 R 計算距離

# Two binary vectors used to compare distance measures; they differ in two
# positions.
x <- c(0,0,1,1,1,1)
y <- c(1,0,1,1,0,1)
# Euclidean Distance
# Computed by hand, then with dist().
sqrt(sum((x - y) ^2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
##          x
## y 1.414214
# NOTE(review): this is the sum of absolute differences (the Manhattan value,
# repeated in the Manhattan part below), not a Euclidean computation.
sum(abs(x - y))
## [1] 2
# Minkowski distance with p = 2 reproduces the Euclidean result.
dist(rbind(x,y), method ="minkowski", p=2)
##          x
## y 1.414214
# Open the help page for dist.
?dist
## starting httpd help server ... done
# Manhattan
# Computed by hand, then with dist().
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
##   x
## y 2
# Minkowski distance with p = 1 reproduces the Manhattan result.
dist(rbind(x,y), method ="minkowski", p=1)
##   x
## y 2

文章分群

# Hierarchical clustering of the Apple Daily articles on cosine distance.
library(jiebaR)
library(tm)

# Pre-scraped articles from the course repository.
applenews <- read.csv(
  file = 'https://raw.githubusercontent.com/ywchiu/fubonr/master/data/applenews.csv',
  stringsAsFactors = FALSE
)
head(applenews)

# Word segmentation of every article body.
mixseg    <- worker()
apple.seg <- lapply(applenews$content, segment, jiebar = mixseg)

# Corpus -> number-free corpus -> document-term matrix.
s.corpus <- Corpus(VectorSource(apple.seg))
doc      <- tm_map(s.corpus, FUN = removeNumbers)
dtm      <- DocumentTermMatrix(doc)

# Keep only terms below the 0.99 sparsity threshold, then densify.
dtm.remove <- removeSparseTerms(dtm, sparse = 0.99)
m          <- as.matrix(dtm.remove)

# Pairwise cosine distances, both as a dist object and as a full matrix.
dtm.dist <- proxy::dist(x = m, method = 'cosine')
dtm.mat  <- as.matrix(dtm.dist)

# Cluster the articles and cut the tree into 30 groups.
dtm.cluster <- hclust(d = dtm.dist)
#plot(dtm.cluster, hang=-0.1)
fit <- cutree(tree = dtm.cluster, k = 30)

# Headlines that ended up in cluster 8.
applenews[['title']][fit == 8]

分群方法(二)

# Second clustering example: group news items by their descriptions.
library(readr)
library(jiebaR)
library(tm)

news <- read_csv(file = 'https://raw.githubusercontent.com/ywchiu/fubonr/master/data/news_clustering.csv')

#head(news)
# Segment each description into words.
mixseg   <- worker()
news.seg <- lapply(news$description, segment, jiebar = mixseg)

# Corpus -> number-free corpus -> document-term matrix.
news.corpus <- Corpus(VectorSource(news.seg))
doc         <- tm_map(news.corpus, FUN = removeNumbers)
dtm         <- DocumentTermMatrix(doc)

# Pairwise cosine distance between the news items.
dtm.dist <- proxy::dist(x = as.matrix(dtm), method = 'cosine')

# Hierarchical clustering, dendrogram, and a cut into 20 groups.
fit <- hclust(d = dtm.dist)
plot(fit, hang = -0.1)
news.cluster <- cutree(tree = fit, k = 20)
table(news.cluster)

# Headlines assigned to the first cluster.
news[['title']][news.cluster == 1]