Demo20170422

作業四

library(readr)
lvr_price <- read_csv("C:/Users/USER/lvr_prices_mac.csv")
View(lvr_price)


# problem 1
library(dplyr)
lvr_price$trading_ym <-  lvr_price$trading_ymd %>% format('%Y-%m-01')

lvr_price_by_month <- lvr_price %>% select(trading_ym, total_price, area) %>% filter(trading_ym >= '2010-01-01')   %>% group_by(trading_ym, area) %>% summarise(price_sum = sum(as.numeric(total_price) ))


par(mfrow=c(3,4))
for (a in unique(lvr_price_by_month$area)){
  #print(a)
  df <- lvr_price_by_month[lvr_price_by_month$area == a, ]
  plot(price_sum  ~ as.Date(trading_ym) , data =df, type = 'l', main = a)
}


## problem 2

#lvr_price2 <- 

lvr_price$area <- as.factor(lvr_price$area)
area_order <- lvr_price %>% group_by(area) %>% summarise(med_price = median(total_price, na.rm = TRUE)) %>% arrange(desc(med_price) ) %>% select(area)


boxplot(log(total_price) ~ factor(area,levels(lvr_price$area)[area_order$area]), data = lvr_price)


## problem 3
lvr_price_by_area <- lvr_price %>% select(area, total_price, building_sqmeter) %>% filter(building_sqmeter > 0) %>% group_by(area) %>% summarise(sum_total = mean(as.numeric(total_price/ building_sqmeter), na.rm = TRUE)) %>% arrange(desc(sum_total))

lvr_price_by_area$sum_total /0.3025
barplot(height = lvr_price_by_area$sum_total/0.3025, names.arg = lvr_price_by_area$area, col ="blue")

English Segmentation

s <- 'How is the weather today?'
strsplit(s, ' ')

## [[1]]
## [1] "How"     "is"      "the"     "weather" "today?"

s2 <- '今天天氣如何?'

Chinese Segmentation

library(NLP)

s <- strsplit(x="那我們酸民婉君也可以報名嗎", split ='')

# bigram
bigram <- ngrams(unlist(s), 2)
vapply(bigram, "paste", "", collapse="")

##  [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"

# trigram
trigram <- ngrams(unlist(s), 3)
vapply(trigram, "paste", "", collapse="")

##  [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
##  [8] "也可以" "可以報" "以報名" "報名嗎"

article <- "繼乳牛染結核病後，國內雞蛋首度被檢出含「世紀之毒」戴奧辛，含量高達5.2皮克／克脂肪，超出我國標準的2.5皮克／克脂肪，半衰期長達7年，政府應速查原因。有教授稱根據93年到105年的雞蛋檢驗調查，含有戴奧辛的平均量為1皮克／克脂肪，若是以1個60公斤的成人來看，終其一生每天吃5.2皮克／克脂肪，都符合世界衛生組織的標準，消費者無須過度擔心。
每每遇到食安問題政府總是說安心、安心、安心，然而身為家庭主婦對於這樣的說法怎能放心。沒錯每天吃5.2皮克／克脂肪戴奧辛沒事，但我整天只吃雞蛋嗎？還有其他的食物都沒問題嗎？若是蔬果也含達滅芬(Dimethomorph) 10ppm，「氟派瑞（Fluopyram）」殘留6ppm（均符合政府訂定的殘留標準），魚類含重金屬、抗生素、激素殘留，還有染結核病牛奶等等，未來還有可能吃到含10PPM瘦肉精美豬，這些全吃下肚後，請問政府食安風險超標了嗎？而對於有慢性病患者、懷孕的婦女或成長中的孩童，這樣還是安全的嗎？
日常生活環境遭受污染，加上食物中化學物質的多重殘留，與台灣罹癌人數逐年攀升究竟有沒有關係，卻看不到政府相關的說明，這些年周遭親友多人罹患癌症，醫療照護等更是一大負擔，許多人質疑跟國內空氣、飲用水等遭到汙染以及食物中殘留過多的農藥、抗生素、生長激素、重金屬等有關，只是目前尚未有直接的證據證明，這些物質與民眾罹癌有直接相關性，民眾除了自求多福外，似乎也無計可施，相當無奈，總不能為了喝牛奶就要養一頭牛吧，樣樣都要靠自己來生產的話，那要政府做甚麼呢？
目前國內農民在農藥、抗生素、激素等使用上多採取「雞尾酒療法」亦即多種藥物混合給予的情況相當普遍，對人類健康是否造成影響？農民也不清楚，讓家庭主婦上市場買菜時提心吊膽，深怕影響到家人健康。
政府總是制定所謂合法的添加物種類及安全的殘留量標準，但身為百姓只想問一句：難道不施化肥、不添加化學物質的食物不安全嗎？這樣的心情是三餐有御廚代為張羅的蔡總統以及很少上街買菜的高官們會在乎嗎？"

s <- strsplit(x=article, split ='')
bigram <- ngrams(unlist(s), 3)
bigram.str <- vapply(bigram, "paste", "", collapse="")
tb <- table(bigram.str)
tb[tb >= 3]

## bigram.str
## ，這些 、抗生   .2皮 ／克脂  2皮克    5.2 生素、 皮克／ 克／克 克脂肪 
##      3      3      3      5      3      3      3      5      5      5 
## 抗生素 脂肪， 戴奧辛 
##      3      4      3

a.split <- strsplit(article, '、|，|\n|「|」|（|？|）|：|。|／|\\(|\\)')
w.split <- strsplit(unlist(a.split), '')


bigram <- function(w){
  bigram <-ngrams(unlist(w), 2)
  bigram.str <- vapply(bigram, paste, "", collapse = "")
  bigram.str
}



bigram.all <- sapply(w.split, bigram)

tb <- table(unlist(bigram.all))
tb[tb > 2]

## 
##   .2   10  2皮   5. 生素 皮克 安心 安全 克脂 抗生 物質 政府 食物 脂肪 國內 
##    3    3    3    3    3    5    3    3    5    3    3    7    4    5    3 
## 這些 這樣 殘留 奧辛 標準 激素 戴奧 還有 雞蛋 
##    3    3    6    3    4    3    3    3    3

Long term first algo

s = "當初中央政府拿台北市的精華地跟北市府交換" 
s.split = strsplit(s, '台北市')
s.split

## [[1]]
## [1] "當初中央政府拿"       "的精華地跟北市府交換"

paste(unlist(s.split), collapse = "", sep="")

## [1] "當初中央政府拿的精華地跟北市府交換"

removekey <- function(s, keys){
  for (key in keys){
    s.split = strsplit(s, key)
    s <- paste(unlist(s.split), collapse = "", sep="")
  }
  s
}


removekey('當初中央政府拿台北市的精華地跟北市府交換', c('中央', '台北市', '北市府'))

## [1] "當初政府拿的精華地跟交換"

ngram.func <- function(w, n){
  n.gram <-ngrams(unlist(w), n)
  n.gram.str <- vapply(n.gram, paste, "", collapse = "")
  n.gram.str
}

a <- strsplit('當初中央政府拿台北市的精華地跟北市府交換', '')

ngram.func(a,4)

##  [1] "當初中央" "初中央政" "中央政府" "央政府拿" "政府拿台" "府拿台北"
##  [7] "拿台北市" "台北市的" "北市的精" "市的精華" "的精華地" "精華地跟"
## [13] "華地跟北" "地跟北市" "跟北市府" "北市府交" "市府交換"

long term first implementation

keywords <- c()
longTermFirst <- function(article, keywords){

for (i in seq(4,2,-1)){
  article <- removekey(article, keywords)
  a.split <- strsplit(article, "、|，|\n|「|」|（|？|）|：|。|／|\\(|\\)")
    w.split <- strsplit(x=unlist(a.split), split ='')
    n.gram.all <- sapply(w.split, function(e) ngram.func(e,i))
    
    tb <- table(unlist(n.gram.all))
    candidate <- names(tb[tb>=2])
    keywords = c(keywords, candidate)
  }
  keywords
}

keywords <- c()
longTermFirst(article, keywords)

##  [1] ".2皮克"   "5.2皮"    "化學物質" "天吃5."   "吃5.2"    "每天吃5" 
##  [7] "政府總是" "染結核病" "家庭主婦" "學物質的" "ppm"      "安全的"  
## [13] "有直接"   "克脂肪"   "抗生素"   "每克脂"   "的食物"   "的殘留"  
## [19] "重金屬"   "食物中"   "這樣的"   "戴奧辛"   "10"       "及殘"    
## [25] "牛奶"     "以及"     "民眾"     "皮克"     "目前"     "多人"    
## [31] "安心"     "有關"     "身為"     "政府"     "相當"     "相關"    
## [37] "若是"     "食安"     "健康"     "問題"     "國內"     "添加"    
## [43] "符合"     "這些"     "殘留"     "買菜"     "農民"     "農藥"    
## [49] "對於"     "影響"     "標準"     "激素"     "罹癌"     "還有"    
## [55] "雞蛋"

Use JiebaR

library(jiebaR)

## Loading required package: jiebaRD

s <- "那我們酸民婉君也可以報名嗎"
mixseg <- worker()
segment(code = s, jiebar = mixseg)

## [1] "那"   "我們" "酸民" "婉君" "也"   "可以" "報名" "嗎"

edit_dict()

## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows

USERPATH

## [1] "C:/Program Files/R/R-3.3.3/library/jiebaRD/dict/user.dict.utf8"

s <- "那我們酸民婉君也可以報名嗎"
tagseg <- worker('tag')
segment(code = s, jiebar = tagseg)

##      r      r      n      x      d      c      v      y 
##   "那" "我們" "酸民" "婉君"   "也" "可以" "報名"   "嗎"

TFIDF Calculation

a   <- c("a")
abb <- c("a", "b", "b")
abc <- c("a", "b", "c")
D <- list(a, abb, abc)

# tfidf('a', a, D)
tf1  <- table(a)[names(table(a)) == 'a'] / sum(table(a))
idf1 <- log(3 / sum(sapply(D, function(e) 'a' %in% e))) 
tf1 * idf1

## a 
## 0

# tfidf('a', abb, D)
tfidf2 <- 0


# tfidf('b', abb, D)
tf3 <- table(abb)[names(table(abb)) == 'b'] / sum(table(abb))
idf3 <- log(3 / sum(sapply(D, function(e) 'b' %in% e))) 
tf3 * idf3

##         b 
## 0.2703101

# tfidf('b', abc, D)
tf4 <- table(abc)[names(table(abc)) == 'b'] / sum(table(abc))
idf4 <- log(3 / sum(sapply(D, function(e) 'b' %in% e))) 
tf4 * idf4

##        b 
## 0.135155

# tfidf('c', abc, D)
tf5 <- table(abc)[names(table(abc)) == 'c'] / sum(table(abc))
idf5 <- log(3 / sum(sapply(D, function(e) 'c' %in% e))) 
tf5 * idf5

##         c 
## 0.3662041

tfidf <- function(t,d, D){
 tf <- table(d)[names(table(d)) == t]/ sum(table(d))
 idf <- log(length(D) /sum(sapply(D, function(e) t %in% e)))
 tf*idf
}
tfidf('c', abc, D)

##         c 
## 0.3662041

計算詞頻

library(jiebaR)
mixseg <- worker()

seg <- segment(code = article, jiebar = mixseg)
library(wordcloud2)
#wordcloud2(table(seg))
tb  <- table(seg)
tb[tb >= 3]

## seg
##    5.2     也     不     中   皮克     吃   安心   安全     年     有 
##      3      3      3      3      5      4      3      3      4      5 
##     克     含 抗生素     到     的   政府     是     為   食物   脂肪 
##      5      3      3      3     21      7      3      3      4      5 
##   國內   這些   這樣     都   殘留     等     嗎   標準 戴奧辛   還有 
##      3      3      3      3      5      4      4      4      3      3 
##   雞蛋 
##      3

tb[nchar(names(tb)) >= 2]

## seg
##           10          105          2.5          5.2           60 
##            2            1            1            3            1 
##           93 Dimethomorph    Fluopyram          ppm          PPM 
##            1            1            1            2            1 
##         一大         一句         一頭         了嗎         人數 
##            1            1            1            1            1 
##         人類         三餐         上街         下肚         不施 
##            1            1            1            1            1 
##         不能         之毒         公斤         化肥     化學物質 
##            1            1            1            1            2 
##         心情     日常生活         牛奶 世界衛生組織         世紀 
##            1            1            1            1            1 
##         代為         以及         加上       半衰期         只是 
##            1            2            1            1            1 
##       只想問         可能         台灣         市場         平均 
##            1            1            1            1            1 
##         未來         民眾     生長激素         生產         皮克 
##            1            1            1            1            5 
##         目前         全吃         合法         在乎         多人 
##            2            1            1            1            1 
##         多重         多種         安心         安全         成人 
##            1            1            3            3            1 
##         成長       有沒有         有關         汙染         污染 
##            1            1            1            1            1 
##         百姓         自己     自求多福         似乎         但身 
##            1            1            1            1            1 
##         含有         含量         含達         我國       抗生素 
##            1            1            1            1            3 
##         更是         每天         每每         沒事         沒錯 
##            1            2            1            1            1 
##         究竟         身為         那要         乳牛         使用 
##            1            1            1            1            1 
##         來看         其他         制定         周遭         尚未 
##            1            1            1            1            1 
##         所謂         放心         物質         的嗎         的話 
##            1            1            1            1            1 
##         直接         空氣         長達         孩童         很少 
##            2            1            1            1            1 
##         怎能         政府         是否         甚麼         相當 
##            1            7            1            1            2 
##         相關       相關性       看不到         美豬         訂定 
##            1            1            1            1            1 
##         負擔       重金屬         風險         食安         食物 
##            1            2            1            2            4 
##         首度         原因         家人     家庭主婦         根據 
##            1            1            1            2            1 
##       消費者         脂肪         除了         高官         高達 
##            1            5            1            1            1 
##         健康         問題         國內         婦女         張羅 
##            2            2            3            1            1 
##         御廚         患者         情況         採取         教授 
##            1            1            1            1            1 
##         深怕         混合         添加       添加物         清楚 
##            1            1            1            1            1 
##         符合     終其一生         許多         逐年         這些 
##            2            1            1            1            3 
##         這樣         速查         造成         魚類       喝牛奶 
##            3            1            1            1            1 
##         就要     提心吊膽         普遍         殘留       殘留量 
##            1            1            1            5            1 
##         無奈     無計可施         無須         然而         等等 
##            1            1            1            1            1 
##       結核病         給予         買菜         超出         超標 
##            2            1            2            1            1 
##       飲用水         滅芬         照護         農民         農藥 
##            1            1            1            2            2 
##         遇到         過多         過度         對於       慢性病 
##            1            1            1            2            1 
##         種類       與民眾         說明         說法         影響 
##            1            1            1            1            2 
##         標準         樣樣       瘦肉精       蔡總統         蔬果 
##            4            1            1            1            1 
##         調查         請問         質疑         遭到         遭受 
##            1            1            1            1            1 
##         擔心         整天         激素         罹患         罹癌 
##            1            1            2            1            2 
##         親友       戴奧辛         檢出         檢驗         環境 
##            1            3            1            1            1 
##         療法         癌症         總是         還有         還是 
##            1            1            2            3            1 
##         醫療       雞尾酒         雞蛋         懷孕         攀升 
##            1            1            3            1            1 
##         藥物         證明         證據         關係         難道 
##            1            1            1            1            1

res <- tb[(nchar(names(tb)) >= 2) & (tb >= 2)]
wordcloud2(res,size=0.5)

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')


word <- unlist(sapply(applenews$content,  function(e) segment(code= as.character(e) , jiebar = mixseg)))

tb <- table(word)
res <- tb[(nchar(names(tb)) >= 2) & (tb >= 200)]
wordcloud2(res)

建立詞頻矩陣

library(tm)
e3      <-  'Hello, I am David. I have taken over 100 courses ~~~' 
e3.vec    <- unlist(strsplit(e3, ' '))
e3.corpus <- Corpus(VectorSource(list(e3.vec)))
e3.dtm    <- DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)

## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 courses david have hello over taken
##    1   1       1     1    1     1    1     1

dtm <- DocumentTermMatrix(e3.corpus, control=list(wordLengths=c(1, 20)))
inspect(dtm)

## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 am c courses david have hello i over taken
##    1   1  1 1       1     1    1     1 2    1     1

#install.packages('SnowballC')
library(SnowballC)
doc <- tm_map(e3.corpus,  removeNumbers)
doc <- tm_map(doc, stemDocument)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)

## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs courses david have hello over taken
##    1       1     1    1     1    1     1

e4        <- 'image imaging imagination' 
stemDocument(e4)

## [1] "imag imag imagin"

removetilde <- content_transformer(function(x, pattern) {return (gsub("~", "", x))})

doc <- tm_map(e3.corpus, removetilde)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)

## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 courses david have hello over taken
##    1   1       1     1    1     1    1     1

e1 <- 'this is my book'
e2 <- 'this is my car'

e.vec <- strsplit(c(e1,e2), ' ')
e.corpus <- Corpus(VectorSource(e.vec))
dtm <- DocumentTermMatrix(e.corpus)
inspect(dtm)

## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity           : 33%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs book car this
##    1    1   0    1
##    2    0   1    1

產生中文詞頻矩陣

library(jiebaR)
edit_dict()

## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows

mixseg <- worker()

s  <-  "大巨蛋案對市府同仁下封口令？柯P否認"
s1 <-  "柯P市府近來飽受大巨蛋爭議"

s.vec  <- segment(code = s, jiebar = mixseg)
s1.vec <- segment(code = s1, jiebar = mixseg)
s1.vec

## [1] "柯P"    "市府"   "近來"   "飽受"   "大巨蛋" "爭議"

s.corpus <- Corpus(VectorSource(list(s.vec, s1.vec)))
s.dtm <- DocumentTermMatrix(s.corpus,control=list( removeSparseTerms=FALSE))
inspect(s.dtm)

## <<DocumentTermMatrix (documents: 2, terms: 8)>>
## Non-/sparse entries: 10/6
## Sparsity           : 38%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 大巨蛋 市府 同仁 爭議 近來 封口令 案對 飽受
##    1      1    1    1    0    0      1    1    0
##    2      1    1    0    1    1      0    0    1

詞頻矩陣的應用

library(jiebaR)
mixseg  <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e,mixseg))
apple.corpus <- Corpus(VectorSource(apple.seg))
#doc <- tm_map(apple.corpus, removeNumbers)
#doc <- tm_map(apple.corpus, removePunctuation)
apple.dtm <- DocumentTermMatrix(apple.corpus, control = list(removeNumbers=TRUE, removePunctuation=TRUE ))
#apple.dtm$dimnames$Terms

findFreqTerms(apple.dtm, 200, )

##  [1] "已經"   "更新"   "男子"   "表示"   "要求"   "報導"   "提供"  
##  [8] "發\xb2" "影片"   "調查"   "翻攝"   "警方"   "大陸"   "中心"  
## [15] "中國"   "他們"   "去\xa6" "自己"   "國際"   "新聞"   "綜合"  
## [22] "網友"   "蘋果"   "相關"   "問題"   "人員"   "今天"   "內容"  
## [29] "沒有"   "美國"   "進行"   "台灣"   "其中"   "知道"   "照片"  
## [36] "今\xa6" "公司"   "台\xa5" "未來"   "指出"   "政府"   "時間"  
## [43] "第\xa4" "這些"   "媒體"   "影響"   "民眾"   "國家"   "就是"  
## [50] "可能"   "希望"   "日本"   "可以"   "最後"   "發生"   "工\xa7"
## [57] "目前"   "如果"   "是\xa7" "經濟"   "kobe"   "不是"   "不過"  
## [64] "這個"   "這樣"   "不\xb7" "肯亞"   "市府"   "還是"   "詐騙"  
## [71] "萬元"   "我們"   "大巨蛋"

findAssocs(apple.dtm, "詐騙", 0.5)

## $詐騙
##   電信   骨幹   騙走   集團 受\xae   抓獲   台方   騙來   犯罪 嫌疑人 
##   0.74   0.69   0.68   0.64   0.64   0.62   0.60   0.60   0.56   0.56 
##   窩點   臺灣   輕判   分贓   騙光   嫌犯 詐騙犯   跨國   擄台   受騙 
##   0.56   0.54   0.53   0.53   0.53   0.52   0.52   0.51   0.51   0.51 
##   留點 新華社 
##   0.51   0.50

dim(apple.dtm)

## [1]  1500 38350

dtm.remove = removeSparseTerms(apple.dtm, 0.99)
dim(dtm.remove)

## [1] 1500 1729

計算cosine similarity

library(proxy)
a  <- c(1,2,2,1,1,1,0)
b  <- c(1,2,2,1,1,2,1)

d <- proxy::dist(rbind(a,b), method = "cosine")
as.matrix(d)
dtm.remove <- removeSparseTerms(apple.dtm, 0.99)
dtm.remove

dtm.dist <-  proxy::dist(as.matrix(dtm.remove), method = "cosine")
dtm.mat <- as.matrix(dtm.dist)
dim(dtm.mat)

applenews$title[9]
applenews$title[order(dtm.mat[9,])[2:11]]

article.query = function(idx){
  applenews$title[as.integer(names(sort(dtm.mat[idx, which(dtm.mat[idx,] < 0.8)])))]
}
article.query(9)[1:10]

Demo20170422

David Chiu

2017年4月22日

作業四

English Segmentation

Chinese Segmentation

Long term first algo

long term first implementation

Use JiebaR

TFIDF Calculation

計算詞頻

建立詞頻矩陣

產生中文詞頻矩陣

詞頻矩陣的應用

計算cosine similarity