Homework 4
library(readr)
lvr_data <- read_csv("C:/Users/Administrator/Downloads/lvr_prices_mac.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_integer(),
## land_sqmeter = col_double(),
## trading_ymd = col_date(format = ""),
## finish_ymd = col_date(format = ""),
## building_sqmeter = col_double(),
## room = col_integer(),
## living_room = col_integer(),
## bath = col_integer(),
## total_price = col_integer(),
## price_per_sqmeter = col_double(),
## parking_sqmeter = col_double(),
## parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row   col         expected   actual
## 1282  total_price an integer 6700000000
## 2243  total_price an integer 3882685600
## 2244  total_price an integer 3373314400
## 4629  total_price an integer 3050000000
## 5890  total_price an integer 3133800000
## # ... with 1 more variable: file <chr>
## See problems(...) for more details.
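The 32 failures are total_price values (e.g. 6700000000) that overflow a 32-bit integer. A minimal fix, assuming the same file path, is to read that column as a double:
# total_price overflows col_integer(); reading it as double avoids the parsing failures
lvr_data <- read_csv("C:/Users/Administrator/Downloads/lvr_prices_mac.csv",
                     col_types = cols(total_price = col_double()))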
#View(lvr_data)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#format(lvr_data$trading_ymd, '%Y-%m-01')
# Answer1
lvr_stat <- lvr_data %>%
  mutate(trading_ym = as.Date(format(trading_ymd, '%Y-%m-01'))) %>%
  filter(trading_ym > '2012-01-01') %>%
  group_by(trading_ym, area) %>%
  summarize(mean_total_price = mean(total_price, na.rm = TRUE))
## Warning: package 'bindrcpp' was built under R version 3.4.2
lvr_stat$area <- as.factor(lvr_stat$area)
par(mfrow = c(3,4))
for (a in levels(lvr_stat$area)){
  area_stat <- lvr_stat %>% filter(area == a)
  plot(mean_total_price ~ trading_ym, data = area_stat, type = 'l', main = a)
  #print(a)
}
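The same per-area trends can also be drawn with ggplot2 facets instead of a manual plotting loop; a sketch, assuming ggplot2 is installed:
library(ggplot2)
ggplot(lvr_stat, aes(x = trading_ym, y = mean_total_price)) +
  geom_line() +
  facet_wrap(~ area)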

# Answer2
par(mfrow = c(1,1))
boxplot(log(total_price) ~ area, data = lvr_data, main = '總價箱形圖', xlab = '區域', ylab = '總價 (log)', cex.axis = 0.6)
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 3 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 6 is not drawn
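The -Inf outliers come from log(0), i.e. rows whose total_price is zero. Filtering those rows first silences the warnings; a sketch, assuming zero prices are data errors:
lvr_pos <- subset(lvr_data, !is.na(total_price) & total_price > 0)
boxplot(log(total_price) ~ area, data = lvr_pos, main = '總價箱形圖',
        xlab = '區域', ylab = '總價 (log)', cex.axis = 0.6)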

# Answer3
lvr_stat <- lvr_data %>%
  select(area, total_price) %>%
  group_by(area) %>%
  summarise(total_sum = sum(as.numeric(total_price), na.rm = TRUE)) %>%
  arrange(desc(total_sum))
barplot(lvr_stat$total_sum, names.arg = lvr_stat$area, main = 'House Price Sum', xlab = 'area', ylab = 'total price', col="blue")

lvr_stat2 <- lvr_data %>%
  select(area, total_price) %>%
  group_by(area) %>%
  summarise(total_mean = mean(as.numeric(total_price), na.rm = TRUE)) %>%
  arrange(desc(total_mean))
barplot(lvr_stat2$total_mean, names.arg = lvr_stat2$area, main = 'House Price Mean', xlab = 'area', ylab = 'total price', col="blue", cex.axis = 0.6, cex.names = 0.6)

Time Conversion
a <- '2012-01-01 10:20:30'
as.Date(a)
## [1] "2012-01-01"
as.POSIXct(a)
## [1] "2012-01-01 10:20:30 CST"
as.POSIXlt(a)
## [1] "2012-01-01 10:20:30 CST"
format(as.Date(a), '%H')
## [1] "00"
format(as.POSIXct(a), '%H')
## [1] "10"
format(as.POSIXlt(a), '%H')
## [1] "10"
unclass(as.POSIXct(a))
## [1] 1325384430
## attr(,"tzone")
## [1] ""
unclass(as.POSIXlt(a))
## $sec
## [1] 30
##
## $min
## [1] 20
##
## $hour
## [1] 10
##
## $mday
## [1] 1
##
## $mon
## [1] 0
##
## $year
## [1] 112
##
## $wday
## [1] 0
##
## $yday
## [1] 0
##
## $isdst
## [1] 0
##
## $zone
## [1] "CST"
##
## $gmtoff
## [1] NA
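The POSIXlt components follow C's struct tm conventions: $year counts from 1900 (hence 112 for 2012) and $mon from 0, so direct field access needs an offset:
lt <- as.POSIXlt(a)
lt$year + 1900  # 2012
lt$mon + 1      # 1 = January
lt$hour         # 10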
English Tokenization
s <- 'this is a book'
strsplit(s,' ')
## [[1]]
## [1] "this" "is" "a" "book"
s<- '那酸民婉君也可以報名嗎'
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.2
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.4.2
mixseg <- worker()
segment(s, jiebar = mixseg)
## [1] "那酸民婉君" "也" "可以" "報名" "嗎"
s <- '已經33歲的林丹,在今年羽球世錦賽敗給年僅22歲的現任球王、丹麥好手維克(Viktor),僅獲得亞軍,不過他還不輕言退休,林丹強調,明年還想再拿一個世界冠軍。'
strsplit(s, ',|、|。')
## [[1]]
## [1] "已經33歲的林丹"
## [2] "在今年羽球世錦賽敗給年僅22歲的現任球王"
## [3] "丹麥好手維克(Viktor)"
## [4] "僅獲得亞軍"
## [5] "不過他還不輕言退休"
## [6] "林丹強調"
## [7] "明年還想再拿一個世界冠軍"
s<- '那酸民婉君也可以報名嗎'
library(jiebaR)
mixseg <- worker()
segment(s, jiebar = mixseg)
## [1] "那酸民婉君" "也" "可以" "報名" "嗎"
#edit_dict()
tagseg <- worker('tag')
segment(s, jiebar = tagseg)
## x d c v y
## "那酸民婉君" "也" "可以" "報名" "嗎"
Dictionary Expansion
library(rvest)
article <- read_html('https://zh.wikipedia.org/wiki/%E9%AA%A8%E7%97%9B%E7%86%B1%E7%97%87') %>%
  html_nodes('p') %>%
  .[1] %>%
  html_nodes('b') %>%
  html_text()
article
keywords <- read_html('http://sports.ltn.com.tw/news/breakingnews/2222132') %>%
  html_nodes('.keyword a') %>%
  html_text()
keywords
?file
#edit_dict()
dict_path <- USERPATH
keywords <- read_html('http://sports.ltn.com.tw/news/breakingnews/2222132') %>%
  html_nodes('.keyword a') %>%
  html_text()
fileConn <- file(dict_path, 'a')
writeLines(keywords, fileConn)
flush(fileConn)
close(fileConn)
#edit_dict()
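Appending words to the dictionary file has no effect on an existing worker, since the dictionary is loaded at construction time; a new worker has to be created, e.g.:
mixseg <- worker(user = dict_path)  # reload with the expanded user dictionary
segment('那酸民婉君也可以報名嗎', jiebar = mixseg)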
Expanding the Dictionary with a Crawler
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
getKeyword <- function(url){
  keywords <- read_html(url) %>%
    html_nodes('.keyword a') %>%
    html_text()
  return(keywords)
}
newsurl <- 'http://news.ltn.com.tw/list/breakingnews'
alinks <- read_html(newsurl) %>%
  html_nodes('.tit') %>%
  html_attr('href')
# method1
keywords <- c()
for (link in alinks){
  #print(link)
  keywords <- c(keywords, getKeyword(link))
}
keywords
## [1] "北韓" "南韓" "彈道飛彈" "美國"
## [5] "專款專用" "營業稅" "生理用品" "黃牛"
## [9] "黃牛票" "拳擊" "電競" "劃紅線"
## [13] "巷道狹窄" "消防巷道" "消防車" "禁止臨停"
## [17] "交友網站" "詐欺罪" "詐騙" "DCARD"
## [21] "保險套" "女友" "恐怖情人" "戳破"
## [25] "后里" "舊山線鐵道" "鐵道文化" "鐵馬自由行"
## [29] "下週油價" "台塑油價" "汽柴油價格" "油價"
## [33] "拔河比賽" "東亞青年運動會" "運動會" "國民黨"
## [37] "按鈴申告" "鍾小平" "黃呂錦茹" "全運會"
## [41] "桌球" "共伴效應" "卡努颱風" "水庫洩洪"
## [45] "翡翠水庫" "豪雨" "2018台北市長" "2018縣市長選舉"
## [49] "柯文哲"
# method2
keywords_list <- lapply(alinks, getKeyword)
keywords2 <- unlist(keywords_list)
keywords2
## [1] "北韓" "南韓" "彈道飛彈" "美國"
## [5] "專款專用" "營業稅" "生理用品" "黃牛"
## [9] "黃牛票" "拳擊" "電競" "劃紅線"
## [13] "巷道狹窄" "消防巷道" "消防車" "禁止臨停"
## [17] "交友網站" "詐欺罪" "詐騙" "DCARD"
## [21] "保險套" "女友" "恐怖情人" "戳破"
## [25] "后里" "舊山線鐵道" "鐵道文化" "鐵馬自由行"
## [29] "下週油價" "台塑油價" "汽柴油價格" "油價"
## [33] "拔河比賽" "東亞青年運動會" "運動會" "國民黨"
## [37] "按鈴申告" "鍾小平" "黃呂錦茹" "全運會"
## [41] "桌球" "共伴效應" "卡努颱風" "水庫洩洪"
## [45] "翡翠水庫" "豪雨" "2018台北市長" "2018縣市長選舉"
## [49] "柯文哲"
dict_path <- USERPATH
fileConn <- file(dict_path, 'a')
writeLines(keywords2, fileConn)
flush(fileConn)
close(fileConn)
#edit_dict()
N-gram
s <-strsplit(x="那我們酸民婉君也可以報名嗎", split='')
unlist(s)
## [1] "那" "我" "們" "酸" "民" "婉" "君" "也" "可" "以" "報" "名" "嗎"
library(NLP)
bigram <- ngrams(unlist(s),2)
# method 1
bigram2 <- lapply(bigram, function(e) paste0(e, collapse = ''))
unlist(bigram2)
## [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
# method 2
vapply(bigram, paste, '', collapse = '')
## [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
trigram <- ngrams(unlist(s),3)
vapply(trigram, paste, '', collapse = '')
## [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
## [8] "也可以" "可以報" "以報名" "報名嗎"
article <- '因受卡努颱風及東北季風共伴效應影響,台東地區部份地區已成災,台東縣長黃健庭除指示相關單位積極注意河川水位及土石流警戒,並因應水災災害防救需要,縣府今(14)日公告劃定向陽山等區域及金峰鄉嘉蘭村等五處莫拉克與莫蘭蒂颱風災後特定區域為管制區,並自14日中午12時起生效,限制或禁止民眾進入,違者將依法舉發,處罰5萬元以上、25萬元以下罰鍰,敬請民眾注意。
臺東縣1014水災台東縣災害應變中心已於今日12時0分成立二級開設,各編組單位也立即啟動緊急應變小組,展開各項災害應變作業,黃縣長特別指示有關單位注意河川水位及土石流警戒,以便及早透過大眾媒體方式,通知民眾做好因應措施,減少損失。
同時縣府也緊急依災害防救法第31條第2款前段規定,公告劃定「向陽山區及大武山區、各河川溪流水域(含土石流潛勢溪流)、長濱至大武暨蘭嶼、綠島沿線之海岸、漁港等區域範圍」,以及「金峰鄉嘉蘭村莫拉克颱風災後特定區域範圍、大武鄉大竹村富山部落莫拉克颱風災後特定區域範圍、大武鄉大竹村本部落莫拉克颱風災後特定區域等範圍」、「大武鄉大竹村愛國蒲部落1、2、3、4鄰及延平鄉紅葉村1、2、3鄰莫蘭蒂颱風災後特定區域範圍」為限制或禁止人民進入或命其離去之範圍。
為確保民眾生命財產之安全,非持有通行證或應緊急避難之需要者,不得進入從事戲水、觀潮、垂釣、登山、健行、溯溪、出海捕魚等活動,如違反規定將依法進行舉發,處5萬元以上、25萬元以下罰鍰。
同時縣府也強制要求公告縣內各港口(泊、區)、岸際(高潮線起向臨陸側100公尺內,住宅區及海堤線內除外)列入管制區,禁止船(筏)出海作業(除救難船外),並由海巡單位執行管制措施,各山區、各河川溪流水域(含水石流潛勢溪流),應禁止人民從事各項危險活動。(突發中心黃瑞娟/台東報導)'
w <- strsplit(article, '')
# bigram
bigram <- ngrams(unlist(w),2)
bigram.str <- vapply(bigram, paste, '', collapse = '')
tb <- table(bigram.str)
tb[tb > 5]
## bigram.str
## 區域 颱風 範圍
## 7 6 6
# trigram
trigram <- ngrams(unlist(w),3)
trigram.str <- vapply(trigram, paste, '', collapse = '')
tb <- table(trigram.str)
tb[tb > 3]
## trigram.str
## 5萬元 災後特 定區域 後特定 風災後 特定區 區域範 域範圍 莫拉克 萬元以
## 4 5 5 5 5 5 4 4 4 4
## 颱風災
## 5
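Most of these trigrams are overlapping fragments of a single longer phrase (颱風災後特定區域), which motivates the longest-term-first method below: extract the longest frequent strings first, remove them, then count shorter ones.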
a.split <- strsplit(article, '、|,|。|\n|(|」|「|)|/')
w.split <- strsplit(x=unlist(a.split), split='')
#w.split
bigram <- function(w){
  bigram <- ngrams(unlist(w), 2)
  bigram.str <- vapply(bigram, paste, "", collapse = "")
  return(bigram.str)
}
bigram.all <- sapply(w.split, bigram)
#bigram.all
tb <- table(unlist(bigram.all))
tb[tb >= 3]
##
## 14 5萬 土石 大竹 大武 山區 元以 公告 台東 民眾 石流 竹村 克颱 災後 災害
## 3 4 3 3 5 3 4 3 4 4 4 3 3 5 4
## 定區 拉克 東縣 武鄉 河川 注意 後特 風災 特定 區域 域範 莫拉 部落 單位 進入
## 5 4 3 3 4 3 5 5 5 7 4 4 3 4 3
## 鄉大 溪流 禁止 萬元 管制 緊急 颱風 範圍 縣府 應變
## 3 4 4 4 3 3 6 6 3 3
Longest-Term-First Method
s <- "當初中央政府拿台北市的精華地跟北市府交換"
s.split <- strsplit(s,'台北市')
paste(unlist(s.split), sep='', collapse = '')
## [1] "當初中央政府拿的精華地跟北市府交換"
removeKey <- function(s, keywords){
  for(keyword in keywords){
    s.split <- strsplit(s, keyword)
    s <- paste(unlist(s.split), sep = '', collapse = '')
  }
  return(s)
}
removeKey(s, c('台北市', '精華地'))
## [1] "當初中央政府拿的跟北市府交換"
ngram.func <- function(w, n){
  n.gram <- ngrams(unlist(w), n)
  n.gram.str <- vapply(n.gram, paste, "", collapse = "")
  return(n.gram.str)
}
longTermFirst <- function(article, keywords, threshold){
  for(i in seq(8, 2, -1)){
    article <- removeKey(article, keywords)
    a.split <- strsplit(article, "、|,|。")
    w.split <- strsplit(x = unlist(a.split), split = '')
    n.gram.all <- sapply(w.split, function(e) ngram.func(e, i))
    tb <- table(unlist(n.gram.all))
    candidates <- names(tb[tb > threshold])
    keywords <- c(keywords, candidates)
  }
  keywords
}
longTermFirst(article, c(), 3)
## [1] "颱風災後特定區域" "5萬元以" "莫拉克"
## [4] "大武" "台東" "民眾"
## [7] "石流" "災害" "河川"
## [10] "單位" "溪流" "禁止"
## [13] "範圍"
sapply
a <- c(2,3,4,5)
sapply(a, function(e) e + 5)
## [1] 7 8 9 10
sapply(a, function(e) e ^ 2)
## [1] 4 9 16 25
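vapply does the same job with a declared return type, failing fast if the function yields anything else:
vapply(a, function(e) e + 5, numeric(1))
## [1] 7 8 9 10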
WordCloud2
library(jiebaR)
mixseg <- worker()
seg <- segment(article, mixseg)
tb <- table(seg)
# By number of occurence
tb <- tb[tb>2]
# By Chinese Words
tb <- tb[grep('^[\u4e00-\u9fa5]+$',names(tb))]
# By Term Length
tb <- tb[nchar(names(tb)) >= 2]
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.2
wordcloud2(tb,shape = 'star')
#edit_dict()
WordCloud on Applenews
#download.file('https://raw.githubusercontent.com/ywchiu/pytextmining/master/20171003_applenews.xlsx', '20171003_applenews.xlsx')
library(readxl)
applenews <- read_excel("C:/Users/Administrator/Downloads/20171003_applenews.xlsx")
#View(applenews)
mixseg <- worker()
seg <- sapply(applenews$content, function(news) segment(news, mixseg))
word <- unlist(seg)
tb <- table(word)
# By number of occurence
tb <- tb[tb>100]
# By Chinese Words
tb <- tb[grep('^[a-zA-Z\u4e00-\u9fa5]+$',names(tb))]
# By Term Length
tb <- tb[nchar(names(tb)) >= 2]
library(wordcloud2)
wordcloud2(tb,shape = 'star')
TF-IDF
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a,abb,abc)
#tfidf('a', a)
tf <- 1 / 1
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abb)
tf <- 1 / 3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('b', abb)
tf <- 2 / 3
idf <- log(3/2)
tf * idf
## [1] 0.2703101
#tfidf('a', abc)
tf <- 1 / 3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('b', abc)
tf <- 1 / 3
idf <- log(3/2)
tf * idf
## [1] 0.135155
#tfidf('c', abc)
tf <- 1 / 3
idf <- log(3/1)
tf * idf
## [1] 0.3662041
#table(abb) /
#length(abb)
# sum(abb == 'a')
length(D)
## [1] 3
tfidf <- function(t, d, D){
  tf <- sum(d == t) / length(d)
  idf <- log(length(D) / sum(sapply(D, function(e) t %in% e)))
  return(tf * idf)
}
tfidf('c', abc, D)
## [1] 0.3662041
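Applying the function over every document and each distinct term gives the full weight matrix; a short sketch:
terms <- unique(unlist(D))
# rows = terms, columns = documents
sapply(D, function(d) sapply(terms, function(t) tfidf(t, d, D)))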
Bag of Words
s <- "大巨蛋案對市府同仁下封口令?柯P否認"
mixseg <- worker()
segment(s, mixseg)
## [1] "大巨蛋" "案對" "市府" "同仁" "下" "封口令" "柯P" "否認"
#edit_dict()
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
e3 <- 'Hello, I am David. I have taken over 100 courses ~~~'
e3.vec <- strsplit(e3, ' ')[[1]]
e3.corpus <- Corpus(VectorSource(list(e3.vec)))
e3.dtm <- DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 courses david have hello over taken
## 1 1 1 1 1 1 1 1
dtm <- DocumentTermMatrix(e3.corpus, control=list(wordLengths=c(1, 20)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 am c courses david have hello i over taken
## 1 1 1 1 1 1 1 1 2 1 1
doc <- tm_map(e3.corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs chello courses david have over taken
## 1 1 1 1 1 1 1
removetilde <- content_transformer(function(x, pattern) gsub("~", "", x))
doc <- tm_map(e3.corpus, removetilde)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 courses david have hello over taken
## 1 1 1 1 1 1 1 1
DocumentTermMatrix
e1 <- 'this is a book'
e2 <- 'this is my car'
str.list <- strsplit(c(e1,e2), ' ')
#str.list
corpus <- Corpus(VectorSource(str.list))
dtm <- DocumentTermMatrix(corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity : 33%
## Maximal term length: 4
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs book car this
## 1 1 0 1
## 2 0 1 1
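Only three terms survive because DocumentTermMatrix defaults to wordLengths = c(3, Inf), dropping 'a', 'is', and 'my'; lowering the bound keeps them:
dtm.all <- DocumentTermMatrix(corpus, control = list(wordLengths = c(1, Inf)))
inspect(dtm.all)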
Term-Frequency Matrix
library(jiebaR)
mixseg <- worker()
s <- "大巨蛋案對市府同仁下封口令?柯P否認"
s1 <- "柯P市府近來飽受大巨蛋爭議"
s.vec <- segment(s, jiebar = mixseg)
s1.vec <- segment(s1, jiebar = mixseg)
#s.vec
corpus <- Corpus(VectorSource(list(s.vec, s1.vec)))
dtm <- DocumentTermMatrix(corpus, control = list(wordLengths=c(2, 20)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 9)>>
## Non-/sparse entries: 11/7
## Sparsity : 39%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 下 大巨蛋 市府 同仁 爭議 近來 封口令 案對 飽受
## 1 1 1 1 1 0 0 1 1 0
## 2 0 1 1 0 1 1 0 0 1
Get DocumentTermMatrix for AppleNews
#download.file('https://raw.githubusercontent.com/ywchiu/pytextmining/master/20171003_applenews.xlsx', '20171003_applenews.xlsx')
#library(readxl)
#applenews <- read_excel("C:/Users/Administrator/Downloads/20171003_applenews.xlsx")
#View(applenews)
#head(applenews)
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(news) segment(news, mixseg))
class(apple.seg)
corpus <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(corpus)
dim(dtm)
#?DocumentTermMatrix
findFreqTerms(dtm, 200)
#dtm$dimnames$Terms == '張忠謀'
findAssocs(dtm, "張忠謀", 0.5)
findAssocs(dtm, "賭城", 0.7)
dtm.remove <- removeSparseTerms(dtm, 0.9)
dim(dtm.remove)
#dtm.remove$dimnames$Terms
inspect(dtm.remove)
m <- matrix( c(1,1,1,1,0,1,0,0,1), nrow = 3, ncol = 3, byrow = FALSE)
m
cor(m)
#cor(m[, 1], m[,2])
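This correlation of term columns across documents is essentially what findAssocs() computes on the DTM above.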
library(jiebaR)
mixseg <- worker()
s <- "大巨蛋案對市府同仁下封口令?柯P否認"
s1 <- "柯P市府近來飽受大巨蛋爭議"
s.vec <- segment(s, jiebar = mixseg)
s1.vec <- segment(s1, jiebar = mixseg)
#s.vec
corpus <- Corpus(VectorSource(list(s.vec, s1.vec)))
dtm2 <- DocumentTermMatrix(corpus, control = list(wordLengths=c(2, 20), weighting = function(x) weightTfIdf(x, normalize = FALSE)))
inspect(dtm2)
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(news) segment(news, mixseg))
class(apple.seg)
corpus <- Corpus(VectorSource(apple.seg))
dtm2 <- DocumentTermMatrix(corpus, control = list(wordLengths=c(2, Inf), weighting = function(x) weightTfIdf(x, normalize = FALSE)))
dim(dtm2)
inspect(dtm2[1:10, 1:10])
ft <- findFreqTerms(dtm2, 500)
#ft
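With TF-IDF weighting, findFreqTerms filters on cumulative weight rather than raw counts, so it surfaces terms that are both frequent and document-specific.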