作業四
library(readr)
# Load the real-estate transaction records.
# total_price is read as double on purpose: some prices (e.g. 6700000000) are
# larger than R's 32-bit integer maximum, so with the guessed integer type
# those rows fail to parse, become NA and drop out of every later sum().
lvr_data <- read_csv(
  "lvr_prices_mac.csv",
  col_types = cols(total_price = col_double())
)
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
.default = col_character(),
X1 = col_integer(),
land_sqmeter = col_double(),
trading_ymd = col_date(format = ""),
finish_ymd = col_date(format = ""),
building_sqmeter = col_double(),
room = col_integer(),
living_room = col_integer(),
bath = col_integer(),
total_price = col_integer(),
price_per_sqmeter = col_double(),
parking_sqmeter = col_double(),
parking_price = col_integer()
)
See spec(...) for full column specifications.
| | 0%
| | 0%
| | 1%
|= | 1%
|= | 1%
|= | 2%
|== | 2%
|== | 3%
|== | 3%
|=== | 3%
|=== | 4% 1 MB
|=== | 4% 1 MB
|==== | 5% 1 MB
|==== | 5% 1 MB
|==== | 5% 1 MB
|===== | 6% 1 MB
|===== | 6% 1 MB
|===== | 7% 1 MB
|===== | 7% 1 MB
|====== | 7% 1 MB
|====== | 8% 2 MB
|====== | 8% 2 MB
|======= | 9% 2 MB
|======= | 9% 2 MB
|======= | 9% 2 MB
|======== | 10% 2 MB
|======== | 10% 2 MB
|======== | 10% 2 MB
|========= | 11% 2 MB
|========= | 11% 2 MB
|========= | 12% 2 MB
|========= | 12% 3 MB
|========== | 12% 3 MB
|========== | 13% 3 MB
|========== | 13% 3 MB
|=========== | 13% 3 MB
|=========== | 14% 3 MB
|=========== | 14% 3 MB
|============ | 15% 3 MB
|============ | 15% 3 MB
|============ | 15% 3 MB
|============= | 16% 3 MB
|============= | 16% 4 MB
|============= | 17% 4 MB
|============== | 17% 4 MB
|============== | 17% 4 MB
|============== | 18% 4 MB
|============== | 18% 4 MB
|=============== | 19% 4 MB
|=============== | 19% 4 MB
|=============== | 19% 4 MB
|================ | 20% 4 MB
|================ | 20% 5 MB
|================ | 21% 5 MB
|================= | 21% 5 MB
|================= | 21% 5 MB
|================= | 22% 5 MB
|================== | 22% 5 MB
|================== | 23% 5 MB
|================== | 23% 5 MB
|=================== | 23% 5 MB
|=================== | 24% 5 MB
|=================== | 24% 6 MB
|=================== | 24% 6 MB
|==================== | 25% 6 MB
|==================== | 25% 6 MB
|==================== | 26% 6 MB
|===================== | 26% 6 MB
|===================== | 26% 6 MB
|===================== | 27% 6 MB
|====================== | 27% 6 MB
|====================== | 28% 6 MB
|====================== | 28% 6 MB
|======================= | 28% 7 MB
|======================= | 29% 7 MB
|======================= | 29% 7 MB
|======================== | 30% 7 MB
|======================== | 30% 7 MB
|======================== | 30% 7 MB
|========================= | 31% 7 MB
|========================= | 31% 7 MB
|========================= | 32% 7 MB
|========================= | 32% 7 MB
|========================== | 32% 8 MB
|========================== | 33% 8 MB
|========================== | 33% 8 MB
|=========================== | 34% 8 MB
|=========================== | 34% 8 MB
|=========================== | 34% 8 MB
|============================ | 35% 8 MB
|============================ | 35% 8 MB
|============================ | 35% 8 MB
|============================= | 36% 8 MB
|============================= | 36% 8 MB
|============================= | 37% 9 MB
|============================== | 37% 9 MB
|============================== | 37% 9 MB
|============================== | 38% 9 MB
|============================== | 38% 9 MB
|=============================== | 39% 9 MB
|=============================== | 39% 9 MB
|=============================== | 39% 9 MB
|================================ | 40% 9 MB
|================================ | 40% 9 MB
|================================ | 41% 10 MB
|================================= | 41% 10 MB
|================================= | 41% 10 MB
|================================= | 42% 10 MB
|================================== | 42% 10 MB
|================================== | 42% 10 MB
|================================== | 43% 10 MB
|=================================== | 43% 10 MB
|=================================== | 44% 10 MB
|=================================== | 44% 10 MB
|=================================== | 44% 10 MB
|==================================== | 45% 11 MB
|==================================== | 45% 11 MB
|==================================== | 46% 11 MB
|===================================== | 46% 11 MB
|===================================== | 46% 11 MB
|===================================== | 47% 11 MB
|====================================== | 47% 11 MB
|====================================== | 48% 11 MB
|====================================== | 48% 11 MB
|======================================= | 48% 11 MB
|======================================= | 49% 12 MB
|======================================= | 49% 12 MB
|======================================= | 49% 12 MB
|======================================== | 50% 12 MB
|======================================== | 50% 12 MB
|======================================== | 51% 12 MB
|========================================= | 51% 12 MB
|========================================= | 51% 12 MB
|========================================= | 52% 12 MB
|========================================== | 52% 12 MB
|========================================== | 53% 12 MB
|========================================== | 53% 13 MB
|=========================================== | 53% 13 MB
|=========================================== | 54% 13 MB
|=========================================== | 54% 13 MB
|============================================ | 55% 13 MB
|============================================ | 55% 13 MB
|============================================ | 55% 13 MB
|============================================= | 56% 13 MB
|============================================= | 56% 13 MB
|============================================= | 57% 13 MB
|============================================= | 57% 14 MB
|============================================== | 57% 14 MB
|============================================== | 58% 14 MB
|============================================== | 58% 14 MB
|=============================================== | 59% 14 MB
|=============================================== | 59% 14 MB
|=============================================== | 59% 14 MB
|================================================ | 60% 14 MB
|================================================ | 60% 14 MB
|================================================ | 61% 14 MB
|================================================= | 61% 14 MB
|================================================= | 61% 15 MB
|================================================= | 62% 15 MB
|================================================== | 62% 15 MB
|================================================== | 62% 15 MB
|================================================== | 63% 15 MB
|=================================================== | 63% 15 MB
|=================================================== | 64% 15 MB
|=================================================== | 64% 15 MB
|=================================================== | 64% 15 MB
|==================================================== | 65% 15 MB
|==================================================== | 65% 16 MB
|==================================================== | 66% 16 MB
|===================================================== | 66% 16 MB
|===================================================== | 66% 16 MB
|===================================================== | 67% 16 MB
|====================================================== | 67% 16 MB
|====================================================== | 68% 16 MB
|====================================================== | 68% 16 MB
|======================================================= | 68% 16 MB
|======================================================= | 69% 16 MB
|======================================================= | 69% 16 MB
|======================================================== | 70% 17 MB
|======================================================== | 70% 17 MB
|======================================================== | 70% 17 MB
|======================================================== | 71% 17 MB
|========================================================= | 71% 17 MB
|========================================================= | 71% 17 MB
|========================================================= | 72% 17 MB
|========================================================== | 72% 17 MB
|========================================================== | 73% 17 MB
|========================================================== | 73% 17 MB
|=========================================================== | 73% 18 MB
|=========================================================== | 74% 18 MB
|=========================================================== | 74% 18 MB
|============================================================ | 75% 18 MB
|============================================================ | 75% 18 MB
|============================================================ | 75% 18 MB
|============================================================= | 76% 18 MB
|============================================================= | 76% 18 MB
|============================================================= | 77% 18 MB
|============================================================== | 77% 18 MB
|============================================================== | 77% 19 MB
|============================================================== | 78% 19 MB
|============================================================== | 78% 19 MB
|=============================================================== | 79% 19 MB
|=============================================================== | 79% 19 MB
|=============================================================== | 79% 19 MB
|================================================================ | 80% 19 MB
|================================================================ | 80% 19 MB
|================================================================ | 81% 19 MB
|================================================================= | 81% 19 MB
|================================================================= | 81% 19 MB
|================================================================= | 82% 20 MB
|================================================================== | 82% 20 MB
|================================================================== | 83% 20 MB
|================================================================== | 83% 20 MB
|=================================================================== | 83% 20 MB
|=================================================================== | 84% 20 MB
|=================================================================== | 84% 20 MB
|=================================================================== | 84% 20 MB
|==================================================================== | 85% 20 MB
|==================================================================== | 85% 20 MB
|==================================================================== | 86% 21 MB
|===================================================================== | 86% 21 MB
|===================================================================== | 86% 21 MB
|===================================================================== | 87% 21 MB
|====================================================================== | 87% 21 MB
|====================================================================== | 88% 21 MB
|====================================================================== | 88% 21 MB
|======================================================================= | 88% 21 MB
|======================================================================= | 89% 21 MB
|======================================================================= | 89% 21 MB
|======================================================================== | 90% 21 MB
|======================================================================== | 90% 22 MB
|======================================================================== | 90% 22 MB
|========================================================================= | 91% 22 MB
|========================================================================= | 91% 22 MB
|========================================================================= | 92% 22 MB
|========================================================================== | 92% 22 MB
|========================================================================== | 92% 22 MB
|========================================================================== | 93% 22 MB
|========================================================================== | 93% 22 MB
|=========================================================================== | 94% 22 MB
|=========================================================================== | 94% 23 MB
|=========================================================================== | 94% 23 MB
|============================================================================ | 95% 23 MB
|============================================================================ | 95% 23 MB
|============================================================================ | 96% 23 MB
|============================================================================= | 96% 23 MB
|============================================================================= | 96% 23 MB
|============================================================================= | 97% 23 MB
|============================================================================== | 97% 23 MB
|============================================================================== | 97% 23 MB
|============================================================================== | 98% 23 MB
|===============================================================================| 98% 24 MB
|===============================================================================| 99% 24 MB
|===============================================================================| 99% 24 MB
|===============================================================================| 99% 24 MB
|================================================================================| 100% 24 MB
32 parsing failures.
row col expected actual
1282 total_price an integer 6700000000
2243 total_price an integer 3882685600
2244 total_price an integer 3373314400
4629 total_price an integer 3050000000
5890 total_price an integer 3133800000
.... ........... .......... ..........
See problems(...) for more details.
# Count the transactions recorded in each calendar month (YYYY-MM).
table(format(lvr_data[["trading_ymd"]], "%Y-%m"))
1973-08 1975-10 1989-03 1989-06 1994-06 1998-01
3 1 1 1 1 1
2001-12 2003-04 2003-10 2003-12 2004-04 2004-12
1 1 1 1 4 1
2005-03 2005-04 2006-05 2007-06 2007-11 2007-12
1 1 1 2 1 1
2008-03 2008-04 2008-07 2008-11 2008-12 2009-01
1 4 3 5 1 3
2009-02 2009-03 2009-04 2009-05 2009-06 2009-07
6 19 25 31 31 15
2009-08 2009-09 2009-10 2009-11 2009-12 2010-01
20 16 70 64 38 101
2010-02 2010-03 2010-04 2010-05 2010-06 2010-07
31 45 190 83 190 120
2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
100 121 115 97 142 152
2011-02 2011-03 2011-04 2011-05 2011-06 2011-07
64 83 24 20 31 75
2011-08 2011-09 2011-10 2011-11 2011-12 2012-01
63 99 85 77 62 43
2012-02 2012-03 2012-04 2012-05 2012-06 2012-07
38 137 215 192 182 978
2012-08 2012-09 2012-10 2012-11 2012-12 2013-01
1853 1950 2625 2519 3524 2179
2013-02 2013-03 2013-04 2013-05 2013-06 2013-07
1687 3009 3266 3449 3050 2761
2013-08 2013-09 2013-10 2013-11 2013-12 2014-01
2543 2715 2800 2820 3850 1953
2014-02 2014-03 2014-04 2014-05 2014-06 2014-07
1508 2638 2883 2220 1995 2123
2014-08 2014-09 2014-10 2014-11 2014-12 2015-01
1849 1913 2164 2179 2884 1454
2015-02 2015-03 2015-04 2015-05 2015-06 2015-07
948 1816 1821 1717 1621 1697
2015-08 2015-09 2015-10 2015-11 2015-12 2016-01
1476 1632 2074 2392 2740 1048
2016-02 2016-03 2016-04 2016-05
590 1036 685 47
# Truncate every trading date to the first day of its month.
lvr_data$trading_ym <- as.Date(format(lvr_data$trading_ymd, "%Y-%m-01"))
library(dplyr)

# Total transaction value per month, from January 2012 onwards.
lvr_stat <- lvr_data %>%
  select(trading_ym, total_price) %>%
  filter(trading_ym >= "2012-01-01") %>%
  group_by(trading_ym) %>%
  summarise(overall_price = sum(as.numeric(total_price), na.rm = TRUE))
plot(overall_price ~ trading_ym, data = lvr_stat, type = "l")

# The same monthly total, split by the `area` column.
lvr_stat2 <- lvr_data %>%
  select(trading_ym, total_price, area) %>%
  filter(trading_ym >= "2012-01-01") %>%
  group_by(trading_ym, area) %>%
  summarise(overall_price = sum(as.numeric(total_price), na.rm = TRUE))

# A factor gives the plotting code below a fixed set of levels to iterate over.
lvr_stat2$area <- as.factor(lvr_stat2$area)
# Answer 1: one monthly line chart per area, laid out on a 4 x 3 panel grid.
par(mfrow = c(4, 3))

for (a in levels(lvr_stat2$area)) {
  plot(
    overall_price ~ trading_ym,
    data = lvr_stat2[lvr_stat2$area == a, ],
    type = "l",
    main = a
  )
}
# Answer 2: spread of the monthly totals for each area, on a single panel.
par(mfrow = c(1, 1))

boxplot(overall_price ~ area, data = lvr_stat2, cex = 0.1)
# Answer 3: total transaction value per area over the whole data set,
# sorted from the largest total to the smallest.
par(mfrow = c(1, 1))

lvr_stat3 <- lvr_data %>%
  select(total_price, area) %>%
  group_by(area) %>%
  summarise(overall_price = sum(as.numeric(total_price), na.rm = TRUE)) %>%
  arrange(desc(overall_price))
barplot(
  lvr_stat3$overall_price,
  names.arg = lvr_stat3$area,
  col = factor(lvr_stat3$area)
)

房價觀察
# Monthly median unit price for one area and one trading target.
# NOTE(review): the date filter is strict (`>`), so rows stamped 2012-01-01
# (i.e. January 2012) are excluded here, unlike the `>=` filters used for
# lvr_stat and lvr_stat2 -- confirm this is intended.
lvr_stat4 <- lvr_data %>%
  filter(
    price_per_sqmeter > 0,
    area == "大安區",
    trading_ym > "2012-01-01",
    trading_target == "房地(土地+建物)"
  ) %>%
  select(trading_ym, price_per_sqmeter) %>%
  group_by(trading_ym) %>%
  summarise(
    median_price = median(as.numeric(price_per_sqmeter), na.rm = TRUE)
  )

# Presumably converts price per square metre to price per ping
# (1 square metre = 0.3025 ping) -- TODO confirm the intended unit.
lvr_stat4$median_price <- lvr_stat4$median_price / 0.3025
lvr_stat4
plot(median_price ~ trading_ym, data = lvr_stat4, type = "l")
英文斷詞
# Split an English sentence into words on single spaces.
# `a` is defined here so the call does not depend on whatever value an
# earlier section left in `a`; the sentence matches the recorded output.
a <- "this is a book"
strsplit(a, " ")
[[1]]
[1] "this" "is" "a" "book"
中文斷詞
#install.packages("jiebaR")
library(jiebaR)

# Segment a short Chinese sentence with a worker built from default settings.
a <- "酸民婉君也可以報名嗎?"
mixseg <- worker()
segment(code = a, jiebar = mixseg)

# Same segmentation on a second sentence, with a freshly created worker.
s <- "那我們酸民婉君也可以報名嗎"
mixseg <- worker()
segment(code = s, jiebar = mixseg)

# Both names come from jiebaR; presumably edit_dict() opens the user
# dictionary for editing and USERPATH is its file path -- see jiebaR docs.
edit_dict()
USERPATH

# Worker of type "tag" applied to the same sentence.
tagseg <- worker("tag")
segment(code = s, jiebar = tagseg)

# Worker of type "keywords" limited to the top 3 results; `<=` here is the
# operator jiebaR provides for applying a worker to text, not a comparison.
test1 <- worker("keywords", topn = 3)
test1 <= s
s <- '金曲歌王蕭敬騰(老蕭)驚傳全身癱軟發冷,連站都無力,原來是感染「A型流感」,不得不取消今晚8時半在建國中學校慶的採訪通告。疾病管制署表示,今年的流感疫情在10月中開始出現,11月初曾達到1周32例重症。但由於今年擴大公費流感疫苗的施打對象,因此11月初以後疫情漸緩,沒有上升,可能要問蕭敬騰有沒有打流感疫苗。'
# Split the article on punctuation. A bracket expression is used instead of
# the alternation '。|,|」|(|)|「' because unescaped parentheses in an
# alternation form a regex group rather than matching the literal characters.
# Both ASCII and full-width commas/parentheses are listed so the split works
# whichever form the text uses.
strsplit(s, "[。,,」「()()]")
library(jiebaR)
# Word-segment the whole article with a default worker.
mixseg <- worker()
segment(code = s, jiebar = mixseg)
Calculate TF-IDF
# Toy corpus for the TF-IDF walkthrough: three documents, each a vector of
# terms, collected in the list D.
a <- "a"
abb <- c("a", rep("b", 2))
abc <- c("a", "b", "c")
D <- list(a, abb, abc)
# Compute the TF-IDF weight of a term in one document of a corpus.
#
# t: the term to score (a single string).
# d: the document, as a character vector of terms.
# D: the corpus, a list of documents shaped like `d`.
#
# Returns tf * idf, named by the term, where
#   tf  = occurrences of `t` in `d` / number of terms in `d`
#   idf = log(number of documents in D / number of documents containing `t`)
# A term that does not occur in `d` scores 0 (previously a zero-length value
# was returned, which silently vanished in downstream arithmetic).
tfidf <- function(t, d, D) {
  counts <- table(d)
  hit <- names(counts) == t
  if (!any(hit)) {
    # Term frequency is zero, so the weight is zero whatever the idf is.
    return(stats::setNames(0, t))
  }
  tf <- counts[hit] / sum(counts)
  # vapply keeps the result logical even when D is empty.
  doc_freq <- sum(vapply(D, function(e) t %in% e, logical(1)))
  idf <- log(length(D) / doc_freq)
  tf * idf
}
tfidf('a', a, D)
a
0
tf <- 1/1
idf <- log(3/3)
tf * idf
[1] 0
tfidf('a', abb, D)
a
0
tf <- 1/3
idf <- log(3/3)
tf * idf
[1] 0
tfidf('b', abb, D)
b
0.2703101
tf <- 2/3
idf <- log(3/2)
tf * idf
[1] 0.2703101
tfidf('b', abc, D)
b
0.135155
tf <- 1/3
idf <- log(3/2)
tf * idf
[1] 0.135155
tfidf('c', abc, D)
c
0.3662041
tf <- 1/3
idf <- log(3/1)
tf * idf
[1] 0.3662041
詞頻分析 (文字資料)
article <- '金曲歌王蕭敬騰(老蕭)驚傳全身癱軟發冷,連站都無力,原來是感染「A型流感」,不得不取消今晚8時半在建國中學校慶的採訪通告。疾病管制署表示,今年的流感疫情在10月中開始出現,11月初曾達到1周32例重症。但由於今年擴大公費流感疫苗的施打對象,因此11月初以後疫情漸緩,沒有上升,可能要問蕭敬騰有沒有打流感疫苗。
疾管署副署長莊人祥表示,今年10月起的流感季疫苗施打,擴大讓50至64歲成人及13至18歲青少年等,均納入公費疫苗施打對象,接種涵蓋率從全人口的13%增加25%,而流感疫情雖提早在10月中開始出現,11月初曾達到1周32例重症,但此後每周的重症人數逐漸減緩,沒再上升。相較於今年年初1周最高311例重症,目前疫情持平,沒有增溫。
疾管署統計,今年公費流感疫苗自10月1日開打至12月8日,接種數已超過572萬劑,整體疫苗使用率逾95%。為使疫苗在流感流行季來臨前發揮最大效益,疾管署已於12月1日起擴大公費流感疫苗接種對象至全國民眾,依目前疫苗使用率估計,最近1到2周內部分縣市將陸續出現疫苗打完的情形,呼籲尚未接種的民眾把握機會儘速接種。
莊人祥提醒,民眾平時應落實勤洗手及注意呼吸道衛生,避免出入人潮擁擠、空氣不流通的公共場所,防範流感病毒;如出現呼吸困難、急促、發紺(缺氧)、血痰或痰液變濃、胸痛、意識改變、低血壓等流感危險徵兆應及早就醫,必要時依醫師指示規則使用公費抗病毒藥劑並在家休養。(黃仲丘/台北報導)'
文字雲
library(jiebaR)
# Segment the article and tabulate how often each token occurs.
mixseg <- worker()
article.seg <- segment(code = article, jiebar = mixseg)
tb <- table(article.seg)
sort(tb, decreasing = TRUE)

# install.packages('wordcloud2')
library(wordcloud2)
# Keep tokens that are at least two characters long, occur at least twice,
# and contain at least one character in the range U+4E00..U+9FA5.
tb2 <- tb[
  (nchar(names(tb)) >= 2) &
    (tb >= 2) &
    grepl("[\u4e00-\u9fa5]+", names(tb))
]
wordcloud2(tb2)
wordcloud2(tb2, shape = "triangle")
使用tm模組
library(tm)
Loading required package: NLP
英文建立詞頻矩陣
# Build a one-document corpus from an English sentence and show its
# document-term matrix under tm's default settings.
e3 <- "Hello, I am David. I have taken over 100 courses ~~~"
e3.corpus <- Corpus(x = VectorSource(x = e3))
e3.dtm <- DocumentTermMatrix(x = e3.corpus)
inspect(e3.dtm)
<<DocumentTermMatrix (documents: 1, terms: 8)>>
Non-/sparse entries: 8/0
Sparsity : 0%
Maximal term length: 7
Weighting : term frequency (tf)
Terms
Docs ~~~ 100 courses david. have hello, over taken
1 1 1 1 1 1 1 1 1
# Allow terms of length 1 to 20 so short words such as "i" and "am" are kept.
dtm <- DocumentTermMatrix(
  e3.corpus,
  control = list(wordLengths = c(1, 20))
)
inspect(dtm)
<<DocumentTermMatrix (documents: 1, terms: 10)>>
Non-/sparse entries: 10/0
Sparsity : 0%
Maximal term length: 7
Weighting : term frequency (tf)
Terms
Docs ~~~ 100 am courses david. have hello, i over taken
1 1 1 1 1 1 1 1 2 1 1
#install.packages('SnowballC')
# Stem three related English words to compare the stems they reduce to.
stemDocument(x = c("image", "imagine", "imagination"))
[1] "imag" "imagin" "imagin"
# Strip digits first, then punctuation, before rebuilding the matrix.
doc <- tm_map(tm_map(e3.corpus, removeNumbers), removePunctuation)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
<<DocumentTermMatrix (documents: 1, terms: 6)>>
Non-/sparse entries: 6/0
Sparsity : 0%
Maximal term length: 7
Weighting : term frequency (tf)
Terms
Docs courses david have hello over taken
1 1 1 1 1 1 1
# Custom transformation that deletes every "~" from a document's content.
# The `pattern` parameter is accepted but not used by the function body.
removetilde <- content_transformer(
  function(x, pattern) gsub(pattern = "~", replacement = "", x = x)
)
doc <- tm_map(e3.corpus, removetilde)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
<<DocumentTermMatrix (documents: 1, terms: 7)>>
Non-/sparse entries: 7/0
Sparsity : 0%
Maximal term length: 7
Weighting : term frequency (tf)
Terms
Docs 100 courses david. have hello, over taken
1 1 1 1 1 1 1 1
# Two short English documents sharing some words, turned into a
# two-row document-term matrix.
e1 <- "this is a book"
e2 <- "this is my car"
e.vec <- list(e1, e2)
e.corpus <- Corpus(x = VectorSource(x = e.vec))
e.dtm <- DocumentTermMatrix(x = e.corpus)
inspect(e.dtm)
<<DocumentTermMatrix (documents: 2, terms: 3)>>
Non-/sparse entries: 4/2
Sparsity : 33%
Maximal term length: 4
Weighting : term frequency (tf)
Terms
Docs book car this
1 1 0 1
2 0 1 1
# Segment two Chinese headlines with jiebaR and feed the token vectors
# straight into a plain tm corpus / document-term matrix.
s <- "大巨蛋案對市府同仁下封口令?柯P否認"
s1 <- "柯P市府近來飽受大巨蛋爭議"
library(jiebaR)
mixseg <- worker()
s.vec <- segment(code = s, jiebar = mixseg)
s1.vec <- segment(code = s1, jiebar = mixseg)
s.corpus <- Corpus(x = VectorSource(x = list(s.vec, s1.vec)))
s.dtm <- DocumentTermMatrix(x = s.corpus)
inspect(s.dtm)
<<DocumentTermMatrix (documents: 2, terms: 3)>>
Non-/sparse entries: 3/3
Sparsity : 50%
Maximal term length: 8
Weighting : term frequency (tf)
Terms
Docs 下\n封口令\n柯p 大巨蛋\n爭議 大巨蛋\n案對
1 1 0 1
2 0 1 0
中文詞頻矩陣
# NOTE(review): source() runs code fetched from the network at run time;
# consider pinning a reviewed local copy of CNCorpus.R.
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')

# Re-segment both headlines and build the corpus with CNCorpus().
s.vec <- segment(code = s, jiebar = mixseg)
s1.vec <- segment(code = s1, jiebar = mixseg)
s.corpus <- CNCorpus(list(s.vec, s1.vec))

# Keep terms of any length from 1 up; space_tokenizer is presumably defined
# by the sourced CNCorpus.R -- verify.
control.list <- list(wordLengths = c(1, Inf), tokenize = space_tokenizer)

# Pass the options by name. The previous `control<-control.list` was an
# assignment evaluated inside the call: it only worked because the value
# landed in the second positional slot, and it left a stray global `control`.
s.dtm <- DocumentTermMatrix(s.corpus, control = control.list)
inspect(s.dtm)
<<DocumentTermMatrix (documents: 2, terms: 11)>>
Non-/sparse entries: 14/8
Sparsity : 36%
Maximal term length: 3
Weighting : term frequency (tf)
Terms
Docs 下 大巨蛋 市府 同仁 否認 爭議 近來 封口令 柯p 案對
1 1 1 1 1 1 0 0 1 1 1
2 0 1 1 0 0 1 1 0 1 0
Terms
Docs 飽受
1 0
2 1
下載一例一休
# NOTE(review): source() runs code fetched from the network at run time;
# consider pinning a reviewed local copy of CNCorpus.R.
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
download.file('https://github.com/ywchiu/rtibame/raw/master/Data/oneday.csv', 'oneday.csv')
library(readr)
oneday <- read_csv("oneday.csv")
#str(oneday)

# Segment the `content` column of every row and build a corpus from the
# resulting token vectors.
library(jiebaR)
mixseg <- worker()
article.seg <- lapply(oneday$content, function(e) segment(e, mixseg))
s.corpus <- CNCorpus(article.seg)

# Keep only terms of at least two characters; space_tokenizer is presumably
# defined by the sourced CNCorpus.R -- verify.
control.list <- list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
doc <- tm_map(s.corpus, removeNumbers)

# Pass the options by name. The previous `control<-control.list` was an
# assignment evaluated inside the call: it only worked because the value
# landed in the second positional slot, and it left a stray global `control`.
s.dtm <- DocumentTermMatrix(doc, control = control.list)
#s.dtm$dimnames$Terms

# Terms whose total frequency lies between 50 and 100.
findFreqTerms(s.dtm, 50, 100)
dim(s.dtm)

# Drop terms whose sparsity exceeds 0.95 and compare the dimensions.
dtm.remove <- removeSparseTerms(s.dtm, 0.95)
dim(dtm.remove)
#dtm.remove$dimnames$Terms
lapply example
# Sum each numeric vector held in a list; lapply returns the sums as a list.
a <- list(c(1, 2, 3), c(3, 4, 5))
lapply(X = a, FUN = sum)
