Linear Regression
data(anscombe)
plot(y1 ~ x1, data = anscombe)

fit <- lm(y1 ~ x1, data = anscombe)
fit
##
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
##
## Coefficients:
## (Intercept) x1
## 3.0001 0.5001
summary(fit)
##
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92127 -0.45577 -0.04136 0.70941 1.83882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0001 1.1247 2.667 0.02573 *
## x1 0.5001 0.1179 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
plot(y1 ~ x4, data = anscombe)

fit <- lm(y1 ~ x4, data = anscombe)
fit
##
## Call:
## lm(formula = y1 ~ x4, data = anscombe)
##
## Coefficients:
## (Intercept) x4
## 10.4177 -0.3241
summary(fit)
##
## Call:
## lm(formula = y1 ~ x4, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.005 -0.730 0.000 0.745 3.015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.4177 1.6528 6.303 0.00014 ***
## x4 -0.3241 0.1733 -1.871 0.09422 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.817 on 9 degrees of freedom
## Multiple R-squared: 0.2799, Adjusted R-squared: 0.1999
## F-statistic: 3.499 on 1 and 9 DF, p-value: 0.09422
house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv')
head(house_prices)
## Home Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 1 114300 1790 2 2 2 No East
## 2 2 114200 2030 4 2 3 No East
## 3 3 114800 1740 3 2 1 No East
## 4 4 94700 1980 3 2 3 No East
## 5 5 119800 2130 3 3 3 No East
## 6 6 114600 1780 3 2 2 No North
fit <- lm(Price ~ SqFt, data = house_prices)
summary(fit)
##
## Call:
## lm(formula = Price ~ SqFt, data = house_prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46593 -16644 -1610 15124 54829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10091.130 18966.104 -0.532 0.596
## SqFt 70.226 9.426 7.450 1.3e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22480 on 126 degrees of freedom
## Multiple R-squared: 0.3058, Adjusted R-squared: 0.3003
## F-statistic: 55.5 on 1 and 126 DF, p-value: 1.302e-11
plot(Price ~ SqFt, data = house_prices)

fit <- lm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col='red')

predicted <- predict(fit, anscombe)
rmse <- mean((predicted - anscombe$y3)^2) ^ (1/2)
rmse
## [1] 1.118286
actual <- anscombe$y3
mu <- mean(anscombe$y3)
rse <-mean((predicted -actual )^ 2)/mean(( mu -actual )^ 2)
Rsquare <- 1- rse
Rsquare
## [1] 0.666324
fit2 <- lm(y2 ~ x1, data = anscombe)
plot(y2 ~ x1, data = anscombe)
abline(fit2, col='red')

predicted <- predict(fit2, anscombe)
mean((predicted - anscombe$y2)^2) ^ (1/2)
## [1] 1.119102
fit3 <- lm(y2 ~ poly(x1, 2), data = anscombe)
plot(y2 ~ x1, data = anscombe)
lines(anscombe$x1[order(anscombe$x1)], fit3$fitted.values[order(anscombe$x1)], col = 'red')

predicted <- predict(fit3, anscombe)
mean((predicted - anscombe$y2)^2) ^ (1/2)
## [1] 0.001426299
Multiple Regression
house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv')
head(house_prices)
## Home Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 1 114300 1790 2 2 2 No East
## 2 2 114200 2030 4 2 3 No East
## 3 3 114800 1740 3 2 1 No East
## 4 4 94700 1980 3 2 3 No East
## 5 5 119800 2130 3 3 3 No East
## 6 6 114600 1780 3 2 2 No North
house_prices$brick_d<-ifelse(house_prices$Brick=="Yes",1,0)
house_prices$east<-
ifelse(house_prices$Neighborhood=="East",1,0)
house_prices$north<-
ifelse(house_prices$Neighborhood=="North",1,0)
set.seed(123)
idx <- sample.int(2, nrow(house_prices), prob=c(0.6,0.4), replace=TRUE)
training_data <- house_prices[idx == 1,]
validation_data <- house_prices[idx == 2,]
lm.fit1 <- lm(Price ~ SqFt + Bathrooms + Bedrooms + Offers+
north + east + brick_d, data=training_data)
summary(lm.fit1)
##
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north +
## east + brick_d, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25491.4 -5131.2 -490.5 6769.7 20369.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13252.14 12531.98 1.057 0.2939
## SqFt 61.48 6.89 8.923 3.66e-13 ***
## Bathrooms 3266.08 2487.80 1.313 0.1935
## Bedrooms 4823.22 2015.58 2.393 0.0194 *
## Offers -8331.03 1260.22 -6.611 6.35e-09 ***
## north -17452.11 3911.56 -4.462 3.03e-05 ***
## east -20680.68 3159.79 -6.545 8.35e-09 ***
## brick_d 19541.81 2412.24 8.101 1.19e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9146 on 70 degrees of freedom
## Multiple R-squared: 0.8922, Adjusted R-squared: 0.8814
## F-statistic: 82.74 on 7 and 70 DF, p-value: < 2.2e-16
model <- step(lm.fit1)
## Start: AIC=1430.44
## Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east +
## brick_d
##
## Df Sum of Sq RSS AIC
## - Bathrooms 1 144167994 5.9994e+09 1430.3
## <none> 5.8552e+09 1430.4
## - Bedrooms 1 478980711 6.3342e+09 1434.6
## - north 1 1665108832 7.5203e+09 1448.0
## - east 1 3583097368 9.4383e+09 1465.7
## - Offers 1 3655512174 9.5107e+09 1466.3
## - brick_d 1 5489527535 1.1345e+10 1480.0
## - SqFt 1 6659891685 1.2515e+10 1487.7
##
## Step: AIC=1430.34
## Price ~ SqFt + Bedrooms + Offers + north + east + brick_d
##
## Df Sum of Sq RSS AIC
## <none> 5.9994e+09 1430.3
## - Bedrooms 1 692383542 6.6918e+09 1436.9
## - north 1 1714508808 7.7139e+09 1448.0
## - Offers 1 3533703830 9.5331e+09 1464.5
## - east 1 3667946556 9.6673e+09 1465.5
## - brick_d 1 6085356765 1.2085e+10 1483.0
## - SqFt 1 7460095606 1.3459e+10 1491.4
library(car)
## Loading required package: carData
vif(model)
## SqFt Bedrooms Offers north east brick_d
## 1.835850 1.842482 1.895115 3.032704 2.197699 1.158608
training_data$predict.price <- predict(lm.fit1)
training_data$error <- residuals(lm.fit1)
validation_data$predict.price <- predict(lm.fit1,newdata=validation_data)
validation_data$error <- validation_data$predict.price - validation_data$Price
hist(training_data$error)

hist(validation_data$error)

a<-cor(training_data$Price,training_data$predict.price)
b<-cor(validation_data$Price,validation_data$predict.price)
a*a
## [1] 0.8921676
b*b
## [1] 0.8097617
591 爬網
library(jsonlite)
1423 / 30
## [1] 47.43333
houseall <- data.frame()
for (i in seq(0,1)){
house_url <- paste0('https://sale.591.com.tw/home/search/list?type=2&&shType=list&kind=9®ionid=1§ion=5&shape=2&firstRow=',i * 30,'&totalRows=1423')
house <- fromJSON(house_url)
houseall <- rbind(houseall, house$data$house_list[,c('room','floor','area','houseage','price')])
print(i)
}
## [1] 0
## [1] 1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
houseall <- houseall[! is.na(houseall$floor),]
houseall <- houseall %>% extract(floor,c("building_floor", "total_floor"), "(\\d+)F/(\\d+)F")
houseall <- houseall %>% extract(room,c("bedroom", "living_room","bathroom"), "(\\d+)房(\\d+)廳(\\d+)衛")
str(houseall)
## 'data.frame': 60 obs. of 8 variables:
## $ bedroom : chr "3" "1" "7" "2" ...
## $ living_room : chr "2" "1" "6" "2" ...
## $ bathroom : chr "2" "1" "6" "1" ...
## $ building_floor: chr "2" "3" "6" "5" ...
## $ total_floor : chr "7" "7" "6" "6" ...
## $ area : num 49.2 10.5 52.4 34.6 13 ...
## $ houseage : int 40 0 32 8 8 34 6 41 41 49 ...
## $ price : int 4978 1418 4588 3588 1488 4968 3550 3480 3200 1988 ...
houseall$bedroom <- as.numeric(houseall$bedroom)
houseall$living_room <- as.numeric(houseall$living_room)
houseall$bathroom <- as.numeric(houseall$bathroom)
houseall$building_floor <- as.numeric(houseall$building_floor)
houseall$total_floor <- as.numeric(houseall$total_floor)
#write.csv(houseall,file = '591housefull.csv')
houseall <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/591housefull.csv')
houseall$X <- NULL
fit <- lm(price ~ ., data=houseall)
summary(fit)
##
## Call:
## lm(formula = price ~ ., data = houseall)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6376.3 -507.4 48.0 472.0 10934.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.356 163.302 0.149 0.881458
## bedroom 48.699 44.945 1.084 0.278773
## living_room -515.002 93.690 -5.497 4.62e-08 ***
## bathroom 337.373 72.810 4.634 3.94e-06 ***
## building_floor 20.069 10.051 1.997 0.046048 *
## total_floor 25.624 7.654 3.348 0.000837 ***
## area 109.113 1.853 58.879 < 2e-16 ***
## houseage -14.583 2.645 -5.515 4.19e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1224 on 1348 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.9023, Adjusted R-squared: 0.9018
## F-statistic: 1779 on 7 and 1348 DF, p-value: < 2.2e-16
step.fit <- step(fit)
## Start: AIC=19289.38
## price ~ bedroom + living_room + bathroom + building_floor + total_floor +
## area + houseage
##
## Df Sum of Sq RSS AIC
## - bedroom 1 1758100 2020417631 19289
## <none> 2018659531 19289
## - building_floor 1 5970943 2024630474 19291
## - total_floor 1 16785574 2035445105 19299
## - bathroom 1 32151953 2050811484 19309
## - living_room 1 45248293 2063907825 19317
## - houseage 1 45540861 2064200392 19318
## - area 1 5191536475 7210196006 21014
##
## Step: AIC=19288.56
## price ~ living_room + bathroom + building_floor + total_floor +
## area + houseage
##
## Df Sum of Sq RSS AIC
## <none> 2020417631 19289
## - building_floor 1 5866539 2026284169 19291
## - total_floor 1 15952797 2036370428 19297
## - bathroom 1 37019141 2057436772 19311
## - living_room 1 43728396 2064146027 19316
## - houseage 1 43811411 2064229042 19316
## - area 1 5912245965 7932663596 21141
summary(step.fit)
##
## Call:
## lm(formula = price ~ living_room + bathroom + building_floor +
## total_floor + area + houseage, data = houseall)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6540.6 -484.5 40.8 473.0 10926.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.782 161.875 0.295 0.76790
## living_room -487.894 90.294 -5.403 7.72e-08 ***
## bathroom 353.939 71.192 4.972 7.49e-07 ***
## building_floor 19.890 10.050 1.979 0.04800 *
## total_floor 24.879 7.623 3.264 0.00113 **
## area 109.782 1.747 62.829 < 2e-16 ***
## houseage -14.091 2.605 -5.409 7.51e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1224 on 1349 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.9022, Adjusted R-squared: 0.9018
## F-statistic: 2075 on 6 and 1349 DF, p-value: < 2.2e-16
predict(step.fit, head(houseall, 1))
## 1
## 4836.917
Word Segment
s <- 'I love to eat apple'
strsplit(s, ,split = ' ')
## [[1]]
## [1] "I" "love" "to" "eat" "apple"
s <- '我喜歡吃蘋果'
library(jiebaR)
## Loading required package: jiebaRD
mixseg <- worker()
segment(code = s, mixseg)
## [1] "我" "喜歡" "吃" "蘋果"
s <- '北海道日前發生規模6.7強震,各地傳出嚴重災情,《蘋果》採訪團驅車前往災情嚴重的厚真町,沿途有房屋因強震磚塊、玻璃散落一地,部分交通號誌仍因停電尚未復用,數台大型軍用車輛不斷往厚真町方向行駛,而也因土石沖刷嚴重,目前往厚真町的道路已立牌禁止通行。'
strsplit(s,split=',|《|》|。|、' )
## [[1]]
## [1] "北海道日前發生規模6.7強震"
## [2] "各地傳出嚴重災情"
## [3] ""
## [4] "蘋果"
## [5] "採訪團驅車前往災情嚴重的厚真町"
## [6] "沿途有房屋因強震磚塊"
## [7] "玻璃散落一地"
## [8] "部分交通號誌仍因停電尚未復用"
## [9] "數台大型軍用車輛不斷往厚真町方向行駛"
## [10] "而也因土石沖刷嚴重"
## [11] "目前往厚真町的道路已立牌禁止通行"
JiebaR
s <- '那酸民婉君也可以報名嗎?'
library(jiebaR)
#?worker
mixseg <- worker()
segment(s, mixseg)
## [1] "那" "酸民" "婉君" "也" "可以" "報名" "嗎"
#edit_dict()
tagseg <- worker('tag')
segment(s, tagseg)
## r n n d c v y
## "那" "酸民" "婉君" "也" "可以" "報名" "嗎"
news <- '日本近日受到颱風、地震重創,我駐日代表處卻無因應措施,甚至有滯留當地的台灣民眾致電詢問能否提供住宿協助卻被冷言回應,讓駐日代表謝長廷的臉書湧入一片罵聲,質疑他毫無作為。謝長廷 7 日搭機到北海道協助駐札幌辦事處,受訪時強調若有責任自己定會承擔,面對指責也不覺得委屈,至於國民黨立委要求他下台,他則回應「政府叫我下台我就下台」。
綜合媒體報導,謝長廷 7 日搭機前往北海道協助駐札幌辦事處,針對外界的批評,謝長廷受訪時表示,有些人搞不清楚辦事處和代表處,「有的以為關西機場在我家隔壁,東京根本沒下雨,我還是照常在工作,你當然可以問我做什麼,我都可以說明。」
談到中國大使館派車接人一事,謝長廷解釋,所謂的巴士是關西機場安排的巴士,當時沒有任何外國巴士能進機場,中國大陸的巴士也是停在 11.6 公里外接駁,而其他國家像美國、南韓等也沒有派車去接人,也沒人在罵這些國家的大使。
至於國民黨立委要求他回台報告、甚至下台,謝長廷則說沒關係,立委要他回來就回來,「政府叫我下台我就下台」,這些都無所謂,對於外界批評也不覺得委屈,做政治沒有什麼委屈的,現在只想把事情處理好,剩下的之後再解釋。'
library(jiebaR)
mixseg <- worker()
seg.list <- segment(news, mixseg)
tb <- table(seg.list)
tb <- tb[(nchar(names(tb)) >= 2) & (tb >= 2)]
#tb
library(wordcloud2)
wordcloud2(tb, shape = 'star')
TFIDF
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a,abb,abc)
#tfidf('a', a, D)
tf <- 1/1
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abb, D)
tf <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abc, D)
tf <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('b', abb, D)
tf <- 2/3
idf <- log(3/2)
tf * idf
## [1] 0.2703101
#tfidf('b', abc, D)
tf <- 1/3
idf <- log(3/2)
tf * idf
## [1] 0.135155
#tfidf('c', abc, D)
tf <- 1/3
idf <- log(3/1)
tf * idf
## [1] 0.3662041
tfidf <- function(t, d, D){
tf <- sum(d == t) / length(d)
idf <- log(length(D) / sum(sapply(D, function(e) t %in% e)))
tf * idf
}
tfidf('a', a, D)
## [1] 0
tfidf('a', abb, D)
## [1] 0
tfidf('a', abc, D)
## [1] 0
tfidf('b', abb, D)
## [1] 0.2703101
tfidf('b', abc, D)
## [1] 0.135155
tfidf('c', abc, D)
## [1] 0.3662041
Term Document Frequency
library(tm)
## Loading required package: NLP
e3 <- 'Hello, I am David. I have taken over 100 courses ~~~'
e3.vec <- strsplit(e3,' ')
e3.corpus <- Corpus(VectorSource(e3.vec))
e3.corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
e3.dtm <- DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 courses david have hello over taken
## 1 1 1 1 1 1 1 1
e3.dtm <- DocumentTermMatrix(e3.corpus, control=list(wordLengths=c(1,20)))
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 am c courses david have hello i over taken
## 1 1 1 1 1 1 1 1 2 1 1
doc <- tm_map(e3.corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(e3.corpus, removeNumbers): transformation
## drops documents
e3.dtm <- DocumentTermMatrix(doc)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs courses david have hello over taken
## 1 1 1 1 1 1 1
e3 <- 'imaging image imagination imaging'
e3.vec <- strsplit(e3,' ')
e3.corpus <- Corpus(VectorSource(e3.vec))
e3.corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
doc <- tm_map(e3.corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(e3.corpus, removeNumbers): transformation
## drops documents
?stemDocument
## starting httpd help server ...
## done
doc <- tm_map(doc,stemDocument)
## Warning in tm_map.SimpleCorpus(doc, stemDocument): transformation drops
## documents
e3.dtm <- DocumentTermMatrix(doc)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 3)>>
## Non-/sparse entries: 3/0
## Sparsity : 0%
## Maximal term length: 11
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs image imagination imaging
## 1 1 1 2
e1 <- 'this is a book'
e2 <- 'this is my car'
str.split <- strsplit(c(e1,e2), split = ' ')
str.corpus <- Corpus(VectorSource(str.split))
dtm <- DocumentTermMatrix(str.corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity : 33%
## Maximal term length: 4
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs book car this
## 1 1 0 1
## 2 0 1 1
中文詞頻矩陣
library(jiebaR)
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
s <- '大巨蛋案對市府同仁下封口令?柯p否認'
s1 <- '柯p市府近來飽受大巨蛋爭議'
seg.list <- lapply(list(s,s1), function(e)segment(e, mixseg))
#seg.list
str.corpus <- Corpus(VectorSource(seg.list))
dtm <- DocumentTermMatrix(str.corpus )
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 8)>>
## Non-/sparse entries: 10/6
## Sparsity : 38%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 大巨蛋 市府 同仁 爭議 近來 封口令 案對 飽受
## 1 1 1 1 0 0 1 1 0
## 2 1 1 0 1 1 0 0 1
詞頻矩陣應用
load("C:/Users/USER/Downloads/applenews.RData")
head(applenews)
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, mixseg))
apple.corpus <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(apple.corpus, control=list(weighting = function(x)
weightTfIdf(x, normalize =FALSE)))
?DocumentTermMatrix
dim(dtm)
sort(apply(as.matrix(dtm), sum,MARGIN = 2), decreasing=TRUE)[1:10]
?sort
findFreqTerms(dtm, lowfreq = 300)
dtm <- DocumentTermMatrix(apple.corpus)
findAssocs(dtm,"肯亞", 0.5)
findAssocs(dtm,"浩鼎", 0.7)
dim(dtm)
dtm.remove <- removeSparseTerms(dtm, 0.99)
dim(dtm.remove)
文章相似度
# 大巨蛋 柯文哲 趙藤雄 麻煩
# 大巨蛋 柯文哲 趙藤雄 麻煩 遠雄 牢獄 法庭 ....
# 1 1 1 1 0 0 0
# 1 1 1 1 1 1 1
a <- c(1,1,1,1,0,0,0)
b <- c(1,1,1,1,1,1,1)
dist(rbind(a,b))
# 我喜歡看電視不喜歡看電影
# 我不喜歡看電視也不喜歡看電影
#我 1,喜歡 2,看 2,電視 1,電影 1,不 1,也 0。
#我 1,喜歡 2,看 2,電視 1,電影 1,不 2,也 1。
library(proxy)
a <- c(1,2,2,1,1,1,0)
b <- c(1,2,2,1,1,2,1)
proxy::dist(rbind(a,b), method = 'cosine')
load("C:/Users/USER/Downloads/applenews.RData")
library(jiebaR)
library(tm)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, mixseg))
apple.corpus <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(apple.corpus)
dtm.remove <- removeSparseTerms(dtm, 0.99)
dim(dtm)
dim(dtm.remove)
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
dim(dtm.dist)
applenews$title[80]
dtm.mat <- as.matrix(dtm.dist)
order(dtm.mat[80,])[1:10]
applenews$title[order(dtm.mat[80,])[1:10]]
article.query = function(idx){
applenews$title[as.integer(names(sort(dtm.mat[idx, which(dtm.mat[idx,] <
0.8)])))]
}
article.query(80)[1:10]
文章分群
hc <- hclust(dtm.dist, method = 'ward.D2')
plot(hc)
rect.hclust(hc, k=12)
fit <- cutree(hc, k = 12)
table(fit)
applenews$title[fit ==1]
library(igraph)
m <- ifelse(dtm.mat < 0.4, 1, 0)
G <- graph_from_adjacency_matrix(m)
wc <- cluster_walktrap(G)
modularity(wc)
table(membership(wc))
group <- membership(wc)
#group
applenews$title[group ==8]
文章分類
apple.subset <- applenews[applenews$category %in% c('財經','娛樂'),]
library(jiebaR)
mixseg <- worker()
apple.seg =lapply(apple.subset$content, function(e)segment(code=e,
jiebar=mixseg))
library(tm)
doc <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(doc)
dim(dtm)
ft <- findFreqTerms(dtm, 10)
control.list <- list(wordLengths=c(2,Inf))
new.dtm <- removeSparseTerms(dtm, 0.99)
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
return(x)
}
dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)
dtm.count[1:10,1:10]
dim(dtm.count)
m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]
traintag <- apple.subset[idx==1,"category"]
testtag <- apple.subset[idx==2,"category"]
library(e1071)
model <- naiveBayes(trainset,as.factor(traintag) )
pred <- predict(model, testset)
table(testtag,pred)
正負評判斷
library(readxl)
yahoo_movie <- read_excel("C:/Users/USER/Downloads/yahoo_movie.xlsx")
head(yahoo_movie)
yahoo_movie <- yahoo_movie[(nchar(yahoo_movie$content) >=10)& (yahoo_movie$status %in% c('good', 'bad')),]
yahoo_movie$status <- as.factor(yahoo_movie$status )
library(jiebaR)
mixseg <- worker()
yahoo.seg =lapply(yahoo_movie$content, function(e)segment(code=e,jiebar=mixseg))
library(tm)
doc <- Corpus(VectorSource(yahoo.seg))
dtm <- DocumentTermMatrix(doc)
dim(dtm)
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
return(x)
}
dtm.count <- apply(dtm, MARGIN = 2, convert_counts)
dim(dtm.count)
m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]
traintag <- yahoo_movie[idx==1,]$status
testtag <- yahoo_movie[idx==2,]$status
traincontent <- yahoo_movie[idx==1,]$content
testcontent <- yahoo_movie[idx==2,]$content
library(e1071)
model <- naiveBayes(trainset,traintag )
pred <- predict(model, testset)
table(testtag,pred)
nequal <- which(pred != testtag)
#nequal[1:3]
#pred[1:3]
#testtag[1:3]
#testcontent[nequal[1:3]]
#traintag