Linear Regression

data(anscombe)
plot(y1 ~ x1, data = anscombe)

fit <- lm(y1 ~ x1, data = anscombe)
fit
## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001
summary(fit)
## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92127 -0.45577 -0.04136  0.70941  1.83882 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0001     1.1247   2.667  0.02573 * 
## x1            0.5001     0.1179   4.241  0.00217 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295 
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217
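Anscombe's quartet is the classic demonstration that near-identical summary statistics can hide very different data. A minimal sketch fitting each of the four standard pairs (y1 ~ x1 through y4 ~ x4) shows that all four regressions land on essentially the same coefficients:

# All four Anscombe pairs yield an intercept near 3.0 and a slope near 0.5
for (i in 1:4) {
  f <- as.formula(paste0('y', i, ' ~ x', i))
  print(coef(lm(f, data = anscombe)))
}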
# Regressing y1 on an unrelated predictor (x4) gives a much weaker fit
plot(y1 ~ x4, data = anscombe)

fit <- lm(y1 ~ x4, data = anscombe)
fit
## 
## Call:
## lm(formula = y1 ~ x4, data = anscombe)
## 
## Coefficients:
## (Intercept)           x4  
##     10.4177      -0.3241
summary(fit)
## 
## Call:
## lm(formula = y1 ~ x4, data = anscombe)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.005 -0.730  0.000  0.745  3.015 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.4177     1.6528   6.303  0.00014 ***
## x4           -0.3241     0.1733  -1.871  0.09422 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.817 on 9 degrees of freedom
## Multiple R-squared:  0.2799, Adjusted R-squared:  0.1999 
## F-statistic: 3.499 on 1 and 9 DF,  p-value: 0.09422
house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv')

head(house_prices)
##   Home  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1    1 114300 1790        2         2      2    No         East
## 2    2 114200 2030        4         2      3    No         East
## 3    3 114800 1740        3         2      1    No         East
## 4    4  94700 1980        3         2      3    No         East
## 5    5 119800 2130        3         3      3    No         East
## 6    6 114600 1780        3         2      2    No        North
fit <- lm(Price ~ SqFt, data = house_prices)

summary(fit)
## 
## Call:
## lm(formula = Price ~ SqFt, data = house_prices)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -46593 -16644  -1610  15124  54829 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10091.130  18966.104  -0.532    0.596    
## SqFt            70.226      9.426   7.450  1.3e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22480 on 126 degrees of freedom
## Multiple R-squared:  0.3058, Adjusted R-squared:  0.3003 
## F-statistic:  55.5 on 1 and 126 DF,  p-value: 1.302e-11
plot(Price ~ SqFt, data = house_prices)

# Anscombe set III: a single outlier pulls the regression line
fit <- lm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col = 'red')

predicted <- predict(fit, anscombe)

rmse <- sqrt(mean((predicted - anscombe$y3)^2))
rmse
## [1] 1.118286
actual <- anscombe$y3

mu  <- mean(anscombe$y3)
rse <- mean((predicted - actual)^2) / mean((mu - actual)^2)
Rsquare <- 1 - rse
Rsquare
## [1] 0.666324
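This is the usual coefficient of determination, R^2 = 1 - SS_res / SS_tot; the mean() calls cancel, so dividing means is the same as dividing sums. A small illustrative helper (r_squared is not a base R function, just a sketch):

# R^2 = 1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
r_squared <- function(actual, predicted) {
  1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
}
r_squared(anscombe$y3, predicted)  # should reproduce 0.666324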
# Anscombe set II: the relationship is curved, so a straight line underfits
fit2 <- lm(y2 ~ x1, data = anscombe)
plot(y2 ~ x1, data = anscombe)
abline(fit2, col = 'red')

predicted <- predict(fit2, anscombe)
sqrt(mean((predicted - anscombe$y2)^2))
## [1] 1.119102
# A quadratic term captures the curvature almost exactly
fit3 <- lm(y2 ~ poly(x1, 2), data = anscombe)
plot(y2 ~ x1, data = anscombe)
lines(anscombe$x1[order(anscombe$x1)], fit3$fitted.values[order(anscombe$x1)], col = 'red')

predicted <- predict(fit3, anscombe)
sqrt(mean((predicted - anscombe$y2)^2))
## [1] 0.001426299
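The near-zero RMSE reflects that Anscombe set II is an almost exact parabola. Because the linear model is nested in the quadratic one, an F-test makes the comparison formal:

# Does the quadratic term significantly improve on the straight line?
anova(fit2, fit3)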

Multiple Regression

house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv')

head(house_prices)
##   Home  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1    1 114300 1790        2         2      2    No         East
## 2    2 114200 2030        4         2      3    No         East
## 3    3 114800 1740        3         2      1    No         East
## 4    4  94700 1980        3         2      3    No         East
## 5    5 119800 2130        3         3      3    No         East
## 6    6 114600 1780        3         2      2    No        North
# Dummy-code the categorical variables
house_prices$brick_d <- ifelse(house_prices$Brick == "Yes", 1, 0)
house_prices$east    <- ifelse(house_prices$Neighborhood == "East", 1, 0)
house_prices$north   <- ifelse(house_prices$Neighborhood == "North", 1, 0)


# Split roughly 60/40 into training and validation sets
set.seed(123)
idx <- sample.int(2, nrow(house_prices), prob = c(0.6, 0.4), replace = TRUE)
training_data   <- house_prices[idx == 1, ]
validation_data <- house_prices[idx == 2, ]

lm.fit1 <- lm(Price ~ SqFt + Bathrooms + Bedrooms + Offers +
              north + east + brick_d, data = training_data)
summary(lm.fit1)
## 
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + 
##     east + brick_d, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25491.4  -5131.2   -490.5   6769.7  20369.0 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13252.14   12531.98   1.057   0.2939    
## SqFt            61.48       6.89   8.923 3.66e-13 ***
## Bathrooms     3266.08    2487.80   1.313   0.1935    
## Bedrooms      4823.22    2015.58   2.393   0.0194 *  
## Offers       -8331.03    1260.22  -6.611 6.35e-09 ***
## north       -17452.11    3911.56  -4.462 3.03e-05 ***
## east        -20680.68    3159.79  -6.545 8.35e-09 ***
## brick_d      19541.81    2412.24   8.101 1.19e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9146 on 70 degrees of freedom
## Multiple R-squared:  0.8922, Adjusted R-squared:  0.8814 
## F-statistic: 82.74 on 7 and 70 DF,  p-value: < 2.2e-16
model <- step(lm.fit1)
## Start:  AIC=1430.44
## Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east + 
##     brick_d
## 
##             Df  Sum of Sq        RSS    AIC
## - Bathrooms  1  144167994 5.9994e+09 1430.3
## <none>                    5.8552e+09 1430.4
## - Bedrooms   1  478980711 6.3342e+09 1434.6
## - north      1 1665108832 7.5203e+09 1448.0
## - east       1 3583097368 9.4383e+09 1465.7
## - Offers     1 3655512174 9.5107e+09 1466.3
## - brick_d    1 5489527535 1.1345e+10 1480.0
## - SqFt       1 6659891685 1.2515e+10 1487.7
## 
## Step:  AIC=1430.34
## Price ~ SqFt + Bedrooms + Offers + north + east + brick_d
## 
##            Df  Sum of Sq        RSS    AIC
## <none>                   5.9994e+09 1430.3
## - Bedrooms  1  692383542 6.6918e+09 1436.9
## - north     1 1714508808 7.7139e+09 1448.0
## - Offers    1 3533703830 9.5331e+09 1464.5
## - east      1 3667946556 9.6673e+09 1465.5
## - brick_d   1 6085356765 1.2085e+10 1483.0
## - SqFt      1 7460095606 1.3459e+10 1491.4
library(car)
## Loading required package: carData
vif(model)
##     SqFt Bedrooms   Offers    north     east  brick_d 
## 1.835850 1.842482 1.895115 3.032704 2.197699 1.158608
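All of these values sit well below the common rule-of-thumb cutoff of 5, so multicollinearity is not a concern for this model. A quick programmatic check along those lines:

v <- vif(model)
v[v > 5]  # empty here: no predictor exceeds the cutoff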
training_data$predict.price <- predict(lm.fit1)
training_data$error <- residuals(lm.fit1)

validation_data$predict.price <- predict(lm.fit1,newdata=validation_data)
validation_data$error <- validation_data$predict.price - validation_data$Price

hist(training_data$error)

hist(validation_data$error)

a<-cor(training_data$Price,training_data$predict.price)
b<-cor(validation_data$Price,validation_data$predict.price)
a*a
## [1] 0.8921676
b*b
## [1] 0.8097617
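R-squared dropping from 0.89 on the training data to 0.81 on the validation data indicates mild overfitting. The same comparison in RMSE terms, reusing the error columns computed above:

sqrt(mean(training_data$error^2))    # in-sample RMSE
sqrt(mean(validation_data$error^2))  # out-of-sample RMSE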

Scraping 591

library(jsonlite)
# 1423 listings at 30 per page ≈ 48 pages
1423 / 30
## [1] 47.43333
houseall <- data.frame()
for (i in seq(0, 1)) {  # fetch only the first two pages as a demo
  house_url <- paste0('https://sale.591.com.tw/home/search/list?type=2&&shType=list&kind=9&regionid=1&section=5&shape=2&firstRow=', i * 30, '&totalRows=1423')
  house <- fromJSON(house_url)
  houseall <- rbind(houseall, house$data$house_list[, c('room', 'floor', 'area', 'houseage', 'price')])
  print(i)
}
## [1] 0
## [1] 1
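The loop above fetches only the first two pages as a demo. Covering the full listing would take ceiling(1423 / 30) = 48 iterations; a sketch of the change (total_pages is an illustrative name):

total_pages <- ceiling(1423 / 30)  # 48
# Replace seq(0, 1) with seq(0, total_pages - 1) in the loop above, and
# consider adding Sys.sleep(1) inside the body to avoid hammering the server.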
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

houseall <- houseall[!is.na(houseall$floor), ]

# Split floor strings like "3F/7F" into building_floor and total_floor
houseall <- houseall %>% extract(floor, c("building_floor", "total_floor"), "(\\d+)F/(\\d+)F")

# Split room strings like "3房2廳2衛" into bedroom, living_room, bathroom counts
houseall <- houseall %>% extract(room, c("bedroom", "living_room", "bathroom"), "(\\d+)房(\\d+)廳(\\d+)衛")
str(houseall)
## 'data.frame':    60 obs. of  8 variables:
##  $ bedroom       : chr  "3" "1" "7" "2" ...
##  $ living_room   : chr  "2" "1" "6" "2" ...
##  $ bathroom      : chr  "2" "1" "6" "1" ...
##  $ building_floor: chr  "2" "3" "6" "5" ...
##  $ total_floor   : chr  "7" "7" "6" "6" ...
##  $ area          : num  49.2 10.5 52.4 34.6 13 ...
##  $ houseage      : int  40 0 32 8 8 34 6 41 41 49 ...
##  $ price         : int  4978 1418 4588 3588 1488 4968 3550 3480 3200 1988 ...
houseall$bedroom <- as.numeric(houseall$bedroom)
houseall$living_room <- as.numeric(houseall$living_room)
houseall$bathroom <- as.numeric(houseall$bathroom)
houseall$building_floor <- as.numeric(houseall$building_floor)
houseall$total_floor <- as.numeric(houseall$total_floor)
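The same conversion can be done in one pass; a sketch in base R (num_cols is just an illustrative name):

num_cols <- c('bedroom', 'living_room', 'bathroom', 'building_floor', 'total_floor')
houseall[num_cols] <- lapply(houseall[num_cols], as.numeric)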
#write.csv(houseall,file = '591housefull.csv')


houseall <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/591housefull.csv')

houseall$X <- NULL  # drop the row-index column added by write.csv


fit <- lm(price ~ ., data=houseall)
summary(fit)
## 
## Call:
## lm(formula = price ~ ., data = houseall)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6376.3  -507.4    48.0   472.0 10934.8 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      24.356    163.302   0.149 0.881458    
## bedroom          48.699     44.945   1.084 0.278773    
## living_room    -515.002     93.690  -5.497 4.62e-08 ***
## bathroom        337.373     72.810   4.634 3.94e-06 ***
## building_floor   20.069     10.051   1.997 0.046048 *  
## total_floor      25.624      7.654   3.348 0.000837 ***
## area            109.113      1.853  58.879  < 2e-16 ***
## houseage        -14.583      2.645  -5.515 4.19e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1224 on 1348 degrees of freedom
##   (67 observations deleted due to missingness)
## Multiple R-squared:  0.9023, Adjusted R-squared:  0.9018 
## F-statistic:  1779 on 7 and 1348 DF,  p-value: < 2.2e-16
step.fit <- step(fit)
## Start:  AIC=19289.38
## price ~ bedroom + living_room + bathroom + building_floor + total_floor + 
##     area + houseage
## 
##                  Df  Sum of Sq        RSS   AIC
## - bedroom         1    1758100 2020417631 19289
## <none>                         2018659531 19289
## - building_floor  1    5970943 2024630474 19291
## - total_floor     1   16785574 2035445105 19299
## - bathroom        1   32151953 2050811484 19309
## - living_room     1   45248293 2063907825 19317
## - houseage        1   45540861 2064200392 19318
## - area            1 5191536475 7210196006 21014
## 
## Step:  AIC=19288.56
## price ~ living_room + bathroom + building_floor + total_floor + 
##     area + houseage
## 
##                  Df  Sum of Sq        RSS   AIC
## <none>                         2020417631 19289
## - building_floor  1    5866539 2026284169 19291
## - total_floor     1   15952797 2036370428 19297
## - bathroom        1   37019141 2057436772 19311
## - living_room     1   43728396 2064146027 19316
## - houseage        1   43811411 2064229042 19316
## - area            1 5912245965 7932663596 21141
summary(step.fit)
## 
## Call:
## lm(formula = price ~ living_room + bathroom + building_floor + 
##     total_floor + area + houseage, data = houseall)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6540.6  -484.5    40.8   473.0 10926.4 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      47.782    161.875   0.295  0.76790    
## living_room    -487.894     90.294  -5.403 7.72e-08 ***
## bathroom        353.939     71.192   4.972 7.49e-07 ***
## building_floor   19.890     10.050   1.979  0.04800 *  
## total_floor      24.879      7.623   3.264  0.00113 ** 
## area            109.782      1.747  62.829  < 2e-16 ***
## houseage        -14.091      2.605  -5.409 7.51e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1224 on 1349 degrees of freedom
##   (67 observations deleted due to missingness)
## Multiple R-squared:  0.9022, Adjusted R-squared:  0.9018 
## F-statistic:  2075 on 6 and 1349 DF,  p-value: < 2.2e-16
predict(step.fit, head(houseall, 1))
##        1 
## 4836.917
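Comparing the prediction against the observed price of the same listing gives its residual:

houseall$price[1]  # observed price for the first listing
houseall$price[1] - predict(step.fit, head(houseall, 1))  # residual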

Word Segmentation

s <- 'I love to eat apple'
strsplit(s, split = ' ')
## [[1]]
## [1] "I"     "love"  "to"    "eat"   "apple"
s <- '我喜歡吃蘋果'


library(jiebaR)
## Loading required package: jiebaRD
mixseg <- worker()
segment(code = s, mixseg)
## [1] "我"   "喜歡" "吃"   "蘋果"
s <- '北海道日前發生規模6.7強震,各地傳出嚴重災情,《蘋果》採訪團驅車前往災情嚴重的厚真町,沿途有房屋因強震磚塊、玻璃散落一地,部分交通號誌仍因停電尚未復用,數台大型軍用車輛不斷往厚真町方向行駛,而也因土石沖刷嚴重,目前往厚真町的道路已立牌禁止通行。'

strsplit(s,split=',|《|》|。|、' )
## [[1]]
##  [1] "北海道日前發生規模6.7強震"           
##  [2] "各地傳出嚴重災情"                    
##  [3] ""                                    
##  [4] "蘋果"                                
##  [5] "採訪團驅車前往災情嚴重的厚真町"      
##  [6] "沿途有房屋因強震磚塊"                
##  [7] "玻璃散落一地"                        
##  [8] "部分交通號誌仍因停電尚未復用"        
##  [9] "數台大型軍用車輛不斷往厚真町方向行駛"
## [10] "而也因土石沖刷嚴重"                  
## [11] "目前往厚真町的道路已立牌禁止通行"

jiebaR

s <- '那酸民婉君也可以報名嗎?'
library(jiebaR)
#?worker
mixseg <- worker()
segment(s, mixseg)
## [1] "那"   "酸民" "婉君" "也"   "可以" "報名" "嗎"
#edit_dict()


tagseg <- worker('tag')
segment(s, tagseg)
##      r      n      n      d      c      v      y 
##   "那" "酸民" "婉君"   "也" "可以" "報名"   "嗎"
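Because the tagged result is a named vector whose names are the part-of-speech codes, ordinary subsetting filters by tag; for example, keeping only the nouns:

tags <- segment(s, tagseg)
tags[names(tags) == 'n']  # "酸民" "婉君"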
news <- '日本近日受到颱風、地震重創,我駐日代表處卻無因應措施,甚至有滯留當地的台灣民眾致電詢問能否提供住宿協助卻被冷言回應,讓駐日代表謝長廷的臉書湧入一片罵聲,質疑他毫無作為。謝長廷 7 日搭機到北海道協助駐札幌辦事處,受訪時強調若有責任自己定會承擔,面對指責也不覺得委屈,至於國民黨立委要求他下台,他則回應「政府叫我下台我就下台」。

綜合媒體報導,謝長廷 7 日搭機前往北海道協助駐札幌辦事處,針對外界的批評,謝長廷受訪時表示,有些人搞不清楚辦事處和代表處,「有的以為關西機場在我家隔壁,東京根本沒下雨,我還是照常在工作,你當然可以問我做什麼,我都可以說明。」

談到中國大使館派車接人一事,謝長廷解釋,所謂的巴士是關西機場安排的巴士,當時沒有任何外國巴士能進機場,中國大陸的巴士也是停在 11.6 公里外接駁,而其他國家像美國、南韓等也沒有派車去接人,也沒人在罵這些國家的大使。

至於國民黨立委要求他回台報告、甚至下台,謝長廷則說沒關係,立委要他回來就回來,「政府叫我下台我就下台」,這些都無所謂,對於外界批評也不覺得委屈,做政治沒有什麼委屈的,現在只想把事情處理好,剩下的之後再解釋。'

library(jiebaR)
mixseg <- worker()
seg.list <- segment(news, mixseg)
tb <- table(seg.list)


# Keep terms of at least two characters that appear at least twice
tb <- tb[(nchar(names(tb)) >= 2) & (tb >= 2)]
#tb
library(wordcloud2)
wordcloud2(tb, shape = 'star')

TF-IDF

a   <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D   <- list(a,abb,abc)

#tfidf('a', a, D)
tf  <- 1/1
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abb, D)
tf  <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abc, D)
tf  <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('b', abb, D)
tf  <- 2/3
idf <- log(3/2)
tf * idf
## [1] 0.2703101
#tfidf('b', abc, D)
tf  <- 1/3
idf <- log(3/2)
tf * idf
## [1] 0.135155
#tfidf('c', abc, D)
tf  <- 1/3
idf <- log(3/1)
tf * idf
## [1] 0.3662041
tfidf <- function(t, d, D){
  tf  <- sum(d == t) / length(d)                                # term frequency within d
  idf <- log(length(D) / sum(sapply(D, function(e) t %in% e)))  # log(N / document frequency)
  tf * idf
}

tfidf('a', a, D)
## [1] 0
tfidf('a', abb, D)
## [1] 0
tfidf('a', abc, D)
## [1] 0
tfidf('b', abb, D)
## [1] 0.2703101
tfidf('b', abc, D)
## [1] 0.135155
tfidf('c', abc, D)
## [1] 0.3662041
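With the function in hand, the whole toy corpus can be scored at once; a sketch building the full TF-IDF matrix (rows are documents, columns are vocabulary terms):

vocab <- c('a', 'b', 'c')
t(sapply(D, function(d) sapply(vocab, function(w) tfidf(w, d, D))))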

Document-Term Matrix

library(tm)
## Loading required package: NLP
e3 <- 'Hello, I am David. I have taken over 100 courses ~~~' 

e3.vec <- strsplit(e3,' ')

e3.corpus <-  Corpus(VectorSource(e3.vec))
e3.corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
e3.dtm    <- DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 courses david have hello over taken
##    1   1       1     1    1     1    1     1
e3.dtm    <- DocumentTermMatrix(e3.corpus, control=list(wordLengths=c(1,20)))
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 am c courses david have hello i over taken
##    1   1  1 1       1     1    1     1 2    1     1
doc <- tm_map(e3.corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(e3.corpus, removeNumbers): transformation
## drops documents
e3.dtm    <- DocumentTermMatrix(doc)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs courses david have hello over taken
##    1       1     1    1     1    1     1
e3 <- 'imaging image imagination imaging' 

e3.vec <- strsplit(e3,' ')

e3.corpus <-  Corpus(VectorSource(e3.vec))
e3.corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
doc <- tm_map(e3.corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(e3.corpus, removeNumbers): transformation
## drops documents
#?stemDocument
doc <- tm_map(doc,stemDocument)
## Warning in tm_map.SimpleCorpus(doc, stemDocument): transformation drops
## documents
e3.dtm    <- DocumentTermMatrix(doc)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 3)>>
## Non-/sparse entries: 3/0
## Sparsity           : 0%
## Maximal term length: 11
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs image imagination imaging
##    1     1           1       2
e1 <-  'this is a book'
e2 <-  'this is my car'

str.split  <- strsplit(c(e1,e2), split = ' ')
str.corpus <- Corpus(VectorSource(str.split))
dtm <- DocumentTermMatrix(str.corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity           : 33%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs book car this
##    1    1   0    1
##    2    0   1    1
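Note that "is", "a", and "my" have vanished: DocumentTermMatrix by default drops terms shorter than three characters (wordLengths = c(3, Inf)). Widening the bounds, as done earlier, keeps them:

inspect(DocumentTermMatrix(str.corpus, control = list(wordLengths = c(1, 20))))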

Chinese Term-Document Matrix

library(jiebaR)
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
s  <- '大巨蛋案對市府同仁下封口令?柯p否認'
s1 <- '柯p市府近來飽受大巨蛋爭議'

seg.list <- lapply(list(s, s1), function(e) segment(e, mixseg))
#seg.list
str.corpus <- Corpus(VectorSource(seg.list))
dtm <- DocumentTermMatrix(str.corpus )
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 8)>>
## Non-/sparse entries: 10/6
## Sparsity           : 38%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 大巨蛋 市府 同仁 爭議 近來 封口令 案對 飽受
##    1      1    1    1    0    0      1    1    0
##    2      1    1    0    1    1      0    0    1

Applying the Term-Document Matrix

load("C:/Users/USER/Downloads/applenews.RData")

head(applenews)



library(jiebaR)
mixseg    <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, mixseg))

apple.corpus <- Corpus(VectorSource(apple.seg))

# Weight the matrix by TF-IDF instead of raw term frequency
dtm <- DocumentTermMatrix(apple.corpus, control = list(weighting = function(x)
            weightTfIdf(x, normalize = FALSE)))
dim(dtm)

# Top 10 terms by total TF-IDF weight across the corpus
sort(apply(as.matrix(dtm), MARGIN = 2, FUN = sum), decreasing = TRUE)[1:10]

findFreqTerms(dtm, lowfreq = 300)


dtm <- DocumentTermMatrix(apple.corpus)
# Terms that co-occur with "肯亞" (Kenya) and "浩鼎" (OBI Pharma)
findAssocs(dtm, "肯亞", 0.5)
findAssocs(dtm, "浩鼎", 0.7)

dim(dtm)

# Drop terms absent from more than 99% of documents
dtm.remove <- removeSparseTerms(dtm, 0.99)
dim(dtm.remove)

Document Similarity

# Two documents represented as binary term vectors over a shared vocabulary:
# 大巨蛋 柯文哲 趙藤雄 麻煩 遠雄 牢獄 法庭 ....
# doc1:  1  1  1  1  0  0  0
# doc2:  1  1  1  1  1  1  1

a <- c(1, 1, 1, 1, 0, 0, 0)
b <- c(1, 1, 1, 1, 1, 1, 1)
dist(rbind(a, b))
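The default method of dist() is Euclidean; computing it by hand confirms the value:

sqrt(sum((a - b)^2))  # sqrt(3) ≈ 1.732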

# Two sentences as term-count vectors:
# "我喜歡看電視不喜歡看電影"     (I like watching TV, not movies)
# "我不喜歡看電視也不喜歡看電影" (I don't like watching TV or movies)
#
# Counts over the vocabulary 我 / 喜歡 / 看 / 電視 / 電影 / 不 / 也:
# doc1: 我 1, 喜歡 2, 看 2, 電視 1, 電影 1, 不 1, 也 0
# doc2: 我 1, 喜歡 2, 看 2, 電視 1, 電影 1, 不 2, 也 1

library(proxy)

a <- c(1, 2, 2, 1, 1, 1, 0)
b <- c(1, 2, 2, 1, 1, 2, 1)

proxy::dist(rbind(a, b), method = 'cosine')
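Cosine distance is 1 minus the cosine similarity of the two vectors; by hand:

1 - sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))  # ≈ 0.0618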


load("C:/Users/USER/Downloads/applenews.RData")


library(jiebaR)
library(tm)
mixseg    <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, mixseg))

apple.corpus <- Corpus(VectorSource(apple.seg))

dtm        <- DocumentTermMatrix(apple.corpus)
dtm.remove <- removeSparseTerms(dtm, 0.99)
dim(dtm)
dim(dtm.remove)
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')

dim(dtm.dist)

applenews$title[80]

dtm.mat <- as.matrix(dtm.dist)
order(dtm.mat[80,])[1:10]

applenews$title[order(dtm.mat[80,])[1:10]]


# Titles of articles whose cosine distance to article idx is below 0.8,
# sorted from most to least similar
article.query <- function(idx) {
  similar <- which(dtm.mat[idx, ] < 0.8)
  applenews$title[as.integer(names(sort(dtm.mat[idx, similar])))]
}
article.query(80)[1:10]

Document Clustering

# Hierarchical clustering on the cosine distances; cut the tree into 12 groups
hc <- hclust(dtm.dist, method = 'ward.D2')

plot(hc)
rect.hclust(hc, k = 12)


fit <- cutree(hc, k = 12)

table(fit)

applenews$title[fit ==1]



library(igraph)
# Link articles whose cosine distance is below 0.4, then detect
# communities with the walktrap algorithm
m  <- ifelse(dtm.mat < 0.4, 1, 0)
G  <- graph_from_adjacency_matrix(m)
wc <- cluster_walktrap(G)
modularity(wc)
table(membership(wc))

group <- membership(wc)
#group

applenews$title[group ==8]

Document Classification

# Keep only the finance (財經) and entertainment (娛樂) articles
apple.subset <- applenews[applenews$category %in% c('財經', '娛樂'), ]

library(jiebaR)

mixseg <- worker()
apple.seg <- lapply(apple.subset$content, function(e) segment(code = e, jiebar = mixseg))

library(tm)
doc <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(doc)
dim(dtm)

ft <- findFreqTerms(dtm, 10)
control.list <- list(wordLengths=c(2,Inf))

new.dtm <- removeSparseTerms(dtm, 0.99)

# Collapse raw counts to binary Yes/No factors (presence/absence features
# suit naive Bayes better than counts)
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}

dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)

dtm.count[1:10,1:10]

dim(dtm.count)


m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset  <- m[idx==2,]
traintag <- apple.subset[idx==1,"category"]
testtag  <- apple.subset[idx==2,"category"]


library(e1071)
model <- naiveBayes(trainset, as.factor(traintag))
pred <- predict(model, testset)

table(testtag, pred)
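The confusion matrix condenses to an overall accuracy figure:

tb <- table(testtag, pred)
sum(diag(tb)) / sum(tb)  # proportion of test articles classified correctly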

Positive/Negative Review Classification

library(readxl)
yahoo_movie <- read_excel("C:/Users/USER/Downloads/yahoo_movie.xlsx")
head(yahoo_movie)

# Keep reviews with at least 10 characters and a definite good/bad label
yahoo_movie <- yahoo_movie[(nchar(yahoo_movie$content) >= 10) & (yahoo_movie$status %in% c('good', 'bad')), ]

yahoo_movie$status <- as.factor(yahoo_movie$status)
library(jiebaR)

mixseg <- worker()
yahoo.seg <- lapply(yahoo_movie$content, function(e) segment(code = e, jiebar = mixseg))

library(tm)
doc <- Corpus(VectorSource(yahoo.seg))
dtm <- DocumentTermMatrix(doc)
dim(dtm)


# Same binary Yes/No conversion as in the previous section
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}

dtm.count <- apply(dtm, MARGIN = 2, convert_counts)


dim(dtm.count)


m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset  <- m[idx==2,]
traintag <- yahoo_movie[idx==1,]$status
testtag  <- yahoo_movie[idx==2,]$status

traincontent <- yahoo_movie[idx==1,]$content
testcontent  <- yahoo_movie[idx==2,]$content


library(e1071)

model <- naiveBayes(trainset, traintag)
pred <- predict(model, testset)

table(testtag, pred)

# Inspect a few misclassified reviews: predicted vs. actual label and text
nequal <- which(pred != testtag)
#pred[nequal[1:3]]
#testtag[nequal[1:3]]
#testcontent[nequal[1:3]]