FinXChance 2016 Summer Workshop: R codes

Jying-Nan Wang

2016-07-31

quantmod demo

library(quantmod)
# 基本抓取資料與繪圖
getSymbols("AAPL",from="2015-09-01",to="2015-10-31")
## [1] "AAPL"
head(AAPL)
##            AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2015-09-01    110.15    111.88   107.36     107.72    76845900
## 2015-09-02    110.23    112.34   109.13     112.34    61888800
## 2015-09-03    112.49    112.78   110.04     110.37    53233900
## 2015-09-04    108.97    110.45   108.51     109.27    49996300
## 2015-09-08    111.75    112.56   110.32     112.31    54843600
## 2015-09-09    113.76    114.02   109.77     110.15    85010800
##            AAPL.Adjusted
## 2015-09-01      106.0364
## 2015-09-02      110.5842
## 2015-09-03      108.6450
## 2015-09-04      107.5622
## 2015-09-08      110.5546
## 2015-09-09      108.4284
chartSeries(AAPL,theme="white")

# 下載台股資料,存到變數X 中
X <- getSymbols("2330.tw", auto.assign=FALSE, from="2015-10-01",to="2015-11-13")
chartSeries(X)

# 技術分析 TTR 套件
getSymbols("AAPL",from="2015-06-01",to="2015-10-31")
## [1] "AAPL"
candleChart(AAPL,multi.col=TRUE,theme="white")

addMACD()

addBBands()

ggplot2 Demo

Basic Components

library(ggplot2)
# example 1
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
p <- ggplot(mtcars, aes(wt, mpg))
p + geom_point()

# example 2
p <- ggplot(mtcars, aes(wt, mpg))
# 採用geom_point() 畫scatter plot,geom_smooth()畫迴歸線
p <- p + geom_point(shape=2)+ geom_smooth(lwd = 1, se = TRUE, method = "lm",color=rgb(0,0.5,0.5))
# 設定座標相關資訊
p <- p +xlab("車重") + ylab("油耗") + ggtitle("車重和油耗的關係") + expand_limits(x=c(1,6), y = c(8, 40)) 
# 設定中文(mac專用)
p + theme_bw(base_family = "STHeiti") 

dplyr Demo

library(dplyr)
# R 爬蟲,可先忽略
library(rvest)
sid <- "AAPL"
stockURL <- paste("http://finance.yahoo.com/q/ud?s=", sid, sep="") 
sdat = html(stockURL) %>% html_nodes(".yfnc_tabledata1") %>% html_text() 
sdat <- matrix(sdat,ncol=5,byrow = TRUE)
sdat <- data.frame(sdate=sdat[,1],sfirm=sdat[,2],sact=sdat[,3],from1=sdat[,4],to1=sdat[,5])

# 整理資料
library(dplyr)
# example 1: select
result <- sdat %>%
  select(sdate,sact)
#  arrange(desc(sdate))
head(result)
##       sdate      sact
## 1 28-Jul-16   Upgrade
## 2 27-Jul-16   Upgrade
## 3  6-Jul-16 Initiated
## 4  6-Apr-16 Initiated
## 5 30-Mar-16   Upgrade
## 6 12-Jan-16   Upgrade
# example 2: filter1
result <- sdat %>%
  filter(sact=="Upgrade")
head(result)
##       sdate          sfirm    sact          from1        to1
## 1 28-Jul-16 Hilliard Lyons Upgrade  Long-term Buy        Buy
## 2 27-Jul-16  Raymond James Upgrade    Mkt Perform Outperform
## 3 30-Mar-16          Cowen Upgrade Market Perform Outperform
## 4 12-Jan-16   BofA/Merrill Upgrade        Neutral        Buy
## 5 11-Jan-16         Mizuho Upgrade        Neutral        Buy
## 6 28-Oct-15  Pacific Crest Upgrade  Sector Weight Overweight
# example 3: filter2
result <- sdat %>%
  filter(sact=="Upgrade" & to1=="Outperform")
head(result)
##       sdate           sfirm    sact          from1        to1
## 1 27-Jul-16   Raymond James Upgrade    Mkt Perform Outperform
## 2 30-Mar-16           Cowen Upgrade Market Perform Outperform
## 3 25-Aug-15     Wells Fargo Upgrade Market Perform Outperform
## 4  9-Oct-14     Oppenheimer Upgrade        Perform Outperform
## 5 29-Oct-13 Robert W. Baird Upgrade        Neutral Outperform
## 6 23-Apr-09   Morgan Keegan Upgrade    Mkt Perform Outperform
# example 4: group_by 1
result <- sdat %>% 
  group_by(sact) %>%
  summarise(n=n())
head(result)
## Source: local data frame [3 x 2]
## 
##        sact     n
##      (fctr) (int)
## 1 Downgrade    62
## 2 Initiated    80
## 3   Upgrade    81
# example 5: 綜合
result <- sdat %>% 
  group_by(sfirm) %>%
  summarise(n=n()) %>%
  arrange(n) %>%
  filter(n>5)
head(result)
## Source: local data frame [6 x 2]
## 
##                 sfirm     n
##                (fctr) (int)
## 1        Bear Stearns     7
## 2          Prudential     7
## 3   Salomon Smth Brny     7
## 4                 UBS     7
## 5 Banc of America Sec     9
## 6        Needham & Co     9
# 合併資料
# inner_join(x, y): Include only rows in both x and y
# left_join(x, y): Include all of x, and matching rows of y

網路爬虫案例一:分析師資料

library(XML)
stockURL <- "http://finance.yahoo.com/q/ud?bypass=true&s=AAPL"
areport <- readHTMLTable(stockURL, header=F, which=3, stringsAsFactors=F)
# clean data based on the specifical web table structure
areport <- areport[12:(nrow(areport)-8),]
areport <- data.frame(areport[-1,])
library(rvest)
stockURL <- "http://finance.yahoo.com/q/ud?bypass=true&s=AAPL"
sdat <- html(stockURL) %>% 
  html_nodes(".yfnc_tabledata1") %>%
  html_text()

# 整理資料
sdat <- matrix(sdat,ncol=5,byrow = TRUE)
sdat <- data.frame(sdate=sdat[,1],sfirm=sdat[,2],sact=sdat[,3],from1=sdat[,4],to1=sdat[,5])
head(sdat)
##       sdate          sfirm      sact          from1        to1
## 1 28-Jul-16 Hilliard Lyons   Upgrade  Long-term Buy        Buy
## 2 27-Jul-16  Raymond James   Upgrade    Mkt Perform Outperform
## 3  6-Jul-16        Longbow Initiated                       Buy
## 4  6-Apr-16        Needham Initiated                Strong Buy
## 5 30-Mar-16          Cowen   Upgrade Market Perform Outperform
## 6 12-Jan-16   BofA/Merrill   Upgrade        Neutral        Buy

網路爬虫案例二:中時電子報

library(rvest)
news_url="http://www.chinatimes.com/money/realtimenews?page=1"
   mytitle <- html(news_url) %>% 
     html_nodes(".vertical-list h2") %>% 
     html_text()
   head(mytitle)
## [1] "日擴大寬鬆不如預期 台股下週震盪"                   
## [2] "在愛爾蘭繳稅1% 經濟學家:蘋果渉詐欺"              
## [3] "福邦投顧看下周指數8,550點到9,200點區間震盪"        
## [4] "行政院前院長陳冲:「房屋稅違法違憲,應予以廢止!」"
## [5] "台股下周指數 估8750點到9100點"                     
## [6] "買低價宅賺小賠大?破解5種黑心建案"
library(rvest)
news_url="http://www.chinatimes.com/money/realtimenews?page=1"

my_news =
  data.frame(
    title = html(news_url) %>% 
      html_nodes(".vertical-list h2") %>% 
      html_text(), 
    title_href = html(news_url) %>% 
      html_nodes(".vertical-list h2") %>%  
      html_nodes("a") %>% 
      html_attr( "href") ,
    stringsAsFactors=FALSE)  
head(my_news)
##                                                title
## 1                    日擴大寬鬆不如預期 台股下週震盪
## 2               在愛爾蘭繳稅1% 經濟學家:蘋果渉詐欺
## 3         福邦投顧看下周指數8,550點到9,200點區間震盪
## 4 行政院前院長陳冲:「房屋稅違法違憲,應予以廢止!」
## 5                      台股下周指數 估8750點到9100點
## 6                  買低價宅賺小賠大?破解5種黑心建案
##                                                     title_href
## 1 http://www.chinatimes.com/realtimenews/20160730003388-260410
## 2 http://www.chinatimes.com/realtimenews/20160730003248-260410
## 3 http://www.chinatimes.com/realtimenews/20160730003145-260410
## 4 http://www.chinatimes.com/realtimenews/20160730002891-260410
## 5 http://www.chinatimes.com/realtimenews/20160730002455-260410
## 6 http://www.chinatimes.com/realtimenews/20160730001184-260410
  # 已經取得my_news資料
  i <- 1
  doc_url <- my_news[i,2]
  doc_temp <- html(doc_url) %>% html_nodes(".clear-fix")%>% html_nodes("p") %>% html_text()
  # 清理資料
  lastnum <- length(doc_temp)-7
  x<-paste(doc_temp[1:lastnum], collapse = '', sep="")
  x
## [1] "日擴大寬鬆規模不如預期攪局,台股重跌92點收在8,984點,再痛失9千大關,外資終結連15買轉為賣超13.87億元,周線收黑下跌28點,終結周線連2紅,但7月大漲317點,漲幅3.67%,雙創近4周最大漲點及漲幅,月線連3紅,市值增加16.4座台北101大樓,但留意指數下周震盪將加大!國泰證期顧問處經理蔡明翰指出,美經濟數據佳,Fed維持利率不變,台灣6月景氣燈號連3月黃藍燈,第2季GDP為0.69%成功擺脫連3季負成長,美台景氣狀況佳,但市場關注日央在利率會議動作,市場期待更大擴大寬鬆政策出爐,雖安倍晉三已公布28兆日圓刺激方案,但市場仍期待更多,導致日圓對美元大幅波動,隱含波動率創08年來新高,但日央擴大寬鬆不如預期,台股跟進回檔,外資終結連15買轉為賣超13.87億元,但價跌量縮,價量結構仍健康。蔡明翰認為,日央決議擴大購買ETF至6兆日圓,美元貸款擴大一倍至240億美元,但維持利率不變,導致原先預期降息市場出現失望賣壓,但8月2日將表決首相提出28兆刺激方案,加上後續將續增加寬鬆政策力道,另外FED近期言論雖偏向鷹派,但市場對升息預期未明顯拉升,全球資金維持充裕態勢維持不變。"
  # 已經取得my_news資料
  i <- 1
  doc_url <- my_news[i,2]
  doc_time <- html(doc_url) %>% html_nodes(".reporter") %>% html_nodes("time") %>% html_text()
  date_value <- paste(substr(doc_time,1,4),substr(doc_time,6,7),substr(doc_time,9,10),sep="")
  date_value <- paste(date_value, substr(doc_time,13,14), substr(doc_time,16,17),sep="")
  filename <- paste("chinatimes/", date_value, "_", my_news[i,1], sep="")
  filename
## [1] "chinatimes/201607301606_日擴大寬鬆不如預期 台股下週震盪"
  doc_temp <- html(doc_url) %>% html_nodes(".clear-fix")%>% html_nodes("p") %>% html_text()

  lastnum <- length(doc_temp)-7
  x<-paste(doc_temp[1:lastnum], collapse = '', sep="")
  x
## [1] "日擴大寬鬆規模不如預期攪局,台股重跌92點收在8,984點,再痛失9千大關,外資終結連15買轉為賣超13.87億元,周線收黑下跌28點,終結周線連2紅,但7月大漲317點,漲幅3.67%,雙創近4周最大漲點及漲幅,月線連3紅,市值增加16.4座台北101大樓,但留意指數下周震盪將加大!國泰證期顧問處經理蔡明翰指出,美經濟數據佳,Fed維持利率不變,台灣6月景氣燈號連3月黃藍燈,第2季GDP為0.69%成功擺脫連3季負成長,美台景氣狀況佳,但市場關注日央在利率會議動作,市場期待更大擴大寬鬆政策出爐,雖安倍晉三已公布28兆日圓刺激方案,但市場仍期待更多,導致日圓對美元大幅波動,隱含波動率創08年來新高,但日央擴大寬鬆不如預期,台股跟進回檔,外資終結連15買轉為賣超13.87億元,但價跌量縮,價量結構仍健康。蔡明翰認為,日央決議擴大購買ETF至6兆日圓,美元貸款擴大一倍至240億美元,但維持利率不變,導致原先預期降息市場出現失望賣壓,但8月2日將表決首相提出28兆刺激方案,加上後續將續增加寬鬆政策力道,另外FED近期言論雖偏向鷹派,但市場對升息預期未明顯拉升,全球資金維持充裕態勢維持不變。"
  # write.table(x[1], paste(filename, ".txt",sep="") , quote = FALSE, row.names=FALSE, col.names = FALSE)
  # Sys.sleep(runif(1,0,5))

網路爬虫練習

文字探勘觀念

# 5個文件,6個字詞
d1 <- c(3,0,10,2,1,0)
d2 <- c(0,3,0,4,2,0)
d3 <- c(3,0,0,0,2,5)
d4 <- c(0,2,5,2,0,0)
d5 <- c(0,1,7,3,1,0)

#計算文件的相關性
d1%*%d2/sqrt((sum(d1^2)*sum(d2^2)))
##           [,1]
## [1,] 0.1739196
d4%*%d5/sqrt((sum(d4^2)*sum(d5^2)))
##           [,1]
## [1,] 0.9663531
d3%*%d4/sqrt((sum(d3^2)*sum(d4^2)))
##      [,1]
## [1,]    0
# 計算TF-IDF值
DTM <- matrix(c(d1,d2,d3,d4,d5),nrow=5,byrow=TRUE)
tf <- DTM/matrix(rep(apply(DTM,1,sum),each=6,len=6*5),nrow=5,byrow=TRUE)
DTM1 <- DTM
DTM1[DTM1>0] <- 1
idf <- log(nrow(DTM1)/apply(DTM1,2,sum),10)
idf <- matrix(rep(idf,nrow(DTM)),nrow=nrow(DTM),byrow = TRUE)
tf*idf
##            [,1]       [,2]      [,3]       [,4]        [,5]     [,6]
## [1,] 0.07461375 0.00000000 0.1386555 0.01211375 0.006056876 0.000000
## [2,] 0.00000000 0.07394958 0.0000000 0.04307112 0.021535558 0.000000
## [3,] 0.11938200 0.00000000 0.0000000 0.00000000 0.019382003 0.349485
## [4,] 0.00000000 0.04929972 0.1232493 0.02153556 0.000000000 0.000000
## [5,] 0.00000000 0.01848740 0.1294118 0.02422750 0.008075834 0.000000
d1 <- c(3,0,10,2,1,0)
d2 <- c(0,3,0,4,2,0)
d3 <- c(3,0,0,0,2,5)
d4 <- c(0,2,5,2,0,0)
d5 <- c(0,1,7,3,1,0)

DTM <- matrix(c(d1,d2,d3,d4,d5),nrow=5,byrow=TRUE)
TDM <- t(DTM)
TDM.dataframe <- data.frame(TDM)
colnames(TDM.dataframe) <- c("d1", "d2", "d3", "d4", "d5")
rownames(TDM.dataframe) <- c("t1","t2", "t3", "t4", "t5", "t6")

# 進行SVD,使用svd() function
svd.tdm <- svd(TDM)
svd.T <- svd.tdm$u
svd.D <- svd.tdm$v
s <- svd.tdm$d

# 查看分解後的結果
# 字詞概念矩陣
svd.T
##             [,1]        [,2]          [,3]       [,4]        [,5]
## [1,] -0.16861617  0.50316716  0.1928911910  0.6275839  0.49917590
## [2,] -0.11946249 -0.02723604 -0.5832560373 -0.2750752  0.68336607
## [3,] -0.92101036 -0.13654116  0.3010235221 -0.1739794 -0.02937896
## [4,] -0.30828606 -0.03420053 -0.6695776455  0.1882990 -0.27691654
## [5,] -0.11643062  0.32806545 -0.2892334076  0.3558817 -0.43474250
## [6,] -0.02099592  0.78653729  0.0001876407 -0.5814594 -0.13148120
# 奇異值矩陣
diag(s)
##          [,1]     [,2]     [,3]     [,4]    [,5]
## [1,] 14.17454 0.000000 0.000000 0.000000 0.00000
## [2,]  0.00000 6.226305 0.000000 0.000000 0.00000
## [3,]  0.00000 0.000000 5.523532 0.000000 0.00000
## [4,]  0.00000 0.000000 0.000000 1.640008 0.00000
## [5,]  0.00000 0.000000 0.000000 0.000000 1.05671
# 文件概念矩陣
svd.D
##             [,1]        [,2]          [,3]       [,4]        [,5]
## [1,] -0.73716383  0.06484652  0.3549395873  0.5338006  0.20361550
## [2,] -0.12870919  0.07028577 -0.9064029496  0.3900796  0.06903223
## [3,] -0.05952148  0.97944424  0.0002072879 -0.1907196 -0.02778749
## [4,] -0.38523653 -0.12938314 -0.1811431151 -0.6362464  0.63026224
## [5,] -0.53672474 -0.12167092 -0.1401381862 -0.3488716 -0.74550161
# 測試1: k=5,計算得到原本的TDM
k <- 5
svd.T[,1:k] %*% diag(s[1:k]) %*% t(svd.D[,1:k])
##               [,1]          [,2]          [,3]          [,4]          [,5]
## [1,]  3.000000e+00 -1.387779e-17  3.000000e+00 -1.498801e-15 -1.443290e-15
## [2,] -5.551115e-17  3.000000e+00  1.731254e-15  2.000000e+00  1.000000e+00
## [3,]  1.000000e+01  9.627715e-16 -2.042095e-15  5.000000e+00  7.000000e+00
## [4,]  2.000000e+00  4.000000e+00  4.250073e-16  2.000000e+00  3.000000e+00
## [5,]  1.000000e+00  2.000000e+00  2.000000e+00 -7.216450e-16  1.000000e+00
## [6,] -2.775558e-16  1.018283e-15  5.000000e+00  0.000000e+00 -4.024558e-16
# 測試2: k=3,計算得到近似的TDM
k <- 3
svd.T[,1:k] %*% diag(s[1:k]) %*% t(svd.D[,1:k])
##             [,1]       [,2]        [,3]       [,4]       [,5]
## [1,]  2.34318571 -0.4379000  3.21095426  0.3223988  0.7523138
## [2,]  0.09377636  3.1261254 -0.06597262  1.2578483  1.3809564
## [3,] 10.15862931  0.1134436 -0.05528025  4.8380278  6.8773132
## [4,]  1.89473807  3.8997390  0.05076531  2.3809081  2.8895867
## [5,]  0.78198816  1.8040436  2.09854783  0.6608849  0.8611376
## [6,]  0.53732105  0.3815704  4.81426938 -0.5191564 -0.4362614
# 將文件1的維度從5降到2
k <- 2
TDM[,1]
## [1]  3  0 10  2  1  0
solve(diag(s[1:k]))%*%t(svd.T[,1:k]) %*% TDM[,1]
##             [,1]
## [1,] -0.73716383
## [2,]  0.06484652
# 將TDM的維度降低
solve(diag(s[1:k]))%*%t(svd.T[,1:k]) %*% TDM
##             [,1]        [,2]        [,3]       [,4]       [,5]
## [1,] -0.73716383 -0.12870919 -0.05952148 -0.3852365 -0.5367247
## [2,]  0.06484652  0.07028577  0.97944424 -0.1293831 -0.1216709

文字探勘相關套件安裝 (win 10)

文字探勘實作 (1): 中文分詞 (Rwordseg)

library(rJava)
library(Rwordseg) # 分詞套件

temp <- "經濟部預期,3月整體外銷訂單金額將較2月增加,但因去年同期基期仍高,接單恐持續衰退。"
segmentCN(temp)
##  [1] "經濟部" "預期"   "3月"    "整體"   "外銷"   "訂單"   "金額"  
##  [8] "將"     "較"     "2月"    "增加"   "但"     "因"     "去年"  
## [15] "同期"   "基期"   "仍"     "高"     "接"     "單"     "恐"    
## [22] "持續"   "衰退"
# 自行加入字词,必须使用简体字
insertWords("接单")   
segmentCN(temp)
##  [1] "經濟部" "預期"   "3月"    "整體"   "外銷"   "訂單"   "金額"  
##  [8] "將"     "較"     "2月"    "增加"   "但"     "因"     "去年"  
## [15] "同期"   "基期"   "仍"     "高"     "接單"   "恐"     "持續"  
## [22] "衰退"
# insertWords 加入的詞只暫存在記憶體中
# insertWords("接单", save=TRUE)  
# /Library/Frameworks/R.framework/Versions/3.2/Resources/library/Rwordseg/config/userdic

# deleteWords("接单", save=TRUE)  # 將加入的新字詞刪除
segmentCN(temp, nature = TRUE)
##          n         vn          m          n          v          n 
##   "經濟部"     "預期"      "3月"     "整體"     "外銷"     "訂單" 
##          n          d          d          m          v          c 
##     "金額"       "將"       "較"      "2月"     "增加"       "但" 
##          p          t          f          n          d          a 
##       "因"     "去年"     "同期"     "基期"       "仍"       "高" 
## userDefine          d         vd          v 
##     "接單"       "恐"     "持續"     "衰退"
listDict()  # 查看目前本機中安裝那些詞典  
## [1] Name Type Des  Path
## <0 rows> (or 0-length row.names)
# 安裝下載的詞典,搜狗輸入法網站詞庫 (http://pinyin.sogou.com/dict/)
installDict("finterms.scel","finance_dict") 
## 11319 words were loaded! ... New dictionary 'finance_dict' was installed!
# 若不使用可以將其移除
uninstallDict() 
## 11319 words were removed! ... The dictionary 'finance_dict' was uninstalled!

文字探勘實作 (2): 中文分詞 (jiebaR)

# You can use worker() to initialize a worker, and then use <= or segment() to do the segmentation.

library(jiebaR)
temp <- "經濟部預期,3月整體外銷訂單金額將較2月增加,但因去年同期基期仍高,接單恐持續衰退。"

# use worker() to initialize a worker
cutter <- worker()
# 寫法1
cutter[temp]
##  [1] "經濟部"   "預期"     "3"        "月"       "整體"     "外銷"    
##  [7] "訂單"     "金額"     "將較"     "2"        "月"       "增加"    
## [13] "但"       "因"       "去年同期" "基期"     "仍高"     "接單"    
## [19] "恐"       "持續"     "衰退"
# 寫法2
cutter <= temp
##  [1] "經濟部"   "預期"     "3"        "月"       "整體"     "外銷"    
##  [7] "訂單"     "金額"     "將較"     "2"        "月"       "增加"    
## [13] "但"       "因"       "去年同期" "基期"     "仍高"     "接單"    
## [19] "恐"       "持續"     "衰退"
# 寫法3
segment(temp,cutter)
##  [1] "經濟部"   "預期"     "3"        "月"       "整體"     "外銷"    
##  [7] "訂單"     "金額"     "將較"     "2"        "月"       "增加"    
## [13] "但"       "因"       "去年同期" "基期"     "仍高"     "接單"    
## [19] "恐"       "持續"     "衰退"
# 查看cutter的參數設置
cutter
## Worker Type:  Jieba Segment
## 
## Default Method  :  mix
## Detect Encoding :  TRUE
## Default Encoding:  UTF-8
## Keep Symbols    :  FALSE
## Output Path     :  
## Write File      :  TRUE
## By Lines        :  FALSE
## Max Word Length :  20
## Max Read Lines  :  1e+05
## 
## Fixed Model Components:  
## 
## $dict
## [1] "/Library/Frameworks/R.framework/Versions/3.2/Resources/library/jiebaRD/dict/jieba.dict.utf8"
## 
## $user
## [1] "/Library/Frameworks/R.framework/Versions/3.2/Resources/library/jiebaRD/dict/user.dict.utf8"
## 
## $hmm
## [1] "/Library/Frameworks/R.framework/Versions/3.2/Resources/library/jiebaRD/dict/hmm_model.utf8"
## 
## $stop_word
## NULL
## 
## $timestamp
## [1] 1469866921
## 
## $default $detect $encoding $symbol $output $write $lines $bylines can be reset.
# 增加使用者字詞
cutter1 <- worker()
new_user_word(cutter1, "2月",tags="jn1")
## [1] TRUE
cutter1[temp]
##  [1] "經濟部"   "預期"     "3"        "月"       "整體"     "外銷"    
##  [7] "訂單"     "金額"     "將較"     "2月"      "增加"     "但"      
## [13] "因"       "去年同期" "基期"     "仍高"     "接單"     "恐"      
## [19] "持續"     "衰退"
# worker的參數設定 (https://qinwenfeng.com/jiebaR/worker.html)
cutter2 <- worker(type = "mix", dict = DICTPATH, hmm = HMMPATH, user = USERPATH,
                  idf = IDFPATH, stop_word = STOPPATH, write = T, qmax = 20, topn = 5,
                  encoding = "UTF-8", detect = T, symbol = F, lines = 1e+05,
                  output = NULL, bylines = F)

文字探勘實作 (3): 語料庫Corpus (using Rwordseg)

library(rJava)
library(Rwordseg) # 分詞套件
library(tm) #text mining 套件
library(tmcn)  # A Text mining toolkit for international characters

NLPdat <-readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1")      # mac
# NLPdat <- toUTF8(readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1"))    # windows
#segment.options(isNameRecognition=TRUE)
data.segCN <- segmentCN(NLPdat)
data.corpus <- Corpus(VectorSource(data.segCN))
# inspect(data.corpus)
data.corpus[[1]]$content
##   [1] "工商"    "時報"    "涂"      "志"      "豪"      "台"      "積"     
##   [8] "電"      "2330"    "股價"    "從"      "元月"    "中旬"    "以來"   
##  [15] "表現"    "銳"      "不可"    "擋"      "一路"    "從"      "130"    
##  [22] "5元"     "拉"      "到"      "上周"    "盤"      "中"      "高"     
##  [29] "點"      "的"      "163元"   "累計"    "外資"    "在"      "這"     
##  [36] "段"      "時間"    "狂"      "買"      "50"      "3萬張"   "累計"   
##  [43] "持"      "股"      "比重"    "飆"      "上"      "78"      "94"     
##  [50] "而"      "投"      "信"      "和"      "自營"    "商"      "則"     
##  [57] "沒"      "出"      "手"      "完全"    "看"      "外資"    "唱"     
##  [64] "獨腳戲"  "受到"    "南"      "台灣"    "強震"    "影響"    "台"     
##  [71] "積"      "電"      "二月"    "合併"    "營"      "收"      "月"     
##  [78] "減"      "16"      "0"       "達"      "595"     "51億元"  "累計"   
##  [85] "今年"    "前"      "二個"    "月"      "營"      "收"      "達"     
##  [92] "1"       "304"     "07億元"  "較"      "去年"    "同期"    "減少"   
##  [99] "12"      "9"       "不過"    "台"      "積"      "電"      "竭盡"   
## [106] "所"      "能"      "補足"    "所有"    "受"      "影響"    "的"     
## [113] "晶"      "圓"      "預期"    "不少"    "的"      "產"      "能"     
## [120] "其"      "出"      "貨"      "時間"    "延"      "後"      "至"     
## [127] "第二季"  "不過"    "受惠"    "於"      "半導體"  "生產"    "鏈"     
## [134] "庫存"    "在"      "去年底"  "已"      "完成"    "去"      "化"     
## [141] "隨"      "著"      "上游"    "客戶"    "陸續"    "提高"    "晶"     
## [148] "圓"      "代"      "工"      "投"      "片"      "量"      "加上"   
## [155] "新"      "台幣"    "兌"      "美元"    "匯率"    "走"      "貶"     
## [162] "台"      "積"      "電"      "將"      "第一季"  "的"      "匯率"   
## [169] "預估"    "更新"    "為"      "33"      "18元"    "並"      "將"     
## [176] "業績"    "估"      "調升"    "至"      "2"       "010"     "2"      
## [183] "030億元" "之間"    "但"      "毛利率"  "則"      "因"      "地震"   
## [190] "影響"    "下"      "修"      "至"      "44"      "46"      "法人"   
## [197] "預期"    "台"      "積"      "電"      "第一季"  "營運"    "將"     
## [204] "順利"    "達標"    "由"      "於"      "台"      "積"      "電"     
## [211] "Fab"     "14"      "A"       "以及"    "Fab"     "6"       "的"     
## [218] "部"      "份"      "晶"      "圓"      "出"      "貨"      "延"     
## [225] "後"      "到"      "第二季"  "加上"    "客戶"    "搶"      "產能"   
## [232] "的"      "動作"    "仍"      "持續"    "到"      "第二季"  "以及"   
## [239] "台"      "積"      "電"      "開始"    "為"      "蘋果"    "量"     
## [246] "產"      "新"      "一代"    "A"       "10"      "應用"    "處理器" 
## [253] "會"      "在"      "第二季"  "底"      "開始"    "大量出"  "貨"     
## [260] "因此"    "法人"    "看好"    "台"      "積"      "電"      "下"     
## [267] "季"      "營"      "收"      "將"      "見到"    "超過"    "一成"   
## [274] "的"      "成長"    "幅度"    "單"      "季"      "營"      "收"     
## [281] "應"      "可"      "順利"    "改寫"    "歷史"    "新"      "高"     
## [288] "且"      "毛利率"  "也"      "將"      "回升"    "到"      "47"     
## [295] "49"      "的"      "正常"    "水準"
# 清除標點符號, 數字
data.corpus <- tm_map(data.corpus, removePunctuation)
data.corpus <- tm_map(data.corpus, removeNumbers)
data.DTM <- DocumentTermMatrix(data.corpus,control=list(wordLengths = c(2, Inf)))
# data.TDM <- TermDocumentMatrix(data.corpus,control=list(wordLengths = c(2, Inf)))
# myterms <- c("半導體","訂單", "蘋果")
# data.DTM1 <- DocumentTermMatrix(data.corpus,control=list(wordLengths = c(2, Inf), dictionary = myterms))
# inspect(data.DTM1)

inspect(data.DTM[1:5, 120:125])
## <<DocumentTermMatrix (documents: 5, terms: 6)>>
## Non-/sparse entries: 0/30
## Sparsity           : 100%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 出讓 出色 出售 出現 初步 除非
##    1    0    0    0    0    0    0
##    2    0    0    0    0    0    0
##    3    0    0    0    0    0    0
##    4    0    0    0    0    0    0
##    5    0    0    0    0    0    0
findFreqTerms(data.DTM, lowfreq = 20)  # 找出較常出現的字詞
##  [1] "半導體" "表現"   "產業"   "成長"   "第季"   "訂單"   "法人"  
##  [8] "股價"   "合併"   "今年"   "今天"   "客戶"   "庫存"   "蘋果"  
## [15] "去年"   "設備"   "申請"   "市場"   "外資"   "先進"   "億美元"
## [22] "億元"   "營運"   "影響"   "預估"   "預期"   "支出"   "資本"
findAssocs(data.DTM, "成長", 0.4)    # 找出與某詞相關系數為0.4的字詞
## $成長
##   競爭力     電腦     中科     經濟     震撼     問題     主要     幅度 
##     0.70     0.67     0.67     0.61     0.60     0.58     0.57     0.54 
##     廠商     方面     設計     長期     百年     補給     產出     大旱 
##     0.53     0.53     0.53     0.53     0.52     0.52     0.52     0.52 
##     大戶     電子     複合     個人     工業     規模     會上     節水 
##     0.52     0.52     0.52     0.52     0.52     0.52     0.52     0.52 
##     擴大     歷年     水車     提供     先前     效率     應付     用水 
##     0.52     0.52     0.52     0.52     0.52     0.52     0.52     0.52 
##     雲端     遭遇     重疊     綜合     今年     氣氛     仍然     重視 
##     0.52     0.52     0.52     0.52     0.51     0.47     0.47     0.47 
##     昨天     年底     首季     一個     此外     智慧     安心     暗示 
##     0.47     0.44     0.44     0.44     0.43     0.43     0.42     0.42 
##     表明   不景氣     出來     出讓     創辦     創意     帶來     當中 
##     0.42     0.42     0.42     0.42     0.42     0.42     0.42     0.42 
##     刀口     的確     等待     第度     第一     方向     飛來     更加 
##     0.42     0.42     0.42     0.42     0.42     0.42     0.42     0.42 
##     功力     關係     好好   基本功     集團     季報     加快     價格 
##     0.42     0.42     0.42     0.42     0.42     0.42     0.42     0.42 
##     建廠   結構性     節省     就算     砍價     科技     困難     理想 
##     0.42     0.42     0.42     0.42     0.42     0.42     0.42     0.42 
##     兩成     面臨     平板     確立     上回     升值     示範     事實 
##     0.42     0.42     0.42     0.42     0.42     0.42     0.42     0.42 
##     說法     無異     下來     現身     效能     新書     宣告 養精蓄銳 
##     0.42     0.42     0.42     0.42     0.42     0.42     0.42     0.42 
##     要求   一口氣     一切     迎接     預告     這樣     只能     做好 
##     0.42     0.42     0.42     0.42     0.42     0.42     0.42     0.42 
##       pc     市況 
##     0.42     0.41
inspect(removeSparseTerms(data.DTM, 0.5))      # 處理稀疏矩陣
## <<DocumentTermMatrix (documents: 31, terms: 9)>>
## Non-/sparse entries: 166/113
## Sparsity           : 41%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 半導體 成長 第季 股價 今年 今天 客戶 營運 us
##   1       1    1    0    1    1    0    2    1  0
##   2       0    2    2    1    1    0    0    0  0
##   3       0    0    6    6    0    1    1    3  0
##   4       1    1    1    2    1    1    0    1  1
##   5       3    0    1    1    0    0    0    0  0
##   6       1    3    2    2    4    1    1    0  1
##   7       0    0    4    0    2    3    2    0  1
##   8       1    2    0    0    4    1    0    1  1
##   9       1    0    4    5    2    1    0    1  1
##   10      0    0    0    0    5    1    0    0  0
##   11      0    3    2    4    2    8    1    1  0
##   12      1    2   13    0    0    0    3    0  0
##   13      0    0    7    2    0    1    0    1  0
##   14      1    0    2    2    2    1    2    1  1
##   15      0    1    0    0    1    1    1    0  0
##   16      0    0    0    0    1    0    0    0  1
##   17      0    0    0    1    1    4    1    0  1
##   18      3    6    0    1    4    0    0    2  0
##   19      2    6    7    0    5    0    2    1  1
##   20      0    1    1    2    0    1    2    0  1
##   21      4    2    0    1    2    1    0    3  0
##   22      0    0    0    0    0    1    0    1  0
##   23      5    2    0    1    0    2    1    0  1
##   24      1    4    0    0    0    0    0    0  1
##   25      1    7    2    0    6    1    1    3  1
##   26      4    2    1    2    7    0    0    4  1
##   27      0    3    1    3    5    0    2    1  1
##   28      3    1    6    0    5    0    2    3  0
##   29      4    1    2    1    0    3    0    0  2
##   30      6    1    7    0    2    0    2    2  0
##   31      0    0    0    3    1    2    0    0  0
freq <- sort(colSums(as.matrix(data.DTM)), decreasing=TRUE)   
wf <- data.frame(word=names(freq), freq=freq)   
head(wf)
##          word freq
## 第季     第季   71
## 今年     今年   64
## 成長     成長   51
## 半導體 半導體   43
## 股價     股價   41
## 今天     今天   35
# 畫圖
library(ggplot2)   
p <- ggplot(subset(wf, freq>25), aes(word, freq))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))   
p + theme_gray(base_family = "STHeiti")

文字探勘實作 (4): 語料庫Corpus (using jiebaR)

library(jiebaR) # 分詞套件
library(tm) #text mining 套件
library(tmcn)  # A Text mining toolkit for international characters

NLPdat <-readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1")      # mac
# NLPdat <- toUTF8(readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1"))    # windows
# 結巴分詞
cutter <- worker(bylines=T)
#new_user_word(cutter, "台積電",tags="jn1")
article_words <- lapply(NLPdat, function(x) cutter <=x)

data.corpus <- Corpus(VectorSource(article_words))
data.corpus <- tm_map(data.corpus, removeNumbers)
data.corpus <- tm_map(data.corpus, removePunctuation)
data.DTM <- DocumentTermMatrix(data.corpus,control=list(wordLengths = c(2, Inf)))
findFreqTerms(data.DTM, lowfreq = 20) 
##  [1] "半導體" "表現"   "產業"   "成長"   "出貨"   "訂單"   "法人"  
##  [8] "股價"   "合併"   "積電"   "今年"   "今天"   "晶圓"   "客戶"  
## [15] "庫存"   "奈米"   "蘋果"   "設備"   "市場"   "台股"   "台積"  
## [22] "台積電" "外資"   "先進"   "億美元" "億元"   "營收"   "營運"  
## [29] "影響"   "預估"   "預期"   "張忠謀" "支出"   "製程"   "資本"
findAssocs(data.DTM, "成長", 0.4) 
## $成長
##   幅度   布局   積極   公告   來看   統計   比例   廠商   方面   企業 
##   0.57   0.44   0.44   0.43   0.43   0.43   0.42   0.42   0.42   0.42 
##   設計   長期 半導體 英特爾   對於 
##   0.42   0.42   0.41   0.41   0.40
freq <- sort(colSums(as.matrix(data.DTM)), decreasing=TRUE)   
wf <- data.frame(word=names(freq), freq=freq)   
head(wf)
##          word freq
## 台積電 台積電  148
## 今年     今年   63
## 台積     台積   57
## 奈米     奈米   52
## 成長     成長   45
## 半導體 半導體   43
# 畫圖
library(ggplot2)   
p <- ggplot(subset(wf, freq>25), aes(word, freq))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))   
p + theme_gray(base_family = "STHeiti")