Jying-Nan Wang
2016-07-31
library(quantmod)
# 基本抓取資料與繪圖
getSymbols("AAPL",from="2015-09-01",to="2015-10-31")## [1] "AAPL"
head(AAPL)## AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2015-09-01 110.15 111.88 107.36 107.72 76845900
## 2015-09-02 110.23 112.34 109.13 112.34 61888800
## 2015-09-03 112.49 112.78 110.04 110.37 53233900
## 2015-09-04 108.97 110.45 108.51 109.27 49996300
## 2015-09-08 111.75 112.56 110.32 112.31 54843600
## 2015-09-09 113.76 114.02 109.77 110.15 85010800
## AAPL.Adjusted
## 2015-09-01 106.0364
## 2015-09-02 110.5842
## 2015-09-03 108.6450
## 2015-09-04 107.5622
## 2015-09-08 110.5546
## 2015-09-09 108.4284
chartSeries(AAPL,theme="white")# 下載台股資料,存到變數X 中
X <- getSymbols("2330.tw", auto.assign=FALSE, from="2015-10-01",to="2015-11-13")
chartSeries(X)# 技術分析 TTR 套件
getSymbols("AAPL",from="2015-06-01",to="2015-10-31")## [1] "AAPL"
candleChart(AAPL,multi.col=TRUE,theme="white")addMACD()addBBands()Basic Components
library(ggplot2)
# example 1
head(mtcars)## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
p <- ggplot(mtcars, aes(wt, mpg))
p + geom_point()# example 2
p <- ggplot(mtcars, aes(wt, mpg))
# 採用geom_point() 畫scatter plot,geom_smooth()畫迴歸線
p <- p + geom_point(shape=2)+ geom_smooth(lwd = 1, se = TRUE, method = "lm",color=rgb(0,0.5,0.5))
# 設定座標相關資訊
p <- p +xlab("車重") + ylab("油耗") + ggtitle("車重和油耗的關係") + expand_limits(x=c(1,6), y = c(8, 40))
# 設定中文(mac專用)
p + theme_bw(base_family = "STHeiti") library(dplyr)
# R 爬蟲,可先忽略
library(rvest)
sid <- "AAPL"
stockURL <- paste("http://finance.yahoo.com/q/ud?s=", sid, sep="")
sdat = html(stockURL) %>% html_nodes(".yfnc_tabledata1") %>% html_text()
sdat <- matrix(sdat,ncol=5,byrow = TRUE)
sdat <- data.frame(sdate=sdat[,1],sfirm=sdat[,2],sact=sdat[,3],from1=sdat[,4],to1=sdat[,5])
# 整理資料
library(dplyr)
# example 1: select
result <- sdat %>%
select(sdate,sact)
# arrange(desc(sdate))
head(result)## sdate sact
## 1 28-Jul-16 Upgrade
## 2 27-Jul-16 Upgrade
## 3 6-Jul-16 Initiated
## 4 6-Apr-16 Initiated
## 5 30-Mar-16 Upgrade
## 6 12-Jan-16 Upgrade
# example 2: filter1
result <- sdat %>%
filter(sact=="Upgrade")
head(result)## sdate sfirm sact from1 to1
## 1 28-Jul-16 Hilliard Lyons Upgrade Long-term Buy Buy
## 2 27-Jul-16 Raymond James Upgrade Mkt Perform Outperform
## 3 30-Mar-16 Cowen Upgrade Market Perform Outperform
## 4 12-Jan-16 BofA/Merrill Upgrade Neutral Buy
## 5 11-Jan-16 Mizuho Upgrade Neutral Buy
## 6 28-Oct-15 Pacific Crest Upgrade Sector Weight Overweight
# example 3: filter2
result <- sdat %>%
filter(sact=="Upgrade" & to1=="Outperform")
head(result)## sdate sfirm sact from1 to1
## 1 27-Jul-16 Raymond James Upgrade Mkt Perform Outperform
## 2 30-Mar-16 Cowen Upgrade Market Perform Outperform
## 3 25-Aug-15 Wells Fargo Upgrade Market Perform Outperform
## 4 9-Oct-14 Oppenheimer Upgrade Perform Outperform
## 5 29-Oct-13 Robert W. Baird Upgrade Neutral Outperform
## 6 23-Apr-09 Morgan Keegan Upgrade Mkt Perform Outperform
# example 4: group_by 1
result <- sdat %>%
group_by(sact) %>%
summarise(n=n())
head(result)## Source: local data frame [3 x 2]
##
## sact n
## (fctr) (int)
## 1 Downgrade 62
## 2 Initiated 80
## 3 Upgrade 81
# example 5: 綜合
result <- sdat %>%
group_by(sfirm) %>%
summarise(n=n()) %>%
arrange(n) %>%
filter(n>5)
head(result)## Source: local data frame [6 x 2]
##
## sfirm n
## (fctr) (int)
## 1 Bear Stearns 7
## 2 Prudential 7
## 3 Salomon Smth Brny 7
## 4 UBS 7
## 5 Banc of America Sec 9
## 6 Needham & Co 9
# 合併資料
# inner_join(x, y): Include only rows in both x and y
# left_join(x, y): Include all of x, and matching rows of ylibrary(XML)
stockURL <- "http://finance.yahoo.com/q/ud?bypass=true&s=AAPL"
areport <- readHTMLTable(stockURL, header=F, which=3, stringsAsFactors=F)
# clean data based on the specifical web table structure
areport <- areport[12:(nrow(areport)-8),]
areport <- data.frame(areport[-1,])library(rvest)
stockURL <- "http://finance.yahoo.com/q/ud?bypass=true&s=AAPL"
sdat <- html(stockURL) %>%
html_nodes(".yfnc_tabledata1") %>%
html_text()
# 整理資料
sdat <- matrix(sdat,ncol=5,byrow = TRUE)
sdat <- data.frame(sdate=sdat[,1],sfirm=sdat[,2],sact=sdat[,3],from1=sdat[,4],to1=sdat[,5])
head(sdat)## sdate sfirm sact from1 to1
## 1 28-Jul-16 Hilliard Lyons Upgrade Long-term Buy Buy
## 2 27-Jul-16 Raymond James Upgrade Mkt Perform Outperform
## 3 6-Jul-16 Longbow Initiated Buy
## 4 6-Apr-16 Needham Initiated Strong Buy
## 5 30-Mar-16 Cowen Upgrade Market Perform Outperform
## 6 12-Jan-16 BofA/Merrill Upgrade Neutral Buy
library(rvest)
news_url="http://www.chinatimes.com/money/realtimenews?page=1"
mytitle <- html(news_url) %>%
html_nodes(".vertical-list h2") %>%
html_text()
head(mytitle)## [1] "日擴大寬鬆不如預期 台股下週震盪"
## [2] "在愛爾蘭繳稅1% 經濟學家:蘋果渉詐欺"
## [3] "福邦投顧看下周指數8,550點到9,200點區間震盪"
## [4] "行政院前院長陳冲:「房屋稅違法違憲,應予以廢止!」"
## [5] "台股下周指數 估8750點到9100點"
## [6] "買低價宅賺小賠大?破解5種黑心建案"
library(rvest)
news_url="http://www.chinatimes.com/money/realtimenews?page=1"
my_news =
data.frame(
title = html(news_url) %>%
html_nodes(".vertical-list h2") %>%
html_text(),
title_href = html(news_url) %>%
html_nodes(".vertical-list h2") %>%
html_nodes("a") %>%
html_attr( "href") ,
stringsAsFactors=FALSE)
head(my_news)## title
## 1 日擴大寬鬆不如預期 台股下週震盪
## 2 在愛爾蘭繳稅1% 經濟學家:蘋果渉詐欺
## 3 福邦投顧看下周指數8,550點到9,200點區間震盪
## 4 行政院前院長陳冲:「房屋稅違法違憲,應予以廢止!」
## 5 台股下周指數 估8750點到9100點
## 6 買低價宅賺小賠大?破解5種黑心建案
## title_href
## 1 http://www.chinatimes.com/realtimenews/20160730003388-260410
## 2 http://www.chinatimes.com/realtimenews/20160730003248-260410
## 3 http://www.chinatimes.com/realtimenews/20160730003145-260410
## 4 http://www.chinatimes.com/realtimenews/20160730002891-260410
## 5 http://www.chinatimes.com/realtimenews/20160730002455-260410
## 6 http://www.chinatimes.com/realtimenews/20160730001184-260410
# 已經取得my_news資料
i <- 1
doc_url <- my_news[i,2]
doc_temp <- html(doc_url) %>% html_nodes(".clear-fix")%>% html_nodes("p") %>% html_text()
# 清理資料
lastnum <- length(doc_temp)-7
x<-paste(doc_temp[1:lastnum], collapse = '', sep="")
x## [1] "日擴大寬鬆規模不如預期攪局,台股重跌92點收在8,984點,再痛失9千大關,外資終結連15買轉為賣超13.87億元,周線收黑下跌28點,終結周線連2紅,但7月大漲317點,漲幅3.67%,雙創近4周最大漲點及漲幅,月線連3紅,市值增加16.4座台北101大樓,但留意指數下周震盪將加大!國泰證期顧問處經理蔡明翰指出,美經濟數據佳,Fed維持利率不變,台灣6月景氣燈號連3月黃藍燈,第2季GDP為0.69%成功擺脫連3季負成長,美台景氣狀況佳,但市場關注日央在利率會議動作,市場期待更大擴大寬鬆政策出爐,雖安倍晉三已公布28兆日圓刺激方案,但市場仍期待更多,導致日圓對美元大幅波動,隱含波動率創08年來新高,但日央擴大寬鬆不如預期,台股跟進回檔,外資終結連15買轉為賣超13.87億元,但價跌量縮,價量結構仍健康。蔡明翰認為,日央決議擴大購買ETF至6兆日圓,美元貸款擴大一倍至240億美元,但維持利率不變,導致原先預期降息市場出現失望賣壓,但8月2日將表決首相提出28兆刺激方案,加上後續將續增加寬鬆政策力道,另外FED近期言論雖偏向鷹派,但市場對升息預期未明顯拉升,全球資金維持充裕態勢維持不變。"
# 已經取得my_news資料
i <- 1
doc_url <- my_news[i,2]
doc_time <- html(doc_url) %>% html_nodes(".reporter") %>% html_nodes("time") %>% html_text()
date_value <- paste(substr(doc_time,1,4),substr(doc_time,6,7),substr(doc_time,9,10),sep="")
date_value <- paste(date_value, substr(doc_time,13,14), substr(doc_time,16,17),sep="")
filename <- paste("chinatimes/", date_value, "_", my_news[i,1], sep="")
filename## [1] "chinatimes/201607301606_日擴大寬鬆不如預期 台股下週震盪"
doc_temp <- html(doc_url) %>% html_nodes(".clear-fix")%>% html_nodes("p") %>% html_text()
lastnum <- length(doc_temp)-7
x<-paste(doc_temp[1:lastnum], collapse = '', sep="")
x## [1] "日擴大寬鬆規模不如預期攪局,台股重跌92點收在8,984點,再痛失9千大關,外資終結連15買轉為賣超13.87億元,周線收黑下跌28點,終結周線連2紅,但7月大漲317點,漲幅3.67%,雙創近4周最大漲點及漲幅,月線連3紅,市值增加16.4座台北101大樓,但留意指數下周震盪將加大!國泰證期顧問處經理蔡明翰指出,美經濟數據佳,Fed維持利率不變,台灣6月景氣燈號連3月黃藍燈,第2季GDP為0.69%成功擺脫連3季負成長,美台景氣狀況佳,但市場關注日央在利率會議動作,市場期待更大擴大寬鬆政策出爐,雖安倍晉三已公布28兆日圓刺激方案,但市場仍期待更多,導致日圓對美元大幅波動,隱含波動率創08年來新高,但日央擴大寬鬆不如預期,台股跟進回檔,外資終結連15買轉為賣超13.87億元,但價跌量縮,價量結構仍健康。蔡明翰認為,日央決議擴大購買ETF至6兆日圓,美元貸款擴大一倍至240億美元,但維持利率不變,導致原先預期降息市場出現失望賣壓,但8月2日將表決首相提出28兆刺激方案,加上後續將續增加寬鬆政策力道,另外FED近期言論雖偏向鷹派,但市場對升息預期未明顯拉升,全球資金維持充裕態勢維持不變。"
# write.table(x[1], paste(filename, ".txt",sep="") , quote = FALSE, row.names=FALSE, col.names = FALSE)
# Sys.sleep(runif(1,0,5))# 5個文件,6個字詞
d1 <- c(3,0,10,2,1,0)
d2 <- c(0,3,0,4,2,0)
d3 <- c(3,0,0,0,2,5)
d4 <- c(0,2,5,2,0,0)
d5 <- c(0,1,7,3,1,0)
#計算文件的相關性
d1%*%d2/sqrt((sum(d1^2)*sum(d2^2)))## [,1]
## [1,] 0.1739196
d4%*%d5/sqrt((sum(d4^2)*sum(d5^2)))## [,1]
## [1,] 0.9663531
d3%*%d4/sqrt((sum(d3^2)*sum(d4^2)))## [,1]
## [1,] 0
# 計算TF-IDF值
DTM <- matrix(c(d1,d2,d3,d4,d5),nrow=5,byrow=TRUE)
tf <- DTM/matrix(rep(apply(DTM,1,sum),each=6,len=6*5),nrow=5,byrow=TRUE)
DTM1 <- DTM
DTM1[DTM1>0] <- 1
idf <- log(nrow(DTM1)/apply(DTM1,2,sum),10)
idf <- matrix(rep(idf,nrow(DTM)),nrow=nrow(DTM),byrow = TRUE)
tf*idf## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.07461375 0.00000000 0.1386555 0.01211375 0.006056876 0.000000
## [2,] 0.00000000 0.07394958 0.0000000 0.04307112 0.021535558 0.000000
## [3,] 0.11938200 0.00000000 0.0000000 0.00000000 0.019382003 0.349485
## [4,] 0.00000000 0.04929972 0.1232493 0.02153556 0.000000000 0.000000
## [5,] 0.00000000 0.01848740 0.1294118 0.02422750 0.008075834 0.000000
d1 <- c(3,0,10,2,1,0)
d2 <- c(0,3,0,4,2,0)
d3 <- c(3,0,0,0,2,5)
d4 <- c(0,2,5,2,0,0)
d5 <- c(0,1,7,3,1,0)
DTM <- matrix(c(d1,d2,d3,d4,d5),nrow=5,byrow=TRUE)
TDM <- t(DTM)
TDM.dataframe <- data.frame(TDM)
colnames(TDM.dataframe) <- c("d1", "d2", "d3", "d4", "d5")
rownames(TDM.dataframe) <- c("t1","t2", "t3", "t4", "t5", "t6")
# 進行SVD,使用svd() function
svd.tdm <- svd(TDM)
svd.T <- svd.tdm$u
svd.D <- svd.tdm$v
s <- svd.tdm$d
# 查看分解後的結果
# 字詞概念矩陣
svd.T## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.16861617 0.50316716 0.1928911910 0.6275839 0.49917590
## [2,] -0.11946249 -0.02723604 -0.5832560373 -0.2750752 0.68336607
## [3,] -0.92101036 -0.13654116 0.3010235221 -0.1739794 -0.02937896
## [4,] -0.30828606 -0.03420053 -0.6695776455 0.1882990 -0.27691654
## [5,] -0.11643062 0.32806545 -0.2892334076 0.3558817 -0.43474250
## [6,] -0.02099592 0.78653729 0.0001876407 -0.5814594 -0.13148120
# 奇異值矩陣
diag(s)## [,1] [,2] [,3] [,4] [,5]
## [1,] 14.17454 0.000000 0.000000 0.000000 0.00000
## [2,] 0.00000 6.226305 0.000000 0.000000 0.00000
## [3,] 0.00000 0.000000 5.523532 0.000000 0.00000
## [4,] 0.00000 0.000000 0.000000 1.640008 0.00000
## [5,] 0.00000 0.000000 0.000000 0.000000 1.05671
# 文件概念矩陣
svd.D## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.73716383 0.06484652 0.3549395873 0.5338006 0.20361550
## [2,] -0.12870919 0.07028577 -0.9064029496 0.3900796 0.06903223
## [3,] -0.05952148 0.97944424 0.0002072879 -0.1907196 -0.02778749
## [4,] -0.38523653 -0.12938314 -0.1811431151 -0.6362464 0.63026224
## [5,] -0.53672474 -0.12167092 -0.1401381862 -0.3488716 -0.74550161
# 測試1: k=5,計算得到原本的TDM
k <- 5
svd.T[,1:k] %*% diag(s[1:k]) %*% t(svd.D[,1:k])## [,1] [,2] [,3] [,4] [,5]
## [1,] 3.000000e+00 -1.387779e-17 3.000000e+00 -1.498801e-15 -1.443290e-15
## [2,] -5.551115e-17 3.000000e+00 1.731254e-15 2.000000e+00 1.000000e+00
## [3,] 1.000000e+01 9.627715e-16 -2.042095e-15 5.000000e+00 7.000000e+00
## [4,] 2.000000e+00 4.000000e+00 4.250073e-16 2.000000e+00 3.000000e+00
## [5,] 1.000000e+00 2.000000e+00 2.000000e+00 -7.216450e-16 1.000000e+00
## [6,] -2.775558e-16 1.018283e-15 5.000000e+00 0.000000e+00 -4.024558e-16
# 測試2: k=3,計算得到近似的TDM
k <- 3
svd.T[,1:k] %*% diag(s[1:k]) %*% t(svd.D[,1:k])## [,1] [,2] [,3] [,4] [,5]
## [1,] 2.34318571 -0.4379000 3.21095426 0.3223988 0.7523138
## [2,] 0.09377636 3.1261254 -0.06597262 1.2578483 1.3809564
## [3,] 10.15862931 0.1134436 -0.05528025 4.8380278 6.8773132
## [4,] 1.89473807 3.8997390 0.05076531 2.3809081 2.8895867
## [5,] 0.78198816 1.8040436 2.09854783 0.6608849 0.8611376
## [6,] 0.53732105 0.3815704 4.81426938 -0.5191564 -0.4362614
# 將文件1的維度從5降到2
k <- 2
TDM[,1]## [1] 3 0 10 2 1 0
solve(diag(s[1:k]))%*%t(svd.T[,1:k]) %*% TDM[,1]## [,1]
## [1,] -0.73716383
## [2,] 0.06484652
# 將TDM的維度降低
solve(diag(s[1:k]))%*%t(svd.T[,1:k]) %*% TDM## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.73716383 -0.12870919 -0.05952148 -0.3852365 -0.5367247
## [2,] 0.06484652 0.07028577 0.97944424 -0.1293831 -0.1216709
# options(java.home="C://Program Files//Java//jre1.8.0_77")
# library(rJava)
# 執行上述指令,若無錯誤則安裝完成library(rJava)
library(Rwordseg) # 分詞套件
temp <- "經濟部預期,3月整體外銷訂單金額將較2月增加,但因去年同期基期仍高,接單恐持續衰退。"
segmentCN(temp)## [1] "經濟部" "預期" "3月" "整體" "外銷" "訂單" "金額"
## [8] "將" "較" "2月" "增加" "但" "因" "去年"
## [15] "同期" "基期" "仍" "高" "接" "單" "恐"
## [22] "持續" "衰退"library(jiebaR)
temp <- "經濟部預期,3月整體外銷訂單金額將較2月增加,但因去年同期基期仍高,接單恐持續衰退。"
cutter <- worker()
cutter[temp]## [1] "經濟部" "預期" "3" "月" "整體" "外銷"
## [7] "訂單" "金額" "將較" "2" "月" "增加"
## [13] "但" "因" "去年同期" "基期" "仍高" "接單"
## [19] "恐" "持續" "衰退"library(tmcn)
NLPdat <- toUTF8(readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1"))
NLPdat[1]## [1] "工商時報【涂志豪】台積電(2330)股價從元月中旬以來表現銳不可擋,一路從130.5元拉到上周盤中高點的163元,累計外資在這段時間狂買50.3萬張,累計持股比重飆上78.94%,而投信和自營商則沒出手,完全看外資唱獨腳戲。受到南台灣強震影響,台積電二月合併營收月減16.0%達595.51億元,累計今年前二個月營收達1,304.07億元,較去年同期減少12.9%。不過,台積電竭盡所能補足所有受影響的晶圓,預期不少的產能其出貨時間延後至第二季。不過,受惠於半導體生產鏈庫存在去年底已完成去化,隨著上游客戶陸續提高晶圓代工投片量,加上新台幣兌美元匯率走貶,台積電將第一季的匯率預估更新為33.18元,並將業績估調升至2,010?2,030億元之間,但毛利率則因地震影響下修至44?46%。法人預期台積電第一季營運將順利達標。由於台積電Fab 14A以及Fab 6的部份晶圓出貨延後到第二季,加上客戶搶產能的動作仍持續到第二季,以及台積電開始為蘋果量產新一代A10應用處理器會在第二季底開始大量出貨,因此,法人看好台積電下季營收將見到超過一成的成長幅度,單季營收應可順利改寫歷史新高,且毛利率也將回升到47?49%的正常水準。"library(rJava)
library(Rwordseg) # 分詞套件
temp <- "經濟部預期,3月整體外銷訂單金額將較2月增加,但因去年同期基期仍高,接單恐持續衰退。"
segmentCN(temp)## [1] "經濟部" "預期" "3月" "整體" "外銷" "訂單" "金額"
## [8] "將" "較" "2月" "增加" "但" "因" "去年"
## [15] "同期" "基期" "仍" "高" "接" "單" "恐"
## [22] "持續" "衰退"
# 自行加入字词,必须使用简体字
insertWords("接单")
segmentCN(temp)## [1] "經濟部" "預期" "3月" "整體" "外銷" "訂單" "金額"
## [8] "將" "較" "2月" "增加" "但" "因" "去年"
## [15] "同期" "基期" "仍" "高" "接單" "恐" "持續"
## [22] "衰退"
# insertWords 加入的詞只暫存在記憶體中
# insertWords("接单", save=TRUE)
# /Library/Frameworks/R.framework/Versions/3.2/Resources/library/Rwordseg/config/userdic
# deleteWords("接单", save=TRUE) # 將加入的新字詞刪除
segmentCN(temp, nature = TRUE)## n vn m n v n
## "經濟部" "預期" "3月" "整體" "外銷" "訂單"
## n d d m v c
## "金額" "將" "較" "2月" "增加" "但"
## p t f n d a
## "因" "去年" "同期" "基期" "仍" "高"
## userDefine d vd v
## "接單" "恐" "持續" "衰退"
listDict() # 查看目前本機中安裝那些詞典 ## [1] Name Type Des Path
## <0 rows> (or 0-length row.names)
# 安裝下載的詞典,搜狗輸入法網站詞庫 (http://pinyin.sogou.com/dict/)
installDict("finterms.scel","finance_dict") ## 11319 words were loaded! ... New dictionary 'finance_dict' was installed!
# 若不使用可以將其移除
uninstallDict() ## 11319 words were removed! ... The dictionary 'finance_dict' was uninstalled!
# You can use worker() to initialize a worker, and then use <= or segment() to do the segmentation.
library(jiebaR)
temp <- "經濟部預期,3月整體外銷訂單金額將較2月增加,但因去年同期基期仍高,接單恐持續衰退。"
# use worker() to initialize a worker
cutter <- worker()
# 寫法1
cutter[temp]## [1] "經濟部" "預期" "3" "月" "整體" "外銷"
## [7] "訂單" "金額" "將較" "2" "月" "增加"
## [13] "但" "因" "去年同期" "基期" "仍高" "接單"
## [19] "恐" "持續" "衰退"
# 寫法2
cutter <= temp## [1] "經濟部" "預期" "3" "月" "整體" "外銷"
## [7] "訂單" "金額" "將較" "2" "月" "增加"
## [13] "但" "因" "去年同期" "基期" "仍高" "接單"
## [19] "恐" "持續" "衰退"
# 寫法3
segment(temp,cutter)## [1] "經濟部" "預期" "3" "月" "整體" "外銷"
## [7] "訂單" "金額" "將較" "2" "月" "增加"
## [13] "但" "因" "去年同期" "基期" "仍高" "接單"
## [19] "恐" "持續" "衰退"
# 查看cutter的參數設置
cutter## Worker Type: Jieba Segment
##
## Default Method : mix
## Detect Encoding : TRUE
## Default Encoding: UTF-8
## Keep Symbols : FALSE
## Output Path :
## Write File : TRUE
## By Lines : FALSE
## Max Word Length : 20
## Max Read Lines : 1e+05
##
## Fixed Model Components:
##
## $dict
## [1] "/Library/Frameworks/R.framework/Versions/3.2/Resources/library/jiebaRD/dict/jieba.dict.utf8"
##
## $user
## [1] "/Library/Frameworks/R.framework/Versions/3.2/Resources/library/jiebaRD/dict/user.dict.utf8"
##
## $hmm
## [1] "/Library/Frameworks/R.framework/Versions/3.2/Resources/library/jiebaRD/dict/hmm_model.utf8"
##
## $stop_word
## NULL
##
## $timestamp
## [1] 1469866921
##
## $default $detect $encoding $symbol $output $write $lines $bylines can be reset.
# 增加使用者字詞
cutter1 <- worker()
new_user_word(cutter1, "2月",tags="jn1")## [1] TRUE
cutter1[temp]## [1] "經濟部" "預期" "3" "月" "整體" "外銷"
## [7] "訂單" "金額" "將較" "2月" "增加" "但"
## [13] "因" "去年同期" "基期" "仍高" "接單" "恐"
## [19] "持續" "衰退"
# worker的參數設定 (https://qinwenfeng.com/jiebaR/worker.html)
cutter2 <- worker(type = "mix", dict = DICTPATH, hmm = HMMPATH, user = USERPATH,
idf = IDFPATH, stop_word = STOPPATH, write = T, qmax = 20, topn = 5,
encoding = "UTF-8", detect = T, symbol = F, lines = 1e+05,
output = NULL, bylines = F)library(rJava)
library(Rwordseg) # 分詞套件
library(tm) #text mining 套件
library(tmcn) # A Text mining toolkit for international characters
NLPdat <-readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1") # mac
# NLPdat <- toUTF8(readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1")) # windows
#segment.options(isNameRecognition=TRUE)
data.segCN <- segmentCN(NLPdat)
data.corpus <- Corpus(VectorSource(data.segCN))
# inspect(data.corpus)
data.corpus[[1]]$content## [1] "工商" "時報" "涂" "志" "豪" "台" "積"
## [8] "電" "2330" "股價" "從" "元月" "中旬" "以來"
## [15] "表現" "銳" "不可" "擋" "一路" "從" "130"
## [22] "5元" "拉" "到" "上周" "盤" "中" "高"
## [29] "點" "的" "163元" "累計" "外資" "在" "這"
## [36] "段" "時間" "狂" "買" "50" "3萬張" "累計"
## [43] "持" "股" "比重" "飆" "上" "78" "94"
## [50] "而" "投" "信" "和" "自營" "商" "則"
## [57] "沒" "出" "手" "完全" "看" "外資" "唱"
## [64] "獨腳戲" "受到" "南" "台灣" "強震" "影響" "台"
## [71] "積" "電" "二月" "合併" "營" "收" "月"
## [78] "減" "16" "0" "達" "595" "51億元" "累計"
## [85] "今年" "前" "二個" "月" "營" "收" "達"
## [92] "1" "304" "07億元" "較" "去年" "同期" "減少"
## [99] "12" "9" "不過" "台" "積" "電" "竭盡"
## [106] "所" "能" "補足" "所有" "受" "影響" "的"
## [113] "晶" "圓" "預期" "不少" "的" "產" "能"
## [120] "其" "出" "貨" "時間" "延" "後" "至"
## [127] "第二季" "不過" "受惠" "於" "半導體" "生產" "鏈"
## [134] "庫存" "在" "去年底" "已" "完成" "去" "化"
## [141] "隨" "著" "上游" "客戶" "陸續" "提高" "晶"
## [148] "圓" "代" "工" "投" "片" "量" "加上"
## [155] "新" "台幣" "兌" "美元" "匯率" "走" "貶"
## [162] "台" "積" "電" "將" "第一季" "的" "匯率"
## [169] "預估" "更新" "為" "33" "18元" "並" "將"
## [176] "業績" "估" "調升" "至" "2" "010" "2"
## [183] "030億元" "之間" "但" "毛利率" "則" "因" "地震"
## [190] "影響" "下" "修" "至" "44" "46" "法人"
## [197] "預期" "台" "積" "電" "第一季" "營運" "將"
## [204] "順利" "達標" "由" "於" "台" "積" "電"
## [211] "Fab" "14" "A" "以及" "Fab" "6" "的"
## [218] "部" "份" "晶" "圓" "出" "貨" "延"
## [225] "後" "到" "第二季" "加上" "客戶" "搶" "產能"
## [232] "的" "動作" "仍" "持續" "到" "第二季" "以及"
## [239] "台" "積" "電" "開始" "為" "蘋果" "量"
## [246] "產" "新" "一代" "A" "10" "應用" "處理器"
## [253] "會" "在" "第二季" "底" "開始" "大量出" "貨"
## [260] "因此" "法人" "看好" "台" "積" "電" "下"
## [267] "季" "營" "收" "將" "見到" "超過" "一成"
## [274] "的" "成長" "幅度" "單" "季" "營" "收"
## [281] "應" "可" "順利" "改寫" "歷史" "新" "高"
## [288] "且" "毛利率" "也" "將" "回升" "到" "47"
## [295] "49" "的" "正常" "水準"
# 清除標點符號, 數字
data.corpus <- tm_map(data.corpus, removePunctuation)
data.corpus <- tm_map(data.corpus, removeNumbers)
data.DTM <- DocumentTermMatrix(data.corpus,control=list(wordLengths = c(2, Inf)))
# data.TDM <- TermDocumentMatrix(data.corpus,control=list(wordLengths = c(2, Inf)))
# myterms <- c("半導體","訂單", "蘋果")
# data.DTM1 <- DocumentTermMatrix(data.corpus,control=list(wordLengths = c(2, Inf), dictionary = myterms))
# inspect(data.DTM1)
inspect(data.DTM[1:5, 120:125])## <<DocumentTermMatrix (documents: 5, terms: 6)>>
## Non-/sparse entries: 0/30
## Sparsity : 100%
## Maximal term length: 2
## Weighting : term frequency (tf)
##
## Terms
## Docs 出讓 出色 出售 出現 初步 除非
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
findFreqTerms(data.DTM, lowfreq = 20) # 找出較常出現的字詞## [1] "半導體" "表現" "產業" "成長" "第季" "訂單" "法人"
## [8] "股價" "合併" "今年" "今天" "客戶" "庫存" "蘋果"
## [15] "去年" "設備" "申請" "市場" "外資" "先進" "億美元"
## [22] "億元" "營運" "影響" "預估" "預期" "支出" "資本"
findAssocs(data.DTM, "成長", 0.4) # 找出與某詞相關系數為0.4的字詞## $成長
## 競爭力 電腦 中科 經濟 震撼 問題 主要 幅度
## 0.70 0.67 0.67 0.61 0.60 0.58 0.57 0.54
## 廠商 方面 設計 長期 百年 補給 產出 大旱
## 0.53 0.53 0.53 0.53 0.52 0.52 0.52 0.52
## 大戶 電子 複合 個人 工業 規模 會上 節水
## 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## 擴大 歷年 水車 提供 先前 效率 應付 用水
## 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## 雲端 遭遇 重疊 綜合 今年 氣氛 仍然 重視
## 0.52 0.52 0.52 0.52 0.51 0.47 0.47 0.47
## 昨天 年底 首季 一個 此外 智慧 安心 暗示
## 0.47 0.44 0.44 0.44 0.43 0.43 0.42 0.42
## 表明 不景氣 出來 出讓 創辦 創意 帶來 當中
## 0.42 0.42 0.42 0.42 0.42 0.42 0.42 0.42
## 刀口 的確 等待 第度 第一 方向 飛來 更加
## 0.42 0.42 0.42 0.42 0.42 0.42 0.42 0.42
## 功力 關係 好好 基本功 集團 季報 加快 價格
## 0.42 0.42 0.42 0.42 0.42 0.42 0.42 0.42
## 建廠 結構性 節省 就算 砍價 科技 困難 理想
## 0.42 0.42 0.42 0.42 0.42 0.42 0.42 0.42
## 兩成 面臨 平板 確立 上回 升值 示範 事實
## 0.42 0.42 0.42 0.42 0.42 0.42 0.42 0.42
## 說法 無異 下來 現身 效能 新書 宣告 養精蓄銳
## 0.42 0.42 0.42 0.42 0.42 0.42 0.42 0.42
## 要求 一口氣 一切 迎接 預告 這樣 只能 做好
## 0.42 0.42 0.42 0.42 0.42 0.42 0.42 0.42
## pc 市況
## 0.42 0.41
inspect(removeSparseTerms(data.DTM, 0.5)) # 處理稀疏矩陣## <<DocumentTermMatrix (documents: 31, terms: 9)>>
## Non-/sparse entries: 166/113
## Sparsity : 41%
## Maximal term length: 3
## Weighting : term frequency (tf)
##
## Terms
## Docs 半導體 成長 第季 股價 今年 今天 客戶 營運 us
## 1 1 1 0 1 1 0 2 1 0
## 2 0 2 2 1 1 0 0 0 0
## 3 0 0 6 6 0 1 1 3 0
## 4 1 1 1 2 1 1 0 1 1
## 5 3 0 1 1 0 0 0 0 0
## 6 1 3 2 2 4 1 1 0 1
## 7 0 0 4 0 2 3 2 0 1
## 8 1 2 0 0 4 1 0 1 1
## 9 1 0 4 5 2 1 0 1 1
## 10 0 0 0 0 5 1 0 0 0
## 11 0 3 2 4 2 8 1 1 0
## 12 1 2 13 0 0 0 3 0 0
## 13 0 0 7 2 0 1 0 1 0
## 14 1 0 2 2 2 1 2 1 1
## 15 0 1 0 0 1 1 1 0 0
## 16 0 0 0 0 1 0 0 0 1
## 17 0 0 0 1 1 4 1 0 1
## 18 3 6 0 1 4 0 0 2 0
## 19 2 6 7 0 5 0 2 1 1
## 20 0 1 1 2 0 1 2 0 1
## 21 4 2 0 1 2 1 0 3 0
## 22 0 0 0 0 0 1 0 1 0
## 23 5 2 0 1 0 2 1 0 1
## 24 1 4 0 0 0 0 0 0 1
## 25 1 7 2 0 6 1 1 3 1
## 26 4 2 1 2 7 0 0 4 1
## 27 0 3 1 3 5 0 2 1 1
## 28 3 1 6 0 5 0 2 3 0
## 29 4 1 2 1 0 3 0 0 2
## 30 6 1 7 0 2 0 2 2 0
## 31 0 0 0 3 1 2 0 0 0
freq <- sort(colSums(as.matrix(data.DTM)), decreasing=TRUE)
wf <- data.frame(word=names(freq), freq=freq)
head(wf)## word freq
## 第季 第季 71
## 今年 今年 64
## 成長 成長 51
## 半導體 半導體 43
## 股價 股價 41
## 今天 今天 35
# 畫圖
library(ggplot2)
p <- ggplot(subset(wf, freq>25), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p + theme_gray(base_family = "STHeiti")library(jiebaR) # 分詞套件
library(tm) #text mining 套件
library(tmcn) # A Text mining toolkit for international characters
NLPdat <-readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1") # mac
# NLPdat <- toUTF8(readLines("https://sites.google.com/a/qfitrc.com/zheng-quan-fen-xi-ke-cheng/fgu-dae-104-2-statistics-ii/00-tong-ji-zuo-ye-zheng-li/twstock1.txt?attredirects=0&d=1")) # windows
# 結巴分詞
cutter <- worker(bylines=T)
#new_user_word(cutter, "台積電",tags="jn1")
article_words <- lapply(NLPdat, function(x) cutter <=x)
data.corpus <- Corpus(VectorSource(article_words))
data.corpus <- tm_map(data.corpus, removeNumbers)
data.corpus <- tm_map(data.corpus, removePunctuation)
data.DTM <- DocumentTermMatrix(data.corpus,control=list(wordLengths = c(2, Inf)))
findFreqTerms(data.DTM, lowfreq = 20) ## [1] "半導體" "表現" "產業" "成長" "出貨" "訂單" "法人"
## [8] "股價" "合併" "積電" "今年" "今天" "晶圓" "客戶"
## [15] "庫存" "奈米" "蘋果" "設備" "市場" "台股" "台積"
## [22] "台積電" "外資" "先進" "億美元" "億元" "營收" "營運"
## [29] "影響" "預估" "預期" "張忠謀" "支出" "製程" "資本"
findAssocs(data.DTM, "成長", 0.4) ## $成長
## 幅度 布局 積極 公告 來看 統計 比例 廠商 方面 企業
## 0.57 0.44 0.44 0.43 0.43 0.43 0.42 0.42 0.42 0.42
## 設計 長期 半導體 英特爾 對於
## 0.42 0.42 0.41 0.41 0.40
freq <- sort(colSums(as.matrix(data.DTM)), decreasing=TRUE)
wf <- data.frame(word=names(freq), freq=freq)
head(wf)## word freq
## 台積電 台積電 148
## 今年 今年 63
## 台積 台積 57
## 奈米 奈米 52
## 成長 成長 45
## 半導體 半導體 43
# 畫圖
library(ggplot2)
p <- ggplot(subset(wf, freq>25), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p + theme_gray(base_family = "STHeiti")