讀取 csv 檔案

getwd()
## [1] "C:/Users/USER/Documents"
# Fetch the sample purchase log and store it as 'purchase.csv' in the current
# working directory (the directory reported by getwd()).
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/purchase.csv', destfile = 'purchase.csv')

# Parse the file with base R and check the class of the result; the recorded
# output that follows reports "data.frame".
purchase <- read.csv('purchase.csv')
class(purchase)
## [1] "data.frame"
str(purchase)
## 'data.frame':    54772 obs. of  7 variables:
##  $ X       : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Time    : Factor w/ 53387 levels "2015-07-01 00:00:01",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Action  : Factor w/ 1 level "order": 1 1 1 1 1 1 1 1 1 1 ...
##  $ User    : Factor w/ 32539 levels "U1000001354",..: 8750 7126 187 7952 8235 19731 21996 13919 21994 8700 ...
##  $ Product : Factor w/ 20054 levels "P0000005913",..: 7731 6487 2370 12645 12183 4228 10469 4506 537 7731 ...
##  $ Quantity: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Price   : num  1069 1680 285 550 249 ...
View(purchase)
?read.csv
## starting httpd help server ... done
#setwd('C:/Users/USER/Desktop')
#getwd()


# Read the same file again with readr::read_csv(). The column specification
# printed in the recorded output below shows Time parsed as a datetime and the
# text columns kept as character. This overwrites the earlier `purchase`.
library(readr)
purchase <- read_csv("purchase.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   Time = col_datetime(format = ""),
##   Action = col_character(),
##   User = col_character(),
##   Product = col_character(),
##   Quantity = col_integer(),
##   Price = col_double()
## )
View(purchase)
?read_csv

寫入 csv 檔案

# Write the built-in iris data set to 'iris.csv'; a relative file name is
# resolved against the current working directory.
data(iris)
write.csv(x = iris, file = 'iris.csv')
getwd()
## [1] "C:/Users/USER/Documents"
# Switch the working directory for the rest of the session; from here on every
# relative path (including the files written further down) lands on the Desktop.
setwd('C:/Users/USER/Desktop')
write.csv(x = iris, file = 'iris.csv')


# An absolute path writes to that location regardless of the working directory.
write.csv(x = iris, file = 'C:/Users/USER/Desktop/iris2.csv')


?write.table

# write.table() with a tab separator produces a tab-separated file.
write.table(x= iris, file='iris.tsv', sep='\t')


# Fit a simple linear regression on the first Anscombe series, persist the
# fitted model to disk, and prove the round trip by restoring it for plotting.
data(anscombe)
fit <- lm(y1 ~ x1, data = anscombe)

# save() receives the objects to store through `...`; it has no `x` argument,
# so the model is passed positionally instead of under a made-up name.
save(fit, file = 'fit.RData')


# Drop the in-memory model so that load() below is what brings it back.
rm(fit)

# load() recreates `fit` under its original name in the current environment.
load('fit.RData')
plot(y1 ~ x1, data = anscombe)
abline(fit, col = 'red')

讀取 excel 檔案

# Read an Excel workbook from the user's home directory ("~") with readxl,
# open it in the data viewer, and print per-column summary statistics.
library(readxl)
FinancialReport <- read_excel("~/FinancialReport.xlsx")
View(FinancialReport)
summary(FinancialReport)
##       年度           股本         財報評分          收盤       
##  Min.   :1999   Min.   : 767   Min.   :59.00   Min.   : 42.60  
##  1st Qu.:2003   1st Qu.:2027   1st Qu.:89.00   1st Qu.: 62.50  
##  Median :2007   Median :2583   Median :92.00   Median : 71.00  
##  Mean   :2007   Mean   :2249   Mean   :88.24   Mean   : 83.75  
##  3rd Qu.:2011   3rd Qu.:2592   3rd Qu.:94.00   3rd Qu.: 97.00  
##  Max.   :2015   Max.   :2643   Max.   :96.00   Max.   :167.00  
##       平均             漲跌            漲跌__1          營業收入   
##  Min.   : 52.40   Min.   :-88.500   Min.   :-53.00   Min.   : 731  
##  1st Qu.: 56.40   1st Qu.: -5.500   1st Qu.: -8.10   1st Qu.:2030  
##  Median : 67.40   Median :  6.500   Median :  8.80   Median :3174  
##  Mean   : 82.29   Mean   :  4.235   Mean   : 11.77   Mean   :3576  
##  3rd Qu.:104.00   3rd Qu.: 20.100   3rd Qu.: 28.00   3rd Qu.:4271  
##  Max.   :147.00   Max.   : 96.000   Max.   :135.00   Max.   :8435  
##     營業毛利       營業利益       業外損益         稅後淨利   
##  Min.   : 315   Min.   : 128   Min.   :-43.70   Min.   : 145  
##  1st Qu.: 765   1st Qu.: 613   1st Qu.:  4.97   1st Qu.: 651  
##  Median :1417   Median :1044   Median : 35.00   Median : 999  
##  Mean   :1639   Mean   :1238   Mean   : 50.67   Mean   :1179  
##  3rd Qu.:2071   3rd Qu.:1592   3rd Qu.: 62.10   3rd Qu.:1616  
##  Max.   :4104   Max.   :3200   Max.   :304.00   Max.   :3066  
##       ROA             EPS        
##  Min.   : 3.93   Min.   : 0.830  
##  1st Qu.:15.50   1st Qu.: 3.450  
##  Median :18.40   Median : 4.140  
##  Mean   :17.15   Mean   : 4.969  
##  3rd Qu.:19.40   3rd Qu.: 6.240  
##  Max.   :24.70   Max.   :11.820

讀取 JSON 檔案

# Download the Taiwan Railways rental listing (the file name in the URL is
# percent-encoded) and store it locally as 'rent.json'.
download.file('https://www.railway.gov.tw/Upload/UserFiles/%E8%87%BA%E9%90%B5%E5%B1%80%E6%88%BF%E5%9C%B0%E7%94%A2%E5%87%BA%E7%A7%9F%E6%83%85%E5%BD%A2.json', destfile = 'rent.json')


# Parse the JSON file with jsonlite. The recorded warning below is about a
# UTF-8 byte-order mark at the start of the file; head() still prints rows
# afterwards, so parsing went through.
library(jsonlite)
json_data<- fromJSON('rent.json')
## Warning: JSON string contains (illegal) UTF8 byte-order-mark!
head(json_data)
##     縣市 經管單位                 用途限制 實際用途 每月租金
## 1 臺北市   台北所 辦公或住宅或法律許可範圍     商店    22900
## 2 臺北市   台北所 辦公或住宅或法律許可範圍   辦公室   138000
## 3 臺北市   台北所 辦公或住宅或法律許可範圍     商店    56899
## 4 臺北市   台北所 辦公或住宅或法律許可範圍     住家    23050
## 5 臺北市   台北所 辦公或住宅或法律許可範圍     商店    33294
## 6 臺北市   台北所 辦公或住宅或法律許可範圍     商店    20020
##              租期屆滿 建物面積     構造 總樓層數 建物現況
## 1  105.12.1-108.11.30    189.8 木石磚造        1        B
## 2 104.06.07-107.06.06   220.49     磚造        1        B
## 3     105.2.3-108.2.2      132 加強磚造        2        A
## 4 104.05.28-107.05.27  102.545 加強磚造        3        C
## 5   105.3.31-107.3.30   127.92 加強磚造        1        A
## 6     105.7.2-107.7.1       72 加強磚造        2        B
##                                                                  房屋座落
## 1                     新北市淡水區鼻頭街10、11、12號(淡水鎮海鷗段316地號)
## 2 臺北市大同區忠孝西路2段13號(大同區玉泉段2小段371-7、371-20、371-21地號)
## 3                    臺北市大同區赤峰街33巷4號(大同區圓環段二小段151地號)
## 4                   臺北市萬華區康定路56巷3弄3號(萬華區直興段1小段93地號)
## 5                         臺北市中正區臨沂街19巷16號(中正區臨沂段301地號)
## 6         臺北市中正區汀州路2段73號(臺北市中正區河堤段687-16、687-27地號)
##                                   土地使用分區
## 1                                       工業區
## 2 371-7,371-20為交通廣場用地, 371-21為道路用地
## 3                                 第四種住宅區
## 4                                 第四種商業區
## 5                                 第三種住宅區
## 6               第三種住宅區及第三之一種住宅區

讀取 XML 資料

# Convert an XML feed straight into a data.frame with XML::xmlToDataFrame(),
# which is given the URL directly.
library(XML)
url <- 'http://opendata.epa.gov.tw/ws/Data/ATM00698/?$format=xml'
weather <- xmlToDataFrame(url)
View(weather)

# Pick the Temperature column for the rows whose SiteName is '臺北'.
weather[ weather$SiteName == '臺北',  'Temperature'   ]

讀取 PDF 資料

# Extract the text of a PDF with pdftools (uncomment the next line to install
# the package first).
# install.packages('pdftools')
library(pdftools)
# mode = "wb" downloads the file in binary mode, which a PDF needs on Windows.
download.file("http://arxiv.org/pdf/1403.2805.pdf", "1403.2805.pdf", mode = "wb")
txt <- pdf_text("1403.2805.pdf")

# Uncomment to print the extracted text.
#txt

GET

library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
# Issue a GET request for the Apple Daily real-time news list and parse the
# response as HTML.
apple <- read_html('https://tw.appledaily.com/new/realtime')
# Uncomment to dump the raw markup.
#as.character(apple)


# The same call against a PChome product endpoint.
# NOTE(review): the `_callback=jsonp_button` query suggests the response is
# JSONP rather than an HTML page - confirm what read_html() makes of it.
pchome <- read_html('https://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/button&id=DMAW20-A90091BWN&fields=Seq,Id,Price,Qty,ButtonType,SaleStatus&_callback=jsonp_button&1534047300?_callback=jsonp_button')

#as.character(pchome)

POST

library(httr)
# Query the THSR timetable search with an HTTP POST.
url <- 'http://www.thsrc.com.tw/tw/TimeTable/SearchResult'
# Form fields sent in the request body; the station values are presumably the
# site's internal station identifiers - verify against the search form.
payload <- list(
StartStation='977abb69-413a-4ccf-a109-0272c24fd490',
EndStation='a7a04c89-900b-4798-95a3-c01c455622f4',
SearchDate='2018/08/12',
SearchTime='11:00',
SearchWay='DepartureInMandarin'
  
)
# encode = 'form' sends the list as a URL-encoded form; a browser-like
# user-agent header is added to the request.
res <- POST(url, body = payload, encode = 'form', add_headers("user-agent" = "Mozilla/5.0"))
# Uncomment to inspect the returned page.
#as.character(content(res))

台鐵簡易查詢

# Taiwan Railways timetable lookup: all search parameters travel in the query
# string. The searchtext value uses %uXXXX escapes for the code points of
# 臺北 (U+81FA U+5317) and 桃園 (U+6843 U+5712).
# NOTE(review): %uXXXX is not standard percent-encoding - confirm the server
# still accepts it.
url <- 'http://twtraffic.tra.gov.tw/twrail/mobile/TimeTableSearchResult.aspx?searchtype=1&searchdate=2018/08/12&trainclass=2&fromtime=0000&totime=2359&searchtext=%u81FA%u5317,%u6843%u5712'

library(rvest)
res <- read_html(url)
# Uncomment to inspect the returned page.
#as.character(res)

Magrittr

# Two ways to get the %>% pipe operator.
# method 1: attach magrittr itself
library(magrittr)

# method 2: attach rvest, which makes %>% available as well
library(rvest)

# Nested calls read inside-out: first six rows, then the last three of those,
# then the sum of their Sepal.Length.
sum(tail(head(iris),3)$Sepal.Length)
## [1] 15
# The same computation as a left-to-right pipeline; `.` stands for the value
# arriving from the previous step.
iris %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()
## [1] 15

剖析案例

# A small HTML document held in a string, used to try out CSS selectors.
a <- '<html>
<body>
<h1 id="title">Hello World</h1>
<a href="#" class="link">This is link1</a>
<a href="# link2" class="link">This is link2</a>
</body>
</html>'

# read_html() accepts literal markup as well as a URL.
read_html(a)
## {xml_document}
## <html>
## [1] <body>\n<h1 id="title">Hello World</h1>\n<a href="#" class="link">Th ...
# Select by tag name.
read_html(a) %>% html_nodes('h1') %>% html_text()
## [1] "Hello World"
read_html(a) %>% html_nodes('a') %>% html_text()
## [1] "This is link1" "This is link2"
# Select by attribute.
## An id is written with a leading '#'.
read_html(a) %>% html_nodes('#title')%>% html_text()
## [1] "Hello World"
## A class is written with a leading '.'.
read_html(a) %>% html_nodes('.link')%>% html_text()
## [1] "This is link1" "This is link2"
# Combine tag name and attribute; a space between two selectors means
# "descendant of".

read_html(a) %>% html_nodes('h1#title')
## {xml_nodeset (1)}
## [1] <h1 id="title">Hello World</h1>
read_html(a) %>% html_nodes('body h1#title')
## {xml_nodeset (1)}
## [1] <h1 id="title">Hello World</h1>
read_html(a) %>% html_nodes('a.link')
## {xml_nodeset (2)}
## [1] <a href="#" class="link">This is link1</a>
## [2] <a href="#%20link2" class="link">This is link2</a>
read_html(a) %>% html_nodes('body a.link')
## {xml_nodeset (2)}
## [1] <a href="#" class="link">This is link1</a>
## [2] <a href="#%20link2" class="link">This is link2</a>
## Read an attribute value (here href) instead of the node text.

read_html(a) %>% html_nodes('a.link') %>% html_attr('href')
## [1] "#"       "# link2"

剖析蘋果即時新聞

# Load the Apple Daily real-time news list and print the markup of the first
# <li class="rtddt"> entry to see how one news item is structured.
apple <- read_html('https://tw.appledaily.com/new/realtime')

apple %>% html_nodes('li.rtddt') %>% .[1] %>% as.character()
## [1] "<li class=\"rtddt sport\">\r\n                                    <a href=\"https://tw.sports.appledaily.com/realtime/20180812/1409610/\" target=\"_blank\">\r\n                                        <time>17:25</time><h2>體育</h2>\r\n                                        <h1><font color=\"#383c40\">潘威倫復出首戰 二軍2局24球無責失</font></h1>\r\n                                    </a>\r\n                                </li>"
apple %>% html_nodes('.rtddt a')
## {xml_nodeset (30)}
##  [1] <a href="https://tw.sports.appledaily.com/realtime/20180812/1409610 ...
##  [2] <a href="https://tw.news.appledaily.com/local/realtime/20180812/140 ...
##  [3] <a href="https://tw.news.appledaily.com/life/realtime/20180812/1409 ...
##  [4] <a href="https://tw.news.appledaily.com/life/realtime/20180812/1409 ...
##  [5] <a href="https://tw.news.appledaily.com/politics/realtime/20180812/ ...
##  [6] <a href="https://tw.sports.appledaily.com/realtime/20180812/1409500 ...
##  [7] <a href="https://tw.news.appledaily.com/local/realtime/20180812/140 ...
##  [8] <a href="https://tw.news.appledaily.com/international/realtime/2018 ...
##  [9] <a href="https://tw.finance.appledaily.com/realtime/20180812/140962 ...
## [10] <a href="https://tw.news.appledaily.com/local/realtime/20180812/140 ...
## [11] <a href="https://tw.entertainment.appledaily.com/realtime/20180812/ ...
## [12] <a href="https://tw.news.appledaily.com/local/realtime/20180812/140 ...
## [13] <a href="https://tw.entertainment.appledaily.com/realtime/20180812/ ...
## [14] <a href="https://tw.news.appledaily.com/life/realtime/20180812/1409 ...
## [15] <a href="https://tw.news.appledaily.com/local/realtime/20180812/140 ...
## [16] <a href="https://tw.news.appledaily.com/politics/realtime/20180812/ ...
## [17] <a href="https://tw.sports.appledaily.com/realtime/20180812/1409614 ...
## [18] <a href="https://tw.news.appledaily.com/local/realtime/20180812/140 ...
## [19] <a href="https://tw.sports.appledaily.com/realtime/20180812/1409609 ...
## [20] <a href="https://tw.news.appledaily.com/local/realtime/20180812/140 ...
## ...
read_html('http://news.ltn.com.tw/list/breakingnews') %>% html_nodes('.imm li')
## {xml_nodeset (17)}
##  [1] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/life/breakingn ...
##  [2] <li>\n\t\t\t\t\t<a href="http://sports.ltn.com.tw/news/breakingnews ...
##  [3] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/world/breaking ...
##  [4] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/life/breakingn ...
##  [5] <li>\n\t\t\t\t\t<a href="http://sports.ltn.com.tw/news/breakingnews ...
##  [6] <li>\n\t\t\t\t\t<a href="http://sports.ltn.com.tw/news/breakingnews ...
##  [7] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/life/breakingn ...
##  [8] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/politics/break ...
##  [9] <li>\n\t\t\t\t\t<a href="http://sports.ltn.com.tw/news/breakingnews ...
## [10] <li>\n\t\t\t\t\t<a href="http://sports.ltn.com.tw/news/breakingnews ...
## [11] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/society/breaki ...
## [12] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/politics/break ...
## [13] <li>\n\t\t\t\t\t<a href="http://ent.ltn.com.tw/news/breakingnews/25 ...
## [14] <li>\n\t\t\t\t\t<a href="http://istyle.ltn.com.tw/article/8326" dat ...
## [15] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/life/breakingn ...
## [16] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/life/breakingn ...
## [17] <li>\n\t\t\t\t\t<a href="http://news.ltn.com.tw/news/society/breaki ...
udn <- read_html('https://udn.com/news/breaknews/1')
udn %>% html_nodes('#breaknews_body dt') %>% .[1] %>% as.character()
## [1] "<dt>\n<a href=\"/news/story/7001/3305028?from=udn-ch1_breaknews-1-0-news\" target=\"\"><img src=\"https://pgw.udn.com.tw/gw/photo.php?u=https://uc.udn.com.tw/photo/2018/08/12/realtime/5121091.jpg&amp;x=&amp;y=&amp;sw=&amp;sh=&amp;exp=3600&amp;sl=W&amp;fw=240\"></a><a href=\"/news/breaknews/1/7#breaknews\" class=\"cate\">運動</a><h2><a class=\"\" href=\"/news/story/7001/3305028?from=udn-ch1_breaknews-1-0-news\" target=\"\">中職/葉君璋重返新莊球場 蔡承儒賦予培育基層重任</a></h2>\n<div class=\"info\">\n<div class=\"dt\">08-12 17:26</div>\n<div class=\"view\">\n<b></b>48</div>\n</div>\n<!--</a>-->\n</dt>\n"
a <- udn %>% html_nodes('#breaknews_body dt')
a[1]
## {xml_nodeset (1)}
## [1] <dt>\n<a href="/news/story/7001/3305028?from=udn-ch1_breaknews-1-0-n ...
vogue <- read_html('https://www.vogue.com.tw/')
vogue %>% html_nodes('.v-widget-timeBlock') %>% .[1] %>% as.character()
## [1] "<div class=\"v-widget-timeBlock type5\">\n                    \t    <div class=\"v-widget-timeStamp\" datetime=\"2018/8/12 15:29:19\">\n                    \t        <div class=\"v-widget-timeset-M\">\n                                                        <div class=\"Date\"></div>\n                                                        <div class=\"Month\"></div>\n                                                        <div class=\"Year\"></div>\n                                </div>\n                    \t        <div class=\"timeBlock-info\"></div>\n                    \t    </div>\n                    \t    <div class=\"v-widget-pinShareSet-content\">\n    <div class=\"v-widget-vogue-pinBtn\" style=\"top: 5px;margin-right:0;\" onclick=\"javascript:member_vogue_pin('//www.vogue.com.tw/feature/3c/content-42106.html',10215,42106);\"></div>\n        <div class=\"v-widget-shareBtn\" style=\"width: 300px;float:right;top:15px;\">\n        <!--weibo-->\n        <div class=\"share\" style=\"float:right;margin-right: 5px;padding-top: 0px;width:20px;\">\n        <share-button count=\"n\" type=\"icon\" size=\"small\" url=\"//www.vogue.com.tw/feature/3c/content-42106.html\" title=\"打出顏色最鮮豔的果汁!全新伊萊克斯Explore 7迷你隨行杯果汁機智能瞬速保留食材的最純粹\" pic=\"https://img.vogue.com.tw/userfiles/thumbnail/sm380380_images_A0/42106/2018081256795945.JPG\"><a target=\"_blank\" href=\"http://service.weibo.com/share/share.php?url=//www.vogue.com.tw/feature/3c/content-42106.html\"><img src=\"/images/widget/weibo.png\"></a></share-button>\n</div>\n        <!--pinterest-->                                \n        <div class=\"share\" style=\"float:right;margin-right: 5px;\">\n            <a target=\"_blank\" 
href=\"//www.pinterest.com/pin/create/button/?url=%2F%2Fwww%2Evogue%2Ecom%2Etw%2Ffeature%2F3c%2Fcontent%2D42106%2Ehtml&amp;media=https%3A%2F%2Fimg%2Evogue%2Ecom%2Etw%2Fuserfiles%2Fthumbnail%2Fsm380380%5Fimages%5FA0%2F42106%2F2018081256795945%2EJPG&amp;description=%E6%89%93%E5%87%BA%E9%A1%8F%E8%89%B2%E6%9C%80%E9%AE%AE%E8%B1%94%E7%9A%84%E6%9E%9C%E6%B1%81%EF%BC%81%E5%85%A8%E6%96%B0%E4%BC%8A%E8%90%8A%E5%85%8B%E6%96%AFExplore+7%E8%BF%B7%E4%BD%A0%E9%9A%A8%E8%A1%8C%E6%9D%AF%E6%9E%9C%E6%B1%81%E6%A9%9F%E6%99%BA%E8%83%BD%E7%9E%AC%E9%80%9F%E4%BF%9D%E7%95%99%E9%A3%9F%E6%9D%90%E7%9A%84%E6%9C%80%E7%B4%94%E7%B2%B9\" data-pin-do=\"buttonPin\" data-pin-config=\"above\"><img src=\"//assets.pinterest.com/images/pidgets/pinit_fg_en_rect_gray_20.png\"></a>\n        </div>\n         <!--g+-->     \n        <div class=\"share\" style=\"float:right;margin-right: 5px;\">\n            <div data-href=\"//www.vogue.com.tw/feature/3c/content-42106.html\" class=\"g-plusone\" data-size=\"medium\"></div>\n        </div>\n        <!--fb-->\n        <div class=\"share\" style=\"float:right;margin-right: 5px;\">\n            <div data-href=\"//www.vogue.com.tw/feature/3c/content-42106.html\" class=\"fb-like\" layout=\"button_count\" data-show-faces=\"show_faces\"></div>\n        </div>                    \n        <div class=\"clearAll\"></div>      \n    </div>\n    <div class=\"clearAll\"></div>                \n</div>\n\n\n\n                    \t    <div class=\"timeBlock-name\"><a href=\"//www.vogue.com.tw/feature/3c/content-42106.html\"><h2>FEATURE</h2></a></div>\n                    \t    <div class=\"timeBlock-pic\">\n                    \t        <a anchor=\"articlePhoto2\" href=\"//www.vogue.com.tw/feature/3c/content-42106.html\" style=\"margin-right: -2px;\"><img src=\"https://img.vogue.com.tw/userfiles/thumbnail/sm380380_images_A0/42106/2018081256795945.JPG\"></a> <!--350x350-->\n                    \t        <a anchor=\"articlePhotoLink\" 
href=\"//www.vogue.com.tw/feature/3c/content-42106.html\"><img src=\"https://img.vogue.com.tw/userfiles/thumbnail/sm380380_images_A0/42106/2018081256737905.JPG\"></a> <!--350x350-->\n                    \t    </div>\n\n                    \t    <div class=\"timeBlock-articleTitle\">\n                                                    <a href=\"//www.vogue.com.tw/feature/3c/content-42106.html\">打出顏色最鮮豔的果汁!全新伊萊克斯Explore 7迷你隨行杯果汁機智能瞬速保留食材的最純粹</a>\n                            </div>\n                            <div class=\"timeBlock-articleBody\">\n                                                    <a href=\"//www.vogue.com.tw/feature/3c/content-42106.html\">這一次的瑞典探訪百年家電品牌伊萊克斯Electrolux,除了新發明的吸塵器PURE F9滑移百變吸...</a>\n                            </div>\n                    \t    <div class=\"timeBlock-readMore\">\n                    \t        <a href=\"//www.vogue.com.tw/feature/3c/content-42106.html\">READ MORE</a>\n                    \t    </div>\n                    \t    <div class=\"clearAll\"></div>\n                    \t</div>"

蘋果爬蟲

# Re-read the news list and keep the <a> element of every entry; each anchor
# wraps the time, category and headline of one news item (see the markup
# printed below).
apple <- read_html('https://tw.appledaily.com/new/realtime')

rtddt <- apple %>% html_nodes('.rtddt a')

as.character(rtddt[1])
## [1] "<a href=\"https://tw.sports.appledaily.com/realtime/20180812/1409610/\" target=\"_blank\">\r\n                                        <time>17:25</time><h2>體育</h2>\r\n                                        <h1><font color=\"#383c40\">潘威倫復出首戰 二軍2局24球無責失</font></h1>\r\n                                    </a>"
# Build one row per news anchor in `rtddt`.
#
# Each field is read with html_node() (singular), which returns exactly one
# result per anchor and NA where the child element is absent. The plural
# html_nodes() would flatten all matches into a single set, so an anchor
# lacking an <h1>, <h2> or <time> would shift the columns against each other
# or make data.frame() fail on unequal lengths.
title <- rtddt %>%
  html_node('h1') %>%
  html_text()

category <- rtddt %>%
  html_node('h2') %>%
  html_text()

time <- rtddt %>%
  html_node('time') %>%
  html_text()

# The link is an attribute of the anchor itself, so it is already one per row.
url <- rtddt %>%
  html_attr('href')

applenews <- data.frame(dt = time, title = title, category = category, link = url, stringsAsFactors = FALSE)

class(applenews)
## [1] "data.frame"
str(applenews)
## 'data.frame':    30 obs. of  4 variables:
##  $ dt      : chr  "17:25" "17:25" "17:23" "17:23" ...
##  $ title   : chr  "潘威倫復出首戰 二軍2局24球無責失" "【獨家】陸戰隊士官休假持毒遭逮 不排除銷..." "鬼月玩水人變少 遊客:慈悲月百無禁忌(1399)" "傳承家族總鋪師精神 大學女西餐烹飪奪金" ...
##  $ category: chr  "體育" "社會" "生活" "生活" ...
##  $ link    : chr  "https://tw.sports.appledaily.com/realtime/20180812/1409610/" "https://tw.news.appledaily.com/local/realtime/20180812/1409618/" "https://tw.news.appledaily.com/life/realtime/20180812/1409542/" "https://tw.news.appledaily.com/life/realtime/20180812/1409616/" ...
#View(applenews)
# Save the scraped list to 'applenews.csv' in the working directory.
write.csv(x = applenews, file = 'applenews.csv')


# Keep only the rows whose title contains the given keyword; the recorded
# output below shows zero matches for this run.
applenews[ grepl('銓敘部', applenews$title)  ,   ]
## [1] dt       title    category link    
## <0 rows> (or 0-length row.names)

抓取內文

# Open a single article page and try out the selector for each field.
article <- read_html('https://tw.news.appledaily.com/life/realtime/20180812/1409381/')

# Headline: text of the <h1> element.
article %>% html_nodes('h1') %>% html_text()
## [1] "台大醫畢業又教過補習班 柯P學弟今成柯辦發言人"
# Body text: first <p> found under the .ndArticle_margin container.
article %>% html_nodes('.ndArticle_margin p') %>%  .[1] %>% html_text()
## [1] "(更新:新增影片)台北市長柯文哲為爭取年底連任,上月開始海選柯辦發言人,從近300位角逐者中,選出人16人參加第二階段甄選,最後由柯文哲台大學弟楊笙及都發局幫工程司蔡峻維出線,兩人今首度接受媒體訪問時透露,他們參加甄選是展現對公共事務的支持及興趣,希望能與柯市長一起打選戰。楊笙表示,未來他應該還是會回去當醫師,蔡峻維也指,未來也沒有想到要進入市府,只希望把市長選戰打完。<U+00A0>柯辦競選總部發言人團隊今已成形,除了現有柯辦發言人林昆鋒、林筱淇外,如今又增添兩名生力軍,林昆鋒原在市長室擔任發言人,今也出席記者會表示,他已口頭請辭,預計15日起請假,9月1日正式報到。林昆鋒今表示,目前兩位發言人仍在見習階段,但本周開始將會安排新任發言人接受電台節目專訪。<U+00A0>柯辦發言人楊笙現年26歲,為柯文哲台大學弟,曾擔任補習班講師與台大實習醫師,他說因女友鼓勵他參加海選,以前他很喜歡說服別人,但後來發現要理解對方想法才能說服他人,這是一項技能也需要訓練,他更深受柯市長的「心存善念、盡力而為」的理念而感動,過往對政治想像都是選舉拚得你死我活,但柯市長把選舉是發展理念、改變台北。他並強調,未來還是會回去當醫師。<U+00A0>柯辦另一發言人蔡峻維現年25歲,政大中文系畢業,曾任市府都市更新處及都發局的約聘幫工程司,他說參加海選是想用公務員的角度講述公共政策,可跟民眾第一線溝通,畢竟他過去工作就是與民眾談論公宅等政策,他喜歡柯市長很親民率直個性,比起很多長官多一分率真也沒有架子,現階段先進入柯辦團隊,等選戰結束後再規劃下一步。(林媛玲/台北報導)出版時間:10:58更新時間:13:52<U+00A0>想知道更多,一定要看……柯辦今公布新發言人 柯P憂:兩三下就被周玉蔻幹掉"
# Publication timestamp.
article %>% html_nodes('.ndArticle_creat') %>% html_text()
## [1] "出版時間:2018/08/12 13:52"
# View counter, returned as text.
article %>% html_nodes('.ndArticle_view') %>% html_text()
## [1] "37979"
# Category: first element marked .current inside .ndgTag.
article %>% html_nodes('.ndgTag .current') %>% .[1] %>% html_text()
## [1] "生活"
# Scrape one Apple Daily article page into a one-row data.frame.
#
# Args:
#   url: address of the article page; handed to read_html() unchanged.
#
# Returns:
#   A data.frame with exactly one row and the columns article, title, dt,
#   category (character) and clicked (integer). A text field whose selector
#   matches nothing is NA. clicked falls back to 0 when the page has no view
#   counter, and is NA when the counter text is not a whole number.
#
# Errors raised by read_html() (unreachable or invalid url) are not caught.
getArticle <- function(url){
  e <- read_html(url)

  # Text of the first node matching `css`, or NA when there is none. Keeping
  # every field at length one stops data.frame() from failing on an empty
  # field and from emitting several rows when a selector matches repeatedly
  # (for example a page with more than one <h1>).
  first_text <- function(css) {
    hits <- e %>% html_nodes(css)
    if (length(hits) == 0) {
      return(NA_character_)
    }
    html_text(hits[1])
  }

  article  <- first_text('.ndArticle_margin p')
  title    <- first_text('h1')
  dt       <- first_text('.ndArticle_creat')
  category <- first_text('.ndgTag .current')

  # Some pages carry no view counter; report 0 for those, as before.
  views <- e %>% html_nodes('.ndArticle_view')
  if (length(views) == 0) {
    clicked <- 0L
  } else {
    clicked <- as.integer(html_text(views[1]))
  }

  data.frame(article = article, title = title, dt=dt, category=category,clicked = clicked, stringsAsFactors = FALSE )
}

getArticle('https://tw.entertainment.appledaily.com/realtime/20180812/1409535/')
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              article
## 1 台視外景實境競賽節目《綜藝3國智》主持群王少偉、納豆、花花、李易、哈孝遠、Spexial風田、張立東、瑪麗、鯰魚哥及大根前天在台中舉辦粉絲見面會,當天吸引超過300名熱情粉絲到場支持,見面會當天逾300位粉絲頂著攝氏35度的高溫,從清晨5點就到現場排隊等候。李易透露哈孝遠前一天還在擔心沒有人潮,見到現場人山人海馬上跑到對面偷拍大家,第一次參與見面會的花花則驚呼:「原來真的有這麼多人在看我們節目」。王少偉表示:「希望下次活動能換再大一點的場地,粉絲都已經站到馬路上了!」,納豆也打趣說:「平常工作出外景沒有接觸到大家,難得和這麼多人見面其實很開心,也要呼籲一下製作單位,我們其實是『明星』、不要再整我們了啦!」。(蔡維歆╱台北報導)看了這則新聞的人,也看了……【峇里島大婚】《步步》八爺今娶60億千金 包場印尼餐廳搞婚前趴罷工《食尚》沒在怕! 莎莎收入不減反變多【獨家】豬哥亮過世後453天 謝金晶慶生數度暴哭
##                                      title                         dt
## 1 累了!納豆「我們是明星」 求節目別再惡整 出版時間:2018/08/12 15:28
##   category clicked
## 1 娛樂時尚    3839

抓取所有新聞內文

# Scrape every article linked from the real-time news list into one data.frame.
newsurl <- 'https://tw.appledaily.com/new/realtime'
apple   <- read_html(newsurl)
rtddt   <- apple %>% html_nodes('.rtddt a')
links   <- rtddt %>% html_attr('href')

# Collect the one-row frames in a list and bind them once at the end; calling
# rbind() inside a loop re-copies the accumulated frame on every iteration.
dfall <- do.call('rbind', lapply(links, getArticle))

# rbind() over an empty list gives NULL; keep the empty data.frame the loop
# version produced when the page lists no links.
if (is.null(dfall)) {
  dfall <- data.frame()
}
dfall

Apply

# lapply() applies a function to every element of a list and always returns a
# list; sapply() does the same but simplifies the result, here to a plain
# numeric vector.
x <- list(c(1,2,3,4), c(5,6,7,8))
lapply(x, sum)
## [[1]]
## [1] 10
## 
## [[2]]
## [1] 26
sapply(x, sum)
## [1] 10 26
# named function
# Return the leading element of `e`. Single-bracket indexing keeps the
# container type, so a list argument yields a length-one list.
f <- function(e) {
  leading <- e[1L]
  leading
}

# Call the named function directly, then hand it to lapply().
f(c(1,2,3,4))
## [1] 1
lapply(x, f)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 5
# An anonymous function written inline gives the same result.
lapply(x, function(e) e[1])
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 5
sapply(x, function(e) e[1])
## [1] 1 5
# Take the fourth element of each vector instead of the first.
lapply(x, function(e) e[4])
## [[1]]
## [1] 4
## 
## [[2]]
## [1] 8
sapply(x, function(e) e[4])
## [1] 4 8

抓取多頁連結

# Listing pages are addressed by appending the page number to this base URL.
url <- 'https://tw.appledaily.com/new/realtime/'

# Print the URLs of pages 1 to 3 with a loop ...
for (i in 1:3){
  print(paste0(url, i))
}


# ... or build them as a list with lapply().
lapply(1:3, function(e) paste0(url, e))


# Scrape every article linked from one news listing page.
#
# Args:
#   newsurl: address of a listing page; handed to read_html() unchanged.
#
# Returns:
#   A data.frame with one row per link found under '.rtddt a', as produced by
#   getArticle(); an empty data.frame when the page lists no links.
#
# Errors from read_html() or getArticle() are not caught, so one failing
# article aborts the whole page.
getURL <- function(newsurl){
  apple <- read_html(newsurl)
  links <- apple %>%
    html_nodes('.rtddt a') %>%
    html_attr('href')

  if (length(links) == 0) {
    return(data.frame())
  }

  # Bind all rows in one step; calling rbind() inside a loop re-copies the
  # accumulated frame on every iteration.
  do.call('rbind', lapply(links, getArticle))
}


# Crawl listing pages 1 to 3; the result is a list holding one data.frame per
# page.
dfall <- lapply(1:3, function(e) getURL(paste0(url, e)))

#View(dfall)

# Small example of stacking data.frames: do.call() passes the list elements
# to rbind() as separate arguments.
a.frame <- data.frame(x= c(1,2,3), b =c(2,3,4))
b.frame <- data.frame(x= c(3,3,2), b =c(1,2,3))
b.frame
do.call("rbind", list(a.frame, b.frame))

# Apply the same idea to the crawled pages, then view and save the result.
appledf <- do.call('rbind', dfall)
View(appledf)

write.csv(x = appledf, file = 'apple.csv')

Database operation

-- MySQL-style walkthrough (SHOW / AUTO_INCREMENT syntax): list the databases,
-- switch to `orders`, and list its tables.
show databases;
use orders;
show tables;

-- Create the customer table with an auto-numbered primary key.
create table customer(
id int AUTO_INCREMENT PRIMARY KEY,
name    varchar(50),
gender  varchar(1),
address varchar(200)
);

-- Rename column `name` to `cname`, make it NOT NULL, and add a `phone` column.
ALTER TABLE customer
CHANGE COLUMN name cname VARCHAR(50) NOT NULL,
ADD COLUMN phone VARCHAR(10);

-- Show the resulting table structure.
DESCRIBE customer;

-- Insert three rows; `id` is generated and `phone` is left unset.
insert into customer(cname, gender, address) values('John', 'M', 'Chiayi');
insert into customer(cname, gender, address) values('Mary', 'F', 'Tainan');
insert into customer(cname, gender, address) values('Brad', 'M', 'Chiayi');

-- Read everything, selected columns only, then selected columns with a filter.
select * from customer;
select cname, gender from customer;
select cname, gender from customer where gender = 'M';

-- Aggregates: total row count, count per gender, and only the genders that
-- have at least two rows.
select count(*) from customer;
SELECT gender, count(*) FROM customer GROUP BY gender;
SELECT gender, count(*) FROM customer GROUP BY gender HAVING count(*) >= 2;


-- Update one row by primary key, showing the table before and after.
select * from customer;
update customer set cname = 'Johnny' where id = 1;
select * from customer;


-- Delete one row by primary key.
delete from customer where id = 1;
select * from customer;

-- A DELETE without WHERE removes every row; DROP TABLE then removes the
-- table itself.
delete from customer;
drop table customer;