讀取資料

# Load the lvr_prices CSV into a data frame with readr.
# NOTE(review): the column specification printed below shows total_price being
# guessed as an integer column, and the reported parsing failures are all
# prices larger than a 32-bit integer can hold. Passing
# col_types = cols(total_price = col_double()) would presumably avoid those
# failures -- confirm before relying on the largest prices.
lvr_prices <- read_csv("C:/Users/USER/lvr_prices_mac.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  .default = col_character(),
  X1 = col_integer(),
  land_sqmeter = col_double(),
  trading_ymd = col_date(format = ""),
  finish_ymd = col_date(format = ""),
  building_sqmeter = col_double(),
  room = col_integer(),
  living_room = col_integer(),
  bath = col_integer(),
  total_price = col_integer(),
  price_per_sqmeter = col_double(),
  parking_sqmeter = col_double(),
  parking_price = col_integer()
)
See spec(...) for full column specifications.
32 parsing failures.
 row         col   expected     actual
1282 total_price an integer 6700000000
2243 total_price an integer 3882685600
2244 total_price an integer 3373314400
4629 total_price an integer 3050000000
5890 total_price an integer 3133800000
.... ........... .......... ..........
See problems(...) for more details.

使用na.rm 處理missing value

a <- c(1,2,3,4,5,NA)
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
  input string 1 is invalid in this locale
sum(a, na.rm=TRUE)
[1] 15
#?sum
# Average total_price over the rows whose area is Da'an District ('大安區').
# na.rm = TRUE skips prices that are NA.
mean(lvr_prices[lvr_prices$area == '大安區', ]$total_price, na.rm = TRUE)
[1] 29798170

避免溢位, 使用as.numeric 轉型

# Total of total_price over the rows whose area is Da'an District ('大安區').
# The column is converted to double first so the sum cannot overflow the
# integer range; na.rm = TRUE skips prices that are NA.
sum(as.numeric(lvr_prices[lvr_prices$area == '大安區', ]$total_price), na.rm = TRUE)
[1] 2.79477e+11
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
  input string 1 is invalid in this locale

做資料排序

# Three highest total_price values in Zhongshan District ('中山區').
# head() is used instead of [1:3] so that fewer than three matches yield a
# shorter vector rather than one padded with NA.
head(sort(lvr_prices[lvr_prices$area == '中山區', ]$total_price, decreasing = TRUE), 3)
[1] 1850000000 1400000000
[3] 1084948034
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
  input string 1 is invalid in this locale
# Rows for Zhongshan District ('中山區').
zhongshan <- lvr_prices[lvr_prices$area == '中山區', ]
# Keep the three rows with the highest total_price, most expensive first.
# head() avoids the all-NA rows that indexing with [1:3] would create when
# fewer than three rows match.
zhongshan <- zhongshan[head(order(zhongshan$total_price, decreasing = TRUE), 3), ]
zhongshan

使用R 讀取csv檔案

setwd("C:/Users/USER/Desktop")
The working directory was changed to C:/Users/USER/Desktop inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the the working directory for notebook chunks.
getwd()
[1] "C:/Users/USER/Desktop"
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/purchase.csv', destfile = 'purchase.csv')
trying URL 'https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/purchase.csv'
Content type 'text/plain; charset=utf-8' length 3497968 bytes (3.3 MB)
downloaded 3.3 MB
purchase <- read.csv('purchase.csv')
class(purchase)
[1] "data.frame"
library(readr)
purchase2 <- read_csv("C:/Users/USER/Desktop/purchase.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  X1 = col_integer(),
  Time = col_datetime(format = ""),
  Action = col_character(),
  User = col_character(),
  Product = col_character(),
  Quantity = col_integer(),
  Price = col_double()
)
class(purchase2)
[1] "tbl_df"     "tbl"       
[3] "data.frame"

將資料保存到檔案中

data(iris)
?write.csv
write.csv(x = iris , file = 'iris.csv')
getwd()
[1] "D:/OS DATA/Downloads"
write.table(x = iris, file = 'iris.tab', sep = '\t')

從RData 存取資料

?save
data(iris)
save(iris, file='dt.RData')
rm(iris)
load("dt.RData")

讀取Excel 檔案

library(readxl)
FinancialReport <- read_excel("C:/Users/USER/Desktop/FinancialReport.xlsx")
#View(FinancialReport)

讀取半結構化資料 (JSON)

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/fb.json', destfile = 'fb.json')


#install.packages('jsonlite')
library(jsonlite)
json_data<- fromJSON('fb.json')
json_data$data$from

讀取XML 資料

# install.packages("XML")
library(XML)
# Read the XML document at this URL and flatten it into a data frame.
url <- "http://download.post.gov.tw/post/download/county_h_10508.xml"
zipcode <- xmlToDataFrame(doc = url)
# Relabel the columns: postal code, Chinese name, English name.
names(zipcode) <- c("郵遞區號", "中文名稱", "英文名稱")

開始蒐集網頁資料

#install.packages('rvest')
library(rvest)
package ‘rvest’ was built under R version 3.2.5Loading required package: xml2

Attaching package: ‘rvest’

The following object is masked from ‘package:readr’:

    guess_encoding
newsurl <- 'http://www.appledaily.com.tw/realtimenews/section/new/'
apple <- read_html(newsurl)
apple
{xml_document}
<html>
[1] <head>\n  <meta charset ...
[2] <body id="article" clas ...
#as.character(apple)

以台灣高鐵為例

library(httr)
package ‘httr’ was built under R version 3.2.5
# Timetable search endpoint and the form fields submitted to it.
url <- "https://www.thsrc.com.tw/tw/TimeTable/SearchResult"
payload <- list(
  StartStation = "977abb69-413a-4ccf-a109-0272c24fd490",
  EndStation   = "fbd828d8-b1da-4b06-a3bd-680cdca4d2cd",
  SearchDate   = "2016/11/19",
  SearchTime   = "14:30",
  SearchWay    = "DepartureInMandarin"
)
# Send the fields as a form-encoded POST body, then print the response.
res <- POST(url = url, body = payload, encode = "form")
res
Response [https://www.thsrc.com.tw/tw/TimeTable/SearchResult]
  Date: 2016-11-19 13:14
  Status: 200
  Content-Type: text/html; charset=utf-8
  Size: 93.8 kB




<!DOCTYPE html>

<html lang="zh-tw">







<meta charset="utf-8" />

<link rel="shortcut icon" h...
<link rel="apple-touch-icon...
...

台鐵時刻表

url <- 'http://twtraffic.tra.gov.tw/twrail/SearchResult.aspx?searchtype=0&searchdate=2016/11/19&fromstation=1810&tostation=1008&trainclass=%271100%27,%271101%27,%271102%27,%271107%27,%271108%27,%271110%27,%271120%27&fromtime=0600&totime=2359'
read_html(url)
{xml_document}
<html>
[1] <head><meta http-equiv= ...
[2] <body bottommargin="0"> ...

DOM Tree 範例

html_sample <- '
<html> 
<head>
<style>
 #title{color:red;}
 .link{font-size:30px;}
</style>
</head>
 <body> 
 <h1 id="title">Hello World</h1> 
 <a href="#" class="link">This is link1</a> 
 <a href="# link2" class="link">This is link2</a> 
 </body> 
</html>'
res <- read_html(html_sample)
# get h1
html_nodes(res , 'h1')
{xml_nodeset (1)}
[1] <h1 id="title">Hello Wo ...
# get a
html_nodes(res , 'a')
{xml_nodeset (2)}
[1] <a href="#" class="link ...
[2] <a href="# link2" class ...
# get id = title, id => #
html_nodes(res , '#title')
{xml_nodeset (1)}
[1] <h1 id="title">Hello Wo ...
# get class = link, class => .
html_nodes(res , '.link')
{xml_nodeset (2)}
[1] <a href="#" class="link ...
[2] <a href="# link2" class ...
# get pure text
html_text(html_nodes(res , '.link'))
[1] "This is link1"
[2] "This is link2"
html_text(html_nodes(res , '#title'))
[1] "Hello World"
# get href
html_attr(html_nodes(res , '.link'), 'href')
[1] "#"       "# link2"

使用Magrittr

data(iris)
sum(tail(head(iris, 6), 3)[,3])
[1] 4.6
library(magrittr)
iris %>% head(6) %>% tail(3) %>% .[,3] %>% sum() 
[1] 4.6
# html_text(html_nodes(res , '#title'))
res %>% html_nodes('#title') %>% html_text()
[1] "Hello World"

完成列表抓取

newsurl <- 'http://www.appledaily.com.tw/realtimenews/section/new/'
domain <- 'http://www.appledaily.com.tw'
apple <- read_html(newsurl)
rtddt <- apple %>% html_nodes('.rtddt a')
as.character(rtddt[1] %>% iconv(from='UTF-8',to='UTF-8'))
[1] "<a href=\"/realtimenews/article/entertainment/20161119/993154/蝢<9c><9d><ad><ad><89><8e><9c><93><e7><97>€€€鈭怠ˊ60甇淚" target=\"_blank\">\n                                        <time>21:12</time>\n                                        <h2>憡<a8><82></h2>\n                                        <h1><font color=\"#383c40\">蝢<9c><9d><ad><ad><89><8e><9c><93><e7><97>€€€鈭怠ˊ60甇<b2>(0)</font></h1>\n                                    </a>"
# Headline text of every list entry.
title <- rtddt %>%
  html_nodes('h1') %>%
  html_text() %>%
  iconv(from = 'UTF-8', to = 'UTF-8')
# Time label of every list entry.
time <- rtddt %>%
  html_nodes('time') %>%
  html_text() %>%
  iconv(from = 'UTF-8', to = 'UTF-8')
# Category label of every list entry.
category <- rtddt %>%
  html_nodes('h2') %>%
  html_text() %>%
  iconv(from = 'UTF-8', to = 'UTF-8')
# The href values are site-relative paths, so the domain is prepended.
url <- rtddt %>%
  html_attr('href') %>%
  iconv(from = 'UTF-8', to = 'UTF-8') %>%
  paste0(domain, .)
# One row per news item; keep text columns as character, not factor.
applenews <- data.frame(
  title = title,
  time = time,
  category = category,
  link = url,
  stringsAsFactors = FALSE
)

paste string

# Two words to join.
a <- "hello"
b <- "world"
# paste() puts a single space between its arguments by default.
paste(a, b)
[1] "hello world"
paste(a,b, sep='')
[1] "helloworld"
paste0(a,b)
[1] "helloworld"
?paste

get Article

newsurl <- 'http://www.appledaily.com.tw/realtimenews/article/international/20161119/992964/%E8%82%AF%E4%BC%8A%E5%A8%81%E6%96%AF%E7%89%B9%E9%96%8B%E5%94%B1%E3%80%80%E3%80%8C%E8%8B%A5%E6%9C%89%E6%8A%95%E7%A5%A8%E6%9C%83%E6%8A%95%E7%B5%A6%E5%B7%9D%E6%99%AE%E3%80%8D'
read_html(newsurl) %>% html_nodes('#summary') %>% .[1] %>% html_text() %>% iconv(from='UTF-8',to='UTF-8')
[1] "蝢<9c><9d><a3><a5><88><ad><89>隡<a8><e7>嚗anye West嚗<e5><9b><e5><8a><b7><81>镼輸<96><e5>銝€<e5><8d><bc><aa>憭扯<81>瘝餉<a7>€暺€<bb><90><e5>蝎結銵函內嚗<9b><e9><e8><88>憭拐<bb><b2><e6><8a>巨嚗<bd><a6><9e><9c><a6><8a><9c><b0>巨<e6><8a>策<e5><e5><92>豪蝮賜絞<e7><e9>鈭箏<b7> 嚗onald Trump嚗<bc><9c><a9>銝<a7>€<e7><e3>€<be><9c><9c>振撱<a3><e6><e5><e5>嚗BC嚗撠<bc>隡<a8><e7><e5><e7><e5>銵函內嚗€<88><91>迄雿€<88><b2><e6><8a>巨撠嚗<bd><a6><9e><a6><88><e6><8a>巨<e7><9a>店嚗<88><9c><8a>策撌<e3>€€€<bb>€隞亦嚗<9b><e6><88><e9><9d><a3><bc><88><9b><e6><88><e5><90>犖嚗<89>€隞交<88>停蝡隞€<e9><82>€<bc><9b>雓<98><b0>情嚗<bd>皜祆<87><e6><8c><b0>蜓暺其犖<e3>€隡<a8><e7><e6><e9>€嚗<e5><8f>2020撟渡<be><9c>蜇蝯梧<bc><bd><e6><99>迂憭犖<e9>隤嚗<bb><87>府<e6><9c>誑瘞蜓暺函<9a>澈<e5><88>奎<e9><e3>€瞍<e6><9c><b8><bc><bb><b9>蝐脣之摰塚<bc><b8><a6><86><81>蝔格<97>郁閬降憿<bc><9b><e3>€<88>€停<e6>銝€<e5>€<bb><e7><9a>車<e6><97>郁閬<9c>振<e3>€<bc><b2><9c>遙雿€鈭箇<e6><9c><e8>憭<e5><96>€<bb>€<bd><bb><a3><85><bc><e6><8c><b7><e3>€<b8>誨銵冽<88><b8><aa>暺犖<e7><9a><94><e9><87><a6><bc><b8>誨銵冽<88><b8><e6><8c>戊甈<bc><b9><b8>誨銵冽<88><b8><e6><8c><90>€批<a9>宏<e3>€€<bc><9e><ad><84><bc><b6><90><a4><e5>撠<bc><89>"
# Download one article page and return the text of its summary element.
#
# Args:
#   newsurl: URL of a single article page.
# Returns:
#   A length-one character vector holding the text of the first element with
#   id "summary", or NA_character_ when the page has no such element.
getArticle <- function(newsurl){
  summary_nodes <- read_html(newsurl) %>% html_nodes('#summary')
  # Without this guard a page lacking '#summary' would produce character(0),
  # which makes callers that map this function over many links (e.g. sapply)
  # fall back to a list instead of a character vector.
  if (length(summary_nodes) == 0) {
    return(NA_character_)
  }
  # The UTF-8 -> UTF-8 iconv() pass is kept from the original;
  # NOTE(review): presumably a locale/display workaround -- confirm.
  summary_nodes %>% .[1] %>% html_text() %>% iconv(from='UTF-8',to='UTF-8')
}
getArticle('http://www.appledaily.com.tw/realtimenews/article/international/20161119/992972/%E5%A4%A7%E9%A6%AC%E8%90%AC%E4%BA%BA%E7%A4%BA%E5%A8%81%E8%A6%81%E6%B1%82%E7%B4%8D%E5%90%89%E4%B8%8B%E5%8F%B0%E3%80%80%E5%A4%9A%E5%90%8D%E6%B0%91%E9%81%8B%E9%A0%98%E8%A2%96%E8%A2%AB%E8%A3%9C')
[1] "擐砌<be>正鈭<a6><e5><90><9a>隞予<e6><9c>之閬芋<e9><9b><9c><bc><a6><b1><b6><ab>痕<e8><85><9a><a6>蝝<90><bc>ajib Razak嚗<b8>嚗郎<e6><e5><e9>隞亙<be><bc>身蝡蝺<e6>撌游ㄚ銝<b9>恥<e3>€€<e9><9b><9c><89><bc>韏瑚<bb>活<e9><81><a1><9a><b0><81><b5><b9>€嗾瘛刻<88>撟喲<e8><88><e7><9b>€<bc>ersih嚗楊<e9><e7><9b><bc><a4><90><a0><a2>◤<e6><8d><bc><b8><81>楊<e9><e7><9b>撥隤蹂<b8><95><89><a3>€<9b><9c>犖憯怎忽銝<bb>銵<a3><e6><9c>憭€暺<9b><90><be><96><a7><81><a1><bc><e7><9a><e6><e7>蝡誨<e5>嚗<bb>€<ab><e5><e8><99>€<e6><95><b0>蜓<e3>€<bc><e5>瘚拙之<e3>€郎<e6><e5><89>撣閮剔<ab>楝<e9><9a><bc><b0><8e><89><be>€<e7>蝡誨<e5><e7><9a><81>楝嚗蒂<e5><e5><8b><e6>霅血<af><92>偌<e7>頠<ad><be><e3>€<81><a1><9c><96><bc><9c><e6><8c>摨<9a>€<b4>‵頠€<e7>嚗隞賭犖<e8><88>€<bb>‵頠€<ba><e5><e5><9b><bc>兢<e8>€<e6><9c><e7><94><a1><aa>€€楊<e9><e7><9b><9a><a0><a2>漲<e5>蝐脫<e6><8c>€<b8><a6><a1><93>郎<e6><e9>蝺<bc><b9><b8><a6><e7><e6><e5><8a><8a><e3>€<e9><81><a1><89><a4><bc>郎<e6><e5>鈭<aa><93><90>瘛券<e7><9b><9a>齒<e5>摰歹<bc><8b><95>蜓撣剔暻<ba>嚗aria Chin嚗<ad><a4><90><a0><a2><bc>蒂<e5><85><e9><e8><e3>€<89><a9><8f><8a>€銵閉<e5>蝑€楊<e9><e7><9b>蝷曆漱蝬脩撣<bc><8c>€<9b><e7>撅€銝<93><89>挾閬甇X<88>€<bc><bd><86>(<e9><9b><9c><83>)撠<a6><9c><88><a1>€€楊<e9><e7><9b>”蝷箏<e9><9b><9c><be><9c><b7>撅€鈭斗<b6><bc><a6><b1><87>鋡急<8d><a0><a2>€<a6>蝝<90>迤<e5>蝘陌擐<e5>擐砍<e7>鈭云蝬<90>陳<e6><9c><bc><bb>◤<e6><8c>€<81>收靘正鈭蜓甈撅<e9><87>€<b8>€擐祉撅€<bc><88>1MDB嚗<bc><8a><8a>7<e5><84><be><85><bc><88>224<e5><84><85>撟<a3>嚗<a7><af><ad><e8>撌梁<9a><a7>犖<e6><e5><e3>€<bc><9c><9a>葉敹<bc><b6><90><a4><e5>撠<bc><9c><9a><e8><81><b8><bc>嚗翰靘<98><9e><b9><91><e7><90><88><8c><ae><9a>"

增加內文資訊

library(rvest)
# Fetch the article body for every link and store it as a character column.
# vapply() guarantees a character vector (sapply() silently returns a list
# when any result is not length one); a page that yields no text becomes NA.
applenews$content <- vapply(
  applenews$link,
  function(link) {
    text <- getArticle(link)
    if (length(text) == 0) NA_character_ else text[1]
  },
  character(1)
)

列印applenews

head(applenews)

儲存資料

write.csv(applenews, 'apple.csv')

sapply

# sapply() calls sum on each list element and simplifies the results into a
# plain numeric vector.
a <- list(c(1, 2, 3, 4, 5), c(6, 7, 8))
sapply(X = a, FUN = sum)

InfoLite

