讀取資料
# Read the real-estate transaction data.
# total_price is forced to double: some prices exceed the 32-bit integer
# maximum (2147483647), and when the column is guessed as integer those rows
# fail to parse and silently become NA.
# readr:: is spelled out because library(readr) is only attached later in
# this file.
lvr_prices <- readr::read_csv(
  "C:/Users/USER/lvr_prices_mac.csv",
  col_types = readr::cols(total_price = readr::col_double())
)
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
.default = col_character(),
X1 = col_integer(),
land_sqmeter = col_double(),
trading_ymd = col_date(format = ""),
finish_ymd = col_date(format = ""),
building_sqmeter = col_double(),
room = col_integer(),
living_room = col_integer(),
bath = col_integer(),
total_price = col_integer(),
price_per_sqmeter = col_double(),
parking_sqmeter = col_double(),
parking_price = col_integer()
)
See spec(...) for full column specifications.
32 parsing failures.
row col expected actual
1282 total_price an integer 6700000000
2243 total_price an integer 3882685600
2244 total_price an integer 3373314400
4629 total_price an integer 3050000000
5890 total_price an integer 3133800000
.... ........... .......... ..........
See problems(...) for more details.
使用na.rm 處理missing value
a <- c(1,2,3,4,5,NA)
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
sum(a, na.rm=TRUE)
[1] 15
#?sum
# Mean transaction price for Da'an District (大安區), ignoring NA prices.
mean(lvr_prices[lvr_prices$area == '大安區', ]$total_price, na.rm = TRUE)
[1] 29798170
避免溢位, 使用as.numeric 轉型
# Total transaction value for Da'an District (大安區).
# as.numeric() converts the prices to double before summing so the total
# cannot overflow the integer range.
sum(
  as.numeric(lvr_prices[lvr_prices$area == '大安區', ]$total_price),
  na.rm = TRUE
)
[1] 2.79477e+11
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
做資料排序
# Three highest transaction prices in Zhongshan District (中山區).
sort(
  lvr_prices[lvr_prices$area == '中山區', ]$total_price,
  decreasing = TRUE
)[1:3]
[1] 1850000000 1400000000
[3] 1084948034
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
# Keep the three most expensive transactions in Zhongshan District (中山區).
zhongshan <- lvr_prices[lvr_prices$area == '中山區', ]
# head() caps the result at the rows that exist; indexing rows with [1:3]
# would add all-NA rows when fewer than three transactions match.
zhongshan <- head(
  zhongshan[order(zhongshan$total_price, decreasing = TRUE), ],
  3
)
zhongshan
使用R 讀取csv檔案
# Switch the working directory so the relative 'purchase.csv' path used by
# download.file() and read.csv() below resolves under the Desktop folder.
setwd("C:/Users/USER/Desktop")
The working directory was changed to C:/Users/USER/Desktop inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the the working directory for notebook chunks.
getwd()
[1] "C:/Users/USER/Desktop"
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/purchase.csv', destfile = 'purchase.csv')
trying URL 'https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/purchase.csv'
Content type 'text/plain; charset=utf-8' length 3497968 bytes (3.3 MB)
downloaded 3.3 MB
purchase <- read.csv('purchase.csv')
class(purchase)
[1] "data.frame"
library(readr)
purchase2 <- read_csv("C:/Users/USER/Desktop/purchase.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = col_integer(),
Time = col_datetime(format = ""),
Action = col_character(),
User = col_character(),
Product = col_character(),
Quantity = col_integer(),
Price = col_double()
)
class(purchase2)
[1] "tbl_df" "tbl"
[3] "data.frame"
將資料保存到檔案中
data(iris)
?write.csv
write.csv(x = iris , file = 'iris.csv')
getwd()
[1] "D:/OS DATA/Downloads"
write.table(x = iris, file = 'iris.tab', sep = '\t')
從RData 存取資料
# Round-trip the iris data set through an .RData file:
# show the help page, load the data set, write it to disk, remove it from
# the workspace, then restore it from the file.
help("save")
data("iris")
save(list = "iris", file = 'dt.RData')
rm(list = "iris")
load(file = "dt.RData")
讀取Excel 檔案
library(readxl)
FinancialReport <- read_excel("C:/Users/USER/Desktop/FinancialReport.xlsx")
#View(FinancialReport)
讀取半結構化資料 (JSON)
# install.packages('jsonlite')
library(jsonlite)
# Download the fb.json sample into the working directory, parse it, and
# show the `from` field nested under `data`.
download.file(
  url = 'https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/fb.json',
  destfile = 'fb.json'
)
json_data <- fromJSON(txt = 'fb.json')
json_data$data$from
讀取XML 資料
# install.packages("XML")
library(XML)
# Load the postal-code XML file from the given URL into a data frame.
url <- 'http://download.post.gov.tw/post/download/county_h_10508.xml'
zipcode <- xmlToDataFrame(url)
# Rename the columns: postal code, Chinese name, English name.
names(zipcode) <- c('郵遞區號', '中文名稱', '英文名稱')
開始蒐集網頁資料
#install.packages('rvest')
library(rvest)
package ‘rvest’ was built under R version 3.2.5Loading required package: xml2
Attaching package: ‘rvest’
The following object is masked from ‘package:readr’:
guess_encoding
newsurl <- 'http://www.appledaily.com.tw/realtimenews/section/new/'
apple <- read_html(newsurl)
apple
{xml_document}
<html>
[1] <head>\n <meta charset ...
[2] <body id="article" clas ...
#as.character(apple)
以台灣高鐵為例
library(httr)
package ‘httr’ was built under R version 3.2.5
# Submit the timetable search form to the THSR site.
# The payload holds the form fields: start/end station IDs, search date and
# time, and the search mode.
url <- 'https://www.thsrc.com.tw/tw/TimeTable/SearchResult'
payload <- list(
  StartStation = '977abb69-413a-4ccf-a109-0272c24fd490',
  EndStation   = 'fbd828d8-b1da-4b06-a3bd-680cdca4d2cd',
  SearchDate   = '2016/11/19',
  SearchTime   = '14:30',
  SearchWay    = 'DepartureInMandarin'
)
# encode = "form" sends the list as a URL-encoded form body.
res <- POST(url = url, body = payload, encode = "form")
print(res)
Response [https://www.thsrc.com.tw/tw/TimeTable/SearchResult]
Date: 2016-11-19 13:14
Status: 200
Content-Type: text/html; charset=utf-8
Size: 93.8 kB
<!DOCTYPE html>
<html lang="zh-tw">
<meta charset="utf-8" />
<link rel="shortcut icon" h...
<link rel="apple-touch-icon...
...
台鐵時刻表
url <- 'http://twtraffic.tra.gov.tw/twrail/SearchResult.aspx?searchtype=0&searchdate=2016/11/19&fromstation=1810&tostation=1008&trainclass=%271100%27,%271101%27,%271102%27,%271107%27,%271108%27,%271110%27,%271120%27&fromtime=0600&totime=2359'
read_html(url)
{xml_document}
<html>
[1] <head><meta http-equiv= ...
[2] <body bottommargin="0"> ...
DOM Tree 範例
# Minimal HTML document used to demonstrate CSS selectors: one <h1> with
# id "title" and two <a> elements sharing class "link".
html_sample <- '
<html>
<head>
<style>
#title{color:red;}
.link{font-size:30px;}
</style>
</head>
<body>
<h1 id="title">Hello World</h1>
<a href="#" class="link">This is link1</a>
<a href="# link2" class="link">This is link2</a>
</body>
</html>'
# Parse the string into a document object for the selector queries below.
res <- read_html(html_sample)
# get h1
html_nodes(res , 'h1')
{xml_nodeset (1)}
[1] <h1 id="title">Hello Wo ...
# get a
html_nodes(res , 'a')
{xml_nodeset (2)}
[1] <a href="#" class="link ...
[2] <a href="# link2" class ...
# get id = title, id => #
html_nodes(res , '#title')
{xml_nodeset (1)}
[1] <h1 id="title">Hello Wo ...
# get class = link, class => .
html_nodes(res , '.link')
{xml_nodeset (2)}
[1] <a href="#" class="link ...
[2] <a href="# link2" class ...
# get pure text
html_text(html_nodes(res , '.link'))
[1] "This is link1"
[2] "This is link2"
html_text(html_nodes(res , '#title'))
[1] "Hello World"
# get href
html_attr(html_nodes(res , '.link'), 'href')
[1] "#" "# link2"
使用Magrittr
data(iris)
sum(tail(head(iris, 6), 3)[,3])
[1] 4.6
library(magrittr)
iris %>% head(6) %>% tail(3) %>% .[,3] %>% sum()
[1] 4.6
# html_text(html_nodes(res , '#title'))
res %>% html_nodes('#title') %>% html_text()
[1] "Hello World"
完成列表抓取
# Fetch the real-time news listing page and collect every <a> element found
# under the '.rtddt' selector.
domain <- 'http://www.appledaily.com.tw'
newsurl <- 'http://www.appledaily.com.tw/realtimenews/section/new/'
apple <- read_html(newsurl)
rtddt <- html_nodes(apple, '.rtddt a')
# Inspect the first link's markup as text.
as.character(iconv(rtddt[1], from = 'UTF-8', to = 'UTF-8'))
[1] "<a href=\"/realtimenews/article/entertainment/20161119/993154/蝢<9c><9d><ad><ad><89><8e><9c><93><e7><97>鈭怠ˊ60甇淚" target=\"_blank\">\n <time>21:12</time>\n <h2>憡<a8><82></h2>\n <h1><font color=\"#383c40\">蝢<9c><9d><ad><ad><89><8e><9c><93><e7><97>鈭怠ˊ60甇<b2>(0)</font></h1>\n </a>"
# Extract headline (h1), timestamp (time) and category (h2) text from the
# collected links, passing each through iconv() UTF-8 -> UTF-8.
title <- iconv(html_text(html_nodes(rtddt, 'h1')), from = 'UTF-8', to = 'UTF-8')
time <- iconv(html_text(html_nodes(rtddt, 'time')), from = 'UTF-8', to = 'UTF-8')
category <- iconv(html_text(html_nodes(rtddt, 'h2')), from = 'UTF-8', to = 'UTF-8')
# The href values are site-relative, so prefix them with the domain.
url <- paste0(
  domain,
  iconv(html_attr(rtddt, 'href'), from = 'UTF-8', to = 'UTF-8')
)
# Assemble the pieces into one data frame, keeping text as character.
applenews <- data.frame(
  title = title,
  time = time,
  category = category,
  link = url,
  stringsAsFactors = FALSE
)
paste string
# paste() joins its arguments with a single space unless `sep` says otherwise.
b <- 'world'
a <- 'hello'
paste(a, b, sep = ' ')
[1] "hello world"
paste(a,b, sep='')
[1] "helloworld"
paste0(a,b)
[1] "helloworld"
?paste
get Article
newsurl <- 'http://www.appledaily.com.tw/realtimenews/article/international/20161119/992964/%E8%82%AF%E4%BC%8A%E5%A8%81%E6%96%AF%E7%89%B9%E9%96%8B%E5%94%B1%E3%80%80%E3%80%8C%E8%8B%A5%E6%9C%89%E6%8A%95%E7%A5%A8%E6%9C%83%E6%8A%95%E7%B5%A6%E5%B7%9D%E6%99%AE%E3%80%8D'
read_html(newsurl) %>% html_nodes('#summary') %>% .[1] %>% html_text() %>% iconv(from='UTF-8',to='UTF-8')
[1] "蝢<9c><9d><a3><a5><88><ad><89>隡<a8><e7>嚗anye West嚗<e5><9b><e5><8a><b7><81>镼輸<96><e5>銝<e5><8d><bc><aa>憭扯<81>瘝餉<a7>暺<bb><90><e5>蝎結銵函內嚗<9b><e9><e8><88>憭拐<bb><b2><e6><8a>巨嚗<bd><a6><9e><9c><a6><8a><9c><b0>巨<e6><8a>策<e5><e5><92>豪蝮賜絞<e7><e9>鈭箏<b7> 嚗onald Trump嚗<bc><9c><a9>銝<a7><e7><e3><be><9c><9c>振撱<a3><e6><e5><e5>嚗BC嚗撠<bc>隡<a8><e7><e5><e7><e5>銵函內嚗<88><91>迄雿<88><b2><e6><8a>巨撠嚗<bd><a6><9e><a6><88><e6><8a>巨<e7><9a>店嚗<88><9c><8a>策撌<e3><bb>隞亦嚗<9b><e6><88><e9><9d><a3><bc><88><9b><e6><88><e5><90>犖嚗<89>隞交<88>停蝡隞<e9><82><bc><9b>雓<98><b0>情嚗<bd>皜祆<87><e6><8c><b0>蜓暺其犖<e3>隡<a8><e7><e6><e9>嚗<e5><8f>2020撟渡<be><9c>蜇蝯梧<bc><bd><e6><99>迂憭犖<e9>隤嚗<bb><87>府<e6><9c>誑瘞蜓暺函<9a>澈<e5><88>奎<e9><e3>瞍<e6><9c><b8><bc><bb><b9>蝐脣之摰塚<bc><b8><a6><86><81>蝔格<97>郁閬降憿<bc><9b><e3><88>停<e6>銝<e5><bb><e7><9a>車<e6><97>郁閬<9c>振<e3><bc><b2><9c>遙雿鈭箇<e6><9c><e8>憭<e5><96><bb><bd><bb><a3><85><bc><e6><8c><b7><e3><b8>誨銵冽<88><b8><aa>暺犖<e7><9a><94><e9><87><a6><bc><b8>誨銵冽<88><b8><e6><8c>戊甈<bc><b9><b8>誨銵冽<88><b8><e6><8c><90>批<a9>宏<e3><bc><9e><ad><84><bc><b6><90><a4><e5>撠<bc><89>"
# Fetch an article page and return the text of its first '#summary' node.
#
# Args:
#   newsurl: URL of the article page (anything read_html() accepts).
#
# Returns:
#   A length-one character vector holding the summary text, or NA_character_
#   when the page has no '#summary' node. Always returning exactly one value
#   keeps sapply()/vapply() over many URLs producing a plain character vector
#   instead of a list.
getArticle <- function(newsurl) {
  summary_nodes <- html_nodes(read_html(newsurl), '#summary')
  if (length(summary_nodes) == 0) {
    return(NA_character_)
  }
  iconv(html_text(summary_nodes[1]), from = 'UTF-8', to = 'UTF-8')
}
getArticle('http://www.appledaily.com.tw/realtimenews/article/international/20161119/992972/%E5%A4%A7%E9%A6%AC%E8%90%AC%E4%BA%BA%E7%A4%BA%E5%A8%81%E8%A6%81%E6%B1%82%E7%B4%8D%E5%90%89%E4%B8%8B%E5%8F%B0%E3%80%80%E5%A4%9A%E5%90%8D%E6%B0%91%E9%81%8B%E9%A0%98%E8%A2%96%E8%A2%AB%E8%A3%9C')
[1] "擐砌<be>正鈭<a6><e5><90><9a>隞予<e6><9c>之閬芋<e9><9b><9c><bc><a6><b1><b6><ab>痕<e8><85><9a><a6>蝝<90><bc>ajib Razak嚗<b8>嚗郎<e6><e5><e9>隞亙<be><bc>身蝡蝺<e6>撌游ㄚ銝<b9>恥<e3><e9><9b><9c><89><bc>韏瑚<bb>活<e9><81><a1><9a><b0><81><b5><b9>嗾瘛刻<88>撟喲<e8><88><e7><9b><bc>ersih嚗楊<e9><e7><9b><bc><a4><90><a0><a2>◤<e6><8d><bc><b8><81>楊<e9><e7><9b>撥隤蹂<b8><95><89><a3><9b><9c>犖憯怎忽銝<bb>銵<a3><e6><9c>憭暺<9b><90><be><96><a7><81><a1><bc><e7><9a><e6><e7>蝡誨<e5>嚗<bb><ab><e5><e8><99><e6><95><b0>蜓<e3><bc><e5>瘚拙之<e3>郎<e6><e5><89>撣閮剔<ab>楝<e9><9a><bc><b0><8e><89><be><e7>蝡誨<e5><e7><9a><81>楝嚗蒂<e5><e5><8b><e6>霅血<af><92>偌<e7>頠<ad><be><e3><81><a1><9c><96><bc><9c><e6><8c>摨<9a><b4>‵頠<e7>嚗隞賭犖<e8><88><bb>‵頠<ba><e5><e5><9b><bc>兢<e8><e6><9c><e7><94><a1><aa>楊<e9><e7><9b><9a><a0><a2>漲<e5>蝐脫<e6><8c><b8><a6><a1><93>郎<e6><e9>蝺<bc><b9><b8><a6><e7><e6><e5><8a><8a><e3><e9><81><a1><89><a4><bc>郎<e6><e5>鈭<aa><93><90>瘛券<e7><9b><9a>齒<e5>摰歹<bc><8b><95>蜓撣剔暻<ba>嚗aria Chin嚗<ad><a4><90><a0><a2><bc>蒂<e5><85><e9><e8><e3><89><a9><8f><8a>銵閉<e5>蝑楊<e9><e7><9b>蝷曆漱蝬脩撣<bc><8c><9b><e7>撅銝<93><89>挾閬甇X<88><bc><bd><86>(<e9><9b><9c><83>)撠<a6><9c><88><a1>楊<e9><e7><9b>”蝷箏<e9><9b><9c><be><9c><b7>撅鈭斗<b6><bc><a6><b1><87>鋡急<8d><a0><a2><a6>蝝<90>迤<e5>蝘陌擐<e5>擐砍<e7>鈭云蝬<90>陳<e6><9c><bc><bb>◤<e6><8c><81>收靘正鈭蜓甈撅<e9><87><b8>擐祉撅<bc><88>1MDB嚗<bc><8a><8a>7<e5><84><be><85><bc><88>224<e5><84><85>撟<a3>嚗<a7><af><ad><e8>撌梁<9a><a7>犖<e6><e5><e3><bc><9c><9a>葉敹<bc><b6><90><a4><e5>撠<bc><9c><9a><e8><81><b8><bc>嚗翰靘<98><9e><b9><91><e7><90><88><8c><ae><9a>"
增加內文資訊
library(rvest)
# Download the body text of every article link and store it as a new
# `content` column.
applenews$content <- sapply(X = applenews[['link']], FUN = getArticle)
列印applenews
head(applenews)
儲存資料
write.csv(applenews, 'apple.csv')
sapply
# Sum each vector in the list; sapply() simplifies the per-element results
# into a single numeric vector.
a <- list(as.numeric(1:5), as.numeric(6:8))
sapply(X = a, FUN = sum)
