Demo20180123

Dplyr

Sys.setlocale("LC_ALL", "en_us.UTF-8")

## [1] "en_us.UTF-8/en_us.UTF-8/en_us.UTF-8/C/en_us.UTF-8/C"

data(Titanic)
titanic <- as.data.frame(Titanic)
class(titanic)

## [1] "data.frame"

head(titanic[titanic$Age == 'Child'  , c('Age', 'Freq') ])

##     Age Freq
## 1 Child    0
## 2 Child    0
## 3 Child   35
## 4 Child    0
## 5 Child    0
## 6 Child    0

## SELECT Age, Freq FROM titanic WHERE Age = Child LIMIT 6
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.5

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

titanic %>%
  filter(Age == 'Child') %>%
  select(Age, Freq) %>%
  head()

## Warning: package 'bindrcpp' was built under R version 3.2.5

##     Age Freq
## 1 Child    0
## 2 Child    0
## 3 Child   35
## 4 Child    0
## 5 Child    0
## 6 Child    0

## SELECT Age, Freq FROM titanic WHERE Age = Child ORDER BY Freq LIMIT 6

titanic %>%
  filter(Age == 'Child') %>%
  select(Age, Freq) %>%
  arrange(Freq) %>%
  head()

##     Age Freq
## 1 Child    0
## 2 Child    0
## 3 Child    0
## 4 Child    0
## 5 Child    0
## 6 Child    0

## SELECT Age, Freq FROM titanic WHERE Age = Child ORDER BY Freq DESC LIMIT 6
titanic %>%
  filter(Age == 'Child') %>%
  select(Age, Freq) %>%
  arrange(desc(Freq)) %>%
  head()

##     Age Freq
## 1 Child   35
## 2 Child   17
## 3 Child   14
## 4 Child   13
## 5 Child   13
## 6 Child   11

freqsum <- titanic %>%
  select(Freq) %>%
  sum()

titanic <- titanic %>%
  select(Sex, Age, Freq) %>%
  mutate(portion = Freq/freqsum)

# SELECT Sex, SUM(Freq) FROM titanic GROUP BY Sex 

titanic %>%
  group_by(Sex) %>%
  summarise(sexsum = sum(Freq, na.rm = TRUE))

## # A tibble: 2 x 2
##   Sex    sexsum
##   <fctr>  <dbl>
## 1 Male     1731
## 2 Female    470

# SELECT Sex, SUM(Freq), SUM(Portion) FROM titanic GROUP BY Sex 

titanic %>%
  group_by(Sex) %>%
  summarise_each(funs(sum), Freq, portion)

## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`

## # A tibble: 2 x 3
##   Sex     Freq portion
##   <fctr> <dbl>   <dbl>
## 1 Male    1731   0.786
## 2 Female   470   0.214

# SELECT Class, min(Freq), max(Freq) FROM titanic GROUP BY Class
titanic <- as.data.frame(Titanic)
titanic %>%
 group_by(Class) %>%
 summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Freq"))

## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`

## # A tibble: 4 x 3
##   Class  Freq_min Freq_max
##   <fctr>    <dbl>    <dbl>
## 1 1st         0        140
## 2 2nd         0        154
## 3 3rd        13.0      387
## 4 Crew        0        670

# SELECT COUNT(Sex) FROM titanic
titanic %>%
  select(Sex) %>%
  summarise_each(funs(n()))

## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`

##   Sex
## 1  32

# SELECT COUNT(DISTINCT(Sex)) FROM titanic
titanic %>%
  select(Sex) %>%
  summarise_each(funs(n_distinct(Sex)))

## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`

##   Sex
## 1   2

# SELECT Sex, Age, SUM(Freq) AS frequency_sum GROUP Sex, Age ORDER BY frequency_sum DESC

titanic %>%
  select(Sex, Age, Freq) %>%
  group_by(Sex, Age) %>%
  summarise(frequency_sum = sum(Freq, na.rm=TRUE)) %>%
  arrange(desc(frequency_sum))

## # A tibble: 4 x 3
## # Groups: Sex [2]
##   Sex    Age    frequency_sum
##   <fctr> <fctr>         <dbl>
## 1 Male   Adult         1667  
## 2 Female Adult          425  
## 3 Male   Child           64.0
## 4 Female Child           45.0

# SELECT Sex, SUM(Freq) FROM titanic GROUP BY Sex
sex_stat <- titanic %>%
  select(Sex, Freq) %>%
  group_by(Sex) %>%
  summarise(sexsum = sum(Freq))

sex_stat

## # A tibble: 2 x 2
##   Sex    sexsum
##   <fctr>  <dbl>
## 1 Male     1731
## 2 Female    470

barplot(sex_stat$sexsum, names.arg = sex_stat$Sex, col=c('blue'))

sex_stat2 <- titanic %>%
  select(Sex, Survived, Freq) %>%
  group_by(Sex, Survived) %>%
  summarise(sexsum = sum(Freq))

#install.packages('tidyr')
library(tidyr)

## Warning: package 'tidyr' was built under R version 3.2.5

titanic_pivot <- spread(sex_stat2, Survived, sexsum, fill = 0)
mosaicplot(titanic_pivot)

survived_stat <- titanic %>% group_by(Survived,Sex) %>% summarise_each(funs(sum), Freq)

## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

survived_tb <- dcast(survived_stat, Survived ~ Sex, value.var="Freq") 

survived_tb

##   Survived Male Female
## 1       No 1364    126
## 2      Yes  367    344

m <- as.matrix(survived_tb[2:3])
barplot(m, legend=c("Perished" ,"Survived"))

分析姓名

babyname <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/WA.TXT', header = FALSE)

colnames(babyname) = c("state", "sex", "year", "name", "freq")

# SELECT name, Freq FROM babyname WHERE year = 2012 AND sex = 'M' ORDER BY Freq DESC LIMIT 10

top10_male <- babyname %>%
  filter(year == 2012, sex == 'M') %>%
  select(name, freq) %>%
  arrange(desc(freq)) %>%
  head(10)


top10_female <- babyname %>%
  filter(year == 2012, sex == 'F') %>%
  select(name, freq) %>%
  arrange(desc(freq)) %>%
  head(10)

male_stat <- babyname %>%
  filter(name %in% top10_male$name, sex == 'M') %>%
  select(year, name, freq)

plot(male_stat$year, male_stat$freq, type = 'n')
c <- 1
for (n in levels(factor(male_stat$name))){
  stat <- male_stat[male_stat$name == n, ]
  lines(freq ~ year, data = stat, col=c)
  c <- c + 1
}
legend("topleft", levels(factor(male_stat$name)), col =1:10, lwd= 1)

female_stat <- babyname %>%
  filter(name %in% top10_female$name, sex == 'F') %>%
  select(year, name, freq)

plot(female_stat$year, female_stat$freq, type = 'n')
c <- 1
for (n in levels(factor(female_stat$name))){
  stat <- female_stat[female_stat$name == n, ]
  lines(freq ~ year, data = stat, col=c)
  c <- c + 1
}
legend("topleft", levels(factor(female_stat$name)), col =1:10, lwd= 1)

使用 rvest

#install.packages('rvest')
library(rvest)

## Warning: package 'rvest' was built under R version 3.2.5

## Loading required package: xml2

newsurl <- 'https://tw.appledaily.com/new/realtime'
apple <- read_html(newsurl, encoding = 'utf-8')
#as.character(apple)

library(httr)
#GET(newsurl)


library(httr)
url <- "https://www.thsrc.com.tw/tw/TimeTable/SearchResult" 
payload <- list(
StartStation='977abb69-413a-4ccf-a109-0272c24fd490', EndStation='fbd828d8-b1da-4b06-a3bd-680cdca4d2cd', SearchDate='2018/01/23',
SearchTime='15:00', 
SearchWay='DepartureInMandarin'
)
res<-POST(url, body=payload, encode="form") 
#res

Magrittr

data(iris)
# R
sum(tail(head(iris), 3)$Sepal.Length)

## [1] 15

# Magrittr
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:tidyr':
## 
##     extract

iris %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()

## [1] 15

Data Cleaning and Converting

newsurl <- 'https://tw.appledaily.com/new/realtime'
library(rvest)
rtddt <- read_html(newsurl) %>% html_nodes('.rtddt a')
title    <- rtddt %>% html_nodes('h1') %>% html_text()
category <- rtddt %>% html_nodes('h2') %>% html_text()
time     <- rtddt %>% html_nodes('time') %>% html_text()
url      <- rtddt %>% html_attr('href')

applenews <- data.frame(title = title, category = category, dt = time, url = url)
#View(applenews)

證交所資料抓取


Sys.setenv(HTTP_PROXY="http://54.36.153.19:8080")


library(httr)
for (i in 1:2){
  print(i)
  GET('http://www.twse.com.tw/exchangeReport/MI_5MINS?response=json&date=&_=1516687032427')
  Sys.sleep(2)
}

library(jsonlite)
j <- fromJSON('http://www.twse.com.tw/exchangeReport/MI_5MINS?response=json&date=&_=1516687032427') 
j$data

抓取內文資訊

Sys.setlocale("LC_ALL", "en_us.UTF-8")

## [1] "en_us.UTF-8/en_us.UTF-8/en_us.UTF-8/C/en_us.UTF-8/C"

library(rvest)
#read_html('https://tw.finance.appledaily.com/realtime/20180123/1283989') %>% html_node('.ndArticle_margin p') %>% html_text()

getArticle <- function(url){
  e <- read_html(url)
  article = e %>% html_nodes('.ndArticle_margin p') %>%
  html_text() %>% paste(., collapse='') 
  title <- e %>% html_nodes('h1') %>% html_text()
  dt    <- e %>% html_nodes('.ndArticle_creat') %>%
  html_text()

  len <- e %>% html_nodes('.ndArticle_view') %>% length()
  clicked <- 0 
  if (len > 0) { 
    clicked <- e %>% html_nodes('.ndArticle_view') %>%
  html_text()
  }

  category <- e %>% html_nodes('.ndgTag .current') %>%
      html_text() %>% .[1]

  data.frame(article = article, title = title, dt=dt, category=category, clicked = clicked, stringsAsFactors = FALSE )
}

#getArticle('https://tw.finance.appledaily.com/realtime/20180123/1283989')


dfall <- data.frame()
getURL <- function(newsurl){
  apple <- read_html(newsurl)
  rtddt <- apple %>% html_nodes('.rtddt a')
  for(ele in rtddt){
    url <- ele %>% html_attr('href')
    #print(url)
    df    <- getArticle(url)
    dfall <- rbind(dfall, df)
  }
  dfall
}

#getURL('https://tw.appledaily.com/new/realtime')


getURL2 <- function(newsurl){
  newsurl <- 'https://tw.appledaily.com/new/realtime'
  apple <- read_html(newsurl)
  rtddt <- apple %>% html_nodes('.rtddt a') %>% html_attr('href')
  news_list <- lapply(rtddt, getArticle )
  do.call(rbind, news_list)
}

#getURL2('https://tw.appledaily.com/new/realtime')

a <- list(c(1,2,3,4), c(5,6))
lapply(a, sum)

## [[1]]
## [1] 10
## 
## [[2]]
## [1] 11

Demo20180123

David Chiu

1/23/2018

Dplyr

分析姓名

使用 rvest

Magrittr

Data Cleaning and Converting

證交所資料抓取

抓取內文資訊