Homework2

library(httr)
#GET('http://news.ltn.com.tw/list/BreakingNews')


library(rvest)
## Loading required package: xml2
ltn <- read_html('http://news.ltn.com.tw/list/BreakingNews') %>% html_nodes('.lipic')
#as.character(ltn[1])

title <- ltn %>% html_nodes('a.picword') %>% html_text()
url   <- ltn %>% html_nodes('a.picword') %>% html_attr('href')
datetime <- ltn %>% html_nodes('span') %>% html_text()

ltn_news <- data.frame(title = title, url = url, datetime = datetime)
View(ltn_news)

Shopee.tw

library(httr)
?POST
## starting httpd help server ...
##  done
url <- 'https://shopee.tw/api/v1/items/'
payload <- '{"item_shop_ids":[{"itemid":19785078,"adsid":0,"shopid":5435363,"campaignid":0},{"itemid":5595311,"adsid":0,"shopid":118045,"campaignid":0},{"itemid":30193672,"adsid":0,"shopid":4067127,"campaignid":0},{"itemid":2330199,"adsid":0,"shopid":196610,"campaignid":0},{"itemid":5778442,"adsid":0,"shopid":1691271,"campaignid":0},{"itemid":15001686,"adsid":0,"shopid":4634210,"campaignid":0},{"itemid":21807228,"adsid":0,"shopid":5435363,"campaignid":0},{"itemid":29047156,"adsid":0,"shopid":5435363,"campaignid":0},{"itemid":17075672,"adsid":0,"shopid":3434330,"campaignid":0},{"itemid":12566028,"adsid":0,"shopid":3418898,"campaignid":0},{"itemid":7569360,"adsid":0,"shopid":3051849,"campaignid":0},{"itemid":29678731,"adsid":0,"shopid":4063412,"campaignid":0},{"itemid":4143428,"adsid":0,"shopid":1794309,"campaignid":0},{"itemid":22324648,"adsid":0,"shopid":1249908,"campaignid":0},{"itemid":7658646,"adsid":0,"shopid":1975307,"campaignid":0},{"itemid":14969456,"adsid":0,"shopid":4634210,"campaignid":0},{"itemid":14875124,"adsid":0,"shopid":13982,"campaignid":0},{"itemid":10261379,"adsid":0,"shopid":3712151,"campaignid":0},{"itemid":5398424,"adsid":0,"shopid":118223,"campaignid":0},{"itemid":4854194,"adsid":0,"shopid":1982863,"campaignid":0},{"itemid":29895576,"adsid":0,"shopid":2519614,"campaignid":0},{"itemid":12256422,"adsid":0,"shopid":169631,"campaignid":0},{"itemid":14932484,"adsid":0,"shopid":482725,"campaignid":0},{"itemid":16693567,"adsid":0,"shopid":4930205,"campaignid":0},{"itemid":11717734,"adsid":0,"shopid":943728,"campaignid":0},{"itemid":5556010,"adsid":0,"shopid":2453776,"campaignid":0},{"itemid":19373566,"adsid":0,"shopid":3979419,"campaignid":0},{"itemid":1274446,"adsid":0,"shopid":633256,"campaignid":0},{"itemid":4483565,"adsid":0,"shopid":810776,"campaignid":0},{"itemid":21312569,"adsid":0,"shopid":12278,"campaignid":0},{"itemid":8352857,"adsid":0,"shopid":633256,"campaignid":0},{"itemid":21419323,"adsid":0,"shopid":4696764,"campaignid":0},{"itemid":28965921,"adsid":0,"shopid":3377331,"campaignid":0},{"itemid":20036765,"adsid":0,"shopid":4634210,"campaignid":0},{"itemid":8035333,"adsid":0,"shopid":633256,"campaignid":0},{"itemid":21415348,"adsid":0,"shopid":4696764,"campaignid":0},{"itemid":5354695,"adsid":0,"shopid":633256,"campaignid":0},{"itemid":20597384,"adsid":0,"shopid":4696764,"campaignid":0},{"itemid":16303760,"adsid":0,"shopid":4824242,"campaignid":0},{"itemid":15422537,"adsid":0,"shopid":3626349,"campaignid":0},{"itemid":9781824,"adsid":0,"shopid":1652921,"campaignid":0},{"itemid":131449,"adsid":0,"shopid":12290,"campaignid":0},{"itemid":10465863,"adsid":0,"shopid":2698155,"campaignid":0},{"itemid":10463269,"adsid":0,"shopid":2698155,"campaignid":0},{"itemid":10452756,"adsid":0,"shopid":2698155,"campaignid":0},{"itemid":10450644,"adsid":0,"shopid":2698155,"campaignid":0},{"itemid":10453441,"adsid":0,"shopid":2698155,"campaignid":0},{"itemid":10455438,"adsid":0,"shopid":2698155,"campaignid":0},{"itemid":13540440,"adsid":0,"shopid":2698155,"campaignid":0},{"itemid":14640624,"adsid":0,"shopid":2698155,"campaignid":0}]}'


shopee <- POST(url, 
     add_headers(
       `User-Agent` = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
       Referer = 'https://shopee.tw/%E6%89%8B%E6%A9%9F%E5%B9%B3%E6%9D%BF%E8%88%87%E5%91%A8%E9%82%8A-cat.70',
       `x-csrftoken` = 'nejFG2gAVqfyxvjTDcqz8WP55XrSgqdD',
       Cookie = 'SPC_IA=-1; SPC_F=JM49YcKaERKGJGQZzjnXv5s6Ip1pFxZ4; REC_T_ID=d007be74-16b9-11e7-bbb8-246e961721ec; SPC_T_ID="R13wP3bTYRQgb3bQaMQeP1gLVE65uvi9LWf3FyOjA0O7qSQMhYtQ8rqstSWrLhLUI8lXNCk3hHDi7ZvI0F+MvDAJe3/26gyexmQQS7I7UF8="; SPC_IA_U=-; SPC_T_IV="98O2upWiCsUC2HxJ7+dQoA=="; csrftoken=nejFG2gAVqfyxvjTDcqz8WP55XrSgqdD; _gat=1; _ga=GA1.2.1232379326.1491037300; SPC_EC=-; SPC_SI=znchksgsgebh01cficchzu63ua5h034k'
     ),
     body = payload,
     encode = 'json')

#content(shopee)

RSelenium

create table customer( id int AUTO_INCREMENT PRIMARY KEY, name varchar(50), gender varchar(1), address varchar(200) ) ENGINE=InnoDB;

describe customer;

ALTER TABLE customer; CHANGE COLUMN name cname VARCHAR(50) NOT NULL, ADD COLUMN phone VARCHAR(10);

describe customer;

insert into customer(cname, gender, address) values(‘John’, ‘M’, ‘Chiayi’); insert into customer(cname, gender, address) values(‘Mary’, ‘F’, ‘Tainan’); insert into customer(cname, gender, address) values(‘Brad’, ‘M’, ‘Chiayi’);

select * from customer; select cname, gender from customer; select cname, gender from customer where gender = ‘M’;

select count() from customer; select gender, count() from customer group by gender; select gender, count() from customer group by gender having count() >= 2;

update customer set cname = ‘Johnny’ where id = 1;

create index idx_customer on customer(cname);

create index idx_customer2 on customer(cname, gender);

create index idx_customer3 on customer(cname) using BTREE;

delete from customer
where id =1;

truncate customer;

drop table customer;

``` ## 使用R 連結JDBC

#install.packages("rJava")
library(rJava)

#install.packages("RJDBC")
library(RJDBC)
## Loading required package: DBI
jar.loc <- 'C:\\Program Files (x86)\\MySQL\\Connector.J 5.1\\mysql-connector-java-5.1.40-bin.jar'
drv <- JDBC("com.mysql.jdbc.Driver",
           jar.loc,
           identifier.quote="`")
conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")

data(iris)
dbWriteTable(conn, "iris", iris)
## [1] TRUE
dbListTables(conn)
## [1] "applenews" "iris"      "news_main"
df <- dbGetQuery(conn, 'select * from iris')
df
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica
df2 <- dbGetQuery(conn, 'select species, avg(`Petal.Length`) from iris group by species')
df2
##      Species avg(`Petal.Length`)
## 1     setosa               1.462
## 2 versicolor               4.260
## 3  virginica               5.552
dbExistsTable(conn,'iris')
## [1] TRUE
if(dbExistsTable(conn,'iris')){
     dbRemoveTable(conn, 'iris')
}
## logical(0)
dbListTables(conn)
## [1] "applenews" "news_main"
dbDisconnect(conn)
## [1] TRUE

將蘋果新聞存進資料庫

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')


library(RJDBC)
jar.loc <- 'C:\\Program Files (x86)\\MySQL\\Connector.J 5.1\\mysql-connector-java-5.1.40-bin.jar'
drv <- JDBC("com.mysql.jdbc.Driver",
           jar.loc,
           identifier.quote="`")
conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")
#dbWriteTable(conn, "applenews", appledaily)
#CREATE TABLE `news_main` (
#  `content` text,
#  `title` varchar(1000) DEFAULT NULL,
#  `dt` datetime DEFAULT NULL,
#  `category` varchar(100) DEFAULT NULL,
#  `view_cnt` int(11) DEFAULT NULL
#) ENGINE=InnoDB DEFAULT CHARSET=utf8

applenews$view_cnt <- as.integer(applenews$view_cnt )
str(applenews)
dbWriteTable(conn, 'news_main', applenews, append=TRUE,row.names=FALSE,overwrite=FALSE)

df3 <- dbGetQuery(conn, 'select * from news_main')
res <- dbGetQuery(conn, "SELECT category, count(*) FROM news_main group by category")

res <- dbGetQuery(conn, "SELECT title FROM news_main order by view_cnt desc limit 1")
res

res <- dbGetQuery(conn, "SELECT category, sum(view_cnt) FROM news_main group by category")
res

res <- dbGetQuery(conn, "SELECT category, avg(view_cnt) FROM news_main group by category order by 2 desc")
barplot(height = res$`avg(view_cnt)`, names.arg = res$category, col = as.factor(res$category) )
dbDisconnect(conn)

使用R 作敘述性統計

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
mean(iris[iris$Species == 'setosa', 'Petal.Length'])
## [1] 1.462
tapply(iris$Petal.Length, iris$Species, mean )
##     setosa versicolor  virginica 
##      1.462      4.260      5.552

使用dplyr

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')
# R
applenews[applenews$category == '娛樂',]

# dplyr
library(dplyr)
filter(applenews,category == '娛樂')
filter(applenews, category == "娛樂" & view_cnt > 1000)
filter(applenews, category == "娛樂" | view_cnt > 1000)
filter(applenews, category %in% c("娛樂", "社會") )

# Choose Column
applenews[,c('title', 'view_cnt')]
select(applenews, title, view_cnt)


applenews %>%
  select(title,view_cnt) %>%
  filter(view_cnt > 1000)


#使用Arrange 可以將資料做排序
applenews %>% 
    select(title, category,view_cnt) %>% 
    filter(category == "社會") %>%
    arrange(view_cnt)

#由大到小排序 (desc)
applenews %>% 
    select(title, category,view_cnt) %>% 
    filter(category == "社會") %>%
    arrange(desc(view_cnt))


#計算總和
freqsum <- applenews %>%
     select(view_cnt) %>%
     sum()
freqsum
# 使用mutate 新增欄位
applenews %>%
    select(title, category,view_cnt) %>%
    mutate(portion= view_cnt/freqsum)
#儲存新欄位
applenews <- applenews %>% mutate(portion= view_cnt/freqsum)


applenews %>%
    group_by(category) %>%
    summarise(view_sum = sum(view_cnt, na.rm=TRUE)) %>% arrange(desc(view_sum))

applenews %>%
    group_by(category) %>%
    summarise_each(funs(sum), view_cnt, portion) %>% arrange(desc(portion))



applenews %>%
   group_by(category) %>%
   summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("view_cnt"))


applenews %>%
   group_by(category) %>%
   summarise_each(funs(sum(., na.rm=TRUE), mean(., na.rm=TRUE)), matches("view_cnt"))

#一般計數
 applenews %>%
     select(category) %>%
     summarise_each(funs(n()))

#不重複計數
applenews %>%
    select(category) %>%
    summarise_each(funs(n_distinct(category)))


#取得統計數
cat_stat <- applenews %>%
    group_by(category) %>%
    summarise(view_sum = sum(view_cnt)) %>%
    arrange(desc(view_sum))
cat_stat
cat_stat$category <- as.factor(cat_stat$category)
#繪圖
barplot(cat_stat$view_sum, names.arg=cat_stat$category, col=cat_stat$category)

pie(cat_stat$view_sum, label = cat_stat$category)


applenews %>%
    group_by(category) %>%
    summarise(view_cnt = n()) %>%
    arrange(desc(view_cnt))
applenews %>%
    group_by(category) %>%
    summarise(view_mean = mean(view_cnt)) %>%
    arrange(desc(view_mean))

Post analysis

applenews  %>% select(dt) %>% head() %>% format('%H')

applenews$hour <- format(applenews$dt, '%H')


cnt_by_hour <- applenews %>% filter(dt >= '2016-04-14' & dt < '2016-04-15') %>% select(hour) %>% group_by(hour) %>% summarize(cnt = n())

plot(cnt_by_hour, type = 'b')


?strptime
dates <- c('2017/04/08 16:42:00')
class(dates)
a <- strptime(dates, "%Y/%m/%d %H:%M:%S")
class(a)
format(a, '%H')
format(a, '%y-%m-%d %A')