連結資料庫

library(RJDBC)
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.2.5
## Loading required package: rJava
jar.loc <- 'C:\\Program Files (x86)\\MySQL\\Connector.J 5.1\\mysql-connector-java-5.1.38-bin.jar'
drv <- JDBC("com.mysql.jdbc.Driver",
           jar.loc,
           identifier.quote="`")
conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")

dbDisconnect(conn)
## [1] TRUE

寫入資料進Appledaily 的 iris_test

data(iris)
conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")
dbWriteTable(conn, "iris_test", iris)
dbDisconnect(conn)

SQL 範例

# 從iris_test 中挑選所有欄位
SELECT * FROM appledaily.iris_test;
# 從iris_test 中挑選sepal.length 以及 Species 欄位
SELECT `Sepal.Length`, `Species` FROM appledaily.iris_test;
# 從iris_test 中挑選sepal.length 以及 Species 欄位,並篩選Sepal.Length > 5 的資料
SELECT `Sepal.Length`, `Species` FROM appledaily.iris_test 
    WHERE `Sepal.Length` >=5;
    
# 根據花的種類進行分析    
SELECT AVG(`Sepal.Length`), `Species` FROM appledaily.iris_test 
    GROUP BY `Species`;

表列Appledaily 中的表格

conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")
dbListTables(conn)
dbDisconnect(conn)

讀取iris_test的資料

conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")
df<-dbReadTable(conn,"iris_test")
dbDisconnect(conn)

對MySQL 下Query 語句

conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")
res <- dbGetQuery(conn, 'SELECT avg(`Sepal.Length`) as avg_length, Species FROM iris_test group by Species;')
res
dbDisconnect(conn)

刪除表格

conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")

if(dbExistsTable(conn,'iris_test')){
   dbRemoveTable(conn,'iris_test')
}
dbListTables(conn)
dbDisconnect(conn)

下載檔案 & 將資料塞入 (錯誤版本)

download.file('https://github.com/ywchiu/rtibame/raw/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')

conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")

dbWriteTable(conn, "applenews", applenews)
dbListTables(conn)
dbReadTable(conn,"applenews")
dbDisconnect(conn)

下載檔案 & 將資料塞入 (正確版本)

CREATE TABLE `news_main` (
  `content` text,
  `title` varchar(1000) DEFAULT NULL,
  `dt` datetime DEFAULT NULL,
  `category` varchar(50) DEFAULT NULL,
  `view_cnt` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8

download.file('https://github.com/ywchiu/rtibame/raw/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')

applenews$view_cnt = as.integer(applenews$view_cnt)
conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")

dbWriteTable(conn, 'news_main', applenews, append=TRUE,row.names=FALSE,overwrite=FALSE)
dbDisconnect(conn)

轉型

s = c("1", "2", "3")
class(s)

# to integer
n = as.integer(s)
class(n)

# to character
s2 = as.character(n)
class(s2)

# to factor
c1 = as.factor(s2)
class(c1)

做成圖表


conn <- dbConnect(drv, "jdbc:mysql://localhost/appledaily", "root", "test")
res <- dbGetQuery(conn, "SELECT category, count(*) FROM news_main group by category")
res
str(res)
names(res) = c('category', 'cnt')
pie(res$cnt,labels = res$category)
dbDisconnect(conn)

過濾資料

download.file('https://github.com/ywchiu/rtibame/raw/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')


applenews$view_cnt = as.integer(applenews$view_cnt)
applenews[applenews$category == "娛樂",]
load('applenews.RData')

library(dplyr)
filter(applenews, category == "娛樂")

可以使用 AND, OR 與 IN 來過濾資料

#找出娛樂以及點閱數超過1000的新聞
filter(applenews, category == "娛樂" & view_cnt > 1000)

#找出娛樂或點閱數超過1000的新聞
filter(applenews, category == "娛樂" | view_cnt > 1000)

#找出娛樂與社會新聞
filter(applenews, category %in% c("娛樂", "社會"))

選擇欄位

#原先R 提供的欄位選取
applenews[, c("category","view_cnt")]

#dplyr 的欄位選取
select(applenews,category,view_cnt)

選擇欄位又過濾資料

applenews %>% 
    select(category,view_cnt) %>% 
    filter(category == "社會")

資料做排序

a = applenews[applenews$category == '社會' , c('category', 'view_cnt')]
a[order(a$view_cnt),]

applenews %>% 
  select(category,view_cnt) %>% 
    filter(category == "社會") %>%
    arrange(view_cnt)

抽取時間

a = head(applenews[,'dt'])
as.numeric(format(a, "%H"))


table(format(applenews[,'dt'], '%d'))

hournews <- 
applenews %>% 
select(dt) %>%
filter(format(dt, '%d')  == '14') %>%
mutate(hour= format(dt, '%H')) %>% 
select(hour) %>%
table()

barplot(hournews)
plot(hournews, type='l')

applenews %>% 
select(dt, title) %>%
filter(format(dt, '%H')  == '09' & format(dt, '%d')  == '14')

mutate

#計算總和
freqsum = applenews %>%
     select(view_cnt) %>%
     sum()
#使用mutate 新增欄位
applenews %>%
    select(title, category,view_cnt) %>%
    mutate(portion= view_cnt/freqsum)
#儲存新欄位
applenews = applenews %>% mutate(portion= view_cnt/freqsum)

分組計算 (group_by, summarise)

applenews %>%
    group_by(category) %>%
    summarise(view_sum = sum(view_cnt, na.rm=TRUE)) %>% arrange(view_sum)
    
applenews %>%
    group_by(category) %>%
    summarise(view_mean = mean(view_cnt, na.rm=TRUE)) %>% arrange(view_mean)

統計多個欄位

applenews %>%
    group_by(category) %>%
    summarise_each(funs(sum), view_cnt, portion)

針對多個欄位做統計

applenews %>%
   group_by(category) %>%
   summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE), sum(., na.rm=TRUE), mean(., na.rm=TRUE)), matches("view_cnt"))

資料計數

 applenews %>%
     select(category) %>%
     summarise_each(funs(n()))

 applenews %>%
     select(category) %>%
     summarise_each(funs(n_distinct(category)))

使用直方圖顯示新聞點閱總和

#取得統計數
cat_stat = applenews %>%
    group_by(category) %>%
    summarise(view_sum = sum(view_cnt)) %>%
    arrange(desc(view_sum))
cat_stat$category = as.factor(cat_stat$category)
#繪圖
barplot(cat_stat$view_sum, names.arg=cat_stat$category, col=cat_stat$category)

使用圓餅圖顯示新聞點閱比例

pie(cat_stat$view_sum, label = cat_stat$category)