使用 dplyr 分析資料

載入 dplyr 套件（若尚未安裝，請先執行 install.packages）

# install.packages("dplyr")
library(dplyr)

原先 R 提供的過濾功能

# Base R row filtering: keep every column of the rows whose category is "娛樂".
appledaily[appledaily[["category"]] == "娛樂", ]

dplyr 的過濾功能

# The same row filter written with dplyr: filter() keeps the rows for which
# the condition is TRUE.
appledaily %>%
  filter(category == "娛樂")

可以使用 AND (&)、OR (|) 與 IN (%in%) 來過濾資料

# AND: a row is kept only when both conditions hold.
appledaily %>%
  filter(category == "娛樂" & view_cnt > 1000)

# OR: a row is kept when at least one of the conditions holds.
appledaily %>%
  filter(category == "娛樂" | view_cnt > 1000)

# IN: a row is kept when its category is one of the listed values.
appledaily %>%
  filter(category %in% c("娛樂", "社會"))

選擇欄位

# Base R column selection: pick columns by name. drop = FALSE states
# explicitly that the result stays a table (it already does for two columns).
appledaily[, c("category", "view_cnt"), drop = FALSE]

# dplyr column selection: select() takes bare column names.
appledaily %>%
  select(category, view_cnt)

同時選擇欄位又過濾資料

# Combine row filtering and column selection in one pipeline. Filtering first
# and selecting afterwards yields the same rows and columns as the reverse
# order, because the filter column is one of the selected columns.
appledaily %>%
  filter(category == "社會") %>%
  select(category, view_cnt)

資料做排序

# arrange() sorts the rows; with a bare column it sorts in ascending order.
appledaily %>%
  filter(category == "社會") %>%
  select(category, view_cnt) %>%
  arrange(view_cnt)

# Wrap the column in desc() to sort from largest to smallest.
appledaily %>%
  filter(category == "社會") %>%
  select(category, view_cnt) %>%
  arrange(desc(view_cnt))

新增欄位 (mutate)

# Total view count over all rows; used further down as the denominator of the
# `portion` column.
# na.rm = TRUE skips missing view counts so that a single NA does not turn the
# total (and therefore every portion) into NA. This matches the na.rm = TRUE
# already used by the grouped sum later in this file.
freqsum <- appledaily %>%
  select(view_cnt) %>%
  sum(na.rm = TRUE)
# Preview a derived column with mutate(): each row's share of the total views.
# The result is only printed here, not stored.
appledaily %>%
  mutate(portion = view_cnt / freqsum) %>%
  select(title, category, view_cnt, portion)
# Persist the new column by assigning the mutated table back to appledaily.
appledaily <- mutate(appledaily, portion = view_cnt / freqsum)

分組計算 (group_by, summarise)

# Total views per category: group the rows by category, then collapse each
# group to a single sum. Missing view counts are ignored.
summarise(
  group_by(appledaily, category),
  view_sum = sum(view_cnt, na.rm = TRUE)
)

# Sum several columns per category in one call.
# across() replaces summarise_each()/funs(), which are deprecated in dplyr.
# With a single function the result columns keep the input column names.
# na.rm = TRUE ignores missing values, consistent with the other grouped sum
# in this file.
appledaily %>%
  group_by(category) %>%
  summarise(across(c(view_cnt, portion), ~ sum(.x, na.rm = TRUE)))

# Minimum and maximum per category for every column whose name matches
# "view_cnt".
# across() with a named list of functions replaces the deprecated
# summarise_each()/funs(); the result columns are named <column>_<function>,
# i.e. view_cnt_min and view_cnt_max.
appledaily %>%
  group_by(category) %>%
  summarise(
    across(
      matches("view_cnt"),
      list(
        min = ~ min(.x, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE)
      )
    )
  )

一般計數

# Plain row count: n() gives the number of rows in the current group, which
# here is the whole table.
# Written with summarise() because summarise_each()/funs() are deprecated in
# dplyr. The result column is named `n`, so it reads as a count rather than as
# a category value.
appledaily %>%
  summarise(n = n())

# Distinct count: the number of different category values.
# Written with summarise() because summarise_each()/funs() are deprecated in
# dplyr.
appledaily %>%
  summarise(n_category = n_distinct(category))

# Per-category view totals, stored for the plots that follow.
# na.rm = TRUE keeps a missing view count from turning a category total into
# NA (pie() rejects NA values), and matches the grouped sum earlier in this
# file.
# category is converted to a factor because the bar chart below passes it as
# the `col` argument.
cat_stat <- appledaily %>%
  group_by(category) %>%
  summarise(view_sum = sum(view_cnt, na.rm = TRUE)) %>%
  mutate(category = as.factor(category))

繪圖

# Bar chart of total views per category; the category column supplies both the
# bar labels and the bar colours.
barplot(
  cat_stat$view_sum,
  names.arg = cat_stat$category,
  col = cat_stat$category
)

# Pie chart of the same totals. The `labels` argument is spelled out in full
# rather than relying on partial argument matching.
pie(cat_stat$view_sum, labels = cat_stat$category)