使用dplyr 分析資料
使用dplyr
# install.packages("dplyr")
library(dplyr)
原先R 提供的過濾功能
# Base R row filter: keep the rows whose category is "娛樂".
# which() converts the logical mask into row positions, so rows where category
# is NA are dropped (as dplyr::filter() does) instead of being returned as
# all-NA phantom rows.
appledaily[which(appledaily$category == "娛樂"), ]
dplyr 的過濾功能
# dplyr row filter: keep the rows whose category is "娛樂".
appledaily %>%
  filter(category == "娛樂")
可以使用 AND, OR 與 IN 來過濾資料
# AND: conditions separated by commas inside filter() are combined with "&".
appledaily %>%
  filter(category == "娛樂", view_cnt > 1000)

# OR: a row is kept when at least one of the two conditions holds.
appledaily %>%
  filter(category == "娛樂" | view_cnt > 1000)

# IN: a row is kept when its category is any member of the given set.
appledaily %>%
  filter(category %in% c("娛樂", "社會"))
選擇欄位
# Column selection with base R (list-style indexing by column name).
appledaily[c("category", "view_cnt")]

# Column selection with dplyr.
appledaily %>%
  select(category, view_cnt)
同時選擇欄位又過濾資料
# Keep only the two columns of interest, then restrict to the "社會" rows.
appledaily %>%
  select(category, view_cnt) %>%
  filter(category == "社會")
資料做排序
# arrange() sorts the rows; the default order is ascending.
appledaily %>%
  select(category, view_cnt) %>%
  filter(category == "社會") %>%
  arrange(view_cnt)

# Wrap the column in desc() to sort from largest to smallest.
appledaily %>%
  select(category, view_cnt) %>%
  filter(category == "社會") %>%
  arrange(desc(view_cnt))
新增欄位 (mutate)
# Total number of views over all rows. Missing view counts are skipped so that
# a single NA does not turn the total (and every share derived from it) into NA.
freqsum <- sum(appledaily$view_cnt, na.rm = TRUE)

# Preview: add a column holding each row's share of the total views.
appledaily %>%
  select(title, category, view_cnt) %>%
  mutate(portion = view_cnt / freqsum)

# Store the new column on the full data set so later steps can use it.
appledaily <- appledaily %>%
  mutate(portion = view_cnt / freqsum)
分組計算 (group_by, summarise)
# Total views per category (missing view counts are ignored).
appledaily %>%
  group_by(category) %>%
  summarise(view_sum = sum(view_cnt, na.rm = TRUE))

# Per-category sums of several columns at once.
# across() replaces the deprecated summarise_each()/funs() pair; with a single
# function the output columns keep the input column names.
appledaily %>%
  group_by(category) %>%
  summarise(across(c(view_cnt, portion), sum))

# Per-category minimum and maximum of every column whose name matches
# "view_cnt". The named list yields columns suffixed with _min and _max.
appledaily %>%
  group_by(category) %>%
  summarise(
    across(
      matches("view_cnt"),
      list(
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      )
    )
  )
一般計數
# Plain count: the number of rows in the data set.
# Written with summarise() directly instead of the deprecated
# summarise_each()/funs() pair; the result column is still named `category`.
appledaily %>%
  summarise(category = n())

# Distinct count: the number of different category values.
appledaily %>%
  summarise(category = n_distinct(category))
# Per-category view totals used by the plots.
# na.rm = TRUE keeps a missing view count from turning a category total into NA.
cat_stat <- appledaily %>%
  group_by(category) %>%
  summarise(view_sum = sum(view_cnt, na.rm = TRUE))

# Convert category to a factor so it can also be passed as the colour argument.
cat_stat$category <- as.factor(cat_stat$category)
繪圖
# Bar chart of total views per category, one bar per category, coloured by the
# category factor.
barplot(
  cat_stat$view_sum,
  names.arg = cat_stat$category,
  col = cat_stat$category
)

# Pie chart of the same totals. The argument is spelled out as `labels` rather
# than relying on partial matching of `label`.
pie(cat_stat$view_sum, labels = cat_stat$category)