- 背景描述
- 資料樣貌
- 資料視覺化
2018/6/4
背景 : Youtube為目前全球著名的影音分享平台,每年會提供最熱門的視頻名單。透過數據,我們可以了解不同國家熱門的影片的類別,與其他是數據變項之間的關聯。
方法 : 依據現有資料,找出有用的變項內容做視覺化的圖表,進行探索式資料分析(EDA),從中發現有趣的現象。
資料來源:Kaggle Trending YouTube Statistics
# JSON (大同小異,只差在US多了category_id = 29)
us_cat_json <- fromJSON("US_category_id.json")
US_category <- as.data.frame(cbind(us_cat_json[["items"]][["id"]],
us_cat_json[["items"]][["snippet"]][["title"]]))
# read in data
ca <- fread("CAvideos.csv", encoding = "UTF-8", nrows = 10000)
ca[,"Location":="CA"] # add location name
de <- fread("DEvideos.csv", encoding = "UTF-8", nrows=10000)
de[,"Location":="DE"]
fr <- fread("FRvideos.csv", encoding = "UTF-8", nrows=10000)
fr[,"Location":="FR"]
gb <- fread("GBvideos.csv", encoding = "UTF-8", nrows=10000)
gb[,"Location":="GB"]
us <- fread("USvideos.csv", encoding = "UTF-8", nrows=10000)
us[,"Location":="US"]
videos <- as.data.table(rbind(gb,fr,ca,us,de))
# 改變時間格式
videos$trending_date <- ydm(videos$trending_date)
videos$publish_date <- ymd(substr(videos$publish_time, start = 1,
stop = 10))
# 發佈到上熱門間隔了幾天
videos$dif_days <- videos$trending_date-videos$publish_date
# 發布時間
videos$publish_timing <- substr(videos$publish_time, start = 12,
stop = 13)
# 計算比率(以views為分母)
videos[,"Percent_Likes" := round((100*(likes)/views), digits = 4)]
videos[,"Percent_Dislikes" := round((100*(dislikes)/views), digits = 4)]
videos[,"Percent_Comment" := round((100*(comment_count)/views),
digits = 4)]
# 將對應的類別名稱輸入
videos$category_name <- US_category$V2[match(videos$category_id,
US_category$V1)]
corrplot.mixed(corr = cor(videos[,c("views","likes","dislikes",
"comment_count")]),tl.col = "black")
topch <- videos[,.N,by = channel_title][order(-N)][1:10] ggplot(topch,aes(reorder(channel_title, N), N)) + geom_bar(stat="identity", fill = "#66CDAA") + guides(fill="none") + labs(x = "Channel title", y = "Number of videos")+ coord_flip()
ggplot(videos,aes(as.factor(videos$publish_timing))) + geom_bar(fill = "#66CDAA") + labs(x = "Timing", y = "Count of videos")
ggplot(videos[dif_days<30],aes(as.factor(dif_days), group = Location)) + geom_point(stat = "count", size = 2, aes(color = Location)) + geom_line(stat = "count", size = 1.2, aes(color = Location)) + labs(x = "Days to trend", y = "Count of videos")
trend_day <- videos[,.N,by = c("video_id", "Location")]
ggplot(trend_day,aes(N, group = Location)) +
geom_point(stat = "count", size = 2, aes(color = Location)) +
geom_line(stat = "count", size = 1.2, aes(color = Location)) +
labs(x = "Trending days", y = "Count of videos")
ggplot(videos,aes(x = reorder(category_name, category_name,
function(x) + length(x)))) +
geom_bar(fill = "#66CDAA") +
facet_wrap(~ Location) +
labs(x = "Categories", y = "Count of videos") + coord_flip()
ggplot(videos, aes(Location, views, color = Location)) + stat_summary(fun.data = mean_se, geom = "pointrange", size = 1.1) + labs(x = "Location", y = "Mean of views")
mViews <- videos[,.("MeanViews"= round(mean(views, na.rm = T)/10^6,
digits = 4)), by = c("category_id", "Location")]
ggplot(mViews, aes(as.factor(category_id), MeanViews)) +
geom_point(colour = "#66CDAA", size = 2) +
facet_wrap(~ Location) +
labs(x = "Categories", y = "Mean of views (million)")
videos %>% group_by(Location) %>%
summarise(m_Percent_Like = mean(Percent_Likes),
se_Percent_Like = sd(Percent_Likes)/sqrt(n())) %>%
ggplot() + aes(reorder(Location, -m_Percent_Like), m_Percent_Like) +
geom_bar(stat = "identity", aes(fill = Location)) +
geom_errorbar(aes(ymin = m_Percent_Like - se_Percent_Like,
ymax = m_Percent_Like + se_Percent_Like),
width = .2) +
guides(fill="none") +
labs(x = "Countries", y = "Mean Percent of likes")
videos[videos$Location == "US"] %>% group_by(category_name) %>%
summarise(m_Percent_Likes = mean(Percent_Likes),
se_Percent_Likes = sd(Percent_Likes)/sqrt(n())) %>%
ggplot() +
aes(reorder(category_name, -m_Percent_Likes), m_Percent_Likes) +
geom_bar(stat = "identity", fill = "#66CDAA") +
geom_errorbar(aes(ymin = m_Percent_Likes - se_Percent_Likes,
ymax = m_Percent_Likes + se_Percent_Likes),
width = .2) +
labs(x = "Category name", y = "Mean Percent(%)",
title= "Mean Percent of likes by Categories in US") +
theme(axis.text.x = element_text(angle = 45,hjust = 1))
### 文字轉換 function
transformer <- function(x){
Text<- Corpus(VectorSource(x))
Text <- Text %>% tm_map(tolower) %>%
tm_map(stripWhitespace)%>%
tm_map(removeWords,stopwords("english"))
}
### 字頻矩陣 function
freq_matrix <- function(y) {
m <- TermDocumentMatrix(transformer(y)) %>% as.matrix()
v <- sort(rowSums(m), decreasing = T)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
}
# 字頻矩陣 freq_matrix(transformer(us$tags))
word freq video video 2009 2017 2017 1870 funny funny 1730 christmas christmas 1688 news news 1622 new new 1582 music music 1487 show show 1445 comedy comedy 1177 makeup makeup 1074
# 文字雲 (各國語言不一,用US做範例)
wordcloud(transformer(us$tags), max.words = 50,
random.order = F, colors = brewer.pal(8,"Dark2"))