YouTube 熱門影片數據探索

2018/6/4

大綱

背景描述
資料樣貌
資料視覺化

前導

背景 : Youtube為目前全球著名的影音分享平台，每年會提供最熱門的視頻名單。透過數據，我們可以了解不同國家熱門的影片的類別，與其他是數據變項之間的關聯。
方法 : 依據現有資料，找出有用的變項內容做視覺化的圖表，進行探索式資料分析(EDA)，從中發現有趣的現象。
資料來源：Kaggle Trending YouTube Statistics

資料樣貌

去年11月至今年5月每日前200名熱門影片
五個國家的熱門影片，包含美國、英國、德國、加拿大與法國。
影片內容csv檔、影片類型id json檔
Rows : 三萬多筆
Columns : 影片名稱、頻道名稱、發布日期、影片類型、標示的關鍵字、影片觀看數、按喜歡的數、按不喜歡的數、留言數、影片描述等，共16個變項。

讀入資料

# JSON (大同小異，只差在US多了category_id = 29)
us_cat_json <- fromJSON("US_category_id.json")
US_category <-  as.data.frame(cbind(us_cat_json[["items"]][["id"]], 
us_cat_json[["items"]][["snippet"]][["title"]])) 

# read in data
ca <- fread("CAvideos.csv", encoding = "UTF-8", nrows = 10000)
ca[,"Location":="CA"]   # add location name
de <- fread("DEvideos.csv", encoding = "UTF-8", nrows=10000)
de[,"Location":="DE"]   
fr <- fread("FRvideos.csv", encoding = "UTF-8", nrows=10000)
fr[,"Location":="FR"] 
gb <- fread("GBvideos.csv", encoding = "UTF-8", nrows=10000)
gb[,"Location":="GB"] 
us <- fread("USvideos.csv", encoding = "UTF-8", nrows=10000)
us[,"Location":="US"] 

videos <- as.data.table(rbind(gb,fr,ca,us,de))

新增變項

# 改變時間格式
videos$trending_date <- ydm(videos$trending_date)
videos$publish_date <- ymd(substr(videos$publish_time, start = 1,
                                                        stop = 10))

# 發佈到上熱門間隔了幾天
videos$dif_days <- videos$trending_date-videos$publish_date  
# 發布時間
videos$publish_timing <- substr(videos$publish_time, start = 12,
                                                        stop = 13)
# 計算比率(以views為分母)
videos[,"Percent_Likes" := round((100*(likes)/views), digits = 4)]
videos[,"Percent_Dislikes" := round((100*(dislikes)/views), digits = 4)]
videos[,"Percent_Comment" := round((100*(comment_count)/views),
                                                      digits = 4)]
# 將對應的類別名稱輸入
videos$category_name <- US_category$V2[match(videos$category_id,
                                                  US_category$V1)]

資料探索與視覺化

1. 變項之間的關係

corrplot.mixed(corr = cor(videos[,c("views","likes","dislikes",
                                "comment_count")]),tl.col = "black")

2. 五國之間熱門的頻道

topch <- videos[,.N,by = channel_title][order(-N)][1:10]
ggplot(topch,aes(reorder(channel_title, N), N)) +
  geom_bar(stat="identity", fill = "#66CDAA") +
  guides(fill="none") +
  labs(x = "Channel title", y = "Number of videos")+
  coord_flip()

3. 要在什麼時間發布影片

ggplot(videos,aes(as.factor(videos$publish_timing))) +
  geom_bar(fill = "#66CDAA") +
  labs(x = "Timing", y = "Count of videos")

4. 影片發布多久後上熱門

ggplot(videos[dif_days<30],aes(as.factor(dif_days), group = Location)) +
  geom_point(stat = "count", size = 2, aes(color = Location)) +
  geom_line(stat = "count", size = 1.2, aes(color = Location)) +
  labs(x = "Days to trend", y = "Count of videos")

5. 影片上熱門後持續幾天

trend_day <- videos[,.N,by = c("video_id", "Location")]
ggplot(trend_day,aes(N, group = Location)) +
  geom_point(stat = "count", size = 2, aes(color = Location)) +
  geom_line(stat = "count", size = 1.2, aes(color = Location)) +
  labs(x = "Trending days", y = "Count of videos")

6. 各國熱門影片類型

ggplot(videos,aes(x = reorder(category_name, category_name, 
                              function(x) + length(x)))) +
  geom_bar(fill = "#66CDAA") +    
  facet_wrap(~ Location) +
  labs(x = "Categories", y = "Count of videos") + coord_flip()

7.1 各國影片平均觀看數

ggplot(videos, aes(Location, views, color = Location)) +
  stat_summary(fun.data = mean_se, geom = "pointrange", size = 1.1) +
  labs(x = "Location", y = "Mean of views")

7.2 各國不同類型影片平均觀看數

mViews <- videos[,.("MeanViews"= round(mean(views, na.rm = T)/10^6,
                    digits = 4)), by = c("category_id", "Location")]
ggplot(mViews, aes(as.factor(category_id), MeanViews)) +
  geom_point(colour = "#66CDAA", size = 2) +
  facet_wrap(~ Location) +
  labs(x = "Categories", y = "Mean of views (million)")

8. 各國平均按喜歡、不喜歡、留言比率

videos %>% group_by(Location) %>%
  summarise(m_Percent_Like = mean(Percent_Likes),
            se_Percent_Like = sd(Percent_Likes)/sqrt(n())) %>%
  ggplot() + aes(reorder(Location, -m_Percent_Like), m_Percent_Like) +
  geom_bar(stat = "identity", aes(fill = Location)) +
  geom_errorbar(aes(ymin = m_Percent_Like - se_Percent_Like,
                  ymax = m_Percent_Like + se_Percent_Like), 
                    width = .2) +
  guides(fill="none") +
  labs(x = "Countries", y = "Mean Percent of likes")

8. 各國平均按喜歡、不喜歡、留言比率

9. 各類型影片行為 (US)

videos[videos$Location == "US"] %>% group_by(category_name) %>%
  summarise(m_Percent_Likes = mean(Percent_Likes),
            se_Percent_Likes = sd(Percent_Likes)/sqrt(n())) %>%
  ggplot() + 
  aes(reorder(category_name, -m_Percent_Likes), m_Percent_Likes) +
  geom_bar(stat = "identity", fill = "#66CDAA") +
  geom_errorbar(aes(ymin = m_Percent_Likes - se_Percent_Likes,
                    ymax = m_Percent_Likes + se_Percent_Likes), 
                    width = .2) +
  labs(x = "Category name", y = "Mean Percent(%)", 
       title= "Mean Percent of likes by Categories in US") +
  theme(axis.text.x = element_text(angle = 45,hjust = 1))

9. 各類型影片行為 (US)

各類型影片行為 (US)

Tags 文字雲 (US)

### 文字轉換 function
transformer <- function(x){
  Text<- Corpus(VectorSource(x))
  
  Text <- Text %>% tm_map(tolower) %>% 
    tm_map(stripWhitespace)%>% 
    tm_map(removeWords,stopwords("english"))
}

### 字頻矩陣 function
freq_matrix <- function(y) {
  m <- TermDocumentMatrix(transformer(y)) %>% as.matrix()
  v <- sort(rowSums(m), decreasing = T)
  d <- data.frame(word = names(v), freq = v)
  head(d, 10)
}

Tags 文字雲 (US)

# 字頻矩陣
freq_matrix(transformer(us$tags))

               word freq
video         video 2009
2017           2017 1870
funny         funny 1730
christmas christmas 1688
news           news 1622
new             new 1582
music         music 1487
show           show 1445
comedy       comedy 1177
makeup       makeup 1074

Tags 文字雲 (US)

# 文字雲 (各國語言不一，用US做範例)
wordcloud(transformer(us$tags), max.words = 50, 
          random.order = F, colors = brewer.pal(8,"Dark2"))

總結

影片觀看數、按喜歡的數、按不喜歡的數、留言數皆為正相關。
跨國的熱門頻道半數為美國脫口秀。
影片的最佳發布時間為美國太平洋時間下午3-6點。
加拿大、德國、法國這三個國家影片容易爆紅但也容易退燒。英國影片很少會爆紅但較容易維持在熱門影片清單上。
熱門影片主流類型為娛樂
英國人最愛看影片貢獻最多觀看次數
法國人最會按喜歡、加拿大人最不會表達負面評價
美國人對不同類型的熱門影片會有不同比率的影片互動行為。
熱門影片最常打的關鍵字為2017、video、show、music、funny等等。

討論

上熱門的影響條件
影片資料語言限制

大綱

前導

資料樣貌

讀入資料

新增變項

資料探索與視覺化

1. 變項之間的關係

2. 五國之間熱門的頻道

3. 要在什麼時間發布影片

4. 影片發布多久後上熱門

5. 影片上熱門後持續幾天

6. 各國熱門影片類型

7.1 各國影片平均觀看數

7.2 各國不同類型影片平均觀看數

8. 各國平均按喜歡、不喜歡、留言比率

8. 各國平均按喜歡、不喜歡、留言比率

9. 各類型影片行為 (US)

9. 各類型影片行為 (US)

各類型影片行為 (US)

Tags 文字雲 (US)

Tags 文字雲 (US)

Tags 文字雲 (US)

總結

討論

請多指教,謝謝。