- Installing packages
# Install gridExtra only when it is not already available, so that re-running
# this cell does not download the package again every time.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
# Console output captured from the original install run (kept for reference):
# trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/gridExtra_2.3.tgz'
# Content type 'application/x-gzip' length 1103470 bytes (1.1 MB)
# ==================================================
# downloaded 1.1 MB
# The downloaded binary packages are in
# /var/folders/_b/l95j9sls68zfzm9gpf99lh6h0000gp/T//RtmpUwp73V/downloaded_packages

# devtools is required for the GitHub install below; uncomment if it is missing.
# install.packages('devtools')
# tuber (the YouTube API client loaded below) is installed from GitHub,
# building its vignettes as part of the install.
devtools::install_github("soodoku/tuber", build_vignettes = TRUE)
- Loading libraries
#loading libraries
library(tuber) # youtube API
library(magrittr)
library(tidyverse)
library(purrr)
library(stringr)
library(dplyr)
library(ggplot2)
library(wordcloud)
library(tm)
library(lubridate)
library(plyr)
library(gridExtra)
- Authentication
# Authentication from account 1.
# The credentials are placeholders ("XXX"); replace them with real values
# before running. Each value is assigned exactly once (the original cell
# assigned client_id and client_secret twice with the same value).
client_id <- "XXX"
client_secret <- "XXX"
# api_key is not used in this cell; it is kept because other cells may read it.
api_key <- "XXX"

# Use the YouTube OAuth flow with the client credentials above.
# NOTE(review): token = "" presumably makes tuber ignore any cached token
# file and start a fresh OAuth handshake - confirm against the tuber docs.
yt_oauth(
  app_id = client_id,
  app_secret = client_secret,
  token = ""
)
- Reading in scraped URLs
# Load the CSV of scraped YouTube URLs into `raw_urls`.
# NOTE(review): the path is absolute and machine-specific.
raw_urls <- read_csv(
  "/Users/yuyangwang 1/Desktop/OIDD 245/Data Project 2/youtube_urls.csv"
)
- Scraping data from YouTube API
# Extract the video ID from one or more YouTube URLs.
#
# The ID is taken to be the text that follows the first "=" in the URL
# (e.g. ".../watch?v=<id>"), cut off at the next "&" or "#" so that trailing
# query parameters such as "&t=10s" or "&list=..." do not end up in the ID.
#
# Args:
#   url: character vector of URLs (vectorised).
# Returns:
#   Character vector the same length as `url`. A URL without any "=" yields
#   "" (instead of a subscript-out-of-bounds error); NA stays NA.
getID <- function(url) {
  url <- as.character(url)
  has_query <- grepl("=", url, fixed = TRUE)
  # Drop everything up to and including the first "=".
  id <- sub("^[^=]*=", "", url)
  # No "=" at all: there is no ID to extract.
  id[!has_query & !is.na(url)] <- ""
  # Strip any further query parameters or fragment after the ID.
  sub("[&#].*$", "", id)
}
# Derive the video ID for every scraped URL. vapply() guarantees one character
# value per URL (sapply() may silently change its return type on empty or
# ragged input); USE.NAMES = FALSE keeps the new column free of URL names.
raw_urls$vid_id <- vapply(
  as.character(raw_urls$URL),
  getID,
  character(1),
  USE.NAMES = FALSE
)
# Drop exact duplicate rows (same values in every column).
raw_urls <- unique(raw_urls)
# Thin wrapper around get_stats(): forwards `id` as `video_id` and returns
# whatever get_stats() returns. It exists so it can be passed to map_df().
get_all_stats <- function(id) get_stats(video_id = id)
# Thin wrapper around get_video_details(): forwards `id` as `video_id` and
# returns its result unchanged.
get_all_details <- function(id) get_video_details(video_id = id)
# Stats for every video id - ONLY RUN WHEN NEEDED (the API has a daily query
# quota). One get_all_stats() call per id, row-bound into a single data frame,
# then rows containing any missing value are dropped.
video_all_stats <- raw_urls$vid_id %>%
  map_df(get_all_stats) %>%
  na.omit()

# Join the URL table onto the stats by video id. all.y = TRUE keeps every
# stats row even when it has no matching row in raw_urls.
merged_id <- merge(
  raw_urls,
  video_all_stats,
  by.x = "vid_id",
  by.y = "id",
  all.y = TRUE
)
# getting video details: one "snippet" request per video id in merged_id
videodets <- lapply(as.character(merged_id$vid_id), function(x) {
  get_video_details(video_id = x, part = "snippet")
})

# Read one text field from the first item's snippet of a details response.
# Returns NA when the response has no items or the field is absent, so a
# single incomplete response no longer aborts the whole step.
video_snippet_field <- function(det, field) {
  items <- det[["items"]]
  if (length(items) == 0L) {
    return(NA_character_)
  }
  value <- items[[1]][["snippet"]][[field]]
  if (is.null(value)) NA_character_ else as.character(value)[1]
}

# Collapse the snippet's tags into one space-separated string ("" if none).
video_snippet_tags <- function(det) {
  items <- det[["items"]]
  if (length(items) == 0L) {
    return("")
  }
  paste(unlist(items[[1]][["snippet"]][["tags"]]), collapse = " ")
}

# appending to data frame: whole-column assignment covers every row of
# merged_id (the original loop was hard-coded to rows 1..681) and creates the
# columns if they do not exist yet.
merged_id$publishedAt <- vapply(
  videodets, video_snippet_field, character(1), field = "publishedAt"
)
merged_id$channelId <- vapply(
  videodets, video_snippet_field, character(1), field = "channelId"
)
merged_id$description <- vapply(
  videodets, video_snippet_field, character(1), field = "description"
)
merged_id$tags <- vapply(videodets, video_snippet_tags, character(1))
# getting channel stats
# Many videos share a channel, so each distinct channel id is requested only
# once (saves API quota) and the responses are then expanded back so that
# channeldets[[i]] still corresponds to row i of merged_id.
channel_ids <- as.character(merged_id$channelId)
unique_channel_ids <- unique(channel_ids)
unique_channeldets <- lapply(unique_channel_ids, function(x) {
  get_channel_stats(x)
})
channeldets <- unique_channeldets[match(channel_ids, unique_channel_ids)]

# Read det[[section]][[field]] as text; NA when the field is absent
# (NOTE(review): e.g. subscriberCount may be missing for some channels).
channel_field <- function(det, section, field) {
  value <- det[[section]][[field]]
  if (is.null(value)) NA_character_ else as.character(value)[1]
}

# adding channel stats to main df: whole-column assignment covers every row
# (the original loop was hard-coded to rows 1..681).
merged_id$channel_views <- as.numeric(vapply(
  channeldets, channel_field, character(1),
  section = "statistics", field = "viewCount"
))
merged_id$channel_title <- vapply(
  channeldets, channel_field, character(1),
  section = "snippet", field = "title"
)
merged_id$channel_desc <- vapply(
  channeldets, channel_field, character(1),
  section = "snippet", field = "description"
)
# Kept as text, exactly as the original stored it (only channel_views was
# converted to numeric there).
merged_id$channel_subs <- vapply(
  channeldets, channel_field, character(1),
  section = "statistics", field = "subscriberCount"
)
- Manipulation to get some basic info on the videos scraped from homepage
# Show the first 15 rows of each like/dislike ranking table.
# NOTE(review): worst_dislike_ratio, best_like_ratio and df_like_dislike_ratio
# are built in cells that are not part of this section.
head(worst_dislike_ratio, 15)
head(best_like_ratio, 15)

# creating histogram of data
# The trailing comma after col = "red" was removed: it passed a stray empty
# argument to hist().
hist(
  df_like_dislike_ratio$like_dislike_ratio,
  main = "Histogram for Like/Dislike Ratio among 600+ YouTube Videos",
  xlab = "Like/Dislike Ratio",
  border = "white",
  col = "red"
)

# Rank videos by subscriber engagement: ascending order gives the worst
# first, ordering on the negated value gives the best first. Only the link,
# channel title and engagement columns are kept.
worst_subs_engagement <- df_filtered[
  order(df_filtered[["subs_engage"]]),
  c("Link", "channel_title", "subs_engage")
]
best_subs_engagement <- df_filtered[
  order(-df_filtered[["subs_engage"]]),
  c("Link", "channel_title", "subs_engage")
]
# Show the ten lowest and ten highest.
head(worst_subs_engagement, n = 10)
head(best_subs_engagement, n = 10)
# Rank videos by comment ratio: ascending order gives the worst first,
# ordering on the negated value gives the best first. Only the link, channel
# title and ratio columns are kept.
worst_comment_ratio <- df_filtered[
  order(df_filtered[["comments_ratio"]]),
  c("Link", "channel_title", "comments_ratio")
]
best_comment_ratio <- df_filtered[
  order(-df_filtered[["comments_ratio"]]),
  c("Link", "channel_title", "comments_ratio")
]
# Show the ten lowest and ten highest.
head(worst_comment_ratio, n = 10)
head(best_comment_ratio, n = 10)
Generate wordcloud representing the videos shown
# Concatenate the descriptions of (at most) the first 100 rows of top100views
# into one space-separated string. The leading "" reproduces the leading
# space the original loop produced. head() stops at the last available row,
# so a table with fewer than 100 rows no longer contributes literal "NA"s,
# and the string is built in one pass instead of being re-copied per row.
alldesc <- paste(
  c("", head(as.character(top100views[["description"]]), 100)),
  collapse = " "
)
# corpus: one document per video description
corpus <- VCorpus(VectorSource(top100views$description))

# Step 1: cleaning
corp <- tm_map(corpus, removePunctuation)
corp <- tm_map(corp, removeNumbers)
# removeWords is case-sensitive, so the upper-case token "TIL" has to be
# removed BEFORE lower-casing; in the original order (tolower first) this
# step could never match anything.
corp <- tm_map(corp, removeWords, c("TIL"))
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, removeWords, stopwords("english"))
# Kept from the original cell as an explicit list of extra words to drop.
corp <- tm_map(corp, removeWords, c("the", "for", "is", "and"))
corp <- tm_map(corp, stripWhitespace)
# The original mixed lazy = TRUE and eager tm_map() calls; every document is
# used below, so all steps are now applied eagerly and consistently.
# Document-term matrix of the cleaned corpus, plus a reduced version that
# only keeps terms whose sparsity does not exceed 0.983.
dtm <- DocumentTermMatrix(corp)
dtms <- removeSparseTerms(dtm, 0.983)

# Dimensions before / after removing sparse terms. The commented lines are
# the console output recorded in the original run.
dim(dtm)
# [1] 100 4869
dim(dtms)
# [1] 100 1083

dtms_m <- as.matrix(dtms)
# colSums adds up value over all of the Columns in a matrix
# rowSums(m) is the equivalent over rows
# Here: total count of each term across all documents, most frequent first.
word.freq <- colSums(dtms_m)
word.freq <- sort(word.freq, decreasing = TRUE)
d <- data.frame(word = names(word.freq), freq = word.freq)
# Draw the word cloud from the term/frequency table `d`: at most 100 words,
# placed in decreasing-frequency order (random.order = FALSE), with 35% of
# the words rotated, coloured with the 8-colour "Dark2" palette.
wordcloud(
  words = d[["word"]],
  freq = d[["freq"]],
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(n = 8, name = "Dark2")
)

- Load in data from the top-100 channel CSV files (gamers, vloggers, comedians)
# Read the three category tables.
# NOTE(review): the objects are named top50* while the files are named
# top100* - confirm how many rows are expected.
top50gamers <- read_csv(
  "/Users/yuyangwang 1/Desktop/OIDD 245/Data Project 2/top100game.csv"
)
top50vloggers <- read_csv(
  "/Users/yuyangwang 1/Desktop/OIDD 245/Data Project 2/top100bloggers.csv"
)
top50comedians <- read_csv(
  "/Users/yuyangwang 1/Desktop/OIDD 245/Data Project 2/top100comedy.csv"
)
# Rename the vloggers' "user" column to "username".
names(top50vloggers)[names(top50vloggers) == "user"] <- "username"
Edit the numbers


# Subscriber count against rank, one line per category
# (green = gamers, blue = vloggers, red = comedians).
plot_subs <- ggplot(mapping = aes(x = rank, y = subs)) +
  geom_line(data = top50gamers, colour = "green") +
  geom_line(data = top50vloggers, colour = "blue") +
  geom_line(data = top50comedians, colour = "red") +
  labs(
    title = "Number of Subscribers for Top 50 YouTubers of Each Category",
    x = "Rank",
    y = "Number of Subscribers"
  )
plot_subs
# Total view count against rank, one line per category
# (green = gamers, blue = vloggers, red = comedians).
plot_views <- ggplot(mapping = aes(x = rank, y = totalviews)) +
  geom_line(data = top50gamers, colour = "green") +
  geom_line(data = top50vloggers, colour = "blue") +
  geom_line(data = top50comedians, colour = "red") +
  labs(
    title = "Number of Views for Top 50 YouTubers of Each Category",
    x = "Rank",
    y = "Total Views"
  )
plot_views
# Number of published videos against rank, one line per category
# (green = gamers, blue = vloggers, red = comedians).
plot_vids <- ggplot(mapping = aes(x = rank, y = totalVids)) +
  geom_line(data = top50gamers, colour = "green") +
  geom_line(data = top50vloggers, colour = "blue") +
  geom_line(data = top50comedians, colour = "red") +
  labs(
    title = "Number of Total Videos for Top 50 YouTubers of Each Category",
    x = "Rank",
    y = "Total Videos"
  )
plot_vids

# Subscriber count against channel join date, one line per category.
# The original set each line colour as a constant and then called
# labs(fill = "Categories"), which has no effect because nothing is mapped to
# `fill`, so no legend was drawn. Mapping colour to a category label inside
# aes() produces the intended "Categories" legend; scale_colour_manual() keeps
# the original green / blue / red colours.
ggplot() +
  geom_line(
    data = top50gamers,
    aes(x = join_date, y = subs, colour = "Gamers")
  ) +
  geom_line(
    data = top50vloggers,
    aes(x = join_date, y = subs, colour = "Vloggers")
  ) +
  geom_line(
    data = top50comedians,
    aes(x = join_date, y = subs, colour = "Comedians")
  ) +
  scale_colour_manual(
    breaks = c("Gamers", "Vloggers", "Comedians"),
    values = c(Gamers = "green", Vloggers = "blue", Comedians = "red")
  ) +
  ggtitle("Timeline of Join Dates of YouTubers and Number of Subscribers") +
  xlab("Date") +
  ylab("Number of Subscribers") +
  labs(colour = "Categories")


#get pewdiepie
#pewd = get_channel_stats("UC-lHJZR3Gqxm24_Vd_AJ5Yw")
#get all of his videos
#videos = yt_search(term="", type="video", channel_id = "UC-lHJZR3Gqxm24_Vd_AJ5Yw")
#get all their stats
#videostats = lapply(as.character(videos$video_id), function(x){
# get_stats(video_id = x)
#})
