#This Stack Requires:
# 1. Docker
# 2. Jupyter
# 3. Selenium
# 4. RSelenium
#Practically, I wanted to set up the basics of a set for a predictor that links the year, the content, and the 3rd party views on youtube.
#I wanted to practice selinum scraping in R, and this seemed a perfect example as this already has links
#This project made me realize how much more limited RSelenium is
#This Stack Requires:
# 1. Docker
# 2. Jupyter
# 3. Selenium
# 4. RSelenium
#Practically, I wanted to set up the basics of a set for a predictor that links the year, the content, and the 3rd party views on youtube.
#I wanted to practice selinum scraping in R, and this seemed a perfect example as this already has links
#In terms of Article, this
#Start your Selinum Instance Now
#docker run -d -p 4444:4444 --shm-size="2g" selenium/standalone-chrome:4.1.2-20220131
Fresh_Data <- FALSE #This will refresh data (Not using caching from my git)
Fresh_Computes <- TRUE #Uses docker to pull and augment data, requires the Selenium Instance
Dev_Checks <- FALSE #Various Extra Print statements that make debugging easier
url.data <- "https://raw.githubusercontent.com/fivethirtyeight/superbowl-ads/main/superbowl-ads.csv"
raw <- read.csv(url(url.data), header = TRUE,)
head(raw)
## year brand
## 1 2018 Toyota
## 2 2020 Bud Light
## 3 2006 Bud Light
## 4 2018 Hynudai
## 5 2003 Bud Light
## 6 2020 Toyota
## superbowl_ads_dot_com_url
## 1 https://superbowl-ads.com/good-odds-toyota/
## 2 https://superbowl-ads.com/2020-bud-light-seltzer-inside-posts-brain/
## 3 https://superbowl-ads.com/2006-bud-light-bear-attack/
## 4 https://superbowl-ads.com/hope-detector-nfl-super-bowl-lii-hyundai/
## 5 https://superbowl-ads.com/2003-bud-light-hermit-crab/
## 6 https://superbowl-ads.com/2020-toyota-go-places-with-cobie-smulders/
## youtube_url funny show_product_quickly
## 1 https://www.youtube.com/watch?v=zeBZvwYQ-hA False False
## 2 https://www.youtube.com/watch?v=nbbp0VW7z8w True True
## 3 https://www.youtube.com/watch?v=yk0MQD5YgV8 True False
## 4 https://www.youtube.com/watch?v=lNPccrGk77A False True
## 5 https://www.youtube.com/watch?v=ovQYgnXHooY True True
## 6 https://www.youtube.com/watch?v=f34Ji70u3nk True True
## patriotic celebrity danger animals use_sex
## 1 False False False False False
## 2 False True True False False
## 3 False False True True False
## 4 False False False False False
## 5 False False True True True
## 6 False True True True False
#This is the raw data from fivethirtyeight
#Now we're going to augment it
if (Fresh_Computes && Fresh_Data){
remDr <- RSelenium::remoteDriver(remoteServerAddr = "localhost",
port = 4444,
browserName = "chrome")
remDr$open()
}
if (Fresh_Computes && Fresh_Data && Dev_Checks){
remDr$navigate("https://www.youtube.com/watch?v=lNPccrGk77A") #Entering our URL gets the browser to navigate to the page
remDr$screenshot(display = TRUE)
}
if (Fresh_Computes && Fresh_Data && Dev_Checks){
webElem <- remDr$findElement(using = "css selector", "[class='view-count style-scope ytd-video-view-count-renderer']")
view_number_text = webElem$getElementText()
print(view_number_text[])
}
if (Fresh_Computes && Fresh_Data && Dev_Checks){
webElem <- remDr$findElement(using = "css", "[class='view-count style-scope ytd-video-view-count-renderer']")
view_number_text = webElem$getElementText()
print(view_number_text)
view_number_text = gsub("views", "", view_number_text)
view_number_text = gsub(",", "", view_number_text)
print(view_number_text)
}
raw$Views <- "NAN"
if (Dev_Checks) {
head(raw)
}
if (Fresh_Computes && Fresh_Data){
length <- nrow(raw)
if (Dev_Checks){
print("Dev Mode Enabled")
length <- 5
}
for (row_number in 1:length){
remDr$navigate(as.character(raw$youtube[row_number]))
Sys.sleep(5.0) #Could not find a way to wait for complete page load, so I threw in a wait to ensure load
webElem <- tryCatch({remDr$findElement(using = "css", "[class='view-count style-scope ytd-video-view-count-renderer']")},
error = function(e){ print("Could Not Find Video")})
view_number_text <- ""
if(!is.null(webElem)){
view_number_text <- tryCatch({webElem$getElementText()},
error = function(e){ print("Could Not get View Count Value")})
} else {
print("I could not find target")
}
view_number_text = gsub("views", "", view_number_text)
view_number_text = gsub(",", "", view_number_text)
if (Dev_Checks){
print(raw[row_number,]['youtube_url'])
print(view_number_text)
}
raw$Views[row_number] <- view_number_text
}
if (Dev_Checks) {
write.csv(raw,"test_data.csv", row.names = FALSE)
} else {
write.csv(raw,"Superbowl_adds_count.csv", row.names = FALSE)
}
} else {
url.data <- "https://raw.githubusercontent.com/Amantux/Data607_Assignment1/main/Superbowl_adds_count.csv"
raw <- read.csv(url(url.data), header = TRUE,)
}
head(raw)
## year brand
## 1 2018 Toyota
## 2 2020 Bud Light
## 3 2006 Bud Light
## 4 2018 Hynudai
## 5 2003 Bud Light
## 6 2020 Toyota
## superbowl_ads_dot_com_url
## 1 https://superbowl-ads.com/good-odds-toyota/
## 2 https://superbowl-ads.com/2020-bud-light-seltzer-inside-posts-brain/
## 3 https://superbowl-ads.com/2006-bud-light-bear-attack/
## 4 https://superbowl-ads.com/hope-detector-nfl-super-bowl-lii-hyundai/
## 5 https://superbowl-ads.com/2003-bud-light-hermit-crab/
## 6 https://superbowl-ads.com/2020-toyota-go-places-with-cobie-smulders/
## youtube_url funny show_product_quickly
## 1 https://www.youtube.com/watch?v=zeBZvwYQ-hA False False
## 2 https://www.youtube.com/watch?v=nbbp0VW7z8w True True
## 3 https://www.youtube.com/watch?v=yk0MQD5YgV8 True False
## 4 https://www.youtube.com/watch?v=lNPccrGk77A False True
## 5 https://www.youtube.com/watch?v=ovQYgnXHooY True True
## 6 https://www.youtube.com/watch?v=f34Ji70u3nk True True
## patriotic celebrity danger animals use_sex Views
## 1 False False False False False 185328
## 2 False True True False False 78717
## 3 False False True True False 142558
## 4 False False False False False 240
## 5 False False True True True 13860
## 6 False True True True False 28043
nrow(raw)
## [1] 244
raw<-raw[!((raw$Views=="Could Not get View Count Value")|(raw$Views=="Could Not Find Video")),] #remove the empty rows
head(raw)
## year brand
## 1 2018 Toyota
## 2 2020 Bud Light
## 3 2006 Bud Light
## 4 2018 Hynudai
## 5 2003 Bud Light
## 6 2020 Toyota
## superbowl_ads_dot_com_url
## 1 https://superbowl-ads.com/good-odds-toyota/
## 2 https://superbowl-ads.com/2020-bud-light-seltzer-inside-posts-brain/
## 3 https://superbowl-ads.com/2006-bud-light-bear-attack/
## 4 https://superbowl-ads.com/hope-detector-nfl-super-bowl-lii-hyundai/
## 5 https://superbowl-ads.com/2003-bud-light-hermit-crab/
## 6 https://superbowl-ads.com/2020-toyota-go-places-with-cobie-smulders/
## youtube_url funny show_product_quickly
## 1 https://www.youtube.com/watch?v=zeBZvwYQ-hA False False
## 2 https://www.youtube.com/watch?v=nbbp0VW7z8w True True
## 3 https://www.youtube.com/watch?v=yk0MQD5YgV8 True False
## 4 https://www.youtube.com/watch?v=lNPccrGk77A False True
## 5 https://www.youtube.com/watch?v=ovQYgnXHooY True True
## 6 https://www.youtube.com/watch?v=f34Ji70u3nk True True
## patriotic celebrity danger animals use_sex Views
## 1 False False False False False 185328
## 2 False True True False False 78717
## 3 False False True True False 142558
## 4 False False False False False 240
## 5 False False True True True 13860
## 6 False True True True False 28043
c_dat = subset(raw, select = -c(superbowl_ads_dot_com_url,youtube_url) )
head(c_dat)
## year brand funny show_product_quickly patriotic celebrity danger animals
## 1 2018 Toyota False False False False False False
## 2 2020 Bud Light True True False True True False
## 3 2006 Bud Light True False False False True True
## 4 2018 Hynudai False True False False False False
## 5 2003 Bud Light True True False False True True
## 6 2020 Toyota True True False True True True
## use_sex Views
## 1 False 185328
## 2 False 78717
## 3 False 142558
## 4 False 240
## 5 True 13860
## 6 False 28043
unique(c_dat['brand'])
## brand
## 1 Toyota
## 2 Bud Light
## 4 Hynudai
## 7 Coca-Cola
## 8 Kia
## 10 Budweiser
## 15 NFL
## 18 Pepsi
## 20 Doritos
## 34 E-Trade
#Findings and Recommendations
#Practically I wanted to develop the data backing a predictor, and I think I could probably do a better job of augmenting
# This mainly relies on scraping youtube data, where not all cases actually even have a presence. It may be better
# to look at the viewership numbers from the superbowl itself.
# In terms of the origin data set, I would love to see a breakdown on time or % of ad as I think that would be interesting!