#This Stack Requires:
# 1. Docker
# 2. Jupyter
# 3. Selenium
# 4. RSelenium

#Practically, I wanted to set up the basics of a set for a predictor that links the year, the content, and the 3rd party views on youtube.
#I wanted to practice selinum scraping in R, and this seemed a perfect example as this already has links
#This project made me realize how much more limited RSelenium is

#This Stack Requires:

# 1. Docker

# 2. Jupyter

# 3. Selenium

# 4. RSelenium



#Practically, I wanted to set up the basics of a set for a predictor that links the year, the content, and the 3rd party views on youtube.

#I wanted to practice selinum scraping in R, and this seemed a perfect example as this already has links

#In terms of Article, this 
#Start your Selinum Instance Now
#docker run -d -p 4444:4444 --shm-size="2g" selenium/standalone-chrome:4.1.2-20220131
Fresh_Data <- FALSE #This will refresh data (Not using caching from my git)
Fresh_Computes <- TRUE  #Uses docker to pull and augment data, requires the Selenium Instance
Dev_Checks <- FALSE #Various Extra Print statements that make debugging easier
url.data <- "https://raw.githubusercontent.com/fivethirtyeight/superbowl-ads/main/superbowl-ads.csv"

raw <- read.csv(url(url.data), header = TRUE,) 
head(raw)
##   year     brand
## 1 2018    Toyota
## 2 2020 Bud Light
## 3 2006 Bud Light
## 4 2018   Hynudai
## 5 2003 Bud Light
## 6 2020    Toyota
##                                              superbowl_ads_dot_com_url
## 1                          https://superbowl-ads.com/good-odds-toyota/
## 2 https://superbowl-ads.com/2020-bud-light-seltzer-inside-posts-brain/
## 3                https://superbowl-ads.com/2006-bud-light-bear-attack/
## 4  https://superbowl-ads.com/hope-detector-nfl-super-bowl-lii-hyundai/
## 5                https://superbowl-ads.com/2003-bud-light-hermit-crab/
## 6 https://superbowl-ads.com/2020-toyota-go-places-with-cobie-smulders/
##                                   youtube_url funny show_product_quickly
## 1 https://www.youtube.com/watch?v=zeBZvwYQ-hA False                False
## 2 https://www.youtube.com/watch?v=nbbp0VW7z8w  True                 True
## 3 https://www.youtube.com/watch?v=yk0MQD5YgV8  True                False
## 4 https://www.youtube.com/watch?v=lNPccrGk77A False                 True
## 5 https://www.youtube.com/watch?v=ovQYgnXHooY  True                 True
## 6 https://www.youtube.com/watch?v=f34Ji70u3nk  True                 True
##   patriotic celebrity danger animals use_sex
## 1     False     False  False   False   False
## 2     False      True   True   False   False
## 3     False     False   True    True   False
## 4     False     False  False   False   False
## 5     False     False   True    True    True
## 6     False      True   True    True   False
#This is the raw data from fivethirtyeight
#Now we're going to augment it
if (Fresh_Computes && Fresh_Data){
    remDr <- RSelenium::remoteDriver(remoteServerAddr = "localhost",
                                 port = 4444,
                                 browserName = "chrome")
    remDr$open()
}
if (Fresh_Computes && Fresh_Data && Dev_Checks){
    remDr$navigate("https://www.youtube.com/watch?v=lNPccrGk77A") #Entering our URL gets the browser to navigate to the page
    remDr$screenshot(display = TRUE)
}
if (Fresh_Computes && Fresh_Data && Dev_Checks){    
    webElem <- remDr$findElement(using = "css selector", "[class='view-count style-scope ytd-video-view-count-renderer']")
    view_number_text = webElem$getElementText()
    print(view_number_text[])
}
if (Fresh_Computes && Fresh_Data && Dev_Checks){
    webElem <- remDr$findElement(using = "css", "[class='view-count style-scope ytd-video-view-count-renderer']")
    view_number_text = webElem$getElementText()
    print(view_number_text)
    view_number_text = gsub("views", "", view_number_text)
    view_number_text = gsub(",", "", view_number_text)
    print(view_number_text)
}
raw$Views <- "NAN"
if (Dev_Checks) {
    head(raw)
}
if (Fresh_Computes && Fresh_Data){
    length <- nrow(raw)
    if (Dev_Checks){
        print("Dev Mode Enabled")
        length <- 5
    }
    for (row_number in 1:length){
        remDr$navigate(as.character(raw$youtube[row_number]))
        Sys.sleep(5.0) #Could not find a way to wait for complete page load, so I threw in a wait to ensure load
        webElem <- tryCatch({remDr$findElement(using = "css", "[class='view-count style-scope ytd-video-view-count-renderer']")},
                            error = function(e){ print("Could Not Find Video")})
        view_number_text <- ""
        if(!is.null(webElem)){
            view_number_text <- tryCatch({webElem$getElementText()},
                                error = function(e){ print("Could Not get View Count Value")})
        } else {
            print("I could not find target")
        }
        view_number_text = gsub("views", "", view_number_text)
        view_number_text = gsub(",", "", view_number_text)
        if (Dev_Checks){
            print(raw[row_number,]['youtube_url'])
            print(view_number_text)
        }
        raw$Views[row_number] <- view_number_text
    }
    if (Dev_Checks) {
        write.csv(raw,"test_data.csv", row.names = FALSE)
    } else {
        write.csv(raw,"Superbowl_adds_count.csv", row.names = FALSE)
    }
} else {
    url.data <- "https://raw.githubusercontent.com/Amantux/Data607_Assignment1/main/Superbowl_adds_count.csv"
    raw <- read.csv(url(url.data), header = TRUE,) 
}
head(raw)
##   year     brand
## 1 2018    Toyota
## 2 2020 Bud Light
## 3 2006 Bud Light
## 4 2018   Hynudai
## 5 2003 Bud Light
## 6 2020    Toyota
##                                              superbowl_ads_dot_com_url
## 1                          https://superbowl-ads.com/good-odds-toyota/
## 2 https://superbowl-ads.com/2020-bud-light-seltzer-inside-posts-brain/
## 3                https://superbowl-ads.com/2006-bud-light-bear-attack/
## 4  https://superbowl-ads.com/hope-detector-nfl-super-bowl-lii-hyundai/
## 5                https://superbowl-ads.com/2003-bud-light-hermit-crab/
## 6 https://superbowl-ads.com/2020-toyota-go-places-with-cobie-smulders/
##                                   youtube_url funny show_product_quickly
## 1 https://www.youtube.com/watch?v=zeBZvwYQ-hA False                False
## 2 https://www.youtube.com/watch?v=nbbp0VW7z8w  True                 True
## 3 https://www.youtube.com/watch?v=yk0MQD5YgV8  True                False
## 4 https://www.youtube.com/watch?v=lNPccrGk77A False                 True
## 5 https://www.youtube.com/watch?v=ovQYgnXHooY  True                 True
## 6 https://www.youtube.com/watch?v=f34Ji70u3nk  True                 True
##   patriotic celebrity danger animals use_sex   Views
## 1     False     False  False   False   False 185328 
## 2     False      True   True   False   False  78717 
## 3     False     False   True    True   False 142558 
## 4     False     False  False   False   False    240 
## 5     False     False   True    True    True  13860 
## 6     False      True   True    True   False  28043
nrow(raw)
## [1] 244
raw<-raw[!((raw$Views=="Could Not get View Count Value")|(raw$Views=="Could Not Find Video")),] #remove the empty rows
head(raw)
##   year     brand
## 1 2018    Toyota
## 2 2020 Bud Light
## 3 2006 Bud Light
## 4 2018   Hynudai
## 5 2003 Bud Light
## 6 2020    Toyota
##                                              superbowl_ads_dot_com_url
## 1                          https://superbowl-ads.com/good-odds-toyota/
## 2 https://superbowl-ads.com/2020-bud-light-seltzer-inside-posts-brain/
## 3                https://superbowl-ads.com/2006-bud-light-bear-attack/
## 4  https://superbowl-ads.com/hope-detector-nfl-super-bowl-lii-hyundai/
## 5                https://superbowl-ads.com/2003-bud-light-hermit-crab/
## 6 https://superbowl-ads.com/2020-toyota-go-places-with-cobie-smulders/
##                                   youtube_url funny show_product_quickly
## 1 https://www.youtube.com/watch?v=zeBZvwYQ-hA False                False
## 2 https://www.youtube.com/watch?v=nbbp0VW7z8w  True                 True
## 3 https://www.youtube.com/watch?v=yk0MQD5YgV8  True                False
## 4 https://www.youtube.com/watch?v=lNPccrGk77A False                 True
## 5 https://www.youtube.com/watch?v=ovQYgnXHooY  True                 True
## 6 https://www.youtube.com/watch?v=f34Ji70u3nk  True                 True
##   patriotic celebrity danger animals use_sex   Views
## 1     False     False  False   False   False 185328 
## 2     False      True   True   False   False  78717 
## 3     False     False   True    True   False 142558 
## 4     False     False  False   False   False    240 
## 5     False     False   True    True    True  13860 
## 6     False      True   True    True   False  28043
c_dat = subset(raw, select = -c(superbowl_ads_dot_com_url,youtube_url) )
head(c_dat)
##   year     brand funny show_product_quickly patriotic celebrity danger animals
## 1 2018    Toyota False                False     False     False  False   False
## 2 2020 Bud Light  True                 True     False      True   True   False
## 3 2006 Bud Light  True                False     False     False   True    True
## 4 2018   Hynudai False                 True     False     False  False   False
## 5 2003 Bud Light  True                 True     False     False   True    True
## 6 2020    Toyota  True                 True     False      True   True    True
##   use_sex   Views
## 1   False 185328 
## 2   False  78717 
## 3   False 142558 
## 4   False    240 
## 5    True  13860 
## 6   False  28043
unique(c_dat['brand'])
##        brand
## 1     Toyota
## 2  Bud Light
## 4    Hynudai
## 7  Coca-Cola
## 8        Kia
## 10 Budweiser
## 15       NFL
## 18     Pepsi
## 20   Doritos
## 34   E-Trade
#Findings and Recommendations
#Practically I wanted to develop the data backing a predictor, and I think I could probably do a better job of augmenting
# This mainly relies on scraping youtube data, where not all cases actually even have a presence. It may be better
# to look at the viewership numbers from the superbowl itself. 
# In terms of the origin data set, I would love to see a breakdown on time or % of ad as I think that would be interesting!