Data and Setup

This project uses the TidyTuesday on March, 2nd, 2021 titled, Superbowl Commercials. The data is based on Youtube videos of commercials aired during the Superbowl. This dataset only contains commercials from the top 10 brands that aired the most commercials.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggthemes)

youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   year = col_double(),
##   funny = col_logical(),
##   show_product_quickly = col_logical(),
##   patriotic = col_logical(),
##   celebrity = col_logical(),
##   danger = col_logical(),
##   animals = col_logical(),
##   use_sex = col_logical(),
##   view_count = col_double(),
##   like_count = col_double(),
##   dislike_count = col_double(),
##   favorite_count = col_double(),
##   comment_count = col_double(),
##   published_at = col_datetime(format = ""),
##   category_id = col_double()
## )
## i Use `spec()` for the full column specifications.

glimpse(youtube)

## Rows: 247
## Columns: 25
## $ year                      <dbl> 2018, 2020, 2006, 2018, 2003, 2020, 2020, 20~
## $ brand                     <chr> "Toyota", "Bud Light", "Bud Light", "Hynudai~
## $ superbowl_ads_dot_com_url <chr> "https://superbowl-ads.com/good-odds-toyota/~
## $ youtube_url               <chr> "https://www.youtube.com/watch?v=zeBZvwYQ-hA~
## $ funny                     <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, ~
## $ show_product_quickly      <lgl> FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE,~
## $ patriotic                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA~
## $ celebrity                 <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE~
## $ danger                    <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE,~
## $ animals                   <lgl> FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE,~
## $ use_sex                   <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FAL~
## $ id                        <chr> "zeBZvwYQ-hA", "nbbp0VW7z8w", "yk0MQD5YgV8",~
## $ kind                      <chr> "youtube#video", "youtube#video", "youtube#v~
## $ etag                      <chr> "rn-ggKNly38Cl0C3CNjNnUH9xUw", "1roDoK-SYqSp~
## $ view_count                <dbl> 173929, 47752, 142310, 198, 13741, 23636, 30~
## $ like_count                <dbl> 1233, 485, 129, 2, 20, 115, 1470, 78, 342, 7~
## $ dislike_count             <dbl> 38, 14, 15, 0, 3, 11, 384, 6, 7, 0, 14, 0, 2~
## $ favorite_count            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ comment_count             <dbl> NA, 14, 9, 0, 2, 13, 227, 6, 30, 0, 8, 1, 13~
## $ published_at              <dttm> 2018-02-03 11:29:14, 2020-01-31 21:04:13, 2~
## $ title                     <chr> "Toyota Super Bowl Commercial 2018 Good Odds~
## $ description               <chr> "Toyota Super Bowl Commercial 2018 Good Odds~
## $ thumbnail                 <chr> "https://i.ytimg.com/vi/zeBZvwYQ-hA/sddefaul~
## $ channel_title             <chr> "Funny Commercials", "VCU Brandcenter", "Joh~
## $ category_id               <dbl> 1, 27, 17, 22, 24, 1, 24, 2, 24, 24, 24, 24,~

How Many Ads are Shown Each Year

The number of ads in this dataset is fairly constant with some dips.

adsyear <- youtube %>%
  group_by(year) %>%
  summarise(count = n())

ggplot(adsyear, aes(x = year, y = count)) +
  geom_col(fill = "#306dab") +
  labs(x = "Year", y = "Number of Ads", title = "Number of Superbowl Ads Each Year") +
  theme_minimal()

Views stay fairly similar pre 2010. After 2010, we can see some viral videos that extremely outperformed other years. The most viral ad came in 2012 which caused the average to be much higher than other years.

viewsyear <- youtube %>%
  filter(!is.na(view_count)) %>%
  group_by(year) %>%
  summarise(avgview = mean(view_count))
  
ggplot(viewsyear, aes(x = year, y = avgview)) +
  geom_col(fill = "#306dab") +
  scale_y_continuous(label = comma, breaks = extended_breaks(n = 8)) +
  labs(x = "Year",
       y = "Average Views",
       title = "Average Superbowl Ad Views Each Year") +
  theme_minimal()

Which Brands Are Viewed the Most

NFL and Doritos are the clear leaders in views.

It also interesting that Budweiser and Bud Light have a large amount of ads but rarely create ads that have extremely high view counts.

viewbybrand <- youtube %>%
  filter(!is.na(view_count))

ggplot(viewbybrand, aes(x = fct_reorder(brand, view_count), y = view_count, color = brand)) + 
  geom_boxplot(alpha = 0.3) +
  coord_flip() +
  geom_jitter(width = 0.2, alpha = 0.3) +
  scale_y_log10(labels = comma, breaks = log_breaks(n = 8)) +
  labs(x = "Brand",
       y = "Views",
       title ="Number of Views for Each Brand") +
  theme_clean() +
  theme(axis.text.x = element_text(hjust = 0.5, size = 7))

ggplot(viewbybrand, aes(x = fct_reorder(brand, view_count), y = view_count, color = brand)) + 
  geom_boxplot(alpha = 0.3) +
  coord_flip() +
  geom_jitter(width = 0.2, alpha = 0.3) +
  labs(x = "Brand",
       y = "Views",
       title ="Number of Views for Each Brand") +
  theme_clean() +
  theme(axis.text.x = element_text(hjust = 0.5, size = 7))

which Category Combinations are Used the Most

Most ads during the Superbowl use some combinations of these main categories to appeal to their audiences. The list of letters for each category is below.

f = funny
q = Show Product Quickly
p = Patriotic
c = Celebrity
d = Danger
a = Animals
s = Sex

Ads that are funny and show the product quickly are by far the most common, followed by ads that funny, show the product quickly, and use danger.

Most of the commonly seen combinations of categories also stay between 3-4 categories as well.

combosbrandall <- youtube %>%
  filter(!is.na(view_count)) %>%
  group_by(brand, funny, show_product_quickly, patriotic, celebrity, danger,
           animals, use_sex) %>%
  summarise(count = n(), avgviews = mean(view_count))  %>%
  mutate(type = "",
         type = ifelse(funny == TRUE, paste(type, "f"), type),
         type = ifelse(show_product_quickly == TRUE, paste(type, "q"), type),
         type = ifelse(patriotic == TRUE, paste(type, "p"), type),
         type = ifelse(celebrity == TRUE, paste(type, "c"), type),
         type = ifelse(danger == TRUE, paste(type, "d"), type),
         type = ifelse(animals == TRUE, paste(type, "a"), type),
         type = ifelse(use_sex == TRUE, paste(type, "s"), type)
  )

## `summarise()` has grouped output by 'brand', 'funny', 'show_product_quickly', 'patriotic', 'celebrity', 'danger', 'animals'. You can override using the `.groups` argument.

ggplot(combosbrandall, aes(x = fct_reorder(type, count, sum), y = count, fill = brand)) + 
  geom_col() +
  labs(x = "Combination of Categories",
       y = "Count",
       title = "What Combinations of Categories are Used Most Often",
       subtitle = "q = Show Product Quickly p = Patriotic c = Celebrity d = Danger a = Animals s = Sex") +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 360, hjust = 0.5, size = 7))  +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 2))

How Many Views does Each Combination get

Patriotic and danger receive the most views. That category is closely followed by show product quickly danger animals, patriotic celebrity, and funny show product quickly. Those categories make up the clear top tier categories with the most views.

combos <- youtube %>%
  filter(!is.na(view_count)) %>%
  group_by(funny, show_product_quickly, patriotic, celebrity, danger,
           animals, use_sex) %>%
  summarise(count = n(), avgviews = mean(view_count), medviews = median(view_count))  %>%
  mutate(type = "",
         type = ifelse(funny == TRUE, paste(type, "f"), type),
         type = ifelse(show_product_quickly == TRUE, paste(type, "q"), type),
         type = ifelse(patriotic == TRUE, paste(type, "p"), type),
         type = ifelse(celebrity == TRUE, paste(type, "c"), type),
         type = ifelse(danger == TRUE, paste(type, "d"), type),
         type = ifelse(animals == TRUE, paste(type, "a"), type),
         type = ifelse(use_sex == TRUE, paste(type, "s"), type)
  )

## `summarise()` has grouped output by 'funny', 'show_product_quickly', 'patriotic', 'celebrity', 'danger', 'animals'. You can override using the `.groups` argument.

ggplot(combos, aes(x = reorder(type, avgviews), y = avgviews)) + 
  geom_col(color = "black", fill = "#306dab") +
  scale_y_log10(labels = comma, breaks = log_breaks(n=8)) +
  labs(x = "Combination of Categories",
       y = "Average Views",
       title = "Which Combination of Categories Recieves the Most Views") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 360, hjust = 0.5, size = 7))  +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 2))

Which Combination Does Each Brand Use The Most

Most of the brands use a diverse group of categories in their ads. For Most, their top 3 most used categories makes up 1/3 to 1/2 of their total ads.

## count of each type used by brand
combosbrand <- youtube %>%
  group_by(brand, funny, show_product_quickly, patriotic, celebrity, danger,
           animals, use_sex) %>%
  summarise(count = n(), avgviews = mean(view_count))  %>%
  mutate(type = "",
         type = ifelse(funny == TRUE, paste(type, "f"), type),
         type = ifelse(show_product_quickly == TRUE, paste(type, "q"), type),
         type = ifelse(patriotic == TRUE, paste(type, "p"), type),
         type = ifelse(celebrity == TRUE, paste(type, "c"), type),
         type = ifelse(danger == TRUE, paste(type, "d"), type),
         type = ifelse(animals == TRUE, paste(type, "a"), type),
         type = ifelse(use_sex == TRUE, paste(type, "s"), type)
  ) %>%
  ungroup() %>%
  group_by(brand) %>%
  slice_max(count, n=3, with_ties = FALSE)

## `summarise()` has grouped output by 'brand', 'funny', 'show_product_quickly', 'patriotic', 'celebrity', 'danger', 'animals'. You can override using the `.groups` argument.

brandtotaladds <- youtube %>%
  group_by(brand) %>%
  summarise(TotalAdds = n())

combosbrand <- left_join(combosbrand, brandtotaladds, by = c("brand" = "brand"))

## top combos for each brand
ggplot(combosbrand, aes(x = brand, y = count, fill = type)) + 
  geom_col(data = brandtotaladds, aes(x = brand, y = TotalAdds, fill = "red"), alpha = 0.4) +
  geom_col(data = combosbrand, aes(x = brand, y = count, fill = type)) +
  geom_text(aes(label = type), position = position_stack(vjust = 0.5)) +
  theme_clean() +
  labs(x = "Brand",
       y = "Count of Ads",
       title = "What Category Combinations Each Brand Uses",
       subtitle = "Top 3 Most Used Combinations Shown")

What Combinations Have Increased or Decreased Over the Years

The two most obvious trends seen here are the increase in celebrity commercials and decrease in commercials that use sex. Also, patriotic is trending slightly up while funny is trending down.

comboyear <- youtube %>%
  mutate(type = "",
         type = ifelse(funny == TRUE, paste(type, "f"), type),
         type = ifelse(show_product_quickly == TRUE, paste(type, "q"), type),
         type = ifelse(patriotic == TRUE, paste(type, "p"), type),
         type = ifelse(celebrity == TRUE, paste(type, "c"), type),
         type = ifelse(danger == TRUE, paste(type, "d"), type),
         type = ifelse(animals == TRUE, paste(type, "a"), type),
         type = ifelse(use_sex == TRUE, paste(type, "s"), type)
  ) 

f <-comboyear %>%
  filter(str_detect(type, "f")) %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  mutate(type2 = "Funny")
q <-comboyear %>%
  filter(str_detect(type, "q")) %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  mutate(type2 = "Shows_Product_Quickly")
p <-comboyear %>%
  filter(str_detect(type, "p")) %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  mutate(type2 = "Patriotic")
c <-comboyear %>%
  filter(str_detect(type, "c")) %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  mutate(type2 = "Celebrity")
d <-comboyear %>%
  filter(str_detect(type, "d")) %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  mutate(type2 = "Danger")
a <-comboyear %>%
  filter(str_detect(type, "a")) %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  mutate(type2 = "Animals")
s <-comboyear %>%
  filter(str_detect(type, "s")) %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  mutate(type2 = "Use_Sex")

comboyear2 <- bind_rows(f,q,p,c,d,a,s)

ggplot(comboyear2, aes(x = year, y = count)) + 
  geom_point() + 
  geom_smooth() + 
  facet_wrap(~type2) +
  theme_minimal() +
  labs(x = "Year",
       y = "Number of Commercials Aired",
       title = "Number of Commercials Aired Containing Each Category")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Which Category gets the most views

Ads that used sex were not very popular with celebrities and animals following. Ads that show the product quickly and are patriotic seem to be a good indicator of a successful ad. Surprisingly danger also appears to not be a successful category.

youtubepivoted <- youtube %>%
  filter(!is.na(view_count)) %>%
  pivot_longer(funny:use_sex, names_to = "type") %>%
  group_by(type, value) %>%
  summarise(avgviews = mean(view_count))

## `summarise()` has grouped output by 'type'. You can override using the `.groups` argument.

ggplot(youtubepivoted, aes(x = fct_reorder(type, avgviews, sum), y = avgviews, fill = value)) + 
  geom_col(position = "dodge") +
  labs(x = "Category",
       y = "Average views",
       title = "Average Views For Each Category") +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 7))

How Good Does Each Brand Use Categories

Viewing how well each brand does with each category highlights how much more successful Budweiser, Coca-Cola, Doritos and the NFL are in creating high view ads.

youtubepivotedbrand <- youtube %>%
  filter(!is.na(view_count)) %>%
  pivot_longer(funny:use_sex, names_to = "type") %>%
  group_by(brand, type, value) %>%
  summarise(avgviews = mean(view_count))

## `summarise()` has grouped output by 'brand', 'type'. You can override using the `.groups` argument.

ggplot(youtubepivotedbrand, aes(x = fct_reorder(type, avgviews, sum), y = avgviews, fill = value)) + 
  geom_col(position = "dodge") +
  facet_wrap(~brand) +
  theme_minimal() +
  labs(x = "Category", 
       y = "Average View Count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7)) +
  scale_y_continuous(labels = comma)

Reducing the dataset to those big 4, the trends become much more clear.

Budweiser benefits mostly from patriotic and danger.

Coca-Cola does well with showing the product quickly, animals, and danger

Doritos is mostly only successful with funny and showing the product quickly.

The NFL works best with celebrities and patriotic.

youtubepivotedbrand <- youtube %>%
  filter(!is.na(view_count) & brand %in% c("Budweiser", "Coca-Cola", "Doritos", "NFL")) %>%
  pivot_longer(funny:use_sex, names_to = "type") %>%
  group_by(brand, type, value) %>%
  summarise(avgviews = mean(view_count))

## `summarise()` has grouped output by 'brand', 'type'. You can override using the `.groups` argument.

ggplot(youtubepivotedbrand, aes(x = fct_reorder(type, avgviews, sum), y = avgviews, fill = value)) + 
  geom_col(position = "dodge") +
  scale_y_continuous(labels = comma) +
  facet_wrap(~brand) +
    theme_minimal() +
  labs(x = "Category", 
       y = "Average View Count",
       title = "How Each Brand Performed in Each Category") +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10))

## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

What Combination Have the Most Popular Ads Been

Looking at the most successful ads, almost all of them have either been funny or patriotic. Also, no brand dominates the top 10 ads, 5 brands make an appearance in the top 10 and most appear more than once although Doritos clearly has had the most successful ad.

MostViews <- youtube %>%
  mutate(type = "",
         type = ifelse(funny == TRUE, paste(type, "f"), type),
         type = ifelse(show_product_quickly == TRUE, paste(type, "q"), type),
         type = ifelse(patriotic == TRUE, paste(type, "p"), type),
         type = ifelse(celebrity == TRUE, paste(type, "c"), type),
         type = ifelse(danger == TRUE, paste(type, "d"), type),
         type = ifelse(animals == TRUE, paste(type, "a"), type),
         type = ifelse(use_sex == TRUE, paste(type, "s"), type)
  ) %>%
  slice_max(view_count, n = 10)

ggplot(MostViews, aes(x = reorder(title, view_count), y = view_count, fill = brand)) +
  geom_col() +
  coord_flip() +
  geom_text(aes(label = type), hjust = -0.01) +
  theme_minimal() + 
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) + 
  ylim(0, 200000000) +
  labs(x = "Ad Title",
       y = "Total Views",
       title = "View Count of Top 10 Ads")

CSC 463 Final Project - What Kind of Advertisements are Shown During the SuperBowl