# load data
library(tidyverse)
library(scales)
library(infer)
library(psych)
library(httr)
library(jsonlite)
The Data Set was obtained from Kaggle. This dataset was collected using the YouTube API.
Loading the Data.
#Get the videos csv
raw_video_df <- read_csv(file="https://raw.githubusercontent.com/georg4re/ds606/main/data/USvideos.csv",quote = "\"")
## Parsed with column specification:
## cols(
## video_id = col_character(),
## trending_date = col_character(),
## title = col_character(),
## channel_title = col_character(),
## category_id = col_double(),
## publish_time = col_datetime(format = ""),
## tags = col_character(),
## views = col_double(),
## likes = col_double(),
## dislikes = col_double(),
## comment_count = col_double(),
## thumbnail_link = col_character(),
## comments_disabled = col_logical(),
## ratings_disabled = col_logical(),
## video_error_or_removed = col_logical(),
## description = col_character()
## )
## Warning: 1533544 parsing failures.
## row col expected actual file
## 2 tags delimiter or quote | 'https://raw.githubusercontent.com/georg4re/ds606/main/data/USvideos.csv'
## 2 tags delimiter or quote l 'https://raw.githubusercontent.com/georg4re/ds606/main/data/USvideos.csv'
## 2 tags delimiter or quote | 'https://raw.githubusercontent.com/georg4re/ds606/main/data/USvideos.csv'
## 2 tags delimiter or quote j 'https://raw.githubusercontent.com/georg4re/ds606/main/data/USvideos.csv'
## 2 tags delimiter or quote | 'https://raw.githubusercontent.com/georg4re/ds606/main/data/USvideos.csv'
## ... .... .................. ...... .........................................................................
## See problems(...) for more details.
#get the categories JSON
url <- paste("https://raw.githubusercontent.com/georg4re/ds606/main/data/US_category_id.json", sep="")
res <- GET(url)
data <- fromJSON(rawToChar(res$content))
category_df <- data$items %>%
flatten(.) %>%
rename(category=snippet.title)
Joining the data and the Categories
category_df <- category_df %>%
rename(category_id = id)
category_df$category_id <- as.numeric(category_df$category_id)
video_df <- raw_video_df %>%
left_join(category_df) %>%
select(video_id,
trending_date,
title,
channel_title,
category,
publish_time,
tags,
views,
likes,
dislikes,
comment_count,
comments_disabled,
ratings_disabled,
video_error_or_removed,
description
)
## Joining, by = "category_id"
A snippet
## Rows: 40,949
## Columns: 15
## $ video_id <chr> "2kyS6SvSYSE", "1ZAPwfrtAFY", "5qpjK5DgCt4",...
## $ trending_date <chr> "17.14.11", "17.14.11", "17.14.11", "17.14.1...
## $ title <chr> "WE WANT TO TALK ABOUT OUR MARRIAGE", "The T...
## $ channel_title <chr> "CaseyNeistat", "LastWeekTonight", "Rudy Man...
## $ category <chr> "People & Blogs", "Entertainment", "Comedy",...
## $ publish_time <dttm> 2017-11-13 17:13:01, 2017-11-13 07:30:00, 2...
## $ tags <chr> "SHANtell martin", "last week tonight trump ...
## $ views <dbl> 748374, 2418783, 3191434, 343168, 2095731, 1...
## $ likes <dbl> 57527, 97185, 146033, 10172, 132235, 9763, 1...
## $ dislikes <dbl> 2966, 6146, 5339, 666, 1989, 511, 2445, 778,...
## $ comment_count <dbl> 15954, 12703, 8181, 2146, 17518, 1434, 1970,...
## $ comments_disabled <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA...
## $ ratings_disabled <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA...
## $ video_error_or_removed <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA...
## $ description <chr> "SHANTELL'S CHANNEL - https://www.youtube.co...
video_id | trending_date | title | channel_title | category | publish_time | tags | views | likes | dislikes | comment_count | comments_disabled | ratings_disabled | video_error_or_removed |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2kyS6SvSYSE | 17.14.11 | WE WANT TO TALK ABOUT OUR MARRIAGE | CaseyNeistat | People & Blogs | 2017-11-13 17:13:01 | SHANtell martin | 748374 | 57527 | 2966 | 15954 | FALSE | FALSE | FALSE |
1ZAPwfrtAFY | 17.14.11 | The Trump Presidency: Last Week Tonight with John Oliver (HBO) | LastWeekTonight | Entertainment | 2017-11-13 07:30:00 | last week tonight trump presidency“|”last week tonight donald trump“|”john oliver trump“|”donald trump | 2418783 | 97185 | 6146 | 12703 | FALSE | FALSE | FALSE |
5qpjK5DgCt4 | 17.14.11 | Racist Superman | Rudy Mancuso, King Bach & Lele Pons | Rudy Mancuso | Comedy | 2017-11-12 19:05:24 | racist superman“|”rudy“|”mancuso“|”king“|”bach“|”racist“|”superman“|”love“|”rudy mancuso poo bear black white official music video“|”iphone x by pineapple“|”lelepons“|”hannahstocking“|”rudymancuso“|”inanna“|”anwar“|”sarkis“|”shots“|”shotsstudios“|”alesso“|”anitta“|”brazil“|”Getting My Driver’s License | Lele Pons | 3191434 | 146033 | 5339 | 8181 | FALSE | FALSE | FALSE |
puqaWrEC7tY | 17.14.11 | Nickelback Lyrics: Real or Fake? | Good Mythical Morning | Entertainment | 2017-11-13 11:00:04 | rhett and link“|”gmm“|”good mythical morning“|”rhett and link good mythical morning“|”good mythical morning rhett and link“|”mythical morning“|”Season 12“|”nickelback lyrics“|”nickelback lyrics real or fake“|”nickelback“|”nickelback songs“|”nickelback song“|”rhett link nickelback“|”gmm nickelback“|”lyrics (website category)“|”nickelback (musical group)“|”rock“|”music“|”lyrics“|”chad kroeger“|”canada“|”music (industry)“|”mythical“|”gmm challenge“|”comedy“|”funny“|”challenge | 343168 | 10172 | 666 | 2146 | FALSE | FALSE | FALSE |
d380meD0W0M | 17.14.11 | I Dare You: GOING BALD!? | nigahiga | Entertainment | 2017-11-12 18:01:41 | ryan“|”higa“|”higatv“|”nigahiga“|”i dare you“|”idy“|”rhpc“|”dares“|”no truth“|”comments“|”comedy“|”funny“|”stupid“|”fail | 2095731 | 132235 | 1989 | 17518 | FALSE | FALSE | FALSE |
gHZ1Qz0KiKM | 17.14.11 | 2 Weeks with iPhone X | iJustine | Science & Technology | 2017-11-13 19:07:23 | ijustine“|”week with iPhone X“|”iphone x“|”apple“|”iphone“|”iphone x review“|”iphone x unboxing | 119180 | 9763 | 511 | 1434 | FALSE | FALSE | FALSE |
39idVpFF7NQ | 17.14.11 | Roy Moore & Jeff Sessions Cold Open - SNL | Saturday Night Live | Entertainment | 2017-11-12 05:37:17 | SNL“|”Saturday Night Live“|”SNL Season 43“|”Episode 1730“|”Tiffany Haddish“|”Roy Moore“|”Mikey Day“|”Mike Pence“|”Beck Bennett“|”Jeff Sessions“|”Kate McKinnon“|”s43“|”s43e5“|”episode 5“|”live“|”new york“|”comedy“|”sketch“|”funny“|”hilarious“|”late night“|”host“|”music“|”guest“|”laugh“|”impersonation“|”actor“|”improv“|”musician“|”comedian“|”actress“|”If Loving You Is Wrong“|”Oprah Winfrey“|”OWN“|”Girls Trip“|”The Carmichael Show“|”Keanu“|”Taylor Swift“|”Reputation“|”Look What You Made Me Do“|”ready for it?“|”cold open | 2103417 | 15993 | 2445 | 1970 | FALSE | FALSE | FALSE |
nc99ccSXST0 | 17.14.11 | 5 Ice Cream Gadgets put to the Test | CrazyRussianHacker | Science & Technology | 2017-11-12 21:50:37 | 5 Ice Cream Gadgets“|”Ice Cream“|”Cream Sandwich Maker“|”gadgets“|”gadget review“|”review“|”unboxing“|”kitchen gadgets“|”Gadgets put to the Test“|”testing“|”10 Kitchen Gadgets“|”7 Camping Coffee Gadgets“|”10 Kitchen Gadgets put to the Test | 817732 | 23663 | 778 | 3432 | FALSE | FALSE | FALSE |
jr9QtXwC9vc | 17.14.11 | The Greatest Showman | Official Trailer 2 [HD] | 20th Century FOX | 20th Century Fox | Film & Animation | 2017-11-13 14:00:23 | Trailer“|”Hugh Jackman“|”Michelle Williams“|”Zac Efron“|”Zendaya“|”Rebecca Ferguson“|”pasek and paul“|”la la land“|”moulin rouge“|”high school musical“|”hugh jackman musical“|”zac efron musical“|”musical“|”the greatest showman“|”greatest showman“|”Michael Gracey“|”P.T. Barnum“|”Barnum and Bailey“|”Barnum Circus“|”Barnum and Bailey Circus“|”20th century fox“|”greatest showman trailer“|”trailer“|”official trailer“|”the greatest showman trailer“|”logan“|”Benj Pasek“|”Justin Paul | 826059 | 3543 | 119 | 340 | FALSE | FALSE | FALSE |
TUmyygCMMGA | 17.14.11 | Why the rise of the robots won’t mean the end of work | Vox | News & Politics | 2017-11-13 13:45:16 | vox.com“|”vox“|”explain“|”shift change“|”future of work“|”automation“|”robots“|”jobs“|”technological unemployment“|”joss fong“|”heidi shierholz“|”martin ford“|”rise of the robots“|”humans“|”workers“|”employment“|”economics“|”macroeconomics“|”silicon valley“|”basic income | 256426 | 12654 | 1363 | 2368 | FALSE | FALSE | FALSE |
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for. Is it possible to predict based on these variables or a combination of them the popularity of a youtube video in America?
What are the cases, and how many are there? Each observation represents a video in Youtube. There are 40,949 observations.
Describe the method of data collection. Data was obtained from a Kaggle data set.
What type of study is this (observational/experiment)? This is an observational study based on the obervations captured in this data.
If you collected the data, state self-collected. If not, provide a citation/link. Data was obtained from a Kaggle data set.
What is the response variable? Is it quantitative or qualitative? The response variable will be the prediction. It is qualitative.
You should have two independent variables, one quantitative and one qualitative. Category, likes, comments and tags. Likes is quantitative, the others are qualitative.
Provide summary statistics for each the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
Summary Statistics
## video_id trending_date title channel_title
## Length:40949 Length:40949 Length:40949 Length:40949
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## category publish_time tags
## Length:40949 Min. :2006-07-23 08:24:11 Length:40949
## Class :character 1st Qu.:2017-12-27 21:00:00 Class :character
## Mode :character Median :2018-02-21 16:19:27 Mode :character
## Mean :2018-02-11 01:00:49
## 3rd Qu.:2018-04-16 17:20:26
## Max. :2018-06-14 01:31:53
## views likes dislikes comment_count
## Min. : 549 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 242329 1st Qu.: 5424 1st Qu.: 202 1st Qu.: 614
## Median : 681861 Median : 18091 Median : 631 Median : 1856
## Mean : 2360785 Mean : 74267 Mean : 3711 Mean : 8447
## 3rd Qu.: 1823157 3rd Qu.: 55417 3rd Qu.: 1938 3rd Qu.: 5755
## Max. :225211923 Max. :5613827 Max. :1674420 Max. :1361580
## comments_disabled ratings_disabled video_error_or_removed description
## Mode :logical Mode :logical Mode :logical Length:40949
## FALSE:40316 FALSE:40780 FALSE:40926 Class :character
## TRUE :633 TRUE :169 TRUE :23 Mode :character
##
##
##
## vars n mean sd median trimmed mad min
## views 1 40949 2360784.6 7394113.76 681861 1054836.27 813077.11 549
## likes 2 40949 74266.7 228885.34 18091 32156.33 23496.24 0
## dislikes 3 40949 3711.4 29029.71 631 1137.46 797.64 0
## max range skew kurtosis se
## views 225211923 225211374 12.24 232.34 36539.66
## likes 5613827 5613827 10.92 177.82 1131.09
## dislikes 1674420 1674420 40.19 1987.08 143.46
We see a clear tendency of some categories to gather more views than others.
video_categories <- video_df %>%
group_by(category) %>%
summarise(
views_sum = sum(views),
likes_sum = sum(likes),
dislikes_sum = sum(dislikes))
## `summarise()` ungrouping output (override with `.groups` argument)
category | views_sum | likes_sum | dislikes_sum |
---|---|---|---|
Autos & Vehicles | 520690717 | 4245656 | 243010 |
Comedy | 5117426208 | 216346746 | 7230391 |
Education | 1180629990 | 49257772 | 1351972 |
Entertainment | 20604388195 | 530516491 | 42987663 |
Film & Animation | 7284156721 | 165997476 | 6075148 |
Gaming | 2141218625 | 69038284 | 9184466 |
Howto & Style | 4078545064 | 162880075 | 5473899 |
Music | 40132892190 | 1416838584 | 51179008 |
News & Politics | 1473765704 | 18151033 | 4180049 |
Nonprofits & Activism | 168941392 | 14815646 | 3310381 |
People & Blogs | 4917191726 | 186615999 | 10187901 |
Pets & Animals | 764651989 | 19370702 | 527379 |
Science & Technology | 3487756816 | 82532638 | 4548402 |
Shows | 51501058 | 1082639 | 24508 |
Sports | 4404456673 | 98621211 | 5133551 |
Travel & Events | 343557084 | 4836246 | 340427 |
ggplot(video_categories, aes(factor(category), likes_sum, fill = category)) +
geom_bar(stat="identity", position = "dodge") +
scale_fill_brewer(palette = "Set1") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
We can see the Music category seems to be the one gathering more likes. Further analysis is needed to identify and analyse the tags associated with the different videos and how the presence of these tags might help answer the initial question. …