library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(reprex)
library(readr)
Movie_Ratings.csv
Download link: https://urldefense.com/v3/__https://www.dropbox.com/s/ebr2gzy95pb9lsx/Movie*20Ratings.csv?dl=1__;JQ!!BZ50a36bapWJ!43rCQauXRJbBH5rrZGHgcy0j9uY3ewa9J5kaubTBMW8B-1bi3zkTlfGEFLsiCIcpWZg$
Variables:
imdb_1000.csv
Download link: https://urldefense.com/v3/__https://www.dropbox.com/s/ov5cntaof9lj9v6/imdb_1000.csv?dl=1__;!!BZ50a36bapWJ!43rCQauXRJbBH5rrZGHgcy0j9uY3ewa9J5kaubTBMW8B-1bi3zkTlfGEFLsisKt5jA0$
Variables:
library(readr)
# Load the Rotten Tomatoes ratings from the working directory; readr guesses
# the column types and prints the specification it settled on.
rt <- read_csv(file = "Movie Ratings.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Film = col_character(),
## Genre = col_character(),
## `Rotten Tomatoes Ratings %` = col_double(),
## `Audience Ratings %` = col_double(),
## `Budget (million $)` = col_double(),
## `Year of release` = col_double()
## )
# Load the IMDb table from the working directory; readr guesses the column
# types and prints the specification it settled on.
imdb <- read_csv(file = "imdb_1000.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## star_rating = col_double(),
## title = col_character(),
## content_rating = col_character(),
## genre = col_character(),
## duration = col_double(),
## actors_list = col_character()
## )
Best practice: look at and summarize your data
# Per-column overview of the Rotten Tomatoes data.
rt %>% summary()
## Film Genre Rotten Tomatoes Ratings %
## Length:562 Length:562 Min. : 0.0
## Class :character Class :character 1st Qu.:25.0
## Mode :character Mode :character Median :46.0
## Mean :47.4
## 3rd Qu.:70.0
## Max. :97.0
## Audience Ratings % Budget (million $) Year of release
## Min. : 0.00 Min. : 0.0 Min. :2007
## 1st Qu.:47.00 1st Qu.: 20.0 1st Qu.:2008
## Median :58.00 Median : 35.0 Median :2009
## Mean :58.83 Mean : 50.1 Mean :2009
## 3rd Qu.:72.00 3rd Qu.: 65.0 3rd Qu.:2010
## Max. :96.00 Max. :300.0 Max. :2011
# Per-column overview of the IMDb data.
imdb %>% summary()
## star_rating title content_rating genre
## Min. :7.40 Length:979 Length:979 Length:979
## 1st Qu.:7.60 Class :character Class :character Class :character
## Median :7.80 Mode :character Mode :character Mode :character
## Mean :7.89
## 3rd Qu.:8.10
## Max. :9.30
## duration actors_list
## Min. : 64 Length:979
## 1st Qu.:102 Class :character
## Median :117 Mode :character
## Mean :121
## 3rd Qu.:134
## Max. :242
No missing values: summary() reports no NA's for any of the numeric columns.
Arranged from highest to lowest: Western movies had the highest mean rating by genre.
# Mean IMDb star rating per genre, listed from the best-rated genre down.
imdb %>%
  group_by(genre) %>%
  summarise(mean_rating = mean(star_rating)) %>%
  arrange(desc(mean_rating))
## # A tibble: 16 x 2
## genre mean_rating
## <chr> <dbl>
## 1 Western 8.26
## 2 Film-Noir 8.03
## 3 History 8
## 4 Mystery 7.98
## 5 Adventure 7.93
## 6 Sci-Fi 7.92
## 7 Crime 7.92
## 8 Animation 7.91
## 9 Drama 7.90
## 10 Action 7.88
## 11 Biography 7.86
## 12 Family 7.85
## 13 Comedy 7.82
## 14 Horror 7.81
## 15 Fantasy 7.7
## 16 Thriller 7.68
just asking for top rating
# Keep only the genre with the highest mean star rating.
# slice_max() replaces top_n(), which is superseded as of dplyr 1.0.0; like
# top_n() it keeps ties by default (with_ties = TRUE).
imdb %>%
  group_by(genre) %>%
  summarise(mean_rating = mean(star_rating)) %>%
  slice_max(mean_rating, n = 1)
## # A tibble: 1 x 2
## genre mean_rating
## <chr> <dbl>
## 1 Western 8.26
make a boxplot to see how long the movies are for each content rating
# One box per content rating, showing the distribution of running times.
ggplot(imdb, aes(x = content_rating, y = duration)) +
  geom_boxplot()
Rated G movies are longer on average
library(dplyr)
dataset is too wide
# Reshape to long format: the two rating columns are stacked into a single
# `Rating` column, and `Type` records which column each value came from.
rt_long <- pivot_longer(
  rt,
  cols = c(`Rotten Tomatoes Ratings %`, `Audience Ratings %`),
  names_to = "Type",
  values_to = "Rating"
)
rt_long
## # A tibble: 1,124 x 6
## Film Genre `Budget (million … `Year of releas… Type Rating
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 (500) Days … Comedy 8 2009 Rotten Tomat… 87
## 2 (500) Days … Comedy 8 2009 Audience Rat… 81
## 3 10,000 B.C. Advent… 105 2008 Rotten Tomat… 9
## 4 10,000 B.C. Advent… 105 2008 Audience Rat… 44
## 5 12 Rounds Action 20 2009 Rotten Tomat… 30
## 6 12 Rounds Action 20 2009 Audience Rat… 52
## 7 127 Hours Advent… 18 2010 Rotten Tomat… 93
## 8 127 Hours Advent… 18 2010 Audience Rat… 84
## 9 17 Again Comedy 20 2009 Rotten Tomat… 55
## 10 17 Again Comedy 20 2009 Audience Rat… 70
## # … with 1,114 more rows
# Critic and audience bars drawn side by side for every Romance film.
ggplot(
  filter(rt_long, Genre == "Romance"),
  aes(x = Film, y = Rating, fill = Type)
) +
  geom_col(position = "dodge")
Audience vs. Critic next to each other… not able to expand it without crashing R
# Average audience score for each genre.
rt %>%
  group_by(Genre) %>%
  summarise(avg_rating = mean(`Audience Ratings %`))
## # A tibble: 7 x 2
## Genre avg_rating
## <chr> <dbl>
## 1 Action 58.7
## 2 Adventure 62.7
## 3 Comedy 56.4
## 4 Drama 64.4
## 5 Horror 47.4
## 6 Romance 62.3
## 7 Thriller 65.6
# Average audience score for every genre/year combination.
# .groups = "drop" returns an ungrouped tibble, so no grouping leaks into later
# steps and summarise() no longer prints its regrouping message.
rt %>%
  group_by(Genre, `Year of release`) %>%
  summarise(
    avg_rating = mean(`Audience Ratings %`),
    .groups = "drop"
  )
## # A tibble: 34 x 3
## Genre `Year of release` avg_rating
## <chr> <dbl> <dbl>
## 1 Action 2007 65.3
## 2 Action 2008 60.1
## 3 Action 2009 56.0
## 4 Action 2010 56.7
## 5 Action 2011 59.1
## 6 Adventure 2007 84
## 7 Adventure 2008 54.1
## 8 Adventure 2009 55.6
## 9 Adventure 2010 74.8
## 10 Adventure 2011 81
## # … with 24 more rows
# Average audience score per genre and year, spread to one column per genre.
# .groups = "drop" hands pivot_wider() an ungrouped tibble and silences the
# regrouping message summarise() would otherwise print.
rt %>%
  group_by(Genre, `Year of release`) %>%
  summarise(
    avg_rating = mean(`Audience Ratings %`),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = Genre,
    values_from = avg_rating
  )
## # A tibble: 5 x 8
## `Year of release` Action Adventure Comedy Drama Horror Romance Thriller
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007 65.3 84 64.1 70 55.1 70 73.5
## 2 2008 60.1 54.1 58.0 61.1 43.5 75 57.3
## 3 2009 56.0 55.6 54.3 59.1 51.2 38 NA
## 4 2010 56.7 74.8 51.4 61.2 36.9 53 57
## 5 2011 59.1 81 58.1 72.9 48.8 58.5 65.9
# Yearly gap between the average audience score of Comedy and Drama films.
# .groups = "drop" hands pivot_wider() an ungrouped tibble and silences the
# regrouping message summarise() would otherwise print.
rt %>%
  group_by(Genre, `Year of release`) %>%
  summarise(
    avg_rating = mean(`Audience Ratings %`),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = Genre,
    values_from = avg_rating
  ) %>%
  # Negative values mean Drama scored higher than Comedy that year.
  mutate(diff = Comedy - Drama) %>%
  arrange(`Year of release`)
## # A tibble: 5 x 9
## `Year of release` Action Adventure Comedy Drama Horror Romance Thriller diff
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007 65.3 84 64.1 70 55.1 70 73.5 -5.91
## 2 2008 60.1 54.1 58.0 61.1 43.5 75 57.3 -3.15
## 3 2009 56.0 55.6 54.3 59.1 51.2 38 NA -4.82
## 4 2010 56.7 74.8 51.4 61.2 36.9 53 57 -9.76
## 5 2011 59.1 81 58.1 72.9 48.8 58.5 65.9 -14.8
People preferred Drama over Comedy in every year from 2007 to 2011 (the Comedy - Drama difference in average audience rating is negative throughout).
left_join() in dplyr prioritizes the first dataset without changing it: given A and B, every row of A is kept and the matching columns from B are added.
full_join() in dplyr retains all information from either dataset: it takes the two sources and combines them into one dataset.
semi_join() in dplyr is a filtering join (the inner join itself is inner_join()): it retains only the observations of the first dataset that also appear in the second, and keeps only the first dataset's columns. ** You need to be careful: like an inner join, it drops every observation that does not have a match in both datasets.
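anti_join() in dplyr is also a filtering join (not an outer join): it keeps the observations of the first dataset that have no match in the second. Rarely used; a shortcut to see what data is in one dataset but not the other.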
names(rt)
## [1] "Film" "Genre"
## [3] "Rotten Tomatoes Ratings %" "Audience Ratings %"
## [5] "Budget (million $)" "Year of release"
names(imdb)
## [1] "star_rating" "title" "content_rating" "genre"
## [5] "duration" "actors_list"
# Rotten Tomatoes films whose title also appears in the IMDb table.
semi_join(rt, imdb, by = c("Film" = "title"))
## # A tibble: 64 x 6
## Film Genre `Rotten Tomatoes Ra… `Audience Rating… `Budget (million…
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 (500) Days … Comedy 87 81 8
## 2 127 Hours Advent… 93 84 18
## 3 50/50 Comedy 93 93 8
## 4 A Nightmare… Horror 13 40 35
## 5 Across the … Romance 54 84 45
## 6 Alice in Wo… Advent… 52 72 200
## 7 American Ga… Thrill… 79 87 100
## 8 Avatar Action 83 92 237
## 9 Black Swan Drama 88 86 13
## 10 Changeling Drama 62 84 55
## # … with 54 more rows, and 1 more variable: Year of release <dbl>
names(rt)
## [1] "Film" "Genre"
## [3] "Rotten Tomatoes Ratings %" "Audience Ratings %"
## [5] "Budget (million $)" "Year of release"
names(imdb)
## [1] "star_rating" "title" "content_rating" "genre"
## [5] "duration" "actors_list"
# Number of Rotten Tomatoes films whose title also appears in the IMDb table.
nrow(semi_join(rt, imdb, by = c("Film" = "title")))
## [1] 64
There are 64 movies that appear in both datasets
names(rt)
## [1] "Film" "Genre"
## [3] "Rotten Tomatoes Ratings %" "Audience Ratings %"
## [5] "Budget (million $)" "Year of release"
names(imdb)
## [1] "star_rating" "title" "content_rating" "genre"
## [5] "duration" "actors_list"
# Number of rt films that also have a matching title in imdb.
nrow(semi_join(rt, imdb, by = c("Film" = "title")))
## [1] 64
# Total number of films in rt, whether or not they have a match in imdb.
nrow(rt)
## [1] 562
rt has 562 movies in total; 562 - 64 = 498 of them are in rt and not in imdb
names(rt)
## [1] "Film" "Genre"
## [3] "Rotten Tomatoes Ratings %" "Audience Ratings %"
## [5] "Budget (million $)" "Year of release"
names(imdb)
## [1] "star_rating" "title" "content_rating" "genre"
## [5] "duration" "actors_list"
# Number of rt films that also have a matching title in imdb.
nrow(semi_join(rt, imdb, by = c("Film" = "title")))
## [1] 64
# Total number of films in imdb, whether or not they have a match in rt.
nrow(imdb)
## [1] 979
imdb has 979 movies in total; 914 of them appear only in imdb (see the anti_join() result below)
# IMDb films with no title match in the Rotten Tomatoes table.
anti_join(imdb, rt, by = c("title" = "Film"))
## # A tibble: 914 x 6
## star_rating title content_rating genre duration actors_list
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawshank… R Crime 142 [u'Tim Robbins', u…
## 2 9.2 The Godfather R Crime 175 [u'Marlon Brando',…
## 3 9.1 The Godfather… R Crime 200 [u'Al Pacino', u'R…
## 4 8.9 Pulp Fiction R Crime 154 [u'John Travolta',…
## 5 8.9 12 Angry Men NOT RATED Drama 96 [u'Henry Fonda', u…
## 6 8.9 The Good, the… NOT RATED Weste… 161 [u'Clint Eastwood'…
## 7 8.9 The Lord of t… PG-13 Adven… 201 [u'Elijah Wood', u…
## 8 8.9 Schindler's L… R Biogr… 195 [u'Liam Neeson', u…
## 9 8.9 Fight Club R Drama 139 [u'Brad Pitt', u'E…
## 10 8.8 The Lord of t… PG-13 Adven… 178 [u'Elijah Wood', u…
## # … with 904 more rows
# Rotten Tomatoes films with no title match in the IMDb table.
anti_join(rt, imdb, by = c("Film" = "title"))
## # A tibble: 498 x 6
## Film Genre `Rotten Tomatoes Rat… `Audience Rating… `Budget (million…
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 10,000 B.C. Advent… 9 44 105
## 2 12 Rounds Action 30 52 20
## 3 17 Again Comedy 55 70 20
## 4 2012 Action 39 63 200
## 5 27 Dresses Comedy 40 71 30
## 6 30 Days of… Horror 50 57 32
## 7 30 Minutes… Comedy 43 48 28
## 8 88 Minutes Drama 5 51 30
## 9 A Dangerou… Drama 79 89 20
## 10 A Serious … Drama 89 64 7
## # … with 488 more rows, and 1 more variable: Year of release <dbl>
example error ``{r} library(reprex) penguins %>% summarize(mean(bill_length_mm)) reprex()
example error
``{r}
library(tidyverse)
# Intentionally failing example: palmerpenguins is only attached in the
# corrected version further down, so `penguins` is presumably not available
# yet when this pipeline runs.
penguins %>%
summarize(mean(bill_length_mm))
need to load data penguins..
library(tidyverse)
library(palmerpenguins)
# Mean bill length across all penguins. mean() returns NA as soon as one value
# is missing, so missing measurements are dropped explicitly with na.rm = TRUE.
penguins %>%
  summarise(mean_bill_length_mm = mean(bill_length_mm, na.rm = TRUE))
## # A tibble: 1 x 1
## mean_bill_length_mm
## <dbl>
## 1 43.9
head(imdb)
## # A tibble: 6 x 6
## star_rating title content_rating genre duration actors_list
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawsh… R Crime 142 [u'Tim Robbins', u'Morg…
## 2 9.2 The Godfat… R Crime 175 [u'Marlon Brando', u'Al…
## 3 9.1 The Godfat… R Crime 200 [u'Al Pacino', u'Robert…
## 4 9 The Dark K… PG-13 Acti… 152 [u'Christian Bale', u'H…
## 5 8.9 Pulp Ficti… R Crime 154 [u'John Travolta', u'Um…
## 6 8.9 12 Angry M… NOT RATED Drama 96 [u'Henry Fonda', u'Lee …
separate, turns one variable to multiple variables
head(imdb)
## # A tibble: 6 x 6
## star_rating title content_rating genre duration actors_list
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawsh… R Crime 142 [u'Tim Robbins', u'Morg…
## 2 9.2 The Godfat… R Crime 175 [u'Marlon Brando', u'Al…
## 3 9.1 The Godfat… R Crime 200 [u'Al Pacino', u'Robert…
## 4 9 The Dark K… PG-13 Acti… 152 [u'Christian Bale', u'H…
## 5 8.9 Pulp Ficti… R Crime 154 [u'John Travolta', u'Um…
## 6 8.9 12 Angry M… NOT RATED Drama 96 [u'Henry Fonda', u'Lee …
# Split the actor list at the first comma only: "Actor 1" receives the first
# name, and extra = "merge" keeps everything after it together in
# "Other Actors" instead of discarding the remaining names with a warning.
imdb %>%
  separate(
    actors_list,
    into = c("Actor 1", "Other Actors"),
    sep = ",",
    extra = "merge"
  )
## # A tibble: 979 x 7
## star_rating title content_rating genre duration `Actor 1` `Other Actors`
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 9.3 The Shaw… R Crime 142 [u'Tim R… " u'Morgan Fr…
## 2 9.2 The Godf… R Crime 175 [u'Marlo… " u'Al Pacino…
## 3 9.1 The Godf… R Crime 200 [u'Al Pa… " u'Robert De…
## 4 9 The Dark… PG-13 Action 152 [u'Chris… " u'Heath Led…
## 5 8.9 Pulp Fic… R Crime 154 [u'John … " u'Uma Thurm…
## 6 8.9 12 Angry… NOT RATED Drama 96 [u'Henry… " u'Lee J. Co…
## 7 8.9 The Good… NOT RATED Weste… 161 [u'Clint… " u'Eli Walla…
## 8 8.9 The Lord… PG-13 Adven… 201 [u'Elija… " u'Viggo Mor…
## 9 8.9 Schindle… R Biogr… 195 [u'Liam … " u'Ralph Fie…
## 10 8.9 Fight Cl… R Drama 139 [u'Brad … " u'Edward No…
## # … with 969 more rows
head(imdb)
## # A tibble: 6 x 6
## star_rating title content_rating genre duration actors_list
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawsh… R Crime 142 [u'Tim Robbins', u'Morg…
## 2 9.2 The Godfat… R Crime 175 [u'Marlon Brando', u'Al…
## 3 9.1 The Godfat… R Crime 200 [u'Al Pacino', u'Robert…
## 4 9 The Dark K… PG-13 Acti… 152 [u'Christian Bale', u'H…
## 5 8.9 Pulp Ficti… R Crime 154 [u'John Travolta', u'Um…
## 6 8.9 12 Angry M… NOT RATED Drama 96 [u'Henry Fonda', u'Lee …
# Split the comma-separated actor list into one column per name.
separate(
  imdb,
  col = actors_list,
  into = c("Actor 1", "Actor 2", "Actor 3"),
  sep = ","
)
## # A tibble: 979 x 8
## star_rating title content_rating genre duration `Actor 1` `Actor 2` `Actor 3`
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 9.3 The … R Crime 142 [u'Tim R… " u'Morg… " u'Bob …
## 2 9.2 The … R Crime 175 [u'Marlo… " u'Al P… " u'Jame…
## 3 9.1 The … R Crime 200 [u'Al Pa… " u'Robe… " u'Robe…
## 4 9 The … PG-13 Acti… 152 [u'Chris… " u'Heat… " u'Aaro…
## 5 8.9 Pulp… R Crime 154 [u'John … " u'Uma … " u'Samu…
## 6 8.9 12 A… NOT RATED Drama 96 [u'Henry… " u'Lee … " u'Mart…
## 7 8.9 The … NOT RATED West… 161 [u'Clint… " u'Eli … " u'Lee …
## 8 8.9 The … PG-13 Adve… 201 [u'Elija… " u'Vigg… " u'Ian …
## 9 8.9 Schi… R Biog… 195 [u'Liam … " u'Ralp… " u'Ben …
## 10 8.9 Figh… R Drama 139 [u'Brad … " u'Edwa… " u'Hele…
## # … with 969 more rows
Top 10 star ratings
head(imdb)
## # A tibble: 6 x 6
## star_rating title content_rating genre duration actors_list
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawsh… R Crime 142 [u'Tim Robbins', u'Morg…
## 2 9.2 The Godfat… R Crime 175 [u'Marlon Brando', u'Al…
## 3 9.1 The Godfat… R Crime 200 [u'Al Pacino', u'Robert…
## 4 9 The Dark K… PG-13 Acti… 152 [u'Christian Bale', u'H…
## 5 8.9 Pulp Ficti… R Crime 154 [u'John Travolta', u'Um…
## 6 8.9 12 Angry M… NOT RATED Drama 96 [u'Henry Fonda', u'Lee …
# Ten highest-rated films, with the actor list split into one column per name.
# slice_max() replaces top_n(), which is superseded as of dplyr 1.0.0; ties
# are kept by default, as they were with top_n().
imdb %>%
  separate(
    actors_list,
    into = c("Actor 1", "Actor 2", "Actor 3"),
    sep = ","
  ) %>%
  slice_max(star_rating, n = 10)
## # A tibble: 10 x 8
## star_rating title content_rating genre duration `Actor 1` `Actor 2` `Actor 3`
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 9.3 The … R Crime 142 [u'Tim R… " u'Morg… " u'Bob …
## 2 9.2 The … R Crime 175 [u'Marlo… " u'Al P… " u'Jame…
## 3 9.1 The … R Crime 200 [u'Al Pa… " u'Robe… " u'Robe…
## 4 9 The … PG-13 Acti… 152 [u'Chris… " u'Heat… " u'Aaro…
## 5 8.9 Pulp… R Crime 154 [u'John … " u'Uma … " u'Samu…
## 6 8.9 12 A… NOT RATED Drama 96 [u'Henry… " u'Lee … " u'Mart…
## 7 8.9 The … NOT RATED West… 161 [u'Clint… " u'Eli … " u'Lee …
## 8 8.9 The … PG-13 Adve… 201 [u'Elija… " u'Vigg… " u'Ian …
## 9 8.9 Schi… R Biog… 195 [u'Liam … " u'Ralp… " u'Ben …
## 10 8.9 Figh… R Drama 139 [u'Brad … " u'Edwa… " u'Hele…
unite data
head(imdb)
## # A tibble: 6 x 6
## star_rating title content_rating genre duration actors_list
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawsh… R Crime 142 [u'Tim Robbins', u'Morg…
## 2 9.2 The Godfat… R Crime 175 [u'Marlon Brando', u'Al…
## 3 9.1 The Godfat… R Crime 200 [u'Al Pacino', u'Robert…
## 4 9 The Dark K… PG-13 Acti… 152 [u'Christian Bale', u'H…
## 5 8.9 Pulp Ficti… R Crime 154 [u'John Travolta', u'Um…
## 6 8.9 12 Angry M… NOT RATED Drama 96 [u'Henry Fonda', u'Lee …
# Ten highest-rated films, with the actor list split into one column per name.
# slice_max() replaces top_n(), which is superseded as of dplyr 1.0.0; ties
# are kept by default, as they were with top_n().
imdb %>%
  separate(
    actors_list,
    into = c("Actor 1", "Actor 2", "Actor 3"),
    sep = ","
  ) %>%
  slice_max(star_rating, n = 10)
## # A tibble: 10 x 8
## star_rating title content_rating genre duration `Actor 1` `Actor 2` `Actor 3`
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 9.3 The … R Crime 142 [u'Tim R… " u'Morg… " u'Bob …
## 2 9.2 The … R Crime 175 [u'Marlo… " u'Al P… " u'Jame…
## 3 9.1 The … R Crime 200 [u'Al Pa… " u'Robe… " u'Robe…
## 4 9 The … PG-13 Acti… 152 [u'Chris… " u'Heat… " u'Aaro…
## 5 8.9 Pulp… R Crime 154 [u'John … " u'Uma … " u'Samu…
## 6 8.9 12 A… NOT RATED Drama 96 [u'Henry… " u'Lee … " u'Mart…
## 7 8.9 The … NOT RATED West… 161 [u'Clint… " u'Eli … " u'Lee …
## 8 8.9 The … PG-13 Adve… 201 [u'Elija… " u'Vigg… " u'Ian …
## 9 8.9 Schi… R Biog… 195 [u'Liam … " u'Ralp… " u'Ben …
## 10 8.9 Figh… R Drama 139 [u'Brad … " u'Edwa… " u'Hele…
# Combine content rating and genre into a single column, joined by "_".
unite(imdb, col = "content_ genre", content_rating, genre, sep = "_")
## # A tibble: 979 x 5
## star_rating title `content_ genre` duration actors_list
## <dbl> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawshank R… R_Crime 142 [u'Tim Robbins', u'Mo…
## 2 9.2 The Godfather R_Crime 175 [u'Marlon Brando', u'…
## 3 9.1 The Godfather: … R_Crime 200 [u'Al Pacino', u'Robe…
## 4 9 The Dark Knight PG-13_Action 152 [u'Christian Bale', u…
## 5 8.9 Pulp Fiction R_Crime 154 [u'John Travolta', u'…
## 6 8.9 12 Angry Men NOT RATED_Drama 96 [u'Henry Fonda', u'Le…
## 7 8.9 The Good, the B… NOT RATED_Weste… 161 [u'Clint Eastwood', u…
## 8 8.9 The Lord of the… PG-13_Adventure 201 [u'Elijah Wood', u'Vi…
## 9 8.9 Schindler's List R_Biography 195 [u'Liam Neeson', u'Ra…
## 10 8.9 Fight Club R_Drama 139 [u'Brad Pitt', u'Edwa…
## # … with 969 more rows
or any titles that say “and”
head(imdb)
## # A tibble: 6 x 6
## star_rating title content_rating genre duration actors_list
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawsh… R Crime 142 [u'Tim Robbins', u'Morg…
## 2 9.2 The Godfat… R Crime 175 [u'Marlon Brando', u'Al…
## 3 9.1 The Godfat… R Crime 200 [u'Al Pacino', u'Robert…
## 4 9 The Dark K… PG-13 Acti… 152 [u'Christian Bale', u'H…
## 5 8.9 Pulp Ficti… R Crime 154 [u'John Travolta', u'Um…
## 6 8.9 12 Angry M… NOT RATED Drama 96 [u'Henry Fonda', u'Lee …
# Ten highest-rated films, with the actor list split into one column per name.
# slice_max() replaces top_n(), which is superseded as of dplyr 1.0.0; ties
# are kept by default, as they were with top_n().
imdb %>%
  separate(
    actors_list,
    into = c("Actor 1", "Actor 2", "Actor 3"),
    sep = ","
  ) %>%
  slice_max(star_rating, n = 10)
## # A tibble: 10 x 8
## star_rating title content_rating genre duration `Actor 1` `Actor 2` `Actor 3`
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 9.3 The … R Crime 142 [u'Tim R… " u'Morg… " u'Bob …
## 2 9.2 The … R Crime 175 [u'Marlo… " u'Al P… " u'Jame…
## 3 9.1 The … R Crime 200 [u'Al Pa… " u'Robe… " u'Robe…
## 4 9 The … PG-13 Acti… 152 [u'Chris… " u'Heat… " u'Aaro…
## 5 8.9 Pulp… R Crime 154 [u'John … " u'Uma … " u'Samu…
## 6 8.9 12 A… NOT RATED Drama 96 [u'Henry… " u'Lee … " u'Mart…
## 7 8.9 The … NOT RATED West… 161 [u'Clint… " u'Eli … " u'Lee …
## 8 8.9 The … PG-13 Adve… 201 [u'Elija… " u'Vigg… " u'Ian …
## 9 8.9 Schi… R Biog… 195 [u'Liam … " u'Ralp… " u'Ben …
## 10 8.9 Figh… R Drama 139 [u'Brad … " u'Edwa… " u'Hele…
# Combine content rating and genre into a single column, joined by " and ".
unite(imdb, col = "content_ genre", content_rating, genre, sep = " and ")
## # A tibble: 979 x 5
## star_rating title `content_ genre` duration actors_list
## <dbl> <chr> <chr> <dbl> <chr>
## 1 9.3 The Shawshank … R and Crime 142 [u'Tim Robbins', u'Mo…
## 2 9.2 The Godfather R and Crime 175 [u'Marlon Brando', u'…
## 3 9.1 The Godfather:… R and Crime 200 [u'Al Pacino', u'Robe…
## 4 9 The Dark Knight PG-13 and Action 152 [u'Christian Bale', u…
## 5 8.9 Pulp Fiction R and Crime 154 [u'John Travolta', u'…
## 6 8.9 12 Angry Men NOT RATED and Dr… 96 [u'Henry Fonda', u'Le…
## 7 8.9 The Good, the … NOT RATED and We… 161 [u'Clint Eastwood', u…
## 8 8.9 The Lord of th… PG-13 and Advent… 201 [u'Elijah Wood', u'Vi…
## 9 8.9 Schindler's Li… R and Biography 195 [u'Liam Neeson', u'Ra…
## 10 8.9 Fight Club R and Drama 139 [u'Brad Pitt', u'Edwa…
## # … with 969 more rows
rearrange columns, normally used for yourself to arrange data and read it easier
# Move `title` so that it sits immediately before `star_rating`.
relocate(imdb, title, .before = star_rating)
## # A tibble: 979 x 6
## title star_rating content_rating genre duration actors_list
## <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 The Shawshan… 9.3 R Crime 142 [u'Tim Robbins', u'…
## 2 The Godfather 9.2 R Crime 175 [u'Marlon Brando', …
## 3 The Godfathe… 9.1 R Crime 200 [u'Al Pacino', u'Ro…
## 4 The Dark Kni… 9 PG-13 Action 152 [u'Christian Bale',…
## 5 Pulp Fiction 8.9 R Crime 154 [u'John Travolta', …
## 6 12 Angry Men 8.9 NOT RATED Drama 96 [u'Henry Fonda', u'…
## 7 The Good, th… 8.9 NOT RATED Weste… 161 [u'Clint Eastwood',…
## 8 The Lord of … 8.9 PG-13 Adven… 201 [u'Elijah Wood', u'…
## 9 Schindler's … 8.9 R Biogr… 195 [u'Liam Neeson', u'…
## 10 Fight Club 8.9 R Drama 139 [u'Brad Pitt', u'Ed…
## # … with 969 more rows
# Move `title` in front of whichever column is currently first.
relocate(imdb, title, .before = 1)
## # A tibble: 979 x 6
## title star_rating content_rating genre duration actors_list
## <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 The Shawshan… 9.3 R Crime 142 [u'Tim Robbins', u'…
## 2 The Godfather 9.2 R Crime 175 [u'Marlon Brando', …
## 3 The Godfathe… 9.1 R Crime 200 [u'Al Pacino', u'Ro…
## 4 The Dark Kni… 9 PG-13 Action 152 [u'Christian Bale',…
## 5 Pulp Fiction 8.9 R Crime 154 [u'John Travolta', …
## 6 12 Angry Men 8.9 NOT RATED Drama 96 [u'Henry Fonda', u'…
## 7 The Good, th… 8.9 NOT RATED Weste… 161 [u'Clint Eastwood',…
## 8 The Lord of … 8.9 PG-13 Adven… 201 [u'Elijah Wood', u'…
## 9 Schindler's … 8.9 R Biogr… 195 [u'Liam Neeson', u'…
## 10 Fight Club 8.9 R Drama 139 [u'Brad Pitt', u'Ed…
## # … with 969 more rows
across
to find the mean and sd of the Rotten Tomatoes critic rating and audience rating.
names(rt)
## [1] "Film" "Genre"
## [3] "Rotten Tomatoes Ratings %" "Audience Ratings %"
## [5] "Budget (million $)" "Year of release"
# Average Rotten Tomatoes critic score over all films.
summarise(rt, mean_critic = mean(`Rotten Tomatoes Ratings %`))
## # A tibble: 1 x 1
## mean_critic
## <dbl>
## 1 47.4
names(rt)
## [1] "Film" "Genre"
## [3] "Rotten Tomatoes Ratings %" "Audience Ratings %"
## [5] "Budget (million $)" "Year of release"
# Mean and standard deviation of both Rotten Tomatoes rating columns.
# Naming the functions in the list yields "<column>_mean" / "<column>_sd"
# output columns instead of the positional "<column>_1" / "<column>_2".
rt %>%
  summarise(
    across(
      c("Rotten Tomatoes Ratings %", "Audience Ratings %"),
      list(mean = mean, sd = sd)
    )
  )
## # A tibble: 1 x 4
## `Rotten Tomatoes Ra… `Rotten Tomatoes Ra… `Audience Ratings… `Audience Rating…
## <dbl> <dbl> <dbl> <dbl>
## 1 47.4 26.4 58.8 16.8
Make a plot comparing the ratings from Rotten Tomatoes Critics, Rotten Tomatoes Audience, and imdb.
# Keep every film from either source, matching rt's Film to imdb's title.
rating_full <- full_join(rt, imdb, by = c("Film" = "title"))
rating_full
## # A tibble: 1,477 x 11
## Film Genre `Rotten Tomatoes Rat… `Audience Rating… `Budget (million…
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 (500) Days… Comedy 87 81 8
## 2 10,000 B.C. Advent… 9 44 105
## 3 12 Rounds Action 30 52 20
## 4 127 Hours Advent… 93 84 18
## 5 17 Again Comedy 55 70 20
## 6 2012 Action 39 63 200
## 7 27 Dresses Comedy 40 71 30
## 8 30 Days of… Horror 50 57 32
## 9 30 Minutes… Comedy 43 48 28
## 10 50/50 Comedy 93 93 8
## # … with 1,467 more rows, and 6 more variables: Year of release <dbl>,
## # star_rating <dbl>, content_rating <chr>, genre <chr>, duration <dbl>,
## # actors_list <chr>
Find and plot averages as bar plot
# Rebuild the full join (every film from either source), then average each
# rating column, skipping the NAs of films that lack that rating.
rating_full <- full_join(rt, imdb, by = c("Film" = "title"))
rating_full %>%
  summarise(
    across(
      c("Rotten Tomatoes Ratings %", "Audience Ratings %", "star_rating"),
      function(col) mean(col, na.rm = TRUE)
    )
  )
## # A tibble: 1 x 3
## `Rotten Tomatoes Ratings %` `Audience Ratings %` star_rating
## <dbl> <dbl> <dbl>
## 1 47.5 58.9 7.89
make barplot
# Rebuild the full join, average each rating column (skipping NAs), stack the
# three averages into Source/Rating pairs and draw one bar per source.
rating_full <- full_join(rt, imdb, by = c("Film" = "title"))
rating_full %>%
  summarise(
    across(
      c("Rotten Tomatoes Ratings %", "Audience Ratings %", "star_rating"),
      function(col) mean(col, na.rm = TRUE)
    )
  ) %>%
  pivot_longer(cols = everything(), names_to = "Source", values_to = "Rating") %>%
  ggplot(aes(x = Source, y = Rating)) +
  geom_col()
The two datasets don't match up…
The imdb star rating is on a 1-10 scale, while the Rotten Tomatoes ratings are on a 0-100 scale.
imdb covers the top 1000 movies; Rotten Tomatoes covers 2007-2011 only.
# Restrict the comparison to films that have a title match in both sources.
rating_full <- inner_join(rt, imdb, by = c("Film" = "title"))
rating_full_long <- rating_full %>%
  # Multiply the star rating by 10 so it is comparable with the two
  # percentage columns.
  mutate(star_rating = star_rating * 10) %>%
  summarise(
    across(
      c("Rotten Tomatoes Ratings %", "Audience Ratings %", "star_rating"),
      function(col) mean(col, na.rm = TRUE)
    )
  ) %>%
  pivot_longer(cols = everything(), names_to = "Source", values_to = "Rating")
# One bar per rating source, showing its average over the matched films.
ggplot(rating_full_long, aes(x = Source, y = Rating)) +
  geom_col()
NOW: Data only contains movies that are found in both datasets
The End!