IMDB tutorial

Author

Allan Maino Vieytes

library(rvest)
Warning: package 'rvest' was built under R version 4.3.3
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()         masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag()            masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
Warning: package 'plotly' was built under R version 4.3.3

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(XML)
#Specifying the url for desired website to be scraped
url <- 'https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100'
#Reading the HTML code from the website
webpage <- read_html(url)
# save_url(webpage, filename="webpage.html")
#Use CSS selectors to scrape the rankings section
rank_title_html <- html_elements(webpage, css='.ipc-title__text')
head(rank_title_html)
{xml_nodeset (6)}
[1] <h1 class="ipc-title__text">Advanced search</h1>
[2] <h3 class="ipc-title__text">1. The Gentlemen</h3>
[3] <h3 class="ipc-title__text">2. Ford v Ferrari</h3>
[4] <h3 class="ipc-title__text">3. Once Upon a Time... in Hollywood</h3>
[5] <h3 class="ipc-title__text">4. Midsommar</h3>
[6] <h3 class="ipc-title__text">5. Code 8</h3>
  #Convert the ranking data to text
rank_title <- html_text(rank_title_html)
9
[1] 9
#Remove the first and last rows - they are not movie titles
rank_title_data <- rank_title[-c(1, 102)]
#Let's have a look at the rankings
head(rank_title_data)
[1] "1. The Gentlemen"                    "2. Ford v Ferrari"                  
[3] "3. Once Upon a Time... in Hollywood" "4. Midsommar"                       
[5] "5. Code 8"                           "6. Parasite"                        
# notice that the format is "rank. title"
length(rank_title_data)
[1] 100
#should be 100
# remove the title and extract just the number
rank_data <- parse_number(rank_title_data)
summary(rank_data)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   25.75   50.50   50.50   75.25  100.00 
# Use the rank_title_data and extract just the characters from the title
title_data <- str_sub(rank_title_data, start = 4L, end = -1L)
head(title_data) #check first 6 titles
[1] "The Gentlemen"                    "Ford v Ferrari"                  
[3] "Once Upon a Time... in Hollywood" "Midsommar"                       
[5] "Code 8"                           "Parasite"                        
length(title_data) # check number of titles - should be 100
[1] 100
#Use CSS selectors to scrape the description section
description_data_html <- html_elements(webpage, css='.ipc-html-content-inner-div')
#Convert the description data to text
description_data <- html_text(description_data_html)
#Let's have a look at the description data
head(description_data)
[1] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                             
[2] "American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966."                
[3] "A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles."                                                                                       
[4] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."
[5] "A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother."                                                                                                               
[6] "Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."                                                                                              
length(description_data)
[1] 100
#Use CSS selectors to scrape the Movie runtime
details_data_html <- html_elements(webpage, css = 'span.sc-b0691f29-8.ilsLEX.dli-title-metadata-item')
#Convert the description data to text
details_data <- html_text(details_data_html)
head(details_data)
[1] "2019"   "1h 53m" "R"      "2019"   "2h 32m" "PG-13" 
details_data_html <- html_elements(webpage, css = 'span.sc-b0691f29-8.ilsLEX.dli-title-metadataitem')
# Filter out the movie runtimes in the form "Xh XXm" from details_data
runtime_text <- details_data[grep("\\d+h", details_data)]
head(runtime_text)
[1] "1h 53m" "2h 32m" "2h 41m" "2h 28m" "1h 38m" "2h 12m"
# Convert runtime_text from hours and minutes to minutes
converted_runtimes <- sapply(strsplit(runtime_text, "h |m"), function(x) as.numeric(x[1]) * 60 +
as.numeric(x[2]))
Warning in FUN(X[[i]], ...): NAs introduced by coercion

Warning in FUN(X[[i]], ...): NAs introduced by coercion
# Display the converted movie runtimes
head(converted_runtimes)
[1] 113 152 161 148  98 132
length(converted_runtimes)
[1] 100
# Display the titles of movies with missing runtimes and their corresponding runtimes
df1 <- data.frame(Title = title_data, Runtime = converted_runtimes)
df1
                                             Title Runtime
1                                    The Gentlemen     113
2                                   Ford v Ferrari     152
3                 Once Upon a Time... in Hollywood     161
4                                        Midsommar     148
5                                           Code 8      98
6                                         Parasite     132
7                                       Knives Out     130
8                                     Little Women     135
9                                Avengers: Endgame     181
10                                           Joker     122
11                                        The King     140
12                  Godzilla: King of the Monsters     132
13                                            1917     119
14                                    The Irishman     209
15   Star Wars: Episode IX - The Rise of Skywalker     141
16                                     Jojo Rabbit     108
17                                       Yesterday     116
18                                    Doctor Sleep     152
19                                           After     105
20                                       I See You      98
21                                           Waves     135
22                                  The Lighthouse     109
23                                   The Lion King     118
24                                      Uncut Gems     135
25                                         Aladdin     128
26                                  It Chapter Two     169
27                                            Anna     118
28                                    Ready or Not      95
29               John Wick: Chapter 3 - Parabellum     130
30                                      Saint Maud      84
31                         Jumanji: The Next Level     123
32                             Alita: Battle Angel     122
33                                              Us     116
34                                 Triple Frontier     125
35                                         Hellboy      NA
36                       Spider-Man: Far from Home     129
37                        The Peanut Butter Falcon      97
38                                           Glass     129
39                                  Captain Marvel     123
40                                         Curiosa     107
41                                 Queen of Hearts     127
42           Fast & Furious Presents: Hobbs & Shaw     137
43                                    The Platform      94
44             A Beautiful Day in the Neighborhood     109
45                                   Downton Abbey     122
46                                         Shazam!     132
47                                      21 Bridges      99
48      Extremely Wicked, Shockingly Evil and Vile     110
49                                           Crawl      87
50                           Terminator: Dark Fate     128
51                  Dora and the Lost City of Gold     102
52                                     Toy Story 4     100
53                                            Cats     110
54                                      Brightburn      90
55                          Zombieland: Double Tap      99
56                                    Dark Phoenix     113
57                 El Camino: A Breaking Bad Movie     122
58                                  Marriage Story     137
59                      Portrait of a Lady on Fire     122
60                                           Polar     118
61                                        Ad Astra     123
62                                       Frozen II     103
63                                            Burn      88
64                                       Bombshell     109
65                                     Dark Waters     126
66                                        Hustlers     110
67                                   6 Underground     128
68      How to Train Your Dragon: The Hidden World     104
69                                       Booksmart     102
70                                  The Highwaymen     132
71                                  Sound of Metal      NA
72                         A Rainy Day in New York      92
73                                          Midway     138
74                              The Dead Don't Die     104
75                                Charlie's Angels     118
76                                        Vivarium      97
77                                     Escape Room      99
78                                       Rocketman     121
79                     Men in Black: International     114
80                                  Murder Mystery      97
81                                Calm with Horses     100
82                         Fighting with My Family     108
83                                       Fractured      99
84                                  Richard Jewell     131
85                                        Serenity     106
86                      Pokémon: Detective Pikachu     104
87                                 Five Feet Apart     116
88                                Angel Has Fallen     121
89                                     The Outpost     123
90                                         Tolkien     112
91                                        The Dirt     107
92                                      Just Mercy     137
93                    Maleficent: Mistress of Evil     119
94                                     Guns Akimbo      98
95                               Rambo: Last Blood      89
96                                            Togo     113
97                       In the Shadow of the Moon     115
98                                     The Fanatic      88
99                                        Plus One      98
100                                    . Good Boys      90
# Use CSS selectors to scrape the number of votes
votes_labels <- html_nodes(webpage, "span.ipc-rating-star--voteCount")
vote_text <- html_text(votes_labels)
head(vote_text)
[1] " (393K)" " (468K)" " (842K)" " (400K)" " (51K)"  " (951K)"
vote_text1 <- str_sub(vote_text, start = 3L, end = -2L)
vote_text2 <- gsub("K", "", vote_text1)
vote_number <- as.numeric(vote_text2) 
Warning: NAs introduced by coercion
vote_values <- vote_number*1000
head(vote_values)
[1] 393000 468000 842000 400000  51000 951000
length(vote_values)
[1] 100
# Display the movies with missing or invalid runtimes
df_movies <- data.frame(rank = rank_data, 
                        title = title_data, 
                        description = description_data, 
                        runtime = converted_runtimes, 
                        votes = vote_values)
head(df_movies)
  rank                            title
1    1                    The Gentlemen
2    2                   Ford v Ferrari
3    3 Once Upon a Time... in Hollywood
4    4                        Midsommar
5    5                           Code 8
6    6                         Parasite
                                                                                                                                                                                                                                description
1                                              An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
2                 American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966.
3                                                                                        A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles.
4 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
5                                                                                                                A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother.
6                                                                                               Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
  runtime  votes
1     113 393000
2     152 468000
3     161 842000
4     148 400000
5      98  51000
6     132 951000

Problems 1-4

Problem 1: Based on the scraped 2019 IMDB movie data frame, create a scatterplot that shows runtime on the x-axis, number of votes on the y-axis. Be sure to provide a title, axis labels, and caption for the data source.

ggplot

p.1 <-
  ggplot(df_movies, aes( y = votes/100000, x = runtime ) ) +
  geom_point() +
labs( title = "IMDB Top Movies: Comparison of Runtimes & Votes", # Labels title
        x = "Movie Runtime (minutes)", # Labels x axis
        y = "# of Votes (expressed in hundreds of thousands)", # Labels y axis
        caption = "Source: IMDB" ) + # Adds Source
    theme_linedraw() + # sets the theme for the graphs, it is the reason they are dark
    theme(
        aspect.ratio =0.8, # Made the overall size of the visualization smaller
        axis.title.x = element_text( size=14 ), # Changes size of X-axis Label
        axis.title.y = element_text( size=10 ), # Changes size of Y-axis Label
        axis.text = element_text( size = 9 ), # changes axes text sizes
        plot.title = element_text( size = 17, # Changes size of Title
                                   face = "bold", # Boldens Title
                                   hjust = 0.5, ), # Centers the Title to the Plots
        plot.caption = element_text( hjust = 0.5, # Centers the caption to the plots
                                    face = "italic" ))  # Italicizes the caption
p.1
Warning: Removed 4 rows containing missing values (`geom_point()`).

Problem 2: Use the filter function to answer the following question. Which movie had a runtime 100-150 minutes AND highest votes from the top 10 highest ranked movies? I must see code that shows your filtering. Be sure to answer what the name of the movie is, what its runtime is and what its number of votes are.

Top10-Parasite on Top with Runtime of 132 & 951,000 Votes

filtered_movies <- df_movies %>% #creates filtered_movies from df_movies
  filter(runtime >= 100 & runtime <= 150) %>% #filteres for movies btw 100-150 runtime
  arrange(desc(votes)) %>% #arranges votes in descending order
  head(10) #shows top 10 of dataframe

filtered_movies
   rank                                          title
1     6                                       Parasite
2     7                                     Knives Out
3    13                                           1917
4    39                                 Captain Marvel
5    36                      Spider-Man: Far from Home
6    15  Star Wars: Episode IX - The Rise of Skywalker
7    16                                    Jojo Rabbit
8    29              John Wick: Chapter 3 - Parabellum
9     4                                      Midsommar
10    1                                  The Gentlemen
                                                                                                                                                                                                                                 description
1                                                                                                Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
2                                                                                                                                                     A detective investigates the death of the patriarch of an eccentric, combative family.
3           April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
4                                                                                             Carol Danvers becomes one of the universe's most powerful heroes when Earth is caught in the middle of a galactic war between two alien races.
5                                                                                                      Following the events of Avengers: Endgame (2019), Spider-Man must step up to take on new threats in a world that has changed forever.
6                                                                                                       In the riveting conclusion of the landmark Skywalker saga, new legends will be born-and the final battle for freedom is yet to come.
7                                                             A young German boy in the Hitler Youth whose hero and imaginary friend is the country's dictator is shocked to discover that his mother is hiding a Jewish girl in their home.
8                                                      John Wick is on the run after killing a member of the international assassins' guild, and with a $14 million price tag on his head, he is the target of hit men and women everywhere.
9  A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
10                                              An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
   runtime  votes
1      132 951000
2      130 771000
3      119 671000
4      123 609000
5      129 554000
6      141 491000
7      108 439000
8      130 424000
9      148 400000
10     113 393000

Problem 3: In the runtime of 130-160 mins, which movie from the lowest ranked 10 movies (out of 100) had highest votes? Again, you must use the filter function to get the exact movie which answers this question. Be sure to state the rank and number of votes for this movie.

Bottom10-Waves is on Bottom with Runtime of 135 & 30,000 Votes

filtered_movies.2 <- df_movies %>% #creates filtered_movies from df_movies
  filter(runtime >= 130 & runtime <= 160) %>% #filteres for movies btw 130-160 runtime
  arrange(votes) %>% #arranges votes in descending order
  head(10) #shows top 10 of dataframe

filtered_movies.2
   rank                                  title
1    21                                  Waves
2    92                             Just Mercy
3    73                                 Midway
4    84                         Richard Jewell
5    70                         The Highwaymen
6    11                               The King
7    12         Godzilla: King of the Monsters
8    18                           Doctor Sleep
9    42  Fast & Furious Presents: Hobbs & Shaw
10    8                           Little Women
                                                                                                                                                                                                         description
1                                   Traces the journey of a suburban family - led by a well-intentioned but domineering father - as they navigate love, forgiveness, and coming together in the aftermath of a loss.
2                                                                                                 World-renowned civil rights defense attorney Bryan Stevenson works to free a wrongly condemned death row prisoner.
3                                                                                                                              The story of the Battle of Midway, told by the leaders and the sailors who fought it.
4      Security guard Richard Jewell is an instant hero after foiling a bomb attack at the 1996 Atlanta Olympics, but his life becomes a nightmare when the FBI leaks to the media that he is a suspect in the case.
5                                                                                                                               The untold true story of the legendary detectives who brought down Bonnie and Clyde.
6                                                                    Young Henry V encounters deceit, war and treachery after becoming King of England in the 15th century, in the aftermath of his brother's death.
7  The crypto-zoological agency Monarch faces off against a battery of god-sized monsters, including the mighty Godzilla, who collides with Mothra, Rodan, and his ultimate nemesis, the three-headed King Ghidorah.
8  Years following the events of The Shining (1980), a now-adult Dan Torrance must protect a young girl with similar powers from a cult known as The True Knot, who prey on children with powers to remain immortal.
9                                                                   Lawman Luke Hobbs and outcast Deckard Shaw form an unlikely alliance when a cyber-genetically enhanced villain threatens the future of humanity.
10                                                   Jo March reflects back and forth on her life, telling the beloved story of the March sisters - four young women, each determined to live life on her own terms.
   runtime  votes
1      135  30000
2      137  75000
3      138  94000
4      131  97000
5      132 103000
6      140 152000
7      132 204000
8      152 218000
9      137 235000
10     135 244000

Problem 4: Create a categorical variable 1. Create an additional categorical variable, “rank_groups” that groups movies into ranks 1-20, 21-40, 41-60, 61-80, and 81-100. 2. Color your scatterplot from Problem #1 by this new variable. Be sure to use a NON-DEFAULT color palette. 3. Make the size of the point based on the runtime. 4. Be sure your plot includes a legend for the rank_groups. 5. Add interactivity to get the movie title on mouseover for that point.

df_movies_groups

df_movies_groups <- df_movies %>%
  mutate(rank_groups = cut(row_number(), breaks = c(1, 20, 40, 60, 80, 100), labels = c("1-20", "21-40", "41-60", "61-80", "81-100")))

ggplot.2

p.2 <-
  ggplot(df_movies_groups, aes( y = votes/100000, x = runtime, color = rank_groups, size = runtime, text = paste("Title:", title ))) +
  geom_point() +
  scale_color_viridis_d( option = "viridis", name = "Rank Groups" ) +
labs( title = "IMDB Top Movies: Comparison of Runtimes & Votes", # Labels title
        x = "Movie Runtime (minutes)", # Labels x axis
        y = "# of Votes (expressed in hundreds of thousands)", # Labels y axis
        caption = "Source: IMDB", 
        color = "Rank Groups",
        size = "Runtime" ) + # Adds Source
  
    theme_linedraw() + # sets the theme for the graphs, it is the reason they are dark
    theme(
        aspect.ratio =0.8, # Made the overall size of the visualization smaller
        axis.title.x = element_text( size=14 ), # Changes size of X-axis Label
        axis.title.y = element_text( size=10 ), # Changes size of Y-axis Label
        axis.text = element_text( size = 9 ), # changes axes text sizes
        plot.title = element_text( size = 17, # Changes size of Title
                                   face = "bold", # Boldens Title
                                   hjust = 0.5, ), # Centers the Title to the Plots
        plot.caption = element_text( hjust = 0.5, # Centers the caption to the plots
                                    face = "italic" ) ,
        legend.background = element_blank(), # Makes the background of the legend box blank
        legend.box.background = element_rect( color = "black" ),
        legend.position = c( .825, 0.723 ), # Changes legend position
        legend.title = element_text( size = 8.5, face = "bold" ), # Changes the legend title text size
        legend.text=element_text( size = 8.5 )) + # Changes the legend text size))  
  guides( size = "none" )
p.2
Warning: Removed 5 rows containing missing values (`geom_point()`).

Plotly sucks :)

ggplotly(p.2, tooltip = "text") 
Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width

Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width