Ryan Nicholas Webscrapping Tutorial

To start off with I had to load in the neccesarry packages like instructed in the tutorial.

library(rvest)

Warning: package 'rvest' was built under R version 4.3.3

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()         masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag()            masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(plotly)

Warning: package 'plotly' was built under R version 4.3.3


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

library(highcharter)

Warning: package 'highcharter' was built under R version 4.3.3

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Highcharts (www.highcharts.com) is a Highsoft software product which is
not free for commercial and Governmental use

library(dplyr)
library(RColorBrewer)

Here as I continue the tutorial I need to use the url for movies.

#Specifying the url for desired website to be scraped
url <- 'https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100'
#Reading the HTML code from the website
webpage <- read_html(url)
# save_url(webpage, filename="webpage.htm"

Here I will load various elements and clean the data like the tutorial told me to.

rank_title_html <- html_elements(webpage, css='.ipc-title__text')
head(rank_title_html)

{xml_nodeset (6)}
[1] <h1 class="ipc-title__text">Advanced search</h1>
[2] <h3 class="ipc-title__text">1. The Gentlemen</h3>
[3] <h3 class="ipc-title__text">2. Ford v Ferrari</h3>
[4] <h3 class="ipc-title__text">3. Once Upon a Time... in Hollywood</h3>
[5] <h3 class="ipc-title__text">4. Midsommar</h3>
[6] <h3 class="ipc-title__text">5. Code 8</h3>

rank_title <- html_text(rank_title_html) #here i convert ranking title data to text.

#using the next command I will remove the advanced search and recently viewed data as they are not movie titles!
rank_title_data <- rank_title[-c(1, 102)]

head(rank_title_data)

[1] "1. The Gentlemen"                    "2. Ford v Ferrari"                  
[3] "3. Once Upon a Time... in Hollywood" "4. Midsommar"                       
[5] "5. Code 8"                           "6. Parasite"

length(rank_title_data)

[1] 100

#it is at 100 which means I did it right

Next I will scrape just the rank information.

rank_data <- parse_number(rank_title_data)

summary(rank_data)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   25.75   50.50   50.50   75.25  100.00

Next I will scrape the title information,

title_data <- str_sub(rank_title_data, start = 4L, end = -1L)

head(title_data)

[1] "The Gentlemen"                    "Ford v Ferrari"                  
[3] "Once Upon a Time... in Hollywood" "Midsommar"                       
[5] "Code 8"                           "Parasite"

length(title_data)

[1] 100

I will now start to scrape for description info.

#I use CSS selectors to scrape for description data
description_data_html <- html_elements(webpage, css='.ipc-html-content-inner-div')
#I then convert this data into text.
description_data <- html_text(description_data_html)

I then want to see the head of description data,

head(description_data)

[1] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                             
[2] "American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966."                
[3] "A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles."                                                                                       
[4] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."
[5] "A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother."                                                                                                               
[6] "Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."

It all works!!

length(description_data)

[1] 100

Its the correct length too!

I will then scrape for details information.

#I use CSS selectors again but this time to scrape a movies run time.
details_data_html <- html_elements(webpage, css = 'span.sc-b0691f29-8.ilsLEX.dli-title-metadata-item')

#I then convert this data into text.
details_data <- html_text(details_data_html)

head(details_data)

[1] "2019"   "1h 53m" "R"      "2019"   "2h 32m" "PG-13"

I now want to filter out just for the run time.

runtime_text <- details_data[grep("\\d+h", details_data)]
head(runtime_text)

[1] "1h 53m" "2h 32m" "2h 41m" "2h 28m" "1h 38m" "2h 12m"

I will then convert run time hours and mins to just mins.

converted_runtimes <- sapply(strsplit(runtime_text, "h |m"), function(x) as.numeric(x[1]) * 60 +
as.numeric(x[2]))

Warning in FUN(X[[i]], ...): NAs introduced by coercion

Warning in FUN(X[[i]], ...): NAs introduced by coercion

head(converted_runtimes)

[1] 113 152 161 148  98 132

length(converted_runtimes)

[1] 100

I then want to make sure movies match with run times with a temp data frame.

df1 <- data.frame(Title = title_data, Runtime = converted_runtimes)
df1

                                             Title Runtime
1                                    The Gentlemen     113
2                                   Ford v Ferrari     152
3                 Once Upon a Time... in Hollywood     161
4                                        Midsommar     148
5                                           Code 8      98
6                                         Parasite     132
7                                       Knives Out     130
8                                     Little Women     135
9                                Avengers: Endgame     181
10                                           Joker     122
11                                        The King     140
12                  Godzilla: King of the Monsters     132
13                                            1917     119
14                                    The Irishman     209
15   Star Wars: Episode IX - The Rise of Skywalker     141
16                                     Jojo Rabbit     108
17                                       Yesterday     116
18                                    Doctor Sleep     152
19                                           After     105
20                                       I See You      98
21                                           Waves     135
22                                  The Lighthouse     109
23                                   The Lion King     118
24                                      Uncut Gems     135
25                                         Aladdin     128
26                                  It Chapter Two     169
27                                            Anna     118
28                                    Ready or Not      95
29               John Wick: Chapter 3 - Parabellum     130
30                                      Saint Maud      84
31                         Jumanji: The Next Level     123
32                             Alita: Battle Angel     122
33                                              Us     116
34                                 Triple Frontier     125
35                                         Hellboy      NA
36                       Spider-Man: Far from Home     129
37                        The Peanut Butter Falcon      97
38                                           Glass     129
39                                  Captain Marvel     123
40                                         Curiosa     107
41                                 Queen of Hearts     127
42           Fast & Furious Presents: Hobbs & Shaw     137
43                                    The Platform      94
44             A Beautiful Day in the Neighborhood     109
45                                   Downton Abbey     122
46                                         Shazam!     132
47                                      21 Bridges      99
48      Extremely Wicked, Shockingly Evil and Vile     110
49                                           Crawl      87
50                           Terminator: Dark Fate     128
51                  Dora and the Lost City of Gold     102
52                                     Toy Story 4     100
53                                            Cats     110
54                                      Brightburn      90
55                          Zombieland: Double Tap      99
56                                    Dark Phoenix     113
57                 El Camino: A Breaking Bad Movie     122
58                                  Marriage Story     137
59                      Portrait of a Lady on Fire     122
60                                           Polar     118
61                                        Ad Astra     123
62                                       Frozen II     103
63                                            Burn      88
64                                       Bombshell     109
65                                     Dark Waters     126
66                                        Hustlers     110
67                                   6 Underground     128
68      How to Train Your Dragon: The Hidden World     104
69                                       Booksmart     102
70                                  The Highwaymen     132
71                                  Sound of Metal      NA
72                         A Rainy Day in New York      92
73                                          Midway     138
74                              The Dead Don't Die     104
75                                Charlie's Angels     118
76                                        Vivarium      97
77                                     Escape Room      99
78                                       Rocketman     121
79                     Men in Black: International     114
80                                  Murder Mystery      97
81                                Calm with Horses     100
82                         Fighting with My Family     108
83                                       Fractured      99
84                                  Richard Jewell     131
85                                        Serenity     106
86                      Pokémon: Detective Pikachu     104
87                                 Five Feet Apart     116
88                                Angel Has Fallen     121
89                                     The Outpost     123
90                                         Tolkien     112
91                                        The Dirt     107
92                                      Just Mercy     137
93                    Maleficent: Mistress of Evil     119
94                                     Guns Akimbo      98
95                               Rambo: Last Blood      89
96                                            Togo     113
97                       In the Shadow of the Moon     115
98                                     The Fanatic      88
99                                        Plus One      98
100                                    . Good Boys      90

I then want to scrape for voting information.

#I will use CSS selectors to do this.
votes_labels <- html_nodes(webpage, "span.ipc-rating-star--voteCount")
vote_text <- html_text(votes_labels)
head(vote_text)

[1] " (394K)" " (468K)" " (843K)" " (400K)" " (51K)"  " (951K)"

I will then clean the voting strings.

vote_text1 <- str_sub(vote_text, start = 3L, end = -2L)
vote_text2 <- gsub("K", "", vote_text1)

finally I will convert votes to numeric and multiply by 1000.

vote_number <- as.numeric(vote_text2)

Warning: NAs introduced by coercion

vote_values <- vote_number*1000
head(vote_values)

[1] 394000 468000 843000 400000  51000 951000

length(vote_values)

[1] 100

I will then combine all lists into a single data frame.

df_movies <- data.frame(rank = rank_data, title = title_data, description = description_data, runtime = converted_runtimes, votes = vote_values)
head(df_movies)

  rank                            title
1    1                    The Gentlemen
2    2                   Ford v Ferrari
3    3 Once Upon a Time... in Hollywood
4    4                        Midsommar
5    5                           Code 8
6    6                         Parasite
                                                                                                                                                                                                                                description
1                                              An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
2                 American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966.
3                                                                                        A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles.
4 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
5                                                                                                                A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother.
6                                                                                               Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
  runtime  votes
1     113 394000
2     152 468000
3     161 843000
4     148 400000
5      98  51000
6     132 951000

**Problem 1: Based on the scraped 2019 IMDB movie data frame,

create a scatterplot that shows runtime on the x-axis, number of votes on the y-axis. Be sure to provide a title, axis labels, and caption for the data source.**

p1 <- ggplot(df_movies, aes(x= runtime, y = votes)) + geom_point() + labs(title = "Runtime VS Number Of Votes", caption = "Source: IMDB") + xlab("Show Runtime (In Mins)") + ylab("Number Of Votes")
p1

Warning: Removed 4 rows containing missing values (`geom_point()`).

Problem 2: Use the filter function to answer the following question.

Which movie had a runtime 100-150 minutes AND highest votes from the top 10 highest ranked movies? I must see code that shows your filtering. Be sure to answer what the name of the movie is, what its runtime is and what its number of votes are.

ProbTwoData <- filter(df_movies, between(runtime,100,150) & rank <= 10 )
ProbTwoData

  rank         title
1    1 The Gentlemen
2    4     Midsommar
3    6      Parasite
4    7    Knives Out
5    8  Little Women
6   10         Joker
                                                                                                                                                                                                                                description
1                                              An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
2 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
3                                                                                               Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
4                                                                                                                                                    A detective investigates the death of the patriarch of an eccentric, combative family.
5                                                                           Jo March reflects back and forth on her life, telling the beloved story of the March sisters - four young women, each determined to live life on her own terms.
6                                                                 During the 1980s, a failed stand-up comedian is driven insane and turns to a life of crime and chaos in Gotham City while becoming an infamous psychopathic crime figure.
  runtime  votes
1     113 394000
2     148 400000
3     132 951000
4     130 771000
5     135 244000
6     122     NA

The movie that fits these requirements is parasite, the runtime for this movie is 132 and the number of votes is 951000. Parasites rank is 6.

Problem 3: In the runtime of 130-160 mins, which movie from the

lowest ranked 10 movies (out of 100) had highest votes?

Again, you must use the filter function to get the exact movie which answers this question. Be sure to state the rank and number of votes for this movie.

ProbThreeData<- filter(df_movies, between(runtime,130,160) & rank >= 90 )
ProbThreeData

  rank       title
1   92  Just Mercy
                                                                                                         description
1 World-renowned civil rights defense attorney Bryan Stevenson works to free a wrongly condemned death row prisoner.
  runtime votes
1     137 75000

Only one movie fits these parameters and its Just Mercy which has a run time of 137 and has 75000 votes. The rank of this movie is 92.

Problem 4: Create a categorical variable

1. Create an additional categorical variable, “rank_groups” that groups movies into ranks 1-20, 21-40, 41-60, 61-80, and 81-100. 2. Color your scatterplot from Problem #1 by this new variable. Be sure to use a NON-DEFAULT color palette. 3. Make the size of the point based on the runtime. 4. Be sure your plot includes a legend for the rank_groups. 5. Add interactivity to get the movie title on mouseover for that point.

ProbFourDF <- df_movies |>
  mutate(rank_groups = ifelse(rank >= 1 & rank <= 20,"1-20",
  ifelse(rank >= 21 & rank <= 40,"21-40",
  ifelse(rank >= 41 & rank <=60,"41-60",
  ifelse(rank >=61 & rank <=80, "61-80", "81-100"))))) #there could be a better way to do this but this worked the best for me I assign each value to a categorical variable using ifelse statements.

cols <- brewer.pal(5, "Set1") #making color pallete for chart

highchart() |>
  hc_add_series(data = ProbFourDF,
                type= "scatter", 
                hcaes(x=runtime, y=votes, group = rank_groups, size = runtime)) |>
  hc_colors(cols) |>
hc_title(text="Runtime VS Number Of Votes") |>
hc_caption(text= "Source: IMDB") |>
hc_xAxis(title = list(text= "Runtime In Mins")) |>
hc_yAxis(title = list(text="Number Of Votes"), min = 0) |>
hc_plotOptions(series = list(marker = list(symbol = "circle"))) |>
hc_legend(align = "right", verticalAlign = "top") |> hc_tooltip(borderColor = "black", pointFormat = "Movie Title: {point.title} <br> Movie Rank: {point.rank} out of 100 <br> Number of Votes : {point.votes} <br> Runtime: {point.runtime}")