Webscrapping- Jason Laucel

url <- 'https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-
31&count=100'

 url <- 'https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100'
webpage <- read_html(url)

#Use CSS selectors to scrape the rankings section
rank_title_html <- html_elements(webpage, css='.ipc-title__text')
head(rank_title_html)

{xml_nodeset (6)}
[1] <h1 class="ipc-title__text">Advanced search</h1>
[2] <h3 class="ipc-title__text">1. The Gentlemen</h3>
[3] <h3 class="ipc-title__text">2. Ford v Ferrari</h3>
[4] <h3 class="ipc-title__text">3. Once Upon a Time... in Hollywood</h3>
[5] <h3 class="ipc-title__text">4. Midsommar</h3>
[6] <h3 class="ipc-title__text">5. Code 8</h3>

rank_title <- html_text(rank_title_html)
9

[1] 9

#Remove the first and last rows - they are not movie titles
rank_title_data <- rank_title[-c(1, 102)]

#Let's have a look at the rankings
head(rank_title_data)

[1] "1. The Gentlemen"                    "2. Ford v Ferrari"                  
[3] "3. Once Upon a Time... in Hollywood" "4. Midsommar"                       
[5] "5. Code 8"                           "6. Parasite"

length(rank_title_data)

[1] 100

#should be 100

# remove the title and extract just the number
rank_data <- parse_number(rank_title_data)

summary(rank_data)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   25.75   50.50   50.50   75.25  100.00

# Use the rank_title_data and extract just the characters from the title
title_data <- str_sub(rank_title_data, start = 4L, end = -1L)

head(title_data)

[1] "The Gentlemen"                    "Ford v Ferrari"                  
[3] "Once Upon a Time... in Hollywood" "Midsommar"                       
[5] "Code 8"                           "Parasite"

length(title_data)

[1] 100

#Use CSS selectors to scrape the description section
description_data_html <- html_elements(webpage, css='.ipc-html-content-inner-div')
description_data <- html_text(description_data_html)
head(description_data)

[1] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                             
[2] "American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966."                
[3] "A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles."                                                                                       
[4] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."
[5] "A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother."                                                                                                               
[6] "Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."

#Use CSS selectors to scrape the Movie runtime
details_data_html <- html_elements(webpage, css = 'span.sc-b0691f29-8.ilsLEX.dli-title-metadata-item')

#Convert the description data to text
details_data <- html_text(details_data_html)
head(details_data)

[1] "2019"   "1h 53m" "R"      "2019"   "2h 32m" "PG-13"

# Filter out the movie runtimes in the form "Xh XXm" from details_data

runtime_text <- details_data[grep("\\d+h", details_data)]
head(runtime_text)

[1] "1h 53m" "2h 32m" "2h 41m" "2h 28m" "1h 38m" "2h 12m"

# Convert runtime_text from hours and minutes to minutes
converted_runtimes <- sapply(strsplit(runtime_text, "h |m"), function(x) as.numeric(x[1]) * 60 +
as.numeric(x[2]))

Warning in FUN(X[[i]], ...): NAs introduced by coercion

Warning in FUN(X[[i]], ...): NAs introduced by coercion

# Display the converted movie runtimes
head(converted_runtimes)

[1] 113 152 161 148  98 132

length(converted_runtimes)

[1] 100

# Display the titles of movies with missing runtimes and their corresponding runtimes
df1 <- data.frame(Title = title_data, Runtime = converted_runtimes)
df1

                                             Title Runtime
1                                    The Gentlemen     113
2                                   Ford v Ferrari     152
3                 Once Upon a Time... in Hollywood     161
4                                        Midsommar     148
5                                           Code 8      98
6                                         Parasite     132
7                                       Knives Out     130
8                                     Little Women     135
9                                Avengers: Endgame     181
10                                           Joker     122
11                                        The King     140
12                  Godzilla: King of the Monsters     132
13                                            1917     119
14                                    The Irishman     209
15   Star Wars: Episode IX - The Rise of Skywalker     141
16                                     Jojo Rabbit     108
17                                       Yesterday     116
18                                    Doctor Sleep     152
19                                           After     105
20                                       I See You      98
21                                           Waves     135
22                                  The Lighthouse     109
23                                   The Lion King     118
24                                      Uncut Gems     135
25                                         Aladdin     128
26                                  It Chapter Two     169
27                                            Anna     118
28                                    Ready or Not      95
29               John Wick: Chapter 3 - Parabellum     130
30                                      Saint Maud      84
31                         Jumanji: The Next Level     123
32                             Alita: Battle Angel     122
33                                              Us     116
34                                 Triple Frontier     125
35                                         Hellboy      NA
36                       Spider-Man: Far from Home     129
37                        The Peanut Butter Falcon      97
38                                           Glass     129
39                                  Captain Marvel     123
40                                         Curiosa     107
41                                 Queen of Hearts     127
42           Fast & Furious Presents: Hobbs & Shaw     137
43                                    The Platform      94
44             A Beautiful Day in the Neighborhood     109
45                                   Downton Abbey     122
46                                         Shazam!     132
47                                      21 Bridges      99
48      Extremely Wicked, Shockingly Evil and Vile     110
49                                           Crawl      87
50                           Terminator: Dark Fate     128
51                  Dora and the Lost City of Gold     102
52                                     Toy Story 4     100
53                                            Cats     110
54                                      Brightburn      90
55                          Zombieland: Double Tap      99
56                                    Dark Phoenix     113
57                 El Camino: A Breaking Bad Movie     122
58                                  Marriage Story     137
59                      Portrait of a Lady on Fire     122
60                                           Polar     118
61                                        Ad Astra     123
62                                       Frozen II     103
63                                            Burn      88
64                                       Bombshell     109
65                                     Dark Waters     126
66                                        Hustlers     110
67                                   6 Underground     128
68      How to Train Your Dragon: The Hidden World     104
69                                       Booksmart     102
70                                  The Highwaymen     132
71                                  Sound of Metal      NA
72                         A Rainy Day in New York      92
73                                          Midway     138
74                              The Dead Don't Die     104
75                                Charlie's Angels     118
76                                        Vivarium      97
77                                     Escape Room      99
78                                       Rocketman     121
79                     Men in Black: International     114
80                                  Murder Mystery      97
81                                Calm with Horses     100
82                         Fighting with My Family     108
83                                       Fractured      99
84                                  Richard Jewell     131
85                                        Serenity     106
86                      Pokémon: Detective Pikachu     104
87                                 Five Feet Apart     116
88                                Angel Has Fallen     121
89                                     The Outpost     123
90                                         Tolkien     112
91                                        The Dirt     107
92                                      Just Mercy     137
93                    Maleficent: Mistress of Evil     119
94                                     Guns Akimbo      98
95                               Rambo: Last Blood      89
96                                            Togo     113
97                       In the Shadow of the Moon     115
98                                     The Fanatic      88
99                                        Plus One      98
100                                    . Good Boys      90

# Use CSS selectors to scrape the number of votes
votes_labels <- html_nodes(webpage, "span.ipc-rating-star--voteCount")
vote_text <- html_text(votes_labels)
head(vote_text)

[1] " (394K)" " (468K)" " (843K)" " (400K)" " (51K)"  " (951K)"

vote_text1 <- str_sub(vote_text, start = 3L, end = -2L)
vote_text2 <- gsub("K", "", vote_text1)

vote_number <- as.numeric(vote_text2)

Warning: NAs introduced by coercion

vote_values <- vote_number*1000
head(vote_values)

[1] 394000 468000 843000 400000  51000 951000

length(vote_values)

[1] 100

# Display the movies with missing or invalid runtimes
df_movies <- data.frame(rank = rank_data, title = title_data, description = description_data, runtime = converted_runtimes, votes = vote_values)
head(df_movies)

  rank                            title
1    1                    The Gentlemen
2    2                   Ford v Ferrari
3    3 Once Upon a Time... in Hollywood
4    4                        Midsommar
5    5                           Code 8
6    6                         Parasite
                                                                                                                                                                                                                                description
1                                              An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
2                 American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966.
3                                                                                        A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles.
4 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
5                                                                                                                A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother.
6                                                                                               Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
  runtime  votes
1     113 394000
2     152 468000
3     161 843000
4     148 400000
5      98  51000
6     132 951000

library(ggplot2)

# df_movies scatterplot based on votes and runtime

# Scatterplot format ggplot

ggplot(data = df_movies, aes(x = runtime, y = votes)) +
  geom_point(na.rm=TRUE) +  # Movie = points
  labs(
    title = " Movie Runtime vs Number of Votes",
    x = "Runtime (minutes)",
    y = "Number of Votes",
    caption = "Source: 2019 IMDB Movie Data"
  )

####. Problem 2:

# Filter data to include movies between 100 and 150 minutes of runtime
filtered_data <- df_movies %>% 
  filter(runtime >= 100 & runtime <= 150)

# Filtered dat arranged by the number of votes in descending order
sorted_data <- filtered_data %>% 
  arrange(desc(votes))

# Top_movie rank list from sorted data

top_movie <- head(sorted_data, 1)

# Display the name of the movie, its runtime in minutes, and the # of votes

cat("The movie with a runtime between 100-150 minutes and the highest number of votes from the top 10 highest-ranked movies is:",
    "\nName:", top_movie$title,
    "\nRuntime:", top_movie$runtime,"minutes",
    "\nNumber of Votes:", top_movie$votes)

The movie with a runtime between 100-150 minutes and the highest number of votes from the top 10 highest-ranked movies is: 
Name: Parasite 
Runtime: 132 minutes 
Number of Votes: 951000

####. Problem 3

# Filter data for movies between 130 and 160 minutes of runtime
filtered_data <- df_movies %>% 
  filter(runtime >= 130 & runtime <= 160)

# Filtered data in ascending order by rank list
sorted_data <- filtered_data %>% 
  arrange(rank)

# Lowest-ranked movie from the sorted data
lowest_ranked_movie <- tail(sorted_data, 1)

# Display the rank of the movie, its name, and the number of votes

cat("The movie with a runtime between 130-160 minutes and the highest number of votes from the lowest-ranked 10 movies is:",
    "\nRank:", lowest_ranked_movie$rank,
    "\nName:", lowest_ranked_movie$title,
    "\nNumber of Votes:", lowest_ranked_movie$votes)

The movie with a runtime between 130-160 minutes and the highest number of votes from the lowest-ranked 10 movies is: 
Rank: 92 
Name:  Just Mercy 
Number of Votes: 75000

#####. Problem 4

# Scatterplot with ggplot
library(ggplot2)
library(ggrepel)
library(plotly)

# rank_groups variable
df_movies$rank_groups <- cut(df_movies$rank, breaks = c(0, 20, 40, 60, 80, 100), labels = c("1-20", "21-40", "41-60", "61-80", "81-100"))

# Scatterplot with ggplot
p <- ggplot(data = df_movies, aes(x = runtime, y = votes, color = rank_groups, size = runtime)) +
  geom_point() +  # Scatterplot with points
  scale_color_manual(values = c("1-20" = "DarkSlateGray", "21-40" = "Orchid", "41-60" = "CadetBlue", "61-80" = "Goldenrod", "81-100" = "DarkOliveGreen")) +  # Non-default color palette
  labs(
    title = "Movie Runtime vs Number of Votes",
    x = "Runtime (minutes)",
    y = "Number of Votes",
    caption = "Source: 2019 IMDB Movie Data"
  ) +
  guides(color = guide_legend(title = "Rank Groups")) +  # Add legend for rank_groups
  theme(legend.text = element_text(size = 8))  # Legend text size

# Convert ggplot to plotly for interactivity
p <- ggplotly(p, tooltip = c("title"))

# Print the interactive plot
p