# Web scraping - Jason Laucel
#
# Download IMDb's advanced-search listing of 2019 feature films (100 results
# per page) and select the heading nodes that carry each film's rank and title.

# Packages used by this script. Attaching them here keeps the script runnable
# from a fresh session; the calls are harmless if they are already attached.
library(rvest)    # HTML download and CSS-selector extraction
library(readr)    # numeric parsing
library(stringr)  # string helpers
library(dplyr)    # %>% and data-frame verbs

# The query string must stay on a single line: a line break inside the
# literal would become part of the URL.
url <- "https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100"
webpage <- read_html(url)

# Use CSS selectors to scrape the rankings section
rank_title_html <- html_elements(webpage, css = ".ipc-title__text")
head(rank_title_html)
#> {xml_nodeset (6)}
#> [1] <h1 class="ipc-title__text">Advanced search</h1>
#> [2] <h3 class="ipc-title__text">1. The Gentlemen</h3>
#> [3] <h3 class="ipc-title__text">2. Ford v Ferrari</h3>
#> [4] <h3 class="ipc-title__text">3. Once Upon a Time... in Hollywood</h3>
#> [5] <h3 class="ipc-title__text">4. Midsommar</h3>
#> [6] <h3 class="ipc-title__text">5. Code 8</h3>
# Text of every heading node; film headings look like "12. Some Title".
rank_title <- html_text(rank_title_html)

# Keep only the numbered film headings. Matching on the "<rank>." prefix drops
# the page headings (e.g. "Advanced search") without relying on where they
# happen to sit in the node set.
rank_title_data <- rank_title[grepl("^\\d+\\.", rank_title)]

# Let's have a look at the rankings
head(rank_title_data)
#> [1] "1. The Gentlemen"                    "2. Ford v Ferrari"
#> [3] "3. Once Upon a Time... in Hollywood" "4. Midsommar"
#> [5] "5. Code 8"                           "6. Parasite"
length(rank_title_data) # should be 100
#> [1] 100

# Drop the title and keep just the leading rank number
rank_data <- parse_number(rank_title_data)
summary(rank_data)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>    1.00   25.75   50.50   50.50   75.25  100.00

# Strip the "<rank>. " prefix whatever its width. A fixed start offset only
# works for one-digit ranks: it leaves " Joker" for "10. Joker" and
# ". Good Boys" for "100. Good Boys".
title_data <- str_remove(rank_title_data, "^\\d+\\.\\s*")
head(title_data)
#> [1] "The Gentlemen"                    "Ford v Ferrari"
#> [3] "Once Upon a Time... in Hollywood" "Midsommar"
#> [5] "Code 8"                           "Parasite"
length(title_data)
#> [1] 100
# Scrape the plot summaries: select the description nodes by CSS class, then
# reduce each node to its text content.
description_data_html <- webpage %>%
  html_elements(css = ".ipc-html-content-inner-div")
description_data <- description_data_html %>%
  html_text()
head(description_data)
#> [1] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
#> [2] "American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966."
#> [3] "A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles."
#> [4] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."
#> [5] "A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother."
#> [6] "Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."
# Scrape the per-film metadata items (release year, runtime, certificate).
# Select on the descriptive class only: the extra "sc-..."/"ilsLEX" classes in
# the earlier selector look build-generated and are presumably unstable.
# NOTE(review): assumes every metadata <span> carries
# "dli-title-metadata-item" - confirm against the live page.
details_data_html <- html_elements(webpage, css = "span.dli-title-metadata-item")

# Convert the metadata nodes to text
details_data <- html_text(details_data_html)
head(details_data)
#> [1] "2019"   "1h 53m" "R"      "2019"   "2h 32m" "PG-13"

# Keep only the runtime entries. A runtime is an optional "<n>h" followed by
# an optional "<n>m" with at least one of the two present, so "1h 53m", "2h"
# and "58m" all qualify while years and certificates do not.
details_trimmed <- trimws(details_data)
is_runtime <- nzchar(details_trimmed) &
  grepl("^(\\d+h)?\\s*(\\d+m)?$", details_trimmed)
runtime_text <- details_trimmed[is_runtime]
head(runtime_text)
#> [1] "1h 53m" "2h 32m" "2h 41m" "2h 28m" "1h 38m" "2h 12m"

# Pull the number that precedes `unit` ("h" or "m") out of each runtime
# string, using 0 where that unit is absent.
runtime_component <- function(x, unit) {
  has_unit <- grepl(paste0("\\d+", unit), x)
  digits <- sub(paste0("^.*?(\\d+)", unit, ".*$"), "\\1", x, perl = TRUE)
  as.numeric(ifelse(has_unit, digits, "0"))
}

# Convert runtime_text from hours and minutes to minutes. Handling the two
# units separately means a whole-hour runtime such as "2h" becomes 120
# instead of NA.
converted_runtimes <- 60 * runtime_component(runtime_text, "h") +
  runtime_component(runtime_text, "m")

# Display the converted movie runtimes
head(converted_runtimes)
#> [1] 113 152 161 148  98 132
# NOTE(review): a film with no runtime on the page would shorten this vector
# and shift every later runtime onto the wrong title - confirm the length.
length(converted_runtimes)
#> [1] 100
# Pair every title with its runtime in minutes and print the whole table, so
# that films whose runtime is missing (NA) can be spotted.
df1 <- data.frame(
  Title = title_data,
  Runtime = converted_runtimes
)
df1
#>                                            Title Runtime
1 The Gentlemen 113
2 Ford v Ferrari 152
3 Once Upon a Time... in Hollywood 161
4 Midsommar 148
5 Code 8 98
6 Parasite 132
7 Knives Out 130
8 Little Women 135
9 Avengers: Endgame 181
10 Joker 122
11 The King 140
12 Godzilla: King of the Monsters 132
13 1917 119
14 The Irishman 209
15 Star Wars: Episode IX - The Rise of Skywalker 141
16 Jojo Rabbit 108
17 Yesterday 116
18 Doctor Sleep 152
19 After 105
20 I See You 98
21 Waves 135
22 The Lighthouse 109
23 The Lion King 118
24 Uncut Gems 135
25 Aladdin 128
26 It Chapter Two 169
27 Anna 118
28 Ready or Not 95
29 John Wick: Chapter 3 - Parabellum 130
30 Saint Maud 84
31 Jumanji: The Next Level 123
32 Alita: Battle Angel 122
33 Us 116
34 Triple Frontier 125
35 Hellboy NA
36 Spider-Man: Far from Home 129
37 The Peanut Butter Falcon 97
38 Glass 129
39 Captain Marvel 123
40 Curiosa 107
41 Queen of Hearts 127
42 Fast & Furious Presents: Hobbs & Shaw 137
43 The Platform 94
44 A Beautiful Day in the Neighborhood 109
45 Downton Abbey 122
46 Shazam! 132
47 21 Bridges 99
48 Extremely Wicked, Shockingly Evil and Vile 110
49 Crawl 87
50 Terminator: Dark Fate 128
51 Dora and the Lost City of Gold 102
52 Toy Story 4 100
53 Cats 110
54 Brightburn 90
55 Zombieland: Double Tap 99
56 Dark Phoenix 113
57 El Camino: A Breaking Bad Movie 122
58 Marriage Story 137
59 Portrait of a Lady on Fire 122
60 Polar 118
61 Ad Astra 123
62 Frozen II 103
63 Burn 88
64 Bombshell 109
65 Dark Waters 126
66 Hustlers 110
67 6 Underground 128
68 How to Train Your Dragon: The Hidden World 104
69 Booksmart 102
70 The Highwaymen 132
71 Sound of Metal NA
72 A Rainy Day in New York 92
73 Midway 138
74 The Dead Don't Die 104
75 Charlie's Angels 118
76 Vivarium 97
77 Escape Room 99
78 Rocketman 121
79 Men in Black: International 114
80 Murder Mystery 97
81 Calm with Horses 100
82 Fighting with My Family 108
83 Fractured 99
84 Richard Jewell 131
85 Serenity 106
86 Pokémon: Detective Pikachu 104
87 Five Feet Apart 116
88 Angel Has Fallen 121
89 The Outpost 123
90 Tolkien 112
91 The Dirt 107
92 Just Mercy 137
93 Maleficent: Mistress of Evil 119
94 Guns Akimbo 98
95 Rambo: Last Blood 89
96 Togo 113
97 In the Shadow of the Moon 115
98 The Fanatic 88
99 Plus One 98
100 . Good Boys 90
# Use CSS selectors to scrape the number of votes
votes_labels <- html_elements(webpage, "span.ipc-rating-star--voteCount")
vote_text <- html_text(votes_labels)
head(vote_text)
#> [1] " (394K)" " (468K)" " (843K)" " (400K)" " (51K)"  " (951K)"

# Reduce each label to "<number><suffix>", e.g. " (394K)" -> "394K". Filtering
# by character class removes the parentheses and any kind of padding without
# depending on their exact positions.
vote_text1 <- gsub("[^0-9A-Za-z.]", "", vote_text)

# Split the label into its numeric part and its magnitude suffix
vote_text2 <- gsub("[^0-9.]", "", vote_text1)
vote_suffix <- toupper(gsub("[^A-Za-z]", "", vote_text1))
vote_number <- as.numeric(vote_text2)

# Scale by the suffix rather than assuming thousands: "1.2M" is 1,200,000 and
# a label with no suffix is already a plain count. An unrecognised suffix
# yields NA so that it is noticed instead of being silently mis-scaled.
suffix_scale <- c(K = 1e3, M = 1e6, B = 1e9)
vote_scale <- ifelse(vote_suffix == "", 1, unname(suffix_scale[vote_suffix]))
vote_values <- vote_number * vote_scale
head(vote_values)
#> [1] 394000 468000 843000 400000  51000 951000
length(vote_values)
#> [1] 100
# Combine the scraped vectors into one data frame with one row per film.
# Each vector was scraped independently, so check that they all have the same
# length first: data.frame() silently recycles a shorter vector whose length
# divides the longest one, which would pair values with the wrong films.
n_movies <- length(title_data)
stopifnot(
  length(rank_data) == n_movies,
  length(description_data) == n_movies,
  length(converted_runtimes) == n_movies,
  length(vote_values) == n_movies
)
df_movies <- data.frame(
  rank = rank_data,
  title = title_data,
  description = description_data,
  runtime = converted_runtimes,
  votes = vote_values
)
head(df_movies)
#>   rank                            title
#> 1    1                    The Gentlemen
#> 2    2                   Ford v Ferrari
#> 3    3 Once Upon a Time... in Hollywood
#> 4    4                        Midsommar
#> 5    5                           Code 8
#> 6    6                         Parasite
#>   description
#> 1 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
#> 2 American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966.
#> 3 A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles.
#> 4 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
#> 5 A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother.
#> 6 Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
#>   runtime  votes
#> 1     113 394000
#> 2     152 468000
#> 3     161 843000
#> 4     148 400000
#> 5      98  51000
#> 6     132 951000
library(ggplot2)

# Scatterplot of df_movies: runtime on the x axis, vote count on the y axis,
# one point per film. Rows with a missing value are dropped without a warning.
ggplot(df_movies, mapping = aes(runtime, votes)) +
  labs(
    x = "Runtime (minutes)",
    y = "Number of Votes",
    title = " Movie Runtime vs Number of Votes",
    caption = "Source: 2019 IMDB Movie Data"
  ) +
  geom_point(na.rm = TRUE)

####. Problem 2:
# Keep the films whose runtime lies between 100 and 150 minutes inclusive;
# filter() also drops rows whose runtime is missing.
filtered_data <- filter(df_movies, runtime >= 100, runtime <= 150)

# Order the remaining films from most to fewest votes
sorted_data <- arrange(filtered_data, desc(votes))

# The first row of that ordering is the film with the most votes
top_movie <- head(sorted_data, n = 1L)

# Report the film's name, its runtime in minutes and its vote count.
# NOTE(review): the message says "from the top 10 highest-ranked movies", but
# nothing above restricts the data by rank - confirm which one is intended.
cat(
  "The movie with a runtime between 100-150 minutes and the highest number of votes from the top 10 highest-ranked movies is:",
  "\nName:", top_movie$title,
  "\nRuntime:", top_movie$runtime, "minutes",
  "\nNumber of Votes:", top_movie$votes
)
#> The movie with a runtime between 100-150 minutes and the highest number of votes from the top 10 highest-ranked movies is:
#> Name: Parasite
#> Runtime: 132 minutes
#> Number of Votes: 951000
####. Problem 3
# Keep the films whose runtime lies between 130 and 160 minutes inclusive;
# filter() also drops rows whose runtime is missing.
filtered_data <- filter(df_movies, runtime >= 130, runtime <= 160)

# Order the remaining films by rank, best (smallest number) first
sorted_data <- arrange(filtered_data, rank)

# The last row of that ordering is the lowest-ranked film in the runtime window
lowest_ranked_movie <- tail(sorted_data, n = 1L)

# Report the film's rank, name and vote count.
# NOTE(review): the message talks about "the highest number of votes from the
# lowest-ranked 10 movies", whereas the code simply takes the lowest-ranked
# film in the runtime window - confirm which one is intended.
cat(
  "The movie with a runtime between 130-160 minutes and the highest number of votes from the lowest-ranked 10 movies is:",
  "\nRank:", lowest_ranked_movie$rank,
  "\nName:", lowest_ranked_movie$title,
  "\nNumber of Votes:", lowest_ranked_movie$votes
)
#> The movie with a runtime between 130-160 minutes and the highest number of votes from the lowest-ranked 10 movies is:
#> Rank: 92
#> Name: Just Mercy
#> Number of Votes: 75000
#####. Problem 4
# Interactive scatterplot of runtime against votes, coloured by rank band
library(ggplot2)
library(ggrepel)
library(plotly)

# rank_groups variable: bucket the ranks 1-100 into five bands of 20
df_movies$rank_groups <- cut(
  df_movies$rank,
  breaks = c(0, 20, 40, 60, 80, 100),
  labels = c("1-20", "21-40", "41-60", "61-80", "81-100")
)

# Scatterplot with ggplot. The `text` aesthetic is not drawn by geom_point();
# it is mapped only so that ggplotly() has the film title available for the
# hover tooltip. Point size follows runtime, colour follows the rank band.
p <- ggplot(
  data = df_movies,
  aes(
    x = runtime,
    y = votes,
    color = rank_groups,
    size = runtime,
    text = title
  )
) +
  geom_point(na.rm = TRUE) + # Drop rows with missing values without a warning
  scale_color_manual(
    values = c(
      "1-20" = "DarkSlateGray",
      "21-40" = "Orchid",
      "41-60" = "CadetBlue",
      "61-80" = "Goldenrod",
      "81-100" = "DarkOliveGreen"
    )
  ) + # Non-default color palette
  labs(
    title = "Movie Runtime vs Number of Votes",
    x = "Runtime (minutes)",
    y = "Number of Votes",
    caption = "Source: 2019 IMDB Movie Data"
  ) +
  guides(color = guide_legend(title = "Rank Groups")) + # Legend for rank_groups
  theme(legend.text = element_text(size = 8)) # Legend text size

# Convert ggplot to plotly for interactivity. `tooltip` must name a mapped
# aesthetic; "title" is a data column rather than an aesthetic, so the film
# title is exposed through the `text` aesthetic mapped above.
p <- ggplotly(p, tooltip = "text")

# Print the interactive plot
p