Warning: package 'rvest' was built under R version 4.3.3
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
Warning: package 'plotly' was built under R version 4.3.3
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(XML)
#Specifying the url for desired website to be scrapedurl <-'https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100'#Reading the HTML code from the websitewebpage <-read_html(url)# save_url(webpage, filename="webpage.html")
#Use CSS selectors to scrape the rankings sectionrank_title_html <-html_elements(webpage, css='.ipc-title__text')head(rank_title_html)
{xml_nodeset (6)}
[1] <h1 class="ipc-title__text">Advanced search</h1>
[2] <h3 class="ipc-title__text">1. The Gentlemen</h3>
[3] <h3 class="ipc-title__text">2. Ford v Ferrari</h3>
[4] <h3 class="ipc-title__text">3. Once Upon a Time... in Hollywood</h3>
[5] <h3 class="ipc-title__text">4. Midsommar</h3>
[6] <h3 class="ipc-title__text">5. Code 8</h3>
#Convert the ranking data to textrank_title <-html_text(rank_title_html)9
[1] 9
#Remove the first and last rows - they are not movie titlesrank_title_data <- rank_title[-c(1, 102)]#Let's have a look at the rankingshead(rank_title_data)
[1] "1. The Gentlemen" "2. Ford v Ferrari"
[3] "3. Once Upon a Time... in Hollywood" "4. Midsommar"
[5] "5. Code 8" "6. Parasite"
# notice that the format is "rank. title"length(rank_title_data)
[1] 100
#should be 100
# remove the title and extract just the numberrank_data <-parse_number(rank_title_data)summary(rank_data)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 25.75 50.50 50.50 75.25 100.00
# Use the rank_title_data and extract just the characters from the titletitle_data <-str_sub(rank_title_data, start = 4L, end =-1L)head(title_data) #check first 6 titles
[1] "The Gentlemen" "Ford v Ferrari"
[3] "Once Upon a Time... in Hollywood" "Midsommar"
[5] "Code 8" "Parasite"
length(title_data) # check number of titles - should be 100
[1] 100
#Use CSS selectors to scrape the description sectiondescription_data_html <-html_elements(webpage, css='.ipc-html-content-inner-div')#Convert the description data to textdescription_data <-html_text(description_data_html)#Let's have a look at the description datahead(description_data)
[1] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
[2] "American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966."
[3] "A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles."
[4] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."
[5] "A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother."
[6] "Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."
length(description_data)
[1] 100
#Use CSS selectors to scrape the Movie runtimedetails_data_html <-html_elements(webpage, css ='span.sc-b0691f29-8.ilsLEX.dli-title-metadata-item')#Convert the description data to textdetails_data <-html_text(details_data_html)head(details_data)
# Convert runtime_text from hours and minutes to minutesconverted_runtimes <-sapply(strsplit(runtime_text, "h |m"), function(x) as.numeric(x[1]) *60+as.numeric(x[2]))
Warning in FUN(X[[i]], ...): NAs introduced by coercion
Warning in FUN(X[[i]], ...): NAs introduced by coercion
# Display the converted movie runtimeshead(converted_runtimes)
[1] 113 152 161 148 98 132
length(converted_runtimes)
[1] 100
# Display the titles of movies with missing runtimes and their corresponding runtimesdf1 <-data.frame(Title = title_data, Runtime = converted_runtimes)df1
Title Runtime
1 The Gentlemen 113
2 Ford v Ferrari 152
3 Once Upon a Time... in Hollywood 161
4 Midsommar 148
5 Code 8 98
6 Parasite 132
7 Knives Out 130
8 Little Women 135
9 Avengers: Endgame 181
10 Joker 122
11 The King 140
12 Godzilla: King of the Monsters 132
13 1917 119
14 The Irishman 209
15 Star Wars: Episode IX - The Rise of Skywalker 141
16 Jojo Rabbit 108
17 Yesterday 116
18 Doctor Sleep 152
19 After 105
20 I See You 98
21 Waves 135
22 The Lighthouse 109
23 The Lion King 118
24 Uncut Gems 135
25 Aladdin 128
26 It Chapter Two 169
27 Anna 118
28 Ready or Not 95
29 John Wick: Chapter 3 - Parabellum 130
30 Saint Maud 84
31 Jumanji: The Next Level 123
32 Alita: Battle Angel 122
33 Us 116
34 Triple Frontier 125
35 Hellboy NA
36 Spider-Man: Far from Home 129
37 The Peanut Butter Falcon 97
38 Glass 129
39 Captain Marvel 123
40 Curiosa 107
41 Queen of Hearts 127
42 Fast & Furious Presents: Hobbs & Shaw 137
43 The Platform 94
44 A Beautiful Day in the Neighborhood 109
45 Downton Abbey 122
46 Shazam! 132
47 21 Bridges 99
48 Extremely Wicked, Shockingly Evil and Vile 110
49 Crawl 87
50 Terminator: Dark Fate 128
51 Dora and the Lost City of Gold 102
52 Toy Story 4 100
53 Cats 110
54 Brightburn 90
55 Zombieland: Double Tap 99
56 Dark Phoenix 113
57 El Camino: A Breaking Bad Movie 122
58 Marriage Story 137
59 Portrait of a Lady on Fire 122
60 Polar 118
61 Ad Astra 123
62 Frozen II 103
63 Burn 88
64 Bombshell 109
65 Dark Waters 126
66 Hustlers 110
67 6 Underground 128
68 How to Train Your Dragon: The Hidden World 104
69 Booksmart 102
70 The Highwaymen 132
71 Sound of Metal NA
72 A Rainy Day in New York 92
73 Midway 138
74 The Dead Don't Die 104
75 Charlie's Angels 118
76 Vivarium 97
77 Escape Room 99
78 Rocketman 121
79 Men in Black: International 114
80 Murder Mystery 97
81 Calm with Horses 100
82 Fighting with My Family 108
83 Fractured 99
84 Richard Jewell 131
85 Serenity 106
86 Pokémon: Detective Pikachu 104
87 Five Feet Apart 116
88 Angel Has Fallen 121
89 The Outpost 123
90 Tolkien 112
91 The Dirt 107
92 Just Mercy 137
93 Maleficent: Mistress of Evil 119
94 Guns Akimbo 98
95 Rambo: Last Blood 89
96 Togo 113
97 In the Shadow of the Moon 115
98 The Fanatic 88
99 Plus One 98
100 . Good Boys 90
# Use CSS selectors to scrape the number of votesvotes_labels <-html_nodes(webpage, "span.ipc-rating-star--voteCount")vote_text <-html_text(votes_labels)head(vote_text)
vote_text1 <-str_sub(vote_text, start = 3L, end =-2L)vote_text2 <-gsub("K", "", vote_text1)
vote_number <-as.numeric(vote_text2)
Warning: NAs introduced by coercion
vote_values <- vote_number*1000head(vote_values)
[1] 393000 468000 842000 400000 51000 951000
length(vote_values)
[1] 100
# Display the movies with missing or invalid runtimesdf_movies <-data.frame(rank = rank_data, title = title_data, description = description_data, runtime = converted_runtimes, votes = vote_values)head(df_movies)
rank title
1 1 The Gentlemen
2 2 Ford v Ferrari
3 3 Once Upon a Time... in Hollywood
4 4 Midsommar
5 5 Code 8
6 6 Parasite
description
1 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
2 American car designer Carroll Shelby and driver Ken Miles battle corporate interference and the laws of physics to build a revolutionary race car for Ford in order to defeat Ferrari at the 24 Hours of Le Mans in 1966.
3 A faded television actor and his stunt double strive to achieve fame and success in the final years of Hollywood's Golden Age in 1969 Los Angeles.
4 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
5 A super-powered construction worker falls in with a group of criminals in order to raise the funds to help his ill mother.
6 Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
runtime votes
1 113 393000
2 152 468000
3 161 842000
4 148 400000
5 98 51000
6 132 951000
Problems 1-4
Problem 1: Based on the scraped 2019 IMDB movie data frame, create a scatterplot that shows runtime on the x-axis, number of votes on the y-axis. Be sure to provide a title, axis labels, and caption for the data source.
ggplot
p.1<-ggplot(df_movies, aes( y = votes/100000, x = runtime ) ) +geom_point() +labs( title ="IMDB Top Movies: Comparison of Runtimes & Votes", # Labels titlex ="Movie Runtime (minutes)", # Labels x axisy ="# of Votes (expressed in hundreds of thousands)", # Labels y axiscaption ="Source: IMDB" ) +# Adds Sourcetheme_linedraw() +# sets the theme for the graphs, it is the reason they are darktheme(aspect.ratio =0.8, # Made the overall size of the visualization smalleraxis.title.x =element_text( size=14 ), # Changes size of X-axis Labelaxis.title.y =element_text( size=10 ), # Changes size of Y-axis Labelaxis.text =element_text( size =9 ), # changes axes text sizesplot.title =element_text( size =17, # Changes size of Titleface ="bold", # Boldens Titlehjust =0.5, ), # Centers the Title to the Plotsplot.caption =element_text( hjust =0.5, # Centers the caption to the plotsface ="italic" )) # Italicizes the captionp.1
Problem 2: Use the filter function to answer the following question. Which movie had a runtime 100-150 minutes AND highest votes from the top 10 highest ranked movies? I must see code that shows your filtering. Be sure to answer what the name of the movie is, what its runtime is and what its number of votes are.
Top10-Parasite on Top with Runtime of 132 & 951,000 Votes
filtered_movies <- df_movies %>%#creates filtered_movies from df_moviesfilter(runtime >=100& runtime <=150) %>%#filteres for movies btw 100-150 runtimearrange(desc(votes)) %>%#arranges votes in descending orderhead(10) #shows top 10 of dataframefiltered_movies
rank title
1 6 Parasite
2 7 Knives Out
3 13 1917
4 39 Captain Marvel
5 36 Spider-Man: Far from Home
6 15 Star Wars: Episode IX - The Rise of Skywalker
7 16 Jojo Rabbit
8 29 John Wick: Chapter 3 - Parabellum
9 4 Midsommar
10 1 The Gentlemen
description
1 Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
2 A detective investigates the death of the patriarch of an eccentric, combative family.
3 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
4 Carol Danvers becomes one of the universe's most powerful heroes when Earth is caught in the middle of a galactic war between two alien races.
5 Following the events of Avengers: Endgame (2019), Spider-Man must step up to take on new threats in a world that has changed forever.
6 In the riveting conclusion of the landmark Skywalker saga, new legends will be born-and the final battle for freedom is yet to come.
7 A young German boy in the Hitler Youth whose hero and imaginary friend is the country's dictator is shocked to discover that his mother is hiding a Jewish girl in their home.
8 John Wick is on the run after killing a member of the international assassins' guild, and with a $14 million price tag on his head, he is the target of hit men and women everywhere.
9 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
10 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
runtime votes
1 132 951000
2 130 771000
3 119 671000
4 123 609000
5 129 554000
6 141 491000
7 108 439000
8 130 424000
9 148 400000
10 113 393000
Problem 3: In the runtime of 130-160 mins, which movie from the lowest ranked 10 movies (out of 100) had highest votes? Again, you must use the filter function to get the exact movie which answers this question. Be sure to state the rank and number of votes for this movie.
Bottom10-Waves is on Bottom with Runtime of 135 & 30,000 Votes
filtered_movies.2<- df_movies %>%#creates filtered_movies from df_moviesfilter(runtime >=130& runtime <=160) %>%#filteres for movies btw 130-160 runtimearrange(votes) %>%#arranges votes in descending orderhead(10) #shows top 10 of dataframefiltered_movies.2
rank title
1 21 Waves
2 92 Just Mercy
3 73 Midway
4 84 Richard Jewell
5 70 The Highwaymen
6 11 The King
7 12 Godzilla: King of the Monsters
8 18 Doctor Sleep
9 42 Fast & Furious Presents: Hobbs & Shaw
10 8 Little Women
description
1 Traces the journey of a suburban family - led by a well-intentioned but domineering father - as they navigate love, forgiveness, and coming together in the aftermath of a loss.
2 World-renowned civil rights defense attorney Bryan Stevenson works to free a wrongly condemned death row prisoner.
3 The story of the Battle of Midway, told by the leaders and the sailors who fought it.
4 Security guard Richard Jewell is an instant hero after foiling a bomb attack at the 1996 Atlanta Olympics, but his life becomes a nightmare when the FBI leaks to the media that he is a suspect in the case.
5 The untold true story of the legendary detectives who brought down Bonnie and Clyde.
6 Young Henry V encounters deceit, war and treachery after becoming King of England in the 15th century, in the aftermath of his brother's death.
7 The crypto-zoological agency Monarch faces off against a battery of god-sized monsters, including the mighty Godzilla, who collides with Mothra, Rodan, and his ultimate nemesis, the three-headed King Ghidorah.
8 Years following the events of The Shining (1980), a now-adult Dan Torrance must protect a young girl with similar powers from a cult known as The True Knot, who prey on children with powers to remain immortal.
9 Lawman Luke Hobbs and outcast Deckard Shaw form an unlikely alliance when a cyber-genetically enhanced villain threatens the future of humanity.
10 Jo March reflects back and forth on her life, telling the beloved story of the March sisters - four young women, each determined to live life on her own terms.
runtime votes
1 135 30000
2 137 75000
3 138 94000
4 131 97000
5 132 103000
6 140 152000
7 132 204000
8 152 218000
9 137 235000
10 135 244000
Problem 4: Create a categorical variable 1. Create an additional categorical variable, “rank_groups” that groups movies into ranks 1-20, 21-40, 41-60, 61-80, and 81-100. 2. Color your scatterplot from Problem #1 by this new variable. Be sure to use a NON-DEFAULT color palette. 3. Make the size of the point based on the runtime. 4. Be sure your plot includes a legend for the rank_groups. 5. Add interactivity to get the movie title on mouseover for that point.
p.2<-ggplot(df_movies_groups, aes( y = votes/100000, x = runtime, color = rank_groups, size = runtime, text =paste("Title:", title ))) +geom_point() +scale_color_viridis_d( option ="viridis", name ="Rank Groups" ) +labs( title ="IMDB Top Movies: Comparison of Runtimes & Votes", # Labels titlex ="Movie Runtime (minutes)", # Labels x axisy ="# of Votes (expressed in hundreds of thousands)", # Labels y axiscaption ="Source: IMDB", color ="Rank Groups",size ="Runtime" ) +# Adds Sourcetheme_linedraw() +# sets the theme for the graphs, it is the reason they are darktheme(aspect.ratio =0.8, # Made the overall size of the visualization smalleraxis.title.x =element_text( size=14 ), # Changes size of X-axis Labelaxis.title.y =element_text( size=10 ), # Changes size of Y-axis Labelaxis.text =element_text( size =9 ), # changes axes text sizesplot.title =element_text( size =17, # Changes size of Titleface ="bold", # Boldens Titlehjust =0.5, ), # Centers the Title to the Plotsplot.caption =element_text( hjust =0.5, # Centers the caption to the plotsface ="italic" ) ,legend.background =element_blank(), # Makes the background of the legend box blanklegend.box.background =element_rect( color ="black" ),legend.position =c( .825, 0.723 ), # Changes legend positionlegend.title =element_text( size =8.5, face ="bold" ), # Changes the legend title text sizelegend.text=element_text( size =8.5 )) +# Changes the legend text size)) guides( size ="none" )p.2
Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width
Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width