if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
}

library(xml2)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print the per-column summary statistics of the built-in `cars` data set.
summary(object = cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Load the necessary libraries

library(rvest)
library(dplyr)
## 
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     intersect, setdiff, setequal, union

Web Scraping

# Address of the page to scrape, then the parsed HTML document fetched from it.
url <- "https://flixgem.com/"
webpage <- url %>% read_html()

Extraction of variables

# Text of every node matched by each CSS class selector, one vector per field.
# NOTE(review): the "*_CSS" class names look like placeholders -- confirm them
# against the real page markup.
film_directors <- html_text(html_nodes(webpage, ".Directors_CSS"))
film_titles <- html_text(html_nodes(webpage, ".Titles_CSS"))
film_years <- html_text(html_nodes(webpage, ".Release_Date_CSS"))

Data frame creation

# One row per film: title, director and release date as scraped text.
film_date <- data.frame(
  Title = film_titles,
  Director = film_directors,
  Release_Date = film_years
)

CHALLENGE 1: What is the oldest Woody Allen film available on Netflix?

# Oldest Woody Allen film: keep only the rows whose Director field mentions
# Woody Allen, order them by release date and take the first one.
# The result is stored under its own name so that the parsed HTML document in
# `webpage` is not overwritten by a data frame.
# Release_Date is scraped text; sorting it as text is chronological only for
# plain years or ISO "YYYY-MM-DD" dates -- TODO confirm the scraped format.
woody_allen_films <- film_date %>%
  filter(grepl("Woody Allen", Director, fixed = TRUE)) %>%
  arrange(Release_Date)

# head() yields an empty data frame (rather than a row of NAs) when no
# Woody Allen film is present.
oldest_film <- head(woody_allen_films, 1)

Load the necessary libraries

library(rvest)
library(dplyr)

Web Scraping

# Address of the page to scrape, then the parsed HTML document fetched from it.
url <- "https://flixgem.com/"
webpage <- url %>% read_html()

Extract Data

# Text of every node matched by each selector, one character vector per field.
# NOTE(review): "selector_for_movie_data_CSS" has no leading dot, so it is an
# element-name selector and looks like an unfilled placeholder -- confirm.
movie_data <- html_text(html_nodes(webpage, "selector_for_movie_data_CSS"))
film_genres <- html_text(html_nodes(webpage, ".Genre_CSS"))
film_ratings <- html_text(html_nodes(webpage, ".View_Rating_CSS"))
film_languages <- html_text(html_nodes(webpage, ".Available_Languages_CSS"))

Data frame creation

# One row per film: title, genre, rating and available languages.
# `film_titles` comes from the earlier extraction step.
movie_df <- data.frame(
  Title = film_titles,
  Genre = film_genres,
  Rating = film_ratings,
  Language = film_languages
)

CHALLENGE 2: What are the three highest rated comedies available in Polish?

Movies available in Polish

# Comedies available in Polish.
# The previous test compared a lower-cased value with "Polish", which can never
# match, and it did not restrict the genre at all. Both fields are matched
# case-insensitively as substrings, so multi-valued entries such as
# "English, Polish" or "Comedy, Drama" are also caught.
# NOTE(review): assumes languages/genres are spelled in English in the scraped
# text -- confirm against the page.
polish_comedies <- movie_df %>%
  filter(
    grepl("polish", Language, ignore.case = TRUE),
    grepl("comedy", Genre, ignore.case = TRUE)
  )

Sort by rating

# Order from highest to lowest rating.
# Rating is scraped text, so it is converted to numeric for the sort; sorting
# the raw strings would rank "9.1" above "10". Values that cannot be parsed
# become NA (with a warning) and arrange() places them last.
# NOTE(review): assumes Rating holds a numeric score -- confirm.
sorted_polish_comedies <- polish_comedies %>%
  arrange(desc(as.numeric(Rating)))

Select the top three

# Keep the first three rows (or fewer when the table is shorter).
n_keep <- min(3, nrow(sorted_polish_comedies))
top_three_comedies <- sorted_polish_comedies[seq_len(n_keep), , drop = FALSE]

Load the necessary libraries

library(rvest)
library(dplyr)

Web Scraping

# Address of the page to scrape, then the parsed HTML document fetched from it.
url <- "https://flixgem.com/"
webpage <- url %>% read_html()

Extract Data

# Text of every node matched by each selector, one character vector per field.
# NOTE(review): "selector_for_movie_data_CSS" has no leading dot, so it is an
# element-name selector and looks like an unfilled placeholder -- confirm.
movie_data <- html_text(html_nodes(webpage, "selector_for_movie_data_CSS"))
film_release_dates <- html_text(html_nodes(webpage, ".Release_Date_CSS"))
Netflix_film_release_dates <- html_text(
  html_nodes(webpage, ".Netflix_Release_Date_CSS")
)

Dataframe Creation

# One row per film: title, original release date and Netflix release date.
# This replaces the earlier `movie_df`; `film_titles` comes from the first
# extraction step.
movie_df <- data.frame(
  Title = film_titles,
  Release_Date = film_release_dates,
  Netflix_Release_Date = Netflix_film_release_dates
)

CHALLENGE 3: For 2019 and 2020 productions, what is the average time between release and appearance on Netflix?

Films produced in 2019 and 2020

# Films released in 2019 or 2020.
# Release_Date is scraped text, and comparing text with a number compares them
# as strings, so "2020-03-01" <= 2020 is FALSE and 2020 titles were dropped.
# The year is taken from the first four characters and matched as an integer.
# NOTE(review): assumes the value starts with a four-digit year ("YYYY" or
# "YYYY-MM-DD") -- confirm against the scraped text.
Films_2019_2020 <- movie_df %>%
  filter(as.integer(substr(Release_Date, 1, 4)) %in% 2019:2020)

Average time between release and appearance on Netflix

# Days between the original release and the Netflix release, for every title.
# as.Date() is called without a format, so both columns are assumed to hold
# ISO "YYYY-MM-DD" text -- TODO confirm against the scraped values.
movie_df <- movie_df %>%
  mutate(
    DaysToNetflix = as.numeric(
      as.Date(Netflix_Release_Date) - as.Date(Release_Date)
    )
  )

# The challenge concerns 2019 and 2020 productions only, so the mean is taken
# over those rows rather than over the whole table. Rows whose delay could not
# be computed (NA) are ignored.
average_time <- movie_df %>%
  filter(as.integer(substr(Release_Date, 1, 4)) %in% 2019:2020) %>%
  pull(DaysToNetflix) %>%
  mean(na.rm = TRUE)

Load the necessary libraries

library(rvest)
library(dplyr)

Web Scraping

# Address of the page to scrape, then the parsed HTML document fetched from it.
url <- "https://flixgem.com/"
webpage <- url %>% read_html()

Extract data

# Text of every node matched by the tag selector.
# NOTE(review): "selector_for_tags_CSS" has no leading dot, so it is an
# element-name selector and looks like an unfilled placeholder -- confirm.
tag_nodes <- html_nodes(webpage, "selector_for_tags_CSS")
tags <- html_text(tag_nodes)

CHALLENGE 5: What are the average ratings of films produced in each decade (i.e., 1960s, 1970s, 1980s, 1990s, etc.)?

Decade Extraction

# Label each film with its decade ("1960s", "1970s", ...).
# The decade is derived arithmetically from the release year held in the
# Release_Date column, so every decade is covered (not only 1960s-2010s) and the
# year is compared as a number rather than as text. Rows whose year cannot be
# read are labelled "Unknown".
# NOTE(review): assumes Release_Date starts with a four-digit year ("YYYY" or
# "YYYY-MM-DD") -- confirm against the scraped text.
movie_df <- movie_df %>%
  mutate(
    # First pass: numeric start of the decade, e.g. 1987 -> 1980.
    Decade = (as.integer(substr(Release_Date, 1, 4)) %/% 10L) * 10L,
    # Second pass: turn it into the label, keeping "Unknown" for missing years.
    Decade = ifelse(is.na(Decade), "Unknown", paste0(Decade, "s"))
  )

Average ratings for each decade

# Average rating per decade.
# The scraped ratings are text; calling mean() on them only produced NA for
# every decade together with an "argument is not numeric or logical" warning,
# and it averaged the whole vector instead of each group's rows. The ratings
# are therefore attached to the table as a numeric column and averaged per
# Decade group.
# NOTE(review): assumes `film_ratings` has one entry per row of `movie_df`, in
# the same order, and holds numeric scores -- confirm.
average_rating <- movie_df %>%
  mutate(Rating = as.numeric(film_ratings)) %>%
  group_by(Decade) %>%
  summarize(AverageRating = mean(Rating, na.rm = TRUE))