Assignment 1, Report 2

if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
}

library(xml2)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Template example: print per-column summary statistics for the built-in
# `cars` data set.
summary(object = cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Load the necessary libraries

library(rvest)
library(dplyr)

## 
## Caricamento pacchetto: 'dplyr'

## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     filter, lag

## I seguenti oggetti sono mascherati da 'package:base':
## 
##     intersect, setdiff, setequal, union

Web Scraping

# Address of the page to scrape, and the parsed HTML document fetched from it.
url <- "https://flixgem.com/"
webpage <- url %>% read_html()

Extraction of variables

# For each CSS class, select the matching nodes of the parsed page and keep
# their text content as a character vector.
film_directors <- html_text(html_nodes(webpage, ".Directors_CSS"))
film_titles <- html_text(html_nodes(webpage, ".Titles_CSS"))
film_years <- html_text(html_nodes(webpage, ".Release_Date_CSS"))

Data frame creation

# Combine the scraped vectors column-wise into a single table of films.
film_date <- data.frame(
  Title = film_titles,
  Director = film_directors,
  Release_Date = film_years
)

CHALLENGE 1: What is the oldest Woody Allen film available on Netflix?

# Keep only the films whose Director field mentions Woody Allen, then order
# them from the earliest to the latest release date. The sorted table gets its
# own name so the parsed page held in `webpage` is not replaced by a data frame.
# NOTE(review): Release_Date is scraped text, so the ordering is lexicographic;
# that is chronological only for four-digit years or ISO dates - confirm.
woody_allen_films <- film_date %>%
  filter(grepl("Woody Allen", Director, fixed = TRUE)) %>%
  arrange(Release_Date)

# First row of the ordered table = oldest film (zero rows if nothing matched).
oldest_film <- head(woody_allen_films, 1)

Load the necessary libraries

library(rvest)
library(dplyr)

Web Scraping

# Download and parse the page again for this section.
url <- "https://flixgem.com/"
webpage <- read_html(x = url)

Extract Data

# Text of the nodes matched by the general movie-data selector.
movie_data <- html_text(
  html_nodes(webpage, "selector_for_movie_data_CSS")
)

# Per-film genre, rating and language text, one character vector each.
film_genres <- html_text(html_nodes(webpage, ".Genre_CSS"))
film_ratings <- html_text(html_nodes(webpage, ".View_Rating_CSS"))
film_languages <- html_text(
  html_nodes(webpage, ".Available_Languages_CSS")
)

Data frame creation

# Table of films with their genre, rating and language text.
movie_df <- data.frame(
  Title = film_titles,
  Genre = film_genres,
  Rating = film_ratings,
  Language = film_languages
)

CHALLENGE 2: What are the three highest rated comedies available in Polish?

Movies available in Polish

# Keep films that are comedies AND list Polish among their languages.
# Both tests are case-insensitive substring matches because the columns hold
# scraped free text. Comparing a lower-cased value with "Polish" (capital P)
# can never be TRUE, so an exact comparison of that form selects nothing.
# NOTE(review): assumes Genre contains "Comedy"/"Comedies" and Language
# contains the word "Polish" - confirm against the scraped values.
polish_comedies <- movie_df %>%
  filter(
    grepl("polish", Language, ignore.case = TRUE),
    grepl("comed", Genre, ignore.case = TRUE)
  )

sort by rating

# Order from the highest to the lowest rating. Rating comes from html_text()
# and is therefore character; it is converted to numeric for the comparison so
# that e.g. "10" ranks above "9.5" instead of being sorted as text.
# Values that are not plain numbers become NA (with a coercion warning) and
# are placed last by arrange().
sorted_polish_comedies <- polish_comedies %>%
  arrange(desc(as.numeric(Rating)))

select top three

# The first three rows of the sorted table (fewer if it has fewer rows).
top_three_comedies <- sorted_polish_comedies %>%
  head(n = 3)

Load the necessary libraries

library(rvest)
library(dplyr)

Web Scraping

# Source page for this section and its parsed HTML.
url <- "https://flixgem.com/"
webpage <- url %>%
  read_html()

Extract Data

# Text of the nodes matched by the general movie-data selector.
movie_data <- html_text(
  html_nodes(webpage, "selector_for_movie_data_CSS")
)

# Original release date and Netflix release date, as scraped text.
film_release_dates <- html_text(html_nodes(webpage, ".Release_Date_CSS"))
Netflix_film_release_dates <- html_text(
  html_nodes(webpage, ".Netflix_Release_Date_CSS")
)

Dataframe Creation

# Table of films with both release dates side by side.
movie_df <- data.frame(
  Title = film_titles,
  Release_Date = film_release_dates,
  Netflix_Release_Date = Netflix_film_release_dates
)

CHALLENGE 3: For 2019 and 2020 productions, what is the average time between release and appearance on Netflix?

Films produced between 2019 and 2020

# Keep the films released in 2019 or 2020. Release_Date is scraped text, so
# the year is taken from its first four characters and matched as text.
# Comparing the text against the numbers 2019/2020 is a string comparison and
# would drop date-like values such as "2020-05-01" (they sort after "2020").
# NOTE(review): assumes Release_Date starts with a four-digit year
# ("2019" or "2019-05-01") - confirm against the scraped values.
Films_2019_2020 <- movie_df %>%
  filter(substr(Release_Date, 1, 4) %in% c("2019", "2020"))

Average time between release and appearance on Netflix

# Number of days between the original release and the Netflix release.
# Both arguments are parsed with as.Date(); the result is a plain numeric
# vector of day counts.
days_to_netflix <- function(release, netflix_release) {
  as.numeric(as.Date(netflix_release) - as.Date(release))
}

# Keep the per-film delay on the full table as well.
movie_df <- movie_df %>%
  mutate(DaysToNetflix = days_to_netflix(Release_Date, Netflix_Release_Date))

# The question is about 2019 and 2020 productions only, so the mean is taken
# over the filtered table rather than over every film in movie_df.
average_time <- mean(
  days_to_netflix(
    Films_2019_2020$Release_Date,
    Films_2019_2020$Netflix_Release_Date
  ),
  na.rm = TRUE
)

Load the necessary libraries

library(rvest)
library(dplyr)

Web Scraping

# Fetch the page to be scraped in this section and parse it.
url <- "https://flixgem.com/"
webpage <- read_html(x = url)

Extract data

# Text of every node matched by the tag selector, as a character vector.
tags <- html_text(html_nodes(webpage, "selector_for_tags_CSS"))

CHALLENGE 4: What are the most popular tags for productions available in Polish?

Frequency of each tag

# Count how many times each distinct tag occurs.
tag_frequency <- table(tags, dnn = "tags")

Most popular tags

# Order the counts from most to least frequent and keep the first ten.
top_tags <- tag_frequency %>%
  sort(decreasing = TRUE) %>%
  head(n = 10)

Load the necessary libraries

library(rvest)
library(dplyr)

Web Scraping

# Page address and its parsed HTML for this section.
url <- "https://flixgem.com/"
webpage <- url %>% read_html()

Extract Data

# Text of the nodes matched by the general movie-data selector.
film_data <- html_text(
  html_nodes(webpage, "selector_for_movie_data_CSS")
)

# Rating, title and release-date text for each film.
film_ratings <- html_text(html_nodes(webpage, ".Ratings_CSS"))
film_titles <- html_text(html_nodes(webpage, ".Title_CSS"))
film_years <- html_text(html_nodes(webpage, ".Release_Date_CSS"))

Dataframe Creation

# Table for this challenge. It is stored as `movie_df` (underscore) because
# that is the name the decade and average steps below operate on.
movie_df <- data.frame(
  Title = film_titles,
  Release_Date = film_years,
  Rating = film_ratings
)

CHALLENGE 5: What are the average ratings of films produced in each decade (i.e., 1960s, 1970s, 1980s, 1990s, etc.)?

Decade Extraction

# Label for the decade of each release date, e.g. "1994" -> "1990s".
# The year is read from the first four characters of the scraped text and
# converted to an integer; values without a readable year become "Unknown".
# Computing the decade arithmetically covers every decade, not a fixed list.
# NOTE(review): assumes Release_Date starts with a four-digit year - confirm.
decade_label <- function(release_date) {
  release_year <- suppressWarnings(as.integer(substr(release_date, 1, 4)))
  ifelse(
    is.na(release_year),
    "Unknown",
    paste0((release_year %/% 10L) * 10L, "s")
  )
}

# Derive the decade from the table's own Release_Date column.
movie_df <- movie_df %>%
  mutate(Decade = decade_label(Release_Date))

Average ratings for each decade

# Mean rating per decade. Rating is scraped text, so it is converted to
# numeric first; mean() on character input only returns NA with a warning.
# The per-row Rating column is used so each group averages its own films.
# NOTE(review): assumes Rating holds plain numbers such as "7.5" - confirm.
average_rating <- movie_df %>%
  group_by(Decade) %>%
  summarize(AverageRating = mean(as.numeric(Rating), na.rm = TRUE))

## Warning: There was 1 warning in `summarize()`.
## ℹ In argument: `AverageRating = mean(film_ratings, na.rm = TRUE)`.
## Caused by warning in `mean.default()`:
## ! l'argomento non è numerico o logico: si restituisce NA

Assignment 1, Report 2

Sofia Animato

2023-10-31

R Markdown

Including Plots

Load the necessary libraries

Web Scraping

Extraction of variables

Data frame creation

CHALLENGE 1: What is the oldest Woody Allen film available on Netflix?

Load the necessary libraries

Web Scraping

Extract Data

Data frame creation

CHALLENGE 2: What are the three highest rated comedies available in Polish?

Movies available in Polish

sort by rating

select top three

Load the necessary libraries

Web Scraping

Extract Data

Dataframe Creation

CHALLENGE 3: For 2019 and 2020 productions, what is the average time between release and appearance on Netflix?

Films produced between 2019 and 2020

Average time between release and appearance on Netflix

Load the necessary libraries

Web Scraping

Extract data

CHALLENGE 4: What are the most popular tags for productions available in Polish?

Frequency of each tag

Most popular tags

Load the necessary libraries

Web Scraping

Extract Data

Dataframe Creation

CHALLENGE 5: What are the average ratings of films produced in each decade (i.e., 1960s, 1970s, 1980s, 1990s, etc.)?

Decade Extraction

Average ratings for each decade