(1) Display top 10 rows of the dataset.
(2) Display last 10 rows in the dataset.
(3) Find shape of dataset (number of rows and columns).
# Report the dataset's shape; dim() returns c(rows, columns) in one call.
dataset_dims <- dim(tibble_movies)
cat('The number of rows in the dataset is: ', dataset_dims[1], '\n')
## The number of rows in the dataset is: 1000
cat('The number of columns in the dataset is: ', dataset_dims[2])
## The number of columns in the dataset is: 12
(4) Get information about the dataset.
## Rows: 1,000
## Columns: 12
## $ Rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Title <chr> "Guardians of the Galaxy", "Prometheus", "Split",…
## $ Genre <chr> "Action,Adventure,Sci-Fi", "Adventure,Mystery,Sci…
## $ Description <chr> "A group of intergalactic criminals are forced to…
## $ Director <chr> "James Gunn", "Ridley Scott", "M. Night Shyamalan…
## $ Actors <chr> "Chris Pratt, Vin Diesel, Bradley Cooper, Zoe Sal…
## $ Year <dbl> 2014, 2012, 2016, 2016, 2016, 2016, 2016, 2016, 2…
## $ `Runtime (Minutes)` <dbl> 121, 124, 117, 108, 123, 103, 128, 89, 141, 116, …
## $ Rating <dbl> 8.1, 7.0, 7.3, 7.2, 6.2, 6.1, 8.3, 6.4, 7.1, 7.0,…
## $ Votes <dbl> 757074, 485820, 157606, 60545, 393727, 56036, 258…
## $ `Revenue (Millions)` <dbl> 333.13, 126.46, 138.12, 270.32, 325.02, 45.13, 15…
## $ Metascore <dbl> 76, 65, 62, 59, 40, 42, 93, 71, 78, 41, 66, 74, 6…
(5) Check missing values in the dataset.
#skim_without_charts(tibble_movies)
# Count the NA entries in the Revenue and Metascore columns and print both
# totals.
revenue_na <- sum(is.na(tibble_movies[['Revenue (Millions)']]))
metascore_na <- sum(is.na(tibble_movies[['Metascore']]))
cat('The number of missing values for Revenue column is: ', revenue_na, '\n')
## The number of missing values for Revenue column is: 128
cat('The number of missing values for Metascore column is: ', metascore_na)
## The number of missing values for Metascore column is: 64
(6) Drop all missing values.
# Keep only the rows with no NA in any column, then re-count the NAs in the
# Revenue and Metascore columns to confirm that none remain.
tibble_movies_2 <- drop_na(tibble_movies)
revenue_na_2 <- tibble_movies_2 %>%
  pull(`Revenue (Millions)`) %>%
  is.na() %>%
  sum()
metascore_na_2 <- tibble_movies_2 %>%
  pull(Metascore) %>%
  is.na() %>%
  sum()
cat('The number of missing values for Revenue column is: ', revenue_na_2, '\n')
## The number of missing values for Revenue column is: 0
cat('The number of missing values for Metascore column is: ', metascore_na_2)
## The number of missing values for Metascore column is: 0
(7) Check for duplicate data.
# Rows that exactly repeat an earlier row; duplicated() flags the repeats,
# not the first occurrence.
duplicated_data <- tibble_movies_2[duplicated(tibble_movies_2), ]
duplicated_data
(8) Get overall statistics about dataset.
## Rank Title Genre Description
## Min. : 1.0 Length:838 Length:838 Length:838
## 1st Qu.: 238.2 Class :character Class :character Class :character
## Median : 475.5 Mode :character Mode :character Mode :character
## Mean : 485.2
## 3rd Qu.: 729.8
## Max. :1000.0
## Director Actors Year Runtime (Minutes)
## Length:838 Length:838 Min. :2006 Min. : 66.0
## Class :character Class :character 1st Qu.:2010 1st Qu.:101.0
## Mode :character Mode :character Median :2013 Median :112.0
## Mean :2013 Mean :114.6
## 3rd Qu.:2015 3rd Qu.:124.0
## Max. :2016 Max. :187.0
## Rating Votes Revenue (Millions) Metascore
## Min. :1.900 Min. : 178 Min. : 0.00 Min. : 11.00
## 1st Qu.:6.300 1st Qu.: 61277 1st Qu.: 13.97 1st Qu.: 47.00
## Median :6.900 Median : 136880 Median : 48.15 Median : 60.00
## Mean :6.814 Mean : 193230 Mean : 84.56 Mean : 59.58
## 3rd Qu.:7.500 3rd Qu.: 271083 3rd Qu.:116.80 3rd Qu.: 72.00
## Max. :9.000 Max. :1791916 Max. :936.63 Max. :100.00
(9) Titles of the movies with a runtime over 180 minutes.
# Titles and runtimes of the movies running strictly longer than 180 minutes.
# The comparison is `>` rather than `>=` so that a movie of exactly 180
# minutes is not reported as being "over" 180.
movies_over_180 <- tibble_movies_2 %>%
  filter(`Runtime (Minutes)` > 180) %>%
  select(Title, `Runtime (Minutes)`)
movies_over_180
(10) In which year was the average number of votes highest?
# Mean number of votes per release year, highest average first.
votes_per_year_list <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(mean_votes_per_year = mean(Votes)) %>%
  arrange(desc(mean_votes_per_year))
votes_per_year_list
# Convert Year to a factor so ggplot2 uses a discrete axis and fill scale.
votes_per_year_list$Year <- factor(votes_per_year_list$Year)
ggplot(
  data = votes_per_year_list,
  aes(x = Year, y = mean_votes_per_year, fill = Year)
) +
  geom_col() +
  labs(title = 'Average votes per year.', x = 'Year', y = 'Votes') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(labels = scales::comma) +
  # Hide the fill legend. `guides(fill = FALSE)` is deprecated as of
  # ggplot2 3.3.4 and emits a lifecycle warning; 'none' is the supported value.
  guides(fill = 'none')

(11) In which year was the average revenue highest?
# Mean revenue per release year, highest average first.
revenue_per_year_list <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(average_revenue_per_year = mean(`Revenue (Millions)`)) %>%
  arrange(desc(average_revenue_per_year))
revenue_per_year_list
# Convert Year to a factor so ggplot2 uses a discrete axis and fill scale.
revenue_per_year_list$Year <- factor(revenue_per_year_list$Year)
ggplot(
  data = revenue_per_year_list,
  aes(x = Year, y = average_revenue_per_year, fill = Year)
) +
  geom_col() +
  labs(title = 'Average revenue per year.', x = 'Year', y = 'Revenue') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(labels = scales::comma) +
  # Hide the fill legend. `guides(fill = FALSE)` is deprecated as of
  # ggplot2 3.3.4 and emits a lifecycle warning; 'none' is the supported value.
  guides(fill = 'none')

(12) Find the average rating for each director.
# Mean rating of each director's movies, best-rated directors first.
average_rating_director <- tibble_movies_2 %>%
  group_by(Director) %>%
  summarise(average_rating = mean(Rating)) %>%
  arrange(desc(average_rating))
average_rating_director
(13) Display the titles and runtimes of the 10 longest movies.
# Title and runtime of the ten longest movies, longest first.
lengthy_movies_list <- tibble_movies_2 %>%
  select(Title, `Runtime (Minutes)`) %>%
  arrange(desc(`Runtime (Minutes)`)) %>%
  head(10)
lengthy_movies_list
# Horizontal bars: runtime on the x axis, one bar per title.
ggplot(
  data = lengthy_movies_list,
  aes(x = `Runtime (Minutes)`, y = Title, fill = Title)
) +
  geom_col() +
  labs(
    title = 'Top 10 longest runtime movies.',
    x = 'Minutes',
    y = 'Title movie'
  ) +
  # Hide the fill legend. `guides(fill = FALSE)` is deprecated as of
  # ggplot2 3.3.4 and emits a lifecycle warning; 'none' is the supported value.
  guides(fill = 'none')

(14) Display number of movies per year.
# Number of movies released in each year, busiest years first.
movies_per_year <- tibble_movies_2 %>%
  count(Year, name = 'counter') %>%
  arrange(desc(counter))
movies_per_year
(15) Find the most popular movie title (highest revenue).
# Title and revenue of the movie(s) whose revenue equals the column maximum.
most_popular_movie <- tibble_movies_2 %>%
  select(Title, `Revenue (Millions)`) %>%
  filter(`Revenue (Millions)` == max(`Revenue (Millions)`))
most_popular_movie
(16) Find the least popular movie title (lowest revenue).
# Title and revenue of the movie(s) whose revenue equals the column minimum.
most_unpopular_movie <- tibble_movies_2 %>%
  select(Title, `Revenue (Millions)`) %>%
  filter(`Revenue (Millions)` == min(`Revenue (Millions)`))
most_unpopular_movie
(17) Display the 10 highest rated movie titles and directors.
# Title, director and rating of the ten best-rated movies.
highest_rated_movies <- tibble_movies_2 %>%
  select(Title, Director, Rating) %>%
  arrange(desc(Rating)) %>%
  head(10)
highest_rated_movies
# One bar per title, filled by director; x labels are tilted 45 degrees.
ggplot(highest_rated_movies, aes(Title, Rating, fill = Director)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = 'Top 10 highest rated movies.', x = 'Title', y = 'Rating')

(18) Display top 10 highest revenue movie titles.
# Title and revenue of the ten highest-grossing movies, highest first.
highest_revenue_movies <- tibble_movies_2 %>%
  arrange(desc(`Revenue (Millions)`)) %>%
  select(Title, `Revenue (Millions)`) %>%
  head(10)
highest_revenue_movies
# Horizontal bars: revenue on the x axis, one bar per title.
ggplot(
  data = highest_revenue_movies,
  aes(x = `Revenue (Millions)`, y = Title, fill = Title)
) +
  geom_col() +
  labs(title = 'Top 10 highest revenue movies.', x = 'Revenue', y = 'Title') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # Hide the fill legend. `guides(fill = FALSE)` is deprecated as of
  # ggplot2 3.3.4 and emits a lifecycle warning; 'none' is the supported value.
  guides(fill = 'none')

(19) Find average rating of movies per year.
# Mean rating per release year, best-rated years first.
average_rating_per_year <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(average_rating = mean(Rating)) %>%
  arrange(desc(average_rating))
average_rating_per_year
# Convert Year to a factor so ggplot2 uses a discrete axis and fill scale.
average_rating_per_year$Year <- factor(average_rating_per_year$Year)
ggplot(
  data = average_rating_per_year,
  aes(x = Year, y = average_rating, fill = Year)
) +
  geom_col() +
  labs(
    title = 'Average rating of movies per year.',
    x = 'Year',
    y = 'Average rating'
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # Hide the fill legend. `guides(fill = FALSE)` is deprecated as of
  # ggplot2 3.3.4 and emits a lifecycle warning; 'none' is the supported value.
  guides(fill = 'none')

(20) Does rating affect the revenue?
# Scatterplot with one point per movie: rating on x, revenue on y.
ggplot(tibble_movies_2, aes(Rating, `Revenue (Millions)`)) +
  geom_point() +
  labs(title = 'Scatterplot Rating vs Revenue', x = 'Rating', y = 'Revenue') +
  theme_bw()

(21) Classify movies based on ratings (excellent, good and
average).
# Add a Category column derived from Rating. The conditions are evaluated in
# order, so a rating of 7 or more is 'Excellent', 6 up to (not including) 7 is
# 'Good', and everything else is 'Average'.
tibble_movies_2 <- mutate(
  tibble_movies_2,
  Category = case_when(
    Rating >= 7 ~ 'Excellent',
    Rating >= 6 ~ 'Good',
    TRUE ~ 'Average'
  )
)
tibble_movies_2
(22) Count number of action movies.
# Count the rows whose Genre string contains 'Action'. na.rm = TRUE mirrors
# filter(), which drops rows where the condition is NA.
action_movies <- sum(str_detect(tibble_movies_2$Genre, 'Action'), na.rm = TRUE)
cat('The number of action movies is: ', action_movies)
## The number of action movies is: 277
(23) Find unique values from genre.
# Split the comma-separated Genre field into one row per genre, then keep each
# distinct genre value once.
genres <- distinct(
  separate_rows(tibble_movies_2, Genre, sep = ','),
  Genre
)
genres