(1) Display top 10 rows of the dataset.

# Preview the first 10 rows of the dataset.
tibble_movies %>%
  head(n = 10)

(2) Display last 10 rows in the dataset.

# Preview the last 10 rows of the dataset.
tibble_movies %>%
  tail(n = 10)

(3) Find shape of dataset (number of rows and columns).

# Report the dataset's dimensions: row count first, then column count.
# dim() returns c(rows, columns), so the two elements match nrow()/ncol().
cat("The number of rows in the dataset is: ", dim(tibble_movies)[[1L]], "\n")
## The number of rows in the dataset is:  1000
cat("The number of columns in the dataset is: ", dim(tibble_movies)[[2L]])
## The number of columns in the dataset is:  12

(4) Get information about the dataset.

# Compact column-by-column overview: name, type and the first few values.
tibble_movies %>%
  glimpse()
## Rows: 1,000
## Columns: 12
## $ Rank                 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Title                <chr> "Guardians of the Galaxy", "Prometheus", "Split",…
## $ Genre                <chr> "Action,Adventure,Sci-Fi", "Adventure,Mystery,Sci…
## $ Description          <chr> "A group of intergalactic criminals are forced to…
## $ Director             <chr> "James Gunn", "Ridley Scott", "M. Night Shyamalan…
## $ Actors               <chr> "Chris Pratt, Vin Diesel, Bradley Cooper, Zoe Sal…
## $ Year                 <dbl> 2014, 2012, 2016, 2016, 2016, 2016, 2016, 2016, 2…
## $ `Runtime (Minutes)`  <dbl> 121, 124, 117, 108, 123, 103, 128, 89, 141, 116, …
## $ Rating               <dbl> 8.1, 7.0, 7.3, 7.2, 6.2, 6.1, 8.3, 6.4, 7.1, 7.0,…
## $ Votes                <dbl> 757074, 485820, 157606, 60545, 393727, 56036, 258…
## $ `Revenue (Millions)` <dbl> 333.13, 126.46, 138.12, 270.32, 325.02, 45.13, 15…
## $ Metascore            <dbl> 76, 65, 62, 59, 40, 42, 93, 71, 78, 41, 66, 74, 6…

(5) Check missing values in the dataset.

# Alternative overview (disabled): skim_without_charts(tibble_movies)
# Count the NA entries in the two columns inspected here.
revenue_na <- sum(is.na(tibble_movies[["Revenue (Millions)"]]))
metascore_na <- sum(is.na(tibble_movies[["Metascore"]]))
cat("The number of missing values for Revenue column is: ", revenue_na, "\n")
## The number of missing values for Revenue column is:  128
cat("The number of missing values for Metascore column is: ", metascore_na)
## The number of missing values for Metascore column is:  64

(6) Drop all missing values.

# Keep only the rows that have no NA in any column.
tibble_movies_2 <- drop_na(tibble_movies)

# Re-count the NAs on the cleaned data to confirm none remain.
revenue_na_2 <- sum(is.na(tibble_movies_2[["Revenue (Millions)"]]))
metascore_na_2 <- sum(is.na(tibble_movies_2[["Metascore"]]))
cat("The number of missing values for Revenue column is: ", revenue_na_2, "\n")
## The number of missing values for Revenue column is:  0
cat("The number of missing values for Metascore column is: ", metascore_na_2)
## The number of missing values for Metascore column is:  0

(7) Check for duplicate data.

# Rows that are exact repeats of an earlier row (the first occurrence is
# not flagged by duplicated()).
duplicated_data <- tibble_movies_2[duplicated(tibble_movies_2), ]

duplicated_data

(8) Get overall statistics about dataset.

# Per-column summary statistics of the cleaned dataset.
tibble_movies_2 %>%
  summary()
##       Rank           Title              Genre           Description       
##  Min.   :   1.0   Length:838         Length:838         Length:838        
##  1st Qu.: 238.2   Class :character   Class :character   Class :character  
##  Median : 475.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 485.2                                                           
##  3rd Qu.: 729.8                                                           
##  Max.   :1000.0                                                           
##    Director            Actors               Year      Runtime (Minutes)
##  Length:838         Length:838         Min.   :2006   Min.   : 66.0    
##  Class :character   Class :character   1st Qu.:2010   1st Qu.:101.0    
##  Mode  :character   Mode  :character   Median :2013   Median :112.0    
##                                        Mean   :2013   Mean   :114.6    
##                                        3rd Qu.:2015   3rd Qu.:124.0    
##                                        Max.   :2016   Max.   :187.0    
##      Rating          Votes         Revenue (Millions)   Metascore     
##  Min.   :1.900   Min.   :    178   Min.   :  0.00     Min.   : 11.00  
##  1st Qu.:6.300   1st Qu.:  61277   1st Qu.: 13.97     1st Qu.: 47.00  
##  Median :6.900   Median : 136880   Median : 48.15     Median : 60.00  
##  Mean   :6.814   Mean   : 193230   Mean   : 84.56     Mean   : 59.58  
##  3rd Qu.:7.500   3rd Qu.: 271083   3rd Qu.:116.80     3rd Qu.: 72.00  
##  Max.   :9.000   Max.   :1791916   Max.   :936.63     Max.   :100.00

(9) Titles of the movies with a runtime of more than 180 minutes.

# Titles (with runtime) of the movies that run for more than 180 minutes.
# The comparison is strict: a film of exactly 180 minutes is not "over" 180.
movies_over_180 <- tibble_movies_2 %>%
  filter(`Runtime (Minutes)` > 180) %>%
  select(Title, `Runtime (Minutes)`)

movies_over_180

(10) In which year was the average number of votes the highest?

# Mean number of votes per year, sorted from highest to lowest.
votes_per_year_list <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(mean_votes_per_year = mean(Votes)) %>%
  arrange(desc(mean_votes_per_year))

votes_per_year_list
# Convert Year to a factor so it is plotted as a discrete axis / fill.
votes_per_year_list$Year <- factor(votes_per_year_list$Year)

ggplot(
  data = votes_per_year_list,
  aes(x = Year, y = mean_votes_per_year, fill = Year)
) +
  geom_col() +
  labs(title = 'Average votes per year.', x = 'Year', y = 'Votes') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(labels = scales::comma) +
  # "none" hides the fill legend; `FALSE` is deprecated as of ggplot2 3.3.4
  # and raised a lifecycle warning when this chunk was rendered.
  guides(fill = "none")

(11) In which year was the average revenue the highest?

# Mean revenue per year, sorted from highest to lowest.
revenue_per_year_list <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(average_revenue_per_year = mean(`Revenue (Millions)`)) %>%
  arrange(desc(average_revenue_per_year))

revenue_per_year_list
# Convert Year to a factor so it is plotted as a discrete axis / fill.
revenue_per_year_list$Year <- factor(revenue_per_year_list$Year)

ggplot(
  data = revenue_per_year_list,
  aes(x = Year, y = average_revenue_per_year, fill = Year)
) +
  geom_col() +
  labs(title = 'Average revenue per year.', x = 'Year', y = 'Revenue') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(labels = scales::comma) +
  # "none" hides the fill legend; `FALSE` is deprecated as of ggplot2 3.3.4.
  guides(fill = "none")

(12) Find the average rating for each director.

# Mean rating of each director's movies, best-rated directors first.
average_rating_director <- tibble_movies_2 %>%
  group_by(Director) %>%
  summarize(average_rating = mean(Rating)) %>%
  arrange(desc(average_rating))

average_rating_director

(13) Display the titles and runtimes of the 10 longest movies.

# The 10 movies with the longest runtime (title and runtime only).
lengthy_movies_list <- tibble_movies_2 %>%
  select(Title, `Runtime (Minutes)`) %>%
  arrange(desc(`Runtime (Minutes)`)) %>%
  head(10)

lengthy_movies_list
ggplot(
  data = lengthy_movies_list,
  aes(x = `Runtime (Minutes)`, y = Title, fill = Title)
) +
  geom_col() +
  labs(title = 'Top 10 longest runtime movies.', x = 'Minutes', y = 'Title movie') +
  # "none" hides the fill legend; `FALSE` is deprecated as of ggplot2 3.3.4.
  guides(fill = "none")

(14) Display number of movies per year.

# Number of movies per year, busiest years first.
movies_per_year <- tibble_movies_2 %>%
  count(Year, name = "counter") %>%
  arrange(desc(counter))

movies_per_year

(16) Find the title of the least successful movie (lowest revenue).

# Title and revenue of the movie(s) whose revenue equals the dataset minimum.
# Every row tied at the minimum is kept.
most_unpopular_movie <- tibble_movies_2 %>%
  select(Title, `Revenue (Millions)`) %>%
  filter(`Revenue (Millions)` == min(`Revenue (Millions)`))

most_unpopular_movie

(17) Display the 10 highest rated movie titles and directors.

# The 10 best-rated movies with their directors.
highest_rated_movies <- tibble_movies_2 %>%
  select(Title, Director, Rating) %>%
  arrange(desc(Rating)) %>%
  head(n = 10)

highest_rated_movies
# One bar per title, coloured by director (legend kept on purpose).
highest_rated_movies %>%
  ggplot(aes(x = Title, y = Rating, fill = Director)) +
  geom_col() +
  labs(title = "Top 10 highest rated movies.", x = "Title", y = "Rating") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

(18) Display top 10 highest revenue movie titles.

# The 10 movies with the highest revenue (title and revenue only).
highest_revenue_movies <- tibble_movies_2 %>%
  arrange(desc(`Revenue (Millions)`)) %>%
  select(Title, `Revenue (Millions)`) %>%
  head(10)

highest_revenue_movies
ggplot(
  data = highest_revenue_movies,
  aes(x = `Revenue (Millions)`, y = Title, fill = Title)
) +
  geom_col() +
  labs(title = 'Top 10 highest revenue movies.', x = 'Revenue', y = 'Title') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # "none" hides the fill legend; `FALSE` is deprecated as of ggplot2 3.3.4.
  guides(fill = "none")

(19) Find average rating of movies per year.

# Mean rating per year, sorted from highest to lowest.
average_rating_per_year <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(average_rating = mean(Rating)) %>%
  arrange(desc(average_rating))

average_rating_per_year
# Convert Year to a factor so it is plotted as a discrete axis / fill.
average_rating_per_year$Year <- factor(average_rating_per_year$Year)

ggplot(
  data = average_rating_per_year,
  aes(x = Year, y = average_rating, fill = Year)
) +
  geom_col() +
  labs(title = 'Average rating of movies per year.', x = 'Year', y = 'Average rating') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # "none" hides the fill legend; `FALSE` is deprecated as of ggplot2 3.3.4.
  guides(fill = "none")

(20) Does rating affect the revenue?

# Scatter plot of revenue against rating, one point per movie.
tibble_movies_2 %>%
  ggplot(aes(x = Rating, y = `Revenue (Millions)`)) +
  geom_point() +
  labs(
    title = "Scatterplot Rating vs Revenue",
    x = "Rating",
    y = "Revenue"
  ) +
  theme_bw()

(21) Classify movies based on ratings (excellent, good and average).

# Add a Category column derived from Rating. case_when() uses the first
# matching condition, so the thresholds are checked from highest to lowest:
#   Rating >= 7 -> Excellent, Rating >= 6 -> Good, anything else -> Average.
tibble_movies_2 <- mutate(
  tibble_movies_2,
  Category = case_when(
    Rating >= 7 ~ "Excellent",
    Rating >= 6 ~ "Good",
    TRUE ~ "Average"
  )
)

tibble_movies_2

(22) Count number of action movies.

# Count the movies whose Genre string contains "Action".
# na.rm = TRUE mirrors filter(), which drops rows where the test is NA.
action_movies <- sum(str_detect(tibble_movies_2$Genre, "Action"), na.rm = TRUE)

cat("The number of action movies is: ", action_movies)
## The number of action movies is:  277

(23) Find unique values from genre.

# Split the comma-separated Genre field into one row per genre, then keep
# each distinct genre once.
genres <- distinct(
  separate_rows(tibble_movies_2, Genre, sep = ","),
  Genre
)

genres