(1) Display top 10 rows of the dataset.

# Preview the first ten rows of the data set.
head(tibble_movies, n = 10)

(2) Display last 10 rows in the dataset.

# Preview the last ten rows of the data set.
tail(tibble_movies, n = 10)

(3) Find shape of dataset (number of rows and columns).

# Report the table dimensions: row count first, then column count.
cat("The number of rows in the dataset is: ", nrow(tibble_movies), "\n")

## The number of rows in the dataset is:  1000

cat("The number of columns in the dataset is: ", ncol(tibble_movies))

## The number of columns in the dataset is:  12

(4) Get information about the dataset.

# Column-wise overview: one line per column with its type and first values.
tibble_movies %>%
  glimpse()

## Rows: 1,000
## Columns: 12
## $ Rank                 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Title                <chr> "Guardians of the Galaxy", "Prometheus", "Split",…
## $ Genre                <chr> "Action,Adventure,Sci-Fi", "Adventure,Mystery,Sci…
## $ Description          <chr> "A group of intergalactic criminals are forced to…
## $ Director             <chr> "James Gunn", "Ridley Scott", "M. Night Shyamalan…
## $ Actors               <chr> "Chris Pratt, Vin Diesel, Bradley Cooper, Zoe Sal…
## $ Year                 <dbl> 2014, 2012, 2016, 2016, 2016, 2016, 2016, 2016, 2…
## $ `Runtime (Minutes)`  <dbl> 121, 124, 117, 108, 123, 103, 128, 89, 141, 116, …
## $ Rating               <dbl> 8.1, 7.0, 7.3, 7.2, 6.2, 6.1, 8.3, 6.4, 7.1, 7.0,…
## $ Votes                <dbl> 757074, 485820, 157606, 60545, 393727, 56036, 258…
## $ `Revenue (Millions)` <dbl> 333.13, 126.46, 138.12, 270.32, 325.02, 45.13, 15…
## $ Metascore            <dbl> 76, 65, 62, 59, 40, 42, 93, 71, 78, 41, 66, 74, 6…

(5) Check missing values in the dataset.

# Count the NA entries in the revenue and metascore columns.
# (Alternative overview, left disabled: skim_without_charts(tibble_movies))
revenue_na <- sum(is.na(tibble_movies[["Revenue (Millions)"]]))
metascore_na <- sum(is.na(tibble_movies[["Metascore"]]))
cat("The number of missing values for Revenue column is: ", revenue_na, "\n")

## The number of missing values for Revenue column is:  128

cat("The number of missing values for Metascore column is: ", metascore_na)

## The number of missing values for Metascore column is:  64

(6) Drop all missing values.

# Keep only the rows without an NA in any column, then re-count the NA
# entries in the revenue and metascore columns of the cleaned table.
tibble_movies_2 <- drop_na(tibble_movies)

revenue_na_2 <- sum(is.na(tibble_movies_2[["Revenue (Millions)"]]))
metascore_na_2 <- sum(is.na(tibble_movies_2[["Metascore"]]))
cat("The number of missing values for Revenue column is: ", revenue_na_2, "\n")

## The number of missing values for Revenue column is:  0

cat("The number of missing values for Metascore column is: ", metascore_na_2)

## The number of missing values for Metascore column is:  0

(7) Check for duplicate data.

# Rows that repeat an earlier row in every column.
duplicated_data <- filter(tibble_movies_2, duplicated(tibble_movies_2))

duplicated_data

(8) Get overall statistics about dataset.

# Per-column summary of the cleaned table.
tibble_movies_2 %>%
  summary()

##       Rank           Title              Genre           Description       
##  Min.   :   1.0   Length:838         Length:838         Length:838        
##  1st Qu.: 238.2   Class :character   Class :character   Class :character  
##  Median : 475.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 485.2                                                           
##  3rd Qu.: 729.8                                                           
##  Max.   :1000.0                                                           
##    Director            Actors               Year      Runtime (Minutes)
##  Length:838         Length:838         Min.   :2006   Min.   : 66.0    
##  Class :character   Class :character   1st Qu.:2010   1st Qu.:101.0    
##  Mode  :character   Mode  :character   Median :2013   Median :112.0    
##                                        Mean   :2013   Mean   :114.6    
##                                        3rd Qu.:2015   3rd Qu.:124.0    
##                                        Max.   :2016   Max.   :187.0    
##      Rating          Votes         Revenue (Millions)   Metascore     
##  Min.   :1.900   Min.   :    178   Min.   :  0.00     Min.   : 11.00  
##  1st Qu.:6.300   1st Qu.:  61277   1st Qu.: 13.97     1st Qu.: 47.00  
##  Median :6.900   Median : 136880   Median : 48.15     Median : 60.00  
##  Mean   :6.814   Mean   : 193230   Mean   : 84.56     Mean   : 59.58  
##  3rd Qu.:7.500   3rd Qu.: 271083   3rd Qu.:116.80     3rd Qu.: 72.00  
##  Max.   :9.000   Max.   :1791916   Max.   :936.63     Max.   :100.00

(9) Titles of the movies with a runtime of 180 minutes or more.

# Films whose runtime is at least 180 minutes, listed with their runtime.
movies_over_180 <- tibble_movies_2 %>%
  select(Title, `Runtime (Minutes)`) %>%
  filter(`Runtime (Minutes)` >= 180)

movies_over_180

(10) In which year was the average number of votes the highest?

# Mean number of votes per release year, highest first.
votes_per_year_list <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(mean_votes_per_year = mean(Votes)) %>%
  arrange(desc(mean_votes_per_year))

votes_per_year_list

# Convert Year to a factor so it is plotted as a discrete variable.
votes_per_year_list$Year <- factor(votes_per_year_list$Year)

ggplot(data = votes_per_year_list, aes(x = Year, y = mean_votes_per_year, fill = Year)) +
  geom_col() +
  labs(title = "Average votes per year.", x = "Year", y = "Votes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(labels = scales::comma) +
  # Hide the fill legend; "none" replaces the deprecated `FALSE`
  # (deprecated as of ggplot2 3.3.4).
  guides(fill = "none")

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

(11) In which year was the average revenue the highest?

# Mean revenue (in millions) per release year, highest first.
revenue_per_year_list <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(average_revenue_per_year = mean(`Revenue (Millions)`)) %>%
  arrange(desc(average_revenue_per_year))

revenue_per_year_list

# Convert Year to a factor so it is plotted as a discrete variable.
revenue_per_year_list$Year <- factor(revenue_per_year_list$Year)

ggplot(data = revenue_per_year_list, aes(x = Year, y = average_revenue_per_year, fill = Year)) +
  geom_col() +
  labs(title = "Average revenue per year.", x = "Year", y = "Revenue") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(labels = scales::comma) +
  # Hide the fill legend; "none" replaces the deprecated `FALSE`
  # (deprecated as of ggplot2 3.3.4).
  guides(fill = "none")

(12) Find the average rating for each director.

# Mean rating of each director's films, best-rated directors first.
average_rating_director <- tibble_movies_2 %>%
  group_by(Director) %>%
  summarize(average_rating = mean(Rating)) %>%
  arrange(desc(average_rating))

average_rating_director

(13) Display the titles and runtimes of the 10 longest movies.

# The ten films with the longest runtime, longest first.
lengthy_movies_list <- tibble_movies_2 %>%
  select(Title, `Runtime (Minutes)`) %>%
  arrange(desc(`Runtime (Minutes)`)) %>%
  head(10)

lengthy_movies_list

ggplot(data = lengthy_movies_list, aes(x = `Runtime (Minutes)`, y = Title, fill = Title)) +
  geom_col() +
  labs(title = "Top 10 longest runtime movies.", x = "Minutes", y = "Title movie") +
  # Hide the fill legend; "none" replaces the deprecated `FALSE`
  # (deprecated as of ggplot2 3.3.4).
  guides(fill = "none")

(14) Display number of movies per year.

# Number of films per release year, busiest year first.
movies_per_year <- tibble_movies_2 %>%
  count(Year, name = "counter") %>%
  arrange(desc(counter))

movies_per_year

(15) Find the most popular movie title (highest revenue).

# Film(s) whose revenue equals the maximum revenue in the cleaned table.
most_popular_movie <- tibble_movies_2 %>%
  select(Title, `Revenue (Millions)`) %>%
  filter(`Revenue (Millions)` == max(`Revenue (Millions)`))

most_popular_movie

(16) Find the least popular movie title (lowest revenue).

# Film(s) whose revenue equals the minimum revenue in the cleaned table.
most_unpopular_movie <- tibble_movies_2 %>%
  select(Title, `Revenue (Millions)`) %>%
  filter(`Revenue (Millions)` == min(`Revenue (Millions)`))

most_unpopular_movie

(17) Display the 10 highest rated movie titles and directors.

# The ten best-rated films with their directors, best first.
highest_rated_movies <- tibble_movies_2 %>%
  select(Title, Director, Rating) %>%
  arrange(desc(Rating)) %>%
  head(n = 10)

highest_rated_movies

# One bar per film, coloured by director; titles are tilted 45 degrees.
ggplot(highest_rated_movies, aes(x = Title, y = Rating, fill = Director)) +
  geom_col() +
  labs(title = "Top 10 highest rated movies.", x = "Title", y = "Rating") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

(18) Display top 10 highest revenue movie titles.

# The ten films with the highest revenue, highest first.
highest_revenue_movies <- tibble_movies_2 %>%
  arrange(desc(`Revenue (Millions)`)) %>%
  select(Title, `Revenue (Millions)`) %>%
  head(10)

highest_revenue_movies

ggplot(data = highest_revenue_movies, aes(x = `Revenue (Millions)`, y = Title, fill = Title)) +
  geom_col() +
  labs(title = "Top 10 highest revenue movies.", x = "Revenue", y = "Title") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # Hide the fill legend; "none" replaces the deprecated `FALSE`
  # (deprecated as of ggplot2 3.3.4).
  guides(fill = "none")

(19) Find average rating of movies per year.

# Mean rating per release year, best year first.
average_rating_per_year <- tibble_movies_2 %>%
  group_by(Year) %>%
  summarize(average_rating = mean(Rating)) %>%
  arrange(desc(average_rating))

average_rating_per_year

# Convert Year to a factor so it is plotted as a discrete variable.
average_rating_per_year$Year <- factor(average_rating_per_year$Year)

ggplot(data = average_rating_per_year, aes(x = Year, y = average_rating, fill = Year)) +
  geom_col() +
  labs(title = "Average rating of movies per year.", x = "Year", y = "Average rating") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # Hide the fill legend; "none" replaces the deprecated `FALSE`
  # (deprecated as of ggplot2 3.3.4).
  guides(fill = "none")

(20) Does rating affect the revenue?

# Scatterplot of revenue against rating, one point per film.
ggplot(tibble_movies_2, aes(x = Rating, y = `Revenue (Millions)`)) +
  geom_point() +
  labs(title = "Scatterplot Rating vs Revenue", x = "Rating", y = "Revenue") +
  theme_bw()

(21) Classify movies based on ratings (excellent, good and average).

# Add a Category column from the rating: 7 and above is "Excellent",
# 6 up to (not including) 7 is "Good", everything else is "Average".
tibble_movies_2 <- mutate(
  tibble_movies_2,
  Category = case_when(
    Rating >= 7 ~ "Excellent",
    Rating >= 6 ~ "Good",
    TRUE ~ "Average"
  )
)

tibble_movies_2

(22) Count number of action movies.

# Count the films whose Genre string contains "Action".
action_movies <- nrow(filter(tibble_movies_2, str_detect(Genre, "Action")))

cat("The number of action movies is: ", action_movies)

## The number of action movies is:  277

(23) Find unique values from genre.

# Split the comma-separated Genre field into one row per genre, then keep
# each genre name once.
genres <- tibble_movies_2 %>%
  separate_rows(Genre, sep = ",") %>%
  distinct(Genre)

genres

movie_imdb

Edwin Lee

2023-09-20

(1) Display top 10 rows of the dataset.

(2) Display last 10 rows in the dataset.

(3) Find shape of dataset (number of rows and columns).

(4) Get information about the dataset.

(5) Check missing values in the dataset.

(6) Drop all missing values.

(7) Check for duplicate data.

(8) Get overall statistics about dataset.

(9) Titles of the movies with a runtime of 180 minutes or more.

(10) In which year was the average number of votes the highest?

(11) In which year was the average revenue the highest?

(12) Find the average rating for each director.

(13) Display the titles and runtimes of the 10 longest movies.

(14) Display number of movies per year.

(15) Find the most popular movie title (highest revenue).

(16) Find the least popular movie title (lowest revenue).

(17) Display the 10 highest rated movie titles and directors.

(18) Display top 10 highest revenue movie titles.

(19) Find average rating of movies per year.

(20) Does rating affect the revenue?

(21) Classify movies based on ratings (excellent, good and average).

(22) Count number of action movies.

(23) Find unique values from genre.