options(scipen = 999)
library(lubridate)

## Warning: package 'lubridate' was built under R version 4.5.3

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.3

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.5.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.5.3

## corrplot 0.95 loaded

library(stringr)

## Warning: package 'stringr' was built under R version 4.5.3

library(readr)

## Warning: package 'readr' was built under R version 4.5.3

# Load the TMDB 5000 movies CSV from disk, decoding the file as UTF-8.
tmdb_5000_movies <- read_csv(
  file = "C:/Users/mkuma/Downloads/archive(2)/tmdb_5000_movies.csv",
  locale = locale(encoding = "UTF-8")
)

## Rows: 4803 Columns: 20

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (12): genres, homepage, keywords, original_language, original_title, ov...
## dbl   (7): budget, id, popularity, revenue, runtime, vote_average, vote_count
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data cleaning and preparation

# Number of missing values in each column of the raw data.
tmdb_5000_movies %>%
  is.na() %>%
  colSums()

##               budget               genres             homepage 
##                    0                    0                 3091 
##                   id             keywords    original_language 
##                    0                    0                    0 
##       original_title             overview           popularity 
##                    0                    4                    0 
## production_companies production_countries         release_date 
##                    0                    0                    1 
##              revenue              runtime     spoken_languages 
##                    0                    2                    0 
##               status              tagline                title 
##                    0                  844                    0 
##         vote_average           vote_count 
##                    0                    0

# Build the analysis table: keep rows that have a runtime and a release date
# and whose budget, revenue and runtime are all positive, then derive the
# helper columns used in the later sections.
movies_clean <- tmdb_5000_movies %>%
  filter(
    !is.na(runtime),
    !is.na(release_date),
    budget > 0,
    revenue > 0,
    runtime > 0
  ) %>%
  mutate(
    year = year(as.Date(release_date)),
    # Take the first "name" entry of the genres string, then strip the
    # key and the quotes so only the genre name itself remains.
    main_genre = str_extract(genres, '"name":\\s*"[^"]+"'),
    main_genre = str_remove_all(main_genre, '"name":\\s*"|\\"'),
    profit = revenue - budget,
    budget_category = case_when(
      budget >= 150000000 ~ "High",
      budget >= 50000000 ~ "Medium",
      TRUE ~ "Low"
    ),
    rating_category = case_when(
      vote_average >= 7 ~ "High",
      vote_average >= 5 ~ "Medium",
      TRUE ~ "Low"
    )
  )

Interpretation: The dataset was cleaned by removing rows with missing values in important variables such as runtime and release date, and by keeping only movies with a positive budget, revenue and runtime. New variables such as year, main genre, profit, budget category and rating category were created for analysis.

#Exploratory data analysis

A. Basic distribution

1. Distribution of movie ratings.

# Histogram of vote_average across all cleaned movies (20 bins).
movies_clean %>%
  ggplot(aes(x = vote_average)) +
  geom_histogram(bins = 20, fill = "thistle", color = "black") +
  ggtitle("distribution of movies ratings") +
  xlab("ratings") +
  ylab("numbers of movies") +
  theme_minimal()

Interpretation: The graph shows how ratings are spread across all movies in the dataset.

Insights: Most movies have ratings between 5 and 7. very few movies have extremely high or very low ratings. this means average rated movies are more common.

2.Distribution of Runtime

# Histogram of runtime across all cleaned movies (20 bins).
# The title previously repeated the ratings plot's title; it now names the
# variable that is actually plotted.
ggplot(movies_clean, aes(x = runtime)) +
  geom_histogram(bins = 20, fill = "lightpink1", color = "black") +
  labs(
    title = "distribution of movies runtime",
    x = "runtime",
    y = "numbers of movies"
  ) +
  theme_minimal()

Interpretation: The graph shows how movie runtimes are distributed.

Insights: Most movies have a runtime in the middle range. Very short and very long movies are relatively rare.

B.relationship analysis

4.Vote count VS rating

# Scatter plot of vote_average against vote_count with a fitted linear trend.
movies_clean %>%
  ggplot(aes(x = vote_count, y = vote_average)) +
  geom_point(color = "pink3", alpha = 0.5) +
  geom_smooth(method = "lm", color = "darkgreen") +
  ggtitle("Vote Count vs Rating") +
  xlab("Number of Votes") +
  ylab("Rating") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: This graph shows the relationship between vote count and ratings.

Insights: Movies with more votes have more stable ratings. movies with fewer votes show more variation. this means ratings become more reliable when more people vote.

5.Runtime VS rating

# Scatter plot of vote_average against runtime with a fitted linear trend.
movies_clean %>%
  ggplot(aes(x = runtime, y = vote_average)) +
  geom_point(color = "violet", alpha = 0.5) +
  geom_smooth(method = "lm", color = "skyblue") +
  ggtitle("runtime vs rating") +
  xlab("runtime") +
  ylab("rating") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: This graph shows how runtime is related to ratings.

Insights: From the graph, there is a slight positive relationship between runtime and rating.That means as runtime increases, rating tends to increase a little, but not very strongly.

C.Time trend analysis

6. Movies over time

# Number of cleaned movies per release year.
movies_year <- count(movies_clean, year, name = "total_movies")
movies_year

## # A tibble: 89 × 2
##     year total_movies
##    <dbl>        <int>
##  1  1916            1
##  2  1925            1
##  3  1927            1
##  4  1929            1
##  5  1930            1
##  6  1932            1
##  7  1933            2
##  8  1934            1
##  9  1935            1
## 10  1936            2
## # ℹ 79 more rows

# Line chart of the yearly movie counts.
movies_year %>%
  ggplot(aes(x = year, y = total_movies)) +
  geom_line(linewidth = 1, color = "violet") +
  ggtitle("Number of Movies Released Per Year") +
  xlab("Year") +
  ylab("Number of Movies") +
  theme_minimal()

Interpretation: This graph shows how many movies were released each year.

Insights: The number of movies increases over time. this shows growth in the film industry.

7.Average profit over time

# Mean profit of the cleaned movies for each release year.
profit_year <- summarise(
  group_by(movies_clean, year),
  avg_profit = mean(profit),
  .groups = "drop"
)
profit_year

## # A tibble: 89 × 2
##     year avg_profit
##    <dbl>      <dbl>
##  1  1916   8008844 
##  2  1925  21755000 
##  3  1927 -91969578 
##  4  1929   3979000 
##  5  1930   4050000 
##  6  1932        21 
##  7  1933   1921000 
##  8  1934   4175000 
##  9  1935   2593000 
## 10  1936   5018000.
## # ℹ 79 more rows

# Line chart of mean profit per year; the y axis is labelled in millions.
profit_year %>%
  ggplot(aes(x = year, y = avg_profit)) +
  geom_line(color = "purple") +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  ggtitle("avg Profit Over Years") +
  xlab("Year") +
  ylab("avg Profit") +
  theme_minimal()

Interpretation: This graph shows how average profit changes over years.

Insights: Profit is not stable every year. some years have higher profits than others. this shows fluctuation in movie success.

8.Total revenue over years

# Total revenue of the cleaned movies for each release year.
revenue_year <- summarise(
  group_by(movies_clean, year),
  total_revenue = sum(revenue),
  .groups = "drop"
)
revenue_year

## # A tibble: 89 × 2
##     year total_revenue
##    <dbl>         <dbl>
##  1  1916       8394751
##  2  1925      22000000
##  3  1927        650422
##  4  1929       4358000
##  5  1930       8000000
##  6  1932            25
##  7  1933       4481000
##  8  1934       4500000
##  9  1935       3202000
## 10  1936      11236000
## # ℹ 79 more rows

# Line chart of total revenue per year; the y axis is labelled in millions.
revenue_year %>%
  ggplot(aes(x = year, y = total_revenue)) +
  geom_line(color = "red") +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  ggtitle("Total Revenue Over Years") +
  xlab("Year") +
  ylab("Total Revenue") +
  theme_minimal()

Interpretation: This graph shows total revenue generated each year.

Insights: Revenue increases over time. this means the movie industry is growing in value.

9.Profit by Budget Category

# Mean profit and number of movies within each budget category.
profit_category <- summarise(
  group_by(movies_clean, budget_category),
  avg_profit = mean(profit),
  total_movies = n(),
  .groups = "drop"
)
profit_category

## # A tibble: 3 × 3
##   budget_category avg_profit total_movies
##   <chr>                <dbl>        <int>
## 1 High            421265144.          143
## 2 Low              43045487.         2273
## 3 Medium          125629765.          813

# Bar chart of mean profit per budget category, labelled in millions.
# The categories are ordinal, so the x axis is pinned to Low -> Medium -> High
# instead of the default alphabetical order (High, Low, Medium).
ggplot(
  profit_category,
  aes(x = budget_category, y = avg_profit, fill = budget_category)
) +
  geom_col() +
  labs(
    title = "avg Profit by Budget Category",
    x = "Budget Category",
    y = "avg Profit (in Millions)"
  ) +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  scale_fill_manual(values = c(
    "Low" = "purple",
    "Medium" = "lightblue",
    "High" = "darkblue"
  )) +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  # The fill colours only repeat the x axis, so the legend is hidden.
  theme(legend.position = "none")

Interpretation: This graph compares profit for low, medium and high budget movies.

Insights: High budget movies earn more profit on average. but they also require more investment.

E.Genre and Language analysis

10. Top 10 Languages by Number of Movies

# The ten original languages with the most movies, largest first.
language_count <- movies_clean %>%
  count(original_language, name = "total_movies", sort = TRUE) %>%
  slice_head(n = 10)
language_count

## # A tibble: 10 × 2
##    original_language total_movies
##    <chr>                    <int>
##  1 en                        3102
##  2 fr                          25
##  3 es                          15
##  4 ja                          13
##  5 zh                          13
##  6 de                           9
##  7 hi                           7
##  8 it                           6
##  9 ru                           6
## 10 cn                           5

# Horizontal bar chart of the language counts, ordered by number of movies.
language_count %>%
  ggplot(aes(
    x = reorder(original_language, total_movies),
    y = total_movies
  )) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  ggtitle("Top 10 Languages by Number of Movies") +
  xlab("Language") +
  ylab("Number of Movies") +
  theme_minimal()

Interpretation: This graph shows the ten most common original languages in the dataset.

Insights: English movies are by far the highest in number (3,102 movies). Every other language has far fewer movies; the next most common, French, has only 25.

11. Taking top 10 profitable movies

# The ten movies with the highest profit, keeping only title and profit.
top_profit <- movies_clean %>%
  select(title, profit) %>%
  arrange(desc(profit)) %>%
  slice_head(n = 10)
top_profit

## # A tibble: 10 × 2
##    title                                             profit
##    <chr>                                              <dbl>
##  1 Avatar                                        2550965087
##  2 Titanic                                       1645034188
##  3 Jurassic World                                1363528810
##  4 Furious 7                                     1316249360
##  5 The Avengers                                  1299557910
##  6 Avengers: Age of Ultron                       1125403694
##  7 Frozen                                        1124219009
##  8 Minions                                       1082730962
##  9 The Lord of the Rings: The Return of the King 1024888979
## 10 Iron Man 3                                    1015439994

# Horizontal bar chart of the ten most profitable movies, ordered by profit.
# The profit axis is labelled in millions, matching the other money plots in
# this report; without the scale the axis shows raw ten-digit numbers.
ggplot(top_profit, aes(x = reorder(title, profit), y = profit)) +
  geom_col(fill = "pink4") +
  coord_flip() +
  labs(
    title = "Top 10 Profitable Movies",
    x = "Movie Title",
    y = "Profit"
  ) +
  theme_minimal() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  )

Interpretation: This graph shows the ten movies with the highest profit (revenue minus budget).

Insights: Avatar is the most profitable movie in the dataset, followed by Titanic and Jurassic World. Every movie in the top 10 made more than 1 billion in profit.

12. Top Genres

# The ten main genres with the most movies, largest first.
genre_count <- movies_clean %>%
  count(main_genre, name = "total_movies", sort = TRUE) %>%
  slice_head(n = 10)
genre_count

## # A tibble: 10 × 2
##    main_genre      total_movies
##    <chr>                  <int>
##  1 Drama                    747
##  2 Comedy                   634
##  3 Action                   588
##  4 Adventure                288
##  5 Horror                   197
##  6 Crime                    141
##  7 Thriller                 118
##  8 Animation                 99
##  9 Fantasy                   93
## 10 Science Fiction           79

# Horizontal bar chart of the genre counts, ordered by number of movies.
genre_count %>%
  ggplot(aes(x = reorder(main_genre, total_movies), y = total_movies)) +
  geom_col(fill = "darkcyan") +
  coord_flip() +
  ggtitle("Top Genres by Number of Movies") +
  xlab("Genre") +
  ylab("Number of Movies") +
  theme_minimal()

Interpretation: This graph shows which genres are most common.

Insights: Some genres appear more frequently. this shows that certain types of movies are made more often.

13. Profit by Genre

# Mean profit per main genre, restricted to genres with more than 20 movies.
profit_genre <- movies_clean %>%
  group_by(main_genre) %>%
  summarise(avg_profit = mean(profit), total_movies = n()) %>%
  ungroup() %>%
  filter(total_movies > 20)
profit_genre

## # A tibble: 15 × 3
##    main_genre      avg_profit total_movies
##    <chr>                <dbl>        <int>
##  1 Action           97322816.          588
##  2 Adventure       170718674.          288
##  3 Animation       216260494.           99
##  4 Comedy           56210243.          634
##  5 Crime            39166665.          141
##  6 Documentary      18670263.           30
##  7 Drama            46064427.          747
##  8 Family          172206323.           38
##  9 Fantasy         119642574.           93
## 10 Horror           50941923.          197
## 11 Mystery          65985201.           27
## 12 Romance          69887043.           70
## 13 Science Fiction 146139275.           79
## 14 Thriller         60873100.          118
## 15 Western          32930690            22

# Horizontal bar chart of mean profit per genre, labelled in millions.
profit_genre %>%
  ggplot(aes(x = reorder(main_genre, avg_profit), y = avg_profit)) +
  geom_col(fill = "purple4") +
  coord_flip() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  ggtitle("avg Profit by Genre") +
  xlab("Genre") +
  ylab("avg Profit") +
  theme_minimal()

Interpretation: This graph shows average profit for each genre.

Insights: Some genres earn more profit than others. this means genre affects movie earnings.

14. Popularity by Genre

# Mean popularity per main genre, restricted to genres with more than
# 20 movies.
popularity_genre <- movies_clean %>%
  group_by(main_genre) %>%
  summarise(avg_popularity = mean(popularity), total_movies = n()) %>%
  ungroup() %>%
  filter(total_movies > 20)
popularity_genre

## # A tibble: 15 × 3
##    main_genre      avg_popularity total_movies
##    <chr>                    <dbl>        <int>
##  1 Action                   34.6           588
##  2 Adventure                44.6           288
##  3 Animation                41.7            99
##  4 Comedy                   20.3           634
##  5 Crime                    25.2           141
##  6 Documentary               6.60           30
##  7 Drama                    22.5           747
##  8 Family                   52.0            38
##  9 Fantasy                  40.3            93
## 10 Horror                   26.1           197
## 11 Mystery                  33.2            27
## 12 Romance                  25.9            70
## 13 Science Fiction          49.1            79
## 14 Thriller                 31.2           118
## 15 Western                  26.8            22

# Horizontal bar chart of mean popularity per genre, ordered by popularity.
popularity_genre %>%
  ggplot(aes(
    x = reorder(main_genre, avg_popularity),
    y = avg_popularity
  )) +
  geom_col(fill = "purple") +
  coord_flip() +
  ggtitle("avg Popularity by Genre") +
  xlab("Genre") +
  ylab("Popularity") +
  theme_minimal()

Interpretation: This graph shows popularity across different genres.

Insights: Some genres are more popular among audience. this shows audience interest varies by genre.

15. Revenue by Rating Category

# Mean revenue and number of movies within each rating category.
revenue_rating <- summarise(
  group_by(movies_clean, rating_category),
  avg_revenue = mean(revenue),
  total_movies = n(),
  .groups = "drop"
)
revenue_rating

## # A tibble: 3 × 3
##   rating_category avg_revenue total_movies
##   <chr>                 <dbl>        <int>
## 1 High             168980432.          768
## 2 Low               43147160.          192
## 3 Medium           111693381.         2269

# Bar chart of mean revenue per rating category, labelled in millions.
# The categories are ordinal, so the x axis is pinned to Low -> Medium -> High
# instead of the default alphabetical order (High, Low, Medium).
ggplot(
  revenue_rating,
  aes(x = rating_category, y = avg_revenue, fill = rating_category)
) +
  geom_col() +
  labs(
    title = "avg Revenue by Rating Category",
    x = "Rating Category",
    y = "avg Revenue"
  ) +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  scale_fill_manual(values = c(
    "Low" = "seagreen2",
    "Medium" = "green4",
    "High" = "darkgreen"
  )) +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  # The fill colours only repeat the x axis, so the legend is hidden, as in
  # the budget category plot.
  theme(legend.position = "none")

Interpretation: This graph shows revenue based on rating levels.

Insights: Higher rated movies earn more money. this shows quality may influence earnings.

F.Special Analysis

16. Hit vs Flop Analysis

# Label each movie "Hit" when its profit is positive and "Flop" otherwise,
# then count the movies under each label.
movies_clean <- mutate(
  movies_clean,
  success = if_else(profit > 0, "Hit", "Flop")
)
success_count <- count(movies_clean, success, name = "total")
success_count

## # A tibble: 2 × 2
##   success total
##   <chr>   <int>
## 1 Flop      791
## 2 Hit      2438

Interpretation: This table shows how many movies are hits or flops.

Insights: Not all movies make profit. some movies fail even after release.

17. Runtime Group Performance

# Bucket each movie by runtime: below 90 is "Short", 90 up to and including
# 120 is "Medium", anything else is "Long".
movies_clean <- mutate(
  movies_clean,
  runtime_group = case_when(
    runtime < 90 ~ "Short",
    runtime <= 120 ~ "Medium",
    TRUE ~ "Long"
  )
)

# Mean rating and mean profit within each runtime bucket.
runtime_perf <- movies_clean %>%
  group_by(runtime_group) %>%
  summarise(avg_rating = mean(vote_average), avg_profit = mean(profit)) %>%
  ungroup()

runtime_perf

## # A tibble: 3 × 3
##   runtime_group avg_rating avg_profit
##   <chr>              <dbl>      <dbl>
## 1 Long                6.79 138279307.
## 2 Medium              6.19  62110087.
## 3 Short               5.83  51957018.

Interpretation: This table compares rating and profit for short medium and long movies.

Insights: Long movies (runtime above 120) perform best, with the highest average rating (6.79) and the highest average profit (about 138M). Medium and short movies follow, in that order, on both measures.

18. High Budget Risk Analysis

# Among "High" budget movies, count how many made a loss (profit <= 0) and
# how many made a profit.
high_budget_risk <- movies_clean %>%
  filter(budget_category == "High") %>%
  mutate(failure = if_else(profit <= 0, "Loss", "Profit")) %>%
  count(failure, name = "total")
high_budget_risk

## # A tibble: 2 × 2
##   failure total
##   <chr>   <int>
## 1 Loss        6
## 2 Profit    137

Interpretation: This table shows profit and loss in high budget movies.

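Insights: Even high budget movies can fail, but in this dataset only 6 of the 143 high budget movies made a loss. High investment still carries risk, although most high budget movies were profitable.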

19. Overhyped Movies

# hype = popularity minus ten times vote_average; list the ten movies with
# the largest hype value.
movies_clean <- mutate(movies_clean, hype = popularity - vote_average * 10)
hype_movies <- movies_clean %>%
  select(title, popularity, vote_average, hype) %>%
  arrange(desc(hype)) %>%
  slice_head(n = 10)
hype_movies

## # A tibble: 10 × 4
##    title                                           popularity vote_average  hype
##    <chr>                                                <dbl>        <dbl> <dbl>
##  1 Minions                                               876.          6.4  812.
##  2 Interstellar                                          724.          8.1  643.
##  3 Deadpool                                              515.          7.4  441.
##  4 Guardians of the Galaxy                               481.          7.9  402.
##  5 Mad Max: Fury Road                                    434.          7.2  362.
##  6 Jurassic World                                        419.          6.5  354.
##  7 Pirates of the Caribbean: The Curse of the Bla…       272.          7.5  197.
##  8 Dawn of the Planet of the Apes                        244.          7.3  171.
##  9 Terminator Genisys                                    202.          5.8  144.
## 10 The Hunger Games: Mockingjay - Part 1                 206.          6.6  140.

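Interpretation: This table shows the movies whose popularity score most exceeds ten times their rating.

Insights: These movies may be overhyped, but the result should be read with care. Popularity values here run into the hundreds while ten times the rating cannot exceed 100, so the list is dominated by the most popular movies, and several of them (for example Interstellar, rated 8.1) are also rated highly.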

20. Runtime by genre

# Mean runtime per main genre, restricted to genres with more than 20 movies.
runtime_genre <- movies_clean %>%
  group_by(main_genre) %>%
  summarise(avg_runtime = mean(runtime), total_movies = n()) %>%
  ungroup() %>%
  filter(total_movies > 20)
runtime_genre

## # A tibble: 15 × 3
##    main_genre      avg_runtime total_movies
##    <chr>                 <dbl>        <int>
##  1 Action                111.           588
##  2 Adventure             116.           288
##  3 Animation              91.3           99
##  4 Comedy                103.           634
##  5 Crime                 116.           141
##  6 Documentary            98.6           30
##  7 Drama                 121.           747
##  8 Family                 94.2           38
##  9 Fantasy               108.            93
## 10 Horror                 98.1          197
## 11 Mystery               105.            27
## 12 Romance               108.            70
## 13 Science Fiction       113.            79
## 14 Thriller              111.           118
## 15 Western               122.            22

# Horizontal bar chart of mean runtime per genre, ordered by runtime.
runtime_genre %>%
  ggplot(aes(x = reorder(main_genre, avg_runtime), y = avg_runtime)) +
  geom_col(fill = "thistle") +
  coord_flip() +
  ggtitle("avg Runtime by Genre") +
  xlab("Genre") +
  ylab("avg Runtime") +
  theme_minimal()

Interpretation: This graph shows average runtime for each genre.

Insights: Different genres have different runtimes. this shows storytelling style varies by genre.

TMDB Movies Analysis

Khushboo

2026-04-13