options(scipen = 999)
library(lubridate)

## Warning: package 'lubridate' was built under R version 4.5.3

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.3

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.5.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.5.3

## corrplot 0.95 loaded

library(stringr)

## Warning: package 'stringr' was built under R version 4.5.3

library(readr)

## Warning: package 'readr' was built under R version 4.5.3

# Read the TMDB 5000 movies CSV, decoding the file as UTF-8.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file location exists.
tmdb_5000_movies <- read_csv(
  file = "C:/Users/mkuma/Downloads/archive(2)/tmdb_5000_movies.csv",
  locale = locale(encoding = "UTF-8")
)

## Rows: 4803 Columns: 20

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (12): genres, homepage, keywords, original_language, original_title, ov...
## dbl   (7): budget, id, popularity, revenue, runtime, vote_average, vote_count
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data cleaning and preparation

# Count missing values in every column of the raw data.
tmdb_5000_movies %>%
  is.na() %>%
  colSums()

##               budget               genres             homepage 
##                    0                    0                 3091 
##                   id             keywords    original_language 
##                    0                    0                    0 
##       original_title             overview           popularity 
##                    0                    4                    0 
## production_companies production_countries         release_date 
##                    0                    0                    1 
##              revenue              runtime     spoken_languages 
##                    0                    2                    0 
##               status              tagline                title 
##                    0                  844                    0 
##         vote_average           vote_count 
##                    0                    0

# Build the analysis table: drop rows without a runtime or release date, derive
# year, first-listed genre/country, profit and two category columns, then keep
# only rows with positive budget, revenue and runtime.
movies_clean <- tmdb_5000_movies %>%
  filter(!(is.na(runtime) | is.na(release_date))) %>%
  mutate(
    year = year(as.Date(release_date)),
    # First "name" entry of the genres string, with the key and quotes removed.
    main_genre = str_remove_all(
      str_extract(genres, '"name":\\s*"[^"]+"'),
      '"name":\\s*"|\\"'
    ),
    # Same extraction for the first listed production country.
    main_country = str_remove_all(
      str_extract(production_countries, '"name":\\s*"[^"]+"'),
      '"name":\\s*"|\\"'
    ),
    profit = revenue - budget
  ) %>%
  mutate(
    # case_when takes the first matching branch, so thresholds go high to low.
    budget_category = case_when(
      budget >= 150000000 ~ "High",
      budget >= 50000000 ~ "Medium",
      TRUE ~ "Low"
    ),
    rating_category = case_when(
      vote_average >= 7 ~ "High",
      vote_average >= 5 ~ "Medium",
      TRUE ~ "Low"
    )
  ) %>%
  filter(budget > 0 & revenue > 0 & runtime > 0)

Interpretation: The dataset was cleaned by removing rows with missing values in important variables such as runtime and release date, and by keeping only movies with a positive budget, revenue and runtime. New variables such as year, profit, budget category, rating category, main genre and main country were created for analysis.

#Exploratory data analysis

A. Basic distribution

1. Distribution of movie ratings.

# Histogram of vote_average across the cleaned movies (20 bins).
movies_clean %>%
  ggplot(mapping = aes(x = vote_average)) +
  geom_histogram(bins = 20, fill = "thistle", color = "black") +
  theme_minimal() +
  labs(
    title = "distribution of movies ratings",
    x = "ratings",
    y = "numbers of movies"
  )

Interpretation: The graph shows how ratings are spread across all movies in the dataset.

Insights: Most movies have ratings between 5 and 7. Very few movies have extremely high or very low ratings. This means average-rated movies are more common.

2.Distribution of Runtime

# Histogram of movie runtimes across the cleaned movies (20 bins).
# The title previously repeated the ratings chart's title; it now names runtime.
ggplot(movies_clean, aes(x = runtime)) +
  geom_histogram(bins = 20, fill = "lightpink1", color = "black") +
  labs(title = "distribution of movie runtimes",
       x = "runtime",
       y = "numbers of movies") +
  theme_minimal()

Interpretation: The graph shows how movie runtimes are distributed.

Insights: Most movies have a runtime in the middle range. Very short and very long movies are relatively rare.

B.relationship analysis

3.Vote count VS rating

# Scatter of vote_average against vote_count with a fitted linear trend line.
movies_clean %>%
  ggplot(mapping = aes(x = vote_count, y = vote_average)) +
  geom_point(color = "pink3", alpha = 0.5) +
  geom_smooth(method = "lm", color = "darkgreen") +
  theme_minimal() +
  ggtitle("Vote Count vs Rating") +
  xlab("Number of Votes") +
  ylab("Rating")

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: This graph shows the relationship between vote count and ratings.

Insights: Movies with more votes have more stable ratings. movies with fewer votes show more variation. this means ratings become more reliable when more people vote.

4.Runtime VS rating

# Scatter of vote_average against runtime with a fitted linear trend line.
movies_clean %>%
  ggplot(mapping = aes(x = runtime, y = vote_average)) +
  geom_point(color = "violet", alpha = 0.5) +
  geom_smooth(method = "lm", color = "skyblue") +
  theme_minimal() +
  ggtitle("runtime vs rating") +
  xlab("runtime") +
  ylab("rating")

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: This graph shows how runtime is related to ratings.

Insights: From the graph, there is a slight positive relationship between runtime and rating. That means as runtime increases, rating tends to increase a little, but not very strongly.

C.Time trend analysis

5. Movies over time

# Number of movies per release year (one row per year, ascending).
movies_year <- movies_clean %>%
  count(year, name = "total_movies")
movies_year

## # A tibble: 89 × 2
##     year total_movies
##    <dbl>        <int>
##  1  1916            1
##  2  1925            1
##  3  1927            1
##  4  1929            1
##  5  1930            1
##  6  1932            1
##  7  1933            2
##  8  1934            1
##  9  1935            1
## 10  1936            2
## # ℹ 79 more rows

# Line chart of the yearly movie counts.
ggplot(data = movies_year, mapping = aes(x = year, y = total_movies)) +
  geom_line(linewidth = 1, color = "violet") +
  theme_minimal() +
  ggtitle("Number of Movies Released Per Year") +
  xlab("Year") +
  ylab("Number of Movies")

Interpretation: This graph shows how many movies were released each year.

Insights: The number of movies increases over time. this shows growth in the film industry.

6.Average profit over time

# Mean profit (revenue - budget) per release year.
profit_year <- summarise(
  group_by(movies_clean, year),
  avg_profit = mean(profit),
  .groups = "drop"
)
profit_year

## # A tibble: 89 × 2
##     year avg_profit
##    <dbl>      <dbl>
##  1  1916   8008844 
##  2  1925  21755000 
##  3  1927 -91969578 
##  4  1929   3979000 
##  5  1930   4050000 
##  6  1932        21 
##  7  1933   1921000 
##  8  1934   4175000 
##  9  1935   2593000 
## 10  1936   5018000.
## # ℹ 79 more rows

# Line chart of mean profit per year; the y axis is labelled in millions.
ggplot(data = profit_year, mapping = aes(x = year, y = avg_profit)) +
  geom_line(color = "purple") +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "avg Profit Over Years",
    x = "Year",
    y = "avg Profit"
  )

Interpretation: This graph shows how average profit changes over years.

Insights: Profit is not stable every year. some years have higher profits than others. this shows fluctuation in movie success.

8.Total revenue over years

# Total revenue summed over all movies released in each year.
revenue_year <- summarise(
  group_by(movies_clean, year),
  total_revenue = sum(revenue),
  .groups = "drop"
)
revenue_year

## # A tibble: 89 × 2
##     year total_revenue
##    <dbl>         <dbl>
##  1  1916       8394751
##  2  1925      22000000
##  3  1927        650422
##  4  1929       4358000
##  5  1930       8000000
##  6  1932            25
##  7  1933       4481000
##  8  1934       4500000
##  9  1935       3202000
## 10  1936      11236000
## # ℹ 79 more rows

# Line chart of total revenue per year; the y axis is labelled in millions.
ggplot(data = revenue_year, mapping = aes(x = year, y = total_revenue)) +
  geom_line(color = "red") +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "Total Revenue Over Years",
    x = "Year",
    y = "Total Revenue"
  )

Interpretation: This graph shows total revenue generated each year.

Insights: Revenue increases over time. this means the movie industry is growing in value.

9.Profit by Budget Category

# Mean profit and movie count for each budget category.
profit_category <- summarise(
  group_by(movies_clean, budget_category),
  avg_profit = mean(profit),
  total_movies = n(),
  .groups = "drop"
)
profit_category

## # A tibble: 3 × 3
##   budget_category avg_profit total_movies
##   <chr>                <dbl>        <int>
## 1 High            421265144.          143
## 2 Low              43045487.         2273
## 3 Medium          125629765.          813

# Bar chart of mean profit per budget category, labelled in millions.
# budget_category is a character column, so ggplot would order the bars
# alphabetically (High, Low, Medium); the explicit limits restore the natural
# Low -> Medium -> High order.
ggplot(profit_category, aes(x = budget_category, y = avg_profit, fill = budget_category)) +
  geom_col() +
  labs(title = "avg Profit by Budget Category",
       x = "Budget Category",
       y = "avg Profit (in Millions)") +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  scale_fill_manual(values = c(
    "Low" = "purple",
    "Medium" = "lightblue",
    "High" = "darkblue"
  )) +
  scale_y_continuous(labels = scales::label_number(scale = 1e-6, suffix = "M")) +
  # The fill legend repeats the x axis, so it is hidden.
  theme(legend.position = "none")

Interpretation: This graph compares profit for low, medium and high budget movies.

Insights: High budget movies earn more profit on average. but they also require more investment.

E.Genre and Language analysis

10. Top 10 Languages by Number of Movies

# Ten original languages with the most movies, largest first.
language_count <- movies_clean %>%
  count(original_language, name = "total_movies") %>%
  arrange(desc(total_movies)) %>%
  slice_head(n = 10)
language_count

## # A tibble: 10 × 2
##    original_language total_movies
##    <chr>                    <int>
##  1 en                        3102
##  2 fr                          25
##  3 es                          15
##  4 ja                          13
##  5 zh                          13
##  6 de                           9
##  7 hi                           7
##  8 it                           6
##  9 ru                           6
## 10 cn                           5

# Horizontal bar chart of the top languages, ordered by movie count.
language_count %>%
  ggplot(
    mapping = aes(
      x = reorder(original_language, total_movies),
      y = total_movies
    )
  ) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  ggtitle("Top 10 Languages by Number of Movies") +
  xlab("Language") +
  ylab("Number of Movies")

Interpretation: This graph shows the 10 most common original languages in the dataset.

Insights: English movies are by far the highest in number (3102). Every other language has 25 or fewer movies.

11. Taking top 10 profitable movies

# Titles and profit of the ten most profitable movies, largest first.
top_profit <- movies_clean %>%
  select(title, profit) %>%
  arrange(desc(profit)) %>%
  slice_head(n = 10)
top_profit

## # A tibble: 10 × 2
##    title                                             profit
##    <chr>                                              <dbl>
##  1 Avatar                                        2550965087
##  2 Titanic                                       1645034188
##  3 Jurassic World                                1363528810
##  4 Furious 7                                     1316249360
##  5 The Avengers                                  1299557910
##  6 Avengers: Age of Ultron                       1125403694
##  7 Frozen                                        1124219009
##  8 Minions                                       1082730962
##  9 The Lord of the Rings: The Return of the King 1024888979
## 10 Iron Man 3                                    1015439994

# Horizontal bar chart of the ten most profitable movies.
# The profit axis is labelled in millions, matching the other money charts in
# this report; without it the axis prints raw ten-digit numbers.
ggplot(top_profit, aes(x = reorder(title, profit), y = profit)) +
  geom_col(fill = "pink4") +
  coord_flip() +
  labs(title = "Top 10 Profitable Movies",
       x = "Movie Title",
       y = "Profit") +
  theme_minimal() +
  scale_y_continuous(labels = scales::label_number(scale = 1e-6, suffix = "M"))

Interpretation: This graph shows the 10 movies with the highest profit.

Insights: Avatar is the most profitable movie, followed by Titanic and Jurassic World. All of the top 10 movies earned more than 1 billion in profit.

12. Top Genres

# Ten most frequent main genres, largest first.
genre_count <- movies_clean %>%
  count(main_genre, name = "total_movies") %>%
  arrange(desc(total_movies)) %>%
  slice_head(n = 10)
genre_count

## # A tibble: 10 × 2
##    main_genre      total_movies
##    <chr>                  <int>
##  1 Drama                    747
##  2 Comedy                   634
##  3 Action                   588
##  4 Adventure                288
##  5 Horror                   197
##  6 Crime                    141
##  7 Thriller                 118
##  8 Animation                 99
##  9 Fantasy                   93
## 10 Science Fiction           79

# Horizontal bar chart of the top genres, ordered by movie count.
genre_count %>%
  ggplot(mapping = aes(x = reorder(main_genre, total_movies), y = total_movies)) +
  geom_col(fill = "darkcyan") +
  coord_flip() +
  theme_minimal() +
  ggtitle("Top Genres by Number of Movies") +
  xlab("Genre") +
  ylab("Number of Movies")

Interpretation: This graph shows which genres are most common.

Insights: Some genres appear more frequently. this shows that certain types of movies are made more often.

13. Profit by Genre

# Mean profit per main genre, keeping only genres with more than 20 movies.
profit_genre <- filter(
  summarise(
    group_by(movies_clean, main_genre),
    avg_profit = mean(profit),
    total_movies = n(),
    .groups = "drop"
  ),
  total_movies > 20
)
profit_genre

## # A tibble: 15 × 3
##    main_genre      avg_profit total_movies
##    <chr>                <dbl>        <int>
##  1 Action           97322816.          588
##  2 Adventure       170718674.          288
##  3 Animation       216260494.           99
##  4 Comedy           56210243.          634
##  5 Crime            39166665.          141
##  6 Documentary      18670263.           30
##  7 Drama            46064427.          747
##  8 Family          172206323.           38
##  9 Fantasy         119642574.           93
## 10 Horror           50941923.          197
## 11 Mystery          65985201.           27
## 12 Romance          69887043.           70
## 13 Science Fiction 146139275.           79
## 14 Thriller         60873100.          118
## 15 Western          32930690            22

# Horizontal bar chart of mean profit per genre, labelled in millions.
profit_genre %>%
  ggplot(mapping = aes(x = reorder(main_genre, avg_profit), y = avg_profit)) +
  geom_col(fill = "purple4") +
  coord_flip() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "avg Profit by Genre",
    x = "Genre",
    y = "avg Profit"
  )

Interpretation: This graph shows average profit for each genre.

Insights: Some genres earn more profit than others. this means genre affects movie earnings.

14. Popularity by Genre

# Mean popularity per main genre, keeping only genres with more than 20 movies.
popularity_genre <- filter(
  summarise(
    group_by(movies_clean, main_genre),
    avg_popularity = mean(popularity),
    total_movies = n(),
    .groups = "drop"
  ),
  total_movies > 20
)
popularity_genre

## # A tibble: 15 × 3
##    main_genre      avg_popularity total_movies
##    <chr>                    <dbl>        <int>
##  1 Action                   34.6           588
##  2 Adventure                44.6           288
##  3 Animation                41.7            99
##  4 Comedy                   20.3           634
##  5 Crime                    25.2           141
##  6 Documentary               6.60           30
##  7 Drama                    22.5           747
##  8 Family                   52.0            38
##  9 Fantasy                  40.3            93
## 10 Horror                   26.1           197
## 11 Mystery                  33.2            27
## 12 Romance                  25.9            70
## 13 Science Fiction          49.1            79
## 14 Thriller                 31.2           118
## 15 Western                  26.8            22

# Horizontal bar chart of mean popularity per genre.
popularity_genre %>%
  ggplot(
    mapping = aes(
      x = reorder(main_genre, avg_popularity),
      y = avg_popularity
    )
  ) +
  geom_col(fill = "purple") +
  coord_flip() +
  theme_minimal() +
  ggtitle("avg Popularity by Genre") +
  xlab("Genre") +
  ylab("Popularity")

Interpretation: This graph shows popularity across different genres.

Insights: Some genres are more popular among audience. this shows audience interest varies by genre.

15. Revenue by Rating Category

# Mean revenue and movie count for each rating category.
revenue_rating <- summarise(
  group_by(movies_clean, rating_category),
  avg_revenue = mean(revenue),
  total_movies = n(),
  .groups = "drop"
)
revenue_rating

## # A tibble: 3 × 3
##   rating_category avg_revenue total_movies
##   <chr>                 <dbl>        <int>
## 1 High             168980432.          768
## 2 Low               43147160.          192
## 3 Medium           111693381.         2269

# Bar chart of mean revenue per rating category, labelled in millions.
# rating_category is a character column, so the bars would otherwise appear
# alphabetically (High, Low, Medium); the explicit limits restore the natural
# Low -> Medium -> High order.
ggplot(revenue_rating, aes(x = rating_category, y = avg_revenue, fill = rating_category)) +
  geom_col() +
  labs(title = "avg Revenue by Rating Category",
       x = "Rating Category",
       y = "avg Revenue") +
  theme_minimal() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  scale_fill_manual(values = c("Low" = "seagreen2", "Medium" = "green4", "High" = "darkgreen")) +
  scale_y_continuous(labels = scales::label_number(scale = 1e-6, suffix = "M")) +
  # The fill legend repeats the x axis, so it is hidden (as in the budget chart).
  theme(legend.position = "none")

Interpretation: This graph shows revenue based on rating levels.

Insights: Higher rated movies earn more money. this shows quality may influence earnings.

F.Special Analysis

16. Hit vs Flop Analysis

# Label each movie "Hit" when profit is positive and "Flop" otherwise, then
# count the movies in each group.
movies_clean <- mutate(
  movies_clean,
  success = if_else(profit > 0, "Hit", "Flop")
)
success_count <- movies_clean %>%
  count(success, name = "total")
success_count

## # A tibble: 2 × 2
##   success total
##   <chr>   <int>
## 1 Flop      791
## 2 Hit      2438

Interpretation: This table shows how many movies are hits or flops.

Insights: Not all movies make profit. some movies fail even after release.

17. Runtime Group Performance

# Bucket runtime: under 90 is "Short", 90 to 120 inclusive is "Medium",
# anything else is "Long".
movies_clean <- mutate(
  movies_clean,
  runtime_group = case_when(
    runtime < 90 ~ "Short",
    runtime <= 120 ~ "Medium",
    TRUE ~ "Long"
  )
)

# Mean rating and mean profit for each runtime bucket.
runtime_perf <- summarise(
  group_by(movies_clean, runtime_group),
  avg_rating = mean(vote_average),
  avg_profit = mean(profit),
  .groups = "drop"
)

runtime_perf

## # A tibble: 3 × 3
##   runtime_group avg_rating avg_profit
##   <chr>              <dbl>      <dbl>
## 1 Long                6.79 138279307.
## 2 Medium              6.19  62110087.
## 3 Short               5.83  51957018.

Interpretation:

This table shows the average rating and average profit for movies grouped by their runtime into short, medium, and long categories. It helps in comparing how movie length is related to both audience ratings and financial performance.

Insights:

From the table, it can be observed that long movies have the highest average rating and profit, followed by medium movies, while short movies have the lowest values. This indicates that longer movies tend to perform better overall. One possible reason is that long movies often have bigger budgets, better storytelling, and higher production quality, which attract more audience and generate higher revenue.

18. High Budget Risk Analysis

# Among "High" budget movies, count those with a profit above zero ("Profit")
# and those without ("Loss").
high_budget_risk <- movies_clean %>%
  filter(budget_category == "High") %>%
  mutate(failure = if_else(profit > 0, "Profit", "Loss")) %>%
  count(failure, name = "total")
high_budget_risk

## # A tibble: 2 × 2
##   failure total
##   <chr>   <int>
## 1 Loss        6
## 2 Profit    137

Interpretation: This table shows profit and loss in high budget movies.

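Insights: Only 6 of the 143 high budget movies made a loss, so most high budget movies in this dataset were profitable. Still, even high budget movies can fail, which shows that a large investment does not remove risk.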

19. gap analysis

# Ten movies with the largest gap between scaled popularity and scaled rating.
# Popularity is min-max scaled to [0, 1]; vote_average is divided by 10.
# NOTE(review): titles with vote_average 0 get a gap close to zero, which still
# outranks the negative gap of most rated titles, so they can enter the top 10.
gap_analysis <- movies_clean %>%
  mutate(
    popularity_scaled = (popularity - min(popularity)) / diff(range(popularity)),
    rating_scaled = vote_average / 10,
    popularity_rating_gap = popularity_scaled - rating_scaled
  ) %>%
  select(
    title, popularity, vote_average,
    popularity_scaled, rating_scaled,
    popularity_rating_gap
  ) %>%
  arrange(desc(popularity_rating_gap)) %>%
  slice_head(n = 10)

gap_analysis

## # A tibble: 10 × 6
##    title                 popularity vote_average popularity_scaled rating_scaled
##    <chr>                      <dbl>        <dbl>             <dbl>         <dbl>
##  1 Minions                 876.              6.4         1                  0.64
##  2 Interstellar            724.              8.1         0.827              0.81
##  3 Naturally Native          0.579           0           0.000638           0   
##  4 Mi America                0.0390          0           0.0000217          0   
##  5 Deadpool                515.              7.4         0.588              0.74
##  6 Jurassic World          419.              6.5         0.478              0.65
##  7 Foodfight!                5.26            2.3         0.00598            0.23
##  8 Mad Max: Fury Road      434.              7.2         0.496              0.72
##  9 Guardians of the Gal…   481.              7.9         0.549              0.79
## 10 Disaster Movie           16.2             3           0.0185             0.3 
## # ℹ 1 more variable: popularity_rating_gap <dbl>

Interpretation: This table shows difference between popularity and rating.

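Insights: Some movies are popular but not highly rated; for example, Minions has the highest popularity but a rating of only 6.4. This shows popularity does not always reflect quality. Note that titles with a rating of 0, such as Naturally Native and Mi America, also appear in this list even though their popularity is very low, because their scaled rating is 0.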

20. Runtime by genre

# Mean runtime per main genre, keeping only genres with more than 20 movies.
runtime_genre <- filter(
  summarise(
    group_by(movies_clean, main_genre),
    avg_runtime = mean(runtime),
    total_movies = n(),
    .groups = "drop"
  ),
  total_movies > 20
)
runtime_genre

## # A tibble: 15 × 3
##    main_genre      avg_runtime total_movies
##    <chr>                 <dbl>        <int>
##  1 Action                111.           588
##  2 Adventure             116.           288
##  3 Animation              91.3           99
##  4 Comedy                103.           634
##  5 Crime                 116.           141
##  6 Documentary            98.6           30
##  7 Drama                 121.           747
##  8 Family                 94.2           38
##  9 Fantasy               108.            93
## 10 Horror                 98.1          197
## 11 Mystery               105.            27
## 12 Romance               108.            70
## 13 Science Fiction       113.            79
## 14 Thriller              111.           118
## 15 Western               122.            22

# Horizontal bar chart of mean runtime per genre.
runtime_genre %>%
  ggplot(mapping = aes(x = reorder(main_genre, avg_runtime), y = avg_runtime)) +
  geom_col(fill = "thistle") +
  coord_flip() +
  theme_minimal() +
  ggtitle("avg Runtime by Genre") +
  xlab("Genre") +
  ylab("avg Runtime")

Interpretation: This graph shows average runtime for each genre.

Insights: Different genres have different runtimes. this shows storytelling style varies by genre.

21. Revenue by Genre

# Mean revenue per main genre, keeping only genres with more than 20 movies.
revenue_genre <- filter(
  summarise(
    group_by(movies_clean, main_genre),
    avg_revenue = mean(revenue),
    total_movies = n(),
    .groups = "drop"
  ),
  total_movies > 20
)

revenue_genre

## # A tibble: 15 × 3
##    main_genre      avg_revenue total_movies
##    <chr>                 <dbl>        <int>
##  1 Action           155542667.          588
##  2 Adventure        246084431.          288
##  3 Animation        298941629.           99
##  4 Comedy            83654762.          634
##  5 Crime             66727636.          141
##  6 Documentary       23801772.           30
##  7 Drama             72425378.          747
##  8 Family           234061586.           38
##  9 Fantasy          183410733.           93
## 10 Horror            67269518.          197
## 11 Mystery           98494830.           27
## 12 Romance           98928146.           70
## 13 Science Fiction  204763414.           79
## 14 Thriller          99042761.          118
## 15 Western           60836316.           22

# Horizontal bar chart of mean revenue per genre.
# The revenue axis is labelled in millions, matching the other money charts in
# this report; without it the axis prints raw nine-digit numbers.
ggplot(revenue_genre, aes(x = reorder(main_genre, avg_revenue), y = avg_revenue)) +
  geom_col(fill = "orange") +
  coord_flip() +
  labs(title = "avg revenue by genre",
       x = "genre",
       y = "avg revenue") +
  theme_minimal() +
  scale_y_continuous(labels = scales::label_number(scale = 1e-6, suffix = "M"))

Interpretation: This graph shows average revenue earned by different genres.

Insights: Some genres generate higher revenue compared to others. this means genre selection plays an important role in movie earnings.

22. Top Genres Trend Over Time

# Yearly movie counts for the five most frequent main genres.
# The top five are taken from the overall genre frequency table.
genre_time <- movies_clean %>%
  filter(
    main_genre %in%
      head(names(sort(table(movies_clean$main_genre), decreasing = TRUE)), 5)
  ) %>%
  count(year, main_genre, name = "total")
genre_time

## # A tibble: 292 × 3
##     year main_genre total
##    <dbl> <chr>      <int>
##  1  1916 Drama          1
##  2  1925 Drama          1
##  3  1927 Drama          1
##  4  1929 Drama          1
##  5  1930 Action         1
##  6  1932 Drama          1
##  7  1933 Comedy         1
##  8  1934 Comedy         1
##  9  1935 Comedy         1
## 10  1936 Action         1
## # ℹ 282 more rows

# One line per top genre showing its yearly movie count.
genre_time %>%
  ggplot(mapping = aes(x = year, y = total, color = main_genre)) +
  geom_line() +
  theme_minimal() +
  ggtitle("top genres trend over time") +
  xlab("year") +
  ylab("number of movies")

Interpretation: This graph shows how number of movies in top genres changes over years.

Insights: Some genres grow more over time while others remain stable. this shows changing trends in audience preferences. The rise of genres like Action and Adventure can also be linked to advancements in technology and visual effects. The slight decline in recent years may be due to changes in distribution platform or the shift towards streaming services.

23. Runtime Distribution by Budget Category

# Boxplot of runtime for each budget category.
# budget_category is a character column, so the boxes would otherwise appear
# alphabetically (High, Low, Medium); the explicit limits restore the natural
# Low -> Medium -> High order.
ggplot(movies_clean, aes(x = budget_category, y = runtime, fill = budget_category)) +
  geom_boxplot() +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  labs(title = "runtime distribution by budget category",
       x = "budget category",
       y = "runtime") +
  theme_minimal() +
  # The fill legend repeats the x axis, so it is hidden.
  theme(legend.position = "none")

Interpretation: This boxplot shows how runtime varies across different budget categories.

Insights: High budget movies generally have slightly higher runtime compared to low budget movies. medium budget movies fall in between. however, there is overlap between all categories, showing runtime is not strictly dependent on budget. some low budget movies also have very high runtime, indicating variation in movie length across all budget types

24. Hit vs Flop by Rating Category

# Count hits (profit above zero) and flops within each rating category.
hit_rating <- movies_clean %>%
  mutate(success = if_else(profit > 0, "Hit", "Flop")) %>%
  count(rating_category, success, name = "total")

hit_rating

## # A tibble: 6 × 3
##   rating_category success total
##   <chr>           <chr>   <int>
## 1 High            Flop       88
## 2 High            Hit       680
## 3 Low             Flop       98
## 4 Low             Hit        94
## 5 Medium          Flop      605
## 6 Medium          Hit      1664

# Side-by-side bars of hit and flop counts per rating category.
# rating_category is a character column, so the groups would otherwise appear
# alphabetically (High, Low, Medium); the explicit limits restore the natural
# Low -> Medium -> High order.
ggplot(hit_rating, aes(x = rating_category, y = total, fill = success)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("Low", "Medium", "High")) +
  labs(title = "hit vs flop by rating category",
       x = "rating category",
       y = "number of movies") +
  theme_minimal()

Interpretation: This graph shows number of hit and flop movies across rating categories.

Insights: High rated movies have more hits compared to flops. low rated movies tend to fail more. this shows rating impacts success. However, we can also see that even some high rated movies can be flops, and some low rated movies can still be hits. This shows that rating is an important factor, but not the only factor affecting success

25. Language vs Rating Category

# Movie counts per rating category for the six most frequent languages.
lang_rating <- movies_clean %>%
  filter(
    original_language %in%
      names(sort(table(movies_clean$original_language), decreasing = TRUE)[1:6])
  ) %>%
  count(original_language, rating_category, name = "total")

lang_rating

## # A tibble: 14 × 3
##    original_language rating_category total
##    <chr>             <chr>           <int>
##  1 de                High                8
##  2 de                Low                 1
##  3 en                High              702
##  4 en                Low               189
##  5 en                Medium           2211
##  6 es                High               10
##  7 es                Medium              5
##  8 fr                High               11
##  9 fr                Medium             14
## 10 ja                High                7
## 11 ja                Medium              6
## 12 zh                High                5
## 13 zh                Low                 1
## 14 zh                Medium              7

# Side-by-side bars of rating-category counts for each language.
# Some languages have no movies in one or more categories; preserve = "single"
# keeps every bar the same width instead of stretching the remaining bars to
# fill the group.
ggplot(lang_rating, aes(x = original_language, y = total, fill = rating_category)) +
  geom_col(position = position_dodge(preserve = "single")) +
  labs(title = "language vs rating category",
       x = "language",
       y = "number of movies") +
  theme_minimal()

Interpretation: This graph shows distribution of rating categories across different languages.

Insights: In graph, English language movies dominate in all categories, especially in the medium rating category, followed by a large number of high rated movies. Other languages like French, Japanese, Spanish, and Chinese have fewer movies overall, but most of them fall into high or medium rating categories.

There are very few low rated movies in these languages compared to English.

26. Correlation Analysis

# Numeric columns used for the correlation analysis.
numeric_data <- select(
  movies_clean,
  budget, revenue, profit, runtime, vote_average, vote_count, popularity
)

# Pairwise correlations (cor() default method), drawn as circles in the upper
# triangle.
cor_matrix <- cor(numeric_data)
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "black",
  tl.cex = 0.8
)

Interpretation: This correlation plot shows relationship between numerical variables like budget, revenue, profit, rating, and popularity.

Insights: Budget and revenue show strong positive relationship. profit is also related to revenue. rating and popularity have weaker relationships.

27. Top 10 Loss Making Movies

# Ten movies with the lowest profit; `loss` is the absolute value of profit.
loss_movies <- movies_clean %>%
  select(title, profit) %>%
  arrange(profit) %>%
  slice_head(n = 10) %>%
  mutate(loss = abs(profit))

# Horizontal bar chart of the ten largest losses.
# The loss axis is labelled in millions, matching the other money charts in
# this report; without it the axis prints raw nine-digit numbers.
ggplot(loss_movies, aes(x = reorder(title, loss), y = loss)) +
  geom_col(fill = "red") +
  coord_flip() +
  labs(title = "top 10 loss making movies",
       x = "movie",
       y = "loss") +
  theme_minimal() +
  scale_y_continuous(labels = scales::label_number(scale = 1e-6, suffix = "M"))

Interpretation: This graph shows movies with highest losses.

Insights: Some movies face very high losses. this shows that movie industry has high risk and not all investments are successful.

28. Vote Count vs Revenue

# Scatter of revenue against vote_count with a fitted linear trend line;
# revenue is labelled in millions.
movies_clean %>%
  ggplot(mapping = aes(x = vote_count, y = revenue)) +
  geom_point(color = "purple", alpha = 0.5) +
  geom_smooth(method = "lm", color = "black") +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "vote count vs revenue",
    x = "vote count",
    y = "revenue"
  )

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: This graph shows relationship between vote count and revenue.

Insights: Movies with higher votes tend to earn more. this shows audience engagement affects earnings. However, the data points are widely scattered, especially at higher vote counts. This means that although there is a general upward trend, the relationship is not very strong or consistent.This can be because vote count reflects audience engagement, but revenue is influenced by multiple factors like budget, marketing, genre, and release timing. So even if a movie gets many votes, it may not always generate the highest revenue

29. Rating Variation by Genre

# Standard deviation of ratings per main genre, keeping only genres with more
# than 20 movies.
rating_variation <- filter(
  summarise(
    group_by(movies_clean, main_genre),
    rating_sd = sd(vote_average),
    total_movies = n(),
    .groups = "drop"
  ),
  total_movies > 20
)

rating_variation

## # A tibble: 15 × 3
##    main_genre      rating_sd total_movies
##    <chr>               <dbl>        <int>
##  1 Action              0.857          588
##  2 Adventure           0.818          288
##  3 Animation           0.911           99
##  4 Comedy              0.813          634
##  5 Crime               0.713          141
##  6 Documentary         0.809           30
##  7 Drama               0.859          747
##  8 Family              0.728           38
##  9 Fantasy             0.885           93
## 10 Horror              0.809          197
## 11 Mystery             0.863           27
## 12 Romance             0.817           70
## 13 Science Fiction     0.873           79
## 14 Thriller            0.849          118
## 15 Western             0.685           22

# Horizontal bar chart of the rating standard deviation per genre.
rating_variation %>%
  ggplot(mapping = aes(x = reorder(main_genre, rating_sd), y = rating_sd)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  theme_minimal() +
  ggtitle("rating variation by genre") +
  xlab("genre") +
  ylab("standard deviation of ratings")

Interpretation: This graph shows variation in ratings within each genre.

Insights: Some genres have consistent ratings while others show high variation. this means audience response differs across genres.

30. Number of Genres vs Rating

# Number of genres listed for each movie.
# Counts occurrences of the `"name":` key rather than the bare substring
# "name", so a genre value that happened to contain "name" would not be counted
# twice. This matches the key pattern used when main_genre is extracted.
movies_clean <- movies_clean %>%
  mutate(genre_count = str_count(genres, '"name":'))

# Mean rating, mean profit and movie count for each genre count.
genre_complexity <- movies_clean %>%
  group_by(genre_count) %>%
  summarise(avg_rating = mean(vote_average),
            avg_profit = mean(profit),
            total_movies = n(),
            .groups = "drop")

genre_complexity

## # A tibble: 8 × 4
##   genre_count avg_rating avg_profit total_movies
##         <int>      <dbl>      <dbl>        <int>
## 1           0       5      -327270             1
## 2           1       6.28  46654050.          535
## 3           2       6.36  62260020.          928
## 4           3       6.32  98843290.         1103
## 5           4       6.30 105623279.          483
## 6           5       6.15 103127251.          149
## 7           6       5.8   70435306.           28
## 8           7       5.15  52538631             2

# Line-and-point chart of mean rating against the number of genres.
genre_complexity %>%
  ggplot(mapping = aes(x = genre_count, y = avg_rating)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  ggtitle("number of genres vs rating") +
  xlab("number of genres") +
  ylab("average rating")

Interpretation: This graph shows how number of genres affects movie rating.

Insights: Movies with a moderate number of genres (around 2–4) tend to perform better, while having too many genres can reduce focus and lead to lower ratings. Also note that very high genre counts have very few movies, so those values may be less reliable.

31. Revenue by Country

# Mean revenue and movie count per main country, for the ten countries with the
# most movies.
country_analysis <- movies_clean %>%
  group_by(main_country) %>%
  summarise(
    avg_revenue = mean(revenue),
    total_movies = n()
  ) %>%
  arrange(desc(total_movies)) %>%
  slice_head(n = 10)

country_analysis

## # A tibble: 10 × 3
##    main_country             avg_revenue total_movies
##    <chr>                          <dbl>        <int>
##  1 United States of America  124898009.         2246
##  2 United Kingdom            143863130.          254
##  3 Germany                   115757931.          144
##  4 Canada                     92259291.          130
##  5 France                     75047292.          108
##  6 Australia                 114679392.           62
##  7 Japan                     161400115.           25
##  8 China                     186544279.           24
##  9 India                      87262122.           23
## 10 Spain                      76997000.           22

# Horizontal bar chart of mean revenue per country, labelled in millions.
country_analysis %>%
  ggplot(mapping = aes(x = reorder(main_country, avg_revenue), y = avg_revenue)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  scale_y_continuous(
    labels = scales::label_number(scale = 1e-6, suffix = "M")
  ) +
  theme_minimal() +
  labs(
    title = "average revenue by country",
    x = "country",
    y = "average revenue"
  )

Interpretation: This graph shows average revenue generated by movies from different countries.

Insights: Some countries produce higher revenue movies. this shows production location influences earnings.

TMDB Movies Analysis

Khushboo

2026-04-13